def __init__(self,
             cluster=None,
             interval="1s",
             startup_cost="1s",
             scale_factor=2,
             minimum=0,
             maximum=None,
             wait_count=3,
             target_duration="5s",
             worker_key=None,
             **kwargs):
    interval = parse_timedelta(interval, default="ms")
    self.worker_key = worker_key
    self.cluster = cluster
    self.startup_cost = parse_timedelta(startup_cost, default="s")
    self.scale_factor = scale_factor
    if self.cluster:
        self._adapt_callback = PeriodicCallback(self._adapt,
                                                interval * 1000,
                                                io_loop=self.loop)
        self.loop.add_callback(self._adapt_callback.start)
    self._adapting = False
    self._workers_to_close_kwargs = kwargs
    self.minimum = minimum
    self.maximum = maximum
    self.log = deque(maxlen=1000)
    self.close_counts = {}
    self.wait_count = wait_count
    self.target_duration = parse_timedelta(target_duration)
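
A minimal sketch of the interval handling above, assuming parse_timedelta comes from distributed.utils (the snippet does not show its import): "1s" parses to one second, a bare number would be read as milliseconds because of default="ms", and the result is multiplied by 1000 because Tornado's PeriodicCallback takes its period in milliseconds.

from distributed.utils import parse_timedelta   # assumed import location

interval = parse_timedelta("1s", default="ms")   # "1s" -> 1 second; a bare number would be ms
callback_time = interval * 1000                  # PeriodicCallback period in milliseconds -> 1000
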
Example #2
    def _widget(self):
        """ Create IPython widget for display within a notebook """
        try:
            return self._cached_widget
        except AttributeError:
            pass

        from ipywidgets import Layout, VBox, HBox, IntText, Button, HTML

        client = self._dask_client()

        layout = Layout(width='150px')

        title = HTML('<h2>YarnCluster</h2>')

        status = HTML(self._widget_status(), layout=Layout(min_width='150px'))

        request = IntText(0, description='Workers', layout=layout)
        scale = Button(description='Scale', layout=layout)

        @scale.on_click
        def scale_cb(b):
            with log_errors():
                self.scale(request.value)

        elements = [title, HBox([status, request, scale])]

        if self.dashboard_link is not None:
            link = HTML('<p><b>Dashboard: </b><a href="%s" target="_blank">%s'
                        '</a></p>\n' %
                        (self.dashboard_link, self.dashboard_link))
            elements.append(link)

        self._cached_widget = box = VBox(elements)

        def update():
            status.value = self._widget_status()

        pc = PeriodicCallback(update, 500, io_loop=client.loop)
        pc.start()

        return box
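
A hedged usage sketch for the widget above (the YarnCluster construction here is assumed, not taken from the snippet): in a notebook the returned VBox can be displayed directly, and the PeriodicCallback keeps the status HTML refreshed every 500 ms.

from dask_yarn import YarnCluster      # assumed import; requires a running YARN environment
from IPython.display import display

cluster = YarnCluster()                # hypothetical: real deployments pass YARN settings here
display(cluster._widget())             # renders the title, status pane, worker count box and Scale button
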
Example #3
    def _widget(self):
        """ Create IPython widget for display within a notebook """
        try:
            return self._cached_widget
        except AttributeError:
            pass

        from ipywidgets import Layout, VBox, HBox, IntText, Button, HTML

        client = self._dask_client()

        layout = Layout(width='150px')

        title = HTML('<h2>YarnCluster</h2>')

        status = HTML(self._widget_status(), layout=Layout(min_width='150px'))

        request = IntText(0, description='Workers', layout=layout)
        scale = Button(description='Scale', layout=layout)

        @scale.on_click
        def scale_cb(b):
            with log_errors():
                self.scale(request.value)

        box = VBox([title,
                    HBox([status, request, scale])])

        self._cached_widget = box

        def update():
            status.value = self._widget_status()

        pc = PeriodicCallback(update, 500, io_loop=client.loop)
        pc.start()

        return box
Example #4
    def __init__(self, template=None, cleanup_interval=1000, hostname=None,
                 script=None, preexec_commands=(), copy_script=True,
                 ip='',
                 **kwargs):
        """
        Dask workers launched by a DRMAA-compatible cluster

        Parameters
        ----------
        template: dict
            Dictionary specifying options to pass to the DRMAA cluster
            and the worker. Relevant items are:

            jobName: string
                Name of the job as known by the DRMAA cluster.
            args: list
                Extra string arguments to pass to dask-worker
            outputPath: string
                Path to the dask-worker stdout. Must start with ':'.
                Defaults to worker.JOBID.TASKID.out in current directory.
            errorPath: string
                Path to the dask-worker stderr. Must start with ':'
                Defaults to worker.JOBID.TASKID.err in current directory.
            workingDirectory: string
                Where dask-worker runs, defaults to current directory
            nativeSpecification: string
                Options native to the job scheduler

        cleanup_interval: int
            Time interval in milliseconds at which closed workers are cleaned
            up. Defaults to 1000 (one second).
        hostname: string
            Host on which to start the local scheduler; defaults to the local
            hostname
        script: string (optional)
            Path to the dask-worker executable script.
            A temporary file will be made if none is provided (recommended)
        preexec_commands: tuple (optional)
            Commands to be executed first by temporary script. Cannot be
            specified at the same time as script.
        copy_script: bool
            Whether to copy the passed script to the current working
            directory. This is primarily to work around an issue with SGE.
        ip: string
            IP of the scheduler, default is the empty string
            which will listen on the primary ip address of the host
        **kwargs:
            Additional keyword arguments to be passed to the local scheduler

        Examples
        --------
        >>> from dask_drmaa import DRMAACluster          # doctest: +SKIP
        >>> cluster = DRMAACluster()                     # doctest: +SKIP
        >>> cluster.start_workers(10)                    # doctest: +SKIP

        >>> from distributed import Client               # doctest: +SKIP
        >>> client = Client(cluster)                     # doctest: +SKIP

        >>> future = client.submit(lambda x: x + 1, 10)  # doctest: +SKIP
        >>> future.result()                              # doctest: +SKIP
        11
        """
        self.hostname = hostname or socket.gethostname()
        logger.info("Start local scheduler at %s", self.hostname)
        self.local_cluster = LocalCluster(n_workers=0, ip=ip, **kwargs)

        if script is None:
            fn = os.path.abspath(tempfile.mktemp(
                suffix='.sh',
                prefix='dask-worker-script-',
                dir=os.path.curdir,
            ))
            self.script = fn
            self._should_cleanup_script = True

            script_contents = make_job_script(executable=worker_bin_path,
                                              name='%s.%s' % (JOB_ID, TASK_ID),
                                              preexec=preexec_commands)
            with open(fn, 'wt') as f:
                f.write(script_contents)

            @atexit.register
            def remove_script():
                if os.path.exists(fn):
                    os.remove(fn)

            os.chmod(self.script, 0o777)

        else:
            self._should_cleanup_script = False
            if copy_script:
                with ignoring(EnvironmentError):  # may be in the same path
                    shutil.copy(script, os.path.curdir)  # python 2.x returns None
                    script = os.path.join(os.path.curdir, os.path.basename(script))
                    self._should_cleanup_script = True
            self.script = os.path.abspath(script)
            assert not preexec_commands, "Cannot specify both script and preexec_commands"

        # TODO: check that user-provided script is executable

        self.template = merge(default_template,
                              {'remoteCommand': self.script},
                              template or {})

        self._cleanup_callback = PeriodicCallback(callback=self.cleanup_closed_workers,
                                                  callback_time=cleanup_interval,
                                                  io_loop=self.scheduler.loop)
        self._cleanup_callback.start()

        self.workers = {}  # {job-id: WorkerSpec}
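
The template handling in the snippet above relies on toolz.merge, where later dictionaries take precedence; a small illustrative sketch (the values below are made up, only the precedence rule matters):

from toolz import merge

default_template = {"jobName": "dask-worker", "args": []}           # illustrative defaults only
user_template = {"jobName": "my-analysis"}                          # hypothetical user-supplied template
resolved = merge(default_template,
                 {"remoteCommand": "/tmp/dask-worker-script.sh"},   # assumed script path
                 user_template)
# Later arguments win: resolved["jobName"] == "my-analysis", while
# remoteCommand and args keep the values from the earlier dictionaries.
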
Example #5
class DRMAACluster(Cluster):
    def __init__(self, template=None, cleanup_interval=1000, hostname=None,
                 script=None, preexec_commands=(), copy_script=True,
                 ip='',
                 **kwargs):
        """
        Dask workers launched by a DRMAA-compatible cluster

        Parameters
        ----------
        template: dict
            Dictionary specifying options to pass to the DRMAA cluster
            and the worker. Relevant items are:

            jobName: string
                Name of the job as known by the DRMAA cluster.
            args: list
                Extra string arguments to pass to dask-worker
            outputPath: string
                Path to the dask-worker stdout. Must start with ':'.
                Defaults to worker.JOBID.TASKID.out in current directory.
            errorPath: string
                Path to the dask-worker stderr. Must start with ':'
                Defaults to worker.JOBID.TASKID.err in current directory.
            workingDirectory: string
                Where dask-worker runs, defaults to current directory
            nativeSpecification: string
                Options native to the job scheduler

        cleanup_interval: int
            Time interval in milliseconds at which closed workers are cleaned
            up. Defaults to 1000 (one second).
        hostname: string
            Host on which to start the local scheduler; defaults to the local
            hostname
        script: string (optional)
            Path to the dask-worker executable script.
            A temporary file will be made if none is provided (recommended)
        preexec_commands: tuple (optional)
            Commands to be executed first by temporary script. Cannot be
            specified at the same time as script.
        copy_script: bool
            Whether to copy the passed script to the current working
            directory. This is primarily to work around an issue with SGE.
        ip: string
            IP of the scheduler, default is the empty string
            which will listen on the primary ip address of the host
        **kwargs:
            Additional keyword arguments to be passed to the local scheduler

        Examples
        --------
        >>> from dask_drmaa import DRMAACluster          # doctest: +SKIP
        >>> cluster = DRMAACluster()                     # doctest: +SKIP
        >>> cluster.start_workers(10)                    # doctest: +SKIP

        >>> from distributed import Client               # doctest: +SKIP
        >>> client = Client(cluster)                     # doctest: +SKIP

        >>> future = client.submit(lambda x: x + 1, 10)  # doctest: +SKIP
        >>> future.result()                              # doctest: +SKIP
        11
        """
        self.hostname = hostname or socket.gethostname()
        logger.info("Start local scheduler at %s", self.hostname)
        self.local_cluster = LocalCluster(n_workers=0, ip=ip, **kwargs)

        if script is None:
            fn = os.path.abspath(tempfile.mktemp(
                suffix='.sh',
                prefix='dask-worker-script-',
                dir=os.path.curdir,
            ))
            self.script = fn
            self._should_cleanup_script = True

            script_contents = make_job_script(executable=worker_bin_path,
                                              name='%s.%s' % (JOB_ID, TASK_ID),
                                              preexec=preexec_commands)
            with open(fn, 'wt') as f:
                f.write(script_contents)

            @atexit.register
            def remove_script():
                if os.path.exists(fn):
                    os.remove(fn)

            os.chmod(self.script, 0o777)

        else:
            self._should_cleanup_script = False
            if copy_script:
                with ignoring(EnvironmentError):  # may be in the same path
                    shutil.copy(script, os.path.curdir)  # python 2.x returns None
                    script = os.path.join(os.path.curdir, os.path.basename(script))
                    self._should_cleanup_script = True
            self.script = os.path.abspath(script)
            assert not preexec_commands, "Cannot specify both script and preexec_commands"

        # TODO: check that user-provided script is executable

        self.template = merge(default_template,
                              {'remoteCommand': self.script},
                              template or {})

        self._cleanup_callback = PeriodicCallback(callback=self.cleanup_closed_workers,
                                                  callback_time=cleanup_interval,
                                                  io_loop=self.scheduler.loop)
        self._cleanup_callback.start()

        self.workers = {}  # {job-id: WorkerSpec}

    def adapt(self, **kwargs):
        """ Turn on adaptivity

        For keyword arguments see dask_drmaa.adaptive.Adaptive

        Examples
        --------
        >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')

        See Also
        --------
        Cluster: an interface for other clusters to inherit from
        """
        from .adaptive import Adaptive

        with ignoring(AttributeError):
            self._adaptive.stop()
        if not hasattr(self, '_adaptive_options'):
            self._adaptive_options = {}

        self._adaptive_options.update(kwargs)
        self._adaptive = Adaptive(
            self, self.scheduler, **self._adaptive_options
        )

        return self._adaptive

    @gen.coroutine
    def _start(self):
        pass

    @property
    def scheduler(self):
        return self.local_cluster.scheduler

    def create_job_template(self, **kwargs):
        template = self.template.copy()
        if kwargs:
            template.update(kwargs)
        template['args'] = [self.scheduler_address] + template['args']

        jt = get_session().createJobTemplate()
        valid_attributes = dir(jt)

        for key, value in template.items():
            if key not in valid_attributes:
                raise ValueError("Invalid job template attribute %s" % key)
            setattr(jt, key, value)

        return jt

    def start_workers(self, n=1, **kwargs):
        if n == 0:
            return

        with log_errors():
            with self.create_job_template(**kwargs) as jt:
                ids = get_session().runBulkJobs(jt, 1, n, 1)
                logger.info("Start %d workers. Job ID: %s", len(ids), ids[0].split('.')[0])
                self.workers.update(
                    {jid: WorkerSpec(job_id=jid, kwargs=kwargs,
                                     stdout=worker_out_path_template % dict(jid=jid, ext='out'),
                                     stderr=worker_out_path_template % dict(jid=jid, ext='err'),
                                     )
                     for jid in ids})

    @gen.coroutine
    def stop_workers(self, worker_ids, sync=False):
        if isinstance(worker_ids, str):
            worker_ids = [worker_ids]
        elif worker_ids:
            worker_ids = list(worker_ids)
        else:
            return

        # Let the scheduler gracefully retire workers first
        ids_to_ips = {
            v['name']: k for k, v in self.scheduler.worker_info.items()
        }
        worker_ips = [ids_to_ips[wid]
                      for wid in worker_ids
                      if wid in ids_to_ips]
        retired = yield self.scheduler.retire_workers(workers=worker_ips,
                                                      close_workers=True)
        logger.info("Retired workers %s", retired)
        for wid in list(worker_ids):
            try:
                get_session().control(wid, drmaa.JobControlAction.TERMINATE)
            except drmaa.errors.InvalidJobException:
                pass
            try:
                self.workers.pop(wid)
            except KeyError:
                # If we have multiple callers at once, it may have already
                # been popped off
                pass

        logger.info("Stop workers %s", worker_ids)
        if sync:
            get_session().synchronize(worker_ids, dispose=True)

    @gen.coroutine
    def scale_up(self, n, **kwargs):
        yield [self.start_workers(**kwargs)
               for _ in range(n - len(self.workers))]

    @gen.coroutine
    def scale_down(self, workers):
        workers = set(workers)
        yield self.scheduler.retire_workers(workers=workers)

    def close(self):
        logger.info("Closing DRMAA cluster")
        self.stop_workers(self.workers, sync=True)

        self.local_cluster.close()
        if self._should_cleanup_script and os.path.exists(self.script):
            os.remove(self.script)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def cleanup_closed_workers(self):
        for jid in list(self.workers):
            if get_session().jobStatus(jid) in ('closed', 'done'):
                logger.info("Removing closed worker %s", jid)
                del self.workers[jid]

    def __del__(self):
        try:
            self.close()
        except:
            pass

    def __str__(self):
        return "<%s: %d workers>" % (self.__class__.__name__, len(self.workers))

    __repr__ = __str__

    def _widget(self):
        """ Create IPython widget for display within a notebook """
        try:
            return self._cached_widget
        except AttributeError:
            pass

        from ipywidgets import (
            Layout,
            VBox,
            HBox,
            IntText,
            Button,
            HTML,
            Accordion,
            Text,
        )

        layout = Layout(width="150px")

        if "dashboard" in self.scheduler.services:
            link = self.dashboard_link
            link = '<p><b>Dashboard: </b><a href="%s" target="_blank">%s</a></p>\n' % (
                link,
                link,
            )
        else:
            link = ""

        title = "<h2>%s</h2>" % type(self).__name__
        title = HTML(title)
        dashboard = HTML(link)

        status = HTML(self._widget_status(), layout=Layout(min_width="150px"))

        request = IntText(0, description="Workers", layout=layout)
        scale = Button(description="Scale", layout=layout)
        request_cores = IntText(0, description="Cores", layout=layout)
        scale_cores = Button(description="Scale", layout=layout)
        request_memory = Text("O GB", description="Memory", layout=layout)
        scale_memory = Button(description="Scale", layout=layout)

        minimum = IntText(0, description="Minimum", layout=layout)
        maximum = IntText(0, description="Maximum", layout=layout)
        adapt = Button(description="Adapt", layout=layout)
        minimum_cores = IntText(0, description="Min cores", layout=layout)
        maximum_cores = IntText(0, description="Max cores", layout=layout)
        adapt_cores = Button(description="Adapt", layout=layout)
        minimum_mem = Text("0 GB", description="Min memory", layout=layout)
        maximum_mem = Text("0 GB", description="Max memory", layout=layout)
        adapt_mem = Button(description="Adapt", layout=layout)

        scale_hbox = [HBox([request, scale])]
        adapt_hbox = [HBox([minimum, maximum, adapt])]
        if hasattr(self, "jobqueue_worker_spec"):
            scale_hbox.append(HBox([request_cores, scale_cores]))
            scale_hbox.append(HBox([request_memory, scale_memory]))
            adapt_hbox.append(HBox([minimum_cores, maximum_cores,
                                    adapt_cores]))
            adapt_hbox.append(HBox([minimum_mem, maximum_mem, adapt_mem]))

        accordion = Accordion(
            [VBox(scale_hbox), VBox(adapt_hbox)],
            layout=Layout(min_width="500px"))
        accordion.selected_index = None
        accordion.set_title(0, "Manual Scaling")
        accordion.set_title(1, "Adaptive Scaling")

        box = VBox([title, HBox([status, accordion]), dashboard])

        self._cached_widget = box

        def adapt_cb(b):
            self.adapt(minimum=minimum.value, maximum=maximum.value)

        def adapt_cores_cb(b):
            self.adapt(minimum_cores=minimum_cores.value,
                       maximum_cores=maximum_cores.value)

        def adapt_mem_cb(b):
            self.adapt(minimum_memory=minimum_mem.value,
                       maximum_memory=maximum_mem.value)

        adapt.on_click(adapt_cb)
        adapt_cores.on_click(adapt_cores_cb)
        adapt_mem.on_click(adapt_mem_cb)

        def scale_cb(request, kwarg):
            def request_cb(b):
                with log_errors():
                    arg = request.value
                    with ignoring(AttributeError):
                        self._adaptive.stop()
                    local_kwargs = dict()
                    local_kwargs[kwarg] = arg
                    self.scale(**local_kwargs)

            return request_cb

        scale.on_click(scale_cb(request, "n"))
        scale_cores.on_click(scale_cb(request_cores, "cores"))
        scale_memory.on_click(scale_cb(request_memory, "memory"))

        def update():
            status.value = self._widget_status()

        pc = PeriodicCallback(update, 500, io_loop=self.scheduler.loop)
        self.scheduler.periodic_callbacks["cluster-repr"] = pc
        pc.start()

        return box
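
The three Scale buttons above share one closure factory (scale_cb); a stripped-down, self-contained sketch of that pattern, with illustrative names that are not part of the snippet:

def make_scale_callback(widget, kwarg, scale_fn):
    """Build a button callback that forwards the widget's value under `kwarg`."""
    def on_click(_button):
        scale_fn(**{kwarg: widget.value})   # e.g. scale(n=...), scale(cores=...), scale(memory=...)
    return on_click
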
class Adaptive(object):
    '''
    Adaptively allocate workers based on scheduler load.  A superclass.

    Contains logic to dynamically resize a Dask cluster based on current use.
    This class needs to be paired with a system that can create and destroy
    Dask workers using a cluster resource manager.  Typically it is built into
    already existing solutions, rather than used directly by users.
    It is most commonly used from the ``.adapt(...)`` method of various Dask
    cluster classes.

    Parameters
    ----------
    scheduler: distributed.Scheduler
    cluster: object
        Must have scale_up and scale_down methods/coroutines
    startup_cost : timedelta or str, default "1s"
        Estimate of how long it takes to start an additional worker; used to
        judge how costly scaling up is. Affects how quickly we adapt to high
        tasks-per-worker loads.
    interval : timedelta or str, default "1000 ms"
        Time between checks
    wait_count: int, default 3
        Number of consecutive times that a worker should be suggested for
        removal before we remove it.
    scale_factor : int, default 2
        Factor to scale by when it's determined additional workers are needed
    target_duration: timedelta or str, default "5s"
        Amount of time we want a computation to take.
        This affects how aggressively we scale up.
    worker_key: Callable[WorkerState]
        Function to group workers together when scaling down
        See Scheduler.workers_to_close for more information
    minimum: int
        Minimum number of workers to keep around
    maximum: int
        Maximum number of workers to keep around
    **kwargs:
        Extra parameters to pass to Scheduler.workers_to_close

    Examples
    --------

    This is commonly used from existing Dask classes, like KubeCluster

    >>> from dask_kubernetes import KubeCluster
    >>> cluster = KubeCluster()
    >>> cluster.adapt(minimum=10, maximum=100)

    Alternatively you can use it from your own Cluster class by subclassing
    from Dask's Cluster superclass

    >>> from distributed.deploy import Cluster
    >>> class MyCluster(Cluster):
    ...     def scale_up(self, n):
    ...         """ Bring worker count up to n """
    ...     def scale_down(self, workers):
    ...        """ Remove worker addresses from cluster """

    >>> cluster = MyCluster()
    >>> cluster.adapt(minimum=10, maximum=100)

    Notes
    -----
    Subclasses can override :meth:`Adaptive.should_scale_up` and
    :meth:`Adaptive.workers_to_close` to control when the cluster should be
    resized. The default implementation checks if there are too many tasks
    per worker or too little memory available (see :meth:`Adaptive.needs_cpu`
    and :meth:`Adaptive.needs_memory`).
    '''
    def __init__(self,
                 cluster=None,
                 interval="1s",
                 startup_cost="1s",
                 scale_factor=2,
                 minimum=0,
                 maximum=None,
                 wait_count=3,
                 target_duration="5s",
                 worker_key=None,
                 **kwargs):
        interval = parse_timedelta(interval, default="ms")
        self.worker_key = worker_key
        self.cluster = cluster
        self.startup_cost = parse_timedelta(startup_cost, default="s")
        self.scale_factor = scale_factor
        if self.cluster:
            self._adapt_callback = PeriodicCallback(self._adapt,
                                                    interval * 1000,
                                                    io_loop=self.loop)
            self.loop.add_callback(self._adapt_callback.start)
        self._adapting = False
        self._workers_to_close_kwargs = kwargs
        self.minimum = minimum
        self.maximum = maximum
        self.log = deque(maxlen=1000)
        self.close_counts = {}
        self.wait_count = wait_count
        self.target_duration = parse_timedelta(target_duration)

    @property
    def scheduler(self):
        return self.cluster.scheduler_comm

    def stop(self):
        if self.cluster:
            self._adapt_callback.stop()
            self._adapt_callback = None
            del self._adapt_callback

    async def workers_to_close(self, **kwargs):
        """
        Determine which, if any, workers should potentially be removed from
        the cluster.

        Notes
        -----
        ``Adaptive.workers_to_close`` dispatches to Scheduler.workers_to_close(),
        but may be overridden in subclasses.

        Returns
        -------
        List of worker addresses to close, if any

        See Also
        --------
        Scheduler.workers_to_close
        """
        if len(self.cluster.workers) <= self.minimum:
            return []

        kw = dict(self._workers_to_close_kwargs)
        kw.update(kwargs)

        if self.maximum is not None and len(
                self.cluster.workers) > self.maximum:
            kw["n"] = len(self.cluster.workers) - self.maximum

        L = await self.scheduler.workers_to_close(**kw)
        if len(self.cluster.workers) - len(L) < self.minimum:
            L = L[:len(self.cluster.workers) - self.minimum]

        return L

    async def _retire_workers(self, workers=None):
        if workers is None:
            workers = await self.workers_to_close(
                key=pickle.dumps(self.worker_key) if self.worker_key else None,
                minimum=self.minimum,
            )
        if not workers:
            return workers
        with log_errors():
            await self.scheduler.retire_workers(workers=workers,
                                                remove=True,
                                                close_workers=True)

            logger.info("Retiring workers %s", workers)
            f = self.cluster.scale_down(workers)
            if hasattr(f, "__await__"):
                await f

            return workers

    async def recommendations(self, comm=None):
        n = await self.scheduler.adaptive_target(
            target_duration=self.target_duration)
        if self.maximum is not None:
            n = min(self.maximum, n)
        if self.minimum is not None:
            n = max(self.minimum, n)
        workers = set(await self.workers_to_close(
            key=pickle.dumps(self.worker_key) if self.worker_key else None,
            minimum=self.minimum,
        ))
        try:
            current = len(self.cluster.worker_spec)
        except AttributeError:
            current = len(self.cluster.workers)
        if n > current and workers:
            logger.info(
                "Attempting to scale up and scale down simultaneously.")
            self.close_counts.clear()
            return {
                "status": "error",
                "msg": "Trying to scale up and down simultaneously",
            }

        elif n > current:
            self.close_counts.clear()
            return {"status": "up", "n": n}

        elif workers:
            d = {}
            to_close = []
            for w, c in self.close_counts.items():
                if w in workers:
                    if c >= self.wait_count:
                        to_close.append(w)
                    else:
                        d[w] = c

            for w in workers:
                d[w] = d.get(w, 0) + 1

            self.close_counts = d

            if to_close:
                return {"status": "down", "workers": to_close}
        else:
            self.close_counts.clear()
            return None

    async def _adapt(self):
        if self._adapting:  # Semaphore to avoid overlapping adapt calls
            return

        self._adapting = True
        try:
            recommendations = await self.recommendations()
            if not recommendations:
                return
            status = recommendations.pop("status")
            if status == "up":
                f = self.cluster.scale(**recommendations)
                self.log.append((time(), "up", recommendations))
                if hasattr(f, "__await__"):
                    await f

            elif status == "down":
                self.log.append((time(), "down", recommendations["workers"]))
                workers = await self._retire_workers(
                    workers=recommendations["workers"])
        finally:
            self._adapting = False

    def adapt(self):
        self.loop.add_callback(self._adapt)

    @property
    def loop(self):
        return self.cluster.loop
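
The scale-down debounce in recommendations() above is the subtle part: a worker must keep being suggested for removal across consecutive checks (governed by wait_count) before it is actually closed. A self-contained sketch of just that bookkeeping, with hypothetical names:

def update_close_counts(close_counts, suggested, wait_count):
    """Mirror of the close-count bookkeeping above: carry counts only for workers
    still suggested, and close those whose previous count has reached wait_count."""
    new_counts, to_close = {}, []
    for worker, count in close_counts.items():
        if worker in suggested:
            if count >= wait_count:
                to_close.append(worker)
            else:
                new_counts[worker] = count
    for worker in suggested:
        new_counts[worker] = new_counts.get(worker, 0) + 1
    return new_counts, to_close

counts, closing = update_close_counts({}, {"tcp://worker-1:4321"}, wait_count=3)
# First sighting: counts == {"tcp://worker-1:4321": 1} and nothing is closed yet.
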