Exemple #1
0
    def _widget(self):
        """ Create IPython widget for display within a notebook """
        try:
            return self._cached_widget
        except AttributeError:
            pass

        from ipywidgets import Layout, VBox, HBox, IntText, Button, HTML

        client = self._dask_client()

        layout = Layout(width='150px')

        title = HTML('<h2>YarnCluster</h2>')

        status = HTML(self._widget_status(), layout=Layout(min_width='150px'))

        request = IntText(0, description='Workers', layout=layout)
        scale = Button(description='Scale', layout=layout)

        @scale.on_click
        def scale_cb(b):
            with log_errors():
                self.scale(request.value)

        elements = [title, HBox([status, request, scale])]

        if self.dashboard_link is not None:
            link = HTML('<p><b>Dashboard: </b><a href="%s" target="_blank">%s'
                        '</a></p>\n' %
                        (self.dashboard_link, self.dashboard_link))
            elements.append(link)

        self._cached_widget = box = VBox(elements)

        def update():
            status.value = self._widget_status()

        pc = PeriodicCallback(update, 500, io_loop=client.loop)
        pc.start()

        return box
Exemple #2
0
    def _widget(self):
        """ Create IPython widget for display within a notebook """
        try:
            return self._cached_widget
        except AttributeError:
            pass

        from ipywidgets import Layout, VBox, HBox, IntText, Button, HTML

        client = self._dask_client()

        layout = Layout(width='150px')

        title = HTML('<h2>YarnCluster</h2>')

        status = HTML(self._widget_status(), layout=Layout(min_width='150px'))

        request = IntText(0, description='Workers', layout=layout)
        scale = Button(description='Scale', layout=layout)

        @scale.on_click
        def scale_cb(b):
            with log_errors():
                self.scale(request.value)

        box = VBox([title,
                    HBox([status, request, scale])])

        self._cached_widget = box

        def update():
            status.value = self._widget_status()

        pc = PeriodicCallback(update, 500, io_loop=client.loop)
        pc.start()

        return box
Exemple #3
0
class DRMAACluster(Cluster):
    def __init__(self, template=None, cleanup_interval=1000, hostname=None,
                 script=None, preexec_commands=(), copy_script=True,
                 ip='',
                 **kwargs):
        """
        Dask workers launched by a DRMAA-compatible cluster

        Parameters
        ----------
        template: dict
            Dictionary specifying options to pass to the DRMAA cluster
            and the worker. Relevant items are:

            jobName: string
                Name of the job as known by the DRMAA cluster.
            args: list
                Extra string arguments to pass to dask-worker
            outputPath: string
                Path to the dask-worker stdout. Must start with ':'.
                Defaults to worker.JOBID.TASKID.out in current directory.
            errorPath: string
                Path to the dask-worker stderr. Must start with ':'
                Defaults to worker.JOBID.TASKID.err in current directory.
            workingDirectory: string
                Where dask-worker runs, defaults to current directory
            nativeSpecification: string
                Options native to the job scheduler

        cleanup_interval: int
            Time interval in seconds at which closed workers are cleaned.
            Defaults to 1000
        hostname: string
            Host on which to start the local scheduler, defaults to localhost
        script: string (optional)
            Path to the dask-worker executable script.
            A temporary file will be made if none is provided (recommended)
        preexec_commands: tuple (optional)
            Commands to be executed first by temporary script. Cannot be
            specified at the same time as script.
        copy_script: bool
            Whether should copy the passed script to the current working
            directory. This is primarily to work around an issue with SGE.
        ip: string
            IP of the scheduler, default is the empty string
            which will listen on the primary ip address of the host
        **kwargs:
            Additional keyword arguments to be passed to the local scheduler

        Examples
        --------
        >>> from dask_drmaa import DRMAACluster          # doctest: +SKIP
        >>> cluster = DRMAACluster()                     # doctest: +SKIP
        >>> cluster.start_workers(10)                    # doctest: +SKIP

        >>> from distributed import Client               # doctest: +SKIP
        >>> client = Client(cluster)                     # doctest: +SKIP

        >>> future = client.submit(lambda x: x + 1, 10)  # doctest: +SKIP
        >>> future.result()                              # doctest: +SKIP
        11
        """
        self.hostname = hostname or socket.gethostname()
        logger.info("Start local scheduler at %s", self.hostname)
        self.local_cluster = LocalCluster(n_workers=0, ip=ip, **kwargs)

        if script is None:
            fn = os.path.abspath(tempfile.mktemp(
                suffix='.sh',
                prefix='dask-worker-script-',
                dir=os.path.curdir,
            ))
            self.script = fn
            self._should_cleanup_script = True

            script_contents = make_job_script(executable=worker_bin_path,
                                              name='%s.%s' % (JOB_ID, TASK_ID),
                                              preexec=preexec_commands)
            with open(fn, 'wt') as f:
                f.write(script_contents)

            @atexit.register
            def remove_script():
                if os.path.exists(fn):
                    os.remove(fn)

            os.chmod(self.script, 0o777)

        else:
            self._should_cleanup_script = False
            if copy_script:
                with ignoring(EnvironmentError):  # may be in the same path
                    shutil.copy(script, os.path.curdir)  # python 2.x returns None
                    script = os.path.join(os.path.curdir, os.path.basename(script))
                    self._should_cleanup_script = True
            self.script = os.path.abspath(script)
            assert not preexec_commands, "Cannot specify both script and preexec_commands"

        # TODO: check that user-provided script is executable

        self.template = merge(default_template,
                              {'remoteCommand': self.script},
                              template or {})

        self._cleanup_callback = PeriodicCallback(callback=self.cleanup_closed_workers,
                                                  callback_time=cleanup_interval,
                                                  io_loop=self.scheduler.loop)
        self._cleanup_callback.start()

        self.workers = {}  # {job-id: WorkerSpec}

    def adapt(self, **kwargs):
        """ Turn on adaptivity

        For keyword arguments see dask_drmaa.adaptive.Adaptive

        Examples
        --------
        >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')

        See Also
        --------
        Cluster: an interface for other clusters to inherit from
        """
        from .adaptive import Adaptive

        with ignoring(AttributeError):
            self._adaptive.stop()
        if not hasattr(self, '_adaptive_options'):
            self._adaptive_options = {}

        self._adaptive_options.update(kwargs)
        self._adaptive = Adaptive(
            self, self.scheduler, **self._adaptive_options
        )

        return self._adaptive

    @gen.coroutine
    def _start(self):
        pass

    @property
    def scheduler(self):
        return self.local_cluster.scheduler

    def create_job_template(self, **kwargs):
        template = self.template.copy()
        if kwargs:
            template.update(kwargs)
        template['args'] = [self.scheduler_address] + template['args']

        jt = get_session().createJobTemplate()
        valid_attributes = dir(jt)

        for key, value in template.items():
            if key not in valid_attributes:
                raise ValueError("Invalid job template attribute %s" % key)
            setattr(jt, key, value)

        return jt

    def start_workers(self, n=1, **kwargs):
        if n == 0:
            return

        with log_errors():
            with self.create_job_template(**kwargs) as jt:
                ids = get_session().runBulkJobs(jt, 1, n, 1)
                logger.info("Start %d workers. Job ID: %s", len(ids), ids[0].split('.')[0])
                self.workers.update(
                    {jid: WorkerSpec(job_id=jid, kwargs=kwargs,
                                     stdout=worker_out_path_template % dict(jid=jid, ext='out'),
                                     stderr=worker_out_path_template % dict(jid=jid, ext='err'),
                                     )
                     for jid in ids})

    @gen.coroutine
    def stop_workers(self, worker_ids, sync=False):
        if isinstance(worker_ids, str):
            worker_ids = [worker_ids]
        elif worker_ids:
            worker_ids = list(worker_ids)
        else:
            return

        # Let the scheduler gracefully retire workers first
        ids_to_ips = {
            v['name']: k for k, v in self.scheduler.worker_info.items()
        }
        worker_ips = [ids_to_ips[wid]
                      for wid in worker_ids
                      if wid in ids_to_ips]
        retired = yield self.scheduler.retire_workers(workers=worker_ips,
                                                      close_workers=True)
        logger.info("Retired workers %s", retired)
        for wid in list(worker_ids):
            try:
                get_session().control(wid, drmaa.JobControlAction.TERMINATE)
            except drmaa.errors.InvalidJobException:
                pass
            try:
                self.workers.pop(wid)
            except KeyError:
                # If we have multiple callers at once, it may have already
                # been popped off
                pass

        logger.info("Stop workers %s", worker_ids)
        if sync:
            get_session().synchronize(worker_ids, dispose=True)

    @gen.coroutine
    def scale_up(self, n, **kwargs):
        yield [self.start_workers(**kwargs)
               for _ in range(n - len(self.workers))]

    @gen.coroutine
    def scale_down(self, workers):
        workers = set(workers)
        yield self.scheduler.retire_workers(workers=workers)

    def close(self):
        logger.info("Closing DRMAA cluster")
        self.stop_workers(self.workers, sync=True)

        self.local_cluster.close()
        if self._should_cleanup_script and os.path.exists(self.script):
            os.remove(self.script)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def cleanup_closed_workers(self):
        for jid in list(self.workers):
            if get_session().jobStatus(jid) in ('closed', 'done'):
                logger.info("Removing closed worker %s", jid)
                del self.workers[jid]

    def __del__(self):
        try:
            self.close()
        except:
            pass

    def __str__(self):
        return "<%s: %d workers>" % (self.__class__.__name__, len(self.workers))

    __repr__ = __str__
    def _widget(self):
        """ Create IPython widget for display within a notebook """
        try:
            return self._cached_widget
        except AttributeError:
            pass

        from ipywidgets import (
            Layout,
            VBox,
            HBox,
            IntText,
            Button,
            HTML,
            Accordion,
            Text,
        )

        layout = Layout(width="150px")

        if "dashboard" in self.scheduler.services:
            link = self.dashboard_link
            link = '<p><b>Dashboard: </b><a href="%s" target="_blank">%s</a></p>\n' % (
                link,
                link,
            )
        else:
            link = ""

        title = "<h2>%s</h2>" % type(self).__name__
        title = HTML(title)
        dashboard = HTML(link)

        status = HTML(self._widget_status(), layout=Layout(min_width="150px"))

        request = IntText(0, description="Workers", layout=layout)
        scale = Button(description="Scale", layout=layout)
        request_cores = IntText(0, description="Cores", layout=layout)
        scale_cores = Button(description="Scale", layout=layout)
        request_memory = Text("O GB", description="Memory", layout=layout)
        scale_memory = Button(description="Scale", layout=layout)

        minimum = IntText(0, description="Minimum", layout=layout)
        maximum = IntText(0, description="Maximum", layout=layout)
        adapt = Button(description="Adapt", layout=layout)
        minimum_cores = IntText(0, description="Min cores", layout=layout)
        maximum_cores = IntText(0, description="Max cores", layout=layout)
        adapt_cores = Button(description="Adapt", layout=layout)
        minimum_mem = Text("0 GB", description="Min memory", layout=layout)
        maximum_mem = Text("0 GB", description="Max memory", layout=layout)
        adapt_mem = Button(description="Adapt", layout=layout)

        scale_hbox = [HBox([request, scale])]
        adapt_hbox = [HBox([minimum, maximum, adapt])]
        if hasattr(self, "jobqueue_worker_spec"):
            scale_hbox.append(HBox([request_cores, scale_cores]))
            scale_hbox.append(HBox([request_memory, scale_memory]))
            adapt_hbox.append(HBox([minimum_cores, maximum_cores,
                                    adapt_cores]))
            adapt_hbox.append(HBox([minimum_mem, maximum_mem, adapt_mem]))

        accordion = Accordion(
            [VBox(scale_hbox), VBox(adapt_hbox)],
            layout=Layout(min_width="500px"))
        accordion.selected_index = None
        accordion.set_title(0, "Manual Scaling")
        accordion.set_title(1, "Adaptive Scaling")

        box = VBox([title, HBox([status, accordion]), dashboard])

        self._cached_widget = box

        def adapt_cb(b):
            self.adapt(minimum=minimum.value, maximum=maximum.value)

        def adapt_cores_cb(b):
            self.adapt(minimum_cores=minimum_cores.value,
                       maximum_cores=maximum_cores.value)

        def adapt_mem_cb(b):
            self.adapt(minimum_memory=minimum_mem.value,
                       maximum_memory=maximum_mem.value)

        adapt.on_click(adapt_cb)
        adapt_cores.on_click(adapt_cores_cb)
        adapt_mem.on_click(adapt_mem_cb)

        def scale_cb(request, kwarg):
            def request_cb(b):
                with log_errors():
                    arg = request.value
                    with ignoring(AttributeError):
                        self._adaptive.stop()
                    local_kwargs = dict()
                    local_kwargs[kwarg] = arg
                    self.scale(**local_kwargs)

            return request_cb

        scale.on_click(scale_cb(request, "n"))
        scale_cores.on_click(scale_cb(request_cores, "cores"))
        scale_memory.on_click(scale_cb(request_memory, "memory"))

        def update():
            status.value = self._widget_status()

        pc = PeriodicCallback(update, 500, io_loop=self.scheduler.loop)
        self.scheduler.periodic_callbacks["cluster-repr"] = pc
        pc.start()

        return box