Example 1
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False, nanny=False,
                           diagnostics_port=None, loop=loop, start=False)
    cluster.scheduler.allowed_failures = 1000
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    futures = c.map(slowinc, range(100), delay=0.01)

    start = time()
    while not cluster.scheduler.worker_info:
        yield gen.sleep(0.01)
        assert time() < start + 15

    yield c._gather(futures)
    del futures

    start = time()
    while cluster.workers:
        yield gen.sleep(0.01)
        assert time() < start + 5

    assert not cluster.workers
    yield gen.sleep(0.2)
    assert not cluster.workers

    futures = c.map(slowinc, range(100), delay=0.01)
    yield c._gather(futures)

    yield c._shutdown()
    yield cluster._close()
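The coroutine tests above construct the Adaptive policy by hand so they can assert on scheduler internals. For everyday use the same scale-from-zero behaviour is usually reached through the cluster's adapt() helper; the following is a minimal synchronous sketch, assuming a reasonably recent dask.distributed where LocalCluster.adapt() attaches the adaptive policy, with a local slowinc-style task:

from time import sleep
from dask.distributed import Client, LocalCluster

def slowinc(x, delay=0.01):
    sleep(delay)
    return x + 1

# Start with no workers; the adaptive policy adds workers when tasks arrive
# and retires them again once the futures are released and work dries up.
cluster = LocalCluster(n_workers=0, processes=False)
cluster.adapt(minimum=0, maximum=4)

with Client(cluster) as client:
    futures = client.map(slowinc, range(100), delay=0.01)
    results = client.gather(futures)  # workers are created on demand
    del futures                       # ...and removed again when idle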
Example 2
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False, nanny=False,
                           diagnostics_port=None, loop=loop, start=False)
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    for i in range(20):
        futures = c.map(slowinc, range(100), delay=0.01)
        yield c._gather(futures)
        del futures
        yield gen.sleep(0.1)

    yield c._shutdown()
    yield cluster._close()
Example 3
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0,
                           scheduler_port=0,
                           silence_logs=False,
                           processes=False,
                           diagnostics_port=None,
                           loop=loop,
                           start=False)
    try:
        cluster.scheduler.allowed_failures = 1000
        alc = Adaptive(cluster.scheduler, cluster, interval=100)
        c = yield Client(cluster, asynchronous=True, loop=loop)

        futures = c.map(slowinc, range(100), delay=0.01)

        start = time()
        while not cluster.scheduler.worker_info:
            yield gen.sleep(0.01)
            assert time() < start + 15

        yield c._gather(futures)
        del futures

        start = time()
        while cluster.workers:
            yield gen.sleep(0.01)
            assert time() < start + 5

        assert not cluster.workers
        assert not cluster.scheduler.workers
        yield gen.sleep(0.2)
        assert not cluster.workers
        assert not cluster.scheduler.workers

        futures = c.map(slowinc, range(100), delay=0.01)
        yield c._gather(futures)

    finally:
        yield c._close()
        yield cluster._close()
Example 4
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0,
                           scheduler_port=0,
                           silence_logs=False,
                           nanny=False,
                           diagnostics_port=None,
                           loop=loop,
                           start=False)
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    for i in range(20):
        futures = c.map(slowinc, range(100), delay=0.01)
        yield c._gather(futures)
        del futures
        yield gen.sleep(0.1)

    yield c._shutdown()
    yield cluster._close()
Example 5
def test_get_scale_up_kwargs(loop):
    with LocalCluster(0, scheduler_port=0, silence_logs=False,
                      diagnostics_port=None, loop=loop) as cluster:

        alc = Adaptive(cluster.scheduler, cluster, interval=100,
                       scale_factor=3)
        assert alc.get_scale_up_kwargs() == {'n': 1}

        with Client(cluster, loop=loop) as c:
            future = c.submit(lambda x: x + 1, 1)
            assert future.result() == 2
            assert c.ncores()
            assert alc.get_scale_up_kwargs() == {'n': 3}
Example 6
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0,
                           scheduler_port=0,
                           silence_logs=False,
                           nanny=False,
                           diagnostics_port=None,
                           loop=loop,
                           start=False)
    cluster.scheduler.allowed_failures = 1000
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    futures = c.map(slowinc, range(100), delay=0.01)

    start = time()
    while not cluster.workers:
        yield gen.sleep(0.01)
        assert time() < start + 5

    yield c._gather(futures)
    del futures

    start = time()
    while cluster.workers:
        yield gen.sleep(0.01)
        assert time() < start + 5

    assert not cluster.workers
    yield gen.sleep(0.2)
    assert not cluster.workers

    futures = c.map(slowinc, range(100), delay=0.01)
    yield c._gather(futures)

    yield c._shutdown()
    yield cluster._close()
Example 7
def test_min_max():
    loop = IOLoop.current()
    cluster = yield LocalCluster(0, scheduler_port=0, silence_logs=False,
                                 processes=False, diagnostics_port=None,
                                 loop=loop, asynchronous=True)
    yield cluster._start()
    try:
        adapt = Adaptive(cluster.scheduler, cluster, minimum=1, maximum=2,
                         interval=20)
        c = yield Client(cluster, asynchronous=True, loop=loop)

        start = time()
        while not cluster.scheduler.workers:
            yield gen.sleep(0.01)
            assert time() < start + 1

        yield gen.sleep(0.2)
        assert len(cluster.scheduler.workers) == 1
        assert frequencies(pluck(1, adapt.log)) == {'up': 1}

        futures = c.map(slowinc, range(100), delay=0.1)

        start = time()
        while len(cluster.scheduler.workers) < 2:
            yield gen.sleep(0.01)
            assert time() < start + 1

        assert len(cluster.scheduler.workers) == 2
        yield gen.sleep(0.5)
        assert len(cluster.scheduler.workers) == 2
        assert len(cluster.workers) == 2
        assert frequencies(pluck(1, adapt.log)) == {'up': 2}

        del futures

        start = time()
        while len(cluster.scheduler.workers) != 1:
            yield gen.sleep(0.01)
            assert time() < start + 1
        assert frequencies(pluck(1, adapt.log)) == {'up': 2, 'down': 1}
    finally:
        yield c._close()
        yield cluster._close()
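test_min_max reads the adaptive decisions back out of adapt.log and counts them with toolz. The log entries are tuples whose second element is the action ('up' or 'down'), which is why the test plucks index 1. A small standalone illustration of that counting step, with made-up log entries shaped the way the test assumes:

from toolz import frequencies, pluck

# Hypothetical adapt.log contents: index 1 of each entry is the action,
# matching the frequencies(pluck(1, adapt.log)) assertions in the test.
log = [(0.00, 'up', {'n': 1}),
       (0.75, 'up', {'n': 2}),
       (2.10, 'down', ['tcp://127.0.0.1:40331'])]

assert frequencies(pluck(1, log)) == {'up': 2, 'down': 1}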
Example 8
def test_adaptive_local_cluster(loop):
    with LocalCluster(0, scheduler_port=0, silence_logs=False,
                      diagnostics_port=None, loop=loop) as cluster:
        alc = Adaptive(cluster.scheduler, cluster, interval=100)
        with Client(cluster, loop=loop) as c:
            assert not c.ncores()
            future = c.submit(lambda x: x + 1, 1)
            assert future.result() == 2
            assert c.ncores()

            sleep(0.1)
            assert c.ncores()  # still there after some time

            del future

            start = time()
            while cluster.scheduler.ncores:
                sleep(0.01)
                assert time() < start + 5

            assert not c.ncores()
Example 9
def test_avoid_churn():
    """ We want to avoid creating and deleting workers frequently

    Instead we want to wait a few beats before removing a worker in case the
    user is taking a brief pause between rounds of work.
    """
    cluster = yield LocalCluster(0, asynchronous=True, processes=False,
                                 scheduler_port=0, silence_logs=False,
                                 diagnostics_port=None)
    client = yield Client(cluster, asynchronous=True)
    try:
        adapt = Adaptive(cluster.scheduler, cluster, interval=20, wait_count=5)

        for i in range(10):
            yield client.submit(slowinc, i, delay=0.040)
            yield gen.sleep(0.040)

        assert frequencies(pluck(1, adapt.log)) == {'up': 1}
    finally:
        yield client._close()
        yield cluster._close()
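The churn test leans on Adaptive's wait_count parameter: a worker is only retired after it has been recommended for removal on several consecutive adaptive intervals, so brief pauses between tasks do not tear workers down. A hedged sketch of setting the same knob through adapt(), assuming the cluster forwards keyword arguments on to Adaptive as the tests above do:

from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=0, processes=False)
# Require five consecutive scale-down recommendations before a worker is
# actually removed, mirroring wait_count=5 in test_avoid_churn.
cluster.adapt(minimum=0, maximum=4, wait_count=5)
client = Client(cluster)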
Example 10
class KubeCluster(Cluster):
    """ Launch a Dask cluster on Kubernetes

    This starts a local Dask scheduler and then dynamically launches
    Dask workers on a Kubernetes cluster. The Kubernetes cluster is taken
    to be either the current one on which this code is running, or as a
    fallback, the default one configured in a kubeconfig file.

    **Environments**

    Your worker pod image should have a similar environment to your local
    environment, including versions of Python, dask, cloudpickle, and any
    libraries that you may wish to use (like NumPy, Pandas, or Scikit-Learn).
    See examples below for suggestions on how to manage and check for this.

    **Network**

    Since the Dask scheduler is launched locally, for it to work, we need to
    be able to open network connections between this local node and all the
    workers nodes on the Kubernetes cluster. If the current process is not
    already on a Kubernetes node, some network configuration will likely be
    required to make this work.

    **Resources**

    Your Kubernetes resource limits and requests should match the
    ``--memory-limit`` and ``--nthreads`` parameters given to the
    ``dask-worker`` command.

    Parameters
    ----------
    pod_template: kubernetes.client.V1PodSpec
        A Kubernetes specification for a Pod for a dask worker.
    name: str (optional)
        Name given to the pods.  Defaults to ``dask-$USER-random``
    namespace: str (optional)
        Namespace in which to launch the workers.
        Defaults to current namespace if available or "default"
    n_workers: int
        Number of workers on initial launch.
        Use ``scale_up`` to increase this number in the future
    env: Dict[str, str]
        Dictionary of environment variables to pass to worker pod
    host: str
        Listen address for local scheduler.  Defaults to 0.0.0.0
    port: int
        Port of local scheduler
    **kwargs: dict
        Additional keyword arguments to pass to LocalCluster

    Examples
    --------
    >>> from dask_kubernetes import KubeCluster, make_pod_spec
    >>> pod_spec = make_pod_spec(image='daskdev/dask:latest',
    ...                          memory_limit='4G', memory_request='4G',
    ...                          cpu_limit=1, cpu_request=1,
    ...                          env={'EXTRA_PIP_PACKAGES': 'fastparquet git+https://github.com/dask/distributed'})
    >>> cluster = KubeCluster(pod_spec)
    >>> cluster.scale_up(10)

    You can also create clusters with worker pod specifications as dictionaries
    or stored in YAML files

    >>> cluster = KubeCluster.from_yaml('worker-template.yml')
    >>> cluster = KubeCluster.from_dict({...})

    Rather than explicitly setting a number of workers you can also ask the
    cluster to allocate workers dynamically based on current workload

    >>> cluster.adapt()

    You can pass this cluster directly to a Dask client

    >>> from dask.distributed import Client
    >>> client = Client(cluster)

    You can verify that your local environment matches your worker environments
    by calling ``client.get_versions(check=True)``.  This will raise an
    informative error if versions do not match.

    >>> client.get_versions(check=True)

    The ``daskdev/dask`` docker images support ``EXTRA_PIP_PACKAGES``,
    ``EXTRA_APT_PACKAGES`` and ``EXTRA_CONDA_PACKAGES`` environment variables
    to help with small adjustments to the worker environments.  We recommend
    the use of pip over conda in this case due to a much shorter startup time.
    These environment variables can be modified directly from the KubeCluster
    constructor methods using the ``env=`` keyword.  You may list as many
    packages as you like in a single string like the following:

    >>> pip = 'pyarrow gcsfs git+https://github.com/dask/distributed'
    >>> conda = '-c conda-forge scikit-learn'
    >>> KubeCluster.from_yaml(..., env={'EXTRA_PIP_PACKAGES': pip,
    ...                                 'EXTRA_CONDA_PACKAGES': conda})

    You can also start a KubeCluster with no arguments *if* the worker template
    is specified in the Dask config files, either as a full template in
    ``kubernetes.worker-template`` or a path to a YAML file in
    ``kubernetes.worker-template-path``.

    See http://dask.pydata.org/en/latest/configuration.html for more
    information about setting configuration values.::

        $ export DASK_KUBERNETES__WORKER_TEMPLATE_PATH=worker_template.yaml

    >>> cluster = KubeCluster()  # automatically finds 'worker_template.yaml'

    See Also
    --------
    KubeCluster.from_yaml
    KubeCluster.from_dict
    KubeCluster.adapt
    """
    def __init__(
            self,
            pod_template=None,
            name=None,
            namespace=None,
            n_workers=None,
            host=None,
            port=None,
            env=None,
            **kwargs
    ):
        name = name or dask.config.get('kubernetes.name')
        namespace = namespace or dask.config.get('kubernetes.namespace')
        n_workers = n_workers if n_workers is not None else dask.config.get('kubernetes.count.start')
        host = host or dask.config.get('kubernetes.host')
        port = port if port is not None else dask.config.get('kubernetes.port')
        env = env if env is not None else dask.config.get('kubernetes.env')

        if not pod_template and dask.config.get('kubernetes.worker-template', None):
            d = dask.config.get('kubernetes.worker-template')
            pod_template = make_pod_from_dict(d)

        if not pod_template and dask.config.get('kubernetes.worker-template-path', None):
            import yaml
            fn = dask.config.get('kubernetes.worker-template-path')
            fn = fn.format(**os.environ)
            with open(fn) as f:
                d = yaml.safe_load(f)
            pod_template = make_pod_from_dict(d)

        if not pod_template:
            msg = ("Worker pod specification not provided. See KubeCluster "
                   "docstring for ways to specify workers")
            raise ValueError(msg)

        self.cluster = LocalCluster(ip=host or socket.gethostname(),
                                    scheduler_port=port,
                                    n_workers=0, **kwargs)
        try:
            kubernetes.config.load_incluster_config()
        except kubernetes.config.ConfigException:
            kubernetes.config.load_kube_config()

        self.core_api = kubernetes.client.CoreV1Api()

        if namespace is None:
            namespace = _namespace_default()
        
        name = name.format(user=getpass.getuser(),
                           uuid=str(uuid.uuid4())[:10],
                           **os.environ)
        name = escape(name)
        
        self.pod_template = clean_pod_template(pod_template)
        # Default labels that can't be overwritten
        self.pod_template.metadata.labels['dask.pydata.org/cluster-name'] = name
        self.pod_template.metadata.labels['user'] = escape(getpass.getuser())
        self.pod_template.metadata.labels['app'] = 'dask'
        self.pod_template.metadata.labels['component'] = 'dask-worker'
        self.pod_template.metadata.namespace = namespace

        self.pod_template.spec.containers[0].env.append(
            kubernetes.client.V1EnvVar(name='DASK_SCHEDULER_ADDRESS',
                                       value=self.scheduler_address)
        )
        if env:
            self.pod_template.spec.containers[0].env.extend([
                kubernetes.client.V1EnvVar(name=k, value=str(v))
                for k, v in env.items()
            ])
        self.pod_template.metadata.generate_name = name

        finalize(self, _cleanup_pods, self.namespace, self.pod_template.metadata.labels)

        if n_workers:
            self.scale(n_workers)

    @classmethod
    def from_dict(cls, pod_spec, **kwargs):
        """ Create cluster with worker pod spec defined by Python dictionary

        Examples
        --------
        >>> spec = {
        ...     'metadata': {},
        ...     'spec': {
        ...         'containers': [{
        ...             'args': ['dask-worker', '$(DASK_SCHEDULER_ADDRESS)',
        ...                      '--nthreads', '1',
        ...                      '--death-timeout', '60'],
        ...             'command': None,
        ...             'image': 'daskdev/dask:latest',
        ...             'name': 'dask-worker',
        ...         }],
        ...     'restartPolicy': 'Never',
        ...     }
        ... }
        >>> cluster = KubeCluster.from_dict(spec, namespace='my-ns')  # doctest: +SKIP

        See Also
        --------
        KubeCluster.from_yaml
        """
        return cls(make_pod_from_dict(pod_spec), **kwargs)

    @classmethod
    def from_yaml(cls, yaml_path, **kwargs):
        """ Create cluster with worker pod spec defined by a YAML file

        We can start a cluster with pods defined in an accompanying YAML file
        like the following:

        .. code-block:: yaml

            kind: Pod
            metadata:
              labels:
                foo: bar
                baz: quux
            spec:
              containers:
              - image: daskdev/dask:latest
                name: dask-worker
                args: [dask-worker, $(DASK_SCHEDULER_ADDRESS), --nthreads, '2', --memory-limit, 8GB]
              restartPolicy: Never

        Examples
        --------
        >>> cluster = KubeCluster.from_yaml('pod.yaml', namespace='my-ns')  # doctest: +SKIP

        See Also
        --------
        KubeCluster.from_dict
        """
        if not yaml:
            raise ImportError("PyYaml is required to use yaml functionality, please install it!")
        with open(yaml_path) as f:
            d = yaml.safe_load(f)
            return cls.from_dict(d, **kwargs)

    @property
    def namespace(self):
        return self.pod_template.metadata.namespace

    @property
    def name(self):
        return self.pod_template.metadata.generate_name

    def __repr__(self):
        return 'KubeCluster("%s", workers=%d)' % (self.scheduler.address,
                                                  len(self.pods()))

    @property
    def scheduler(self):
        return self.cluster.scheduler

    @property
    def scheduler_address(self):
        return self.scheduler.address

    def pods(self):
        """ A list of kubernetes pods corresponding to current workers

        See Also
        --------
        KubeCluster.logs
        """
        return self.core_api.list_namespaced_pod(
            self.namespace,
            label_selector=format_labels(self.pod_template.metadata.labels)
        ).items

    def logs(self, pod=None):
        """ Logs from a worker pod

        You can get this pod object from the ``pods`` method.

        If no pod is specified, logs from all pods are returned; on large
        clusters this can be a lot of data.

        Parameters
        ----------
        pod: kubernetes.client.V1Pod
            The pod from which we want to collect logs.

        See Also
        --------
        KubeCluster.pods
        Client.get_worker_logs
        """
        if pod is None:
            return {pod.status.pod_ip: self.logs(pod) for pod in self.pods()}

        return self.core_api.read_namespaced_pod_log(pod.metadata.name,
                                                     pod.metadata.namespace)

    def scale(self, n):
        """ Scale cluster to n workers

        Parameters
        ----------
        n: int
            Target number of workers

        Example
        -------
        >>> cluster.scale(10)  # scale cluster to ten workers

        See Also
        --------
        KubeCluster.scale_up
        KubeCluster.scale_down
        """
        pods = self._cleanup_terminated_pods(self.pods())
        if n >= len(pods):
            return self.scale_up(n, pods=pods)
        else:
            n_to_delete = len(pods) - n
            # Before trying to close running workers, check if we can cancel
            # pending pods (in case the kubernetes cluster was too full to
            # provision those pods in the first place).
            running_workers = list(self.scheduler.workers.keys())
            running_ips = set(urlparse(worker).hostname
                              for worker in running_workers)
            pending_pods = [p for p in pods
                            if p.status.pod_ip not in running_ips]
            if pending_pods:
                pending_to_delete = pending_pods[:n_to_delete]
                logger.debug("Deleting pending pods: %s", pending_to_delete)
                self._delete_pods(pending_to_delete)
                n_to_delete = n_to_delete - len(pending_to_delete)
                if n_to_delete <= 0:
                    return

            to_close = select_workers_to_close(self.scheduler, n_to_delete)
            logger.debug("Closing workers: %s", to_close)
            if len(to_close) < len(self.scheduler.workers):
                # Close workers cleanly to migrate any temporary results to
                # remaining workers.
                @gen.coroutine
                def f(to_close):
                    yield self.scheduler.retire_workers(
                        workers=to_close, remove=True, close_workers=True)
                    yield offload(self.scale_down, to_close)

                self.scheduler.loop.add_callback(f, to_close)
                return

            # Terminate all pods without waiting for clean worker shutdown
            self.scale_down(to_close)

    def _delete_pods(self, to_delete):
        for pod in to_delete:
            try:
                self.core_api.delete_namespaced_pod(
                    pod.metadata.name,
                    self.namespace,
                    kubernetes.client.V1DeleteOptions()
                )
                pod_info = pod.metadata.name
                if pod.status.reason:
                    pod_info += ' [{}]'.format(pod.status.reason)
                if pod.status.message:
                    pod_info += ' {}'.format(pod.status.message)
                logger.info('Deleted pod: %s', pod_info)
            except kubernetes.client.rest.ApiException as e:
                # If a pod has already been removed, just ignore the error
                if e.status != 404:
                    raise

    def _cleanup_terminated_pods(self, pods):
        terminated_phases = {'Succeeded', 'Failed'}
        terminated_pods = [p for p in pods if p.status.phase in terminated_phases]
        self._delete_pods(terminated_pods)
        return [p for p in pods if p.status.phase not in terminated_phases]

    def scale_up(self, n, pods=None, **kwargs):
        """
        Make sure we have n dask-workers available for this cluster

        Examples
        --------
        >>> cluster.scale_up(20)  # ask for twenty workers
        """
        maximum = dask.config.get('kubernetes.count.max')
        if maximum is not None and maximum < n:
            logger.info("Tried to scale beyond maximum number of workers %d > %d",
                        n, maximum)
            n = maximum
        pods = pods or self._cleanup_terminated_pods(self.pods())
        to_create = n - len(pods)
        new_pods = []
        for i in range(3):
            try:
                for _ in range(to_create):
                    new_pods.append(self.core_api.create_namespaced_pod(
                        self.namespace, self.pod_template))
                    to_create -= 1
                break
            except kubernetes.client.rest.ApiException as e:
                if e.status == 500 and 'ServerTimeout' in e.body:
                    logger.info("Server timeout, retry #%d", i + 1)
                    time.sleep(1)
                    last_exception = e
                    continue
                else:
                    raise
        else:
            raise last_exception

        return new_pods
        # fixme: wait for this to be ready before returning!

    def scale_down(self, workers, pods=None):
        """ Remove the pods for the requested list of workers

        When scale_down is called by the _adapt async loop, the workers are
        assumed to have been cleanly closed first and in-memory data has been
        migrated to the remaining workers.

        Note that when the worker process exits, Kubernetes leaves the pods in
        a 'Succeeded' state that we collect here.

        If some workers have not been closed, we just delete the pods with
        matching ip addresses.

        Parameters
        ----------
        workers: List[str]
            List of addresses of workers to close
        """
        # Get the existing worker pods
        pods = pods or self._cleanup_terminated_pods(self.pods())

        # Work out the list of pods that we are going to delete
        # Each worker to delete is given in the form "tcp://<worker ip>:<port>"
        # Convert this to a set of IPs
        ips = set(urlparse(worker).hostname for worker in workers)
        to_delete = [p for p in pods if p.status.pod_ip in ips]
        if not to_delete:
            return
        self._delete_pods(to_delete)

    def __enter__(self):
        return self

    def close(self):
        """ Close this cluster """
        self.scale_down(self.cluster.scheduler.workers)
        self.cluster.close()

    def __exit__(self, type, value, traceback):
        _cleanup_pods(self.namespace, self.pod_template.metadata.labels)
        self.cluster.__exit__(type, value, traceback)
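As the Resources note in the docstring says, the pod's memory/CPU requests and limits should agree with the --memory-limit and --nthreads flags handed to dask-worker. The make_pod_spec helper shown in the class docstring is meant to keep both sides consistent; a hedged end-to-end sketch using only calls that appear in the docstring above:

from dask.distributed import Client
from dask_kubernetes import KubeCluster, make_pod_spec

# make_pod_spec writes the same memory/CPU figures into the Kubernetes
# resource requests/limits and into the dask-worker command line.
pod_spec = make_pod_spec(image='daskdev/dask:latest',
                         memory_limit='4G', memory_request='4G',
                         cpu_limit=1, cpu_request=1)

cluster = KubeCluster(pod_spec)
cluster.adapt()                      # size the cluster to the workload
client = Client(cluster)
client.get_versions(check=True)      # do local and worker environments match?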
Example 11
    def __init__(
            self,
            pod_template=None,
            name=None,
            namespace=None,
            n_workers=None,
            host=None,
            port=None,
            env=None,
            **kwargs
    ):
        name = name or dask.config.get('kubernetes.name')
        namespace = namespace or dask.config.get('kubernetes.namespace')
        n_workers = n_workers if n_workers is not None else dask.config.get('kubernetes.count.start')
        host = host or dask.config.get('kubernetes.host')
        port = port if port is not None else dask.config.get('kubernetes.port')
        env = env if env is not None else dask.config.get('kubernetes.env')

        if not pod_template and dask.config.get('kubernetes.worker-template', None):
            d = dask.config.get('kubernetes.worker-template')
            pod_template = make_pod_from_dict(d)

        if not pod_template and dask.config.get('kubernetes.worker-template-path', None):
            import yaml
            fn = dask.config.get('kubernetes.worker-template-path')
            fn = fn.format(**os.environ)
            with open(fn) as f:
                d = yaml.safe_load(f)
            pod_template = make_pod_from_dict(d)

        if not pod_template:
            msg = ("Worker pod specification not provided. See KubeCluster "
                   "docstring for ways to specify workers")
            raise ValueError(msg)

        self.cluster = LocalCluster(ip=host or socket.gethostname(),
                                    scheduler_port=port,
                                    n_workers=0, **kwargs)
        try:
            kubernetes.config.load_incluster_config()
        except kubernetes.config.ConfigException:
            kubernetes.config.load_kube_config()

        self.core_api = kubernetes.client.CoreV1Api()

        if namespace is None:
            namespace = _namespace_default()
        
        name = name.format(user=getpass.getuser(),
                           uuid=str(uuid.uuid4())[:10],
                           **os.environ)
        name = escape(name)
        
        self.pod_template = clean_pod_template(pod_template)
        # Default labels that can't be overwritten
        self.pod_template.metadata.labels['dask.pydata.org/cluster-name'] = name
        self.pod_template.metadata.labels['user'] = escape(getpass.getuser())
        self.pod_template.metadata.labels['app'] = 'dask'
        self.pod_template.metadata.labels['component'] = 'dask-worker'
        self.pod_template.metadata.namespace = namespace

        self.pod_template.spec.containers[0].env.append(
            kubernetes.client.V1EnvVar(name='DASK_SCHEDULER_ADDRESS',
                                       value=self.scheduler_address)
        )
        if env:
            self.pod_template.spec.containers[0].env.extend([
                kubernetes.client.V1EnvVar(name=k, value=str(v))
                for k, v in env.items()
            ])
        self.pod_template.metadata.generate_name = name

        finalize(self, _cleanup_pods, self.namespace, self.pod_template.metadata.labels)

        if n_workers:
            self.scale(n_workers)
Example 12
class KubeCluster(Cluster):
    """ Launch a Dask cluster on Kubernetes

    This starts a local Dask scheduler and then dynamically launches
    Dask workers on a Kubernetes cluster.

    **Environments**

    Your worker pod image should have a similar environment to your local
    environment, including versions of Python, dask, cloudpickle, and any
    libraries that you may wish to use (like NumPy, Pandas, or Scikit-Learn).
    See examples below for suggestions on how to manage and check for this.

    **Resources**

    Your Kubernetes resource limits and requests should match the
    ``--memory-limit`` and ``--nthreads`` parameters given to the
    ``dask-worker`` command.

    Parameters
    ----------
    pod_template: kubernetes.client.V1PodSpec
        A Kubernetes specification for a Pod for a dask worker.
    name: str (optional)
        Name given to the pods.  Defaults to ``dask-$USER-random``
    namespace: str (optional)
        Namespace in which to launch the workers.
        Defaults to current namespace if available or "default"
    n_workers: int
        Number of workers on initial launch.
        Use ``scale_up`` to increase this number in the future
    env: Dict[str, str]
        Dictionary of environment variables to pass to worker pod
    host: str
        Listen address for local scheduler.  Defaults to 0.0.0.0
    port: int
        Port of local scheduler
    **kwargs: dict
        Additional keyword arguments to pass to LocalCluster

    Examples
    --------
    >>> from dask_kubernetes import KubeCluster, make_pod_spec
    >>> pod_spec = make_pod_spec(image='daskdev/dask:latest',
    ...                          memory_limit='4G', memory_request='4G',
    ...                          cpu_limit=1, cpu_request=1,
    ...                          env={'EXTRA_PIP_PACKAGES': 'fastparquet git+https://github.com/dask/distributed'})
    >>> cluster = KubeCluster(pod_spec)
    >>> cluster.scale_up(10)

    You can also create clusters with worker pod specifications as dictionaries
    or stored in YAML files

    >>> cluster = KubeCluster.from_yaml('worker-template.yml')
    >>> cluster = KubeCluster.from_dict({...})

    Rather than explicitly setting a number of workers you can also ask the
    cluster to allocate workers dynamically based on current workload

    >>> cluster.adapt()

    You can pass this cluster directly to a Dask client

    >>> from dask.distributed import Client
    >>> client = Client(cluster)

    You can verify that your local environment matches your worker environments
    by calling ``client.get_versions(check=True)``.  This will raise an
    informative error if versions do not match.

    >>> client.get_versions(check=True)

    The ``daskdev/dask`` docker images support ``EXTRA_PIP_PACKAGES``,
    ``EXTRA_APT_PACKAGES`` and ``EXTRA_CONDA_PACKAGES`` environment variables
    to help with small adjustments to the worker environments.  We recommend
    the use of pip over conda in this case due to a much shorter startup time.
    These environment variables can be modified directly from the KubeCluster
    constructor methods using the ``env=`` keyword.  You may list as many
    packages as you like in a single string like the following:

    >>> pip = 'pyarrow gcsfs git+https://github.com/dask/distributed'
    >>> conda = '-c conda-forge scikit-learn'
    >>> KubeCluster.from_yaml(..., env={'EXTRA_PIP_PACKAGES': pip,
    ...                                 'EXTRA_CONDA_PACKAGES': conda})

    You can also start a KubeCluster with no arguments *if* the YAML file
    defining the worker template is referred to in the
    ``DASKERNETES_WORKER_TEMPLATE_PATH`` environment variable

        $ export DASKERNETES_WORKER_TEMPLATE_PATH=worker_template.yaml

    >>> cluster = KubeCluster()  # automatically finds 'worker_template.yaml'

    See Also
    --------
    KubeCluster.from_yaml
    KubeCluster.from_dict
    KubeCluster.adapt
    """
    def __init__(self,
                 pod_template=None,
                 name=None,
                 namespace=None,
                 n_workers=0,
                 host='0.0.0.0',
                 port=0,
                 env=None,
                 **kwargs):
        if pod_template is None:
            if 'kubernetes-worker-template-path' in config:
                import yaml
                with open(config['kubernetes-worker-template-path']) as f:
                    d = yaml.safe_load(f)
                pod_template = make_pod_from_dict(d)
            else:
                msg = (
                    "Worker pod specification not provided. See KubeCluster "
                    "docstring for ways to specify workers")
                raise ValueError(msg)

        self.cluster = LocalCluster(ip=host or socket.gethostname(),
                                    scheduler_port=port,
                                    n_workers=0,
                                    **kwargs)
        try:
            kubernetes.config.load_incluster_config()
        except kubernetes.config.ConfigException:
            kubernetes.config.load_kube_config()

        self.core_api = kubernetes.client.CoreV1Api()

        if namespace is None:
            namespace = _namespace_default()

        if name is None:
            worker_name = config.get('kubernetes-worker-name',
                                     'dask-{user}-{uuid}')
            name = worker_name.format(user=getpass.getuser(),
                                      uuid=str(uuid.uuid4())[:10],
                                      **os.environ)

        self.pod_template = clean_pod_template(pod_template)
        # Default labels that can't be overwritten
        self.pod_template.metadata.labels[
            'dask.pydata.org/cluster-name'] = name
        self.pod_template.metadata.labels['app'] = 'dask'
        self.pod_template.metadata.labels['component'] = 'dask-worker'
        self.pod_template.metadata.namespace = namespace

        self.pod_template.spec.containers[0].env.append(
            kubernetes.client.V1EnvVar(name='DASK_SCHEDULER_ADDRESS',
                                       value=self.scheduler_address))
        if env:
            self.pod_template.spec.containers[0].env.extend([
                kubernetes.client.V1EnvVar(name=k, value=str(v))
                for k, v in env.items()
            ])
        self.pod_template.metadata.generate_name = name

        finalize(self, _cleanup_pods, self.namespace,
                 self.pod_template.metadata.labels)

        if n_workers:
            self.scale(n_workers)

    @classmethod
    def from_dict(cls, pod_spec, **kwargs):
        """ Create cluster with worker pod spec defined by Python dictionary

        Examples
        --------
        >>> spec = {
        ...     'metadata': {},
        ...     'spec': {
        ...         'containers': [{
        ...             'args': ['dask-worker', '$(DASK_SCHEDULER_ADDRESS)',
        ...                      '--nthreads', '1',
        ...                      '--death-timeout', '60'],
        ...             'command': None,
        ...             'image': 'daskdev/dask:latest',
        ...             'name': 'dask-worker',
        ...         }],
        ...     'restartPolicy': 'Never',
        ...     }
        ... }
        >>> cluster = KubeCluster.from_dict(spec, namespace='my-ns')  # doctest: +SKIP

        See Also
        --------
        KubeCluster.from_yaml
        """
        return cls(make_pod_from_dict(pod_spec), **kwargs)

    @classmethod
    def from_yaml(cls, yaml_path, **kwargs):
        """ Create cluster with worker pod spec defined by a YAML file

        We can start a cluster with pods defined in an accompanying YAML file
        like the following:

        .. code-block:: yaml

            kind: Pod
            metadata:
                labels:
                    foo: bar
                    baz: quux
            spec:
                containers:
                -   image: daskdev/dask:latest
                    name: dask-worker
                    args: [dask-worker, $(DASK_SCHEDULER_ADDRESS), --nthreads, '2', --memory-limit, 8GB]
                restartPolicy: Never

        Examples
        --------
        >>> cluster = KubeCluster.from_yaml('pod.yaml', namespace='my-ns')  # doctest: +SKIP

        See Also
        --------
        KubeCluster.from_dict
        """
        if not yaml:
            raise ImportError(
                "PyYaml is required to use yaml functionality, please install it!"
            )
        with open(yaml_path) as f:
            d = yaml.safe_load(f)
            return cls.from_dict(d, **kwargs)

    @property
    def namespace(self):
        return self.pod_template.metadata.namespace

    @property
    def name(self):
        return self.pod_template.metadata.generate_name

    @property
    def scheduler(self):
        return self.cluster.scheduler

    @property
    def scheduler_address(self):
        return self.scheduler.address

    def pods(self):
        """ A list of kubernetes pods corresponding to current workers

        See Also
        --------
        KubeCluster.logs
        """
        return self.core_api.list_namespaced_pod(
            self.namespace,
            label_selector=format_labels(
                self.pod_template.metadata.labels)).items

    def logs(self, pod):
        """ Logs from a worker pod

        You can get this pod object from the ``pods`` method.

        Parameters
        ----------
        pod: kubernetes.client.V1Pod
            The pod from which we want to collect logs.

        See Also
        --------
        KubeCluster.pods
        Client.get_worker_logs
        """
        return self.core_api.read_namespaced_pod_log(pod.metadata.name,
                                                     pod.metadata.namespace)

    def scale(self, n):
        """ Scale cluster to n workers

        Parameters
        ----------
        n: int
            Target number of workers

        Example
        -------
        >>> cluster.scale(10)  # scale cluster to ten workers

        See Also
        --------
        KubeCluster.scale_up
        KubeCluster.scale_down
        """
        pods = self.pods()
        if n >= len(pods):
            return self.scale_up(n, pods=pods)
        else:
            to_close = select_workers_to_close(self.scheduler, len(pods) - n)
            logger.debug("Closing workers: %s", to_close)
            return self.scale_down(to_close)

    def scale_up(self, n, pods=None, **kwargs):
        """
        Make sure we have n dask-workers available for this cluster

        Examples
        --------
        >>> cluster.scale_up(20)  # ask for twenty workers
        """
        pods = pods or self.pods()

        for i in range(3):
            try:
                out = [
                    self.core_api.create_namespaced_pod(
                        self.namespace, self.pod_template)
                    for _ in range(n - len(pods))
                ]
                break
            except kubernetes.client.rest.ApiException as e:
                if e.status == 500 and 'ServerTimeout' in e.body:
                    logger.info("Server timeout, retry #%d", i + 1)
                    time.sleep(1)
                    last_exception = e
                    continue
                else:
                    raise
        else:
            raise last_exception

        return out
        # fixme: wait for this to be ready before returning!

    def scale_down(self, workers):
        """
        When the worker process exits, Kubernetes leaves the pods in a completed
        state. Kill them when we are asked to.

        Parameters
        ----------
        workers: List[str]
            List of addresses of workers to close
        """
        # Get the existing worker pods
        pods = self.pods()

        # Work out pods that we are going to delete
        # Each worker to delete is given in the form "tcp://<worker ip>:<port>"
        # Convert this to a set of IPs
        ips = set(urlparse(worker).hostname for worker in workers)
        to_delete = [
            p for p in pods
            # Every time we run, purge any completed pods as well as the specified ones
            if p.status.phase == 'Succeeded' or p.status.pod_ip in ips
        ]
        if not to_delete:
            return
        for pod in to_delete:
            try:
                self.core_api.delete_namespaced_pod(
                    pod.metadata.name, self.namespace,
                    kubernetes.client.V1DeleteOptions())
                logger.info('Deleted pod: %s', pod.metadata.name)
            except kubernetes.client.rest.ApiException as e:
                # If a pod has already been removed, just ignore the error
                if e.status != 404:
                    raise

    def __enter__(self):
        return self

    def close(self):
        """ Close this cluster """
        self.scale_down(self.cluster.scheduler.workers)
        self.cluster.close()

    def __exit__(self, type, value, traceback):
        _cleanup_pods(self.namespace, self.pod_template.metadata.labels)
        self.cluster.__exit__(type, value, traceback)
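Both KubeCluster variants above translate worker addresses of the form tcp://<ip>:<port> into pod IPs with urlparse(...).hostname before matching them against each pod's status.pod_ip. A tiny standalone illustration of that step, with made-up addresses:

from urllib.parse import urlparse

workers = ['tcp://10.12.0.7:43215', 'tcp://10.12.0.9:36521']  # hypothetical
# scale_down compares these hostnames against each pod's status.pod_ip
ips = {urlparse(address).hostname for address in workers}
assert ips == {'10.12.0.7', '10.12.0.9'}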
Example 13
    def __init__(self,
                 pod_template=None,
                 name=None,
                 namespace=None,
                 n_workers=None,
                 host=None,
                 port=None,
                 env=None,
                 auth=ClusterAuth.DEFAULT,
                 **kwargs):
        name = name or dask.config.get("kubernetes.name")
        namespace = namespace or dask.config.get("kubernetes.namespace")
        n_workers = (n_workers if n_workers is not None else
                     dask.config.get("kubernetes.count.start"))
        host = host or dask.config.get("kubernetes.host")
        port = port if port is not None else dask.config.get("kubernetes.port")
        env = env if env is not None else dask.config.get("kubernetes.env")

        if not pod_template and dask.config.get("kubernetes.worker-template",
                                                None):
            d = dask.config.get("kubernetes.worker-template")
            d = dask.config.expand_environment_variables(d)
            pod_template = make_pod_from_dict(d)

        if not pod_template and dask.config.get(
                "kubernetes.worker-template-path", None):
            import yaml

            fn = dask.config.get("kubernetes.worker-template-path")
            fn = fn.format(**os.environ)
            with open(fn) as f:
                d = yaml.safe_load(f)
            d = dask.config.expand_environment_variables(d)
            pod_template = make_pod_from_dict(d)

        if not pod_template:
            msg = ("Worker pod specification not provided. See KubeCluster "
                   "docstring for ways to specify workers")
            raise ValueError(msg)

        pod_template = clean_pod_template(pod_template)
        ClusterAuth.load_first(auth)

        self.core_api = kubernetes.client.CoreV1Api()

        if namespace is None:
            namespace = _namespace_default()

        name = name.format(user=getpass.getuser(),
                           uuid=str(uuid.uuid4())[:10],
                           **os.environ)
        name = escape(name)
        self.pod_template = pod_template

        # Default labels that can't be overwritten
        self.pod_template.metadata.labels["dask.org/cluster-name"] = name
        self.pod_template.metadata.labels["user"] = escape(getpass.getuser())
        self.pod_template.metadata.labels["app"] = "dask"
        self.pod_template.metadata.labels["component"] = "dask-worker"
        self.pod_template.metadata.namespace = namespace

        self.cluster = LocalCluster(host=host or socket.gethostname(),
                                    scheduler_port=port,
                                    n_workers=0,
                                    **kwargs)

        # TODO: handle any exceptions here, ensure self.cluster is properly
        # cleaned up.
        self.pod_template.spec.containers[0].env.append(
            kubernetes.client.V1EnvVar(name="DASK_SCHEDULER_ADDRESS",
                                       value=self.scheduler_address))
        if env:
            self.pod_template.spec.containers[0].env.extend([
                kubernetes.client.V1EnvVar(name=k, value=str(v))
                for k, v in env.items()
            ])
        self.pod_template.metadata.generate_name = name

        finalize(self, _cleanup_pods, self.namespace,
                 self.pod_template.metadata.labels)

        if n_workers:
            try:
                self.scale(n_workers)
            except Exception:
                self.cluster.close()
                raise