Example no. 1
0
def dask_gateway_object():
    """Build a Dask Gateway client for use from outside the cluster.

    A fresh JupyterHub API token is exported into the environment first so
    the ``'jupyterhub'`` auth scheme can pick it up, then the client is
    pointed at the QHub gateway endpoint and scheduler proxy.
    """
    os.environ['JUPYTERHUB_API_TOKEN'] = get_jupyterhub_token()
    gateway_address = f'https://{constants.QHUB_HOSTNAME}/{constants.GATEWAY_ENDPOINT}'
    scheduler_proxy = f'tcp://{constants.QHUB_HOSTNAME}:8786'
    return dask_gateway.Gateway(
        address=gateway_address,
        auth='jupyterhub',
        proxy_address=scheduler_proxy
    )
Example no. 2
0
async def gateway():
    """Yield an asynchronous Gateway client against the test deployment.

    Any clusters already running are stopped before the gateway is handed
    over, and any clusters left behind are stopped again afterwards, so
    each consumer starts and ends with a clean slate.
    """
    address = os.environ.get("TEST_DASK_GATEWAY_KUBE_ADDRESS",
                             "http://localhost:8000")
    credentials = dask_gateway.BasicAuth(username="******")
    async with dask_gateway.Gateway(address=address, asynchronous=True,
                                    auth=credentials) as gw:
        # clean up leftovers from earlier runs
        for report in await gw.list_clusters():
            await gw.stop_cluster(report.name)

        yield gw

        # tear down anything created but not stopped by the consumer
        for report in await gw.list_clusters():
            await gw.stop_cluster(report.name)
Example no. 3
0
async def _connect_with_gateway_and_create_cluster(
    endpoint: AnyUrl, auth_params: ClusterAuthentication
) -> DaskSubSystem:
    """Connect to a dask gateway and return a fully wired DaskSubSystem.

    When the gateway already hosts a cluster it is re-used (there shall be
    at most one); otherwise a new one is created. In both cases the cluster
    outlives this client (``shutdown_on_close=False``).

    :raises ConfigurationError: the endpoint/auth combination is invalid
    :raises DaskClientRequestError: the gateway rejected the request (404/422)
    :raises DaskClusterError: the cluster operation could not complete (409)
    :raises DaskGatewayServerError: the gateway server itself failed (500)
    """
    try:
        auth = await get_gateway_auth_from_params(auth_params)
        gateway = dask_gateway.Gateway(
            address=f"{endpoint}", auth=auth, asynchronous=True
        )
        # if there is already a cluster that means we can re-connect to it,
        # and IT SHALL BE the first in the list
        reports = await gateway.list_clusters()
        if reports:
            assert (
                len(reports) == 1
            ), "More than 1 cluster at this location, that is unexpected!!"  # nosec
            cluster = await gateway.connect(
                reports[0].name, shutdown_on_close=False
            )
        else:
            cluster = await gateway.new_cluster(shutdown_on_close=False)
        assert cluster  # nosec
        logger.info("Cluster dashboard available: %s", cluster.dashboard_link)
        # NOTE: we scale to 1 worker as they are global
        await cluster.adapt(active=True)
        client = await cluster.get_client()  # type: ignore
        assert client  # nosec
        return DaskSubSystem(
            client=client,
            scheduler_id=client.scheduler_info()["id"],
            gateway=gateway,
            gateway_cluster=cluster,
        )
    except TypeError as exc:
        raise ConfigurationError(
            f"Cluster has invalid configuration: {endpoint=}, {auth_params=}"
        ) from exc
    except ValueError as exc:
        # this is when a 404=NotFound,422=MalformedData comes up
        raise DaskClientRequestError(endpoint=endpoint, error=exc) from exc
    except dask_gateway.GatewayClusterError as exc:
        # this is when a 409=Conflict/Cannot complete request comes up
        raise DaskClusterError(endpoint=endpoint, error=exc) from exc
    except dask_gateway.GatewayServerError as exc:
        # this is when a 500 comes up
        raise DaskGatewayServerError(endpoint=endpoint, error=exc) from exc
Example no. 4
0
async def test_gateway_endpoint(
    endpoint: AnyUrl, authentication: ClusterAuthentication
) -> None:
    """Try to reach a gateway endpoint, raising ConfigurationError on failure.

    :raises ConfigurationError: contains some information as to why the connection failed
    """
    try:
        auth = await get_gateway_auth_from_params(authentication)
        async with dask_gateway.Gateway(
            address=f"{endpoint}", auth=auth, asynchronous=True
        ) as gateway:
            # Entering the Gateway context opens no connection yet, and the
            # dask-gateway ping helper has a very long timeout, so we probe
            # the REST API ourselves with a short timeout instead.
            transport = httpx.AsyncHTTPTransport(retries=2)
            async with httpx.AsyncClient(transport=transport) as client:
                # /api/version is cheap and should answer fast
                response = await client.get(
                    f"{endpoint}/api/version", timeout=_PING_TIMEOUT_S
                )
                response.raise_for_status()
                # listing clusters proves the gateway answers sensibly
                await gateway.list_clusters()

            logger.debug("Pinging %s, succeeded", f"{endpoint=}")
    except (
        dask_gateway.GatewayServerError,
        ClientConnectionError,
        ClientResponseError,
        httpx.HTTPError,
    ) as exc:
        logger.debug("Pinging %s, failed: %s", f"{endpoint=}", f"{exc=!r}")
        raise ConfigurationError(
            f"Could not connect to cluster in {endpoint}: error: {exc}"
        ) from exc
def _get_cluster_dask_gateway(**kwargs):
    """
    Start dask.kubernetes cluster and dask.distributed client

    All arguments are optional. If not provided, defaults will be used. To view
    defaults, instantiate a :class:`dask_gateway.Gateway` object and call
    `gateway.cluster_options()`.

    Parameters
    ----------
    name : str, optional
        Name of worker image to use (e.g. ``rhodium/worker:latest``). If ``None``
        (default), default to worker specified in ``template_path``.
    tag : str, optional
        Tag of the worker image to use. Cannot be used in combination with
        ``name``, which should include a tag. If provided, overrides the
        tag of the image specified in ``template_path``. If ``None``
        (default), the full image specified in ``name`` or ``template_path``
        is used.
    extra_pip_packages : str, optional
        Extra pip packages to install on worker. Packages are installed
        using ``pip install extra_pip_packages``.
    profile : One of ["micro", "standard", "big", "giant"]
        Determines size of worker. CPUs assigned are slightly under 1, 2, 4, and 8,
        respectively. Memory assigned is slightly over 6, 12, 24, and 48 GB,
        respectively.
    cpus : float, optional
        Set the CPUs requested for your workers as defined by ``profile``. Will
        raise error if >7.25, because our 8-CPU nodes need ~.5 vCPU for kubernetes
        pods.
        (NOTE 12/15/20: This is currently set to 1 by default to allow for mapping
        big workflows across inputs, see
        https://github.com/dask/dask-gateway/issues/364).
    cred_name : str, optional
        Name of Google Cloud credentials file to use, equivalent to providing
        ``cred_path='/opt/gcsfuse_tokens/{}.json'.format(cred_name)``. May not use
        if ``cred_path`` is specified.
    cred_path : str, optional
        Path to Google Cloud credentials file to use. May not use if ``cred_name`` is
        specified.
    env_items : dict, optional
        A dictionary of env variable 'name'-'value' pairs to append to the env
        variables included in ``template_path``, e.g.

        .. code-block:: python

            {
                'MY_ENV_VAR': 'some string',
            }

    extra_worker_labels : dict, optional
        Dictionary of kubernetes labels to apply to pods. None (default) results
        in no additional labels besides those in the template, as well as
        ``jupyter_user``, which is inferred from the ``JUPYTERHUB_USER``, or, if
        not set, the server's hostname.
    extra_pod_tolerations : list of dict, optional
        List of pod toleration dictionaries. For example, to match a node pool
        NoSchedule toleration, you might provide:

        .. code-block:: python

            extra_pod_tolerations=[
                {
                    "effect": "NoSchedule",
                    "key": "k8s.dask.org_dedicated",
                    "operator": "Equal",
                    "value": "worker-highcpu"
                },
                {
                    "effect": "NoSchedule",
                    "key": "k8s.dask.org/dedicated",
                    "operator": "Equal",
                    "value": "worker-highcpu"
                }
            ]

    keep_default_tolerations : bool, optional
        Whether to append (default) or replace the default tolerations. Ignored if
        ``extra_pod_tolerations`` is ``None`` or has length 0.

    Returns
    -------
    client : object
        :py:class:`dask.distributed.Client` connected to cluster
    cluster : object
        Pre-configured :py:class:`dask_gateway.GatewayCluster`


    See Also
    --------
    :py:func:`get_micro_cluster` :
        A cluster with one-CPU workers
    :py:func:`get_standard_cluster` :
        The default cluster specification
    :py:func:`get_big_cluster` :
        A cluster with workers twice the size of the default
    :py:func:`get_giant_cluster` :
        A cluster with workers four times the size of the default
    """

    gateway = dask_gateway.Gateway()
    default_options = gateway.cluster_options()

    new_kwargs = kwargs.copy()

    # our 8-CPU nodes need ~.5 vCPU of headroom for kubernetes pods
    if new_kwargs.get("cpus", 0) > 7.25:
        raise ValueError("Must specify ``cpus`` <= 7.25")

    # translate this wrapper's kwarg names into dask-gateway cluster options
    for k, v in kwargs.items():
        if k == "name":
            new_kwargs["worker_image"] = kwargs["name"]
            del new_kwargs["name"]
        elif k == "cred_path":
            if "cred_name" not in kwargs:
                new_kwargs["cred_name"] = Path(v).stem
            del new_kwargs["cred_path"]
        elif k == "extra_pod_tolerations":
            # start from an empty toleration set only when the caller
            # explicitly opted out of the defaults
            if not kwargs.get("keep_default_tolerations", True):
                base_tols = {}
            else:
                base_tols = default_options.worker_tolerations
            new_kwargs.pop("keep_default_tolerations", None)
            new_kwargs["worker_tolerations"] = {
                **base_tols,
                **{
                    f"user_{key}": val
                    for key, val in enumerate(new_kwargs.pop("extra_pod_tolerations"))
                },
            }
        elif k == "keep_default_tolerations":
            # BUGFIX: this documented kwarg previously fell through to the
            # validation branch below and raised KeyError. It is consumed by
            # the ``extra_pod_tolerations`` branch above; drop it here so it
            # is never forwarded to the gateway (per the docstring it is
            # ignored when ``extra_pod_tolerations`` is not given).
            new_kwargs.pop("keep_default_tolerations", None)
        elif k not in list(default_options.keys()) + ["tag"]:
            raise KeyError(f"{k} not allowed as a kwarg when using dask-gateway")

    if "worker_image" in new_kwargs and "tag" in new_kwargs:
        raise ValueError("provide either `name` or `tag`, not both")

    if "tag" in new_kwargs:
        # splice the requested tag onto the default image name
        img, _ = default_options.worker_image.split(":")
        new_kwargs["worker_image"] = ":".join((img, new_kwargs["tag"]))
        del new_kwargs["tag"]

    cluster = gateway.new_cluster(**new_kwargs)
    client = cluster.get_client()

    return client, cluster