def test_pod_from_minimal_dict(image_name, loop, ns):
    spec = {
        'spec': {
            'containers': [{
                'args': [
                    'dask-worker',
                    '$(DASK_SCHEDULER_ADDRESS)',
                    '--nthreads', '1',
                    '--death-timeout', '60'
                ],
                'command': None,
                'image': image_name,
                'imagePullPolicy': 'IfNotPresent',
                'name': 'worker'
            }]
        }
    }

    with KubeCluster.from_dict(spec, loop=loop, namespace=ns) as cluster:
        cluster.adapt()
        with Client(cluster) as client:
            future = client.submit(lambda x: x + 1, 10)
            result = future.result()
            assert result == 11
def run_flow(self) -> None:
    """
    Run the flow from the specified flow_file_path location using a Dask executor
    """
    from prefect.engine import get_default_flow_runner_class
    from prefect.engine.executors import DaskExecutor
    from dask_kubernetes import KubeCluster

    with open(path.join(path.dirname(__file__), "worker_pod.yaml")) as pod_file:
        worker_pod = yaml.safe_load(pod_file)
        worker_pod = self._populate_worker_pod_yaml(yaml_obj=worker_pod)

        cluster = KubeCluster.from_dict(worker_pod)
        cluster.adapt(minimum=1, maximum=1)

        # Load serialized flow from file and run it with a DaskExecutor
        with open(
            prefect.context.get("flow_file_path", "/root/.prefect/flow_env.prefect"),
            "rb",
        ) as f:
            flow = cloudpickle.load(f)

            executor = DaskExecutor(address=cluster.scheduler_address)
            runner_cls = get_default_flow_runner_class()
            runner_cls(flow=flow).run(executor=executor)
def run_flow(self) -> None:
    """
    Run the flow from the specified flow_file_path location using a Dask executor
    """
    try:
        from prefect.engine import get_default_flow_runner_class
        from prefect.engine.executors import DaskExecutor
        from dask_kubernetes import KubeCluster

        with open(path.join(path.dirname(__file__), "worker_pod.yaml")) as pod_file:
            worker_pod = yaml.safe_load(pod_file)
            worker_pod = self._populate_worker_pod_yaml(yaml_obj=worker_pod)

            cluster = KubeCluster.from_dict(
                worker_pod, namespace=prefect.context.get("namespace")
            )
            cluster.adapt(minimum=1, maximum=1)

            # Load serialized flow from file and run it with a DaskExecutor
            with open(
                prefect.context.get(
                    "flow_file_path", "/root/.prefect/flow_env.prefect"
                ),
                "rb",
            ) as f:
                flow = cloudpickle.load(f)

                executor = DaskExecutor(address=cluster.scheduler_address)
                runner_cls = get_default_flow_runner_class()
                runner_cls(flow=flow).run(executor=executor)

        sys.exit(0)  # attempt to force resource cleanup
    except Exception as exc:
        self.logger.error("Unexpected error raised during flow run: {}".format(exc))
        raise exc
def test_pod_from_minimal_dict(image_name, loop, ns):
    spec = {
        "spec": {
            "containers": [{
                "args": [
                    "dask-worker",
                    "$(DASK_SCHEDULER_ADDRESS)",
                    "--nthreads", "1",
                    "--death-timeout", "60",
                ],
                "command": None,
                "image": image_name,
                "imagePullPolicy": "IfNotPresent",
                "name": "worker",
            }]
        }
    }

    with KubeCluster.from_dict(spec, loop=loop, namespace=ns) as cluster:
        cluster.adapt()
        with Client(cluster, loop=loop) as client:
            future = client.submit(lambda x: x + 1, 10)
            result = future.result()
            assert result == 11
def test_pod_from_dict(image_name, loop, ns):
    spec = {
        'metadata': {},
        'restartPolicy': 'Never',
        'spec': {
            'containers': [{
                'args': ['dask-worker', '$(DASK_SCHEDULER_ADDRESS)',
                         '--nthreads', '1',
                         '--death-timeout', '60'],
                'command': None,
                'image': image_name,
                'imagePullPolicy': 'IfNotPresent',
                'name': 'dask-worker',
            }]
        }
    }

    with KubeCluster.from_dict(spec, loop=loop, namespace=ns) as cluster:
        cluster.scale(2)
        with Client(cluster) as client:
            future = client.submit(lambda x: x + 1, 10)
            result = future.result()
            assert result == 11

            while len(cluster.scheduler.workers) < 2:
                sleep(0.1)

            # Ensure that inter-worker communication works well
            futures = client.map(lambda x: x + 1, range(10))
            total = client.submit(sum, futures)
            assert total.result() == sum(map(lambda x: x + 1, range(10)))
            assert all(client.has_what().values())
async def test_pod_from_minimal_dict(image_name, ns, auth):
    spec = {
        "spec": {
            "containers": [
                {
                    "args": [
                        "dask-worker",
                        "$(DASK_SCHEDULER_ADDRESS)",
                        "--nthreads", "1",
                        "--death-timeout", "60",
                    ],
                    "command": None,
                    "image": image_name,
                    "imagePullPolicy": "IfNotPresent",
                    "name": "worker",
                }
            ]
        }
    }

    async with KubeCluster.from_dict(
        spec, namespace=ns, auth=auth, **cluster_kwargs
    ) as cluster:
        cluster.adapt()
        async with Client(cluster, asynchronous=True) as client:
            future = client.submit(lambda x: x + 1, 10)
            result = await future
            assert result == 11
def run_flow(self) -> None:
    """
    Run the flow from the specified flow_file_path location using a Dask executor
    """
    # Call on_start callback if specified
    if self.on_start:
        self.on_start()

    try:
        from prefect.engine import get_default_flow_runner_class
        from prefect.engine.executors import DaskExecutor
        from dask_kubernetes import KubeCluster

        if self._worker_spec:
            worker_pod = self._worker_spec
            worker_pod = self._populate_worker_spec_yaml(yaml_obj=worker_pod)
        else:
            with open(
                path.join(path.dirname(__file__), "worker_pod.yaml")
            ) as pod_file:
                worker_pod = yaml.safe_load(pod_file)
                worker_pod = self._populate_worker_pod_yaml(yaml_obj=worker_pod)

        cluster = KubeCluster.from_dict(
            worker_pod, namespace=prefect.context.get("namespace")
        )
        cluster.adapt(minimum=self.min_workers, maximum=self.max_workers)

        # Load serialized flow from file and run it with a DaskExecutor
        with open(
            prefect.context.get("flow_file_path", "/root/.prefect/flow_env.prefect"),
            "rb",
        ) as f:
            flow = cloudpickle.load(f)

            # populate global secrets
            secrets = prefect.context.get("secrets", {})
            for secret in flow.storage.secrets:
                secrets[secret] = prefect.tasks.secrets.PrefectSecret(
                    name=secret
                ).run()

            with prefect.context(secrets=secrets):
                executor = DaskExecutor(address=cluster.scheduler_address)
                runner_cls = get_default_flow_runner_class()
                runner_cls(flow=flow).run(executor=executor)
    except Exception as exc:
        self.logger.exception(
            "Unexpected error raised during flow run: {}".format(exc)
        )
        raise exc
    finally:
        # Call on_exit callback if specified
        if self.on_exit:
            self.on_exit()
def make_kube(pod_spec, **kws):
    """Create a dask_kubernetes.KubeCluster.

    pod_spec is either the name of a YAML file containing the worker pod
    specification or a dict containing the specification directly.

    kws is passed to KubeCluster.from_yaml or .from_dict.
    """
    from dask_kubernetes import KubeCluster

    if isinstance(pod_spec, str):
        return KubeCluster.from_yaml(pod_spec, **kws)
    else:
        return KubeCluster.from_dict(pod_spec, **kws)
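# A minimal usage sketch for make_kube, assuming it is called with either a
# parsed pod spec dict or a path to a YAML file. The spec below is a
# hypothetical example, not part of the original source.
spec = {
    "spec": {
        "containers": [{
            "args": ["dask-worker", "$(DASK_SCHEDULER_ADDRESS)",
                     "--nthreads", "1", "--death-timeout", "60"],
            "image": "daskdev/dask:latest",
            "name": "worker",
        }]
    }
}
cluster = make_kube(spec, namespace="default")  # from a dict
# cluster = make_kube("worker_pod.yaml", namespace="default")  # or from a YAML file path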
async def test_pod_from_dict(image_name, ns, auth):
    spec = {
        "metadata": {},
        "restartPolicy": "Never",
        "spec": {
            "containers": [{
                "args": [
                    "dask-worker",
                    "$(DASK_SCHEDULER_ADDRESS)",
                    "--nthreads", "1",
                    "--death-timeout", "60",
                ],
                "command": None,
                "image": image_name,
                "imagePullPolicy": "IfNotPresent",
                "name": "dask-worker",
            }]
        },
    }

    async with KubeCluster.from_dict(
        spec, namespace=ns, port=32000, auth=auth, **cluster_kwargs
    ) as cluster:
        cluster.scale(2)
        await cluster
        assert "32000" in cluster.scheduler_address
        async with Client(cluster, asynchronous=True) as client:
            future = client.submit(lambda x: x + 1, 10)
            result = await future
            assert result == 11

            while len(cluster.scheduler_info["workers"]) < 2:
                await asyncio.sleep(0.1)

            # Ensure that inter-worker communication works well
            futures = client.map(lambda x: x + 1, range(10))
            total = client.submit(sum, futures)
            assert (await total) == sum(map(lambda x: x + 1, range(10)))
            assert all((await client.has_what()).values())
def run(self, flow: "Flow") -> None:
    """
    Run the flow using a temporary dask-kubernetes cluster.

    Args:
        - flow (Flow): the flow to run.
    """
    # Call on_start callback if specified
    if self.on_start:
        self.on_start()

    try:
        from prefect.engine import get_default_flow_runner_class
        from prefect.executors import DaskExecutor
        from dask_kubernetes import KubeCluster

        if self._worker_spec:
            worker_pod = self._worker_spec
            worker_pod = self._populate_worker_spec_yaml(yaml_obj=worker_pod)
        else:
            with open(
                path.join(path.dirname(__file__), "worker_pod.yaml")
            ) as pod_file:
                worker_pod = yaml.safe_load(pod_file)
                worker_pod = self._populate_worker_pod_yaml(yaml_obj=worker_pod)

        cluster = KubeCluster.from_dict(
            worker_pod, namespace=prefect.context.get("namespace")
        )
        cluster.adapt(minimum=self.min_workers, maximum=self.max_workers)

        executor = DaskExecutor(address=cluster.scheduler_address)
        runner_cls = get_default_flow_runner_class()
        runner_cls(flow=flow).run(executor=executor)
    except Exception as exc:
        self.logger.exception(
            "Unexpected error raised during flow run: {}".format(exc)
        )
        raise exc
    finally:
        # Call on_exit callback if specified
        if self.on_exit:
            self.on_exit()
def test_pod_from_dict(image_name, loop, ns):
    spec = {
        "metadata": {},
        "restartPolicy": "Never",
        "spec": {
            "containers": [{
                "args": [
                    "dask-worker",
                    "$(DASK_SCHEDULER_ADDRESS)",
                    "--nthreads", "1",
                    "--death-timeout", "60",
                ],
                "command": None,
                "image": image_name,
                "imagePullPolicy": "IfNotPresent",
                "name": "dask-worker",
            }]
        },
    }

    with KubeCluster.from_dict(spec, loop=loop, namespace=ns) as cluster:
        cluster.scale(2)
        with Client(cluster, loop=loop) as client:
            future = client.submit(lambda x: x + 1, 10)
            result = future.result()
            assert result == 11

            while len(cluster.scheduler_info["workers"]) < 2:
                sleep(0.1)

            # Ensure that inter-worker communication works well
            futures = client.map(lambda x: x + 1, range(10))
            total = client.submit(sum, futures)
            assert total.result() == sum(map(lambda x: x + 1, range(10)))
            assert all(client.has_what().values())
def run(
    self, environment_file_path: str = "/root/.prefect/flow_env.prefect"
) -> "prefect.engine.state.State":
    """
    Runs the `Flow` represented by this environment.

    This creates a dask scheduler with the ability to scale from a single
    worker to the provided `max_workers`. The .prefect flow that was stored
    in this image is deserialized and has its `run` method called with the
    `DaskExecutor` pointing to the dask scheduler present on this pod.

    Args:
        - environment_file_path (str, optional): File path to the Prefect
            environment file; this is generally a serialized LocalEnvironment

    Returns:
        - prefect.engine.state.State: the state of the flow run
    """
    from prefect.engine.executors import DaskExecutor
    from dask_kubernetes import KubeCluster

    with open(path.join(path.dirname(__file__), "worker_pod.yaml")) as pod_file:
        worker_pod = yaml.safe_load(pod_file)
        worker_pod = self._populate_worker_pod_yaml(yaml_obj=worker_pod)

        cluster = KubeCluster.from_dict(worker_pod)
        cluster.adapt(minimum=1, maximum=self.max_workers)

        schema = prefect.serialization.environment.EnvironmentSchema()
        with open(environment_file_path, "r") as f:
            environment = schema.load(json.load(f))

        return environment.run(
            runner_kwargs={
                "executor": DaskExecutor(address=cluster.scheduler_address)
            }
        )
def run_dask_function(config):
    """Start a Dask Cluster using dask-kubernetes and run a function.

    Talks to kubernetes to create `n` new pods, each running a dask worker,
    which together form a dask cluster. A function specified in `config` is
    then imported and run with the given arguments, and the tasks it creates
    are executed on the dask cluster for distributed computation.

    The config dict must contain the following sections:
        * run
        * dask_cluster
        * output

    Args:
        config (dict):
            Config dictionary.
    """
    output_conf = config.get('output')
    if output_conf:
        path = output_conf.get('path')
        if not path:
            raise ValueError(
                'An output path must be provided when providing `output`.')

    cluster_spec = _generate_cluster_spec(config, kubernetes=False)
    cluster = KubeCluster.from_dict(cluster_spec)

    workers = config['dask_cluster'].get('workers')

    if not workers:
        cluster.adapt()
    elif isinstance(workers, int):
        cluster.scale(workers)
    else:
        cluster.adapt(**workers)

    client = Client(cluster)
    client.get_versions(check=True)

    try:
        run = _import_function(config['run'])
        kwargs = config['run']['args']
        results = run(**kwargs)
    finally:
        client.close()
        cluster.close()

    if output_conf:
        bucket = output_conf.get('bucket')
        try:
            if bucket:
                aws_key = output_conf.get('key')
                aws_secret = output_conf.get('secret_key')
                _upload_to_s3(bucket, path, results, aws_key, aws_secret)
            else:
                os.makedirs(os.path.dirname(path), exist_ok=True)
                results.to_csv(path)
        except Exception:
            print('Error storing results. Falling back to console dump.')
            print(_df_to_csv_str(results))
    else:
        return results
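# A hypothetical config for run_dask_function, sketched from the docstring
# and the keys the function actually reads (config['run']['args'],
# config['dask_cluster']['workers'], and the 'output' section). The exact
# contents of the 'run' section consumed by _import_function are not shown
# in the source, so the 'function' key below is an assumption.
config = {
    'run': {
        'function': 'mypackage.pipeline.run_benchmark',  # assumed key and format
        'args': {'iterations': 10},                      # passed as run(**kwargs)
    },
    'dask_cluster': {
        # int -> cluster.scale(n); dict -> cluster.adapt(**workers);
        # absent -> cluster.adapt() with defaults
        'workers': 4,
    },
    'output': {
        'path': 'results/output.csv',
        # optional S3 upload keys: 'bucket', 'key', 'secret_key'
    },
}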
def run_flow(self) -> None:
    """
    Run the flow using a Dask executor
    """
    # Call on_start callback if specified
    if self.on_start:
        self.on_start()

    try:
        from prefect.engine import get_default_flow_runner_class
        from prefect.engine.executors import DaskExecutor
        from dask_kubernetes import KubeCluster

        if self._worker_spec:
            worker_pod = self._worker_spec
            worker_pod = self._populate_worker_spec_yaml(yaml_obj=worker_pod)
        else:
            with open(
                path.join(path.dirname(__file__), "worker_pod.yaml")
            ) as pod_file:
                worker_pod = yaml.safe_load(pod_file)
                worker_pod = self._populate_worker_pod_yaml(yaml_obj=worker_pod)

        cluster = KubeCluster.from_dict(
            worker_pod, namespace=prefect.context.get("namespace")
        )
        cluster.adapt(minimum=self.min_workers, maximum=self.max_workers)

        flow_run_id = prefect.context.get("flow_run_id")

        if not flow_run_id:
            raise ValueError("No flow run ID found in context.")

        query = {
            "query": {
                with_args("flow_run", {"where": {"id": {"_eq": flow_run_id}}}): {
                    "flow": {
                        "name": True,
                        "storage": True,
                    },
                }
            }
        }

        # Prefect GraphQL client (not a dask.distributed Client)
        client = Client()
        result = client.graphql(query)
        flow_run = result.data.flow_run[0]

        flow_data = flow_run.flow
        storage_schema = prefect.serialization.storage.StorageSchema()
        storage = storage_schema.load(flow_data.storage)

        # populate global secrets
        secrets = prefect.context.get("secrets", {})
        for secret in storage.secrets:
            secrets[secret] = prefect.tasks.secrets.PrefectSecret(name=secret).run()

        with prefect.context(secrets=secrets):
            flow = storage.get_flow(storage.flows[flow_data.name])
            executor = DaskExecutor(address=cluster.scheduler_address)
            runner_cls = get_default_flow_runner_class()
            runner_cls(flow=flow).run(executor=executor)
    except Exception as exc:
        self.logger.exception(
            "Unexpected error raised during flow run: {}".format(exc)
        )
        raise exc
    finally:
        # Call on_exit callback if specified
        if self.on_exit:
            self.on_exit()
def get_cluster(
        name=None,
        extra_pip_packages=None,
        extra_conda_packages=None,
        memory_gb=None,
        nthreads=None,
        cpus=None,
        cred_name=None,
        cred_path=None,
        env_items=None,
        scaling_factor=1,
        dask_config_dict={},
        template_path='~/worker-template.yml',
        **kwargs):
    """
    Start dask.kubernetes cluster and dask.distributed client

    All arguments are optional. If not provided, arguments will default to
    values provided in ``template_path``.

    Parameters
    ----------
    name : str, optional
        Name of worker image to use. If None, default to worker specified in
        ``template_path``.
    extra_pip_packages : str, optional
        Extra pip packages to install on worker. Packages are installed
        using ``pip install extra_pip_packages``.
    extra_conda_packages : str, optional
        Extra conda packages to install on worker. Default channel is
        ``conda-forge``. Packages are installed using
        ``conda install -y -c conda-forge ${EXTRA_CONDA_PACKAGES}``.
    memory_gb : float, optional
        Memory to assign per 'group of workers', where a group consists of
        nthreads independent workers.
    nthreads : int, optional
        Number of independent threads per group of workers. Not sure if this
        should ever be set to something other than 1.
    cpus : float, optional
        Number of virtual CPUs to assign per 'group of workers'
    cred_name : str, optional
        Name of Google Cloud credentials file to use, equivalent to providing
        ``cred_path='/opt/gcsfuse_tokens/{}.json'.format(cred_name)``
    cred_path : str, optional
        Path to Google Cloud credentials file to use.
    env_items : list of dict, optional
        A list of env variable 'name'-'value' pairs to append to the env
        variables included in ``template_path``, e.g.

        .. code-block:: python

            [{
                'name': 'GOOGLE_APPLICATION_CREDENTIALS',
                'value': '/opt/gcsfuse_tokens/rhg-data.json'}]

    scaling_factor : float, optional
        scale the worker memory & CPU size using a constant multiplier of the
        specified worker. No constraints in terms of performance or cluster
        size are enforced - if you request too little the dask worker will
        not perform; if you request too much you may see an
        ``InsufficientMemory`` or ``InsufficientCPU`` error on the google
        cloud Kubernetes console. Recommended scaling factors given our
        default ``~/worker-template.yml`` specs are [0.5, 1, 2, 4].
    dask_config_dict : dict, optional
        Dask config parameters to modify from their defaults. A '.' is used
        to access progressive levels of the yaml structure. For instance, the
        dict could look like ``{'distributed.worker.profile.interval': '100ms'}``
    template_path : str, optional
        Path to worker template file. Default ``~/worker-template.yml``.

    Returns
    -------
    client : object
        :py:class:`dask.distributed.Client` connected to cluster
    cluster : object
        Pre-configured :py:class:`dask_kubernetes.KubeCluster`

    See Also
    --------
    :py:func:`get_micro_cluster` : A cluster with one-CPU workers
    :py:func:`get_standard_cluster` : The default cluster specification
    :py:func:`get_big_cluster` : A cluster with workers twice the size of the default
    :py:func:`get_giant_cluster` : A cluster with workers four times the size of the default
    """
    # update dask settings
    dask.config.set(dask_config_dict)

    template_path = os.path.expanduser(template_path)

    with open(template_path, 'r') as f:
        template = yml.load(f, Loader=yml.SafeLoader)

    container = template['spec']['containers'][0]

    # replace the default image with the new one
    if name is not None:
        container['image'] = name

    if extra_pip_packages is not None:
        container['env'].append({
            'name': 'EXTRA_PIP_PACKAGES',
            'value': extra_pip_packages})

    if extra_conda_packages is not None:
        container['env'].append({
            'name': 'EXTRA_CONDA_PACKAGES',
            'value': extra_conda_packages})

    if cred_path is not None:
        # can remove this first env var once the worker docker image is
        # updated to point to 'GOOGLE_APPLICATION_CREDENTIALS'
        container['env'].append({
            'name': 'GCLOUD_DEFAULT_TOKEN_FILE',
            'value': cred_path})
        container['env'].append({
            'name': 'GOOGLE_APPLICATION_CREDENTIALS',
            'value': cred_path})
    elif cred_name is not None:
        # can remove this first env var once the worker docker image is
        # updated to point to 'GOOGLE_APPLICATION_CREDENTIALS'
        container['env'].append({
            'name': 'GCLOUD_DEFAULT_TOKEN_FILE',
            'value': '/opt/gcsfuse_tokens/{}.json'.format(cred_name)})
        container['env'].append({
            'name': 'GOOGLE_APPLICATION_CREDENTIALS',
            'value': '/opt/gcsfuse_tokens/{}.json'.format(cred_name)})

    if env_items is not None:
        container['env'] = container['env'] + env_items

    # adjust worker creation args
    args = container['args']

    # set nthreads if provided
    nthreads_ix = args.index('--nthreads') + 1
    if nthreads is not None:
        args[nthreads_ix] = str(nthreads)

    # then in resources
    resources = container['resources']
    limits = resources['limits']
    requests = resources['requests']

    msg = '{} limits and requests do not match'

    if memory_gb is None:
        memory_gb = float(limits['memory'].strip('G'))
        mem_request = float(requests['memory'].strip('G'))
        assert memory_gb == mem_request, msg.format('memory')

    if cpus is None:
        cpus = float(limits['cpu'])
        cpu_request = float(requests['cpu'])
        assert cpus == cpu_request, msg.format('cpu')

    format_request = lambda x: '{:04.2f}'.format(np.floor(x * 100) / 100)

    # set memory-limit if provided
    mem_ix = args.index('--memory-limit') + 1
    args[mem_ix] = format_request(float(memory_gb) * scaling_factor) + 'GB'

    limits['memory'] = format_request(float(memory_gb) * scaling_factor) + 'G'
    requests['memory'] = format_request(float(memory_gb) * scaling_factor) + 'G'

    limits['cpu'] = format_request(float(cpus) * scaling_factor)
    requests['cpu'] = format_request(float(cpus) * scaling_factor)

    # start cluster and client and return
    cluster = KubeCluster.from_dict(template)
    client = dd.Client(cluster)

    return client, cluster
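# A usage sketch for get_cluster under the default ~/worker-template.yml;
# the package name and credential name below are illustrative assumptions.
client, cluster = get_cluster(
    extra_pip_packages='xarray',
    cred_name='rhg-data',
    scaling_factor=2,
)
cluster.scale(8)  # request 8 workers
# ... run distributed work through `client` ...
client.close()
cluster.close()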
def _get_cluster_dask_kubernetes(
    name=None,
    tag=None,
    extra_pip_packages=None,
    extra_conda_packages=None,
    memory_gb=None,
    nthreads=None,
    cpus=None,
    cred_name=None,
    cred_path=None,
    env_items=None,
    scaling_factor=1,
    dask_config_dict={},
    deploy_mode="local",
    idle_timeout=None,
    template_path="~/worker-template.yml",
    extra_worker_labels=None,
    extra_pod_tolerations=None,
    keep_default_tolerations=True,
    **kwargs,
):
    """
    **DEPRECATED (12/15/2020)**: we no longer maintain clusters using
    dask-kubernetes schedulers; only dask-gateway is now supported.

    Start dask.kubernetes cluster and dask.distributed client

    All arguments are optional. If not provided, arguments will default to
    values provided in ``template_path``.

    Parameters
    ----------
    name : str, optional
        Name of worker image to use (e.g. ``rhodium/worker:latest``). If
        ``None`` (default), default to worker specified in ``template_path``.
    tag : str, optional
        Tag of the worker image to use. Cannot be used in combination with
        ``name``, which should include a tag. If provided, overrides the tag
        of the image specified in ``template_path``. If ``None`` (default),
        the full image specified in ``name`` or ``template_path`` is used.
    extra_pip_packages : str, optional
        Extra pip packages to install on worker. Packages are installed
        using ``pip install extra_pip_packages``.
    extra_conda_packages : str, optional
        Extra conda packages to install on worker. Default channel is
        ``conda-forge``. Packages are installed using
        ``conda install -y -c conda-forge ${EXTRA_CONDA_PACKAGES}``.
    memory_gb : float, optional
        Memory to assign per 'group of workers', where a group consists of
        nthreads independent workers.
    nthreads : int, optional
        Number of independent threads per group of workers. Not sure if this
        should ever be set to something other than 1.
    cpus : float, optional
        Number of virtual CPUs to assign per 'group of workers'
    cred_name : str, optional
        Name of Google Cloud credentials file to use, equivalent to providing
        ``cred_path='/opt/gcsfuse_tokens/{}.json'.format(cred_name)``
    cred_path : str, optional
        Path to Google Cloud credentials file to use.
    env_items : dict, optional
        A dictionary of env variable 'name'-'value' pairs to append to the
        env variables included in ``template_path``, e.g.

        .. code-block:: python

            {
                'GOOGLE_APPLICATION_CREDENTIALS':
                    '/opt/gcsfuse_tokens/rhg-data.json',
            }

    scaling_factor : float, optional
        scale the worker memory & CPU size using a constant multiplier of the
        specified worker. No constraints in terms of performance or cluster
        size are enforced - if you request too little the dask worker will
        not perform; if you request too much you may see an
        ``InsufficientMemory`` or ``InsufficientCPU`` error on the google
        cloud Kubernetes console. Recommended scaling factors given our
        default ``~/worker-template.yml`` specs are [0.5, 1, 2, 4].
    dask_config_dict : dict, optional
        Dask config parameters to modify from their defaults. A '.' is used
        to access progressive levels of the yaml structure. For instance, the
        dict could look like ``{'distributed.worker.profile.interval': '100ms'}``
    deploy_mode : str, optional
        Where to deploy the scheduler (on the same pod or a different pod)
    idle_timeout : str, optional
        Number of seconds without active communication with the client before
        the remote scheduler shuts down (ignored if ``deploy_mode=='local'``).
        Default is to not shut down for this reason.
    template_path : str, optional
        Path to worker template file. Default ``~/worker-template.yml``.
    extra_worker_labels : dict, optional
        Dictionary of kubernetes labels to apply to pods. None (default)
        results in no additional labels besides those in the template, as
        well as ``jupyter_user``, which is inferred from the
        ``JUPYTERHUB_USER``, or, if not set, the server's hostname.
    extra_pod_tolerations : list of dict, optional
        List of pod toleration dictionaries. For example, to match a node
        pool NoSchedule toleration, you might provide:

        .. code-block:: python

            extra_pod_tolerations=[
                {
                    "effect": "NoSchedule",
                    "key": "k8s.dask.org_dedicated",
                    "operator": "Equal",
                    "value": "worker-highcpu"
                },
                {
                    "effect": "NoSchedule",
                    "key": "k8s.dask.org/dedicated",
                    "operator": "Equal",
                    "value": "worker-highcpu"
                }
            ]

    keep_default_tolerations : bool, optional
        Whether to append (default) or replace the default tolerations.
        Ignored if ``extra_pod_tolerations`` is ``None`` or has length 0.

    Returns
    -------
    client : object
        :py:class:`dask.distributed.Client` connected to cluster
    cluster : object
        Pre-configured :py:class:`dask_kubernetes.KubeCluster`

    See Also
    --------
    :py:func:`get_micro_cluster` : A cluster with one-CPU workers
    :py:func:`get_standard_cluster` : The default cluster specification
    :py:func:`get_big_cluster` : A cluster with workers twice the size of the default
    :py:func:`get_giant_cluster` : A cluster with workers four times the size of the default
    """
    if (name is not None) and (tag is not None):
        raise ValueError("provide either `name` or `tag`, not both")

    # update dask settings
    dask.config.set(dask_config_dict)

    template_path = os.path.expanduser(template_path)

    with open(template_path, "r") as f:
        template = yml.load(f, Loader=yml.SafeLoader)

    # update labels with default and user-provided labels
    if ("metadata" not in template) or (template.get("metadata", {}) is None):
        template["metadata"] = {}

    if ("labels" not in template["metadata"]) or (
        template["metadata"]["labels"] is None
    ):
        template["metadata"]["labels"] = {}

    labels = template["metadata"]["labels"]

    if extra_worker_labels is not None:
        labels.update(extra_worker_labels)

    labels.update(
        {"jupyter_user": os.environ.get("JUPYTERHUB_USER", socket.gethostname())}
    )

    template["metadata"]["labels"] = labels

    if "tolerations" not in template["spec"]:
        template["spec"]["tolerations"] = []

    if (extra_pod_tolerations is not None) and (len(extra_pod_tolerations) > 0):
        if keep_default_tolerations:
            template["spec"]["tolerations"].extend(extra_pod_tolerations)
        else:
            template["spec"]["tolerations"] = extra_pod_tolerations

    container = template["spec"]["containers"][0]

    # replace the default image with the new one
    if name is not None:
        container["image"] = name

    if tag is not None:
        img, _ = container["image"].split(":")
        container["image"] = ":".join([img, tag])

    if extra_pip_packages is not None:
        container["env"].append(
            {"name": "EXTRA_PIP_PACKAGES", "value": extra_pip_packages}
        )

    if extra_conda_packages is not None:
        container["env"].append(
            {"name": "EXTRA_CONDA_PACKAGES", "value": extra_conda_packages}
        )

    if cred_path is not None:
        container["env"].append(
            {"name": "GOOGLE_APPLICATION_CREDENTIALS", "value": cred_path}
        )
    elif cred_name is not None:
        container["env"].append(
            {
                "name": "GOOGLE_APPLICATION_CREDENTIALS",
                "value": "/opt/gcsfuse_tokens/{}.json".format(cred_name),
            }
        )

    if env_items is not None:
        if isinstance(env_items, dict):
            for k, v in env_items.items():
                container["env"].append({"name": k, "value": v})
        # allow deprecated passing of list of name/value pairs
        elif isinstance(env_items, Sequence):
            warnings.warn(
                "Passing a list of name/value pairs is deprecated. "
                "Please pass a dictionary instead."
            )
            container["env"] = container["env"] + env_items
        else:
            raise ValueError("Expected `env_items` of type dict or sequence.")

    # adjust worker creation args
    args = container["args"]

    # set nthreads if provided
    nthreads_ix = args.index("--nthreads") + 1
    if nthreads is not None:
        args[nthreads_ix] = str(nthreads)
    nthreads = int(args[nthreads_ix])

    # then in resources
    resources = container["resources"]
    limits = resources["limits"]
    requests = resources["requests"]

    msg = "{} limits and requests do not match"

    if memory_gb is None:
        memory_gb = float(limits["memory"].strip("G"))
        mem_request = float(requests["memory"].strip("G"))
        assert memory_gb == mem_request, msg.format("memory")

    if cpus is None:
        cpus = float(limits["cpu"])
        cpu_request = float(requests["cpu"])
        assert cpus == cpu_request, msg.format("cpu")

    # now properly set the threads accessible by multi-threaded libraries
    # so that there's no competition between dask threads and the threads of
    # these libraries
    cpus_rounded = np.round(cpus)
    lib_threads = int(cpus_rounded / nthreads)
    for lib in ["OMP_NUM_THREADS", "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS"]:
        # kubernetes env values must be strings
        container["env"].append({"name": lib, "value": str(lib_threads)})

    def format_request(x):
        return "{:04.2f}".format(np.floor(x * 100) / 100)

    # set memory-limit if provided
    mem_ix = args.index("--memory-limit") + 1
    args[mem_ix] = format_request(float(memory_gb) * scaling_factor) + "GB"

    limits["memory"] = format_request(float(memory_gb) * scaling_factor) + "G"
    requests["memory"] = format_request(float(memory_gb) * scaling_factor) + "G"

    limits["cpu"] = format_request(float(cpus) * scaling_factor)
    requests["cpu"] = format_request(float(cpus) * scaling_factor)

    # start cluster and client and return
    # need more time to connect to remote scheduler
    if deploy_mode == "remote":
        dask.config.set({"kubernetes.idle-timeout": idle_timeout})

    cluster = KubeCluster.from_dict(
        template, deploy_mode=deploy_mode, idle_timeout=None
    )
    client = dd.Client(cluster)

    return client, cluster