Example #1
def deploy_function(function: DaskCluster, secrets=None):

    # TODO: why is this here :|
    try:
        import dask
        from dask.distributed import Client, default_client  # noqa: F401
        from dask_kubernetes import KubeCluster, make_pod_spec  # noqa: F401
        from kubernetes_asyncio import client
    except ImportError as exc:
        # print() does not do %-style interpolation; format the message explicitly
        print(
            "missing dask or dask_kubernetes, please run "
            f'"pip install dask distributed dask_kubernetes": {exc}'
        )
        raise

    spec = function.spec
    meta = function.metadata
    spec.remote = True

    image = function.full_image_path() or "daskdev/dask:latest"
    env = spec.env
    namespace = meta.namespace or config.namespace
    if spec.extra_pip:
        env.append(spec.extra_pip)

    pod_labels = get_resource_labels(function,
                                     scrape_metrics=config.scrape_metrics)
    worker_args = ["dask-worker", "--nthreads", str(spec.nthreads)]
    memory_limit = spec.resources.get("limits", {}).get("memory")
    if memory_limit:
        worker_args.extend(["--memory-limit", str(memory_limit)])
    if spec.args:
        worker_args.extend(spec.args)
    scheduler_args = ["dask-scheduler"]

    container_kwargs = {
        "name": "base",
        "image": image,
        "env": env,
        "image_pull_policy": spec.image_pull_policy,
        "volume_mounts": spec.volume_mounts,
    }
    scheduler_container = client.V1Container(
        resources=spec.scheduler_resources,
        args=scheduler_args,
        **container_kwargs)
    worker_container = client.V1Container(resources=spec.worker_resources,
                                          args=worker_args,
                                          **container_kwargs)

    scheduler_pod_spec = kube_resource_spec_to_pod_spec(
        spec, scheduler_container)
    worker_pod_spec = kube_resource_spec_to_pod_spec(spec, worker_container)
    for pod_spec in [scheduler_pod_spec, worker_pod_spec]:
        if spec.image_pull_secret:
            pod_spec.image_pull_secrets = [
                client.V1LocalObjectReference(name=spec.image_pull_secret)
            ]

    scheduler_pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        # annotations=meta.annotation),
        spec=scheduler_pod_spec,
    )
    worker_pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        # annotations=meta.annotation),
        spec=worker_pod_spec,
    )

    svc_temp = dask.config.get("kubernetes.scheduler-service-template")
    if spec.service_type or spec.node_port:
        if spec.node_port:
            spec.service_type = "NodePort"
            svc_temp["spec"]["ports"][1]["nodePort"] = spec.node_port
        update_in(svc_temp, "spec.type", spec.service_type)

    norm_name = normalize_name(meta.name)
    dask.config.set({
        "kubernetes.scheduler-service-template": svc_temp,
        "kubernetes.name": "mlrun-" + norm_name + "-{uuid}",
    })

    cluster = KubeCluster(
        worker_pod,
        scheduler_pod_template=scheduler_pod,
        deploy_mode="remote",
        namespace=namespace,
        idle_timeout=spec.scheduler_timeout,
    )

    logger.info(
        f"cluster {cluster.name} started at {cluster.scheduler_address}")

    function.status.scheduler_address = cluster.scheduler_address
    function.status.cluster_name = cluster.name
    if spec.service_type == "NodePort":
        ports = cluster.scheduler.service.spec.ports
        function.status.node_ports = {
            "scheduler": ports[0].node_port,
            "dashboard": ports[1].node_port,
        }

    if spec.replicas:
        cluster.scale(spec.replicas)
    else:
        cluster.adapt(minimum=spec.min_replicas, maximum=spec.max_replicas)

    return cluster
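
The function above returns the running KubeCluster. A minimal usage sketch, not part of the original source, of how a caller might attach a dask client to it (the argument name is a placeholder):

from dask.distributed import Client

cluster = deploy_function(my_dask_function)   # my_dask_function: a DaskCluster runtime object (placeholder)
client = Client(cluster.scheduler_address)    # Client(cluster) also works

# Submit a trivial task to confirm the workers are reachable, then clean up.
future = client.submit(lambda x: x + 1, 41)
print(future.result())  # 42
client.close()
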
Example #2
def deploy_function(function: DaskCluster, secrets=None):
    try:
        from dask_kubernetes import KubeCluster, make_pod_spec
        from dask.distributed import Client, default_client
        from kubernetes_asyncio import client
        import dask
    except ImportError as e:
        # print() does not do %-style interpolation; format the message explicitly
        print(
            'missing dask or dask_kubernetes, please run '
            f'"pip install dask distributed dask_kubernetes": {e}')
        raise

    spec = function.spec
    meta = function.metadata
    spec.remote = True

    image = function.full_image_path() or 'daskdev/dask:latest'
    env = spec.env
    namespace = meta.namespace or config.namespace
    if spec.extra_pip:
        env.append(spec.extra_pip)

    pod_labels = get_resource_labels(function)
    args = ['dask-worker', "--nthreads", str(spec.nthreads)]
    if spec.args:
        args += spec.args

    container = client.V1Container(name='base',
                                   image=image,
                                   env=env,
                                   args=args,
                                   image_pull_policy=spec.image_pull_policy,
                                   volume_mounts=spec.volume_mounts,
                                   resources=spec.resources)

    pod_spec = client.V1PodSpec(containers=[container],
                                restart_policy='Never',
                                volumes=spec.volumes,
                                service_account=spec.service_account)
    if spec.image_pull_secret:
        pod_spec.image_pull_secrets = [
            client.V1LocalObjectReference(name=spec.image_pull_secret)
        ]

    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        #annotations=meta.annotation),
        spec=pod_spec)

    svc_temp = dask.config.get("kubernetes.scheduler-service-template")
    if spec.service_type or spec.node_port:
        if spec.node_port:
            spec.service_type = 'NodePort'
            svc_temp['spec']['ports'][1]['nodePort'] = spec.node_port
        update_in(svc_temp, 'spec.type', spec.service_type)

    norm_name = normalize_name(meta.name)
    dask.config.set({
        "kubernetes.scheduler-service-template": svc_temp,
        'kubernetes.name': 'mlrun-' + norm_name + '-{uuid}'
    })

    cluster = KubeCluster(pod,
                          deploy_mode='remote',
                          namespace=namespace,
                          scheduler_timeout=spec.scheduler_timeout)

    logger.info('cluster {} started at {}'.format(cluster.name,
                                                  cluster.scheduler_address))

    function.status.scheduler_address = cluster.scheduler_address
    function.status.cluster_name = cluster.name
    if spec.service_type == 'NodePort':
        ports = cluster.scheduler.service.spec.ports
        function.status.node_ports = {
            'scheduler': ports[0].node_port,
            'dashboard': ports[1].node_port
        }

    if spec.replicas:
        cluster.scale(spec.replicas)
    else:
        cluster.adapt(minimum=spec.min_replicas, maximum=spec.max_replicas)

    return cluster
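
Both examples end by choosing between fixed and adaptive sizing. A short illustrative sketch of the two calls (the worker counts are placeholders):

# Fixed size: exactly 4 workers, regardless of load.
cluster.scale(4)

# Adaptive: the scheduler adds and removes workers within the given bounds
# based on the pending workload.
cluster.adapt(minimum=1, maximum=8)
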
Example #3
# NOTE: the original listing starts mid-call; the import and the
# worker_template assignment are reconstructed from the surrounding code.
from dask_kubernetes import KubeCluster, make_pod_spec

#tag::make_dask_k8s_client[]
worker_template = make_pod_spec(
    image='holdenk/dask:latest',
    memory_limit='8G',
    memory_request='8G',
    cpu_limit=1,
    cpu_request=1,
    extra_container_config={"imagePullPolicy": "Always"})
scheduler_template = make_pod_spec(
    image='holdenk/dask:latest',
    memory_limit='4G',
    memory_request='4G',
    cpu_limit=1,
    cpu_request=1,
    extra_container_config={"imagePullPolicy": "Always"})
cluster = KubeCluster(pod_template=worker_template,
                      scheduler_pod_template=scheduler_template)
cluster.adapt(
    minimum=1)  # or create and destroy workers dynamically based on workload
from dask.distributed import Client
client = Client(cluster)
#end::make_dask_k8s_client[]

# Notebook cells from the original: display the client summary and the dashboard URL.
client

client.dashboard_link
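
A hypothetical smoke test, not in the original notebook, that pushes a small dask.array computation through the client to confirm the adaptive cluster brings up workers:

import dask.array as da

# random 1000x1000 array split into 16 chunks; mean() executes on the workers
arr = da.random.random((1000, 1000), chunks=(250, 250))
print(arr.mean().compute())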

Example #4
class DaskCluster(KubejobRuntime):
    kind = 'dask'
    _is_nested = False

    def __init__(self, spec=None,
                 metadata=None):
        super().__init__(spec, metadata)
        self._cluster = None
        self.spec.build.base_image = self.spec.build.base_image or 'daskdev/dask:latest'
        self.set_label('mlrun/class', self.kind)

    @property
    def spec(self) -> DaskSpec:
        return self._spec

    @spec.setter
    def spec(self, spec):
        self._spec = self._verify_dict(spec, 'spec', DaskSpec)

    def to_pod(self):
        image = self._image_path() or 'daskdev/dask:latest'
        env = self.spec.env
        namespace = self.metadata.namespace or config.namespace
        if self.spec.extra_pip:
            env.append(self.spec.extra_pip)
        container = client.V1Container(name='base',
                                       image=image,
                                       env=env,
                                       command=None,
                                       args=self.spec.args,
                                       image_pull_policy=self.spec.image_pull_policy,
                                       volume_mounts=self.spec.volume_mounts,
                                       resources=self.spec.resources)

        pod_spec = client.V1PodSpec(containers=[container],
                                    restart_policy='Never',
                                    volumes=self.spec.volumes,
                                    service_account=self.spec.service_account)

        meta = client.V1ObjectMeta(namespace=namespace,
                                   labels=self.metadata.labels,
                                   annotations=self.metadata.annotations)

        pod = client.V1Pod(metadata=meta, spec=pod_spec)
        return pod

    @property
    def initialized(self):
        return True if self._cluster else False

    def cluster(self, scale=0):
        if not self._cluster:
            try:
                from dask_kubernetes import KubeCluster
                from dask.distributed import Client
            except ImportError as e:
                print('missing dask_kubernetes, please run "pip install dask_kubernetes"')
                raise e
            self._cluster = KubeCluster(self.to_pod())
            if not scale:
                self._cluster.adapt()
            else:
                self._cluster.scale(scale)
            Client(self._cluster)
        return self._cluster

    @property
    def client(self):
        from dask.distributed import Client, default_client
        try:
            return default_client()
        except ValueError:
            if self._cluster:
                return Client(self._cluster)
            return Client()

    def close(self):
        from dask.distributed import default_client  # only default_client is used here
        try:
            client = default_client()
            client.close()
        except ValueError:
            pass
        if self._cluster:
            self._cluster.close()

    def _run(self, runobj: RunObject, execution):
        handler = runobj.spec.handler
        self._force_handler(handler)
        from dask import delayed
        if self.spec.rundb:
            # todo: remote dask via k8s spec env
            environ['MLRUN_DBPATH'] = self.spec.rundb

        arg_list = get_func_arg(handler, runobj, execution)
        out = None  # ensure 'out' is defined even if the handler raises
        try:
            task = delayed(handler)(*arg_list)
            out = task.compute()
        except Exception as e:
            err = str(e)
            execution.set_state(error=err)

        if out:
            execution.log_result('return', out)

        return execution.to_dict()

    def _run_many(self, tasks, execution, runobj: RunObject):
        handler = runobj.spec.handler
        self._force_handler(handler)
        futures = []
        contexts = []
        tasks = list(tasks)
        for task in tasks:
            ctx = MLClientCtx.from_dict(task.to_dict(),
                                        self.spec.rundb,
                                        autocommit=True)
            args = get_func_arg(handler, task, ctx)
            resp = self.client.submit(handler, *args)
            futures.append(resp)
            contexts.append(ctx)

        resps = self.client.gather(futures)
        results = RunList()
        for r, c, t in zip(resps, contexts, tasks):
            if r:
                c.log_result('return', r)
            # todo: handle task errors
            resp = self._post_run(task=t)
            results.append(resp)

        print(resps)
        return results
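
For reference, a self-contained sketch of the dask.delayed pattern that _run relies on; the handler below is a stand-in, not part of mlrun:

from dask import delayed

def handler(x, y):
    # stand-in for the user handler resolved from runobj.spec.handler
    return x + y

task = delayed(handler)(2, 3)   # build a lazy task graph
out = task.compute()            # execute on the current dask scheduler/cluster
print(out)  # 5
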
Example #5
def fetch(
    self,
    request_params,
    axis_params,
    start_dt,
    end_dt,
    download=False,
    download_format='netcdf',
    status_dict={},
    max_nfiles=50,
    max_partition_sizes={
        'netcdf': '100MB',
        'csv': '10MB'
    },
):
    self.update_state(
        state="PROGRESS",
        meta=status_dict,
    )
    ds_list = get_delayed_ds(request_params, axis_params)

    status_dict.update({"msg": f"{len(request_params)} datasets requested."})
    self.update_state(state="PROGRESS", meta=status_dict)

    max_data_size = np.sum([v['total_size'] for v in ds_list.values()])
    max_mem_size = max_data_size / 1024**3

    dask_spec = {'min_workers': 1, 'max_workers': 2}
    # environment variables are strings; cast so the size comparison below works
    data_threshold = float(os.environ.get('DATA_THRESHOLD', 50))

    client = None
    cluster = None

    if max_mem_size > data_threshold:
        image_repo, image_name, image_tag = (
            'cormorack',
            'cava-dask',
            '20210610',
        )
        desired_image = os.environ.get(
            "DASK_DOCKER_IMAGE", f"{image_repo}/{image_name}:{image_tag}")
        match = re.match(r"(.+)/(.+):(.+)", desired_image)
        if match is not None:
            image_repo, image_name, image_tag = match.groups()
        dask_spec = determine_workers(
            max_mem_size,
            image_repo=image_repo,
            image_name=image_name,
            image_tag=image_tag,
        )

        status_dict.update({
            "msg":
            f"Setting up distributed computing cluster. Max data size: {memory_repr(max_data_size)}"
        })
        self.update_state(state="PROGRESS", meta=status_dict)
        cluster = KubeCluster(
            dask_spec['pod_spec'],
            n_workers=dask_spec['min_workers'],
        )
        cluster.adapt(minimum=dask_spec['min_workers'],
                      maximum=dask_spec['max_workers'])
        client = Client(cluster)
    # TODO: Need to add other parameters for multidimensional
    # need a check for nutnr,pco2,ph,optaa add int_ctd_pressure
    # parameters.append("int_ctd_pressure")

    # for spikr
    # parameters.append("spectra")
    status_dict.update({"msg": "Retrieving data from zarr store ..."})
    self.update_state(state="PROGRESS", meta=status_dict)
    data_list = {
        k: v['dataset'].sel(time=(start_dt, end_dt)).dataset
        for k, v in ds_list.items()
    }

    status_dict.update({"msg": "Validating datasets..."})
    self.update_state(state="PROGRESS", meta=status_dict)
    if any(v is None for v in data_list.values()):
        # At least one requested dataset failed to load
        status_dict.update(
            {"msg": "One of the datasets does not contain data."})
        self.update_state(state="PROGRESS", meta=status_dict)
        time.sleep(2)
        result = None
    elif any(len(v.time) == 0 for v in data_list.values()):
        # Collect the names of any data streams that came back empty
        empty_streams = [k for k, v in data_list.items() if len(v.time) == 0]
        status_dict.update(
            {"msg": f"Empty data stream(s) found: {','.join(empty_streams)}."})
        self.update_state(state="PROGRESS", meta=status_dict)
        time.sleep(2)
        status_dict.update({
            "msg":
            "Plot creation is not possible with specified parameters. Please try again."
        })
        self.update_state(state="PROGRESS", meta=status_dict)
        time.sleep(2)
        result = None
    else:
        total_requested_size = np.sum(
            np.fromiter((v.nbytes for v in data_list.values()), dtype=int))
        status_dict.update({
            "msg":
            f"There are {memory_repr(total_requested_size)} of data to be processed."
        })
        self.update_state(state="PROGRESS", meta=status_dict)
        if len(data_list.keys()) > 1:
            merged = _merge_datasets(data_list, start_dt, end_dt)
        else:
            merged = next(ds for _, ds in data_list.items())

        data_count = len(merged.time)

        if data_count == 0:
            status_dict.update(
                {"msg": "Merged dataset does not contain data."})
            self.update_state(state="PROGRESS", meta=status_dict)
            result = None
        elif data_count > 0 and download:
            status_dict.update({"msg": "Preparing dataset for download..."})
            self.update_state(state="PROGRESS", meta=status_dict)
            format_ext = {'netcdf': 'nc', 'csv': 'csv'}
            start_dt_str = parser.parse(start_dt).strftime('%Y%m%dT%H%M%S')
            end_dt_str = parser.parse(end_dt).strftime('%Y%m%dT%H%M%S')
            dstring = f"{start_dt_str}_{end_dt_str}"
            continue_download = True

            if download_format == 'csv':
                ddf = merged.to_dask_dataframe().repartition(
                    partition_size=max_partition_sizes[download_format])
                # Cap the number of output CSV partitions at max_nfiles
                if ddf.npartitions > max_nfiles:
                    message = "The amount of data to be downloaded is too large for CSV data format. Please make a smaller request."
                    result = {
                        "file_url": None,
                        "msg": message,
                    }
                    continue_download = False
                else:
                    ncfile = dstring
                    outglob = os.path.join(ncfile,
                                           f'*.{format_ext[download_format]}')
                    ddf.to_csv(outglob, index=False)
            elif download_format == 'netcdf':
                max_chunk_size = dask.utils.parse_bytes(
                    max_partition_sizes[download_format])
                smallest_chunk = math.ceil(merged.time.shape[0] /
                                           (merged.nbytes / max_chunk_size))
                slices = [
                    (i, i + smallest_chunk)
                    for i in range(0, merged.time.shape[0], smallest_chunk)
                ]
                # Cap the number of NetCDF output files at max_nfiles
                if len(slices) > max_nfiles:
                    message = "The amount of data to be downloaded is too large for NetCDF data format. Please make a smaller request."
                    result = {
                        "file_url": None,
                        "msg": message,
                    }
                    continue_download = False
                else:
                    if len(slices) == 1:
                        ncfile = f"{dstring}.{format_ext[download_format]}"
                        merged.to_netcdf(ncfile)
                    else:
                        ncfile = dstring
                        outglob = os.path.join(
                            ncfile, f'*.{format_ext[download_format]}')
                        if not os.path.exists(ncfile):
                            os.mkdir(ncfile)
                        for idx, sl in enumerate(slices):
                            nc_name = f"{idx}.nc"
                            part_ds = merged.isel(time=slice(*sl))
                            part_ds.to_netcdf(os.path.join(ncfile, nc_name))

            if continue_download:
                zipname = (
                    f"CAVA_{datetime.utcnow().strftime('%Y%m%dT%H%M%S')}.zip")

                download_bucket = "ooi-data-download"
                cache_location = f"s3://{download_bucket}"

                fs = fsspec.get_mapper(cache_location).fs

                target_url = os.path.join(cache_location,
                                          os.path.basename(zipname))
                with fs.open(target_url, mode='wb') as f:
                    with zipfile.ZipFile(
                            f, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
                        status_dict.update({"msg": "Creating zip file..."})
                        self.update_state(state="PROGRESS", meta=status_dict)
                        zf.writestr(
                            'meta.yaml',
                            yaml.dump({
                                'reference_designators': request_params,
                                'axis_parameters': axis_params,
                                'start_datetime': start_dt,
                                'end_datetime': end_dt,
                            }),
                        )
                        if os.path.isdir(ncfile):
                            # if ncfile is directory,
                            # there should be an outglob variable
                            data_files = sorted(glob.glob(outglob))
                            for data_file in data_files:
                                zf.write(data_file)
                            shutil.rmtree(ncfile)
                        else:
                            zf.write(ncfile)
                            os.unlink(ncfile)
                download_url = f"https://{download_bucket}.s3.us-west-2.amazonaws.com/{zipname}"
                result = {"file_url": download_url}
        else:
            status_dict.update({"msg": "Plotting merged datasets..."})
            self.update_state(state="PROGRESS", meta=status_dict)
            # Swapping dimensions for plotting to work if time is not
            # an axis selection
            if axis_params["x"] != "time":
                merged = merged.swap_dims({"time": axis_params['x']})
            # Shading process
            final_dct, shaded, color_column = _plot_merged_dataset(
                merged, axis_params)
            x = final_dct.get(axis_params['x'], [])
            y = final_dct.get(axis_params['y'], [])
            z = []
            if axis_params['z']:
                z = final_dct.get(axis_params['z'], np.array([]))
            elif shaded:
                z = final_dct.get(color_column, np.array([]))

            result = ({
                "x": x,
                "y": y,
                "z": z,
                "count": data_count,
                "shaded": shaded,
            }, )
        logger.info("Result done.")
    # ================ End Compute results ========================

    if client is not None:
        # Cleans up dask
        client.close()

    if cluster is not None:
        cluster.close()
    return result
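
A stripped-down sketch of the zip-and-upload step from the download branch above; the bucket name and file contents are illustrative, and write access to the bucket is assumed:

import os
import zipfile
import fsspec

cache_location = "s3://my-download-bucket"   # illustrative bucket
fs = fsspec.get_mapper(cache_location).fs    # filesystem object behind the mapper

target_url = os.path.join(cache_location, "example.zip")
with fs.open(target_url, mode="wb") as f:
    with zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # The real task writes a meta.yaml plus the NetCDF/CSV parts.
        zf.writestr("meta.yaml", "reference_designators: []\n")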