Example #1
class llc4320_benchmarks:
    """Zarr GCP tests on LLC4320 datasets."""
    timer = timeit.default_timer
    timeout = 3600
    repeat = 1
    number = 1
    warmup_time = 0.0
    run_nums = np.arange(1, RUNS + 1)
    params = (['GCS'], [1], [60, 80, 100, 120, 140, 160], run_nums)
    #params = (['GCS'], [1], [60], run_nums)
    #params = getTestConfigValue("gcp_kubernetes_read_zarr.llc4320_benchmarks")
    param_names = ['backend', 'z_chunksize', 'n_workers', 'run_num']

    @bmt.test_gcp
    def setup(self, backend, z_chunksize, n_workers, run_num):
        self.cluster = KubeCluster(n_workers=n_workers)
        self.client = Client(self.cluster)
        bmt.cluster_wait(self.client, n_workers)
        self.target = target_zarr.ZarrStore(backend=backend, dask=True)
        # Open Zarr DS
        self.ds_zarr = self.target.open_store(DS_STORE)
        self.ds_zarr_theta = self.ds_zarr.Theta

    @bmt.test_gcp
    def time_read(self, backend, z_chunksize, n_workers, run_num):
        self.ds_zarr_theta.max().load(retries=RETRIES)

    @bmt.test_gcp
    def teardown(self, backend, z_chunksize, n_workers, run_num):
        del self.ds_zarr_theta
        self.cluster.close()


class llc4320_netcdf_benchmarks:
    """netCDF GCP tests on LLC4320 datasets."""
    timer = timeit.default_timer
    timeout = 3600
    repeat = 1
    number = 1
    warmup_time = 0.0
    run_nums = np.arange(1, RUNS + 1)
    #params = (['FUSE'], [90], [60, 80, 100, 120, 140, 160], run_nums)
    params = (['FUSE'], [10], [100], run_nums)
    param_names = ['backend', 'z_chunksize', 'n_workers', 'run_num']

    @bmt.test_gcp
    def setup(self, backend, z_chunksize, n_workers, run_num):
        self.cluster = KubeCluster(n_workers=n_workers)
        self.client = Client(self.cluster)
        bmt.cluster_wait(self.client, n_workers)
        self.target = target_zarr.ZarrStore(backend=backend, dask=True)
        # Open netCDF DS
        self.ds_netcdf = xr.open_mfdataset(DS_FILES,
                                           decode_cf=False,
                                           autoclose=True,
                                           chunks={'Z': z_chunksize})
        self.ds_netcdf_theta = self.ds_netcdf.Theta

    @bmt.test_gcp
    def time_read(self, backend, z_chunksize, n_workers, run_num):
        self.ds_netcdf_theta.max().load(retries=RETRIES)

    @bmt.test_gcp
    def teardown(self, backend, z_chunksize, n_workers, run_num):
        del self.ds_netcdf_theta
        self.cluster.close()


class synthetic_benchmarks:
    """Zarr GCP tests on random synthetic datasets."""
    timer = timeit.default_timer
    timeout = 600
    repeat = 1
    number = 1
    warmup_time = 0.0
    run_nums = np.arange(1, RUNS + 1)
    params = (['GCS'], [1, 5, 10], np.arange(60, 160, 20), run_nums)
    param_names = ['backend', 'z_chunksize', 'n_workers', 'run_num']

    @bmt.test_gcp
    def setup(self, backend, z_chunksize, n_workers, run_num):
        self.cluster = KubeCluster(n_workers=n_workers)
        self.client = Client(self.cluster)
        bmt.cluster_wait(self.client, n_workers)
        self.chunks = (3000, 3000, z_chunksize)
        self.da = da.random.normal(10, 0.1, size=DS_DIM, chunks=self.chunks)
        self.target = target_zarr.ZarrStore(backend=backend,
                                            dask=True,
                                            chunksize=self.chunks,
                                            shape=self.da.shape,
                                            dtype=self.da.dtype)
        self.target.get_temp_filepath()

    @bmt.test_gcp
    def time_synthetic_write(self, backend, z_chunksize, n_workers, run_num):
        self.da.store(self.target.storage_obj, lock=False)

    @bmt.test_gcp
    def teardown(self, backend, z_chunksize, n_workers, run_num):
        del self.da
        self.cluster.close()
        self.target.rm_objects()
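
All three benchmark classes above share the same lifecycle: setup() launches a dask-kubernetes KubeCluster with n_workers pods, waits for the workers, and opens (or generates) the data; the timed method runs a single reduction or store; teardown() closes the cluster. The sketch below isolates that lifecycle using the classic dask_kubernetes API that the examples rely on; worker-spec.yml is a placeholder pod spec path, and the helpers used above (bmt, target_zarr, RUNS, RETRIES, DS_STORE, DS_FILES, DS_DIM) belong to the surrounding benchmark package and are not reproduced here.

from dask_kubernetes import KubeCluster
from dask.distributed import Client

# Start a cluster of Kubernetes worker pods from a pod spec file
# (placeholder path; any valid dask worker pod spec will do).
cluster = KubeCluster.from_yaml('worker-spec.yml')
cluster.scale(4)                 # the benchmarks above sweep 60-160 workers
client = Client(cluster)
client.wait_for_workers(4)       # roughly what bmt.cluster_wait does

try:
    # ... open the Zarr/netCDF store and run the timed reduction here ...
    pass
finally:
    client.close()
    cluster.close()
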
Example #4
class DaskCluster(K8sRuntime):
    kind = 'dask'

    def __init__(self,
                 kind=None,
                 command=None,
                 args=None,
                 image=None,
                 metadata=None,
                 build=None,
                 volumes=None,
                 volume_mounts=None,
                 env=None,
                 resources=None,
                 image_pull_policy=None,
                 service_account=None,
                 extra_pip=None):
        args = args or ['dask-worker']
        super().__init__(kind, command, args, image, metadata, build, volumes,
                         volume_mounts, env, resources, image_pull_policy,
                         service_account)
        self._cluster = None
        self.extra_pip = extra_pip
        self.set_label('mlrun/class', self.kind)

    def to_pod(self):
        image = self.image or 'daskdev/dask:latest'
        env = self.env
        if self.extra_pip:
            env.append(self.extra_pip)
        container = client.V1Container(
            name='base',
            image=image,
            env=env,
            command=None,
            args=self.args,
            image_pull_policy=self.image_pull_policy,
            volume_mounts=self.volume_mounts,
            resources=self.resources)

        pod_spec = client.V1PodSpec(containers=[container],
                                    restart_policy='Never',
                                    volumes=self.volumes,
                                    service_account=self.service_account)

        meta = client.V1ObjectMeta(namespace=self.metadata.namespace
                                   or 'default-tenant',
                                   labels=self.metadata.labels,
                                   annotations=self.metadata.annotations)

        pod = client.V1Pod(metadata=meta, spec=pod_spec)
        return pod

    @property
    def initialized(self):
        return bool(self._cluster)

    @property
    def cluster(self):
        if not self._cluster:
            try:
                from dask_kubernetes import KubeCluster
            except ImportError as e:
                print(
                    'missing dask_kubernetes, please run "pip install dask_kubernetes"'
                )
                raise e
            self._cluster = KubeCluster(self.to_pod())
        return self._cluster

    @property
    def client(self):
        import distributed
        return distributed.Client(self.cluster)

    def close(self):
        from dask.distributed import default_client
        try:
            client = default_client()
            client.close()
        except ValueError:
            pass
        if self._cluster:
            self._cluster.close()
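
A rough usage sketch for the runtime above, not a documented mlrun entry point: it assumes the DaskCluster class as defined here, an installed mlrun/K8sRuntime base, dask_kubernetes, and a Kubernetes context that can schedule the worker pod.

# Hypothetical driver for the DaskCluster runtime defined above.
fn = DaskCluster(image='daskdev/dask:latest',
                 args=['dask-worker', '--nthreads', '2'])

client = fn.client            # builds the KubeCluster lazily via to_pod()
futures = client.map(lambda x: x * 2, range(10))
print(client.gather(futures))

fn.close()                    # closes the default client and the KubeCluster
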
Example #5
class DaskCluster(KubejobRuntime):
    kind = 'dask'
    _is_nested = False

    def __init__(self, spec=None, metadata=None):
        super().__init__(spec, metadata)
        self._cluster = None
        self.spec.build.base_image = self.spec.build.base_image or 'daskdev/dask:latest'
        self.set_label('mlrun/class', self.kind)

    @property
    def spec(self) -> DaskSpec:
        return self._spec

    @spec.setter
    def spec(self, spec):
        self._spec = self._verify_dict(spec, 'spec', DaskSpec)

    def to_pod(self):
        image = self._image_path() or 'daskdev/dask:latest'
        env = self.spec.env
        namespace = self.metadata.namespace or config.namespace
        if self.spec.extra_pip:
            env.append(self.spec.extra_pip)
        container = client.V1Container(name='base',
                                       image=image,
                                       env=env,
                                       command=None,
                                       args=self.spec.args,
                                       image_pull_policy=self.spec.image_pull_policy,
                                       volume_mounts=self.spec.volume_mounts,
                                       resources=self.spec.resources)

        pod_spec = client.V1PodSpec(containers=[container],
                                    restart_policy='Never',
                                    volumes=self.spec.volumes,
                                    service_account=self.spec.service_account)

        meta = client.V1ObjectMeta(namespace=namespace,
                                   labels=self.metadata.labels,
                                   annotations=self.metadata.annotations)

        pod = client.V1Pod(metadata=meta, spec=pod_spec)
        return pod

    @property
    def initialized(self):
        return bool(self._cluster)

    def cluster(self, scale=0):
        if not self._cluster:
            try:
                from dask_kubernetes import KubeCluster
                from dask.distributed import Client
            except ImportError as e:
                print('missing dask_kubernetes, please run "pip install dask_kubernetes"')
                raise e
            self._cluster = KubeCluster(self.to_pod())
            if not scale:
                self._cluster.adapt()
            else:
                self._cluster.scale(scale)
            Client(self._cluster)
        return self._cluster

    @property
    def client(self):
        from dask.distributed import Client, default_client
        try:
            return default_client()
        except ValueError:
            if self._cluster:
                return Client(self._cluster)
            return Client()

    def close(self):
        from dask.distributed import default_client
        try:
            client = default_client()
            client.close()
        except ValueError:
            pass
        if self._cluster:
            self._cluster.close()

    def _run(self, runobj: RunObject, execution):
        handler = runobj.spec.handler
        self._force_handler(handler)
        from dask import delayed
        if self.spec.rundb:
            # todo: remote dask via k8s spec env
            environ['MLRUN_DBPATH'] = self.spec.rundb

        arg_list = get_func_arg(handler, runobj, execution)
        out = None
        try:
            task = delayed(handler)(*arg_list)
            out = task.compute()
        except Exception as e:
            err = str(e)
            execution.set_state(error=err)

        if out:
            execution.log_result('return', out)

        return execution.to_dict()

    def _run_many(self, tasks, execution, runobj: RunObject):
        handler = runobj.spec.handler
        self._force_handler(handler)
        futures = []
        contexts = []
        tasks = list(tasks)
        for task in tasks:
            ctx = MLClientCtx.from_dict(task.to_dict(),
                                        self.spec.rundb,
                                        autocommit=True)
            args = get_func_arg(handler, task, ctx)
            resp = self.client.submit(handler, *args)
            futures.append(resp)
            contexts.append(ctx)

        resps = self.client.gather(futures)
        results = RunList()
        for r, c, t in zip(resps, contexts, tasks):
            if r:
                c.log_result('return', r)
            # todo: handle task errors
            resp = self._post_run(task=t)
            results.append(resp)

        print(resps)
        return results
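
_run above amounts to wrapping the handler in dask.delayed and calling compute() against the currently connected client, while _run_many submits each task with client.submit and collects the results with client.gather. A minimal sketch of both patterns with plain Dask, no mlrun objects involved:

from dask import delayed
from dask.distributed import Client

client = Client()            # local cluster; use Client(cluster) for a KubeCluster

def handler(a, b):
    return a + b

# Single task: build the graph lazily, then execute on the connected scheduler
out = delayed(handler)(2, 3).compute()

# Many tasks: submit each one and gather the results
futures = [client.submit(handler, i, i) for i in range(5)]
results = client.gather(futures)

print(out, results)          # 5 [0, 2, 4, 6, 8]
client.close()
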
Example #6
def fetch(
    self,
    request_params,
    axis_params,
    start_dt,
    end_dt,
    download=False,
    download_format='netcdf',
    status_dict=None,
    max_nfiles=50,
    max_partition_sizes={
        'netcdf': '100MB',
        'csv': '10MB'
    },
):
    if status_dict is None:
        status_dict = {}
    self.update_state(
        state="PROGRESS",
        meta=status_dict,
    )
    ds_list = get_delayed_ds(request_params, axis_params)

    status_dict.update({"msg": f"{len(request_params)} datasets requested."})
    self.update_state(state="PROGRESS", meta=status_dict)

    max_data_size = np.sum([v['total_size'] for v in ds_list.values()])
    max_mem_size = max_data_size / 1024**3  # bytes -> GiB

    dask_spec = {'min_workers': 1, 'max_workers': 2}
    data_threshold = int(os.environ.get('DATA_THRESHOLD', 50))  # GiB

    client = None
    cluster = None

    if max_mem_size > data_threshold:
        image_repo, image_name, image_tag = (
            'cormorack',
            'cava-dask',
            '20210610',
        )
        desired_image = os.environ.get(
            "DASK_DOCKER_IMAGE", f"{image_repo}/{image_name}:{image_tag}")
        match = re.match(r"(.+)/(.+):(.+)", desired_image)
        if match is not None:
            image_repo, image_name, image_tag = match.groups()
        dask_spec = determine_workers(
            max_mem_size,
            image_repo=image_repo,
            image_name=image_name,
            image_tag=image_tag,
        )

        status_dict.update({
            "msg":
            f"Setting up distributed computing cluster. Max data size: {memory_repr(max_data_size)}"
        })
        self.update_state(state="PROGRESS", meta=status_dict)
        cluster = KubeCluster(
            dask_spec['pod_spec'],
            n_workers=dask_spec['min_workers'],
        )
        cluster.adapt(minimum=dask_spec['min_workers'],
                      maximum=dask_spec['max_workers'])
        client = Client(cluster)
    # TODO: Need to add other parameters for multidimensional
    # need a check for nutnr,pco2,ph,optaa add int_ctd_pressure
    # parameters.append("int_ctd_pressure")

    # for spikr
    # parameters.append("spectra")
    status_dict.update({"msg": "Retrieving data from zarr store ..."})
    self.update_state(state="PROGRESS", meta=status_dict)
    data_list = {
        k: v['dataset'].sel(time=(start_dt, end_dt)).dataset
        for k, v in ds_list.items()
    }

    status_dict.update({"msg": "Validating datasets..."})
    self.update_state(state="PROGRESS", meta=status_dict)
    if any(v is None for v in data_list.values()):
        # Check whether any requested dataset is missing entirely
        status_dict.update(
            {"msg": "One of the datasets does not contain data."})
        self.update_state(state="PROGRESS", meta=status_dict)
        time.sleep(2)
        result = None
    elif any(len(v.time) == 0 for v in data_list.values()):
        empty_streams = [
            k for k, v in data_list.items() if len(v.time) == 0
        ]
        # Report which data streams came back empty
        status_dict.update(
            {"msg": f"Empty data stream(s) found: {','.join(empty_streams)}."})
        self.update_state(state="PROGRESS", meta=status_dict)
        time.sleep(2)
        status_dict.update({
            "msg":
            "Plot creation is not possible with specified parameters. Please try again."
        })
        self.update_state(state="PROGRESS", meta=status_dict)
        time.sleep(2)
        result = None
    else:
        total_requested_size = np.sum(
            np.fromiter((v.nbytes for v in data_list.values()), dtype=int))
        status_dict.update({
            "msg":
            f"There are {memory_repr(total_requested_size)} of data to be processed."
        })
        self.update_state(state="PROGRESS", meta=status_dict)
        if len(data_list) > 1:
            merged = _merge_datasets(data_list, start_dt, end_dt)
        else:
            merged = next(iter(data_list.values()))

        data_count = len(merged.time)

        if data_count == 0:
            status_dict.update(
                {"msg": "Merged dataset does not contain data."})
            self.update_state(state="PROGRESS", meta=status_dict)
            result = None
        elif data_count > 0 and download:
            status_dict.update({"msg": "Preparing dataset for download..."})
            self.update_state(state="PROGRESS", meta=status_dict)
            format_ext = {'netcdf': 'nc', 'csv': 'csv'}
            start_dt_str = parser.parse(start_dt).strftime('%Y%m%dT%H%M%S')
            end_dt_str = parser.parse(end_dt).strftime('%Y%m%dT%H%M%S')
            dstring = f"{start_dt_str}_{end_dt_str}"
            continue_download = True

            if download_format == 'csv':
                ddf = merged.to_dask_dataframe().repartition(
                    partition_size=max_partition_sizes[download_format])
                # Cap the number of CSV part files at max_nfiles
                if ddf.npartitions > max_nfiles:
                    message = "The amount of data to be downloaded is too large for CSV data format. Please make a smaller request."
                    result = {
                        "file_url": None,
                        "msg": message,
                    }
                    continue_download = False
                else:
                    ncfile = dstring
                    outglob = os.path.join(ncfile,
                                           f'*.{format_ext[download_format]}')
                    ddf.to_csv(outglob, index=False)
            elif download_format == 'netcdf':
                max_chunk_size = dask.utils.parse_bytes(
                    max_partition_sizes[download_format])
                # Time steps per file so each part stays under the
                # maximum partition size
                smallest_chunk = math.ceil(merged.time.shape[0] /
                                           (merged.nbytes / max_chunk_size))
                slices = [
                    (i, i + smallest_chunk)
                    for i in range(0, merged.time.shape[0], smallest_chunk)
                ]
                # Cap the number of NetCDF part files at max_nfiles
                if len(slices) > max_nfiles:
                    message = "The amount of data to be downloaded is too large for NetCDF data format. Please make a smaller request."
                    result = {
                        "file_url": None,
                        "msg": message,
                    }
                    continue_download = False
                else:
                    if len(slices) == 1:
                        ncfile = f"{dstring}.{format_ext[download_format]}"
                        merged.to_netcdf(ncfile)
                    else:
                        ncfile = dstring
                        outglob = os.path.join(
                            ncfile, f'*.{format_ext[download_format]}')
                        if not os.path.exists(ncfile):
                            os.mkdir(ncfile)
                        for idx, sl in enumerate(slices):
                            nc_name = f"{idx}.nc"
                            part_ds = merged.isel(time=slice(*sl))
                            part_ds.to_netcdf(os.path.join(ncfile, nc_name))

            if continue_download:
                zipname = (
                    f"CAVA_{datetime.utcnow().strftime('%Y%m%dT%H%M%S')}.zip")

                download_bucket = "ooi-data-download"
                cache_location = f"s3://{download_bucket}"

                fs = fsspec.get_mapper(cache_location).fs

                target_url = os.path.join(cache_location,
                                          os.path.basename(zipname))
                with fs.open(target_url, mode='wb') as f:
                    with zipfile.ZipFile(
                            f, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
                        status_dict.update({"msg": "Creating zip file..."})
                        self.update_state(state="PROGRESS", meta=status_dict)
                        zf.writestr(
                            'meta.yaml',
                            yaml.dump({
                                'reference_designators': request_params,
                                'axis_parameters': axis_params,
                                'start_datetime': start_dt,
                                'end_datetime': end_dt,
                            }),
                        )
                        if os.path.isdir(ncfile):
                            # if ncfile is directory,
                            # there should be an outglob variable
                            data_files = sorted(glob.glob(outglob))
                            for data_file in data_files:
                                zf.write(data_file)
                            shutil.rmtree(ncfile)
                        else:
                            zf.write(ncfile)
                            os.unlink(ncfile)
                download_url = f"https://{download_bucket}.s3.us-west-2.amazonaws.com/{zipname}"
                result = {"file_url": download_url}
        else:
            status_dict.update({"msg": "Plotting merged datasets..."})
            self.update_state(state="PROGRESS", meta=status_dict)
            # Swap dimensions so plotting works when time is not
            # the selected x axis
            if axis_params["x"] != "time":
                merged = merged.swap_dims({"time": axis_params['x']})
            # Shading process
            final_dct, shaded, color_column = _plot_merged_dataset(
                merged, axis_params)
            x = final_dct.get(axis_params['x'], [])
            y = final_dct.get(axis_params['y'], [])
            z = []
            if axis_params['z']:
                z = final_dct.get(axis_params['z'], np.array([]))
            elif shaded:
                z = final_dct.get(color_column, np.array([]))

            result = ({
                "x": x,
                "y": y,
                "z": z,
                "count": data_count,
                "shaded": shaded,
            }, )
        logger.info("Result done.")
    # ================ End Compute results ========================

    if client is not None:
        # Cleans up dask
        client.close()

    if cluster is not None:
        cluster.close()
    return result
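
One reusable piece of the task above is how the final archive is zipped straight into object storage through fsspec instead of being written to local disk first. A minimal sketch of that step, assuming fsspec with s3fs installed, S3 credentials in the environment, and placeholder bucket and file names:

import fsspec
import zipfile

fs = fsspec.get_mapper("s3://my-download-bucket").fs      # placeholder bucket
with fs.open("s3://my-download-bucket/archive.zip", mode="wb") as f:
    with zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        zf.writestr("meta.yaml", "request: example\n")    # in-memory metadata
        # Local NetCDF/CSV parts are added the same way:
        # zf.write("0.nc")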