class llc4320_benchmarks():
    """Zarr GCP tests on LLC4320 Datasets"""
    timer = timeit.default_timer
    timeout = 3600
    repeat = 1
    number = 1
    warmup_time = 0.0
    run_nums = np.arange(1, RUNS + 1)
    params = (['GCS'], [1], [60, 80, 100, 120, 140, 160], run_nums)
    #params = (['GCS'], [1], [60], run_nums)
    #params = getTestConfigValue("gcp_kubernetes_read_zarr.llc4320_benchmarks")
    param_names = ['backend', 'z_chunksize', 'n_workers', 'run_num']

    @bmt.test_gcp
    def setup(self, backend, z_chunksize, n_workers, run_num):
        self.cluster = KubeCluster(n_workers=n_workers)
        self.client = Client(self.cluster)
        bmt.cluster_wait(self.client, n_workers)
        self.target = target_zarr.ZarrStore(backend=backend, dask=True)
        # Open Zarr DS
        self.ds_zarr = self.target.open_store(DS_STORE)
        self.ds_zarr_theta = self.ds_zarr.Theta

    @bmt.test_gcp
    def time_read(self, backend, z_chunksize, n_workers, run_num):
        self.ds_zarr_theta.max().load(retries=RETRIES)

    @bmt.test_gcp
    def teardown(self, backend, z_chunksize, n_workers, run_num):
        del self.ds_zarr_theta
        self.cluster.close()
class llc4320_benchmarks():
    """netCDF GCP tests on LLC4320 Datasets"""
    timer = timeit.default_timer
    timeout = 3600
    repeat = 1
    number = 1
    warmup_time = 0.0
    run_nums = np.arange(1, RUNS + 1)
    #params = (['FUSE'], [90], [60, 80, 100, 120, 140, 160], run_nums)
    params = (['FUSE'], [10], [100], run_nums)
    param_names = ['backend', 'z_chunksize', 'n_workers', 'run_num']

    @bmt.test_gcp
    def setup(self, backend, z_chunksize, n_workers, run_num):
        self.cluster = KubeCluster(n_workers=n_workers)
        self.client = Client(self.cluster)
        bmt.cluster_wait(self.client, n_workers)
        self.target = target_zarr.ZarrStore(backend=backend, dask=True)
        # Open netCDF DS
        self.ds_netcdf = xr.open_mfdataset(DS_FILES, decode_cf=False,
                                           autoclose=True,
                                           chunks={'Z': z_chunksize})
        self.ds_netcdf_theta = self.ds_netcdf.Theta

    @bmt.test_gcp
    def time_read(self, backend, z_chunksize, n_workers, run_num):
        self.ds_netcdf_theta.max().load(retries=RETRIES)

    @bmt.test_gcp
    def teardown(self, backend, z_chunksize, n_workers, run_num):
        del self.ds_netcdf_theta
        self.cluster.close()
class synthetic_benchmarks():
    """Zarr GCP tests on random synthetic Datasets"""
    timer = timeit.default_timer
    timeout = 600
    repeat = 1
    number = 1
    warmup_time = 0.0
    run_nums = np.arange(1, RUNS + 1)
    params = (['GCS'], [1, 5, 10], np.arange(60, 160, 20), run_nums)
    param_names = ['backend', 'z_chunksize', 'n_workers', 'run_num']

    @bmt.test_gcp
    def setup(self, backend, z_chunksize, n_workers, run_num):
        self.cluster = KubeCluster(n_workers=n_workers)
        self.client = Client(self.cluster)
        bmt.cluster_wait(self.client, n_workers)
        self.chunks = (3000, 3000, z_chunksize)
        self.da = da.random.normal(10, 0.1, size=DS_DIM, chunks=self.chunks)
        self.target = target_zarr.ZarrStore(backend=backend, dask=True,
                                            chunksize=self.chunks,
                                            shape=self.da.shape,
                                            dtype=self.da.dtype)
        self.target.get_temp_filepath()

    @bmt.test_gcp
    def time_synthetic_write(self, backend, z_chunksize, n_workers, run_num):
        self.da.store(self.target.storage_obj, lock=False)

    @bmt.test_gcp
    def teardown(self, backend, z_chunksize, n_workers, run_num):
        del self.da
        self.cluster.close()
        self.target.rm_objects()
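# Illustrative sketch (not part of the benchmark suite): running one parameter
# combination of the classes above by hand, outside of asv's runner. It assumes
# the module-level fixtures those classes rely on (RUNS, DS_STORE, DS_FILES,
# DS_DIM, RETRIES, bmt, target_zarr, KubeCluster, Client) are available,
# exactly as in the classes themselves. The helper name is hypothetical.
def run_benchmark_once(bench_cls, timed_method, backend, z_chunksize,
                       n_workers, run_num=1):
    bench = bench_cls()
    bench.setup(backend, z_chunksize, n_workers, run_num)
    try:
        start = bench.timer()
        getattr(bench, timed_method)(backend, z_chunksize, n_workers, run_num)
        elapsed = bench.timer() - start
        print(f'{bench_cls.__name__}: {n_workers} workers -> {elapsed:.1f} s')
    finally:
        bench.teardown(backend, z_chunksize, n_workers, run_num)


# e.g. run_benchmark_once(synthetic_benchmarks, 'time_synthetic_write',
#                         'GCS', 1, 60)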
class DaskCluster(K8sRuntime):
    kind = 'dask'

    def __init__(self, kind=None, command=None, args=None, image=None,
                 metadata=None, build=None, volumes=None, volume_mounts=None,
                 env=None, resources=None, image_pull_policy=None,
                 service_account=None, extra_pip=None):
        args = args or ['dask-worker']
        super().__init__(kind, command, args, image, metadata, build, volumes,
                         volume_mounts, env, resources, image_pull_policy,
                         service_account)
        self._cluster = None
        self.extra_pip = extra_pip
        self.set_label('mlrun/class', self.kind)

    def to_pod(self):
        image = self.image or 'daskdev/dask:latest'
        env = self.env
        if self.extra_pip:
            env.append(self.extra_pip)
        container = client.V1Container(name='base',
                                       image=image,
                                       env=env,
                                       command=None,
                                       args=self.args,
                                       image_pull_policy=self.image_pull_policy,
                                       volume_mounts=self.volume_mounts,
                                       resources=self.resources)
        pod_spec = client.V1PodSpec(containers=[container],
                                    restart_policy='Never',
                                    volumes=self.volumes,
                                    service_account=self.service_account)
        meta = client.V1ObjectMeta(
            namespace=self.metadata.namespace or 'default-tenant',
            labels=self.metadata.labels,
            annotations=self.metadata.annotations)
        pod = client.V1Pod(metadata=meta, spec=pod_spec)
        return pod

    @property
    def initialized(self):
        return True if self._cluster else False

    @property
    def cluster(self):
        if not self._cluster:
            try:
                from dask_kubernetes import KubeCluster
            except ImportError as e:
                print('missing dask_kubernetes, please run '
                      '"pip install dask_kubernetes"')
                raise e
            self._cluster = KubeCluster(self.to_pod())
        return self._cluster

    @property
    def client(self):
        import distributed
        return distributed.Client(self.cluster)

    def close(self):
        from dask.distributed import default_client
        try:
            client = default_client()
            client.close()
        except ValueError:
            pass
        if self._cluster:
            self._cluster.close()
class DaskCluster(KubejobRuntime):
    kind = 'dask'
    _is_nested = False

    def __init__(self, spec=None, metadata=None):
        super().__init__(spec, metadata)
        self._cluster = None
        self.spec.build.base_image = self.spec.build.base_image or 'daskdev/dask:latest'
        self.set_label('mlrun/class', self.kind)

    @property
    def spec(self) -> DaskSpec:
        return self._spec

    @spec.setter
    def spec(self, spec):
        self._spec = self._verify_dict(spec, 'spec', DaskSpec)

    def to_pod(self):
        image = self._image_path() or 'daskdev/dask:latest'
        env = self.spec.env
        namespace = self.metadata.namespace or config.namespace
        if self.spec.extra_pip:
            env.append(self.spec.extra_pip)
        container = client.V1Container(name='base',
                                       image=image,
                                       env=env,
                                       command=None,
                                       args=self.spec.args,
                                       image_pull_policy=self.spec.image_pull_policy,
                                       volume_mounts=self.spec.volume_mounts,
                                       resources=self.spec.resources)
        pod_spec = client.V1PodSpec(containers=[container],
                                    restart_policy='Never',
                                    volumes=self.spec.volumes,
                                    service_account=self.spec.service_account)
        meta = client.V1ObjectMeta(namespace=namespace,
                                   labels=self.metadata.labels,
                                   annotations=self.metadata.annotations)
        pod = client.V1Pod(metadata=meta, spec=pod_spec)
        return pod

    @property
    def initialized(self):
        return True if self._cluster else False

    def cluster(self, scale=0):
        if not self._cluster:
            try:
                from dask_kubernetes import KubeCluster
                from dask.distributed import Client
            except ImportError as e:
                print('missing dask_kubernetes, please run '
                      '"pip install dask_kubernetes"')
                raise e
            self._cluster = KubeCluster(self.to_pod())
            if not scale:
                self._cluster.adapt()
            else:
                self._cluster.scale(scale)
            Client(self._cluster)
        return self._cluster

    @property
    def client(self):
        from dask.distributed import Client, default_client
        try:
            return default_client()
        except ValueError:
            if self._cluster:
                return Client(self._cluster)
            return Client()

    def close(self):
        from dask.distributed import default_client
        try:
            client = default_client()
            client.close()
        except ValueError:
            pass
        if self._cluster:
            self._cluster.close()

    def _run(self, runobj: RunObject, execution):
        handler = runobj.spec.handler
        self._force_handler(handler)
        from dask import delayed
        if self.spec.rundb:
            # todo: remote dask via k8s spec env
            environ['MLRUN_DBPATH'] = self.spec.rundb

        arg_list = get_func_arg(handler, runobj, execution)
        out = None
        try:
            task = delayed(handler)(*arg_list)
            out = task.compute()
        except Exception as e:
            err = str(e)
            execution.set_state(error=err)

        if out:
            execution.log_result('return', out)

        return execution.to_dict()

    def _run_many(self, tasks, execution, runobj: RunObject):
        handler = runobj.spec.handler
        self._force_handler(handler)
        futures = []
        contexts = []
        tasks = list(tasks)
        for task in tasks:
            ctx = MLClientCtx.from_dict(task.to_dict(), self.spec.rundb,
                                        autocommit=True)
            args = get_func_arg(handler, task, ctx)
            resp = self.client.submit(handler, *args)
            futures.append(resp)
            contexts.append(ctx)

        resps = self.client.gather(futures)
        results = RunList()
        for r, c, t in zip(resps, contexts, tasks):
            if r:
                c.log_result('return', r)
            # todo: handle task errors
            resp = self._post_run(task=t)
            results.append(resp)

        print(resps)
        return results
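# Illustrative sketch (an assumption, not mlrun's documented entry point):
# driving the DaskCluster runtime above directly. Only methods defined above
# are used; how the object is normally constructed inside mlrun may differ,
# and the helper name is hypothetical.
def run_on_dask_cluster(dask_runtime, handler, *args):
    # Build (or reuse) the KubeCluster backing this runtime; scale=2 pins two
    # workers instead of letting the cluster adapt.
    dask_runtime.cluster(scale=2)
    client = dask_runtime.client
    try:
        future = client.submit(handler, *args)
        return future.result()
    finally:
        # Shuts down both the distributed client and the KubeCluster pods.
        dask_runtime.close()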
def fetch(
    self,
    request_params,
    axis_params,
    start_dt,
    end_dt,
    download=False,
    download_format='netcdf',
    status_dict={},
    max_nfiles=50,
    max_partition_sizes={
        'netcdf': '100MB',
        'csv': '10MB'
    },
):
    self.update_state(state="PROGRESS", meta=status_dict)
    ds_list = get_delayed_ds(request_params, axis_params)

    status_dict.update({"msg": f"{len(request_params)} datasets requested."})
    self.update_state(state="PROGRESS", meta=status_dict)

    max_data_size = np.sum([v['total_size'] for v in ds_list.values()])
    max_mem_size = max_data_size / 1024**3

    dask_spec = {'min_workers': 1, 'max_workers': 2}
    data_threshold = int(os.environ.get('DATA_THRESHOLD', 50))
    client = None
    cluster = None
    if max_mem_size > data_threshold:
        image_repo, image_name, image_tag = (
            'cormorack',
            'cava-dask',
            '20210610',
        )
        desired_image = os.environ.get(
            "DASK_DOCKER_IMAGE", f"{image_repo}/{image_name}:{image_tag}")
        match = re.match(r"(.+)/(.+):(.+)", desired_image)
        if match is not None:
            image_repo, image_name, image_tag = match.groups()
        dask_spec = determine_workers(
            max_mem_size,
            image_repo=image_repo,
            image_name=image_name,
            image_tag=image_tag,
        )
        status_dict.update({
            "msg": f"Setting up distributed computing cluster. Max data size: {memory_repr(max_data_size)}"
        })
        self.update_state(state="PROGRESS", meta=status_dict)
        cluster = KubeCluster(
            dask_spec['pod_spec'],
            n_workers=dask_spec['min_workers'],
        )
        cluster.adapt(minimum=dask_spec['min_workers'],
                      maximum=dask_spec['max_workers'])
        client = Client(cluster)

    # TODO: Need to add other parameters for multidimensional
    # need a check for nutnr,pco2,ph,optaa add int_ctd_pressure
    # parameters.append("int_ctd_pressure")

    # for spikr
    # parameters.append("spectra")
    status_dict.update({"msg": "Retrieving data from zarr store ..."})
    self.update_state(state="PROGRESS", meta=status_dict)
    data_list = {
        k: v['dataset'].sel(time=(start_dt, end_dt)).dataset
        for k, v in ds_list.items()
    }

    status_dict.update({"msg": "Validating datasets..."})
    self.update_state(state="PROGRESS", meta=status_dict)
    if any(True for v in data_list.values() if v is None):
        # Checks whether any requested dataset came back empty
        status_dict.update(
            {"msg": "One of the datasets does not contain data."})
        self.update_state(state="PROGRESS", meta=status_dict)
        time.sleep(2)
        result = None
    elif any(True for v in data_list.values() if len(v.time) == 0):
        empty_streams = []
        for k, v in data_list.items():
            if len(v.time) == 0:
                empty_streams.append(k)
        status_dict.update(
            {"msg": f"Empty data stream(s) found: {','.join(empty_streams)}."})
        self.update_state(state="PROGRESS", meta=status_dict)
        time.sleep(2)
        status_dict.update({
            "msg": "Plot creation is not possible with specified parameters. Please try again."
        })
        self.update_state(state="PROGRESS", meta=status_dict)
        time.sleep(2)
        result = None
    else:
        total_requested_size = np.sum(
            np.fromiter((v.nbytes for v in data_list.values()), dtype=int))
        status_dict.update({
            "msg": f"There are {memory_repr(total_requested_size)} of data to be processed."
        })
        self.update_state(state="PROGRESS", meta=status_dict)
        if len(data_list.keys()) > 1:
            merged = _merge_datasets(data_list, start_dt, end_dt)
        else:
            merged = next(ds for _, ds in data_list.items())

        data_count = len(merged.time)

        if data_count == 0:
            status_dict.update(
                {"msg": "Merged dataset does not contain data."})
            self.update_state(state="PROGRESS", meta=status_dict)
            result = None
        elif data_count > 0 and download:
            status_dict.update({"msg": "Preparing dataset for download..."})
            self.update_state(state="PROGRESS", meta=status_dict)
            format_ext = {'netcdf': 'nc', 'csv': 'csv'}
            start_dt_str = parser.parse(start_dt).strftime('%Y%m%dT%H%M%S')
            end_dt_str = parser.parse(end_dt).strftime('%Y%m%dT%H%M%S')
            dstring = f"{start_dt_str}_{end_dt_str}"
            continue_download = True
            if download_format == 'csv':
                ddf = merged.to_dask_dataframe().repartition(
                    partition_size=max_partition_sizes[download_format])
                # Max npartitions to 50
                if ddf.npartitions > max_nfiles:
                    message = "The amount of data to be downloaded is too large for CSV data format. Please make a smaller request."
                    result = {
                        "file_url": None,
                        "msg": message,
                    }
                    continue_download = False
                else:
                    ncfile = dstring
                    outglob = os.path.join(ncfile,
                                           f'*.{format_ext[download_format]}')
                    ddf.to_csv(outglob, index=False)
            elif download_format == 'netcdf':
                max_chunk_size = dask.utils.parse_bytes(
                    max_partition_sizes[download_format])
                smallest_chunk = math.ceil(merged.time.shape[0] /
                                           (merged.nbytes / max_chunk_size))
                slices = [
                    (i, i + smallest_chunk)
                    for i in range(0, merged.time.shape[0], smallest_chunk)
                ]
                # Max npartitions to 50
                if len(slices) > max_nfiles:
                    message = "The amount of data to be downloaded is too large for NetCDF data format. Please make a smaller request."
                    result = {
                        "file_url": None,
                        "msg": message,
                    }
                    continue_download = False
                else:
                    if len(slices) == 1:
                        ncfile = f"{dstring}.{format_ext[download_format]}"
                        merged.to_netcdf(ncfile)
                    else:
                        ncfile = dstring
                        outglob = os.path.join(
                            ncfile, f'*.{format_ext[download_format]}')
                        if not os.path.exists(ncfile):
                            os.mkdir(ncfile)
                        for idx, sl in enumerate(slices):
                            nc_name = f"{idx}.nc"
                            part_ds = merged.isel(time=slice(*sl))
                            part_ds.to_netcdf(os.path.join(ncfile, nc_name))

            if continue_download:
                zipname = (
                    f"CAVA_{datetime.utcnow().strftime('%Y%m%dT%H%M%S')}.zip")
                download_bucket = "ooi-data-download"
                cache_location = f"s3://{download_bucket}"

                fs = fsspec.get_mapper(cache_location).fs

                target_url = os.path.join(cache_location,
                                          os.path.basename(zipname))
                with fs.open(target_url, mode='wb') as f:
                    with zipfile.ZipFile(
                            f, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
                        status_dict.update({"msg": "Creating zip file..."})
                        self.update_state(state="PROGRESS", meta=status_dict)
                        zf.writestr(
                            'meta.yaml',
                            yaml.dump({
                                'reference_designators': request_params,
                                'axis_parameters': axis_params,
                                'start_datetime': start_dt,
                                'end_datetime': end_dt,
                            }),
                        )
                        if os.path.isdir(ncfile):
                            # if ncfile is a directory,
                            # there should be an outglob variable
                            data_files = sorted(glob.glob(outglob))
                            for data_file in data_files:
                                zf.write(data_file)
                            shutil.rmtree(ncfile)
                        else:
                            zf.write(ncfile)
                            os.unlink(ncfile)

                download_url = f"https://{download_bucket}.s3.us-west-2.amazonaws.com/{zipname}"
                result = {"file_url": download_url}
        else:
            status_dict.update({"msg": "Plotting merged datasets..."})
            self.update_state(state="PROGRESS", meta=status_dict)
            # Swapping dimensions for plotting to work if time is not
            # an axis selection
            if axis_params["x"] != "time":
                merged = merged.swap_dims({"time": axis_params['x']})

            # Shading process
            final_dct, shaded, color_column = _plot_merged_dataset(
                merged, axis_params)
            x = final_dct.get(axis_params['x'], [])
            y = final_dct.get(axis_params['y'], [])
            z = []
            if axis_params['z']:
                z = final_dct.get(axis_params['z'], np.array([]))
            elif shaded:
                z = final_dct.get(color_column, np.array([]))

            result = ({
                "x": x,
                "y": y,
                "z": z,
                "count": data_count,
                "shaded": shaded,
            }, )
            logger.info("Result done.")
    # ================ End Compute results ========================

    if client is not None:
        # Cleans up dask
        client.close()

    if cluster is not None:
        cluster.close()

    return result
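# Illustrative sketch (an assumption, isolated for clarity): the adaptive Dask
# cluster lifecycle that fetch() above uses when the requested data exceeds
# the threshold. The 'pod_spec' / 'min_workers' / 'max_workers' keys follow
# the dask_spec usage shown in fetch(); determine_workers() itself lives
# elsewhere in this codebase, and the helper names here are hypothetical.
from dask.distributed import Client
from dask_kubernetes import KubeCluster


def start_adaptive_cluster(dask_spec):
    cluster = KubeCluster(
        dask_spec['pod_spec'],
        n_workers=dask_spec['min_workers'],
    )
    # Let the scheduler add or remove workers between the configured bounds.
    cluster.adapt(minimum=dask_spec['min_workers'],
                  maximum=dask_spec['max_workers'])
    client = Client(cluster)
    return cluster, client


def stop_adaptive_cluster(cluster, client):
    # Mirror the cleanup at the end of fetch(): close the client first,
    # then tear down the KubeCluster pods.
    if client is not None:
        client.close()
    if cluster is not None:
        cluster.close()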