import os
from typing import Callable

from distributed import Client, Future, get_client
from distributed.scheduler import TaskState

# Project-local imports (assumed relative imports from the surrounding package).
from .delegate import Delegate, JobState, JobStatus
from .config import DaskDelegateConfig


class DaskDelegate(Delegate):
    type: str = "dask"

    def __init__(self, delegate_config: DaskDelegateConfig):
        super().__init__()

        self.delegate_config = delegate_config
        self.cache_provider = self.delegate_config.cache_provider

        # Attempt to load the global Dask client.
        try:
            self.client = get_client()
        except ValueError:
            if self.delegate_config.kube_cluster is not None:
                self.client = Client(self.delegate_config.kube_cluster)
                print(self.delegate_config.kube_cluster)
            else:
                self.client = Client(f"{self.delegate_config.dask_cluster_address}:{self.delegate_config.dask_cluster_port}")

        # Set up functions to be run on the scheduler.
        def __scheduler_job_exists(dask_scheduler, job_id: str) -> bool:
            return job_id in dask_scheduler.tasks

        def __scheduler_job_state(dask_scheduler, job_id: str) -> TaskState:
            return dask_scheduler.tasks[job_id].state

        self.scheduler_job_exists = __scheduler_job_exists
        self.scheduler_job_state = __scheduler_job_state

    def __job_state(self, job_id: str) -> TaskState:
        return self.client.run_on_scheduler(self.scheduler_job_state, job_id=job_id)

    def connect(self) -> bool:
        # No need to connect.
        return True

    def test_connection(self) -> bool:
        # Shim this out until there is a good way to test a Dask and Redis connection.
        return True

    def create_job(self, job_id: str) -> bool:
        # No concept of creating a job.
        return True

    def start_job(self, job_id: str, work: Callable, *args, **kwargs) -> bool:
        if self.job_exists(job_id) or self.job_complete(job_id):
            return False

        # Parse and replace instances of the internal `result://` proxy protocol.
        # In short, this allows callees to reference an in-progress or remote job
        # without needing direct access to its data.
        function_args = [
            (self.client.get_dataset(arg.replace("result://", ""))
             if isinstance(arg, str) and arg.startswith("result://") else arg)
            for arg in args
        ]

        # Create a job to run the desired function.
        job_future: Future = self.client.submit(work, *function_args, **kwargs, key=job_id, pure=False)

        # Start an additional cache job which depends on the result of the previous one.
        cache_future: Future = self.client.submit(self.cache_provider.put, job_id, job_future, pure=False)

        # Publish the job as a dataset to maintain state across requests.
        self.client.publish_dataset(job_future, name=job_id, override=True)
        self.client.publish_dataset(cache_future, override=True)

        return True

    def stop_job(self, job_id: str) -> bool:
        if not self.job_exists(job_id):
            return False

        try:
            # Iterate through the dependencies of this job.
            dependencies = self.client.run_on_scheduler(
                lambda dask_scheduler: [state.key for state in dask_scheduler.tasks[job_id].dependencies]
            )

            # Filter out any weak dependencies. Strong dependencies are suffixed
            # with "/" and the name of the job.
            dependencies = [dependency for dependency in dependencies
                            if dependency.replace(job_id, "").startswith("/")]

            futures = [Future(key) for key in dependencies]
            futures.append(Future(job_id))
            self.client.cancel(futures)
        except KeyError:
            # No dependencies to cancel; cancel just the job itself.
            self.client.cancel(Future(job_id))

        self.client.unpublish_dataset(job_id)

        # Hacky fix -- simulation processes continue executing EVEN IF the parent task is killed.
        def hacky():
            os.system("pkill -f 'Simulation.out'")

        self.client.run(hacky, nanny=True)

        return True

    def job_status(self, job_id: str) -> JobStatus:
        # If the job is complete (results exist as a dataset or in the vault).
        if self.job_complete(job_id):
            status = JobStatus()
            status.status_id = JobState.DONE
            status.status_text = "The job is complete."
            status.has_failed = False
            status.is_done = True

            return status

        # If the job doesn't exist.
        if not self.job_exists(job_id):
            status = JobStatus()
            status.status_id = JobState.DOES_NOT_EXIST
            status.status_text = f"A job with job_id: '{job_id}' does not exist."
            status.has_failed = True
            status.is_done = False

            return status

        status_mapping = {
            "released": (JobState.STOPPED, "The job is known but not actively computing or in memory."),
            "waiting": (JobState.WAITING, "The job is waiting for dependencies to arrive in memory."),
            "no-worker": (JobState.WAITING, "The job is waiting for a worker to become available."),
            "processing": (JobState.RUNNING, "The job is running."),
            "memory": (JobState.DONE, "The job is done and is being held in memory."),
            "erred": (JobState.FAILED, "The job has failed."),
            "done": (JobState.DONE, "The job is done and has been cached / stored on disk.")
        }

        # Grab the task state from the scheduler.
        future_status = self.__job_state(job_id)

        status = JobStatus()
        status.status_id = status_mapping[future_status][0]
        status.status_text = status_mapping[future_status][1]
        status.is_done = status.status_id is JobState.DONE
        status.has_failed = status.status_id is JobState.FAILED

        return status

    def job_results(self, job_id: str):
        # The results of this job may exist in the client datasets.
        if job_id in self.client.datasets:
            print("[DEBUG] Getting results from dataset.")
            return self.client.get_dataset(name=job_id).result()

        # If the results are not in the cache, raise an exception.
        if not self.cache_provider.exists(job_id):
            raise Exception(f"Result with ID '{job_id}' does not exist in the cache.")

        return self.cache_provider.get(job_id)

    def job_complete(self, job_id: str) -> bool:
        # Finished job results must exist within the cache for the job to be considered 'done'.
        return self.cache_provider.exists(job_id)

    def job_exists(self, job_id: str) -> bool:
        # Check if the job exists in the scheduler.
        return self.client.run_on_scheduler(self.scheduler_job_exists, job_id=job_id)

    def get_remote_dependency(self, dependency_id: str):
        # Check to see if the job exists as a dataset. Pass a default so a
        # missing dataset returns None instead of raising KeyError.
        dependency = self.client.get_dataset(name=dependency_id, default=None)

        if dependency is not None:
            return dependency

        raise Exception("Something broke, dependency does not exist within distributed memory.")
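# A minimal usage sketch for the delegate above, assuming a DaskDelegateConfig
# wired to a reachable scheduler and a working cache provider (the default
# construction here is hypothetical). It shows the `result://` proxy protocol:
# the second job references the first job's published future by ID instead of
# holding its data directly.

def simulate(n: int) -> list:
    return [i * i for i in range(n)]

def summarize(results: list) -> int:
    return sum(results)

delegate = DaskDelegate(DaskDelegateConfig())  # hypothetical default config

delegate.start_job("sim-1", simulate, 10)
# "result://sim-1" resolves to the dataset published under "sim-1", so
# `summarize` receives the simulation output once it is available.
delegate.start_job("sum-1", summarize, "result://sim-1")

print(delegate.job_status("sum-1").status_text)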
import time

import cloudpickle

# `calculate_chunk`, `n_jobs_to_workers`, and `make_tqdm_iterator` are assumed
# to be available at module level in the surrounding package.


def parallel_calculate_chunks(chunks, features, approximate, training_window,
                              verbose, save_progress, entityset, n_jobs,
                              no_unapproximated_aggs, cutoff_df_time_var,
                              target_time, pass_columns, dask_kwargs=None):
    from distributed import Client, LocalCluster, as_completed
    from dask.base import tokenize

    client = None
    cluster = None
    dask_kwargs = dask_kwargs or {}
    try:
        if 'cluster' in dask_kwargs:
            cluster = dask_kwargs['cluster']
        else:
            diagnostics_port = None
            if 'diagnostics_port' in dask_kwargs:
                diagnostics_port = dask_kwargs['diagnostics_port']
                del dask_kwargs['diagnostics_port']

            workers = n_jobs_to_workers(n_jobs)
            workers = min(workers, len(chunks))
            cluster = LocalCluster(n_workers=workers,
                                   threads_per_worker=1,
                                   diagnostics_port=diagnostics_port,
                                   **dask_kwargs)
            # If the cluster has a Bokeh port, notify the user in case it is an
            # unexpected port number.
            if diagnostics_port is not None:
                if hasattr(cluster, 'scheduler') and cluster.scheduler:
                    info = cluster.scheduler.identity()
                    if 'bokeh' in info['services']:
                        msg = "Dashboard started on port {}"
                        print(msg.format(info['services']['bokeh']))

        client = Client(cluster)

        # Scatter the entityset; denote futures with a leading underscore.
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s" % (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # Pickle the feature definitions and scatter them.
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # Map chunks.
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    finally:
        if 'cluster' not in dask_kwargs and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
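# For context, a hedged sketch of how this code path is typically reached from
# the public featuretools API of the same era (pre-1.0); the scheduler address
# is an assumption. Passing an existing cluster in `dask_kwargs` skips the
# LocalCluster setup above, and the tokenized EntitySet dataset published on
# the cluster is reused across calls instead of being re-scattered.

import featuretools as ft

es = ft.demo.load_mock_customer(return_entityset=True)
features = ft.dfs(entityset=es, target_entity="customers", features_only=True)

fm = ft.calculate_feature_matrix(features,
                                 entityset=es,
                                 n_jobs=2,
                                 dask_kwargs={"cluster": "127.0.0.1:8786"})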
import time

import cudf
from distributed import Client

if __name__ == '__main__':
    client = Client('localhost:8786')
    print(client)

    # Create a simple dataframe.
    gdf = cudf.read_csv('names.csv')
    print(gdf.head())

    client.publish_dataset(names=gdf)

    # Keep the script running.
    while True:
        time.sleep(10)
        print("tick")
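# Consumer-side sketch (run in a separate process while the publisher above is
# active): any client connected to the same scheduler can retrieve the
# published GPU dataframe by name.

from distributed import Client

client = Client('localhost:8786')
gdf = client.get_dataset('names')
print(gdf.head())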
"lat", "lon", "Description", "Status", "mcc", "net", # Hover info ]] # Persist and publish Dask dataframe in memory cell_towers_ddf = cell_towers_ddf.repartition(npartitions=8).persist() # Clear any published datasets for k in client.list_datasets(): client.unpublish_dataset(k) client.publish_dataset(cell_towers_ddf=cell_towers_ddf) data_3857 = dask.compute( [cell_towers_ddf["x_3857"].min(), cell_towers_ddf["y_3857"].min()], [cell_towers_ddf["x_3857"].max(), cell_towers_ddf["y_3857"].max()], ) data_center_3857 = [[ (data_3857[0][0] + data_3857[1][0]) / 2.0, (data_3857[0][1] + data_3857[1][1]) / 2.0, ]] data_4326 = epsg_3857_to_4326(data_3857) data_center_4326 = epsg_3857_to_4326(data_center_3857) client.publish_dataset(data_3857=data_3857) client.publish_dataset(data_4326=data_4326) client.publish_dataset(data_center_3857=data_center_3857)
import xarray as xr
from distributed import Client

from thredds_configuration import file_list_url, data_request, data_folder, thredds_servers, base_url
from dask_configuration import dask_scheduler_url
from thredds_utils import list_thredds_folder, compute_url_to_thredds_server_map, compute_avg_func

file_list = list_thredds_folder(file_list_url)

# Connect to Dask.
client = Client(dask_scheduler_url)

url_list = []
for f in file_list:
    url_list.append(base_url + "/" + data_request + "/" + data_folder + "/" + f
                    + "?time1[0],Temperature_surface[0][0:360][0:719]")

ds_temp_surface = xr.open_mfdataset(url_list)

# Load the distributed cube into the memory of the Dask workers. xarray's own
# persist works because of
# https://github.com/dask/dask/blob/8c080b88c303cd64f41d7c7a7cde4f4f2faa10a9/dask/base.py#L569
# client.persist(ds_temp_surface) does not currently -- work in progress with
# the Dask team to fix; see the discussion at https://github.com/dask/dask/pull/1068
ds_temp_surface = ds_temp_surface.persist()

# Publishing the dataset makes the distributed cube available to all clients.
client.publish_dataset(temp_surface=ds_temp_surface)
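# Consumer-side sketch: another client on the same scheduler can open the
# distributed cube without touching the THREDDS servers again. The variable
# name matches the Temperature_surface subset requested above.

from distributed import Client

client = Client(dask_scheduler_url)
temp_surface = client.get_dataset("temp_surface")
print(temp_surface["Temperature_surface"].mean().compute())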