Example #1
import os
from typing import Callable

from dask.distributed import Client, Future, get_client

# Delegate, DaskDelegateConfig, JobState, and JobStatus are assumed to be
# provided by the surrounding project; they are not shown in this snippet.


class DaskDelegate(Delegate):
    type: str = "dask"

    def __init__(self, delegate_config: DaskDelegateConfig):
        super().__init__()

        self.delegate_config = delegate_config
        self.cache_provider = self.delegate_config.cache_provider

        # Attempt to load the global Dask client.
        try:
            self.client = get_client()

        except ValueError:
            if self.delegate_config.kube_cluster is not None:
                self.client = Client(self.delegate_config.kube_cluster)
                print(self.delegate_config.kube_cluster)

            else:
                self.client = Client(f"{self.delegate_config.dask_cluster_address}:{self.delegate_config.dask_cluster_port}")

        # Setup functions to be run on the schedule.
        def __scheduler_job_exists(dask_scheduler, job_id: str) -> bool:
            return job_id in dask_scheduler.tasks

        def __scheduler_job_state(dask_scheduler, job_id: str) -> str:
            return dask_scheduler.tasks[job_id].state

        self.scheduler_job_exists = __scheduler_job_exists
        self.scheduler_job_state = __scheduler_job_state

    def __job_state(self, job_id: str) -> str:
        return self.client.run_on_scheduler(self.scheduler_job_state, job_id=job_id)

    def connect(self) -> bool:
        # No need to connect.
        return True

    def test_connection(self) -> bool:
        # Shim this out until I figure out a good way to test a Dask and Redis connection.
        return True

    def create_job(self, job_id: str) -> bool:
        # No concept of creating a job.
        return True

    def start_job(self, job_id: str, work: Callable, *args, **kwargs) -> bool:
        if self.job_exists(job_id) or self.job_complete(job_id):
            return False

        # Parse and replace instances of the internal `result://` proxy protocol.
        # In short, this allows for callees to reference an in-progress or remote job without needing direct access.
        function_args = [
            (self.client.get_dataset(arg.replace("result://", ""))
             if isinstance(arg, str) and arg.startswith("result://")
             else arg)
            for arg in args
        ]

        # Create a job to run the desired function.
        job_future: Future = self.client.submit(work, *function_args, **kwargs, key=job_id, pure=False)

        # Start additional cache job which depends on the results of the previous.
        cache_future: Future = self.client.submit(self.cache_provider.put, job_id, job_future, pure=False)

        # Publish the job as a dataset to maintain state across requests.
        self.client.publish_dataset(job_future, name=job_id, override=True)
        self.client.publish_dataset(cache_future, override=True)

        return True

    def stop_job(self, job_id: str) -> bool:
        if not self.job_exists(job_id):
            return False

        futures = [Future(job_id)]

        try:
            # Collect the keys of this job's dependencies from the scheduler.
            dependencies = self.client.run_on_scheduler(
                lambda dask_scheduler: [state.key for state in dask_scheduler.tasks[job_id].dependencies]
            )

            # Filter out any weak dependencies. Strong dependencies are suffixed with "/" and the name of the job.
            dependencies = [dependency for dependency in dependencies if dependency.replace(job_id, "").startswith("/")]

            futures.extend(Future(key) for key in dependencies)
        except KeyError:
            # The job has no dependencies tracked by the scheduler.
            pass

        self.client.cancel(futures)
        self.client.unpublish_dataset(job_id)

        # Hacky fix -- Simulation processes continue executing EVEN IF the parent task is killed.
        def hacky():
            os.system("pkill -f 'Simulation.out'")

        self.client.run(hacky, nanny=True)

        return True

    def job_status(self, job_id: str) -> JobStatus:
        # If the job is complete (results exist as a dataset or in the vault).
        if self.job_complete(job_id):
            status = JobStatus()
            status.status_id = JobState.DONE
            status.status_text = "The job is complete."
            status.has_failed = False
            status.is_done = True

            return status

        # If the job doesn't exist.
        if not self.job_exists(job_id):
            status = JobStatus()
            status.status_id = JobState.DOES_NOT_EXIST
            status.status_text = f"A job with job_id: '{job_id}' does not exist."
            status.has_failed = True
            status.is_done = False

            return status

        status_mapping = {
            "released": (JobState.STOPPED, "The job is known but not actively computing or in memory."),
            "waiting": (JobState.WAITING, "The job is waiting for dependencies to arrive in memory."),
            "no-worker": (JobState.WAITING, "The job is waiting for a worker to become available."),
            "processing": (JobState.RUNNING, "The job is running."),
            "memory": (JobState.DONE, "The job is done and is being held in memory."),
            "erred": (JobState.FAILED, "The job has failed."),
            "done": (JobState.DONE, "The job is done and has been cached / stored on disk.")
        }

        # Grab the task state from the scheduler.
        future_status = self.__job_state(job_id)

        status = JobStatus()
        status.status_id = status_mapping[future_status][0]
        status.status_text = status_mapping[future_status][1]

        status.is_done = status.status_id is JobState.DONE
        status.has_failed = status.status_id is JobState.FAILED

        return status

    def job_results(self, job_id: str):
        # The results of this job may exist on the client dataset.
        if job_id in self.client.datasets:
            print("[DEBUG] Getting results from dataset.")
            return self.client.get_dataset(name=job_id).result()

        # If the results are not in the cache, raise an exception.
        if not self.cache_provider.exists(job_id):
            raise Exception(f"Result with ID '{job_id}' does not exist in the cache.")

        return self.cache_provider.get(job_id)

    def job_complete(self, job_id: str) -> bool:
        # Finished job results must exist within the cache for it to be considered 'done'.
        return self.cache_provider.exists(job_id)

    def job_exists(self, job_id: str) -> bool:
        # Check if the job exists in the scheduler.
        return self.client.run_on_scheduler(self.scheduler_job_exists, job_id=job_id)

    def get_remote_dependency(self, dependency_id: str):
        # Check to see if the job exists as a dataset.
        dependency = self.client.get_dataset(name=dependency_id)

        if dependency is not None:
            return dependency

        raise Exception("Something broke, dependency does not exist within distributed memory.")
Example #2
def parallel_calculate_chunks(chunks,
                              features,
                              approximate,
                              training_window,
                              verbose,
                              save_progress,
                              entityset,
                              n_jobs,
                              no_unapproximated_aggs,
                              cutoff_df_time_var,
                              target_time,
                              pass_columns,
                              dask_kwargs=None):
    import time

    import cloudpickle
    from dask.base import tokenize
    from distributed import Client, LocalCluster, as_completed

    # calculate_chunk, n_jobs_to_workers, and make_tqdm_iterator are assumed to
    # be provided by the surrounding module; they are not shown in this snippet.

    client = None
    cluster = None
    # Guard against the default of None so the membership tests below are safe.
    dask_kwargs = dask_kwargs or {}
    try:
        if 'cluster' in dask_kwargs:
            cluster = dask_kwargs['cluster']
        else:
            diagnostics_port = None
            if 'diagnostics_port' in dask_kwargs:
                diagnostics_port = dask_kwargs['diagnostics_port']
                del dask_kwargs['diagnostics_port']

            workers = n_jobs_to_workers(n_jobs)
            workers = min(workers, len(chunks))
            cluster = LocalCluster(n_workers=workers,
                                   threads_per_worker=1,
                                   diagnostics_port=diagnostics_port,
                                   **dask_kwargs)
            # If the cluster exposes a bokeh port, tell the user which port the dashboard is on.
            if diagnostics_port is not None:
                if hasattr(cluster, 'scheduler') and cluster.scheduler:
                    info = cluster.scheduler.identity()
                    if 'bokeh' in info['services']:
                        msg = "Dashboard started on port {}"
                        print(msg.format(info['services']['bokeh']))

        client = Client(cluster)
        # scatter the entityset
        # denote future with leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s" %
                  (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    finally:
        if 'cluster' not in dask_kwargs and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
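The scatter-and-publish block above keys the published EntitySet by a deterministic token so that repeated calls can reuse the copy already on the cluster instead of re-shipping it. Here is a standalone sketch of that pattern; the helper name scatter_once and the prefix argument are illustrative, not part of featuretools.

from dask.base import tokenize
from distributed import Client


def scatter_once(client: Client, obj, prefix="dataset"):
    # Derive a dataset name that is stable for identical inputs.
    name = "{}-{}".format(prefix, tokenize(obj))
    if name in client.list_datasets():
        # Reuse the future already published on the scheduler.
        return client.get_dataset(name)
    # Otherwise ship the object to a worker and publish it under the token.
    future = client.scatter([obj])[0]
    client.publish_dataset(**{name: future})
    return future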
Example #3
from distributed import Client
import cudf
import time

if __name__ == '__main__':
    client = Client('localhost:8786')
    print(client)

    # Load a dataframe with cuDF and publish it to the cluster.
    gdf = cudf.read_csv('names.csv')
    print(gdf.head())
    client.publish_dataset(names=gdf)

    while True:
        time.sleep(10)
        print("tick")
Example #4
        "lat",
        "lon",
        "Description",
        "Status",
        "mcc",
        "net",  # Hover info
    ]]

    # Persist and publish Dask dataframe in memory
    cell_towers_ddf = cell_towers_ddf.repartition(npartitions=8).persist()

    # Clear any published datasets
    for k in client.list_datasets():
        client.unpublish_dataset(k)

    client.publish_dataset(cell_towers_ddf=cell_towers_ddf)

    data_3857 = dask.compute(
        [cell_towers_ddf["x_3857"].min(), cell_towers_ddf["y_3857"].min()],
        [cell_towers_ddf["x_3857"].max(), cell_towers_ddf["y_3857"].max()],
    )
    data_center_3857 = [[
        (data_3857[0][0] + data_3857[1][0]) / 2.0,
        (data_3857[0][1] + data_3857[1][1]) / 2.0,
    ]]
    data_4326 = epsg_3857_to_4326(data_3857)
    data_center_4326 = epsg_3857_to_4326(data_center_3857)

    client.publish_dataset(data_3857=data_3857)
    client.publish_dataset(data_4326=data_4326)
    client.publish_dataset(data_center_3857=data_center_3857)
Example #5
from distributed import Client
from thredds_configuration import file_list_url, data_request, data_folder, thredds_servers, base_url
from dask_configuration import dask_scheduler_url
from thredds_utils import list_thredds_folder, compute_url_to_thredds_server_map, compute_avg_func
import xarray as xr

file_list = list_thredds_folder(file_list_url)

# connect to dask
client = Client(dask_scheduler_url)

url_list = []
for f in file_list:
    url_list.append(base_url + "/" + data_request + "/" + data_folder + "/" +
                    f + "?time1[0],Temperature_surface[0][0:360][0:719]")

ds_temp_surface = xr.open_mfdataset(url_list)

ds_temp_surface.persist()
# this works because of https://github.com/dask/dask/blob/8c080b88c303cd64f41d7c7a7cde4f4f2faa10a9/dask/base.py#L569

# client.persist() does not currently work here; a fix is being worked on with the Dask team.
# See the discussion at https://github.com/dask/dask/pull/1068
#client.persist(ds_temp_surface)
# This loads the distributed cube into the memory of the Dask workers.

#ds_temp_surface.publish()
client.publish_dataset(temp_surface=ds_temp_surface)

# The distributed cube is now available to all other clients through the published dataset.
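As the final comment notes, publishing the dataset is what makes the distributed cube reachable from other clients. A sketch of a consumer process, assuming it can import the same dask_scheduler_url used above:

from distributed import Client
from dask_configuration import dask_scheduler_url

consumer = Client(dask_scheduler_url)

# Fetch the dask-backed xarray dataset published as "temp_surface" and compute on it.
cube = consumer.get_dataset("temp_surface")
print(cube["Temperature_surface"].mean().compute())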