def publish_dataset_to_cluster():

    census_data_url = 'https://s3.us-east-2.amazonaws.com/rapidsai-data/viz-data/census_data.parquet.tar.gz'
    data_path = "../data/census_data.parquet"
    check_dataset(census_data_url, data_path)

    # Note: The creation of the Dask cluster must happen inside the `__main__` block
    # (or in a function invoked from it); otherwise worker processes that re-import
    # this module would try to spawn clusters of their own.
    cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="0")
    client = Client(cluster)
    print(f"Dask status: {cluster.dashboard_link}")

    # Load the dataset and persist it on the cluster
    def load_and_publish_dataset():
        # cudf DataFrame
        c_df_d = delayed(load_dataset)(data_path).persist()
        # pandas DataFrame
        pd_df_d = delayed(c_df_d.to_pandas)().persist()

        # Unpublish the datasets if a previous run already published them
        for ds_name in ['pd_df_d', 'c_df_d']:
            if ds_name in client.datasets:
                client.unpublish_dataset(ds_name)

        # Publish datasets to the cluster
        client.publish_dataset(pd_df_d=pd_df_d)
        client.publish_dataset(c_df_d=c_df_d)

    load_and_publish_dataset()

    # Precompute field bounds
    c_df_d = client.get_dataset('c_df_d')

    # Register top-level callback that updates plots
    register_update_plots_callback(client)
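
A minimal sketch of the `__main__` entry point that the note above assumes (hypothetical: `app` would be the Dash application defined elsewhere in the project):

# Sketch of the entry point assumed by the note above (illustrative names).
if __name__ == '__main__':
    publish_dataset_to_cluster()                    # create the cluster and publish the datasets once
    app.run_server(debug=False, host='0.0.0.0')     # `app` is a dash.Dash instance defined elsewhere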
Example #2
def publish_dataset_to_cluster():

    data_path = "/home/ajay/new_dev/plotly/census_large/data/census_data_epsg_3857.parquet/*"

    # Note: The creation of the Dask cluster must happen inside the `__main__` block
    # (or in a function invoked from it); otherwise worker processes that re-import
    # this module would try to spawn clusters of their own.
    cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="0")
    client = Client(cluster)
    print(f"Dask status: {cluster.dashboard_link}")

    # Load the dataset and persist it on the cluster
    def load_and_publish_dataset():
        # cudf DataFrame
        c_df_d = delayed(load_dataset)(data_path).persist()
        # pandas DataFrame
        pd_df_d = delayed(c_df_d.to_pandas)().persist()

        # Unpublish the datasets if a previous run already published them
        for ds_name in ['pd_df_d', 'c_df_d']:
            if ds_name in client.datasets:
                client.unpublish_dataset(ds_name)

        # Publish datasets to the cluster
        client.publish_dataset(pd_df_d=pd_df_d)
        client.publish_dataset(c_df_d=c_df_d)

    load_and_publish_dataset()

    # Precompute field bounds
    c_df_d = client.get_dataset('c_df_d')
    
    # Define callback to restart cluster and reload datasets
    @app.callback(
        Output('reset-gpu-complete', 'children'),
        [Input('reset-gpu', 'n_clicks')]
    )
    def restart_cluster(n_clicks):
        if n_clicks:
            print("Restarting LocalCUDACluster")
            client.unpublish_dataset('pd_df_d')
            client.unpublish_dataset('c_df_d')
            client.restart()
            load_and_publish_dataset()

    # Register top-level callback that updates plots
    register_update_plots_callback(client)
Example #3
from distributed import Client
import cudf
import time

if __name__ == '__main__':
    client = Client('localhost:8786')
    print(client)

    # Retrieve the 'names' dataset published to the cluster by another process
    print("Reading the 'names' published dataset from another process")
    gdf = client.get_dataset('names')
    print(gdf.head())
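
For context, the publishing side that this consumer expects could look roughly like the sketch below (hypothetical: the scheduler address and dataset name are taken from the snippet above, the DataFrame contents are made up):

# Hypothetical publisher counterpart to the consumer above (illustrative data).
from distributed import Client
import cudf

if __name__ == '__main__':
    client = Client('localhost:8786')                           # same scheduler as the consumer
    gdf = cudf.DataFrame({'names': ['Alice', 'Bob', 'Carol']})
    client.publish_dataset(names=gdf)                           # register it under the name 'names'
    print("Published 'names' for other clients to read with get_dataset")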
Example #4
class DaskDelegate(Delegate):
    type: str = "dask"

    def __init__(self, delegate_config: DaskDelegateConfig):
        super().__init__(delegate_config)  # initialize the Delegate base class (assumed to accept the config)

        self.delegate_config = delegate_config
        self.cache_provider = self.delegate_config.cache_provider

        # Attempt to load the global Dask client.
        try:
            self.client = get_client()

        except ValueError:  # no global client in this process; create one from the delegate config
            if self.delegate_config.kube_cluster is not None:
                self.client = Client(self.delegate_config.kube_cluster)
                print(self.delegate_config.kube_cluster)

            else:
                self.client = Client(f"{self.delegate_config.dask_cluster_address}:{self.delegate_config.dask_cluster_port}")

        # Setup functions to be run on the schedule.
        def __scheduler_job_exists(dask_scheduler, job_id: str) -> bool:
            return job_id in dask_scheduler.tasks

        def __scheduler_job_state(dask_scheduler, job_id: str) -> TaskState:
            return dask_scheduler.tasks[job_id].state

        self.scheduler_job_exists = __scheduler_job_exists
        self.scheduler_job_state = __scheduler_job_state

    def __job_state(self, job_id: str) -> TaskState:
        return self.client.run_on_scheduler(self.scheduler_job_state, job_id=job_id)

    def connect(self) -> bool:
        # No need to connect.
        return True

    def test_connection(self) -> bool:
        # Shim this out until I figure out a good way to test a Dask and Redis connection.
        return True

    def create_job(self, job_id: str) -> bool:
        # No concept of creating a job.
        return True

    def start_job(self, job_id: str, work: Callable, *args, **kwargs) -> bool:
        if self.job_exists(job_id) or self.job_complete(job_id):
            return False

        # Parse and replace instances of the internal `result://` proxy protocol.
        # This lets callers reference an in-progress or remote job without needing direct access to its result.
        function_args = [
            self.client.get_dataset(arg.replace("result://", ""))
            if isinstance(arg, str) and arg.startswith("result://") else arg
            for arg in args
        ]

        # Create a job to run the desired function.
        job_future: Future = self.client.submit(work, *function_args, **kwargs, key=job_id, pure=False)

        # Start additional cache job which depends on the results of the previous.
        cache_future: Future = self.client.submit(self.cache_provider.put, *[job_id, job_future], pure=False)

        # Publish the job as a dataset to maintain state across requests.
        self.client.publish_dataset(job_future, name=job_id, override=True)
        self.client.publish_dataset(cache_future, override=True)

        return True

    def stop_job(self, job_id: str) -> bool:
        if not self.job_exists(job_id):
            return False

        futures = [Future(job_id)]
        try:
            # Collect the keys of this job's dependencies from the scheduler.
            dependencies = self.client.run_on_scheduler(
                lambda dask_scheduler: [state.key for state in dask_scheduler.tasks[job_id].dependencies]
            )

            # Filter out any weak dependencies. Strong dependencies are suffixed with "/" and the name of the job.
            dependencies = [dependency for dependency in dependencies if dependency.replace(job_id, "").startswith("/")]

            futures.extend(Future(key) for key in dependencies)
        except KeyError:
            # The job has no tracked dependencies; cancel only the job itself.
            pass

        self.client.cancel(futures)
        self.client.unpublish_dataset(job_id)

        # Hacky fix -- Simulation processes continue executing EVEN IF the parent task is killed.
        def hacky():
            os.system("pkill -f 'Simulation.out'")

        self.client.run(hacky, nanny=True)

        return True

    def job_status(self, job_id: str) -> JobStatus:
        # If the job is complete (results exist as a dataset or in the vault).
        if self.job_complete(job_id):
            status = JobStatus()
            status.status_id = JobState.DONE
            status.status_text = "The job is complete."
            status.has_failed = False
            status.is_done = True

            return status

        # If the job doesn't exist.
        if not self.job_exists(job_id):
            status = JobStatus()
            status.status_id = JobState.DOES_NOT_EXIST
            status.status_text = f"A job with job_id: '{job_id}' does not exist."
            status.has_failed = True
            status.is_done = False

            return status

        status_mapping = {
            "released": (JobState.STOPPED, "The job is known but not actively computing or in memory."),
            "waiting": (JobState.WAITING, "The job is waiting for dependencies to arrive in memory."),
            "no-worker": (JobState.WAITING, "The job is waiting for a worker to become available."),
            "processing": (JobState.RUNNING, "The job is running."),
            "memory": (JobState.DONE, "The job is done and is being held in memory."),
            "erred": (JobState.FAILED, "The job has failed."),
            "done": (JobState.DONE, "The job is done and has been cached / stored on disk.")
        }

        # Grab the task state from the scheduler.
        future_status = self.__job_state(job_id)

        status = JobStatus()
        status.status_id = status_mapping[future_status][0]
        status.status_text = status_mapping[future_status][1]

        status.is_done = status.status_id is JobState.DONE
        status.has_failed = status.status_id is JobState.FAILED

        return status

    def job_results(self, job_id: str):
        # The results of this job may exist on the client dataset.
        if job_id in self.client.datasets:
            print("[DEBUG] Getting results from dataset.")
            return self.client.get_dataset(name=job_id).result()

        # If the results are not in the cache, raise an exception.
        if not self.cache_provider.exists(job_id):
            raise Exception(f"Result with ID '{job_id}' does not exist in the cache.")

        return self.cache_provider.get(job_id)

    def job_complete(self, job_id: str) -> bool:
        # A job is considered complete only once its results exist in the cache.
        return self.cache_provider.exists(job_id)

    def job_exists(self, job_id: str) -> bool:
        # Check if the job exists in the scheduler.
        return self.client.run_on_scheduler(self.scheduler_job_exists, job_id=job_id)

    def get_remote_dependency(self, dependency_id: str):
        # Check to see if the dependency has been published as a dataset; get_dataset raises KeyError if not.
        try:
            return self.client.get_dataset(name=dependency_id)
        except KeyError:
            raise Exception("Something broke, dependency does not exist within distributed memory.")
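
The core pattern in `start_job` above (submit work under an explicit key, then publish the resulting Future as a named dataset so a later request can find and await it) can be exercised on its own, roughly as in this minimal sketch (illustrative cluster, job key, and workload):

# Standalone sketch of the publish-a-Future pattern used by start_job (illustrative names).
from distributed import Client, LocalCluster

def work(x):
    return x * 2

if __name__ == '__main__':
    client = Client(LocalCluster(n_workers=1))

    future = client.submit(work, 21, key="job-42", pure=False)     # submit under an explicit key
    client.publish_dataset(future, name="job-42", override=True)   # make it discoverable by name

    # Later, possibly from another Client connected to the same scheduler:
    same_future = client.get_dataset("job-42")
    print(same_future.result())                                     # -> 42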
Example #5
def parallel_calculate_chunks(chunks,
                              features,
                              approximate,
                              training_window,
                              verbose,
                              save_progress,
                              entityset,
                              n_jobs,
                              no_unapproximated_aggs,
                              cutoff_df_time_var,
                              target_time,
                              pass_columns,
                              dask_kwargs=None):
    import time

    import cloudpickle
    from dask.base import tokenize
    from distributed import Client, LocalCluster, as_completed

    dask_kwargs = dask_kwargs or {}
    client = None
    cluster = None
    try:
        if 'cluster' in dask_kwargs:
            cluster = dask_kwargs['cluster']
        else:
            diagnostics_port = None
            if 'diagnostics_port' in dask_kwargs:
                diagnostics_port = dask_kwargs['diagnostics_port']
                del dask_kwargs['diagnostics_port']

            workers = n_jobs_to_workers(n_jobs)
            workers = min(workers, len(chunks))
            cluster = LocalCluster(n_workers=workers,
                                   threads_per_worker=1,
                                   diagnostics_port=diagnostics_port,
                                   **dask_kwargs)
            # If the cluster exposes a Bokeh dashboard port, tell the user which port was actually used
            if diagnostics_port is not None:
                if hasattr(cluster, 'scheduler') and cluster.scheduler:
                    info = cluster.scheduler.identity()
                    if 'bokeh' in info['services']:
                        msg = "Dashboard started on port {}"
                        print(msg.format(info['services']['bokeh']))

        client = Client(cluster)
        # scatter the entityset
        # denote future with leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s" %
                  (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    finally:
        if 'cluster' not in dask_kwargs and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
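
The token-keyed reuse above (hash the EntitySet with `tokenize`, publish the scattered future under that hash, and reuse it via `list_datasets` on later calls) looks like this in isolation (a minimal sketch; the payload and names are illustrative):

# Sketch of token-keyed dataset reuse, as done for the EntitySet above (illustrative payload).
from dask.base import tokenize
from distributed import Client, LocalCluster

if __name__ == '__main__':
    client = Client(LocalCluster(n_workers=1))
    payload = {"rows": list(range(1000))}

    token = "payload-{}".format(tokenize(payload))
    if token in client.list_datasets():
        _payload = client.get_dataset(token)          # reuse the copy already on the cluster
    else:
        _payload = client.scatter([payload])[0]       # scatter once
        client.publish_dataset(_payload, name=token)  # register it under its content hash
    print("dataset name:", token)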
Example #6

from distributed import Client
from dask_configuration import dask_scheduler_url


# Connect to the Dask scheduler and retrieve the published 'temp_surface' dataset
client = Client(dask_scheduler_url)
temp_cube = client.get_dataset('temp_surface')
mean_temp = temp_cube.mean()
print(mean_temp)
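
For completeness, the publishing side that this last snippet assumes could be sketched as follows (hypothetical: the real 'temp_surface' object is not shown here, so a random dask array stands in for it):

# Hypothetical publisher of the 'temp_surface' dataset consumed above (illustrative data).
import dask.array as da
from distributed import Client
from dask_configuration import dask_scheduler_url

if __name__ == '__main__':
    client = Client(dask_scheduler_url)
    temp_surface = da.random.random((1000, 1000), chunks=(250, 250))
    temp_surface = temp_surface.persist()                             # keep the chunks on the workers
    client.publish_dataset(temp_surface=temp_surface, override=True)  # register under the name used above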