Code example #1
    def stop_job(self, job_id: str) -> bool:
        if not self.job_exists(job_id):
            return False

        # Cancel the job itself along with any strong dependencies.
        futures = [Future(job_id)]
        try:
            # Collect the keys of this job's dependencies from the scheduler.
            dependencies = self.client.run_on_scheduler(
                lambda dask_scheduler: [state.key for state in dask_scheduler.tasks[job_id].dependencies])

            # Filter out any weak dependencies. Strong dependencies are keyed with the job id followed by "/".
            dependencies = [dependency for dependency in dependencies if dependency.replace(job_id, "").startswith("/")]

            futures.extend(Future(key) for key in dependencies)
        except KeyError:
            # The job has no dependencies on the scheduler; cancel only the job itself.
            pass

        self.client.cancel(futures)
        self.client.unpublish_dataset(job_id)

        # Hacky fix -- Simulation processes continue executing EVEN IF the parent task is killed.
        def hacky():
            os.system("pkill -f 'Simulation.out'")

        self.client.run(hacky, nanny=True)

        return True
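The stop_job method above assumes that jobs are published on the cluster as named datasets and that a Future can be rebuilt purely from its key. A minimal sketch of that convention, using a placeholder scheduler address, key, and task (all assumptions for illustration, not part of the original project):

from distributed import Client, Future

def inc(x):
    return x + 1

# Placeholder address; any reachable scheduler works.
client = Client("tcp://127.0.0.1:8786")

# Submit under a known key and publish it so the result outlives this client.
fut = client.submit(inc, 41, key="job-1234")
client.publish_dataset(**{"job-1234": fut})

# Any client that knows the key can rebuild a handle to the same task,
# which is how stop_job (and the handlers below) locate a job.
handle = Future("job-1234", client=client)
print(handle.status)    # "pending", "finished", "error", or "cancelled"
print(handle.result())  # 42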
Code example #2
def query_results(owner, app_name, job_id):

    cluster_type = get_cluster_type(owner, app_name)
    if cluster_type == "single-core":
        async_result = AsyncResult(job_id)
        print("celery result", async_result.state)
        if async_result.ready() and async_result.successful():
            return "YES"
        elif async_result.failed():
            return "FAIL"
        else:
            return "NO"
    elif cluster_type == "dask":
        addr = dask_scheduler_address(owner, app_name)
        with Client(addr) as client:
            fut = Future(job_id, client=client)
            print("dask result", fut.status)
            if fut.done() and fut.status == "finished":
                return "YES"
            elif fut.done() and fut.status in ("error", "cancelled"):
                return "FAIL"
            else:
                return "NO"
    else:
        return json.dumps({"error": "model does not exist."}), 404
Code example #3
def results(owner, app_name, job_id):
    cluster_type = get_cluster_type(owner, app_name)
    if cluster_type == "single-core":
        async_result = AsyncResult(job_id)
        if async_result.ready() and async_result.successful():
            return json.dumps(async_result.result)
        elif async_result.failed():
            print("traceback", async_result.traceback)
            return json.dumps({
                "status": "WORKER_FAILURE",
                "traceback": async_result.traceback
            })
        else:
            return make_response("not ready", 202)
    elif cluster_type == "dask":
        addr = dask_scheduler_address(owner, app_name)
        with Client(addr) as client:
            fut = Future(job_id, client=client)
            if fut.done() and fut.status == "finished":
                return fut.result()
            elif fut.done() and fut.status in ("error", "cancelled"):
                return json.dumps({
                    "status": "WORKER_FAILURE",
                    "traceback": fut.traceback()
                })
            else:
                return make_response("not ready", 202)
    else:
        return json.dumps({"error": "model does not exist."}), 404
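Both handlers above branch on Future.status after Future.done(). A quick sketch of what those fields look like for a failing task (the boom function and the in-process cluster are illustrative assumptions):

from distributed import Client

def boom():
    raise ValueError("simulated failure")

client = Client(processes=False)  # small in-process cluster, just for the sketch
fut = client.submit(boom)

try:
    fut.result(timeout=10)
except Exception:
    # A failed task reports status "error"; traceback() returns the remote
    # traceback, which is what the results() handler above serialises.
    print(fut.done(), fut.status)  # True error
    print(fut.exception())         # ValueError('simulated failure')

client.close()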
Code example #4
    def cancel(self, scheduler_address=None):
        if not scheduler_address:
            scheduler_address = self.scheduler_address
        client = Client(scheduler_address)
        f = Future(self.name + "_graph", client=client)
        f.cancel(force=True)
        client.close()
Code example #5
File: registry.py Project: bing9/grizly
    def _cancel(self, scheduler_address: Optional[str] = None) -> None:
        if not scheduler_address:
            scheduler_address = self.scheduler_address
        client = Client(scheduler_address)
        f = Future(self.name + " graph", client=client)
        f.cancel(force=True)
        client.close()
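Examples #4 and #5 rebuild the Future to cancel from a naming convention (the flow name plus a suffix) rather than from a stored handle. A minimal hedged sketch of the same cancel-by-key pattern, with the scheduler address and key as placeholders:

from distributed import Client, Future

def cancel_graph(scheduler_address, graph_key):
    # force=True cancels the task even if other clients still want its result.
    with Client(scheduler_address) as client:
        Future(graph_key, client=client).cancel(force=True)

cancel_graph("tcp://127.0.0.1:8786", "my_flow_graph")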
Code example #6
    def test_job_cache(self):
        """ Test the cache behavior for completed jobs. """

        model = MichaelisMenten()
        model.name = "test_job_cache_MichaelisMenten"

        job = RemoteSimulation.on(self.compute_server).with_model(model).run()

        # Wait for the job to complete and get the results.
        results = job.resolve().to_json()

        # Assert that results can still be resolved even if the job was cancelled.
        Future(job.result_id, client=self.client).cancel()
        assert (results == job.resolve().to_json())

        # Assert that job results can be resolved from the cache.
        self.client.unpublish_dataset(job.result_id)
        assert (results == job.resolve().to_json())
Code example #7
    def test_run_model_consistency(self):
        """ Test to ensure the API and Cluster return consistent job statuses. """

        model = MichaelisMenten()
        model.name = "test_run_model_consistency_MichaelisMenten"

        job = RemoteSimulation.on(self.compute_server).with_model(model).run()

        # Assert that the job hasn't failed yet.
        assert (job.status().status_id != JobState.FAILED)

        # Wait for the job to complete.
        job.wait()

        # Assert that the API reports status_id as JobState.DONE.
        assert (job.status().status_id == JobState.DONE)

        # Assert that results can be retrieved from the API. This will raise an exception if the request fails.
        Future(job.result_id, client=self.client).result(timeout=5)
Code example #8
def parallel_calculate_chunks(chunks,
                              feature_set,
                              approximate,
                              training_window,
                              verbose,
                              save_progress,
                              entityset,
                              n_jobs,
                              no_unapproximated_aggs,
                              cutoff_df_time_var,
                              target_time,
                              pass_columns,
                              dask_kwargs=None):
    from distributed import as_completed, Future
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        client, cluster = create_client_and_cluster(
            n_jobs=n_jobs,
            num_tasks=len(chunks),
            dask_kwargs=dask_kwargs,
            entityset_size=entityset.__sizeof__())
        # scatter the entityset
        # denote future with leading underscore
        if verbose:
            start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            if verbose:
                msg = "Using EntitySet persisted on the cluster as dataset {}"
                print(msg.format(es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # serialize the feature set with cloudpickle and scatter it
        pickled_feats = cloudpickle.dumps(feature_set)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        num_scattered_workers = len(
            client.who_has([Future(es_token)]).get(es_token, []))
        num_workers = len(client.scheduler_info()['workers'].values())

        scatter_warning(num_scattered_workers, num_workers)
        if verbose:
            end = time.time()
            scatter_time = round(end - start)
            scatter_string = "EntitySet scattered to {} workers in {} seconds"
            print(scatter_string.format(num_scattered_workers, scatter_time))
        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             feature_set=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        if (dask_kwargs is None or 'cluster' not in dask_kwargs) and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
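The who_has call above counts how many workers hold the scattered EntitySet by rebuilding a Future from the tokenized dataset name. A stripped-down sketch of the same scatter/publish/who_has pattern with a plain list (a local cluster is assumed, and the key is read off the returned future instead of being recomputed):

from distributed import Client

client = Client(processes=False)

# Scatter one value to every worker and publish it under its own key.
[_data] = client.scatter([list(range(1000))], broadcast=True)
client.publish_dataset(**{_data.key: _data})

# who_has maps each key to the workers currently holding a copy of it.
holders = client.who_has([_data]).get(_data.key, [])
workers = client.scheduler_info()["workers"]
print("replicated to {} of {} workers".format(len(holders), len(workers)))

client.close()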
Code example #9
def parallel_calculate_chunks(cutoff_time,
                              chunk_size,
                              feature_set,
                              approximate,
                              training_window,
                              save_progress,
                              entityset,
                              n_jobs,
                              no_unapproximated_aggs,
                              cutoff_df_time_var,
                              target_time,
                              pass_columns,
                              progress_bar,
                              dask_kwargs=None,
                              progress_callback=None):
    from distributed import as_completed, Future
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        client, cluster = create_client_and_cluster(
            n_jobs=n_jobs,
            dask_kwargs=dask_kwargs,
            entityset_size=entityset.__sizeof__())
        # scatter the entityset
        # denote future with leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            msg = "Using EntitySet persisted on the cluster as dataset {}"
            progress_bar.write(msg.format(es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # serialize the feature set with cloudpickle and scatter it
        pickled_feats = cloudpickle.dumps(feature_set)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        num_scattered_workers = len(
            client.who_has([Future(es_token)]).get(es_token, []))
        num_workers = len(client.scheduler_info()['workers'].values())

        chunks = cutoff_time.groupby(cutoff_df_time_var)

        if not chunk_size:
            chunk_size = _handle_chunk_size(1.0 / num_workers,
                                            cutoff_time.shape[0])

        chunks = _chunk_dataframe_groups(chunks, chunk_size)

        chunks = [df for _, df in chunks]

        if len(chunks) < num_workers:
            chunk_warning = "Fewer chunks ({}) than workers ({}); consider reducing the chunk size"
            warning_string = chunk_warning.format(len(chunks), num_workers)
            progress_bar.write(warning_string)

        scatter_warning(num_scattered_workers, num_workers)
        end = time.time()
        scatter_time = round(end - start)

        # if enabled, reset timer after scatter for better time remaining estimates
        if not progress_bar.disable:
            progress_bar.reset()

        scatter_string = "EntitySet scattered to {} workers in {} seconds"
        progress_bar.write(
            scatter_string.format(num_scattered_workers, scatter_time))
        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             feature_set=_saved_features,
                             chunk_size=None,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns,
                             progress_bar=None,
                             progress_callback=progress_callback)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                previous_progress = progress_bar.n
                progress_bar.update(result.shape[0])
                if progress_callback is not None:
                    update, progress_percent, time_elapsed = update_progress_callback_parameters(
                        progress_bar, previous_progress)
                    progress_callback(update, progress_percent, time_elapsed)

    except Exception:
        raise
    finally:
        if (dask_kwargs is None or 'cluster' not in dask_kwargs) and cluster is not None:
            cluster.close()

        if client is not None:
            client.close()

    feature_matrix = pd.concat(feature_matrix)

    return feature_matrix