def stop_job(self, job_id: str) -> bool:
    if not self.job_exists(job_id):
        return False
    futures = [Future(job_id)]
    try:
        # Iterate through the dependencies of this job.
        dependencies = self.client.run_on_scheduler(
            lambda dask_scheduler: [state.key for state in dask_scheduler.tasks[job_id].dependencies])
        # Filter out any weak dependencies. Strong dependencies are suffixed with "/" and the name of the job.
        dependencies = [dependency for dependency in dependencies
                        if dependency.replace(job_id, "").startswith("/")]
        futures.extend(Future(key) for key in dependencies)
    except KeyError:
        # Do nothing if the job has no dependencies on the scheduler.
        pass
    # Cancel the job and its strong dependencies, then drop the published result.
    self.client.cancel(futures)
    self.client.unpublish_dataset(job_id)
    # Hacky fix -- Simulation processes continue executing EVEN IF the parent task is killed.
    def hacky():
        os.system("pkill -f 'Simulation.out'")
    self.client.run(hacky, nanny=True)
    return True
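# The scheduler-side dependency lookup used in stop_job can be exercised on its
# own. A minimal sketch, assuming a running distributed.Client; the helper name
# dependency_keys is illustrative and not part of the original module.
from distributed import Client


def dependency_keys(client: Client, job_id: str) -> list:
    # Ask the scheduler for the keys this task depends on; an unknown task
    # yields an empty list instead of raising KeyError.
    def lookup(dask_scheduler):
        task = dask_scheduler.tasks.get(job_id)
        return [dep.key for dep in task.dependencies] if task else []
    return client.run_on_scheduler(lookup)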
def query_results(owner, app_name, job_id):
    cluster_type = get_cluster_type(owner, app_name)
    if cluster_type == "single-core":
        async_result = AsyncResult(job_id)
        print("celery result", async_result.state)
        if async_result.ready() and async_result.successful():
            return "YES"
        elif async_result.failed():
            return "FAIL"
        else:
            return "NO"
    elif cluster_type == "dask":
        addr = dask_scheduler_address(owner, app_name)
        with Client(addr) as client:
            fut = Future(job_id, client=client)
            print("dask result", fut.status)
            if fut.done() and fut.status != "error":
                return "YES"
            elif fut.done() and fut.status in ("error", "cancelled"):
                return "FAIL"
            else:
                return "NO"
    else:
        return json.dumps({"error": "model does not exist."}), 404
def results(owner, app_name, job_id):
    cluster_type = get_cluster_type(owner, app_name)
    if cluster_type == "single-core":
        async_result = AsyncResult(job_id)
        if async_result.ready() and async_result.successful():
            return json.dumps(async_result.result)
        elif async_result.failed():
            print("traceback", async_result.traceback)
            return json.dumps({
                "status": "WORKER_FAILURE",
                "traceback": async_result.traceback
            })
        else:
            return make_response("not ready", 202)
    elif cluster_type == "dask":
        addr = dask_scheduler_address(owner, app_name)
        with Client(addr) as client:
            fut = Future(job_id, client=client)
            if fut.done() and fut.status != "error":
                return fut.result()
            elif fut.done() and fut.status in ("error", "cancelled"):
                return json.dumps({
                    "status": "WORKER_FAILURE",
                    "traceback": fut.traceback()
                })
            else:
                return make_response("not ready", 202)
    else:
        return json.dumps({"error": "model does not exist."}), 404
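# query_results and results above share the same Future status checks. The sketch
# below isolates that pattern against a bare scheduler address; the function name
# and return convention are illustrative, not part of the original service.
from distributed import Client, Future


def peek_job(job_id: str, scheduler_address: str):
    # Return the job's result if it finished cleanly, otherwise its status
    # string ("pending", "error", "cancelled", ...).
    with Client(scheduler_address) as client:
        fut = Future(job_id, client=client)
        if fut.done() and fut.status == "finished":
            return fut.result()
        return fut.status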
def cancel(self, scheduler_address=None):
    if not scheduler_address:
        scheduler_address = self.scheduler_address
    client = Client(scheduler_address)
    f = Future(self.name + "_graph", client=client)
    f.cancel(force=True)
    client.close()
def _cancel(self, scheduler_address: Optional[str] = None) -> None:
    if not scheduler_address:
        scheduler_address = self.scheduler_address
    client = Client(scheduler_address)
    f = Future(self.name + " graph", client=client)
    f.cancel(force=True)
    client.close()
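# cancel and _cancel above both rebuild a Future from a known key and force-cancel
# it. A standalone sketch of that cancel-by-key pattern, assuming only a scheduler
# address and a task key (cancel_by_key is an illustrative name):
from distributed import Client, Future


def cancel_by_key(key: str, scheduler_address: str) -> None:
    client = Client(scheduler_address)
    try:
        # force=True cancels the task even if other clients still hold the key.
        Future(key, client=client).cancel(force=True)
    finally:
        client.close()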
def test_job_cache(self):
    """ Test the cache behavior for completed jobs. """
    model = MichaelisMenten()
    model.name = "test_job_cache_MichaelisMenten"
    job = RemoteSimulation.on(self.compute_server).with_model(model).run()
    # Wait for the job to complete and get the results.
    results = job.resolve().to_json()
    # Assert that results can still be resolved even if the job was cancelled.
    Future(job.result_id, client=self.client).cancel()
    assert (results == job.resolve().to_json())
    # Assert that job results can be resolved from the cache.
    self.client.unpublish_dataset(job.result_id)
    assert (results == job.resolve().to_json())
def test_run_model_consistency(self):
    """ Test to ensure the API and Cluster return consistent job statuses. """
    model = MichaelisMenten()
    model.name = "test_run_model_consistency_MichaelisMenten"
    job = RemoteSimulation.on(self.compute_server).with_model(model).run()
    # Assert that the job hasn't failed yet, and that it does exist on the scheduler.
    assert (job.status().status_id != JobState.FAILED)
    # Wait for the job to complete.
    job.wait()
    # Assert that the API reports status_id as JobState.DONE.
    assert (job.status().status_id == JobState.DONE)
    # Assert that results can be retrieved from the API. This will raise an exception if the request fails.
    Future(job.result_id, client=self.client).result(timeout=5)
def parallel_calculate_chunks(chunks, feature_set, approximate, training_window,
                              verbose, save_progress, entityset, n_jobs,
                              no_unapproximated_aggs, cutoff_df_time_var,
                              target_time, pass_columns, dask_kwargs=None):
    from distributed import as_completed, Future
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        client, cluster = create_client_and_cluster(
            n_jobs=n_jobs,
            num_tasks=len(chunks),
            dask_kwargs=dask_kwargs,
            entityset_size=entityset.__sizeof__())
        # scatter the entityset
        # denote future with leading underscore
        if verbose:
            start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            if verbose:
                msg = "Using EntitySet persisted on the cluster as dataset {}"
                print(msg.format(es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(feature_set)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        num_scattered_workers = len(
            client.who_has([Future(es_token)]).get(es_token, []))
        num_workers = len(client.scheduler_info()['workers'].values())
        scatter_warning(num_scattered_workers, num_workers)
        if verbose:
            end = time.time()
            scatter_time = round(end - start)
            scatter_string = "EntitySet scattered to {} workers in {} seconds"
            print(scatter_string.format(num_scattered_workers, scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             feature_set=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        # Guard against dask_kwargs being None before checking for a user-supplied cluster.
        if (dask_kwargs is None or 'cluster' not in dask_kwargs) and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
def parallel_calculate_chunks(cutoff_time, chunk_size, feature_set, approximate,
                              training_window, save_progress, entityset, n_jobs,
                              no_unapproximated_aggs, cutoff_df_time_var,
                              target_time, pass_columns, progress_bar,
                              dask_kwargs=None, progress_callback=None):
    from distributed import as_completed, Future
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        client, cluster = create_client_and_cluster(
            n_jobs=n_jobs,
            dask_kwargs=dask_kwargs,
            entityset_size=entityset.__sizeof__())
        # scatter the entityset
        # denote future with leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            msg = "Using EntitySet persisted on the cluster as dataset {}"
            progress_bar.write(msg.format(es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(feature_set)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        num_scattered_workers = len(
            client.who_has([Future(es_token)]).get(es_token, []))
        num_workers = len(client.scheduler_info()['workers'].values())

        chunks = cutoff_time.groupby(cutoff_df_time_var)
        if not chunk_size:
            chunk_size = _handle_chunk_size(1.0 / num_workers, cutoff_time.shape[0])
        chunks = _chunk_dataframe_groups(chunks, chunk_size)
        chunks = [df for _, df in chunks]

        if len(chunks) < num_workers:
            chunk_warning = "Fewer chunks ({}) than workers ({}), consider reducing the chunk size"
            warning_string = chunk_warning.format(len(chunks), num_workers)
            progress_bar.write(warning_string)

        scatter_warning(num_scattered_workers, num_workers)
        end = time.time()
        scatter_time = round(end - start)

        # if enabled, reset timer after scatter for better time remaining estimates
        if not progress_bar.disable:
            progress_bar.reset()

        scatter_string = "EntitySet scattered to {} workers in {} seconds"
        progress_bar.write(scatter_string.format(num_scattered_workers, scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             feature_set=_saved_features,
                             chunk_size=None,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns,
                             progress_bar=None,
                             progress_callback=progress_callback)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                previous_progress = progress_bar.n
                progress_bar.update(result.shape[0])
                if progress_callback is not None:
                    update, progress_percent, time_elapsed = update_progress_callback_parameters(
                        progress_bar, previous_progress)
                    progress_callback(update, progress_percent, time_elapsed)
    except Exception:
        raise
    finally:
        # Guard against dask_kwargs being None before checking for a user-supplied cluster.
        if (dask_kwargs is None or 'cluster' not in dask_kwargs) and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    feature_matrix = pd.concat(feature_matrix)
    return feature_matrix
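# Both versions of parallel_calculate_chunks reuse a previously scattered EntitySet
# by publishing it under a token derived from its contents. A minimal sketch of that
# publish-or-reuse pattern, assuming an existing client; scatter_once and the
# "EntitySet" prefix are illustrative names.
from dask.base import tokenize
from distributed import Client


def scatter_once(client: Client, obj, prefix="EntitySet"):
    # Name the dataset after the object's content hash so repeated calls against
    # the same cluster find and reuse the copy that is already published.
    token = "{}-{}".format(prefix, tokenize(obj))
    if token in client.list_datasets():
        return client.get_dataset(token)
    future = client.scatter([obj])[0]
    client.publish_dataset(**{token: future})
    return future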