def done_dask_callback(
    dask_future: distributed.Future,
    task_to_future_map: Dict[str, distributed.Future],
    user_callback: UserCompleteCB,
    main_loop: asyncio.AbstractEventLoop,
):
    """Convert a completed dask future into a ``TaskStateEvent`` and dispatch it.

    The future's outcome is mapped as follows:

    * status ``"error"``  -> ``RunningState.FAILED``; ``msg`` is the JSON-dumped
      formatted exception/traceback fetched from the future.
    * cancelled           -> ``RunningState.ABORTED``.
    * anything else       -> ``RunningState.SUCCESS``; ``msg`` is
      ``task_result.json()``.
    * ``distributed.TimeoutError`` while fetching any of the above ->
      ``RunningState.FAILED`` with a timeout message.

    Args:
        dask_future: the future that completed; its ``key`` is used as job id.
        task_to_future_map: mapping job id -> future; the entry of this job is
            removed here so that no handle to the future is kept.
        user_callback: called with the event; the returned coroutine is
            scheduled on ``main_loop``.
        main_loop: event loop on which the user callback coroutine is run.
    """
    # NOTE: BEWARE we are called in a separate thread!!
    job_id = dask_future.key
    event_data: Optional[TaskStateEvent] = None
    logger.debug("task '%s' completed with status %s", job_id, dask_future.status)
    try:
        if dask_future.status == "error":
            task_exception = dask_future.exception(timeout=_DASK_FUTURE_TIMEOUT_S)
            task_traceback = dask_future.traceback(timeout=_DASK_FUTURE_TIMEOUT_S)
            event_data = TaskStateEvent(
                job_id=job_id,
                state=RunningState.FAILED,
                msg=json_dumps(
                    traceback.format_exception(
                        type(task_exception),
                        value=task_exception,
                        tb=task_traceback,
                    )
                ),
            )
        elif dask_future.cancelled():
            event_data = TaskStateEvent(job_id=job_id, state=RunningState.ABORTED)
        else:
            task_result = cast(
                TaskOutputData, dask_future.result(timeout=_DASK_FUTURE_TIMEOUT_S)
            )
            assert task_result  # nosec
            event_data = TaskStateEvent(
                job_id=job_id,
                state=RunningState.SUCCESS,
                msg=task_result.json(),
            )
    except distributed.TimeoutError:
        event_data = TaskStateEvent(
            job_id=job_id,
            state=RunningState.FAILED,
            msg=f"Timeout error getting results of '{job_id}'",
        )
        logger.error(
            "fetching result of '%s' timed-out, please check",
            job_id,
            exc_info=True,
        )
    finally:
        # remove the future from the dict to remove any handle to the future,
        # so the worker can free the memory. A default is passed so that a
        # missing entry cannot raise here and prevent the dispatch below.
        task_to_future_map.pop(job_id, None)
        if event_data is None:
            # Only reachable when an unexpected exception is propagating out of
            # the ``try`` block: do not mask it with an AssertionError and do
            # not hand ``None`` to the user callback.
            logger.error(
                "no state could be computed for task '%s', nothing dispatched",
                job_id,
            )
        else:
            logger.debug("dispatching callback to finish task '%s'", job_id)
            try:
                # we are not in the loop's thread: schedule thread-safely
                asyncio.run_coroutine_threadsafe(user_callback(event_data), main_loop)
            except Exception:  # pylint: disable=broad-except
                logger.exception(
                    "Unexpected issue while transmitting state to main thread"
                )
def results(owner, app_name, job_id):
    """Return the outcome of job ``job_id`` for the app ``owner``/``app_name``.

    The cluster type reported by ``get_cluster_type`` selects the backend:

    * ``"single-core"``: the job is looked up through ``AsyncResult``.
    * ``"dask"``: the job is looked up as a ``Future`` on the scheduler at
      ``dask_scheduler_address(owner, app_name)``.
    * anything else: a JSON error body with status code 404.

    Returns:
        * the job result when it finished successfully (JSON-dumped for
          ``"single-core"``, returned as-is for ``"dask"``),
        * a JSON document ``{"status": "WORKER_FAILURE", "traceback": ...}``
          when the job failed (or, for dask, was cancelled),
        * ``make_response("not ready", 202)`` while the job is not done,
        * ``(json_body, 404)`` for an unknown cluster type.
    """
    # Local import so this function does not depend on a module-level import.
    import traceback

    cluster_type = get_cluster_type(owner, app_name)
    if cluster_type == "single-core":
        async_result = AsyncResult(job_id)
        if async_result.ready() and async_result.successful():
            return json.dumps(async_result.result)
        elif async_result.failed():
            print("traceback", async_result.traceback)
            return json.dumps({
                "status": "WORKER_FAILURE",
                "traceback": async_result.traceback
            })
        else:
            return make_response("not ready", 202)
    elif cluster_type == "dask":
        addr = dask_scheduler_address(owner, app_name)
        with Client(addr) as client:
            fut = Future(job_id, client=client)
            if not fut.done():
                return make_response("not ready", 202)
            # Only a "finished" future has a result; a done future in any
            # other state ("error", "cancelled") is reported as a failure
            # instead of letting fut.result() raise.
            if fut.status == "finished":
                return fut.result()
            task_tb = fut.traceback()
            # A traceback object is not JSON serialisable: render it to text.
            # None (no traceback available) and str values pass through as-is.
            if task_tb is not None and not isinstance(task_tb, str):
                task_tb = "".join(traceback.format_tb(task_tb))
            return json.dumps({
                "status": "WORKER_FAILURE",
                "traceback": task_tb
            })
    else:
        return json.dumps({"error": "model does not exist."}), 404