def _register_worker(worker_tag: str) -> int:
    """
    Registers the worker with the DB and returns its unique id
    """
    with sql.session_scope() as session:
        record = WorkerRecord(last_heartbeat=datetime.now(), tag=worker_tag)
        session.add(record)
        session.flush()
        return int(record.id_)

def _set_worker_task_to_none(worker_id: int):
    """
    Setting a worker task to None means the worker is no longer considered to be
    working on any task. This has no effect on the worker behaviour; its purpose is
    only to stop cleanups from resetting a task if the worker is considered
    non-responsive (dead)
    """
    with sql.session_scope() as session:
        worker = session.query(WorkerRecord).get(worker_id)
        worker.working_on_task_id = None
        session.merge(worker)

def get_task_result(task_id: int) -> Any:
    """
    Returns the result of running the task.

    :param task_id: unique identifier for the task, you get this number when you
        call submit().
    :return: either the return value of the function submitted (if status is DONE)
        or the exception raised (if status is ERROR)
    """
    with session_scope() as session:
        rec = session.query(TaskRecord).get(task_id)
        results = rec.results_dill
        # Results are stored as str(dill.dumps(...)), so eval() recovers the bytes
        # before dill can deserialise them
        return dill.loads(eval(results)) if results else None

def get_task_status(task_id: int) -> TaskStatus:
    """
    A task status represents what part of the execution journey it is in, possible
    journeys:

    PENDING -> RUNNING -> DONE (use get_task_result() to get the result)
    PENDING -> RUNNING -> ERROR (use get_task_result() to get the error)
    PENDING -> RUNNING -> PENDING -> RUNNING -> TIMEOUT (task takes too long or hangs)
    """
    with session_scope() as session:
        return TaskStatus(session.query(TaskRecord).get(task_id).status)

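# Illustrative sketch (not part of the original module): a small helper that polls
# get_task_status() until the task leaves PENDING/RUNNING, then fetches the outcome
# with get_task_result(). The helper name and the poll interval are hypothetical;
# the terminal statuses are the ones documented in the journeys above.
def _example_wait_for_task(task_id: int, poll_interval: float = 1.0) -> Any:
    import time

    terminal = {TaskStatus.DONE, TaskStatus.ERROR, TaskStatus.TIMEOUT}
    while get_task_status(task_id) not in terminal:
        time.sleep(poll_interval)
    return get_task_result(task_id)
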
def _hearbeat(worker_id: int) -> bool:
    """
    Interacts with the Worker table, checking if worker needs to die, and updating
    the worker heartbeat to signal it is still alive

    :returns: True if all is normal, False if execution should stop
    """
    with sql.session_scope() as session:
        record = session.query(WorkerRecord).get(worker_id)
        if record is None:
            return False
        record.last_heartbeat = datetime.now()  # TODO: only if diff > 3 sec
        session.merge(record)
        return True

def submit(
    func: Callable[..., Any],
    kwargs: Dict[str, Any],
    creator: int,
    requires_tag: str = "",
    n_retries_if_worker_hangs: int = 1,
) -> int:
    """
    Sends the function to be executed remotely, with the given kwargs

    :param func: the function to run
    :param kwargs: the keyword arguments to pass to the function
    :param creator: a unique identifier for the creator of this task, to ease later
        queries such as 'get tasks for this creator'
    :param requires_tag: if specified, only workers started with this tag will be
        allowed to run this task. Defaults to '' (any worker)
    :param n_retries_if_worker_hangs: how many times this task should be retried if
        it makes a worker hang. queueless reassigns tasks from non-responsive workers
        to new workers. If a task takes too long, or has resource problems, it may be
        its fault that the worker executing it died. This number limits the chances a
        task has to complete before it is marked as TIMEOUT
    :return: a unique identifier for the task, which can later be used to query its
        status or get the results
    """
    func_str = str(dill.dumps(func))
    kwargs_str = str(dill.dumps(kwargs))
    status = TaskStatus.PENDING.value
    rec = TaskRecord(
        creator=creator,
        owner=NO_OWNER,
        status=status,
        function_dill=func_str,
        kwargs_dill=kwargs_str,
        results_dill="",
        retries=n_retries_if_worker_hangs,
        last_updated=datetime.now(),
        requires_tag=requires_tag,
    )
    with session_scope() as session:
        session.add(rec)
        session.flush()
        task_id = rec.id_
    return task_id

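# Illustrative usage sketch (not part of the original module): submitting a task.
# The example function, kwargs, creator id and tag below are hypothetical values;
# the keyword arguments match the submit() signature above.
#
#     def add(a, b):
#         return a + b
#
#     task_id = submit(
#         add,
#         kwargs={"a": 1, "b": 2},
#         creator=1234,
#         requires_tag="gpu",           # only workers started with tag "gpu" may run it
#         n_retries_if_worker_hangs=2,  # extra chances if the worker running it dies
#     )
#     # task_id can now be passed to get_task_status() / get_task_result()
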
def _claim_task(worker_id: int, worker_tag: str = "") -> Optional[Task]:
    """
    Grabs a PENDING task from the DB, marks it as owned by this worker, and sets it
    to RUNNING

    :param worker_id: identity of the worker that is claiming the task. Both the
        worker record and the task record will be modified, to have references to
        one another
    :param worker_tag: an arbitrary string; if set, only tasks that require this tag
        (or no tag at all) will be claimed
    :return: either the task claimed or None if no suitable (PENDING and correct
        tag) tasks were found
    """
    with sql.session_scope() as session:
        rec = (
            session.query(TaskRecord)
            .with_for_update()
            .filter_by(status=TaskStatus.PENDING.value, owner=NO_OWNER)
            .filter(TaskRecord.requires_tag.in_([worker_tag, ""]))
            .first()
        )
        if rec is None:
            return None
        rec.owner = worker_id
        rec.status = TaskStatus.RUNNING.value
        session.merge(rec)
        func = rec.function_dill
        kwargs = rec.kwargs_dill
        id_ = rec.id_
        creator = rec.creator
        worker = session.query(WorkerRecord).get(worker_id)
        worker.working_on_task_id = id_
        session.merge(worker)
        return Task(
            id_=id_,
            owner=worker_id,
            creator=creator,
            status=TaskStatus.RUNNING.value,
            func=func,
            kwargs=kwargs,
            results="",
        )

def _search_for_dead_workers_and_disown_their_tasks(cleanup_timeout: float) -> None:
    """
    If a worker is dead, the task it was working on should be reset, so another
    worker can pick it up.

    :param cleanup_timeout: a worker will be considered dead if it hasn't updated
        its heartbeat on the DB in the last `cleanup_timeout` seconds
    """
    too_long_ago = datetime.now() - timedelta(seconds=cleanup_timeout)
    with sql.session_scope() as session:
        # Find any workers whose last heartbeat was more than `cleanup_timeout` ago
        dead_workers = (
            session.query(WorkerRecord)
            .with_for_update()
            .filter(WorkerRecord.last_heartbeat < too_long_ago)
            .all()
        )
        for dead_worker in dead_workers:
            task_id = dead_worker.working_on_task_id
            if task_id is not None:
                # Reset the task (if any) that the worker was working on
                orphan_task = session.query(TaskRecord).with_for_update().get(task_id)
                log(
                    f"Worker {dead_worker.id_} has not responded in {cleanup_timeout} "
                    f"seconds. Its task {task_id} will be disowned, and..."
                )
                orphan_task.owner = NO_OWNER
                retries = orphan_task.retries
                if retries == 0:
                    log("...Task status set to TIMEOUT. No more retries left")
                    orphan_task.status = TaskStatus.TIMEOUT.value
                else:
                    log(f"...Task status set to PENDING. {retries} retries left")
                    orphan_task.status = TaskStatus.PENDING.value
                    orphan_task.retries = retries - 1
                session.merge(orphan_task)
                dead_worker.working_on_task_id = None

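# Illustrative sketch (not part of the original module): the cleanup above is meant
# to be invoked periodically by some supervising process; the loop, sleep interval
# and timeout value below are hypothetical choices, not prescribed by this module.
#
#     while True:
#         _search_for_dead_workers_and_disown_their_tasks(cleanup_timeout=30.0)
#         time.sleep(10)
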
def _save_results(
    task_id: int, results: Any, worker_id: int, status: Optional[TaskStatus] = None
) -> None:
    """
    Saves the result for a given task (either the return value of the function
    executed or the exception raised).

    :param task_id: unique identifier for the task. This is created when the task
        is submitted.
    :param results: any object that can be serialised
    :param worker_id: the identifier for the worker attempting to save the results.
        This is needed because only workers that legitimately own a task are allowed
        to save results for it. This is to prevent multiple workers working on the
        same task
    :param status: if set, the task status will also be updated to this
    """
    serialised_results = _serialise(results)
    with sql.session_scope() as session:
        task = session.query(TaskRecord).get(task_id)
        # Only save results if the task is RUNNING and this worker still owns it
        if task.status == TaskStatus.RUNNING.value and task.owner == worker_id:
            log(
                f"Saving result for task {task_id}, with size = {len(serialised_results)}"
            )
            task.results_dill = serialised_results
            if status is not None:
                task.status = status.value
            session.merge(task)
        elif task.status != TaskStatus.RUNNING.value:
            log(f"Task {task_id} not RUNNING. Status={task.status}. Results discarded.")
        elif task.owner != worker_id:
            log(
                f"Worker {worker_id} running task {task_id}, but task owner is: "
                f"{task.owner}. Results discarded."
            )
        worker = session.query(WorkerRecord).get(worker_id)
        worker.working_on_task_id = None
        session.merge(worker)

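# Illustrative sketch (not part of the original module): how the pieces above fit
# together in a worker loop. Register once, then repeatedly heartbeat, claim a task,
# run it, and save either the return value (DONE) or the exception (ERROR). The
# function name and the sleep interval are hypothetical; deserialising with
# dill.loads(eval(...)) mirrors how submit() stores the payloads as
# str(dill.dumps(...)).
def _example_worker_loop(worker_tag: str = "") -> None:
    import time

    worker_id = _register_worker(worker_tag)
    # Keep going for as long as the heartbeat says this worker should stay alive
    while _hearbeat(worker_id):
        task = _claim_task(worker_id, worker_tag)
        if task is None:
            time.sleep(1)
            continue
        func = dill.loads(eval(task.func))
        kwargs = dill.loads(eval(task.kwargs))
        try:
            result = func(**kwargs)
            _save_results(task.id_, result, worker_id, status=TaskStatus.DONE)
        except Exception as error:
            # Any failure is reported back as the task's result, with status ERROR
            _save_results(task.id_, error, worker_id, status=TaskStatus.ERROR)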