def _run_worker(
    db_url: str,
    tick_seconds: float = 1.0,
    worker_tag: str = "",
    cleanup_timeout: float = 300,
) -> None:
    """
    Infinite loop that continuously monitors the DB for tasks, claims tasks,
    executes their code, and saves results. It also periodically performs a
    clean-up (resets orphaned tasks, deletes old results).

    :param db_url: full database url string, including credentials,
        e.g. postgres://postgres:test@localhost:5000/qless
    :param tick_seconds: the worker sleeps for this long between polls to the
        DB for tasks and cleanup attempts
    :param worker_tag: enables this worker to execute tasks with this tag
    :param cleanup_timeout: when performing cleanup, any worker which has not
        reported a heartbeat in `cleanup_timeout` seconds is considered dead,
        and its tasks are reset to PENDING
    """
    sql.startup(db_url)
    me = _register_worker(worker_tag)
    log(f"Worker started. Tag: {worker_tag}. Id = {me}")
    while _hearbeat(me):
        _cleanup(cleanup_timeout)
        sleep(tick_seconds)
        task = _claim_task(me, worker_tag)
        if task is not None:
            _run_task(task, me)
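# The e2e test further down calls `worker.start_local_workers(...)`, which is not
# shown in this section. As an illustration only, here is a minimal sketch of how
# such a helper could spawn `_run_worker` loops in background processes; the
# signature is inferred from the call sites and the real implementation may differ.
from multiprocessing import Process


def _start_local_workers_sketch(
    n_workers: int,
    db_url: str,
    worker_tag: str = "",
    cleanup_timeout: float = 300,
) -> list:
    """Spawn `n_workers` daemon processes, each running the `_run_worker` loop."""
    processes = []
    for _ in range(n_workers):
        proc = Process(
            target=_run_worker,
            kwargs=dict(
                db_url=db_url,
                worker_tag=worker_tag,
                cleanup_timeout=cleanup_timeout,
            ),
            daemon=True,
        )
        proc.start()
        processes.append(proc)
    return processes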
def start_local_postgres_docker_db() -> str:
    # Credentials match the `POSTGRES_PASSWORD` set in the docker command below
    db_url = "postgres://postgres:test@localhost:5000/qless"
    os.system("docker kill pg-test")
    assert not os.system(
        "docker run --rm --name pg-test -e POSTGRES_PASSWORD=test -d -p 5000:5432 postgres:11"
    )
    # Wait until postgres reports that it is ready to accept connections
    while "database system is ready" not in os.popen("docker logs pg-test").read():
        log.log("Waiting for DB to be ready...")
        sleep(0.2)
    sleep(0.5)
    return db_url
def _make_qless_db_if_not_present(db):
    # We need an engine without the `qless` db name
    # NB: Autocommit is required to create databases
    engine = create_engine(db.replace("qless", ""), isolation_level="AUTOCOMMIT")
    databases = engine.execute("SELECT datname FROM pg_database;").fetchall()
    databases = [d[0] for d in databases]
    if "qless" not in databases:
        conn = engine.connect()
        conn.execute("CREATE DATABASE qless")
        conn.close()
        log("Created database 'qless'")
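# Putting the two helpers above together: a typical local bootstrap starts the
# throwaway Postgres container, makes sure the `qless` database exists, and then
# points the client/workers at it. This is an illustrative sequence based on the
# functions shown in this section, not a verbatim copy of the repo's test runner.
def _bootstrap_local_db_sketch() -> str:
    db_url = start_local_postgres_docker_db()
    _make_qless_db_if_not_present(db_url)
    return db_url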
def run_test_e2e(db_url):
    client.startup(db_url)
    worker.start_local_workers(n_workers=1, db_url=db_url, worker_tag="tag A")
    worker.start_local_workers(
        n_workers=4, db_url=db_url, worker_tag="tag B", cleanup_timeout=1
    )
    func = _make_test_function()

    # Run a simple task
    task_id = client.submit(func, {"param": "abc"}, 123, requires_tag="tag B")
    _wait_for_true(lambda: client.get_task_result(task_id) is not None)
    result = client.get_task_result(task_id)
    assert result == len("abc") + 42
    log.log("[OK] Tasks run")

    # Tasks are rescheduled if their worker is dead:
    # start a task which takes longer than the expected heartbeat, resulting in
    # the worker being considered 'dead' and its task rescheduled. This should
    # happen `n_retries` times, after which the task status becomes TIMEOUT.
    task_id = client.submit(_sleep, {"seconds": 5}, 123, n_retries_if_worker_hangs=2)
    _wait_for_true(lambda: client.get_task_status(task_id) == TaskStatus.TIMEOUT)
    log.log("[OK] Orphaned tasks rescheduled")

    log.log("[OK] All OK! :)")
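# `_wait_for_true`, `_make_test_function` and `_sleep` are used by the e2e test but
# are not shown in this section. The sketches below illustrate one plausible shape
# for them, inferred from how they are called above (the expected result is
# `len("abc") + 42`); the actual helpers in the repo may differ.
from time import sleep, time


def _wait_for_true(predicate, timeout: float = 30.0, poll: float = 0.2) -> None:
    """Poll `predicate` until it returns True, or fail after `timeout` seconds."""
    deadline = time() + timeout
    while not predicate():
        if time() > deadline:
            raise TimeoutError("Condition not met within timeout")
        sleep(poll)


def _make_test_function():
    # Returns a serialisable function matching the assertion in the test above
    def func(param: str) -> int:
        return len(param) + 42

    return func


def _sleep(seconds: float) -> None:
    sleep(seconds)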
def _run_task(task: Task, worker_id: int) -> None:
    """
    Execute the task function and save its result if it completes, or the
    exception if it errors.

    :param task: the task to execute
    :param worker_id: the id of the current worker running the task. This is
        needed because the worker id is compared against the DB to check that
        the worker still owns the task, to avoid multiple workers working on
        the same task
    """
    func = _deserialise(task.func)
    params = _deserialise(task.kwargs)
    args = str(params)[:20]
    log(
        f"Starting task {task.id_}. Function: {func.__name__}. "
        f"Args: {args}. Worker: {worker_id}"
    )
    try:
        _save_results(task.id_, func(**params), worker_id, TaskStatus.DONE)
        log(f"Task {task.id_} completed successfully")
    except Exception as err:
        _save_results(task.id_, err, worker_id, TaskStatus.ERROR)
        log(f"Error while running task {task.id_}: {err}")
    finally:
        _set_worker_task_to_none(worker_id)
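# `_serialise` / `_deserialise` are not shown in this section. Given that the task
# table stores results in a `results_dill` column, a minimal sketch based on the
# `dill` library could look like this; treat it as an assumption, not the repo's
# exact implementation.
import dill


def _serialise(obj) -> bytes:
    # dill handles lambdas, closures and exceptions, unlike plain pickle
    return dill.dumps(obj)


def _deserialise(blob: bytes):
    return dill.loads(blob)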
def _search_for_dead_workers_and_disown_their_tasks(cleanup_timeout: float) -> None:
    """
    If a worker is dead, the task it was working on should be reset, so that
    another worker can pick it up.

    :param cleanup_timeout: a worker is considered dead if it hasn't updated
        its heartbeat on the DB in the last `cleanup_timeout` seconds
    """
    too_long_ago = datetime.now() - timedelta(seconds=cleanup_timeout)
    with sql.session_scope() as session:
        # Find any workers whose last heartbeat was more than `cleanup_timeout` ago
        dead_workers = (
            session.query(WorkerRecord)
            .with_for_update()
            .filter(WorkerRecord.last_heartbeat < too_long_ago)
            .all()
        )
        for dead_worker in dead_workers:
            task_id = dead_worker.working_on_task_id
            if task_id is not None:
                # Reset the task (if any) that the worker was working on
                orphan_task = session.query(TaskRecord).with_for_update().get(task_id)
                log(
                    f"Worker {dead_worker.id_} has not responded in {cleanup_timeout} "
                    f"seconds. Its task {task_id} will be disowned, and..."
                )
                orphan_task.owner = NO_OWNER
                retries = orphan_task.retries
                if retries == 0:
                    log("...Task status set to TIMEOUT. No more retries left")
                    orphan_task.status = TaskStatus.TIMEOUT.value
                else:
                    log(f"...Task status set to PENDING. {retries} retries left")
                    orphan_task.status = TaskStatus.PENDING.value
                    orphan_task.retries = retries - 1
                session.merge(orphan_task)
                dead_worker.working_on_task_id = None
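# The dead-worker check above relies on `WorkerRecord.last_heartbeat`, which the
# worker loop refreshes via `_hearbeat(me)` (not shown here). A plausible sketch:
# update the heartbeat timestamp and return True so that the `while` loop in
# `_run_worker` keeps going. The real function may also handle shutdown signals.
def _hearbeat_sketch(worker_id: int) -> bool:
    with sql.session_scope() as session:
        worker = session.query(WorkerRecord).with_for_update().get(worker_id)
        if worker is None:
            return False  # worker record gone: stop the loop
        worker.last_heartbeat = datetime.now()
        session.merge(worker)
    return True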
def _save_results(
    task_id: int, results: Any, worker_id: int, status: Optional[TaskStatus] = None
) -> None:
    """
    Saves the result for a given task (either the return value of the function
    executed or the exception raised).

    :param task_id: unique identifier for the task. This is created when the
        task is submitted.
    :param results: any object that can be serialised
    :param worker_id: the identifier for the worker attempting to save the
        results. This is needed because only workers that legitimately own a
        task are allowed to save results for it, which prevents multiple
        workers working on the same task
    :param status: if set, the task status will also be updated to this
    """
    serialised_results = _serialise(results)
    with sql.session_scope() as session:
        task = session.query(TaskRecord).get(task_id)
        # Only save results if the task is RUNNING and this worker still owns it
        if task.status == TaskStatus.RUNNING.value and task.owner == worker_id:
            log(
                f"Saving result for task {task_id}, with size = {len(serialised_results)}"
            )
            task.results_dill = serialised_results
            if status is not None:
                task.status = status.value
            session.merge(task)
        elif task.status != TaskStatus.RUNNING.value:
            log(f"Task {task_id} not RUNNING. Status={task.status}. Results discarded.")
        elif task.owner != worker_id:
            log(
                f"Worker {worker_id} running task {task_id}, but task owner is: "
                f"{task.owner}. Results discarded."
            )
        worker = session.query(WorkerRecord).get(worker_id)
        worker.working_on_task_id = None
        session.merge(worker)
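# `sql.session_scope()` is used throughout as a transactional context manager. It is
# not shown in this section; the sketch below is the standard SQLAlchemy pattern it
# presumably follows (commit on success, rollback on error), with `_Session` assumed
# to be a module-level `sessionmaker` bound to `_engine`.
from contextlib import contextmanager


@contextmanager
def _session_scope_sketch():
    session = _Session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()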
def _create_all_tables() -> None:
    global _engine
    # NB: a fresh `MetaData()` knows nothing about existing tables, so ask the
    # engine whether the `task` table already exists in the database
    if not _engine.has_table("task"):
        BASE.metadata.create_all(_engine)
        log("Created all tables")
def reset() -> None:
    """
    Destroy all task data. Drops all tables and recreates them
    """
    BASE.metadata.drop_all(_engine)
    log("Dropped all tables")
    _create_all_tables()
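# `TaskRecord` and `WorkerRecord` are defined elsewhere in the repo. Inferring only
# from the attributes referenced in this section, the models might look roughly like
# the sketch below; column types, and names such as `tag`, are assumptions, and the
# real models almost certainly carry more fields.
from sqlalchemy import Column, DateTime, Integer, String


class TaskRecordSketch(BASE):
    __tablename__ = "task_sketch"
    id_ = Column(Integer, primary_key=True)
    creator = Column(Integer)
    owner = Column(Integer)
    status = Column(Integer)
    retries = Column(Integer)
    func = Column(String)
    kwargs = Column(String)
    results_dill = Column(String)


class WorkerRecordSketch(BASE):
    __tablename__ = "worker_sketch"
    id_ = Column(Integer, primary_key=True)
    tag = Column(String)
    last_heartbeat = Column(DateTime)
    working_on_task_id = Column(Integer)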
        creator=creator,
        status=TaskStatus.RUNNING.value,
        func=func,
        kwargs=kwargs,
        results="",
    )


def _help() -> str:
    return """
    Usage:

        $ python -m queueless.worker POSTGRES_DB_URL [TAG] [TICK_SECONDS]

    POSTGRES_DB_URL: full postgres connection string
    TAG: arbitrary string which enables this worker to execute tasks with the same tag
    TICK_SECONDS: how long the worker sleeps between polls to the DB (default: 1.0)

    Example:

        $ python -m queueless.worker postgres://postgres:test@localhost:5000/qless my_tag_1
    """


if __name__ == "__main__":
    args = sys.argv
    if len(args) < 2:
        log(_help())
    else:
        db_connection_string = args[1]
        tag = args[2] if len(args) > 2 else ""
        tick = float(args[3]) if len(args) > 3 else 1.0
        _run_worker(db_connection_string, tick, tag)