def get(self): """Return a list of tasks""" request_args = request.args.to_dict() request_args["status"] = request.args.getlist("status") request_args = TasksSchema().load(request_args) # unpack query parameter skip, limit = request_args["skip"], request_args["limit"] statuses = request_args.get("status") schedule_name = request_args.get("schedule_name") # get tasks from database query = {} if statuses: query["status"] = {"$in": statuses} if schedule_name: query["schedule_name"] = schedule_name count = Tasks().count_documents(query) cursor = Tasks().aggregate([ { "$match": query }, { "$project": { "schedule_name": 1, "status": 1, "timestamp": 1, "worker": 1, "config.resources": 1, "updated_at": { "$arrayElemAt": ["$events.timestamp", -1] }, } }, { "$sort": { "updated_at": pymongo.DESCENDING } }, { "$skip": skip }, { "$limit": limit }, ]) tasks = list(cursor) return jsonify({ "meta": { "skip": skip, "limit": limit, "count": count }, "items": tasks })
def handle_notification(task_id, event):
    # alias for all complete statuses
    if event in TaskStatus.complete():
        event = "ended"

    # exit early if not a triggering event
    if event not in GlobalNotifications.events:
        return

    task = Tasks().find_one({"_id": task_id}) or RequestedTasks().find_one(
        {"_id": task_id}
    )
    if not task:
        return

    # serialize/unserialize task so we use a safe version from now on
    task = json.loads(json.dumps(task, cls=Encoder))

    global_notifs = GlobalNotifications.entries.get(event, {})
    task_notifs = task.get("notification", {}).get(event, {})

    # exit early if we don't have notification requests for the event
    if not global_notifs and not task_notifs:
        return

    for method, recipients in list(task_notifs.items()) + list(global_notifs.items()):
        func = {
            "mailgun": handle_mailgun_notification,
            "webhook": handle_webhook_notification,
            "slack": handle_slack_notification,
        }.get(method)
        if func and recipients:
            func(task, recipients)
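
# handle_mailgun_notification, handle_webhook_notification and
# handle_slack_notification are referenced above but not shown. A minimal
# sketch of what the webhook variant could look like, assuming `requests` is
# available and recipients are plain URLs; the real handler may differ.
import requests

def handle_webhook_notification(task, recipients):
    # POST the serialized (already JSON-safe) task to each recipient URL
    for url in recipients:
        try:
            requests.post(url, json=task, timeout=5)
        except requests.RequestException as exc:
            logger.error(f"webhook notification to {url} failed: {exc}")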
def history_cleanup():
    """removes tasks of schedules which have run more than
    HISTORY_TASK_PER_SCHEDULE times, keeping only the most recent ones"""

    logger.info(f":: removing tasks history (>{HISTORY_TASK_PER_SCHEDULE})")

    cursor = Tasks().aggregate(
        [
            {"$group": {"_id": "$schedule_name", "count": {"$sum": 1}}},
            {"$match": {"count": {"$gt": HISTORY_TASK_PER_SCHEDULE}}},
        ]
    )
    schedules_with_too_many_tasks = [s["_id"] for s in cursor]

    task_ids_to_delete = []
    for schedule_name in schedules_with_too_many_tasks:
        cursor = Tasks().aggregate(
            [
                {"$match": {"schedule_name": schedule_name}},
                {
                    "$project": {
                        "schedule_name": 1,
                        "updated_at": {"$arrayElemAt": ["$events.timestamp", -1]},
                    }
                },
                {"$sort": {"updated_at": pymongo.DESCENDING}},
                {"$skip": HISTORY_TASK_PER_SCHEDULE},
            ]
        )
        task_ids_to_delete += [t["_id"] for t in cursor]

    result = Tasks().delete_many({"_id": {"$in": task_ids_to_delete}})
    logger.info(
        f"::: deleted {result.deleted_count}/{len(task_ids_to_delete)} tasks"
    )
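
# The two pipelines above amount to "keep the N most recent tasks per
# schedule". A plain-Python equivalent of the retention rule, for
# illustration only (the value of HISTORY_TASK_PER_SCHEDULE is hypothetical):
HISTORY_TASK_PER_SCHEDULE = 10

def split_history(tasks_sorted_newest_first):
    """Return (kept, deleted) slices for one schedule's tasks."""
    kept = tasks_sorted_newest_first[:HISTORY_TASK_PER_SCHEDULE]
    deleted = tasks_sorted_newest_first[HISTORY_TASK_PER_SCHEDULE:]
    return kept, deleted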
def update_schedule_duration(schedule_name):
    """set/update the `duration` object of a schedule by looking at its recent tasks

    value is computed with `scraper_completed - started` timestamps"""

    schedule_query = {"name": schedule_name}

    # retrieve last tasks that completed the resource-intensive part
    query = {
        "schedule_name": schedule_name,
        f"timestamp.{TaskStatus.scraper_completed}": {"$exists": True},
        f"timestamp.{TaskStatus.started}": {"$exists": True},
        "container.exit_code": 0,
    }

    document = {
        "default": get_default_duration(),
    }

    # we have no finished task for this schedule, use the default duration
    if Tasks().count_documents(query) == 0:
        document.update({"available": False, "workers": {}})
    # compute duration from last completed tasks
    else:
        tasks = (
            Tasks()
            .find(query, {"timestamp": 1, "worker": 1})
            .sort(f"timestamp.{TaskStatus.scraper_completed}", pymongo.ASCENDING)
        )
        workers = {
            task["worker"]: {
                "worker": task["worker"],
                "task": task["_id"],
                "value": int(
                    (
                        task["timestamp"]["scraper_completed"]
                        - task["timestamp"]["started"]
                    ).total_seconds()
                ),
                "on": task["timestamp"][TaskStatus.scraper_completed],
            }
            for task in tasks
        }
        if workers:
            document.update({"available": True, "workers": workers})

    Schedules().update_one(schedule_query, {"$set": {"duration": document}})
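
# The per-worker `value` above is just wall-clock seconds between the two
# timestamps; a quick worked example (dates are illustrative):
import datetime

started = datetime.datetime(2021, 5, 1, 10, 0, 0)
scraper_completed = datetime.datetime(2021, 5, 1, 12, 30, 0)
assert int((scraper_completed - started).total_seconds()) == 9000  # 2h30m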
def staled_statuses():
    """ set the status for tasks in an unfinished state """

    now = getnow()

    # `started` statuses
    status_to_cancel(now, TaskStatus.started, STALLED_STARTED_TIMEOUT)

    # `reserved` statuses
    status_to_cancel(now, TaskStatus.reserved, STALLED_RESERVED_TIMEOUT)

    # `cancel_requested` statuses
    status_to_cancel(now, TaskStatus.cancel_requested, STALLED_CANCELREQ_TIMEOUT)

    # `scraper_completed` statuses: either success or failure
    status = TaskStatus.scraper_completed
    logger.info(
        f":: closing tasks `{status}` for more than {STALLED_COMPLETED_TIMEOUT}s"
    )
    ago = now - datetime.timedelta(seconds=STALLED_COMPLETED_TIMEOUT)
    query = {"status": status, f"timestamp.{status}": {"$lte": ago}}

    query_success = {"container.exit_code": 0}
    query_success.update(query)
    result = Tasks().update_many(
        query_success,
        {
            "$set": {
                "status": TaskStatus.succeeded,
                f"timestamp.{TaskStatus.succeeded}": now,
            }
        },
    )
    logger.info(
        f"::: succeeded {result.modified_count}/{result.matched_count} tasks"
    )

    query_failed = {"container.exit_code": {"$ne": 0}}
    query_failed.update(query)
    result = Tasks().update_many(
        query_failed,
        {
            "$set": {
                "status": TaskStatus.failed,
                f"timestamp.{TaskStatus.failed}": now,
            }
        },
    )
    logger.info(f"::: failed {result.modified_count}/{result.matched_count} tasks")
def task_canceled_event_handler(task_id, payload):
    logger.info(f"Task Cancelled: {task_id}")

    # if the canceled event carries a `canceled_by` and we have none on the task,
    # then store it; otherwise keep what's in the task (manual request)
    canceled_by = None
    task = Tasks().find_one({"_id": task_id}, {"canceled_by": 1})
    if payload.get("canceled_by") and task and not task.get("canceled_by"):
        canceled_by = payload.get("canceled_by")

    save_event(
        task_id,
        TaskStatus.canceled,
        get_timestamp_from_event(payload),
        task_log=payload.get("log"),
        canceled_by=canceled_by,
    )
def get(self, task_id: str):
    task = Tasks().find_one({"_id": task_id})
    if task is None:
        raise TaskNotFound()

    task["updated_at"] = task["events"][-1]["timestamp"]
    return jsonify(task)
def post(self, task_id: str, token: AccessToken.Payload): """ create a task from a requested_task_id """ requested_task = RequestedTasks().find_one({"_id": task_id}) if requested_task is None: raise TaskNotFound() request_args = TaskCreateSchema().load(request.args.to_dict()) document = {} document.update(requested_task) try: Tasks().insert_one(requested_task) except pymongo.errors.DuplicateKeyError as exc: logger.exception(exc) response = jsonify({}) response.status_code = 423 # Locked return response except Exception as exc: logger.exception(exc) raise exc payload = {"worker": request_args["worker_name"]} try: task_event_handler(task_id, TaskStatus.reserved, payload) except Exception as exc: logger.exception(exc) logger.error("unable to create task. reverting.") try: Tasks().delete_one({"_id": task_id}) except Exception: logger.debug(f"unable to revert deletion of task {task_id}") raise exc try: RequestedTasks().delete_one({"_id": task_id}) except Exception as exc: logger.exception(exc) # and pass BROADCASTER.broadcast_updated_task(task_id, TaskStatus.reserved, payload) return make_response(jsonify(Tasks().find_one({"_id": task_id})), HTTPStatus.CREATED)
def request_tasks_using_schedule():
    """ create requested_tasks based on schedule's periodicity field

        Expected to be run periodically to compute what needs to be scheduled """

    requester = "period-scheduler"
    priority = 0
    worker = None

    query = {"enabled": True}
    projection = {"name": 1, "config": 1, "most_recent_task": 1}

    for period, period_data in {
        p: PERIODICITIES.get(p) for p in SchedulePeriodicity.all()
    }.items():
        if not period_data:
            continue  # "manually" has no data

        period_start = getnow() - datetime.timedelta(days=period_data["days"])
        logger.debug(f"requesting for `{period}` schedules (before {period_start})")

        # find non-requested schedules which last run started before our period start
        query["periodicity"] = period
        for schedule in Schedules().find(query, projection):
            # don't bother if the schedule's already requested
            if (
                RequestedTasks().count_documents({"schedule_name": schedule["name"]})
                > 0
            ):
                continue

            if schedule.get("most_recent_task"):
                last_run = Tasks().find_one(
                    {"_id": schedule["most_recent_task"]["_id"]}, {"timestamp": 1}
                )
                # don't bother if it started after this rolling period's start
                if (
                    last_run
                    and last_run["timestamp"].get(
                        "started", datetime.datetime(2019, 1, 1)
                    )
                    > period_start
                ):
                    continue

            if request_a_schedule(schedule["name"], requester, worker, priority):
                logger.debug(f"requested {schedule['name']}")
            else:
                logger.debug(f"could not request {schedule['name']}")
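
# PERIODICITIES is only consumed through its `days` key above. A shape
# consistent with that usage is sketched here; the period names and day
# counts are assumptions, not taken from the code:
PERIODICITIES = {
    "monthly": {"days": 31},
    "quarterly": {"days": 90},
    "annually": {"days": 365},
    # "manually" has no entry, so the loop above skips it
}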
def status_to_cancel(now, status, timeout):
    logger.info(f":: canceling tasks `{status}` for more than {timeout}s")
    ago = now - datetime.timedelta(seconds=timeout)
    query = {"status": status, f"timestamp.{status}": {"$lte": ago}}
    result = Tasks().update_many(
        query,
        {
            "$set": {
                "status": TaskStatus.canceled,
                "canceled_by": NAME,
                f"timestamp.{TaskStatus.canceled}": now,
            }
        },
    )
    logger.info(
        f"::: canceled {result.modified_count}/{result.matched_count} tasks"
    )
def get_currently_running_tasks(worker_name):
    """ list of tasks being run by worker at this moment, including ETA """

    running_tasks = list(
        Tasks().find(
            {"status": {"$nin": TaskStatus.complete()}, "worker": worker_name},
            {
                "config.resources": 1,
                "config.platform": 1,
                "schedule_name": 1,
                "timestamp": 1,
            },
        )
    )

    # calculate ETAs of the tasks we are currently running
    for task in running_tasks:
        task.update(get_task_eta(task, worker_name))

    return running_tasks
def get(self, task_id: str, token: AccessToken.Payload = None):
    # exclude notification to not expose private information (privacy)
    # on anonymous requests and requests for users without schedules_update
    projection = (
        None
        if token and token.get_permission("schedules", "update")
        else {"notification": 0}
    )
    task = Tasks().find_one({"_id": task_id}, projection)
    if task is None:
        raise TaskNotFound()

    task["updated_at"] = task["events"][-1]["timestamp"]

    if not token or not token.get_permission("tasks", "create"):
        remove_secrets_from_response(task)

    return jsonify(task)
def post(self, task_id: str, token: AccessToken.Payload):
    task = Tasks().find_one(
        {"status": {"$in": TaskStatus.incomplete()}, "_id": task_id}, {"_id": 1}
    )
    if task is None:
        raise TaskNotFound()

    task_event_handler(
        task["_id"], TaskStatus.cancel_requested, {"canceled_by": token.username}
    )

    # broadcast cancel-request to worker
    BROADCASTER.broadcast_cancel_task(task_id)

    return Response(status=HTTPStatus.NO_CONTENT)
def patch(self, task_id: str, token: AccessToken.Payload):
    task = Tasks().find_one({"_id": task_id}, {"_id": 1})
    if task is None:
        raise TaskNotFound()

    try:
        request_json = TasKUpdateSchema().load(request.get_json())
        # empty dict passes the validator but troubles mongo
        if not request.get_json():
            raise ValidationError("Update can't be empty")
    except ValidationError as e:
        raise InvalidRequestJSON(e.messages)

    task_event_handler(task["_id"], request_json["event"], request_json["payload"])

    BROADCASTER.broadcast_updated_task(
        task_id, request_json["event"], request_json["payload"]
    )

    return Response(status=HTTPStatus.NO_CONTENT)
def _update_schedule_most_recent_task_status(task_id):
    """ update `most_recent_task` value of associated schedule """

    # get schedule and last event
    cursor = Tasks().aggregate(
        [
            {"$match": {"_id": task_id}},
            {
                "$project": {
                    "schedule_name": 1,
                    "last_event": {"$arrayElemAt": ["$events", -1]},
                }
            },
        ]
    )
    tasks = list(cursor)
    task = tasks[0] if tasks else None
    if not task:
        return

    # update schedule most recent task
    schedule_name = task["schedule_name"]
    last_event_code = task["last_event"]["code"]
    last_event_timestamp = task["last_event"]["timestamp"]
    if "container" in last_event_code:
        return

    schedule_updates = {
        "most_recent_task": {
            "_id": task_id,
            "status": last_event_code,
            "updated_at": last_event_timestamp,
        }
    }
    Schedules().update_one({"name": schedule_name}, {"$set": schedule_updates})
def patch(self, schedule_name: str, token: AccessToken.Payload):
    """Update all properties of a schedule but _id and most_recent_task"""

    query = {"name": schedule_name}
    schedule = Schedules().find_one(query, {"config.task_name": 1})
    if not schedule:
        raise ScheduleNotFound()

    try:
        update = UpdateSchema().load(request.get_json())  # , partial=True
        # empty dict passes the validator but troubles mongo
        if not request.get_json():
            raise ValidationError("Update can't be empty")

        # ensure we test flags according to new task_name if present
        if "task_name" in update:
            if "flags" not in update:
                raise ValidationError("Can't update offliner without updating flags")
            flags_schema = ScheduleConfigSchema.get_offliner_schema(
                update["task_name"]
            )
        else:
            flags_schema = ScheduleConfigSchema.get_offliner_schema(
                schedule["config"]["task_name"]
            )

        if "flags" in update:
            flags_schema().load(update["flags"])
    except ValidationError as e:
        raise InvalidRequestJSON(e.messages)

    if "name" in update:
        if Schedules().count_documents({"name": update["name"]}):
            raise BadRequest(
                "Schedule with name `{}` already exists".format(update["name"])
            )

    config_keys = [
        "task_name",
        "warehouse_path",
        "image",
        "resources",
        "platform",
        "flags",
    ]
    mongo_update = {
        f"config.{key}" if key in config_keys else key: value
        for key, value in update.items()
    }

    matched_count = (
        Schedules().update_one(query, {"$set": mongo_update}).matched_count
    )

    if matched_count:
        tasks_query = {"schedule_name": schedule_name}
        if "name" in update:
            Tasks().update_many(
                tasks_query, {"$set": {"schedule_name": update["name"]}}
            )
            RequestedTasks().update_many(
                tasks_query, {"$set": {"schedule_name": update["name"]}}
            )
        return Response(status=HTTPStatus.NO_CONTENT)

    raise ScheduleNotFound()
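
# Worked example of the `mongo_update` mapping above: keys listed in
# `config_keys` are nested under `config.`, everything else stays top-level.
# The payload values here are illustrative.
config_keys = ["task_name", "warehouse_path", "image", "resources", "platform", "flags"]
update = {"name": "wikipedia_fr_all", "image": {"name": "mwoffliner", "tag": "1.9.0"}}
mongo_update = {
    f"config.{key}" if key in config_keys else key: value
    for key, value in update.items()
}
assert mongo_update == {
    "name": "wikipedia_fr_all",
    "config.image": {"name": "mwoffliner", "tag": "1.9.0"},
}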
def make_task(database, make_event):
    task_ids = []
    tasks = Tasks(database=database)

    def _make_task(
        schedule_id=ObjectId(),
        schedule_name="",
        status=TaskStatus.succeeded,
        hostname="zimfarm_worker.com",
    ):
        if status == TaskStatus.requested:
            events = [TaskStatus.requested]
        elif status == TaskStatus.reserved:
            events = [TaskStatus.requested, TaskStatus.reserved]
        elif status == TaskStatus.started:
            events = [TaskStatus.requested, TaskStatus.reserved, TaskStatus.started]
        elif status == TaskStatus.succeeded:
            events = [
                TaskStatus.requested,
                TaskStatus.reserved,
                TaskStatus.started,
                TaskStatus.succeeded,
            ]
        else:
            events = [
                TaskStatus.requested,
                TaskStatus.reserved,
                TaskStatus.started,
                TaskStatus.failed,
            ]

        timestamp = {event: datetime.now() for event in events}
        events = [make_event(event, timestamp[event]) for event in events]
        container = {
            "command": "mwoffliner --mwUrl=https://example.com",
            "image": {"name": "mwoffliner", "tag": "1.8.0"},
            "exit_code": 0,
            "stderr": "example_stderr",
            "stdout": "example_stdout",
        }
        debug = {"args": [], "kwargs": {}}
        if status == TaskStatus.failed:
            debug["exception"] = "example_exception"
            debug["traceback"] = "example_traceback"
            files = {}
        else:
            files = {"mwoffliner_1.zim": {"name": "mwoffliner_1.zim", "size": 1000}}

        task = {
            "_id": ObjectId(),
            "status": status,
            "worker": hostname,
            "schedule_name": schedule_name,
            "timestamp": timestamp,
            "events": events,
            "container": container,
            "debug": debug,
            "files": files,
        }
        tasks.insert_one(task)
        task_ids.append(task["_id"])
        return task

    yield _make_task

    tasks.delete_many({"_id": {"$in": task_ids}})
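
# A minimal sketch of using this pytest fixture in a test; the assertions
# simply restate the fields populated by the fixture above.
def test_failed_task_has_debug_info(make_task):
    task = make_task(schedule_name="wikipedia_fr", status=TaskStatus.failed)
    assert task["status"] == TaskStatus.failed
    assert task["debug"]["exception"] == "example_exception"
    assert task["files"] == {}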
def save_event(task_id: ObjectId, code: str, timestamp: datetime.datetime, **kwargs):
    """ save event and its accompanying data to database """

    task_updates = {}
    if "file" not in code:
        # don't update timestamp for file events as they are not unique
        task_updates[f"timestamp.{code}"] = timestamp

    # insert event and sort by timestamp
    Tasks().update_one(
        {"_id": task_id},
        {
            "$push": {
                "events": {
                    "$each": [{"code": code, "timestamp": timestamp}],
                    "$sort": {"timestamp": 1},
                }
            }
        },
    )

    # update task status, timestamp and other fields
    if "file" not in code:
        task_updates["status"] = code

    def add_to_update_if_present(payload_key, update_key):
        if payload_key in kwargs:
            task_updates[update_key] = kwargs[payload_key]

    add_to_update_if_present("worker", "worker")
    add_to_update_if_present("canceled_by", "canceled_by")
    add_to_update_if_present("command", "container.command")
    add_to_update_if_present("image", "container.image")
    add_to_update_if_present("exit_code", "container.exit_code")
    add_to_update_if_present("stdout", "container.stdout")
    add_to_update_if_present("stderr", "container.stderr")
    add_to_update_if_present("timeout", "container.timeout")
    add_to_update_if_present("log", "container.log")
    add_to_update_if_present("task_log", "debug.log")
    add_to_update_if_present("task_name", "debug.task_name")
    add_to_update_if_present("task_args", "debug.task_args")
    add_to_update_if_present("task_kwargs", "debug.task_kwargs")
    add_to_update_if_present("traceback", "debug.traceback")
    add_to_update_if_present("exception", "debug.exception")

    # files are uploaded as they are created ; 2 events:
    # - one on file creation with name, size and status=created
    # - one on file upload complete with name and status=uploaded
    if kwargs.get("file", {}).get("name"):
        # mongo doesn't support `.` in keys (so we replace with a Unicode
        # fullwidth full stop)
        fkey = kwargs["file"]["name"].replace(".", "．")
        fstatus = kwargs["file"].get("status")
        if fstatus == "created":
            task_updates[f"files.{fkey}"] = {
                "name": kwargs["file"]["name"],
                "size": kwargs["file"].get("size"),  # missing in uploaded
                "status": fstatus,
                f"{fstatus}_timestamp": timestamp,
            }
        elif fstatus in ("uploaded", "failed"):
            task_updates[f"files.{fkey}.status"] = fstatus
            task_updates[f"files.{fkey}.{fstatus}_timestamp"] = timestamp

    Tasks().update_one({"_id": task_id}, {"$set": task_updates})

    _update_schedule_most_recent_task_status(task_id)

    if code == TaskStatus.scraper_completed:
        schedule_name = Tasks().find_one({"_id": task_id}, {"schedule_name": 1})[
            "schedule_name"
        ]
        update_schedule_duration(schedule_name)
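
# Illustrative call for the "file created" case handled above; the event
# code and the file payload values are assumptions for the example.
example_task_id = ObjectId()  # an existing task's _id in practice
save_event(
    example_task_id,
    "created_file",  # hypothetical event code containing "file"
    getnow(),
    file={"name": "wikipedia_fr_all.zim", "size": 123456789, "status": "created"},
)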