def patch(self, task_id: str, token: AccessToken.Payload):
    """Apply a validated event update to an existing task.

    Raises TaskNotFound if the task doesn't exist and InvalidRequestJSON
    if the payload fails schema validation or is empty.
    """
    # make sure the task exists before accepting any update for it
    task = Tasks().find_one({"_id": task_id}, {"_id": 1})
    if task is None:
        raise TaskNotFound()

    payload = request.get_json()
    try:
        update = TasKUpdateSchema().load(payload)
        # empty dict passes the validator but troubles mongo
        if not payload:
            raise ValidationError("Update can't be empty")
    except ValidationError as exc:
        raise InvalidRequestJSON(exc.messages)

    # persist the event, then notify listeners about the change
    task_event_handler(task["_id"], update["event"], update["payload"])
    BROADCASTER.broadcast_updated_task(task_id, update["event"], update["payload"])

    return Response(status=HTTPStatus.NO_CONTENT)
def status_to_cancel(now, status, timeout):
    """Cancel all tasks stuck in `status` for more than `timeout` seconds.

    Marks matching tasks as canceled (attributed to this service via NAME),
    stamps the cancellation time, appends a canceled event, then logs how
    many documents were actually modified.
    """
    logger.info(f":: canceling tasks `{status}` for more than {timeout}s")

    # any task whose `status` timestamp is older than this is considered stuck
    cutoff = now - datetime.timedelta(seconds=timeout)
    selector = {"status": status, f"timestamp.{status}": {"$lte": cutoff}}
    changes = {
        "$set": {
            "status": TaskStatus.canceled,
            "canceled_by": NAME,
            f"timestamp.{TaskStatus.canceled}": now,
        },
        "$push": {
            "events": {
                "code": TaskStatus.canceled,
                "timestamp": now,
            }
        },
    }

    result = Tasks().update_many(selector, changes)
    logger.info(
        f"::: canceled {result.modified_count}/{result.matched_count} tasks")
def get_currently_running_tasks(worker_name):
    """ list of tasks being run by worker at this moment, including ETA """
    # every task for this worker that hasn't reached a terminal status
    query = {
        "status": {"$nin": TaskStatus.complete()},
        "worker": worker_name,
    }
    projection = {
        "config.resources": 1,
        "config.platform": 1,
        "schedule_name": 1,
        "timestamp": 1,
    }
    running = list(Tasks().find(query, projection))

    # annotate each in-flight task with its estimated completion data
    for entry in running:
        entry.update(get_task_eta(entry, worker_name))

    return running
def _update_schedule_most_recent_task_status(task_id):
    """ update `most_recent_task` value of associated schedule """
    # fetch the task's schedule name along with its latest recorded event
    pipeline = [
        {"$match": {"_id": task_id}},
        {
            "$project": {
                "schedule_name": 1,
                "last_event": {"$arrayElemAt": ["$events", -1]},
            }
        },
    ]
    task = next(Tasks().aggregate(pipeline), None)
    if not task:
        return

    schedule_name = task["schedule_name"]
    last_event_code = task["last_event"]["code"]
    last_event_timestamp = task["last_event"]["timestamp"]

    # container-level events don't represent a task status change; skip them
    if "container" in last_event_code:
        return

    Schedules().update_one(
        {"name": schedule_name},
        {
            "$set": {
                "most_recent_task": {
                    "_id": task_id,
                    "status": last_event_code,
                    "updated_at": last_event_timestamp,
                }
            }
        },
    )
def save_event(task_id: ObjectId, code: str, timestamp: datetime.datetime, **kwargs):
    """save event and its accompanying data to database

    Pushes the event onto the task's (timestamp-sorted) events array, then
    applies derived updates: status/timestamp (non-file events only), optional
    container/debug metadata found in kwargs, and per-file bookkeeping.
    Finally refreshes the schedule's `most_recent_task` and, on scraper
    completion, its duration statistics.
    """
    task_updates = {}
    if "file" not in code:
        # don't update timestamp for file events as not unique
        task_updates[f"timestamp.{code}"] = timestamp

    # insert event and sort by timestamp
    Tasks().update_one(
        {"_id": task_id},
        {
            "$push": {
                "events": {
                    "$each": [{"code": code, "timestamp": timestamp}],
                    "$sort": {"timestamp": 1},
                }
            }
        },
    )

    # update task status, timestamp and other fields
    if "file" not in code:
        task_updates["status"] = code

    def add_to_update_if_present(payload_key, update_key):
        # copy a kwargs value into the pending $set under a (possibly nested) key
        if payload_key in kwargs:
            task_updates[update_key] = kwargs[payload_key]

    add_to_update_if_present("worker", "worker")
    add_to_update_if_present("canceled_by", "canceled_by")
    add_to_update_if_present("command", "container.command")
    add_to_update_if_present("image", "container.image")
    add_to_update_if_present("exit_code", "container.exit_code")
    add_to_update_if_present("stdout", "container.stdout")
    add_to_update_if_present("stderr", "container.stderr")
    add_to_update_if_present("timeout", "container.timeout")
    add_to_update_if_present("log", "container.log")
    add_to_update_if_present("task_log", "debug.log")
    add_to_update_if_present("task_name", "debug.task_name")
    add_to_update_if_present("task_args", "debug.task_args")
    add_to_update_if_present("task_kwargs", "debug.task_kwargs")
    add_to_update_if_present("traceback", "debug.traceback")
    add_to_update_if_present("exception", "debug.exception")

    # files are uploaded as they are created ; 2 events:
    # - one on file creation with name, size and status=created
    # - one on file upload complete with name and status=uploaded
    if kwargs.get("file", {}).get("name"):
        # mongo doesn't support `.` in keys (so we replace with Unicode Full Stop)
        # BUG FIX: previous code replaced "." with "." (a no-op), leaving dots
        # in the key; now swapped for U+FF0E FULLWIDTH FULL STOP as intended
        fkey = kwargs["file"]["name"].replace(".", "．")
        fstatus = kwargs["file"].get("status")
        if fstatus == "created":
            task_updates[f"files.{fkey}"] = {
                "name": kwargs["file"]["name"],
                "size": kwargs["file"].get("size"),  # missing in uploaded
                "status": fstatus,
                f"{fstatus}_timestamp": timestamp,
            }
        elif fstatus in ("uploaded", "failed"):
            task_updates[f"files.{fkey}.status"] = fstatus
            task_updates[f"files.{fkey}.{fstatus}_timestamp"] = timestamp

    Tasks().update_one({"_id": task_id}, {"$set": task_updates})

    _update_schedule_most_recent_task_status(task_id)

    if code == TaskStatus.scraper_completed:
        schedule_name = Tasks().find_one(
            {"_id": task_id}, {"schedule_name": 1}
        )["schedule_name"]
        update_schedule_duration(schedule_name)
def patch(self, schedule_name: str, token: AccessToken.Payload):
    """Update all properties of a schedule but _id and most_recent_task"""
    query = {"name": schedule_name}
    # only need task_name to pick the right flags schema when it isn't updated
    schedule = Schedules().find_one(query, {"config.task_name": 1})
    if not schedule:
        raise ScheduleNotFound()
    try:
        update = UpdateSchema().load(request.get_json())  # , partial=True
        # empty dict passes the validator but troubles mongo
        if not request.get_json():
            raise ValidationError("Update can't be empty")
        # ensure we test flags according to new task_name if present
        if "task_name" in update:
            # changing offliner invalidates existing flags; require both together
            if "flags" not in update:
                raise ValidationError(
                    "Can't update offliner without updating flags"
                )
            flags_schema = ScheduleConfigSchema.get_offliner_schema(
                update["task_name"]
            )
        else:
            # keep validating against the currently-stored offliner
            flags_schema = ScheduleConfigSchema.get_offliner_schema(
                schedule["config"]["task_name"]
            )
        if "flags" in update:
            flags_schema().load(update["flags"])
    except ValidationError as e:
        raise InvalidRequestJSON(e.messages)
    # renames must not collide with an existing schedule
    if "name" in update:
        if Schedules().count_documents({"name": update["name"]}):
            raise BadRequest(
                "Schedule with name `{}` already exists".format(update["name"])
            )
    # these update keys live nested under `config.` in the document
    config_keys = [
        "task_name",
        "warehouse_path",
        "image",
        "resources",
        "platform",
        "flags",
    ]
    mongo_update = {
        f"config.{key}" if key in config_keys else key: value
        for key, value in update.items()
    }
    matched_count = (
        Schedules().update_one(query, {"$set": mongo_update}).matched_count
    )
    if matched_count:
        tasks_query = {"schedule_name": schedule_name}
        # cascade a rename to tasks referencing this schedule by name
        if "name" in update:
            Tasks().update_many(
                tasks_query, {"$set": {"schedule_name": update["name"]}}
            )
            RequestedTasks().update_many(
                tasks_query, {"$set": {"schedule_name": update["name"]}}
            )
        return Response(status=HTTPStatus.NO_CONTENT)
    # schedule vanished between the find_one above and the update
    raise ScheduleNotFound()
def make_task(database, make_event):
    """Fixture factory creating task documents, removed again on teardown."""
    created_ids = []
    tasks = Tasks(database=database)

    def _event_chain(status):
        # cumulative lifecycle up to (and including) the requested status;
        # unknown statuses fall through to the failed chain, as before
        chain = [TaskStatus.requested]
        if status == TaskStatus.requested:
            return chain
        chain.append(TaskStatus.reserved)
        if status == TaskStatus.reserved:
            return chain
        chain.append(TaskStatus.started)
        if status == TaskStatus.started:
            return chain
        chain.append(
            TaskStatus.succeeded
            if status == TaskStatus.succeeded
            else TaskStatus.failed
        )
        return chain

    def _make_task(
        schedule_id=ObjectId(),
        schedule_name="",
        status=TaskStatus.succeeded,
        hostname="zimfarm_worker.com",
    ):
        codes = _event_chain(status)
        timestamp = {code: datetime.now() for code in codes}
        events = [make_event(code, timestamp[code]) for code in codes]

        container = {
            "command": "mwoffliner --mwUrl=https://example.com",
            "image": {"name": "mwoffliner", "tag": "1.8.0"},
            "exit_code": 0,
            "stderr": "example_stderr",
            "stdout": "example_stdout",
        }
        debug = {"args": [], "kwargs": {}}
        if status == TaskStatus.failed:
            # failed tasks carry debug details and produce no files
            debug["exception"] = "example_exception"
            debug["traceback"] = "example_traceback"
            files = {}
        else:
            files = {
                "mwoffliner_1.zim": {"name": "mwoffliner_1.zim", "size": 1000}
            }

        document = {
            "_id": ObjectId(),
            "status": status,
            "worker": hostname,
            "schedule_name": schedule_name,
            "timestamp": timestamp,
            "events": events,
            "container": container,
            "debug": debug,
            "files": files,
        }
        tasks.insert_one(document)
        created_ids.append(document["_id"])
        return document

    yield _make_task

    # teardown: drop everything this factory inserted
    tasks.delete_many({"_id": {"$in": created_ids}})