예제 #1
0
    def get(self):
        """Return a paginated JSON list of tasks, most recently updated first.

        Query parameters are validated by ``TasksSchema``: ``skip`` and
        ``limit`` drive pagination, while ``status`` (repeatable) and
        ``schedule_name`` are optional filters.

        The response holds ``meta`` (``skip``, ``limit`` and the total
        ``count`` of tasks matching the filters) and ``items`` (the page).
        """

        raw_args = request.args.to_dict()
        # `status` can be passed several times: collect every value
        raw_args["status"] = request.args.getlist("status")
        params = TasksSchema().load(raw_args)

        skip = params["skip"]
        limit = params["limit"]

        # only filter on criteria that were actually supplied
        query = {}
        statuses = params.get("status")
        if statuses:
            query["status"] = {"$in": statuses}
        schedule_name = params.get("schedule_name")
        if schedule_name:
            query["schedule_name"] = schedule_name

        # total number of matching tasks, regardless of pagination
        count = Tasks().count_documents(query)

        projection = {
            "schedule_name": 1,
            "status": 1,
            "timestamp": 1,
            "worker": 1,
            "config.resources": 1,
            # timestamp of the last entry of the `events` array
            "updated_at": {"$arrayElemAt": ["$events.timestamp", -1]},
        }
        pipeline = [
            {"$match": query},
            {"$project": projection},
            {"$sort": {"updated_at": pymongo.DESCENDING}},
            {"$skip": skip},
            {"$limit": limit},
        ]
        tasks = list(Tasks().aggregate(pipeline))

        meta = {"skip": skip, "limit": limit, "count": count}
        return jsonify({"meta": meta, "items": tasks})
예제 #2
0
def handle_notification(task_id, event):
    """Send the notifications configured for `event` on task `task_id`.

    Notification requests come from both the task's own `notification`
    entry and from `GlobalNotifications`. Nothing happens when the event is
    not a triggering one, when the task can't be found, or when no
    notification is configured for the event.
    """
    # all complete statuses are notified under the single `ended` alias
    if event in TaskStatus.complete():
        event = "ended"

    # not a triggering event
    if event not in GlobalNotifications.events:
        return

    lookup = {"_id": task_id}
    task = Tasks().find_one(lookup) or RequestedTasks().find_one(lookup)
    if not task:
        return

    # round-trip through JSON so handlers only receive serializable values
    task = json.loads(json.dumps(task, cls=Encoder))
    global_notifs = GlobalNotifications.entries.get(event, {})
    task_notifs = task.get("notification", {}).get(event, {})

    # no notification requested for this event
    if not (global_notifs or task_notifs):
        return

    handlers = {
        "mailgun": handle_mailgun_notification,
        "webhook": handle_webhook_notification,
        "slack": handle_slack_notification,
    }
    # task-level requests are processed before global ones
    requests = [*task_notifs.items(), *global_notifs.items()]
    for method, recipients in requests:
        func = handlers.get(method)
        if not func or not recipients:
            continue
        func(task, recipients)
예제 #3
0
def history_cleanup():
    """Delete the oldest tasks of schedules that accumulated too many of them.

    For every schedule owning more than HISTORY_TASK_PER_SCHEDULE tasks, the
    HISTORY_TASK_PER_SCHEDULE most recently updated tasks are kept and all
    the others are deleted."""

    logger.info(f":: removing tasks history (>{HISTORY_TASK_PER_SCHEDULE})")

    # schedules having more tasks than allowed
    count_per_schedule = {
        "$group": {"_id": "$schedule_name", "count": {"$sum": 1}}
    }
    over_limit = {"$match": {"count": {"$gt": HISTORY_TASK_PER_SCHEDULE}}}
    crowded_schedules = [
        row["_id"]
        for row in Tasks().aggregate([count_per_schedule, over_limit])
    ]

    stale_task_ids = []
    for schedule_name in crowded_schedules:
        pipeline = [
            {"$match": {"schedule_name": schedule_name}},
            {
                "$project": {
                    "schedule_name": 1,
                    # timestamp of the last entry of the `events` array
                    "updated_at": {"$arrayElemAt": ["$events.timestamp", -1]},
                }
            },
            {"$sort": {"updated_at": pymongo.DESCENDING}},
            # skip the most recent ones: those are the tasks we keep
            {"$skip": HISTORY_TASK_PER_SCHEDULE},
        ]
        stale_task_ids.extend(t["_id"] for t in Tasks().aggregate(pipeline))

    result = Tasks().delete_many({"_id": {"$in": stale_task_ids}})
    logger.info(
        f"::: deleted {result.deleted_count}/{len(stale_task_ids)} tasks")
예제 #4
0
def update_schedule_duration(schedule_name):
    """Set/update the `duration` object of a schedule from its recent tasks.

    A task's duration is `scraper_completed - started` (in whole seconds).
    Only tasks of this schedule holding both timestamps and a container
    exit code of 0 are considered.

    The stored `duration` document always contains:
    - `default`: the value returned by `get_default_duration()`
    - `available`: whether at least one matching task was found
    - `workers`: per-worker entry (`worker`, `task`, `value`, `on`) built
      from the worker's latest matching task (empty if none)
    """
    started = TaskStatus.started
    completed = TaskStatus.scraper_completed
    completed_key = f"timestamp.{completed}"

    # tasks that completed the resources intensive part successfully
    query = {
        "schedule_name": schedule_name,
        completed_key: {"$exists": True},
        f"timestamp.{started}": {"$exists": True},
        "container.exit_code": 0,
    }

    # single query (no prior count): the result itself tells whether we have
    # data, so a task vanishing between two queries can't leave the document
    # without its `available`/`workers` keys.
    tasks = Tasks().find(query, {"timestamp": 1, "worker": 1}).sort(
        completed_key, pymongo.ASCENDING)

    # ascending sort: a worker's later task overwrites its earlier ones
    workers = {}
    for task in tasks:
        timestamps = task["timestamp"]
        elapsed = timestamps[completed] - timestamps[started]
        workers[task["worker"]] = {
            "worker": task["worker"],
            "task": task["_id"],
            "value": int(elapsed.total_seconds()),
            "on": timestamps[completed],
        }

    document = {
        "default": get_default_duration(),
        "available": bool(workers),
        "workers": workers,
    }

    Schedules().update_one({"name": schedule_name},
                           {"$set": {"duration": document}})
예제 #5
0
def staled_statuses():
    """Close tasks left in an unfinished status for longer than its timeout.

    Tasks stalled in `started`, `reserved` or `cancel_requested` are canceled.
    Tasks stalled in `scraper_completed` are marked succeeded when their
    container exit code is 0 and failed otherwise.
    """

    now = getnow()

    # unfinished statuses which end up canceled, with their timeout
    to_cancel = (
        (TaskStatus.started, STALLED_STARTED_TIMEOUT),
        (TaskStatus.reserved, STALLED_RESERVED_TIMEOUT),
        (TaskStatus.cancel_requested, STALLED_CANCELREQ_TIMEOUT),
    )
    for stalled_status, timeout in to_cancel:
        status_to_cancel(now, stalled_status, timeout)

    # `scraper_completed` statuses: either success or failure
    status = TaskStatus.scraper_completed
    logger.info(
        f":: closing tasks `{status}` for more than {STALLED_COMPLETED_TIMEOUT}s"
    )
    ago = now - datetime.timedelta(seconds=STALLED_COMPLETED_TIMEOUT)
    stalled = {"status": status, f"timestamp.{status}": {"$lte": ago}}

    # exit code 0: the scraper did its job
    success_update = {
        "$set": {
            "status": TaskStatus.succeeded,
            f"timestamp.{TaskStatus.succeeded}": now,
        }
    }
    result = Tasks().update_many(
        {"container.exit_code": 0, **stalled}, success_update)
    logger.info(
        f"::: succeeded {result.modified_count}/{result.matched_count} tasks")

    # any other exit code (or none at all) is a failure
    failure_update = {
        "$set": {
            "status": TaskStatus.failed,
            f"timestamp.{TaskStatus.failed}": now,
        }
    }
    result = Tasks().update_many(
        {"container.exit_code": {"$ne": 0}, **stalled}, failure_update)
    logger.info(
        f"::: failed {result.modified_count}/{result.matched_count} tasks")
예제 #6
0
def task_canceled_event_handler(task_id, payload):
    """Record a `canceled` event for task `task_id`.

    The payload's `canceled_by` is forwarded only when the task exists and
    has no `canceled_by` of its own; otherwise `None` is passed to
    `save_event`. The payload's `log` is forwarded as `task_log`.
    """
    logger.info(f"Task Cancelled: {task_id}")

    task = Tasks().find_one({"_id": task_id}, {"canceled_by": 1})
    from_payload = payload.get("canceled_by")

    # a value already on the task (manual request) wins over the event's one
    use_payload_value = from_payload and task and not task.get("canceled_by")
    canceled_by = from_payload if use_payload_value else None

    save_event(
        task_id,
        TaskStatus.canceled,
        get_timestamp_from_event(payload),
        task_log=payload.get("log"),
        canceled_by=canceled_by,
    )
예제 #7
0
    def get(self, task_id: str):
        """Return the task `task_id` as JSON, adding an `updated_at` field.

        `updated_at` is the timestamp of the last entry of the task's events.

        Raises:
            TaskNotFound: no task has this `_id`.
        """
        task = Tasks().find_one({"_id": task_id})
        if task is None:
            raise TaskNotFound()

        last_event = task["events"][-1]
        task["updated_at"] = last_event["timestamp"]

        return jsonify(task)
예제 #8
0
    def post(self, task_id: str, token: AccessToken.Payload):
        """Create a task from the requested task `task_id` and reserve it.

        The requested task document is inserted into the tasks collection,
        a `reserved` event is recorded for the worker named in the request
        (`worker_name` query parameter, validated by `TaskCreateSchema`),
        the requested task is removed and the update is broadcast.

        Returns:
            201 (Created) with the created task as JSON, or an empty
            423 (Locked) response when a task with this `_id` already exists.

        Raises:
            TaskNotFound: no requested task has this `_id`.
        """

        requested_task = RequestedTasks().find_one({"_id": task_id})
        if requested_task is None:
            raise TaskNotFound()

        request_args = TaskCreateSchema().load(request.args.to_dict())

        try:
            Tasks().insert_one(requested_task)
        except pymongo.errors.DuplicateKeyError as exc:
            # task already created from this requested task
            logger.exception(exc)
            response = jsonify({})
            response.status_code = 423  # Locked
            return response
        except Exception as exc:
            logger.exception(exc)
            raise

        payload = {"worker": request_args["worker_name"]}
        try:
            task_event_handler(task_id, TaskStatus.reserved, payload)
        except Exception as exc:
            logger.exception(exc)
            logger.error("unable to create task. reverting.")
            # best-effort removal of the task we just inserted
            try:
                Tasks().delete_one({"_id": task_id})
            except Exception:
                logger.debug(f"unable to revert creation of task {task_id}")
            raise

        # task is created and reserved: failing to remove the requested task
        # is logged but must not fail the request
        try:
            RequestedTasks().delete_one({"_id": task_id})
        except Exception as exc:
            logger.exception(exc)  # and pass

        BROADCASTER.broadcast_updated_task(task_id, TaskStatus.reserved,
                                           payload)

        return make_response(jsonify(Tasks().find_one({"_id": task_id})),
                             HTTPStatus.CREATED)
예제 #9
0
def request_tasks_using_schedule():
    """Create requested_tasks for enabled schedules, based on periodicity.

    Expected to be run periodically. For each periodicity having data in
    PERIODICITIES, a schedule is requested unless it already has a requested
    task or its most recent task started within the rolling period."""

    requester = "period-scheduler"
    priority = 0
    worker = None

    query = {"enabled": True}
    projection = {"name": 1, "config": 1, "most_recent_task": 1}

    # dict.fromkeys: each periodicity is processed once, in order
    for period in dict.fromkeys(SchedulePeriodicity.all()):
        period_data = PERIODICITIES.get(period)
        if not period_data:
            continue  # manually has no data

        period_start = getnow() - datetime.timedelta(days=period_data["days"])
        logger.debug(f"requesting for `{period}` schedules (before {period_start})")

        # find non-requested schedules which last run started before our period start
        query["periodicity"] = period
        for schedule in Schedules().find(query, projection):
            name = schedule["name"]

            # don't bother if the schedule's already requested
            if RequestedTasks().count_documents({"schedule_name": name}) > 0:
                continue

            most_recent = schedule.get("most_recent_task")
            if most_recent:
                last_run = Tasks().find_one(
                    {"_id": most_recent["_id"]}, {"timestamp": 1}
                )
                if last_run:
                    # tasks without a `started` timestamp get a fixed past date
                    started_on = last_run["timestamp"].get(
                        "started", datetime.datetime(2019, 1, 1)
                    )
                    # don't bother if it started after this rolling period's start
                    if started_on > period_start:
                        continue

            if request_a_schedule(name, requester, worker, priority):
                logger.debug(f"requested {name}")
            else:
                logger.debug(f"could not request {name}")
예제 #10
0
def status_to_cancel(now, status, timeout):
    """Cancel tasks which have been in `status` for more than `timeout` seconds.

    Matching tasks (current status is `status` and its timestamp is at
    least `timeout` seconds before `now`) get the `canceled` status, a
    `canceled_by` set to NAME and a `canceled` timestamp set to `now`.
    """
    logger.info(f":: canceling tasks `{status}` for more than {timeout}s")

    deadline = now - datetime.timedelta(seconds=timeout)
    stalled = {"status": status, f"timestamp.{status}": {"$lte": deadline}}
    cancellation = {
        "$set": {
            "status": TaskStatus.canceled,
            "canceled_by": NAME,
            f"timestamp.{TaskStatus.canceled}": now,
        }
    }

    result = Tasks().update_many(stalled, cancellation)
    logger.info(
        f"::: canceled {result.modified_count}/{result.matched_count} tasks")
예제 #11
0
def get_currently_running_tasks(worker_name):
    """Return the tasks `worker_name` is running at this moment, with ETA.

    Running tasks are the worker's tasks whose status is not a complete one.
    Each returned document is extended with the output of `get_task_eta`.
    """
    not_completed = {
        "status": {"$nin": TaskStatus.complete()},
        "worker": worker_name,
    }
    wanted_fields = {
        "config.resources": 1,
        "config.platform": 1,
        "schedule_name": 1,
        "timestamp": 1,
    }
    running_tasks = list(Tasks().find(not_completed, wanted_fields))

    # enrich every task, in place, with its computed ETA
    for running_task in running_tasks:
        eta = get_task_eta(running_task, worker_name)
        running_task.update(eta)

    return running_tasks
예제 #12
0
    def get(self, task_id: str, token: AccessToken.Payload = None):
        """Return the task `task_id` as JSON, filtered by the caller's rights.

        `notification` is excluded unless the token grants `schedules.update`
        and `remove_secrets_from_response` is applied unless the token grants
        `tasks.create`. An `updated_at` field is added, holding the timestamp
        of the last entry of the task's events.

        Raises:
            TaskNotFound: no task has this `_id`.
        """

        # exclude notification to not expose private information (privacy)
        # on anonymous requests and requests for users without schedules_update
        may_see_notification = token and token.get_permission(
            "schedules", "update")
        projection = None if may_see_notification else {"notification": 0}

        task = Tasks().find_one({"_id": task_id}, projection)
        if task is None:
            raise TaskNotFound()

        last_event = task["events"][-1]
        task["updated_at"] = last_event["timestamp"]

        # secrets are only kept for callers allowed to create tasks
        if not (token and token.get_permission("tasks", "create")):
            remove_secrets_from_response(task)

        return jsonify(task)
예제 #13
0
    def post(self, task_id: str, token: AccessToken.Payload):
        """Request the cancellation of the incomplete task `task_id`.

        Records a `cancel_requested` event attributed to the token's
        username, then broadcasts the cancel request.

        Returns:
            An empty 204 (No Content) response.

        Raises:
            TaskNotFound: no task with this `_id` is in an incomplete status.
        """

        # only tasks in an incomplete status can be canceled
        cancelable = {
            "status": {"$in": TaskStatus.incomplete()},
            "_id": task_id,
        }
        task = Tasks().find_one(cancelable, {"_id": 1})
        if task is None:
            raise TaskNotFound()

        event_payload = {"canceled_by": token.username}
        task_event_handler(task["_id"], TaskStatus.cancel_requested,
                           event_payload)

        # broadcast cancel-request to worker
        BROADCASTER.broadcast_cancel_task(task_id)

        return Response(status=HTTPStatus.NO_CONTENT)
예제 #14
0
    def patch(self, task_id: str, token: AccessToken.Payload):
        """Record an event on task `task_id` and broadcast it.

        The JSON body is validated by `TasKUpdateSchema`; its `event` and
        `payload` entries are handed to `task_event_handler` and then
        broadcast.

        Returns:
            An empty 204 (No Content) response.

        Raises:
            TaskNotFound: no task has this `_id`.
            InvalidRequestJSON: the body is empty or fails validation.
        """

        task = Tasks().find_one({"_id": task_id}, {"_id": 1})
        if task is None:
            raise TaskNotFound()

        try:
            body = request.get_json()
            request_json = TasKUpdateSchema().load(body)
            # empty dict passes the validator but troubles mongo
            if not body:
                raise ValidationError("Update can't be empty")
        except ValidationError as e:
            raise InvalidRequestJSON(e.messages)

        event = request_json["event"]
        payload = request_json["payload"]

        task_event_handler(task["_id"], event, payload)

        BROADCASTER.broadcast_updated_task(task_id, event, payload)

        return Response(status=HTTPStatus.NO_CONTENT)
예제 #15
0
def _update_schedule_most_recent_task_status(task_id):
    """Update the `most_recent_task` value of the task's schedule.

    `most_recent_task` receives the task's `_id` along with the code and
    timestamp of the last entry of its events. Nothing is updated when the
    task is not found or when that last event's code contains "container".
    """
    # fetch schedule name and last event only
    pipeline = [
        {"$match": {"_id": task_id}},
        {
            "$project": {
                "schedule_name": 1,
                "last_event": {"$arrayElemAt": ["$events", -1]},
            }
        },
    ]
    matches = list(Tasks().aggregate(pipeline))
    if not matches or not matches[0]:
        return
    task = matches[0]

    last_event = task["last_event"]
    schedule_name = task["schedule_name"]
    status = last_event["code"]
    updated_at = last_event["timestamp"]

    # container events are not reflected on the schedule
    if "container" in status:
        return

    most_recent_task = {
        "_id": task_id,
        "status": status,
        "updated_at": updated_at,
    }
    Schedules().update_one(
        {"name": schedule_name},
        {"$set": {"most_recent_task": most_recent_task}},
    )
예제 #16
0
    def patch(self, schedule_name: str, token: AccessToken.Payload):
        """Update properties of the schedule `schedule_name`.

        The JSON body is validated by `UpdateSchema`; `flags`, when present,
        are validated against the offliner schema of the new `task_name` if
        one is given, or of the schedule's current one otherwise. Renaming
        the schedule also renames it on its tasks and requested tasks.

        Returns:
            An empty 204 (No Content) response.

        Raises:
            ScheduleNotFound: no schedule has this name.
            InvalidRequestJSON: the body is empty or fails validation.
            BadRequest: the new name is already used by a schedule.
        """

        query = {"name": schedule_name}
        schedule = Schedules().find_one(query, {"config.task_name": 1})
        if not schedule:
            raise ScheduleNotFound()

        try:
            update = UpdateSchema().load(request.get_json())  # , partial=True
            # empty dict passes the validator but troubles mongo
            if not request.get_json():
                raise ValidationError("Update can't be empty")

            # changing the offliner requires flags matching the new one
            if "task_name" in update and "flags" not in update:
                raise ValidationError(
                    "Can't update offliner without updating flags"
                )

            # ensure we test flags according to new task_name if present
            offliner = (
                update["task_name"]
                if "task_name" in update
                else schedule["config"]["task_name"]
            )
            flags_schema = ScheduleConfigSchema.get_offliner_schema(offliner)

            if "flags" in update:
                flags_schema().load(update["flags"])
        except ValidationError as e:
            raise InvalidRequestJSON(e.messages)

        renamed = "name" in update
        if renamed and Schedules().count_documents({"name": update["name"]}):
            raise BadRequest(
                "Schedule with name `{}` already exists".format(update["name"])
            )

        # these properties are stored under the schedule's `config`
        config_keys = (
            "task_name",
            "warehouse_path",
            "image",
            "resources",
            "platform",
            "flags",
        )
        mongo_update = {}
        for key, value in update.items():
            target = f"config.{key}" if key in config_keys else key
            mongo_update[target] = value

        outcome = Schedules().update_one(query, {"$set": mongo_update})
        if not outcome.matched_count:
            raise ScheduleNotFound()

        if renamed:
            # propagate the new name to documents referencing the schedule
            tasks_query = {"schedule_name": schedule_name}
            Tasks().update_many(
                tasks_query, {"$set": {"schedule_name": update["name"]}}
            )

            RequestedTasks().update_many(
                tasks_query, {"$set": {"schedule_name": update["name"]}}
            )

        return Response(status=HTTPStatus.NO_CONTENT)
예제 #17
0
def make_task(database, make_event):
    """Yield a factory inserting sample tasks; delete them once resumed.

    The yielded `_make_task` builds a task document whose events follow the
    path leading to `status`, inserts it in the tasks collection of
    `database` and returns it. Every inserted task is deleted when the
    generator is resumed after the yield.
    """
    created_ids = []
    tasks = Tasks(database=database)

    def _make_task(
        schedule_id=ObjectId(),
        schedule_name="",
        status=TaskStatus.succeeded,
        hostname="zimfarm_worker.com",
    ):
        # statuses a task goes through before ending
        lifecycle = [
            TaskStatus.requested,
            TaskStatus.reserved,
            TaskStatus.started,
        ]
        if status in lifecycle:
            # stop at the requested intermediate status
            codes = lifecycle[:lifecycle.index(status) + 1]
        elif status == TaskStatus.succeeded:
            codes = lifecycle + [TaskStatus.succeeded]
        else:
            # anything else is built as a failed task
            codes = lifecycle + [TaskStatus.failed]

        timestamp = {code: datetime.now() for code in codes}
        events = [make_event(code, timestamp[code]) for code in codes]

        container = {
            "command": "mwoffliner --mwUrl=https://example.com",
            "image": {"name": "mwoffliner", "tag": "1.8.0"},
            "exit_code": 0,
            "stderr": "example_stderr",
            "stdout": "example_stdout",
        }
        debug = {"args": [], "kwargs": {}}

        has_failed = status == TaskStatus.failed
        if has_failed:
            debug["exception"] = "example_exception"
            debug["traceback"] = "example_traceback"
        # failed tasks have no file
        files = ({} if has_failed else {
            "mwoffliner_1.zim": {"name": "mwoffliner_1.zim", "size": 1000}
        })

        task = {
            "_id": ObjectId(),
            "status": status,
            "worker": hostname,
            "schedule_name": schedule_name,
            "timestamp": timestamp,
            "events": events,
            "container": container,
            "debug": debug,
            "files": files,
        }

        tasks.insert_one(task)
        created_ids.append(task["_id"])
        return task

    yield _make_task

    # cleanup: remove everything the factory inserted
    tasks.delete_many({"_id": {"$in": created_ids}})
예제 #18
0
def save_event(task_id: ObjectId, code: str, timestamp: datetime.datetime,
               **kwargs):
    """Save an event and its accompanying data to the database.

    For non-file events (`"file"` not in `code`), the event is pushed to the
    task's `events` (kept sorted by timestamp) and the task's `status` and
    `timestamp.<code>` are updated. Recognized keyword arguments (`worker`,
    `canceled_by`, container and debug details, `file`) are stored on the
    task as well.

    Afterwards, the schedule's `most_recent_task` is refreshed and, on a
    `scraper_completed` event, the schedule's duration is recomputed.

    Args:
        task_id: `_id` of the task the event belongs to.
        code: event code.
        timestamp: when the event occurred.
        **kwargs: optional event data; unknown keys are ignored.
    """

    task_updates = {}
    if "file" not in code:  # file events are not unique: no timestamp/status
        task_updates[f"timestamp.{code}"] = timestamp
        task_updates["status"] = code
        # insert event and sort by timestamp
        Tasks().update_one(
            {"_id": task_id},
            {
                "$push": {
                    "events": {
                        "$each": [{"code": code, "timestamp": timestamp}],
                        "$sort": {"timestamp": 1},
                    }
                }
            },
        )

    # payload key -> task field it is stored in
    payload_fields = (
        ("worker", "worker"),
        ("canceled_by", "canceled_by"),
        ("command", "container.command"),
        ("image", "container.image"),
        ("exit_code", "container.exit_code"),
        ("stdout", "container.stdout"),
        ("stderr", "container.stderr"),
        ("timeout", "container.timeout"),
        ("log", "container.log"),
        ("task_log", "debug.log"),
        ("task_name", "debug.task_name"),
        ("task_args", "debug.task_args"),
        ("task_kwargs", "debug.task_kwargs"),
        ("traceback", "debug.traceback"),
        ("exception", "debug.exception"),
    )
    for payload_key, update_key in payload_fields:
        if payload_key in kwargs:
            task_updates[update_key] = kwargs[payload_key]

    # files are uploaded as they are created ; 2 events:
    # - one on file creation with name, size and status=created
    # - one on file upload complete with name and status=uploaded
    file_info = kwargs.get("file") or {}  # tolerate an explicit `file=None`
    if file_info.get("name"):
        # a `.` in the key would be read by mongo as a nested path, so it is
        # replaced with U+FF0E FULLWIDTH FULL STOP.
        # NOTE(review): confirm readers of `files` expect this same character
        fkey = file_info["name"].replace(".", "\uff0e")
        fstatus = file_info.get("status")
        if fstatus == "created":
            task_updates[f"files.{fkey}"] = {
                "name": file_info["name"],
                "size": file_info.get("size"),  # missing in uploaded,
                "status": fstatus,
                f"{fstatus}_timestamp": timestamp,
            }
        elif fstatus in ("uploaded", "failed"):
            task_updates[f"files.{fkey}.status"] = fstatus
            task_updates[f"files.{fkey}.{fstatus}_timestamp"] = timestamp

    # an empty `$set` is rejected by some mongo versions: skip it
    if task_updates:
        Tasks().update_one({"_id": task_id}, {"$set": task_updates})

    _update_schedule_most_recent_task_status(task_id)

    if code == TaskStatus.scraper_completed:
        schedule_name = Tasks().find_one({"_id": task_id},
                                         {"schedule_name": 1})["schedule_name"]
        update_schedule_duration(schedule_name)