Ejemplo n.º 1
0
    def patch(self, task_id: str, token: AccessToken.Payload):
        """Apply an event update to an existing task.

        Looks the task up by `_id`, validates the request JSON with
        `TasKUpdateSchema`, passes the event and its payload to
        `task_event_handler` and then to `BROADCASTER.broadcast_updated_task`.

        Raises:
            TaskNotFound: no task matches `task_id`.
            InvalidRequestJSON: the body fails schema validation or is empty.

        Returns:
            An empty response with status 204 (No Content).
        """
        found = Tasks().find_one({"_id": task_id}, {"_id": 1})
        if found is None:
            raise TaskNotFound()

        body = request.get_json()
        try:
            update = TasKUpdateSchema().load(body)
            # empty dict passes the validator but troubles mongo
            if not body:
                raise ValidationError("Update can't be empty")
        except ValidationError as exc:
            raise InvalidRequestJSON(exc.messages)

        event, payload = update["event"], update["payload"]

        task_event_handler(found["_id"], event, payload)
        BROADCASTER.broadcast_updated_task(task_id, event, payload)

        return Response(status=HTTPStatus.NO_CONTENT)
Ejemplo n.º 2
0
def status_to_cancel(now, status, timeout):
    """Cancel tasks that have been in `status` for more than `timeout` seconds.

    A task is selected when its current status is `status` and its
    `timestamp.<status>` value is at or before `now - timeout`. Selected
    tasks get the canceled status, `canceled_by` set to `NAME`, a
    `timestamp.<canceled>` entry and a new canceled event, all stamped `now`.
    The matched/modified counts are logged.
    """
    logger.info(f":: canceling tasks `{status}` for more than {timeout}s")

    cutoff = now - datetime.timedelta(seconds=timeout)
    stale_filter = {"status": status, f"timestamp.{status}": {"$lte": cutoff}}

    cancellation = {
        "$set": {
            "status": TaskStatus.canceled,
            "canceled_by": NAME,
            f"timestamp.{TaskStatus.canceled}": now,
        },
        "$push": {"events": {"code": TaskStatus.canceled, "timestamp": now}},
    }

    outcome = Tasks().update_many(stale_filter, cancellation)
    logger.info(
        f"::: canceled {outcome.modified_count}/{outcome.matched_count} tasks")
Ejemplo n.º 3
0
def get_currently_running_tasks(worker_name):
    """Return the tasks `worker_name` is running right now, with their ETA.

    Selects tasks assigned to the worker whose status is not in
    `TaskStatus.complete()`, fetching only resources, platform, schedule name
    and timestamps. Each returned dict is then extended in place with whatever
    `get_task_eta` returns for it.
    """
    collection = Tasks()
    unfinished_for_worker = {
        "status": {"$nin": TaskStatus.complete()},
        "worker": worker_name,
    }
    wanted_fields = {
        "config.resources": 1,
        "config.platform": 1,
        "schedule_name": 1,
        "timestamp": 1,
    }

    # materialize the cursor first, ETAs are computed afterwards
    running = list(collection.find(unfinished_for_worker, wanted_fields))

    for entry in running:
        eta_fields = get_task_eta(entry, worker_name)
        entry.update(eta_fields)

    return running
Ejemplo n.º 4
0
def _update_schedule_most_recent_task_status(task_id):
    """Update the `most_recent_task` value of the task's schedule.

    Reads the task's `schedule_name` and the last entry of its `events`
    array, then stores that event's code and timestamp (with the task id) as
    `most_recent_task` on the schedule of that name.

    Nothing is written when the task does not exist, when it has no event
    yet, or when the last event's code contains "container".
    """
    # fetch only the schedule name and the last recorded event
    cursor = Tasks().aggregate([
        {"$match": {"_id": task_id}},
        {
            "$project": {
                "schedule_name": 1,
                "last_event": {"$arrayElemAt": ["$events", -1]},
            }
        },
    ])
    task = next(iter(cursor), None)
    if not task:
        return

    # `$arrayElemAt` produces no field (or null) when `events` is empty or
    # missing: there is no status to report in that case
    last_event = task.get("last_event")
    if not last_event:
        return

    schedule_name = task["schedule_name"]
    last_event_code = last_event["code"]
    last_event_timestamp = last_event["timestamp"]

    # events whose code contains "container" do not change the reported status
    if "container" in last_event_code:
        return

    schedule_updates = {
        "most_recent_task": {
            "_id": task_id,
            "status": last_event_code,
            "updated_at": last_event_timestamp,
        }
    }
    Schedules().update_one({"name": schedule_name}, {"$set": schedule_updates})
Ejemplo n.º 5
0
def save_event(task_id: ObjectId, code: str, timestamp: datetime.datetime,
               **kwargs):
    """Save an event and its accompanying data to the database.

    For events whose code does not contain "file", the event is appended to
    the task's `events` array (kept sorted by timestamp) and the task's
    `status` and `timestamp.<code>` are set. Known payload keys found in
    `kwargs` are copied to their task fields, and a `file` payload creates or
    updates the matching entry under `files`. Afterwards the schedule's
    `most_recent_task` is refreshed and, on `TaskStatus.scraper_completed`,
    the schedule duration is updated.

    Args:
        task_id: `_id` of the task the event belongs to.
        code: event code.
        timestamp: when the event happened.
        **kwargs: event payload; unknown keys are ignored.
    """
    # payload key -> task field it is stored under
    payload_fields = (
        ("worker", "worker"),
        ("canceled_by", "canceled_by"),
        ("command", "container.command"),
        ("image", "container.image"),
        ("exit_code", "container.exit_code"),
        ("stdout", "container.stdout"),
        ("stderr", "container.stderr"),
        ("timeout", "container.timeout"),
        ("log", "container.log"),
        ("task_log", "debug.log"),
        ("task_name", "debug.task_name"),
        ("task_args", "debug.task_args"),
        ("task_kwargs", "debug.task_kwargs"),
        ("traceback", "debug.traceback"),
        ("exception", "debug.exception"),
    )

    task_updates = {}
    if "file" not in code:  # file events are not unique: no timestamp/status
        task_updates[f"timestamp.{code}"] = timestamp
        # insert event and keep the array sorted by timestamp
        Tasks().update_one(
            {"_id": task_id},
            {
                "$push": {
                    "events": {
                        "$each": [{"code": code, "timestamp": timestamp}],
                        "$sort": {"timestamp": 1},
                    }
                }
            },
        )
        task_updates["status"] = code

    for payload_key, update_key in payload_fields:
        if payload_key in kwargs:
            task_updates[update_key] = kwargs[payload_key]

    # files are uploaded as they are created; 2 events:
    # - one on file creation with name, size and status=created
    # - one on file upload complete with name and status=uploaded
    file_info = kwargs.get("file") or {}  # tolerate an explicit `file=None`
    file_name = file_info.get("name")
    if file_name:
        # a `.` in an update key is read by mongo as a path separator, so the
        # dots of the file name are swapped for U+FF0E (fullwidth full stop)
        fkey = file_name.replace(".", "\uff0e")
        fstatus = file_info.get("status")
        if fstatus == "created":
            task_updates[f"files.{fkey}"] = {
                "name": file_name,
                "size": file_info.get("size"),  # missing in uploaded
                "status": fstatus,
                f"{fstatus}_timestamp": timestamp,
            }
        elif fstatus in ("uploaded", "failed"):
            task_updates[f"files.{fkey}.status"] = fstatus
            task_updates[f"files.{fkey}.{fstatus}_timestamp"] = timestamp

    # an empty `$set` is rejected by MongoDB servers older than 5.0
    if task_updates:
        Tasks().update_one({"_id": task_id}, {"$set": task_updates})

    _update_schedule_most_recent_task_status(task_id)

    if code == TaskStatus.scraper_completed:
        schedule_name = Tasks().find_one({"_id": task_id},
                                         {"schedule_name": 1})["schedule_name"]
        update_schedule_duration(schedule_name)
Ejemplo n.º 6
0
    def patch(self, schedule_name: str, token: AccessToken.Payload):
        """Update all properties of a schedule but _id and most_recent_task

        The request JSON is validated with `UpdateSchema`; when `flags` are
        supplied they are validated against the offliner schema of the new
        `task_name` if one is given, otherwise of the schedule's current one.
        Config properties are written under `config.<key>`. When the schedule
        is renamed, the `schedule_name` of its tasks and requested tasks is
        updated as well.

        Raises:
            ScheduleNotFound: no schedule is named `schedule_name`.
            InvalidRequestJSON: the body is empty or fails validation.
            BadRequest: the requested new name is already taken.

        Returns:
            An empty response with status 204 (No Content).
        """

        query = {"name": schedule_name}
        schedule = Schedules().find_one(query, {"config.task_name": 1})
        if not schedule:
            raise ScheduleNotFound()

        body = request.get_json()
        try:
            update = UpdateSchema().load(body)  # , partial=True
            # empty dict passes the validator but troubles mongo
            if not body:
                raise ValidationError("Update can't be empty")

            # flags are always checked against the task_name that will be in
            # effect once the update is applied
            if "task_name" in update:
                if "flags" not in update:
                    raise ValidationError(
                        "Can't update offliner without updating flags"
                    )
                offliner = update["task_name"]
            else:
                offliner = schedule["config"]["task_name"]
            flags_schema = ScheduleConfigSchema.get_offliner_schema(offliner)

            if "flags" in update:
                flags_schema().load(update["flags"])
        except ValidationError as exc:
            raise InvalidRequestJSON(exc.messages)

        renaming = "name" in update
        if renaming and Schedules().count_documents({"name": update["name"]}):
            raise BadRequest(
                f"Schedule with name `{update['name']}` already exists"
            )

        # these properties live inside the `config` sub-document
        config_keys = (
            "task_name",
            "warehouse_path",
            "image",
            "resources",
            "platform",
            "flags",
        )
        mongo_update = {}
        for key, value in update.items():
            target = f"config.{key}" if key in config_keys else key
            mongo_update[target] = value

        outcome = Schedules().update_one(query, {"$set": mongo_update})
        if not outcome.matched_count:
            raise ScheduleNotFound()

        if renaming:
            # keep tasks and requested tasks pointing at the renamed schedule
            tasks_query = {"schedule_name": schedule_name}
            for collection_cls in (Tasks, RequestedTasks):
                collection_cls().update_many(
                    tasks_query, {"$set": {"schedule_name": update["name"]}}
                )

        return Response(status=HTTPStatus.NO_CONTENT)
Ejemplo n.º 7
0
def make_task(database, make_event):
    """Yield a factory that inserts sample task documents, then remove them.

    The yielded `_make_task` builds a task whose event history matches the
    requested status, inserts it into the tasks collection of `database` and
    returns the inserted dict. Once the generator is resumed, every task
    created through the factory is deleted.
    NOTE(review): reads like a pytest fixture; the decorator is not visible
    here - confirm.
    """
    created_ids = []
    collection = Tasks(database=database)

    def _make_task(
        schedule_id=ObjectId(),
        schedule_name="",
        status=TaskStatus.succeeded,
        hostname="zimfarm_worker.com",
    ):
        # a task walks through these statuses before reaching a final one
        lifecycle = [
            TaskStatus.requested, TaskStatus.reserved, TaskStatus.started
        ]
        if status in lifecycle:
            codes = lifecycle[:lifecycle.index(status) + 1]
        elif status == TaskStatus.succeeded:
            codes = lifecycle + [TaskStatus.succeeded]
        else:
            # any other status gets the history of a failed task
            codes = lifecycle + [TaskStatus.failed]

        timestamp = {}
        for event_code in codes:
            timestamp[event_code] = datetime.now()
        events = [
            make_event(event_code, timestamp[event_code])
            for event_code in codes
        ]

        debug = {"args": [], "kwargs": {}}
        files = {}
        if status == TaskStatus.failed:
            debug.update(
                exception="example_exception",
                traceback="example_traceback",
            )
        else:
            files["mwoffliner_1.zim"] = {
                "name": "mwoffliner_1.zim",
                "size": 1000,
            }

        task = {
            "_id": ObjectId(),
            "status": status,
            "worker": hostname,
            "schedule_name": schedule_name,
            "timestamp": timestamp,
            "events": events,
            "container": {
                "command": "mwoffliner --mwUrl=https://example.com",
                "image": {"name": "mwoffliner", "tag": "1.8.0"},
                "exit_code": 0,
                "stderr": "example_stderr",
                "stdout": "example_stdout",
            },
            "debug": debug,
            "files": files,
        }

        collection.insert_one(task)
        created_ids.append(task["_id"])
        return task

    yield _make_task

    # teardown: drop everything the factory inserted
    collection.delete_many({"_id": {"$in": created_ids}})