Ejemplo n.º 1
0
    def post(self, schedule_name: str, token: AccessToken.Payload):
        """Clone the schedule named `schedule_name` under a new name.

        The new name is read from the `name` field of the JSON request body
        (validated by `CloneSchema`). The copy is inserted disabled, with a
        fresh default-only `duration`; `_id` and `most_recent_task` of the
        source schedule are not carried over.

        `token` is not used by this method body.

        Raises:
            ScheduleNotFound: no schedule is named `schedule_name`.
            BadRequest: a schedule already uses the requested new name.

        Returns:
            a response holding `{"_id": <id of the clone>}` with HTTP 201.
        """

        schedule = Schedules().find_one({"name": schedule_name})
        if not schedule:
            raise ScheduleNotFound()

        payload = CloneSchema().load(request.get_json())
        new_schedule_name = payload["name"]

        # refuse to create a second schedule with the same name
        name_taken = Schedules().find_one({"name": new_schedule_name},
                                          {"name": 1})
        if name_taken:
            raise BadRequest("schedule with name `{}` already exists".format(
                new_schedule_name))

        # drop what must not be inherited from the source schedule...
        for field in ("_id", "most_recent_task", "duration"):
            schedule.pop(field, None)
        # ...then set the clone-specific values (duration ends up last)
        schedule["name"] = new_schedule_name
        schedule["enabled"] = False
        schedule["duration"] = {"default": get_default_duration()}

        inserted = Schedules().insert_one(schedule)
        body = jsonify({"_id": str(inserted.inserted_id)})
        return make_response(body, HTTPStatus.CREATED)
Ejemplo n.º 2
0
    def get(self, *args, **kwargs):
        """Return one page of the distinct tags found on schedules.

        `skip` and `limit` come from the query string (validated by
        `SkipLimitSchema`). The response carries the page in `items` (sorted
        ascending) and, in `meta`, the `skip`/`limit` used plus the total
        number of distinct tags as `count`.
        """

        params = SkipLimitSchema().load(request.args.to_dict())
        skip = params["skip"]
        limit = params["limit"]

        # yields one document per distinct tag, the tag being its `_id`
        distinct_tags = [
            {"$project": {"_id": 0, "tags": 1}},
            {"$unwind": "$tags"},
            {"$group": {"_id": "$tags"}},
        ]

        try:
            counting = Schedules().aggregate(distinct_tags +
                                             [{"$count": "count"}])
            nb_tags = next(counting)["count"]
        except StopIteration:
            # the counting pipeline produced no document at all
            nb_tags = 0

        tags = []
        if nb_tags != 0:
            paging = [
                {"$sort": {"_id": 1}},
                {"$skip": skip},
                {"$limit": limit},
            ]
            for entry in Schedules().aggregate(distinct_tags + paging):
                tags.append(entry["_id"])

        meta = {"skip": skip, "limit": limit, "count": nb_tags}
        return jsonify({"meta": meta, "items": tags})
Ejemplo n.º 3
0
    def patch(self, schedule_name: str, token: AccessToken.Payload):
        """Apply a partial update to the schedule named `schedule_name`.

        The JSON body is validated by `UpdateSchema`; when `flags` are given
        they are additionally validated against the offliner schema of the
        new `task_name` (if supplied) or of the schedule's current one.
        Config-related keys are written under `config.<key>`, all others at
        the top level of the document.

        `token` is not used by this method body.

        Raises:
            ScheduleNotFound: no schedule matches `schedule_name` (at lookup
                or at update time).
            InvalidRequestJSON: the body is empty or fails validation, or
                `task_name` is changed without providing `flags`.
            BadRequest: the requested new `name` is already in use.

        Returns:
            an empty HTTP 204 response on success.
        """

        query = {"name": schedule_name}
        current = Schedules().find_one(query, {"config.task_name": 1})
        if not current:
            raise ScheduleNotFound()

        try:
            update = UpdateSchema().load(request.get_json())  # , partial=True
            # an empty dict passes the validator but troubles mongo
            if not request.get_json():
                raise ValidationError("Update can't be empty")

            # flags must be checked against the offliner that will be in
            # effect once the update is applied
            if "task_name" in update:
                if "flags" not in update:
                    raise ValidationError(
                        "Can't update offliner without updating flags")
                offliner = update["task_name"]
            else:
                offliner = current["config"]["task_name"]
            flags_schema = ScheduleConfigSchema.get_offliner_schema(offliner)

            if "flags" in update:
                flags_schema().load(update["flags"])
        except ValidationError as e:
            raise InvalidRequestJSON(e.messages)

        if "name" in update and Schedules().count_documents(
                {"name": update["name"]}):
            raise BadRequest(
                "Schedule with name `{}` already exists".format(
                    update["name"]))

        # these keys live inside the `config` sub-document
        config_keys = (
            "task_name",
            "warehouse_path",
            "image",
            "resources",
            "platform",
            "flags",
        )
        mongo_update = {}
        for key, value in update.items():
            target = f"config.{key}" if key in config_keys else key
            mongo_update[target] = value

        result = Schedules().update_one(query, {"$set": mongo_update})
        if not result.matched_count:
            raise ScheduleNotFound()

        return Response(status=HTTPStatus.NO_CONTENT)
Ejemplo n.º 4
0
    def get(self, *args, **kwargs):
        """Return one page of the distinct languages found on schedules.

        `skip` and `limit` come from the query string (validated by
        `SkipLimit500Schema`). Languages are grouped by `language.code`; the
        English and native names are taken from the first schedule of each
        group. The response carries the page in `items` (sorted by code) and,
        in `meta`, the `skip`/`limit` used plus the number of distinct
        languages as `count`.
        """

        params = SkipLimit500Schema().load(request.args.to_dict())
        skip = params["skip"]
        limit = params["limit"]

        # one output document per language code
        by_code = {
            "$group": {
                "_id": "$language.code",
                "name_en": {"$first": "$language.name_en"},
                "name_native": {"$first": "$language.name_native"},
            }
        }

        try:
            counting = Schedules().aggregate([by_code, {"$count": "count"}])
            nb_languages = next(counting)["count"]
        except StopIteration:
            # the counting pipeline produced no document at all
            nb_languages = 0

        languages = []
        if nb_languages != 0:
            page = Schedules().aggregate([
                by_code,
                {"$sort": {"_id": 1}},
                {"$skip": skip},
                {"$limit": limit},
            ])
            for entry in page:
                languages.append({
                    "code": entry["_id"],
                    "name_en": entry["name_en"],
                    "name_native": entry["name_native"],
                })

        meta = {"skip": skip, "limit": limit, "count": nb_languages}
        return jsonify({"meta": meta, "items": languages})
Ejemplo n.º 5
0
    def get(self):
        """Return a paginated, optionally filtered list of schedules.

        Query-string parameters (validated by `SchedulesSchema`):
            skip, limit: pagination window.
            category, tag, lang: repeatable; schedules must be in one of the
                categories, one of the languages, and carry all of the tags.
            name: case-insensitive substring the schedule name must contain.

        The response carries the matching schedules (restricted to a few
        summary fields) in `items` and, in `meta`, the `skip`/`limit` used
        plus the total number of matches as `count`.
        """
        import re  # local import: only needed to escape the `name` filter

        request_args = request.args.to_dict()
        # these parameters may be repeated: keep every value, not just one
        for key in ("category", "tag", "lang"):
            request_args[key] = request.args.getlist(key)
        request_args = SchedulesSchema().load(request_args)

        skip = request_args.get("skip")
        limit = request_args.get("limit")
        categories = request_args.get("category")
        tags = request_args.get("tag")
        lang = request_args.get("lang")
        name = request_args.get("name")

        # assemble filters
        query = {}
        if categories:
            query["category"] = {"$in": categories}
        if lang:
            query["language.code"] = {"$in": lang}
        if tags:
            query["tags"] = {"$all": tags}
        if name:
            # `name` is user input: escape it so it is matched as a literal
            # substring instead of being interpreted as a regular expression
            # (an unescaped `(` or `*` would otherwise make the query fail)
            query["name"] = {
                "$regex": r".*{}.*".format(re.escape(name)),
                "$options": "i",
            }

        # get schedules from database
        projection = {
            "_id": 0,
            "name": 1,
            "category": 1,
            "language": 1,
            "config.task_name": 1,
            "most_recent_task": 1,
        }
        cursor = Schedules().find(query, projection).skip(skip).limit(limit)
        # count ignores skip/limit: it is the total number of matches
        count = Schedules().count_documents(query)
        schedules = list(cursor)

        return jsonify({
            "meta": {
                "skip": skip,
                "limit": limit,
                "count": count
            },
            "items": schedules
        })
Ejemplo n.º 6
0
    def get(self):
        """Return every schedule, as a JSON list, for backup purposes.

        Only the `most_recent_task` field is left out of each document.
        """

        everything = Schedules().find({}, {"most_recent_task": 0})
        return jsonify(list(everything))
Ejemplo n.º 7
0
def get_duration_for(schedule_name, worker_name):
    """Return the duration doc for a schedule on a worker, or a default one.

    Lookup order:
      1. the entry for `worker_name` in the schedule's `duration.workers`;
      2. the schedule's own `duration.default`;
      3. `get_default_duration()` when the schedule does not exist or has no
         usable `duration` information.

    A `duration` holding only a `default` key (no `workers`) is handled:
    that is the shape given to newly created schedules.
    """
    schedule = Schedules().find_one({"name": schedule_name}, {"duration": 1})
    duration = schedule.get("duration") if schedule else None
    if not duration:
        return get_default_duration()

    # `workers` may be absent until a duration has been computed from tasks
    per_worker = duration.get("workers") or {}
    if worker_name in per_worker:
        return per_worker[worker_name]

    if "default" in duration:
        return duration["default"]
    return get_default_duration()
Ejemplo n.º 8
0
    def delete(self, schedule_name: str, token: AccessToken.Payload):
        """Delete the schedule named `schedule_name`.

        `token` is not used by this method body.

        Raises:
            ScheduleNotFound: nothing was deleted (no schedule has that name).

        Returns:
            an empty HTTP 204 response on success.
        """

        outcome = Schedules().delete_one({"name": schedule_name})
        nothing_deleted = outcome.deleted_count == 0
        if nothing_deleted:
            raise ScheduleNotFound()

        return Response(status=HTTPStatus.NO_CONTENT)
Ejemplo n.º 9
0
    def post(self, token: AccessToken.Payload):
        """Create a new schedule from the JSON request body.

        The body is validated by `ScheduleSchema`; the stored document gets a
        default-only `duration`. `token` is not used by this method body.

        Raises:
            InvalidRequestJSON: the body fails schema validation.
            BadRequest: a schedule with the same name already exists.

        Returns:
            a response holding `{"_id": <new id>}` with HTTP 201.
        """

        try:
            document = ScheduleSchema().load(request.get_json())
        except ValidationError as e:
            raise InvalidRequestJSON(e.messages)

        # names must stay unique across schedules
        name_taken = Schedules().find_one({"name": document["name"]},
                                          {"name": 1})
        if name_taken:
            raise BadRequest("schedule with name `{}` already exists".format(
                document["name"]))

        document["duration"] = {"default": get_default_duration()}
        inserted = Schedules().insert_one(document)

        body = jsonify({"_id": str(inserted.inserted_id)})
        return make_response(body, HTTPStatus.CREATED)
Ejemplo n.º 10
0
    def get(self, schedule_name: str):
        """Return the schedule named `schedule_name` as JSON (without `_id`).

        The schedule's `config` is augmented with whatever
        `command_information_for` returns for it.

        Raises:
            ScheduleNotFound: no schedule has that name.
        """

        schedule = Schedules().find_one({"name": schedule_name}, {"_id": 0})
        if schedule is None:
            raise ScheduleNotFound()

        config = schedule["config"]
        # merge the computed command details into the stored config
        config.update(command_information_for(config))
        return jsonify(schedule)
Ejemplo n.º 11
0
    def get(self, schedule_name: str):
        """Return the schedule named `schedule_name` as JSON (without `_id`).

        The schedule's `config` is replaced by the result of
        `expanded_config` applied to it.

        Raises:
            ScheduleNotFound: no schedule has that name.
        """

        schedule = Schedules().find_one({"name": schedule_name}, {"_id": 0})
        if schedule is None:
            raise ScheduleNotFound()

        stored_config = schedule["config"]
        schedule["config"] = expanded_config(stored_config)
        return jsonify(schedule)
Ejemplo n.º 12
0
def update_schedule_duration(schedule_name):
    """Set/update the `duration` object of a schedule from its recent tasks.

    Considers the schedule's tasks that have both a `started` and a
    `scraper_completed` timestamp and whose container exited with code 0.
    For each worker, the entry kept is the one of its latest such task
    (tasks are walked in ascending completion order, later ones overwriting
    earlier ones); `value` is `scraper_completed - started` in whole seconds.

    The stored document always holds `default`, `available` and `workers`;
    `available` is False and `workers` empty when no task qualifies.
    """

    completed_field = f"timestamp.{TaskStatus.scraper_completed}"
    started_field = f"timestamp.{TaskStatus.started}"

    # tasks that completed the resources intensive part successfully
    query = {
        "schedule_name": schedule_name,
        completed_field: {"$exists": True},
        started_field: {"$exists": True},
        "container.exit_code": 0,
    }

    # single query: an empty result simply leaves `workers` empty, so there
    # is no need for a separate (and racy) count beforehand
    tasks = (Tasks().find(query, {
        "timestamp": 1,
        "worker": 1
    }).sort(completed_field, pymongo.ASCENDING))

    workers = {}
    for task in tasks:
        timestamps = task["timestamp"]
        completed_on = timestamps[TaskStatus.scraper_completed]
        started_on = timestamps[TaskStatus.started]
        workers[task["worker"]] = {
            "worker": task["worker"],
            "task": task["_id"],
            "value": int((completed_on - started_on).total_seconds()),
            "on": completed_on,
        }

    document = {
        "default": get_default_duration(),
        "available": bool(workers),
        "workers": workers,
    }

    Schedules().update_one({"name": schedule_name},
                           {"$set": {"duration": document}})
0
    def get(self, schedule_name: str, token: AccessToken.Payload):
        """Return the schedule named `schedule_name` as JSON (without `_id`).

        The schedule's `config` is replaced by the result of
        `expanded_config`. Unless `token` is present and grants the
        `schedules`/`update` permission, the schedule is passed through
        `remove_secrets_from_response` before being returned.

        Raises:
            ScheduleNotFound: no schedule has that name.
        """

        schedule = Schedules().find_one({"name": schedule_name}, {"_id": 0})
        if schedule is None:
            raise ScheduleNotFound()

        schedule["config"] = expanded_config(schedule["config"])

        # only callers allowed to update schedules get the unfiltered document
        if not (token and token.get_permission("schedules", "update")):
            remove_secrets_from_response(schedule)

        return jsonify(schedule)
Ejemplo n.º 14
0
def request_tasks_using_schedule():
    """Create requested tasks for enabled schedules, based on periodicity.

    For every periodicity that has data in `PERIODICITIES`, each enabled
    schedule of that periodicity is requested unless it already has a
    requested task, or its most recent task started after the beginning of
    the current rolling period (now minus the period's `days`).

    Requests are made via `request_a_schedule` as `period-scheduler`, with
    priority 0 and no specific worker. Expected to be ran periodically.
    """

    requester = "period-scheduler"
    priority = 0
    worker = None

    projection = {"name": 1, "config": 1, "most_recent_task": 1}
    # start date assumed for a last run that has no `started` timestamp
    fallback_start = datetime.datetime(2019, 1, 1)

    periods = {p: PERIODICITIES.get(p) for p in SchedulePeriodicity.all()}
    for period, period_data in periods.items():
        if not period_data:
            continue  # manually has no data

        period_start = getnow() - datetime.timedelta(days=period_data["days"])
        logger.debug(f"requesting for `{period}` schedules (before {period_start})")

        # walk enabled schedules of this periodicity
        period_query = {"enabled": True, "periodicity": period}
        for schedule in Schedules().find(period_query, projection):
            # nothing to do when the schedule is already requested
            pending = RequestedTasks().count_documents(
                {"schedule_name": schedule["name"]}
            )
            if pending > 0:
                continue

            most_recent = schedule.get("most_recent_task")
            if most_recent:
                last_run = Tasks().find_one(
                    {"_id": most_recent["_id"]}, {"timestamp": 1}
                )
                # skip when it started within this rolling period
                if last_run:
                    started_on = last_run["timestamp"].get(
                        "started", fallback_start
                    )
                    if started_on > period_start:
                        continue

            created = request_a_schedule(
                schedule["name"], requester, worker, priority
            )
            if created:
                logger.debug(f"requested {schedule['name']}")
            else:
                logger.debug(f"could not request {schedule['name']}")
Ejemplo n.º 15
0
def request_a_schedule(
    schedule_name, requested_by: str, worker: str = None, priority: int = 0
):
    """Create a requested task for `schedule_name`, if possible.

    Returns the inserted document (with `_id` converted to a string), or
    None when nothing was created because:
      - a requested task already exists for this schedule and `worker`;
      - no enabled schedule is named `schedule_name`.

    The task's config is the schedule's config merged with the result of
    `command_information_for`.
    """

    # skip if already requested on the same worker
    already_requested = RequestedTasks().count_documents(
        {"schedule_name": schedule_name, "worker": worker}
    )
    if already_requested:
        return None

    # disabled (or unknown) schedules yield no document here
    schedule = Schedules().find_one(
        {"name": schedule_name, "enabled": True}, {"config": 1}
    )
    if not schedule:
        return None

    # build and save command-information to config
    config = schedule["config"]
    config.update(command_information_for(config))

    now = getnow()
    first_event = {"code": TaskStatus.requested, "timestamp": now}

    document = {
        "schedule_name": schedule_name,
        "status": TaskStatus.requested,
        "timestamp": {TaskStatus.requested: now},
        "events": [first_event],
        "requested_by": requested_by,
        "priority": priority,
        "worker": worker,
        "config": config,
    }

    inserted = RequestedTasks().insert_one(document)

    # hand the id back as a string rather than the raw inserted id
    document["_id"] = str(inserted.inserted_id)
    return document
Ejemplo n.º 16
0
    def post(self, token: AccessToken.Payload):
        """Create requested tasks from a list of schedule names.

        The JSON body (validated by `NewRequestedTaskSchema`) provides
        `schedule_names` and, optionally, `priority` (default 0) and
        `worker`. Each name is passed to `request_a_schedule` on behalf of
        `token.username`; names for which it returns None are skipped.
        Created tasks are broadcast and the `requested` event handler is
        triggered for each of them.

        Raises:
            InvalidRequestJSON: the body fails schema validation.
            NotFound: none of the names matches an enabled schedule.

        Returns:
            a response holding `{"requested": [<task ids>]}` with HTTP 201.
        """

        try:
            payload = NewRequestedTaskSchema().load(request.get_json())
        except ValidationError as e:
            raise InvalidRequestJSON(e.messages)

        schedule_names = payload["schedule_names"]
        priority = payload.get("priority", 0)
        worker = payload.get("worker")

        # raise 404 if nothing to schedule
        nb_enabled = Schedules().count_documents({
            "name": {"$in": schedule_names},
            "enabled": True,
        })
        if not nb_enabled:
            raise NotFound()

        attempts = (
            request_a_schedule(schedule_name, token.username, worker, priority)
            for schedule_name in schedule_names
        )
        requested_tasks = [task for task in attempts if task is not None]

        # single vs. bulk broadcast depending on how many were created
        if len(requested_tasks) == 1:
            BROADCASTER.broadcast_requested_task(requested_tasks[0])
        elif requested_tasks:
            BROADCASTER.broadcast_requested_tasks(requested_tasks)

        # trigger event handler
        for task in requested_tasks:
            task_event_handler(ObjectId(task["_id"]), "requested", {})

        requested_ids = [task["_id"] for task in requested_tasks]
        return make_response(
            jsonify({"requested": requested_ids}),
            HTTPStatus.CREATED,
        )
Ejemplo n.º 17
0
def _update_schedule_most_recent_task_status(task_id):
    """Update the `most_recent_task` value of the task's schedule.

    Reads the schedule name and the last entry of `events` for the task
    identified by `task_id`, then stores that event's code and timestamp
    (along with the task id) as the schedule's `most_recent_task`.

    Does nothing when the task is not found or when the last event code
    contains `container`.
    """
    # fetch only the schedule name and the last event of the task
    pipeline = [
        {"$match": {"_id": task_id}},
        {
            "$project": {
                "schedule_name": 1,
                "last_event": {"$arrayElemAt": ["$events", -1]},
            }
        },
    ]
    task = next(iter(Tasks().aggregate(pipeline)), None)
    if not task:
        return

    schedule_name = task["schedule_name"]
    last_event = task["last_event"]
    last_event_code = last_event["code"]
    last_event_timestamp = last_event["timestamp"]

    # container-related events are not reflected on the schedule
    if "container" in last_event_code:
        return

    most_recent_task = {
        "_id": task_id,
        "status": last_event_code,
        "updated_at": last_event_timestamp,
    }
    Schedules().update_one(
        {"name": schedule_name},
        {"$set": {"most_recent_task": most_recent_task}},
    )
Ejemplo n.º 18
0
def request_a_schedule(schedule_name,
                       requested_by: str,
                       worker: str = None,
                       priority: int = 0):
    """Create a requested task for `schedule_name`, if possible.

    Returns the inserted document (with `_id` converted to a string), or
    None when nothing was created because:
      - a requested task already exists for this schedule and `worker`;
      - no enabled schedule is named `schedule_name`.

    The task carries the schedule's config run through `expanded_config`,
    the upload settings taken from module-level constants and the
    schedule's `notification` settings (empty dict when it has none).
    """

    # skip if already requested on the same worker
    already_requested = RequestedTasks().count_documents({
        "schedule_name": schedule_name,
        "worker": worker,
    })
    if already_requested:
        return None

    # disabled (or unknown) schedules yield no document here
    schedule = Schedules().find_one(
        {"name": schedule_name, "enabled": True},
        {"config": 1, "notification": 1},
    )
    if not schedule:
        return None

    # build and save command-information to config
    config = expanded_config(schedule["config"])

    now = getnow()
    first_event = {"code": TaskStatus.requested, "timestamp": now}

    document = {
        "schedule_name": schedule_name,
        "status": TaskStatus.requested,
        "timestamp": {TaskStatus.requested: now},
        "events": [first_event],
        "requested_by": requested_by,
        "priority": priority,
        "worker": worker,
        "config": config,
        # reverse ObjectId to randomize task ids
        "_id": ObjectId(str(ObjectId())[::-1]),
        "upload": {
            "zim": {
                "upload_uri": ZIM_UPLOAD_URI,
                "expiration": ZIM_EXPIRATION,
                "zimcheck": ZIMCHECK_OPTION,
            },
            "logs": {
                "upload_uri": LOGS_UPLOAD_URI,
                "expiration": LOGS_EXPIRATION,
            },
        },
        "notification": schedule.get("notification", {}),
    }

    inserted = RequestedTasks().insert_one(document)

    # hand the id back as a string rather than an ObjectId
    document["_id"] = str(inserted.inserted_id)
    return document