def test_get_available_slots_issue_when_reading_queue(mocker):
    mock = mocker.patch("swh.scheduler.celery_backend.config.get_queue_length")
    mock.side_effect = ValueError

    actual_num = get_available_slots(app, "anything", max_length=10)
    assert actual_num == MAX_NUM_TASKS
    assert mock.called
def test_get_available_slots(mocker):
    mock = mocker.patch("swh.scheduler.celery_backend.config.get_queue_length")
    max_length = 100
    queue_length = 90
    mock.return_value = queue_length
    actual_num = get_available_slots(app, "anything", max_length)
    assert actual_num == max_length - queue_length
    assert mock.called
def send_to_celery(
    ctx,
    policy: str,
    queue: Optional[str],
    tablesample: Optional[float],
    type: str,
    enabled: bool,
    lister_uuid: Optional[str] = None,
):
    """Send the next origin visits of the TYPE loader to celery, filling the queue."""
    from kombu.utils.uuid import uuid

    from swh.scheduler.celery_backend.config import app, get_available_slots

    scheduler = ctx.obj["scheduler"]

    task_type = scheduler.get_task_type(f"load-{type}")

    task_name = task_type["backend_name"]
    queue_name = queue or task_name

    num_tasks = get_available_slots(app, queue_name,
                                    task_type["max_queue_length"])

    click.echo(f"{num_tasks} slots available in celery queue")
    origins = scheduler.grab_next_visits(
        type,
        num_tasks,
        policy=policy,
        tablesample=tablesample,
        enabled=enabled,
        lister_uuid=lister_uuid,
    )

    click.echo(f"{len(origins)} visits to send to celery")
    for task_dict in create_origin_task_dicts(origins, scheduler):
        app.send_task(
            task_name,
            task_id=uuid(),
            args=task_dict["arguments"]["args"],
            kwargs=task_dict["arguments"]["kwargs"],
            queue=queue_name,
        )
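# A minimal, hedged illustration of the Celery call pattern that send_to_celery
# relies on: app.send_task dispatches a task by name without importing the task
# function, and lets the caller pick the task id and the routing queue explicitly.
# The broker URL, task name, and arguments below are placeholders, not real
# swh.scheduler values.
from celery import Celery
from kombu.utils.uuid import uuid

demo_app = Celery(broker="memory://")
demo_app.send_task(
    "example.loader.task",  # hypothetical backend_name
    task_id=uuid(),
    args=["https://example.org/repo.git"],
    kwargs={"visit_type": "git"},
    queue="example.loader.task",  # route to the queue named after the task
)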
def send_visits_for_visit_type(
    scheduler: SchedulerInterface,
    app,
    visit_type: str,
    task_type: Dict,
    policy_cfg: List[Dict[str, Any]],
) -> float:
    """Schedule the next batch of visits for the given ``visit_type``.

    First, we determine the number of available slots by introspecting the RabbitMQ
    queue.

    If fewer than a :py:data:`MIN_SLOTS_RATIO` fraction of ``max_queue_length`` slots
    are available in the queue, we wait for :py:data:`QUEUE_FULL_BACKOFF` seconds. This
    avoids running the expensive
    :py:func:`~swh.scheduler.interface.SchedulerInterface.grab_next_visits` queries when
    there aren't many jobs to queue.

    Once more than that fraction of slots is available, we run
    :py:func:`grab_next_visits_policy_weights` to retrieve the next set of origin visits
    to schedule, and we send them to celery.

    If the last scheduling attempt didn't return any origins, we sleep for
    :py:data:`NO_ORIGINS_SCHEDULED_BACKOFF` seconds. This avoids running the expensive
    :py:func:`~swh.scheduler.interface.SchedulerInterface.grab_next_visits` queries too
    often if there's nothing left to schedule.

    The ``policy_cfg`` argument is the policy configuration used to choose the
    next origins to visit. It is passed directly to the
    :py:func:`grab_next_visits_policy_weights` function.

    Returns:
       the earliest :py:func:`time.monotonic` value at which to run the next iteration
       of the loop.

    """
    queue_name = task_type["backend_name"]
    max_queue_length = task_type.get("max_queue_length") or 0
    min_available_slots = max_queue_length * MIN_SLOTS_RATIO

    current_iteration_start = time.monotonic()

    # Check queue level
    available_slots = get_available_slots(app, queue_name, max_queue_length)
    logger.debug(
        "%s available slots for visit type %s in queue %s",
        available_slots,
        visit_type,
        queue_name,
    )
    if available_slots < min_available_slots:
        return current_iteration_start + QUEUE_FULL_BACKOFF

    origins = grab_next_visits_policy_weights(scheduler, visit_type,
                                              available_slots, policy_cfg)

    if not origins:
        logger.debug("No origins to visit for type %s", visit_type)
        return current_iteration_start + NO_ORIGINS_SCHEDULED_BACKOFF

    # Try to smooth the ingestion load: origins pulled by different
    # scheduling policies have different resource usage patterns.
    random.shuffle(origins)

    for task_dict in create_origin_task_dicts(origins, scheduler):
        app.send_task(
            queue_name,
            task_id=uuid(),
            args=task_dict["arguments"]["args"],
            kwargs=task_dict["arguments"]["kwargs"],
            queue=queue_name,
        )

    logger.info(
        "%s: %s visits scheduled in queue %s",
        visit_type,
        len(origins),
        queue_name,
    )

    # When everything worked, we can try to schedule origins again ASAP.
    return time.monotonic()
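# Hedged sketch of a caller consuming the monotonic deadline returned by
# send_visits_for_visit_type above; the actual scheduler loop in swh.scheduler may
# be structured differently. The stop_event and the argument values are placeholders.
import threading
import time

def scheduling_loop_sketch(scheduler, app, visit_type, task_type, policy_cfg,
                           stop_event: threading.Event):
    next_run = time.monotonic()
    while not stop_event.is_set():
        wait = next_run - time.monotonic()
        if wait > 0:
            # Sleep out the backoff (queue full or nothing left to schedule),
            # waking up early if asked to stop.
            stop_event.wait(wait)
            continue
        next_run = send_visits_for_visit_type(
            scheduler, app, visit_type, task_type, policy_cfg
        )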
def test_get_available_slots_no_queue_length(mocker):
    mock = mocker.patch("swh.scheduler.celery_backend.config.get_queue_length")
    mock.return_value = None
    actual_num = get_available_slots(app, "anything", max_length=100)
    assert actual_num == MAX_NUM_TASKS
    assert mock.called
def test_get_available_slots_no_max_length():
    actual_num = get_available_slots(app, "anything", None)
    assert actual_num == MAX_NUM_TASKS
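# Hedged reading of the behaviour the tests above pin down for get_available_slots
# (an inferred summary, not the project's actual implementation): fall back to the
# MAX_NUM_TASKS ceiling when no max_length is configured or the queue length cannot
# be read, otherwise return the remaining capacity.
MAX_NUM_TASKS_SKETCH = 10_000  # placeholder ceiling; the real constant's value may differ

def available_slots_sketch(max_length, queue_length):
    if not max_length or queue_length is None:
        return MAX_NUM_TASKS_SKETCH
    return max(max_length - queue_length, 0)

assert available_slots_sketch(100, 90) == 10  # mirrors test_get_available_slots
assert available_slots_sketch(100, None) == MAX_NUM_TASKS_SKETCH
assert available_slots_sketch(None, 5) == MAX_NUM_TASKS_SKETCH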
def run_ready_tasks(
    backend: SchedulerInterface,
    app,
    task_types: List[Dict] = [],
    with_priority: bool = False,
) -> List[Dict]:
    """Schedule tasks ready to be scheduled.

    This looks up the ready tasks for each task type and mass-schedules them
    accordingly (sends messages to RabbitMQ and marks the corresponding tasks as
    scheduled in the scheduler backend).

    If tasks with priority exist for a task type, they are redirected to a dedicated
    high-priority queue (the standard queue name prefixed with `save_code_now:`).

    Args:
        backend: scheduler backend to interact with (read/update tasks)
        app (App): Celery application to send tasks to
        task_types: The list of task type dicts to iterate over. Empty by default;
          when empty, the full list of task types referenced in the scheduler is
          used.
        with_priority: If True, only tasks with a priority set are fetched and
          scheduled. False by default.

    Returns:
        A list of dictionaries::

          {
            'task': the scheduler's task id,
            'backend_id': Celery's task id,
            'scheduled': utcnow()
          }

        The result can be used to block-wait for the tasks' results::

          backend_tasks = run_ready_tasks(self.scheduler, app)
          for task in backend_tasks:
              AsyncResult(id=task['backend_id']).get()

    """
    all_backend_tasks: List[Dict] = []
    while True:
        if not task_types:
            task_types = backend.get_task_types()
        task_types_d = {}
        pending_tasks = []
        for task_type in task_types:
            task_type_name = task_type["type"]
            task_types_d[task_type_name] = task_type
            max_queue_length = task_type["max_queue_length"]
            if max_queue_length is None:
                max_queue_length = 0
            backend_name = task_type["backend_name"]

            if with_priority:
                # grab up to max_queue_length (or 10) tasks with any priority for
                # this type (limit the result to avoid overly long-running queries)
                grabbed_priority_tasks = backend.grab_ready_priority_tasks(
                    task_type_name, num_tasks=max_queue_length or 10
                )
                if grabbed_priority_tasks:
                    pending_tasks.extend(grabbed_priority_tasks)
                    logger.info(
                        "Grabbed %s tasks %s (priority)",
                        len(grabbed_priority_tasks),
                        task_type_name,
                    )
                    statsd.increment(
                        "swh_scheduler_runner_scheduled_task_total",
                        len(grabbed_priority_tasks),
                        tags={"task_type": task_type_name},
                    )
            else:
                num_tasks = get_available_slots(app, backend_name, max_queue_length)
                # only pull tasks if the queue is at least 1/5th empty (i.e. at most
                # 80% full), to help PostgreSQL use properly indexed queries.
                if num_tasks > min(MAX_NUM_TASKS, max_queue_length) // 5:
                    # Only grab num_tasks tasks with no priority
                    grabbed_tasks = backend.grab_ready_tasks(
                        task_type_name, num_tasks=num_tasks
                    )
                    if grabbed_tasks:
                        pending_tasks.extend(grabbed_tasks)
                        logger.info(
                            "Grabbed %s tasks %s", len(grabbed_tasks), task_type_name
                        )
                        statsd.increment(
                            "swh_scheduler_runner_scheduled_task_total",
                            len(grabbed_tasks),
                            tags={"task_type": task_type_name},
                        )

        if not pending_tasks:
            return all_backend_tasks

        backend_tasks = []
        celery_tasks: List[Tuple[bool, str, str, List, Dict]] = []
        for task in pending_tasks:
            args = task["arguments"]["args"]
            kwargs = task["arguments"]["kwargs"]

            backend_name = task_types_d[task["type"]]["backend_name"]
            backend_id = uuid()
            celery_tasks.append(
                (
                    task.get("priority") is not None,
                    backend_name,
                    backend_id,
                    args,
                    kwargs,
                )
            )
            data = {
                "task": task["id"],
                "backend_id": backend_id,
                "scheduled": utcnow(),
            }

            backend_tasks.append(data)
        logger.debug("Sent %s celery tasks", len(backend_tasks))

        backend.mass_schedule_task_runs(backend_tasks)
        for with_priority, backend_name, backend_id, args, kwargs in celery_tasks:
            kw = dict(
                task_id=backend_id,
                args=args,
                kwargs=kwargs,
            )
            if with_priority:
                kw["queue"] = f"save_code_now:{backend_name}"
            app.send_task(backend_name, **kw)

        all_backend_tasks.extend(backend_tasks)
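# Usage sketch taken from the run_ready_tasks docstring above: schedule whatever is
# ready, then block-wait on the Celery results. `scheduler_backend` and `app` stand
# in for a configured SchedulerInterface instance and Celery application.
from celery.result import AsyncResult

def wait_for_ready_tasks(scheduler_backend, app):
    backend_tasks = run_ready_tasks(scheduler_backend, app)
    for task in backend_tasks:
        AsyncResult(id=task["backend_id"], app=app).get()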