Example 1
async def daemon_killer(
    *,
    settings: configuration.OperatorSettings,
    memories: containers.ResourceMemories,
) -> None:
    """
    An operator's root task to kill the daemons on the operator's shutdown.
    """

    # Sleep forever, or until cancelled, which happens when the operator begins its shutdown.
    try:
        await asyncio.Event().wait()

    # Terminate all running daemons when the operator exits (and this task is cancelled).
    finally:
        coros = [
            stop_daemon(daemon=daemon, settings=settings)
            for memory in memories.iter_all_memories()
            for daemon in memory.running_daemons.values()
        ]
        if coros:
            # Wrap in tasks explicitly: asyncio.wait() rejects bare coroutines on Python 3.11+.
            await asyncio.wait([asyncio.create_task(coro) for coro in coros])
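
Example 1 relies on a common asyncio pattern: the root task sleeps on a never-set asyncio.Event and performs its teardown in a finally block once the task is cancelled at shutdown. Below is a minimal standalone sketch of that pattern; the names root_task and cleanup are illustrative and not part of Kopf.

import asyncio

async def cleanup() -> None:
    # Stand-in for the real teardown work (e.g. stopping the daemons).
    print("cleaning up")

async def root_task() -> None:
    try:
        await asyncio.Event().wait()  # Sleep forever -- nothing ever sets this event.
    finally:
        await cleanup()               # Runs when the task is cancelled at shutdown.

async def main() -> None:
    task = asyncio.create_task(root_task())
    await asyncio.sleep(0.1)          # The "operator" runs here.
    task.cancel()                     # Shutdown begins: cancel the root task.
    try:
        await task
    except asyncio.CancelledError:
        pass

asyncio.run(main())

The point of the pattern is that the root task itself does nothing while the operator runs; all of its work is concentrated in the finally clause, which is guaranteed to run on cancellation.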
Example 2
async def daemon_killer(
    *,
    settings: configuration.OperatorSettings,
    memories: containers.ResourceMemories,
    operator_paused: primitives.ToggleSet,
) -> None:
    """
    An operator's root task to kill the daemons on the operator's demand.

    The "demand" comes in two cases: when the operator is exiting (gracefully
    or not), and when the operator is pausing because of peering. In that case,
    all watch-streams are disconnected, and all daemons/timers should stop.

    When pausing, the daemons/timers are stopped via their regular stopping
    procedure: with graceful or forced termination, backoffs, timeouts.

    .. warning::

        Each daemon will be respawned on the next K8s watch-event strictly
        after the previous daemon is fully stopped.
        There are never 2 instances of the same daemon running in parallel.

        In normal cases (enough time is given to stop), this is usually done
        by the post-pause re-listing event. In rare cases, when the re-pausing
        happens faster than the daemon is stopped (highly unlikely), that event
        can be missed because the daemon is still being stopped, so the respawn
        can happen with a significant delay.

        This issue is considered low-priority & auxiliary, as is peering
        itself. It can be fixed later. Workaround: make the daemons exit fast.
    """
    # Unlimited job pool size -- the same as if we were managing the tasks directly.
    # Unlimited timeout in `close()` -- since we have our own per-daemon timeout management.
    scheduler: aiojobs.Scheduler = await aiojobs.create_scheduler(
        limit=None, close_timeout=99999)
    try:
        while True:

            # Stay here while the operator is running normally, until it is paused.
            await operator_paused.wait_for(True)

            # The stopping tasks are "fire-and-forget" -- we do not get (or care about) the result.
            # The daemons remain resumable, since they do not exit of their own accord.
            for memory in memories.iter_all_memories():
                for daemon in memory.running_daemons.values():
                    await scheduler.spawn(
                        stop_daemon(settings=settings,
                                    daemon=daemon,
                                    reason=primitives.DaemonStoppingReason.OPERATOR_PAUSING))

            # Stay here while the operator is paused, until it is resumed.
            # The fresh stream of watch-events will spawn new daemons naturally.
            await operator_paused.wait_for(False)

    # Terminate all running daemons when the operator exits (and this task is cancelled).
    finally:
        for memory in memories.iter_all_memories():
            for daemon in memory.running_daemons.values():
                await scheduler.spawn(
                    stop_daemon(settings=settings,
                                daemon=daemon,
                                reason=primitives.DaemonStoppingReason.OPERATOR_EXITING))
        await scheduler.close()
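
Example 2 adds a pause/resume loop around the same shutdown logic: the task blocks on operator_paused.wait_for(True), stops the daemons in a fire-and-forget manner, then blocks on operator_paused.wait_for(False) until the operator resumes. The sketch below reproduces only that control flow, with a simplified Toggle class standing in for Kopf's primitives.ToggleSet (whose real API is richer); the names killer and main are illustrative.

import asyncio

class Toggle:
    # Simplified stand-in for Kopf's primitives.ToggleSet:
    # lets a waiter block until the boolean state becomes the desired value.
    def __init__(self) -> None:
        self._state = False
        self._condition = asyncio.Condition()

    async def set(self, state: bool) -> None:
        async with self._condition:
            self._state = state
            self._condition.notify_all()

    async def wait_for(self, state: bool) -> None:
        async with self._condition:
            await self._condition.wait_for(lambda: self._state == state)

async def killer(paused: Toggle) -> None:
    try:
        while True:
            await paused.wait_for(True)   # The operator got paused (e.g. by peering).
            print("paused: stopping daemons (fire-and-forget)")
            await paused.wait_for(False)  # The operator resumed; daemons respawn via new watch-events.
            print("resumed")
    finally:
        print("exiting: stopping daemons for good")

async def main() -> None:
    paused = Toggle()
    task = asyncio.create_task(killer(paused))
    await asyncio.sleep(0.01)   # Let the killer reach its first wait.
    await paused.set(True)      # Peering demands a pause.
    await asyncio.sleep(0.01)
    await paused.set(False)     # Peering releases the pause.
    await asyncio.sleep(0.01)
    task.cancel()               # The operator exits.
    try:
        await task
    except asyncio.CancelledError:
        pass

asyncio.run(main())

In the real code, the stop calls are handed off to an aiojobs scheduler so that the killer loop never blocks on any individual daemon; per-daemon backoffs and timeouts are handled inside stop_daemon() itself, which is why the scheduler is created with an effectively unlimited close timeout.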