Example #1
async def queue_fetches(pg_render_locker: PgRenderLocker):
    """Queue all pending fetches in RabbitMQ.

    We'll set is_busy=True as we queue them, so we don't send double-fetches.
    """
    pending_ids = await load_pending_steps()

    for workflow_id, step_id in pending_ids:
        # Don't schedule a fetch if we're currently rendering.
        #
        # This still lets us schedule a fetch if a render is _queued_, so it
        # doesn't solve any races. But it should lower the number of fetches of
        # resource-intensive workflows.
        #
        # Using pg_render_locker means we can only queue a fetch _between_
        # renders. The fetch/render queues may be non-empty (we aren't
        # checking); but we're giving the renderers a chance to tackle some
        # backlog.
        try:
            async with pg_render_locker.render_lock(workflow_id) as lock:
                # At this moment, the workflow isn't rendering. Let's pass
                # through and queue the fetch.
                await lock.stall_others()  # required by the PgRenderLocker API

            logger.info("Queue fetch of step(%d, %d)", workflow_id, step_id)
            await set_step_busy(step_id)
            await rabbitmq.send_update_to_workflow_clients(
                workflow_id,
                clientside.Update(steps={step_id: clientside.StepUpdate(is_busy=True)}),
            )
            await rabbitmq.queue_fetch(workflow_id, step_id)
        except WorkflowAlreadyLocked:
            # Don't queue a fetch. We'll revisit this Step next time we
            # query for pending fetches.
            pass
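
The comments above lean on the PgRenderLocker contract: render_lock() raises WorkflowAlreadyLocked when a render holds the lock, and stall_others() must be awaited before the lock is released. Below is a minimal sketch of a loop that could drive queue_fetches on a timer; treating PgRenderLocker as an async context manager and the poll interval are assumptions about code outside this excerpt.

import asyncio


async def main_loop(poll_interval: float = 30.0) -> None:
    # Sketch only: PgRenderLocker-as-async-context-manager and the fixed
    # poll interval are assumptions, not shown in this excerpt.
    async with PgRenderLocker() as pg_render_locker:
        while True:
            await queue_fetches(pg_render_locker)
            await asyncio.sleep(poll_interval)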
Example #2
async def render_workflow_and_maybe_requeue(
    pg_render_locker: PgRenderLocker,
    workflow_id: int,
    delta_id: int,
) -> None:
    """
    Acquire an advisory lock and render, or re-queue task if the lock is held.

    If a render is requested on a Workflow that's already being rendered,
    there's no point in wasting CPU cycles starting from scratch. Wait for the
    first render to exit (which will happen at the next stale database-write).
    It should then re-schedule a render.
    """
    # Query for workflow before locking. We don't need a lock for this, and no
    # lock means we can dismiss spurious renders sooner, so they don't fill the
    # render queue.
    try:
        workflow = await _lookup_workflow(workflow_id)
    except Workflow.DoesNotExist:
        logger.info("Skipping render of deleted Workflow %d", workflow_id)
        return

    try:
        async with pg_render_locker.render_lock(workflow_id) as lock:
            # any error leads to undefined behavior
            result = await render_workflow_once(workflow, delta_id)

            # requeue if needed
            await lock.stall_others()
            if result == RenderResult.MUST_REQUEUE:
                want_requeue = True
            elif result == RenderResult.MUST_NOT_REQUEUE:
                want_requeue = False
            else:
                try:
                    workflow = await _lookup_workflow(workflow_id)
                    if workflow.last_delta_id != delta_id:
                        logger.info(
                            "Requeueing render(workflow=%d, delta=%d)",
                            workflow_id,
                            workflow.last_delta_id,
                        )
                        want_requeue = True
                    else:
                        want_requeue = False
                except Workflow.DoesNotExist:
                    logger.info("Skipping requeue of deleted Workflow %d", workflow_id)
                    want_requeue = False
            if want_requeue:
                await rabbitmq.queue_render(workflow_id, workflow.last_delta_id)
                # This is why we used `lock.stall_others()`: after requeue,
                # another renderer may try to lock this workflow and we want
                # that lock to _succeed_ -- not raise WorkflowAlreadyLocked.
            # Only ack() _after_ requeue. That preserves our invariant: if we
            # schedule a render, there is always an un-acked render for that
            # workflow queued in RabbitMQ until the workflow is up-to-date. (At
            # this exact moment, there are briefly two un-acked renders.)
    except WorkflowAlreadyLocked:
        logger.info("Workflow %d is being rendered elsewhere; ignoring", workflow_id)
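
The branching on result assumes a small RenderResult enum. Only the two members referenced above appear in this excerpt; the fall-through member name in this sketch is an assumption.

import enum


class RenderResult(enum.Enum):
    # Sketch of the enum render_workflow_once() is assumed to return.
    # MAYBE_REQUEUE is a hypothetical name for the fall-through case, in
    # which the caller compares last_delta_id to decide whether to requeue.
    MUST_REQUEUE = "must_requeue"          # e.g. the render was pre-empted
    MUST_NOT_REQUEUE = "must_not_requeue"  # e.g. the Workflow vanished mid-render
    MAYBE_REQUEUE = "maybe_requeue"        # caller decides by comparing delta IDs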
Example #3
async def queue_fetches(pg_render_locker: PgRenderLocker):
    """
    Queue all pending fetches in RabbitMQ.

    We'll set is_busy=True as we queue them, so we don't send double-fetches.
    """
    wf_modules = await load_pending_wf_modules()

    for workflow_id, wf_module in wf_modules:
        # Don't schedule a fetch if we're currently rendering.
        #
        # This still lets us schedule a fetch if a render is _queued_, so it
        # doesn't solve any races. But it should lower the number of fetches of
        # resource-intensive workflows.
        #
        # Using pg_render_locker means we can only queue a fetch _between_
        # renders. The fetch/render queues may be non-empty (we aren't
        # checking); but we're giving the renderers a chance to tackle some
        # backlog.
        try:
            async with pg_render_locker.render_lock(workflow_id) as lock:
                # At this moment, the workflow isn't rendering. Let's pass
                # through and queue the fetch.
                await lock.stall_others()  # required by the PgRenderLocker API

            logger.info("Queue fetch of wf_module(%d, %d)", workflow_id,
                        wf_module.id)
            await set_wf_module_busy(wf_module)
            await websockets.ws_client_send_delta_async(
                workflow_id,
                {
                    "updateWfModules": {
                        str(wf_module.id): {
                            "is_busy": True,
                            "fetch_error": ""
                        }
                    }
                },
            )
            await rabbitmq.queue_fetch(wf_module)
        except WorkflowAlreadyLocked:
            # Don't queue a fetch. We'll revisit this WfModule next time we
            # query for pending fetches.
            pass
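
set_wf_module_busy() is not part of this excerpt. A plausible sketch follows, assuming a Django ORM model and channels' database_sync_to_async; the field name and update style are assumptions.

from channels.db import database_sync_to_async


@database_sync_to_async
def set_wf_module_busy(wf_module) -> None:
    # Sketch only: persist is_busy so the flag survives even if this fetcher
    # process dies between queueing and fetching. Field name is an assumption.
    wf_module.is_busy = True
    wf_module.save(update_fields=["is_busy"])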
Example #4
async def render_workflow_and_maybe_requeue(
    pg_render_locker: PgRenderLocker,
    workflow_id: int,
    delta_id: int,
    ack: Callable[[], Awaitable[None]],
    requeue: Callable[[int, int], Awaitable[None]],
) -> None:
    """
    Acquire an advisory lock and render, or re-queue task if the lock is held.

    If a render is requested on a Workflow that's already being rendered,
    there's no point in wasting CPU cycles starting from scratch. Wait for the
    first render to exit (which will happen at the next stale database-write).
    It should then re-schedule a render.
    """
    # Query for workflow before locking. We don't need a lock for this, and no
    # lock means we can dismiss spurious renders sooner, so they don't fill the
    # render queue.
    try:
        workflow = await _lookup_workflow(workflow_id)
    except Workflow.DoesNotExist:
        logger.info("Skipping render of deleted Workflow %d", workflow_id)
        await ack()
        return

    try:
        async with pg_render_locker.render_lock(workflow_id) as lock:
            try:
                result = await render_workflow_once(workflow, delta_id)
            except (asyncio.CancelledError, DatabaseError, InterfaceError):
                raise  # all undefined behavior

            # requeue if needed
            await lock.stall_others()
            if result == RenderResult.MUST_REQUEUE:
                want_requeue = True
            elif result == RenderResult.MUST_NOT_REQUEUE:
                want_requeue = False
            else:
                try:
                    workflow = await _lookup_workflow(workflow_id)
                    if workflow.last_delta_id != delta_id:
                        logger.info(
                            "Requeueing render(workflow=%d, delta=%d)",
                            workflow_id,
                            workflow.last_delta_id,
                        )
                        want_requeue = True
                    else:
                        want_requeue = False
                except Workflow.DoesNotExist:
                    logger.info("Skipping requeue of deleted Workflow %d",
                                workflow_id)
                    want_requeue = False
            if want_requeue:
                await requeue(workflow_id, workflow.last_delta_id)
                # This is why we used `lock.stall_others()`: after requeue,
                # another renderer may try to lock this workflow and we want
                # that lock to _succeed_ -- not raise WorkflowAlreadyLocked.
            # Only ack() _after_ requeue. That preserves our invariant: if we
            # schedule a render, there is always an un-acked render for that
            # workflow queued in RabbitMQ until the workflow is up-to-date. (At
            # this exact moment, there are briefly two un-acked renders.)
            await ack()
    except WorkflowAlreadyLocked:
        logger.info("Workflow %d is being rendered elsewhere; ignoring",
                    workflow_id)
        await ack()
    except (DatabaseError, InterfaceError):
        # Possibilities:
        #
        # 1. There's a bug in renderer.execute. This may leave the event
        # loop's executor thread's database connection in an inconsistent
        # state. [2018-11-06 saw this on production.] The best way to clear
        # up the leaked, broken connection is to die. (Our parent process
        # should restart us, and RabbitMQ will give the job to someone
        # else.)
        #
        # 2. The database connection died (e.g., Postgres went away). The
        # best way to clear up the leaked, broken connection is to die.
        # (Our parent process should restart us, and RabbitMQ will give the
        # job to someone else.)
        #
        # 3. PgRenderLocker's database connection died (e.g., Postgres went
        # away). We haven't seen this much in practice; so let's die and let
        # the parent process restart us.
        #
        # 4. There's some design flaw we haven't thought of, and we
        # shouldn't ever render this workflow. If this is the case, we're
        # doomed.
        #
        # If you're seeing this error that means there's a bug somewhere
        # _else_. If you're staring at a case-3 situation, please remember
        # that cases 1 and 2 are important, too.
        logger.exception("Fatal database error; exiting")
        os._exit(1)
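
One way to supply the ack and requeue callables is from a RabbitMQ consumer callback. The sketch below uses aio-pika; the library choice, the "render" routing key and the message body layout are all assumptions, not necessarily what this codebase uses.

import json

import aio_pika


async def handle_render_message(
    message: aio_pika.IncomingMessage,
    channel: aio_pika.Channel,
    pg_render_locker: PgRenderLocker,
) -> None:
    # Sketch only: body layout and routing key are assumptions about the
    # queueing conventions outside this excerpt.
    body = json.loads(message.body)

    async def ack() -> None:
        await message.ack()

    async def requeue(workflow_id: int, delta_id: int) -> None:
        await channel.default_exchange.publish(
            aio_pika.Message(
                body=json.dumps(
                    {"workflow_id": workflow_id, "delta_id": delta_id}
                ).encode()
            ),
            routing_key="render",
        )

    await render_workflow_and_maybe_requeue(
        pg_render_locker, body["workflow_id"], body["delta_id"], ack, requeue
    )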