async def queue_fetches(pg_render_locker: PgRenderLocker):
    """Queue all pending fetches in RabbitMQ.

    We'll set is_busy=True as we queue them, so we don't send double-fetches.
    """
    pending_ids = await load_pending_steps()

    for workflow_id, step_id in pending_ids:
        # Don't schedule a fetch if we're currently rendering.
        #
        # This still lets us schedule a fetch if a render is _queued_, so it
        # doesn't solve any races. But it should lower the number of fetches
        # of resource-intensive workflows.
        #
        # Using pg_render_locker means we can only queue a fetch _between_
        # renders. The fetch/render queues may be non-empty (we aren't
        # checking); but we're giving the renderers a chance to tackle some
        # backlog.
        try:
            async with pg_render_locker.render_lock(workflow_id) as lock:
                # At this moment, the workflow isn't rendering. Let's pass
                # through and queue the fetch.
                await lock.stall_others()  # required by the PgRenderLocker API

                logger.info("Queue fetch of step(%d, %d)", workflow_id, step_id)
                await set_step_busy(step_id)
                await rabbitmq.send_update_to_workflow_clients(
                    workflow_id,
                    clientside.Update(
                        steps={step_id: clientside.StepUpdate(is_busy=True)}
                    ),
                )
                await rabbitmq.queue_fetch(workflow_id, step_id)
        except WorkflowAlreadyLocked:
            # Don't queue a fetch. We'll revisit this Step next time we
            # query for pending fetches.
            pass
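# For context: queue_fetches() is the sort of coroutine a fetch scheduler
# would run on a timer. The loop below is a minimal sketch of such a driver,
# not the original entry point -- poll_fetches_forever(), FETCH_POLL_INTERVAL
# and the catch-and-log policy are illustrative assumptions.
import asyncio

FETCH_POLL_INTERVAL = 30  # seconds; illustrative value, not from the original


async def poll_fetches_forever(pg_render_locker: PgRenderLocker) -> None:
    """Run queue_fetches() on a fixed interval until cancelled (hypothetical)."""
    while True:
        try:
            await queue_fetches(pg_render_locker)
        except Exception:
            # Keep looping: one bad pass shouldn't kill the scheduler.
            # (The original code may prefer to crash and be restarted.)
            logger.exception("queue_fetches() failed; will retry")
        await asyncio.sleep(FETCH_POLL_INTERVAL)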
async def render_workflow_and_maybe_requeue(
    pg_render_locker: PgRenderLocker,
    workflow_id: int,
    delta_id: int,
) -> None:
    """
    Acquire an advisory lock and render, or re-queue task if the lock is held.

    If a render is requested on a Workflow that's already being rendered,
    there's no point in wasting CPU cycles starting from scratch. Wait for
    the first render to exit (which will happen at the next stale
    database-write). It should then re-schedule a render.
    """
    # Query for workflow before locking. We don't need a lock for this, and no
    # lock means we can dismiss spurious renders sooner, so they don't fill the
    # render queue.
    try:
        workflow = await _lookup_workflow(workflow_id)
    except Workflow.DoesNotExist:
        logger.info("Skipping render of deleted Workflow %d", workflow_id)
        return

    try:
        async with pg_render_locker.render_lock(workflow_id) as lock:
            # any error leads to undefined behavior
            result = await render_workflow_once(workflow, delta_id)

            # requeue if needed
            await lock.stall_others()
            if result == RenderResult.MUST_REQUEUE:
                want_requeue = True
            elif result == RenderResult.MUST_NOT_REQUEUE:
                want_requeue = False
            else:
                try:
                    workflow = await _lookup_workflow(workflow_id)
                    if workflow.last_delta_id != delta_id:
                        logger.info(
                            "Requeueing render(workflow=%d, delta=%d)",
                            workflow_id,
                            workflow.last_delta_id,
                        )
                        want_requeue = True
                    else:
                        want_requeue = False
                except Workflow.DoesNotExist:
                    logger.info(
                        "Skipping requeue of deleted Workflow %d", workflow_id
                    )
                    want_requeue = False

            if want_requeue:
                await rabbitmq.queue_render(workflow_id, workflow.last_delta_id)
            # This is why we used `lock.stall_others()`: after requeue,
            # another renderer may try to lock this workflow and we want
            # that lock to _succeed_ -- not raise WorkflowAlreadyLocked.

            # Only ack() _after_ requeue. That preserves our invariant: if we
            # schedule a render, there is always an un-acked render for that
            # workflow queued in RabbitMQ until the workflow is up-to-date.
            # (At this exact moment, there are briefly two un-acked renders.)
    except WorkflowAlreadyLocked:
        logger.info("Workflow %d is being rendered elsewhere; ignoring", workflow_id)
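# The docstring above relies on pg_render_locker.render_lock() raising
# WorkflowAlreadyLocked when another renderer holds the lock. Below is a
# minimal sketch of how such a non-blocking, per-workflow lock can be built
# on Postgres advisory locks, assuming asyncpg. It is not the real
# PgRenderLocker: it yields no lock object and omits stall_others(), whose
# cross-renderer coordination isn't shown in this excerpt.
from contextlib import asynccontextmanager

import asyncpg  # assumption: any async Postgres driver would do


class SimpleRenderLocker:
    """Sketch of an advisory-lock helper in the spirit of PgRenderLocker."""

    def __init__(self, conn: asyncpg.Connection):
        self.conn = conn

    @asynccontextmanager
    async def render_lock(self, workflow_id: int):
        # pg_try_advisory_lock() returns immediately instead of blocking,
        # which is what lets callers react with WorkflowAlreadyLocked rather
        # than queueing up behind the current render.
        acquired = await self.conn.fetchval(
            "SELECT pg_try_advisory_lock($1)", workflow_id
        )
        if not acquired:
            raise WorkflowAlreadyLocked
        try:
            yield
        finally:
            await self.conn.fetchval("SELECT pg_advisory_unlock($1)", workflow_id)


# Design note: session-level advisory locks belong to a database connection,
# so each renderer process needs its own connection for the mutual exclusion
# to hold across processes; two coroutines sharing one connection would not
# exclude each other.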
async def queue_fetches(pg_render_locker: PgRenderLocker):
    """
    Queue all pending fetches in RabbitMQ.

    We'll set is_busy=True as we queue them, so we don't send double-fetches.
    """
    wf_modules = await load_pending_wf_modules()

    for workflow_id, wf_module in wf_modules:
        # Don't schedule a fetch if we're currently rendering.
        #
        # This still lets us schedule a fetch if a render is _queued_, so it
        # doesn't solve any races. But it should lower the number of fetches
        # of resource-intensive workflows.
        #
        # Using pg_render_locker means we can only queue a fetch _between_
        # renders. The fetch/render queues may be non-empty (we aren't
        # checking); but we're giving the renderers a chance to tackle some
        # backlog.
        try:
            async with pg_render_locker.render_lock(workflow_id) as lock:
                # At this moment, the workflow isn't rendering. Let's pass
                # through and queue the fetch.
                await lock.stall_others()  # required by the PgRenderLocker API

                logger.info(
                    "Queue fetch of wf_module(%d, %d)", workflow_id, wf_module.id
                )
                await set_wf_module_busy(wf_module)
                await websockets.ws_client_send_delta_async(
                    workflow_id,
                    {
                        "updateWfModules": {
                            str(wf_module.id): {"is_busy": True, "fetch_error": ""}
                        }
                    },
                )
                await rabbitmq.queue_fetch(wf_module)
        except WorkflowAlreadyLocked:
            # Don't queue a fetch. We'll revisit this WfModule next time we
            # query for pending fetches.
            pass
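# load_pending_wf_modules() isn't shown in this excerpt. A plausible query
# behind it is sketched below with the Django ORM; the field names (is_busy,
# auto_update_data, next_update) and the WfModule model are assumptions
# inferred from how the results are used above, not the real schema. The
# async load_pending_wf_modules() presumably wraps a query like this in a
# thread executor.
from typing import List, Tuple

from django.utils import timezone


def load_pending_wf_modules_sync() -> List[Tuple[int, "WfModule"]]:
    """Return (workflow_id, wf_module) pairs whose scheduled fetch is due."""
    now = timezone.now()
    pending = WfModule.objects.filter(
        is_busy=False,  # assumed field: skip modules we already queued
        auto_update_data=True,  # assumed field: only modules with scheduled fetches
        next_update__lte=now,  # assumed field: the next fetch time has arrived
    ).select_related("workflow")
    return [(wf_module.workflow_id, wf_module) for wf_module in pending]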
async def render_workflow_and_maybe_requeue(
    pg_render_locker: PgRenderLocker,
    workflow_id: int,
    delta_id: int,
    ack: Callable[[], Awaitable[None]],
    requeue: Callable[[int, int], Awaitable[None]],
) -> None:
    """
    Acquire an advisory lock and render, or re-queue task if the lock is held.

    If a render is requested on a Workflow that's already being rendered,
    there's no point in wasting CPU cycles starting from scratch. Wait for
    the first render to exit (which will happen at the next stale
    database-write). It should then re-schedule a render.
    """
    # Query for workflow before locking. We don't need a lock for this, and no
    # lock means we can dismiss spurious renders sooner, so they don't fill the
    # render queue.
    try:
        workflow = await _lookup_workflow(workflow_id)
    except Workflow.DoesNotExist:
        logger.info("Skipping render of deleted Workflow %d", workflow_id)
        await ack()
        return

    try:
        async with pg_render_locker.render_lock(workflow_id) as lock:
            try:
                result = await render_workflow_once(workflow, delta_id)
            except (asyncio.CancelledError, DatabaseError, InterfaceError):
                raise  # all undefined behavior

            # requeue if needed
            await lock.stall_others()
            if result == RenderResult.MUST_REQUEUE:
                want_requeue = True
            elif result == RenderResult.MUST_NOT_REQUEUE:
                want_requeue = False
            else:
                try:
                    workflow = await _lookup_workflow(workflow_id)
                    if workflow.last_delta_id != delta_id:
                        logger.info(
                            "Requeueing render(workflow=%d, delta=%d)",
                            workflow_id,
                            workflow.last_delta_id,
                        )
                        want_requeue = True
                    else:
                        want_requeue = False
                except Workflow.DoesNotExist:
                    logger.info(
                        "Skipping requeue of deleted Workflow %d", workflow_id
                    )
                    want_requeue = False

            if want_requeue:
                await requeue(workflow_id, workflow.last_delta_id)
            # This is why we used `lock.stall_others()`: after requeue,
            # another renderer may try to lock this workflow and we want
            # that lock to _succeed_ -- not raise WorkflowAlreadyLocked.

            # Only ack() _after_ requeue. That preserves our invariant: if we
            # schedule a render, there is always an un-acked render for that
            # workflow queued in RabbitMQ until the workflow is up-to-date.
            # (At this exact moment, there are briefly two un-acked renders.)
            await ack()
    except WorkflowAlreadyLocked:
        logger.info("Workflow %d is being rendered elsewhere; ignoring", workflow_id)
        await ack()
    except (DatabaseError, InterfaceError):
        # Possibilities:
        #
        # 1. There's a bug in renderer.execute. This may leave the event
        #    loop's executor thread's database connection in an inconsistent
        #    state. [2018-11-06 saw this on production.] The best way to
        #    clear up the leaked, broken connection is to die. (Our parent
        #    process should restart us, and RabbitMQ will give the job to
        #    someone else.)
        #
        # 2. The database connection died (e.g., Postgres went away). The
        #    best way to clear up the leaked, broken connection is to die.
        #    (Our parent process should restart us, and RabbitMQ will give
        #    the job to someone else.)
        #
        # 3. PgRenderLocker's database connection died (e.g., Postgres went
        #    away). We haven't seen this much in practice; so let's die and
        #    let the parent process restart us.
        #
        # 4. There's some design flaw we haven't thought of, and we
        #    shouldn't ever render this workflow. If this is the case, we're
        #    doomed.
        #
        # If you're seeing this error that means there's a bug somewhere
        # _else_. If you're staring at a case-3 situation, please remember
        # that cases 1 and 2 are important, too.
        logger.exception("Fatal database error; exiting")
        os._exit(1)
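# The ack and requeue callbacks above come from whatever consumes the render
# queue. Below is a minimal sketch of that wiring, assuming aio_pika (7+,
# where message.ack() is a coroutine) and a JSON message body carrying
# workflow_id and delta_id -- the queue name, message format and client
# library are all assumptions, not taken from the original code. The point is
# only the mapping: ack() acks the RabbitMQ message, and requeue() publishes
# a fresh render message before that ack happens.
import json

import aio_pika


async def consume_renders(amqp_url: str, pg_render_locker: PgRenderLocker) -> None:
    connection = await aio_pika.connect_robust(amqp_url)
    channel = await connection.channel()
    await channel.set_qos(prefetch_count=1)  # render one workflow at a time
    queue = await channel.declare_queue("render", durable=True)

    async def requeue(workflow_id: int, delta_id: int) -> None:
        # Publish a fresh render message *before* the caller acks the old
        # one, preserving the "always an un-acked render" invariant.
        body = json.dumps({"workflow_id": workflow_id, "delta_id": delta_id})
        await channel.default_exchange.publish(
            aio_pika.Message(body.encode()), routing_key="render"
        )

    async with queue.iterator() as messages:
        async for message in messages:
            body = json.loads(message.body)
            await render_workflow_and_maybe_requeue(
                pg_render_locker,
                body["workflow_id"],
                body["delta_id"],
                ack=message.ack,
                requeue=requeue,
            )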