async def reenter_txn(pool: SAEngine, conn: SAConnection):
    """Yield a connection that is inside a transaction scope.

    If ``conn`` is ``None``, a connection is acquired from ``pool`` and a
    transaction is opened on it with ``begin()``; both are released when the
    generator is resumed and finishes.  Otherwise the given connection is
    reused and wrapped in ``begin_nested()`` instead.

    This is an async generator that yields exactly once.
    NOTE(review): presumably wrapped by an async-context-manager decorator
    defined outside this block -- confirm at the definition site.
    """
    if conn is not None:
        # The caller already holds a connection: only open a nested scope on it.
        async with conn.begin_nested():
            yield conn
        return

    # No connection supplied: acquire one and open a transaction on it.
    async with pool.acquire() as fresh_conn, fresh_conn.begin():
        yield fresh_conn
async def _schedule_multi_node_session(
    self,
    sched_ctx: SchedulingContext,
    scheduler: AbstractScheduler,
    agent_db_conn: SAConnection,
    kernel_db_conn: SAConnection,
    sgroup_name: str,
    candidate_agents: Sequence[AgentContext],
    sess_ctx: PendingSession,
    check_results: List[Tuple[str, Union[Exception, PredicateResult]]],
) -> Tuple[PendingSession, List[KernelAgentBinding]]:
    """Assign an agent to every kernel of ``sess_ctx``, one kernel at a time.

    For each kernel the scheduler picks an agent from ``candidate_agents``,
    the agent is reserved through ``_reserve_agent`` inside a nested
    transaction on ``agent_db_conn``, and the candidate list is refreshed
    from ``_list_agents_by_sgroup``.  Once every kernel has a binding, the
    kernel rows are updated to ``KernelStatus.PREPARING`` in a transaction
    on ``kernel_db_conn``.

    Returns:
        A tuple of ``sess_ctx`` and the list of kernel-to-agent bindings.

    Raises:
        InstanceNotAvailable: when the scheduler returns no agent for a
            kernel (re-raised after the failure is recorded on the kernel
            row).
        Exception: any other error during allocation is logged, recorded
            on the kernel row as ``scheduler-error`` and re-raised.
    """
    fmt = _log_fmt.get()
    fmt_args = _log_args.get()
    extra_conds = None
    bindings: List[KernelAgentBinding] = []

    async def _record_failure(failed_kernel, make_values):
        # Run the failure callbacks and update the failed kernel's row in a
        # single transaction.  ``make_values`` is invoked lazily so that the
        # column values are computed after the callbacks have run.
        async with kernel_db_conn.begin():
            await _invoke_failure_callbacks(
                kernel_db_conn, sched_ctx, sess_ctx, check_results,
            )
            stmt = (
                kernels.update()
                .values(make_values())
                .where(kernels.c.id == failed_kernel.kernel_id)
            )
            await kernel_db_conn.execute(stmt)

    async with agent_db_conn.begin(isolation_level="REPEATABLE READ"):
        # This outer transaction is rolled back when any exception occurs
        # inside, including scheduling failures of a kernel.
        # It ensures that occupied_slots are recovered when there are partial
        # scheduling failures.
        for kernel in sess_ctx.kernels:
            try:
                agent_id = scheduler.assign_agent_for_kernel(
                    candidate_agents, kernel,
                )
                if agent_id is None:
                    raise InstanceNotAvailable
                async with agent_db_conn.begin_nested():
                    alloc_ctx = await _reserve_agent(
                        sched_ctx,
                        agent_db_conn,
                        sgroup_name,
                        agent_id,
                        kernel.requested_slots,
                        extra_conds=extra_conds,
                    )
                    # Re-read the candidates for the next kernel after this
                    # reservation.
                    candidate_agents = await _list_agents_by_sgroup(
                        agent_db_conn, sgroup_name,
                    )
            except InstanceNotAvailable:
                log.debug(fmt + 'no-available-instances', *fmt_args)
                await _record_failure(kernel, lambda: {
                    'status_info': "no-available-instances",
                    'status_data': sql_json_increment(
                        kernels.c.status_data,
                        ('scheduler', 'retries'),
                        parent_updates={
                            'last_try': datetime.now(tzutc()).isoformat(),
                        },
                    ),
                })
                raise
            except Exception as e:
                log.exception(
                    fmt + 'unexpected-error, during agent allocation',
                    *fmt_args,
                )
                await _record_failure(kernel, lambda: {
                    'status_info': "scheduler-error",
                    'status_data': convert_to_status_data(e),
                })
                raise
            else:
                bindings.append(KernelAgentBinding(kernel, alloc_ctx))

    if len(bindings) == len(sess_ctx.kernels):
        # Proceed to PREPARING only when all kernels are successfully scheduled.
        async with kernel_db_conn.begin():
            for bound in bindings:
                stmt = kernels.update().values({
                    'agent': bound.agent_alloc_ctx.agent_id,
                    'agent_addr': bound.agent_alloc_ctx.agent_addr,
                    'scaling_group': sgroup_name,
                    'status': KernelStatus.PREPARING,
                    'status_info': 'scheduled',
                    'status_data': {},
                    'status_changed': datetime.now(tzutc()),
                }).where(kernels.c.id == bound.kernel.kernel_id)
                await kernel_db_conn.execute(stmt)
    return sess_ctx, bindings