async def _load_state(self, reload_id: int) -> ClusterState:
    """Fetch a fresh cluster state and make it current.

    Warms up a connection pool for every master in the new state and,
    on the very first load only (``reload_id == 1``), queries ``COMMAND``
    from a random master to build the command registry.

    Returns the newly assigned ``self._state``.
    """
    commands: Optional[CommandsRegistry] = None

    addrs = self._get_init_addrs(reload_id)
    state = await self._fetch_state(addrs)

    # warm up a connections pool for every master node in the new state
    for master in state._data.masters:
        await self._pooler.ensure_pool(master.addr)

    # pick an arbitrary master to talk to
    pool = await self._pooler.ensure_pool(state.random_master().addr)

    # command specs are fetched only on the first cluster state load
    if reload_id == 1:
        async with async_timeout.timeout(self._execute_timeout):
            raw_commands = await pool.execute(b"COMMAND", encoding="utf-8")
        commands = create_registry(raw_commands)
        logger.debug("Found %d supported commands in cluster", commands.size())

    # publish the new state (and the registry, when it was loaded)
    self._state = state
    if commands is not None:
        self._commands = commands

    if logger.isEnabledFor(logging.INFO):
        logger.info(
            "Loaded state: %s (reload_id=%d)",
            state.repr_stats(),
            reload_id,
        )

    return self._state
def _make_execute_props(
    self,
    state: ClusterState,
    ctx: ExecuteContext,
    fail_props: Optional[ExecuteFailProps] = None,
) -> ExecuteProps:
    """Pick the node for the next execution attempt.

    On a retry (``fail_props`` given) the previous error decides the
    target: connection errors fall back to a slot replica or a random
    node, MOVED/ASK redirect to the address from the error, cluster-level
    errors go to a random node. On a first attempt the slot master (or a
    random master) is used.

    Fix: ``fail_props`` was annotated ``ExecuteFailProps = None`` —
    implicit ``Optional`` is invalid per PEP 484; now explicit.

    :param state: current cluster topology snapshot.
    :param ctx: execution context (command, slot, attempt counter).
    :param fail_props: failure details from the previous attempt, if any.
    :returns: ``ExecuteProps`` with ``node_addr`` set (plus ``asking`` /
        ``reload_state_required`` flags when applicable).
    """
    exec_props = ExecuteProps()

    node_addr: Address

    if fail_props:
        # Re-raise the previous error so classification can use
        # except-clauses instead of a chain of isinstance() checks.
        try:
            raise fail_props.error
        except self._connection_errors:
            # early attempts may try a replica that serves the slot
            if ctx.attempt <= 2 and ctx.slot is not None:
                replica = state.random_slot_replica(ctx.slot)
                if replica is not None:
                    node_addr = replica.addr
                else:
                    node_addr = state.random_node().addr
            else:
                node_addr = state.random_node().addr
        except MovedError as e:
            node_addr = Address(e.info.host, e.info.port)
        except AskError as e:
            node_addr = Address(e.info.host, e.info.port)
            exec_props.asking = e.info.ask
        except (ClusterDownError, TryAgainError, LoadingError, ProtocolError):
            node_addr = state.random_node().addr
        except Exception as e:
            # usually we should never end up here
            logger.exception("Uncaught exception on execute: %r", e)
            raise
        logger.info("New node to execute: %s", node_addr)
    else:
        if ctx.slot is not None:
            try:
                node = state.slot_master(ctx.slot)
            except UncoveredSlotError:
                logger.warning("No node found by slot %d", ctx.slot)
                # probably cluster is corrupted and
                # we need try to recover cluster state
                exec_props.reload_state_required = True
                node = state.random_master()
            node_addr = node.addr
        else:
            node_addr = state.random_master().addr
        logger.debug("Defined node to command: %s", node_addr)

    exec_props.node_addr = node_addr

    return exec_props
async def _try_execute(
    self, ctx: ExecuteContext, props: ExecuteProps, fail_props: Optional[ExecuteFailProps]
) -> Any:
    """Run a single execution attempt of ``ctx.cmd`` on ``props.node_addr``.

    Routes the command through one of three paths: with a preceding
    ``ASKING`` on a dedicated connection, on a dedicated connection for
    blocking commands, or through the pool otherwise.

    Fix: the pool-limit warning format string contained a stray ``]``
    that unbalanced the parenthesized message.

    :param ctx: execution context (command, attempt counters).
    :param props: node selection result for this attempt.
    :param fail_props: failure info from the previous attempt (unused
        here beyond the signature; kept for call-site symmetry).
    :returns: raw command result from the connection/pool.
    """
    node_addr = props.node_addr

    attempt_log_prefix = ""
    if ctx.attempt > 1:
        attempt_log_prefix = f"[{ctx.attempt}/{ctx.max_attempts}] "

    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("%sExecute %r on %s", attempt_log_prefix, ctx.cmd_for_repr(), node_addr)

    pool = await self._pooler.ensure_pool(node_addr)
    pool_size = pool.size
    if pool_size >= pool.maxsize and pool.freesize == 0:
        logger.warning(
            "ConnectionPool to %s size limit reached (minsize:%s, maxsize:%s, current:%s)",
            node_addr,
            pool.minsize,
            pool.maxsize,
            pool_size,
        )

    if props.asking:
        logger.info("Send ASKING to %s for command %r", node_addr, ctx.cmd_name)
        result = await self._conn_execute(
            pool,
            ctx.cmd,
            ctx.kwargs,
            timeout=self._attempt_timeout,
            asking=True,
        )
    else:
        if ctx.cmd_info.is_blocking():
            # blocking commands must occupy one dedicated connection
            result = await self._conn_execute(
                pool,
                ctx.cmd,
                ctx.kwargs,
                timeout=self._attempt_timeout,
            )
        else:
            result = await self._pool_execute(
                pool,
                ctx.cmd,
                ctx.kwargs,
                timeout=self._attempt_timeout,
            )

    return result
async def ensure_pool(self, addr: Address) -> AbcPool:
    """Return the connection pool for *addr*, creating it on demand.

    A holder whose pool was closed is discarded and rebuilt. Creation is
    guarded by a per-address lock with a re-check inside, so concurrent
    callers produce exactly one pool. Touches the holder's generation so
    the reaper treats it as recently used.
    """
    stale = self._nodes.get(addr)
    if stale is not None and stale.pool.closed:
        # the pool was closed behind our back; forget it and rebuild
        self._erase_addr(addr)

    if addr not in self._nodes:
        async with self._creation_lock(addr):
            # double check: a concurrent task may have created it
            # while we were waiting for the lock
            if addr not in self._nodes:
                logger.debug("Create connections pool for %s", addr)
                created = await self._create_pool((addr.host, addr.port))
                self._nodes[addr] = PoolHolder(created, self._reap_calls)
                self._pubsub_addrs[addr] = set()

    holder = self._nodes[addr]
    holder.generation = self._reap_calls  # mark as recently used
    return holder.pool
async def execute(self, *args, **kwargs) -> Any:
    """Execute redis command."""
    ctx = self._make_exec_context(args, kwargs)

    command_keys = self._extract_command_keys(ctx.cmd_info, ctx.cmd)
    if command_keys:
        ctx.slot = self.determine_slot(*command_keys)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("Determined slot for %r is %d", ctx.cmd_for_repr(), ctx.slot)

    failure: Optional[ExecuteFailProps] = None
    while ctx.attempt < ctx.max_attempts:
        self._check_closed()
        ctx.attempt += 1

        state = await self._manager.get_state()
        props = self._make_execute_props(state, ctx, failure)
        if props.reload_state_required:
            self._manager.require_reload_state()
        target = props.node_addr

        # remember the failure that drove node selection, then reset it
        previous_failure, failure = failure, None

        try:
            result = await self._try_execute(ctx, props, previous_failure)
        except asyncio.CancelledError:
            raise
        except Exception as e:
            failure = ExecuteFailProps(
                node_addr=target,
                error=e,
            )

        if not failure:
            break

        # NOTE(review): on the final attempt _on_execute_fail is
        # presumably expected to raise, otherwise `result` below would
        # be unbound — confirm against its implementation.
        await self._on_execute_fail(ctx, failure)

    return result
async def _fetch_state(self, addrs: Sequence[Address]) -> ClusterState:
    """Obtain a cluster state snapshot from the first healthy node in *addrs*.

    Queries ``CLUSTER INFO`` + ``CLUSTER SLOTS`` on each address in turn
    until one responds with an "ok" cluster state. If every node fails
    with an error, the last error is re-raised; if nodes responded but
    none reported "ok", the state from the last responding node is
    applied anyway (with a warning).

    :raises RuntimeError: if *addrs* is empty.
    :raises BaseException: the last per-node error when no node responded.
    """
    if len(addrs) == 0:
        raise RuntimeError("no addrs to fetch cluster state")

    last_err: Optional[BaseException] = None

    if len(addrs) > 10:
        # limit probing to at least 10 addrs (half of the list when it
        # is larger than 20); addrs are presumably already randomized —
        # TODO confirm against _get_init_addrs
        addrs = addrs[: max(10, len(addrs) // 2)]

    logger.debug("Trying to obtain cluster state from addrs: %r", addrs)

    # get first successful cluster slots response
    for addr in addrs:
        logger.info("Obtain cluster state from %s", addr)
        try:
            pool = await self._pooler.ensure_pool(addr)
            async with async_timeout.timeout(self._execute_timeout):
                # ensure one connection behaviour: both commands must go
                # over the same connection
                async with pool.get() as conn:
                    raw_cluster_info: str = await conn.execute(
                        b"CLUSTER", b"INFO", encoding="utf-8"
                    )
                    cluster_info = parse_info(raw_cluster_info)
                    slots_resp = await conn.execute(b"CLUSTER", b"SLOTS", encoding="utf-8")
        except asyncio.TimeoutError as e:
            last_err = e
            logger.warning("Getting cluster state from %s is timed out", addr)
            continue
        except Exception as e:
            last_err = e
            logger.warning("Unable to get cluster state from %s: %r", addr, e)
            continue

        # node answered but reports a non-"ok" cluster state: keep probing
        if cluster_info[CLUSTER_INFO_STATE_KEY] != NodeClusterState.OK.value:
            logger.warning(
                'Node %s was return not "ok" cluster state "%s". Try next node',
                addr,
                cluster_info[CLUSTER_INFO_STATE_KEY],
            )
            continue

        logger.debug(
            "Cluster state successful loaded from %s: info:%r slots:%r",
            addr,
            cluster_info,
            slots_resp,
        )

        break
    else:
        # loop exhausted without break: every node errored or was not "ok".
        # Re-raise only when there was a real error; otherwise fall through
        # and build the state from the last (not "ok") response below.
        if last_err is not None:
            logger.error("No available hosts to load cluster slots. Tried hosts: %r", addrs)
            raise last_err

    # NOTE: addr / cluster_info / slots_resp intentionally leak out of the
    # loop — they hold the values from the node that broke the loop (or the
    # last probed node on the not-"ok" fallback path)
    state = create_cluster_state(slots_resp, cluster_info, addr)

    if state.state is not NodeClusterState.OK:
        logger.warning(
            (
                "Cluster probably broken. Tried %d nodes and "
                'apply not "ok" (%s) cluster state from %s'
            ),
            len(addrs),
            state.state.value,
            addr,
        )

    return state