def reschedule_routers_from_down_agents(self):
    """Reschedule routers from down l3 agents if admin state is up."""
    # give agents extra time to handle transient failures
    agent_dead_limit = cfg.CONF.agent_down_time * 2

    # check for an abrupt clock change since last check. if a change is
    # detected, sleep for a while to let the agents check in.
    tdelta = timeutils.utcnow() - getattr(self, '_clock_jump_canary',
                                          timeutils.utcnow())
    if timeutils.total_seconds(tdelta) > cfg.CONF.agent_down_time:
        LOG.warn(_LW("Time since last L3 agent reschedule check has "
                     "exceeded the interval between checks. Waiting "
                     "before check to allow agents to send a heartbeat "
                     "in case there was a clock adjustment."))
        time.sleep(agent_dead_limit)
    self._clock_jump_canary = timeutils.utcnow()

    context = n_ctx.get_admin_context()
    cutoff = timeutils.utcnow() - datetime.timedelta(
        seconds=agent_dead_limit)
    down_bindings = (
        context.session.query(RouterL3AgentBinding).
        join(agents_db.Agent).
        filter(agents_db.Agent.heartbeat_timestamp < cutoff,
               agents_db.Agent.admin_state_up))
    for binding in down_bindings:
        LOG.warn(_LW("Rescheduling router %(router)s from agent %(agent)s "
                     "because the agent did not report to the server in "
                     "the last %(dead_time)s seconds."),
                 {'router': binding.router_id,
                  'agent': binding.l3_agent_id,
                  'dead_time': agent_dead_limit})
        self.reschedule_router(context, binding.router_id)
def reschedule_routers_from_down_agents(self):
    """Reschedule routers from down l3 agents if admin state is up."""
    # give agents extra time to handle transient failures
    agent_dead_limit = cfg.CONF.agent_down_time * 2

    # check for an abrupt clock change since last check. if a change is
    # detected, sleep for a while to let the agents check in.
    tdelta = timeutils.utcnow() - getattr(self, "_clock_jump_canary",
                                          timeutils.utcnow())
    if timeutils.total_seconds(tdelta) > cfg.CONF.agent_down_time:
        LOG.warn(
            _LW(
                "Time since last L3 agent reschedule check has "
                "exceeded the interval between checks. Waiting "
                "before check to allow agents to send a heartbeat "
                "in case there was a clock adjustment."
            )
        )
        time.sleep(agent_dead_limit)
    self._clock_jump_canary = timeutils.utcnow()

    context = n_ctx.get_admin_context()
    cutoff = timeutils.utcnow() - datetime.timedelta(seconds=agent_dead_limit)
    down_bindings = (
        context.session.query(RouterL3AgentBinding)
        .join(agents_db.Agent)
        .filter(agents_db.Agent.heartbeat_timestamp < cutoff,
                agents_db.Agent.admin_state_up)
        .outerjoin(
            l3_attrs_db.RouterExtraAttributes,
            l3_attrs_db.RouterExtraAttributes.router_id ==
            RouterL3AgentBinding.router_id,
        )
        .filter(
            sa.or_(
                l3_attrs_db.RouterExtraAttributes.ha == sql.false(),
                l3_attrs_db.RouterExtraAttributes.ha == sql.null(),
            )
        )
    )

    try:
        for binding in down_bindings:
            LOG.warn(
                _LW(
                    "Rescheduling router %(router)s from agent %(agent)s "
                    "because the agent did not report to the server in "
                    "the last %(dead_time)s seconds."
                ),
                {"router": binding.router_id,
                 "agent": binding.l3_agent_id,
                 "dead_time": agent_dead_limit},
            )
            try:
                self.reschedule_router(context, binding.router_id)
            except (l3agentscheduler.RouterReschedulingFailed,
                    n_rpc.RemoteError):
                # Catch individual router rescheduling errors here
                # so one broken one doesn't stop the iteration.
                LOG.exception(_LE("Failed to reschedule router %s"),
                              binding.router_id)
    except db_exc.DBError:
        # Catch DB errors here so a transient DB connectivity issue
        # doesn't stop the loopingcall.
        LOG.exception(_LE("Exception encountered during router "
                          "rescheduling."))
def reschedule_routers_from_down_agents(self):
    """Reschedule routers from down l3 agents if admin state is up."""
    # give agents extra time to handle transient failures
    agent_dead_limit = cfg.CONF.agent_down_time * 2

    # check for an abrupt clock change since last check. if a change is
    # detected, sleep for a while to let the agents check in.
    tdelta = timeutils.utcnow() - getattr(self, '_clock_jump_canary',
                                          timeutils.utcnow())
    if timeutils.total_seconds(tdelta) > cfg.CONF.agent_down_time:
        LOG.warn(_LW("Time since last L3 agent reschedule check has "
                     "exceeded the interval between checks. Waiting "
                     "before check to allow agents to send a heartbeat "
                     "in case there was a clock adjustment."))
        time.sleep(agent_dead_limit)
    self._clock_jump_canary = timeutils.utcnow()

    context = n_ctx.get_admin_context()
    cutoff = timeutils.utcnow() - datetime.timedelta(
        seconds=agent_dead_limit)
    down_bindings = (
        context.session.query(RouterL3AgentBinding).
        join(agents_db.Agent).
        filter(agents_db.Agent.heartbeat_timestamp < cutoff,
               agents_db.Agent.admin_state_up).
        outerjoin(l3_attrs_db.RouterExtraAttributes,
                  l3_attrs_db.RouterExtraAttributes.router_id ==
                  RouterL3AgentBinding.router_id).
        filter(sa.or_(l3_attrs_db.RouterExtraAttributes.ha == sql.false(),
                      l3_attrs_db.RouterExtraAttributes.ha == sql.null())))
    try:
        for binding in down_bindings:
            LOG.warn(_LW(
                "Rescheduling router %(router)s from agent %(agent)s "
                "because the agent did not report to the server in "
                "the last %(dead_time)s seconds."),
                {'router': binding.router_id,
                 'agent': binding.l3_agent_id,
                 'dead_time': agent_dead_limit})
            try:
                self.reschedule_router(context, binding.router_id)
            except (l3agentscheduler.RouterReschedulingFailed,
                    n_rpc.RemoteError):
                # Catch individual router rescheduling errors here
                # so one broken one doesn't stop the iteration.
                LOG.exception(_LE("Failed to reschedule router %s"),
                              binding.router_id)
    except db_exc.DBError:
        # Catch DB errors here so a transient DB connectivity issue
        # doesn't stop the loopingcall.
        LOG.exception(_LE("Exception encountered during router "
                          "rescheduling."))
def reschedule_lbaas_from_down_agents(self):
    """Reschedule lbaas from down lbaas agents if admin state is up."""
    LOG.info("reschedule_lbaas_from_down_agents called.")
    # give agents extra time to handle transient failures
    agent_dead_limit = cfg.CONF.agent_down_time * 2

    # check for an abrupt clock change since last check. if a change is
    # detected, sleep for a while to let the agents check in.
    tdelta = timeutils.utcnow() - getattr(self, '_clock_jump_canary',
                                          timeutils.utcnow())
    if timeutils.total_seconds(tdelta) > cfg.CONF.agent_down_time:
        LOG.warn(_LW("Time since last LBaaS agent reschedule check has "
                     "exceeded the interval between checks. Waiting "
                     "before check to allow agents to send a heartbeat "
                     "in case there was a clock adjustment."))
        time.sleep(agent_dead_limit)
    self._clock_jump_canary = timeutils.utcnow()

    context = n_ctx.get_admin_context()
    cutoff = timeutils.utcnow() - datetime.timedelta(
        seconds=agent_dead_limit)
    down_bindings = (
        context.session.query(LoadbalancerAgentBinding).
        join(agents_db.Agent).
        filter(agents_db.Agent.heartbeat_timestamp < cutoff,
               agents_db.Agent.admin_state_up))
    for binding in down_bindings:
        LOG.warn(_LW("Rescheduling loadbalancer %(loadbalancer)s from "
                     "agent %(agent)s because the agent did not report "
                     "to the server in the last %(dead_time)s seconds."),
                 {'loadbalancer': binding.loadbalancer_id,
                  'agent': binding.agent_id,
                  'dead_time': agent_dead_limit})
        self.reschedule_loadbalancer_instance(binding.loadbalancer_id)
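# The checks above are meant to be driven by a server-side periodic task
# rather than called by hand. Below is a minimal sketch of that wiring,
# assuming oslo.service's FixedIntervalLoopingCall; the import path, the
# start_periodic_agent_status_check name, and the chosen interval are
# illustrative assumptions, not the exact hooks used by this codebase.
from oslo_service import loopingcall


def start_periodic_agent_status_check(self):
    # Run the rescheduling check on a fixed interval; agent_down_time is a
    # reasonable period since the checks above already double it to derive
    # the dead-agent limit.
    self._agent_status_check = loopingcall.FixedIntervalLoopingCall(
        self.reschedule_routers_from_down_agents)
    self._agent_status_check.start(interval=cfg.CONF.agent_down_time,
                                   initial_delay=cfg.CONF.agent_down_time)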