def reschedule_routers_from_down_agents(self):
        """Reschedule routers from down l3 agents if admin state is up.

        An agent is treated as down when its last heartbeat is older than
        twice ``cfg.CONF.agent_down_time``.  Every router binding whose
        agent matches that condition (and has ``admin_state_up`` set) is
        passed to ``self.reschedule_router``.

        If the time elapsed since the previous invocation exceeds
        ``cfg.CONF.agent_down_time``, the method first sleeps for the dead
        limit so agents get a chance to send a heartbeat before being
        judged.

        A failure while rescheduling one router is logged and does not
        prevent the remaining routers from being processed.
        """

        # give agents extra time to handle transient failures
        agent_dead_limit = cfg.CONF.agent_down_time * 2

        # check for an abrupt clock change since last check. if a change is
        # detected, sleep for a while to let the agents check in.
        # A single clock reading is used so the very first run (no canary
        # stored yet) yields a delta of exactly zero.
        now = timeutils.utcnow()
        tdelta = now - getattr(self, '_clock_jump_canary', now)
        if timeutils.total_seconds(tdelta) > cfg.CONF.agent_down_time:
            LOG.warning(_LW("Time since last L3 agent reschedule check has "
                            "exceeded the interval between checks. Waiting "
                            "before check to allow agents to send a "
                            "heartbeat in case there was a clock "
                            "adjustment."))
            time.sleep(agent_dead_limit)
        self._clock_jump_canary = timeutils.utcnow()

        context = n_ctx.get_admin_context()
        cutoff = timeutils.utcnow() - datetime.timedelta(
            seconds=agent_dead_limit)
        down_bindings = (
            context.session.query(RouterL3AgentBinding).
            join(agents_db.Agent).
            filter(agents_db.Agent.heartbeat_timestamp < cutoff,
                   agents_db.Agent.admin_state_up))
        for binding in down_bindings:
            LOG.warning(_LW("Rescheduling router %(router)s from agent "
                            "%(agent)s because the agent did not report to "
                            "the server in the last %(dead_time)s seconds."),
                        {'router': binding.router_id,
                         'agent': binding.l3_agent_id,
                         'dead_time': agent_dead_limit})
            try:
                self.reschedule_router(context, binding.router_id)
            except Exception:
                # The concrete exception types raised by reschedule_router
                # are not visible from here, so catch broadly at this loop
                # boundary: one broken router must not stop the iteration
                # over the remaining ones.
                LOG.exception("Failed to reschedule router %s",
                              binding.router_id)
# Example #2
# 0
    def reschedule_routers_from_down_agents(self):
        """Move non-HA routers away from L3 agents that stopped reporting.

        Agents whose heartbeat is older than twice
        ``cfg.CONF.agent_down_time`` and whose admin state is up are
        considered down.  Router bindings on those agents are rescheduled
        unless the router's extra attributes mark it as HA.

        Rescheduling failures for a single router and DB errors raised
        while walking the bindings are logged rather than propagated.
        """
        # Doubling the configured down time tolerates transient failures.
        dead_limit = cfg.CONF.agent_down_time * 2

        # A gap since the previous run larger than the configured down time
        # hints at a clock jump; pause so agents can send a heartbeat first.
        now = timeutils.utcnow()
        previous_check = getattr(self, "_clock_jump_canary", timeutils.utcnow())
        elapsed = timeutils.total_seconds(now - previous_check)
        if elapsed > cfg.CONF.agent_down_time:
            LOG.warn(
                _LW(
                    "Time since last L3 agent reschedule check has "
                    "exceeded the interval between checks. Waiting "
                    "before check to allow agents to send a heartbeat "
                    "in case there was a clock adjustment."
                )
            )
            time.sleep(dead_limit)
        self._clock_jump_canary = timeutils.utcnow()

        admin_ctx = n_ctx.get_admin_context()
        heartbeat_cutoff = timeutils.utcnow() - datetime.timedelta(seconds=dead_limit)

        # Build the query step by step: bindings on stale, admin-up agents,
        # restricted to routers that are not flagged as HA.
        agent_model = agents_db.Agent
        extra_attrs = l3_attrs_db.RouterExtraAttributes
        stale_bindings = admin_ctx.session.query(RouterL3AgentBinding)
        stale_bindings = stale_bindings.join(agent_model)
        stale_bindings = stale_bindings.filter(
            agent_model.heartbeat_timestamp < heartbeat_cutoff,
            agent_model.admin_state_up,
        )
        stale_bindings = stale_bindings.outerjoin(
            extra_attrs,
            extra_attrs.router_id == RouterL3AgentBinding.router_id,
        )
        stale_bindings = stale_bindings.filter(
            sa.or_(extra_attrs.ha == sql.false(), extra_attrs.ha == sql.null())
        )

        try:
            for stale in stale_bindings:
                log_args = {
                    "router": stale.router_id,
                    "agent": stale.l3_agent_id,
                    "dead_time": dead_limit,
                }
                LOG.warn(
                    _LW(
                        "Rescheduling router %(router)s from agent %(agent)s "
                        "because the agent did not report to the server in "
                        "the last %(dead_time)s seconds."
                    ),
                    log_args,
                )
                try:
                    self.reschedule_router(admin_ctx, stale.router_id)
                except (l3agentscheduler.RouterReschedulingFailed, n_rpc.RemoteError):
                    # Keep going: one router that cannot be rescheduled
                    # must not block the others.
                    LOG.exception(_LE("Failed to reschedule router %s"), stale.router_id)
        except db_exc.DBError:
            # A transient DB connectivity problem should not terminate the
            # looping call that drives this method.
            LOG.exception(_LE("Exception encountered during router rescheduling."))
# Example #3
# 0
    def reschedule_routers_from_down_agents(self):
        """Reschedule non-HA routers hosted on unresponsive L3 agents.

        An agent counts as unresponsive when its heartbeat timestamp is
        older than ``2 * cfg.CONF.agent_down_time`` and its admin state is
        up.  Bindings for routers flagged as HA are left alone.

        Per-router rescheduling errors and DB errors hit while iterating
        the bindings are logged and swallowed.
        """
        # Allow twice the configured down time before declaring an agent
        # dead, to ride out transient failures.
        grace_period = cfg.CONF.agent_down_time * 2

        # Detect an abrupt clock change since the previous check; when one
        # is suspected, wait so that agents can check in again.
        checked_at = timeutils.utcnow()
        last_seen = getattr(self, '_clock_jump_canary', timeutils.utcnow())
        gap_seconds = timeutils.total_seconds(checked_at - last_seen)
        if gap_seconds > cfg.CONF.agent_down_time:
            LOG.warn(_LW("Time since last L3 agent reschedule check has "
                         "exceeded the interval between checks. Waiting "
                         "before check to allow agents to send a heartbeat "
                         "in case there was a clock adjustment."))
            time.sleep(grace_period)
        self._clock_jump_canary = timeutils.utcnow()

        ctx = n_ctx.get_admin_context()
        deadline = timeutils.utcnow() - datetime.timedelta(
            seconds=grace_period)

        # Name the individual conditions before assembling the query.
        heartbeat_expired = agents_db.Agent.heartbeat_timestamp < deadline
        same_router = (l3_attrs_db.RouterExtraAttributes.router_id ==
                       RouterL3AgentBinding.router_id)
        not_ha = sa.or_(
            l3_attrs_db.RouterExtraAttributes.ha == sql.false(),
            l3_attrs_db.RouterExtraAttributes.ha == sql.null())
        bindings_to_move = (
            ctx.session.query(RouterL3AgentBinding)
            .join(agents_db.Agent)
            .filter(heartbeat_expired, agents_db.Agent.admin_state_up)
            .outerjoin(l3_attrs_db.RouterExtraAttributes, same_router)
            .filter(not_ha))

        try:
            for entry in bindings_to_move:
                details = {'router': entry.router_id,
                           'agent': entry.l3_agent_id,
                           'dead_time': grace_period}
                LOG.warn(_LW(
                    "Rescheduling router %(router)s from agent %(agent)s "
                    "because the agent did not report to the server in "
                    "the last %(dead_time)s seconds."), details)
                try:
                    self.reschedule_router(ctx, entry.router_id)
                except (l3agentscheduler.RouterReschedulingFailed,
                        n_rpc.RemoteError):
                    # Failures are handled per router so that a single
                    # broken one does not end the iteration.
                    LOG.exception(_LE("Failed to reschedule router %s"),
                                  entry.router_id)
        except db_exc.DBError:
            # DB errors are caught here so a transient connectivity issue
            # does not stop the loopingcall.
            LOG.exception(_LE("Exception encountered during router "
                              "rescheduling."))
    def reschedule_lbaas_from_down_agents(self):
        """Reschedule lbaas from down lbaas agents if admin state is up.

        An agent is treated as down when its last heartbeat is older than
        twice ``cfg.CONF.agent_down_time``.  Every loadbalancer binding
        whose agent matches that condition (and has ``admin_state_up``
        set) is passed to ``self.reschedule_loadbalancer_instance``.

        If the time elapsed since the previous invocation exceeds
        ``cfg.CONF.agent_down_time``, the method first sleeps for the dead
        limit so agents get a chance to send a heartbeat before being
        judged.

        A failure while rescheduling one loadbalancer is logged and does
        not prevent the remaining loadbalancers from being processed.
        """
        LOG.info("reschedule_lbaas_from_down_agents called.")
        # give agents extra time to handle transient failures
        agent_dead_limit = cfg.CONF.agent_down_time * 2

        # check for an abrupt clock change since last check. if a change is
        # detected, sleep for a while to let the agents check in.
        # NOTE(review): '_clock_jump_canary' is a generic attribute name; if
        # another periodic check on the same object stores its timestamp
        # under the same name, the two checks would reset each other's
        # canary - confirm against the enclosing class.
        now = timeutils.utcnow()
        tdelta = now - getattr(self, '_clock_jump_canary', now)
        if timeutils.total_seconds(tdelta) > cfg.CONF.agent_down_time:
            LOG.warning(
                _LW("Time since last LBaaS agent reschedule check has "
                    "exceeded the interval between checks. Waiting "
                    "before check to allow agents to send a heartbeat "
                    "in case there was a clock adjustment."))
            time.sleep(agent_dead_limit)
        self._clock_jump_canary = timeutils.utcnow()

        context = n_ctx.get_admin_context()
        cutoff = timeutils.utcnow() - datetime.timedelta(
            seconds=agent_dead_limit)
        down_bindings = (context.session.query(LoadbalancerAgentBinding).join(
            agents_db.Agent).filter(
                agents_db.Agent.heartbeat_timestamp < cutoff,
                agents_db.Agent.admin_state_up))
        for binding in down_bindings:
            LOG.warning(
                _LW("Rescheduling loadbalancer %(loadbalancer)s from agent %(agent)s "
                    "because the agent did not report to the server in "
                    "the last %(dead_time)s seconds."), {
                        'loadbalancer': binding.loadbalancer_id,
                        'agent': binding.agent_id,
                        'dead_time': agent_dead_limit
                    })
            try:
                self.reschedule_loadbalancer_instance(binding.loadbalancer_id)
            except Exception:
                # The concrete exception types raised by the reschedule call
                # are not visible from here, so catch broadly at this loop
                # boundary: one broken loadbalancer must not stop the
                # iteration over the remaining ones.
                LOG.exception("Failed to reschedule loadbalancer %s",
                              binding.loadbalancer_id)