def countVotes(self):
    if self.role != MasterService.RL_VOTING:
        log.warn("Tally triggered but it's not election time!")
        return

    if type(self.ballotBox) != dict or len(self.ballotBox) == 0:
        log.emerg("No vote received! There is a critical network failure.")
        self.panic(True)  # noCheck=True because role is not consistent
        return

    # Select the election winner: the highest ballot wins
    self.currentElection = None
    self.lastTallyDate = int(time.time())
    self.master = self.ballotBox[max(self.ballotBox.keys())]
    log.info("New master is %s." % (self.master))

    self._startSlave()
    if self.master == DNSCache.getInstance().name:
        log.info("I'm the new master.")
        self.role = MasterService.RL_ACTIVE
        self._startMaster()
    else:
        self.role = MasterService.RL_PASSIVE

    if self.panicRequested:
        log.warn("Engaging panic mode that was requested during the election stage.")
        self.panicRequested = False
        self.panic()
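# Note: the tally above assumes (from the lookup self.ballotBox[max(...)])
# that ballotBox is a dict mapping each ballot value to the name of the node
# that cast it, so the node holding the highest ballot wins. A minimal
# standalone sketch of that rule, with hypothetical ballot values:

ballotBox = {1804289383: "node2", 846930886: "node1", 1681692777: "node3"}

# Same selection rule as countVotes(): the highest key wins the election.
winner = ballotBox[max(ballotBox.keys())]
print(winner)  # -> node2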
def slaveWatchdogFailed(reason):
    log.emerg("Master heartbeat checks failed: %s." % (reason.getErrorMessage()))
    # Stop the slave heartbeat to tell the master we have a problem. But if we
    # are here, there is no master anymore, so we cannot ensure that panic
    # mode will be propagated. Hope that another node will trigger an
    # election... and fence me.
    self.s_slaveHb.stopService()
    log.emerg("This is an unrecoverable error: FENCE ME!")
    self.panic(True)  # noCheck because there is no master
def _sendError(self, reason):
    # Log the full traceback to see the origin of this error
    log.err("Netheartbeat failure: %s" % (reason))
    if self.retry >= self.MAX_RETRY:
        log.emerg("Too many retries. Asking master to engage panic mode.")
        # Engage panic mode
        agent = Agent()
        d = agent.panic()
        d.addErrback(log.err)
        d.addBoth(lambda x: agent.disconnect())
    else:
        log.warn("Restarting network heartbeat within a few seconds...")
        self.retry += 1  # Will be reset at each election (or panic recovery)
        reactor.callLater(2, self._run, self._proto)
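# The agent.panic() chain above follows the usual Twisted Deferred pattern:
# addErrback(log.err) logs a failure, then addBoth() runs the disconnect
# whether the call succeeded or failed. A self-contained sketch of that
# pattern, with a stubbed RPC standing in for the real Agent (an assumption,
# since the Agent class is not shown here):

from twisted.internet import defer

def panic_rpc():
    # Stand-in for agent.panic(): an already-failed Deferred, as when the
    # master is unreachable.
    return defer.fail(RuntimeError("master unreachable"))

d = panic_rpc()
d.addErrback(lambda failure: print("logged: %s" % failure.getErrorMessage()))
d.addBoth(lambda _: print("disconnect runs on success and failure alike"))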
def updateMasterStatus(self, msg):
    if self.master is None:
        self.master = msg.node
        log.info("Found master at %s." % (self.master))
    else:
        # Check the origin of the message if we know the cluster members
        if msg.node not in self.status:
            log.warn("Received master heartbeat from unknown node %s." % (msg.node))
            return

        # Active master's checks
        if self.role == MasterService.RL_ACTIVE:
            if self.master == msg.node:
                return  # Discard our own master heartbeat
            else:
                # Use case #8: partition ended with multiple masters
                log.warn("Received another master's heartbeat from %s! Trying to recover from partition..." % (msg.node))
                self.triggerElection().addErrback(log.err)

                # Propagate panic mode from the other master
                if msg.state == MasterService.ST_PANIC:
                    log.warn("Concurrent master is in panic mode, so we should be too.")
                    self.panic()
                return

        # Passive master's checks
        if self.master != msg.node:
            log.warn("Received master heartbeat from a wrong master %s!" % (msg.node))
            return

        # Check error mode change to panic
        if not self.isInPanic() and msg.state == MasterService.ST_PANIC:
            log.emerg("SYSTEM FAILURE: Panic mode has been engaged by master.")

    # Keep a backup of the active master's state and status
    self.status = msg.status
    self.state = msg.state
    self.masterLastSeen = int(time.time())
def panic(self, noCheck=False):
    """Engage panic mode.

    Use noCheck=True to engage panic mode regardless of the cluster role.
    """
    def panicFailed(reason):
        log.emerg("Panic query failed: %s." % (reason.getErrorMessage()))
        self.panic(True)

    if self.state == MasterService.ST_PANIC:
        log.emerg("Panic mode already engaged.")
    elif self.role == MasterService.RL_ACTIVE or noCheck:
        log.emerg("SYSTEM FAILURE: Panic mode engaged.")
        log.emerg("This is a critical error. You should bring your ass over here, right now.")
        log.emerg("Please check logs and be sure of what you're doing before re-engaging normal mode.")
        self.state = MasterService.ST_PANIC
        # TODO + stop LB
        if self.l_masterDog.running:
            self.l_masterDog.stop()
    elif self.role == MasterService.RL_VOTING:
        # No master during election stage: waiting for the next master
        log.warn("Panic mode requested during election stage: delaying.")
        self.panicRequested = True
    elif self.role == MasterService.RL_PASSIVE:
        log.warn("I'm slave: asking master to engage panic mode...")
        agent = Agent()
        d = agent.panic()
        d.addErrback(panicFailed)
        d.addErrback(log.err)
    else:  # RL_ALONE or RL_JOINING or RL_LEAVING
        log.warn("I'm not in a running state (master or slave). Cannot engage panic mode.")
        raise RPCRefusedError("Not in running state")
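# The panicRequested flag above ties panic() to countVotes(): a panic that
# arrives during the election stage is parked, then engaged once the tally
# elects a master. A stripped-down model of that handshake (hypothetical
# class, keeping only the two attributes involved):

class PanicLatch:
    def __init__(self):
        self.voting = True
        self.panicRequested = False

    def panic(self):
        if self.voting:
            self.panicRequested = True  # delayed: no master to act on it yet
        else:
            print("panic engaged")

    def countVotes(self):
        self.voting = False  # a master has been elected
        if self.panicRequested:
            self.panicRequested = False
            self.panic()  # engage the parked panic now

latch = PanicLatch()
latch.panic()       # during election: latched, nothing happens
latch.countVotes()  # tally done: prints "panic engaged"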
def recoverFailed(reason):
    log.emerg("Recovery failed: %s." % (reason.getErrorMessage()))
    self.panic()
def masterWatchdogFailed(reason):
    log.emerg("Slave heartbeat checks failed: %s." % (reason.getErrorMessage()))
    self.panic()
def panicFailed(reason):
    log.emerg("Panic query failed: %s." % (reason.getErrorMessage()))
    self.panic(True)