Esempio n. 1
0
File: master.py Progetto: nagius/cxm
	def countVotes(self):
		"""
		Tally the ballot box and switch this node to the resulting role.

		Does nothing (apart from a warning) when the node is not in the voting
		role. If the ballot box is not a dict or holds no vote, panic mode is
		forced with noCheck=True. Otherwise the new master is the value stored
		under the highest key of the ballot box, _startSlave() is called, and
		the role becomes active when the winner matches the local name given
		by DNSCache, passive otherwise. A panic request postponed during the
		election is executed last.
		"""
		if self.role != MasterService.RL_VOTING:
			log.warn("Tally triggered but it's not election time !")
			return

		# isinstance() also accepts dict subclasses; an empty dict is falsy
		if not isinstance(self.ballotBox, dict) or not self.ballotBox:
			log.emerg("No vote received ! There is a critical network failure.")
			self.panic(True) # noCheck=True because role is not consistent
			return

		# Select election winner: the entry stored under the highest ballot key
		self.currentElection=None
		self.lastTallyDate=int(time.time())
		self.master=self.ballotBox[max(self.ballotBox)]
		log.info("New master is %s." % (self.master))
		self._startSlave()

		if self.master == DNSCache.getInstance().name:
			log.info("I'm the new master.")
			self.role=MasterService.RL_ACTIVE
			self._startMaster()
		else:
			self.role=MasterService.RL_PASSIVE

		# A panic asked while voting was only recorded: run it now that a role is set
		if self.panicRequested:
			log.warn("Engaging panic mode requested during election stage.")
			self.panicRequested=False
			self.panic()
Esempio n. 2
0
    def countVotes(self):
        """
        Tally the ballot box and switch this node to the resulting role.

        Does nothing (apart from a warning) when the node is not in the voting
        role. If the ballot box is not a dict or holds no vote, panic mode is
        forced with noCheck=True. Otherwise the new master is the value stored
        under the highest key of the ballot box, _startSlave() is called, and
        the role becomes active when the winner matches the local name given
        by DNSCache, passive otherwise. A panic request postponed during the
        election is executed last.
        """
        if self.role != MasterService.RL_VOTING:
            log.warn("Tally triggered but it's not election time !")
            return

        # isinstance() also accepts dict subclasses; an empty dict is falsy
        if not isinstance(self.ballotBox, dict) or not self.ballotBox:
            log.emerg(
                "No vote received ! There is a critical network failure.")
            self.panic(True)  # noCheck=True because role is not consistent
            return

        # Select election winner: the entry stored under the highest ballot key
        self.currentElection = None
        self.lastTallyDate = int(time.time())
        self.master = self.ballotBox[max(self.ballotBox)]
        log.info("New master is %s." % (self.master))
        self._startSlave()

        if self.master == DNSCache.getInstance().name:
            log.info("I'm the new master.")
            self.role = MasterService.RL_ACTIVE
            self._startMaster()
        else:
            self.role = MasterService.RL_PASSIVE

        # A panic asked while voting was only recorded: run it now that a role is set
        if self.panicRequested:
            log.warn("Engaging panic mode requested during election stage.")
            self.panicRequested = False
            self.panic()
Esempio n. 3
0
File: master.py Progetto: nagius/cxm
		def slaveWatchdogFailed(reason):
			"""Handle a failed master heartbeat check: stop the slave heartbeat and force panic."""
			failureText=reason.getErrorMessage()
			log.emerg("Master Heartbeat checks failed: %s." % (failureText))

			# Stopping our own heartbeat is the way to signal a problem to the master.
			# Reaching this point means the master is gone, so nothing guarantees
			# that panic mode will spread to the other nodes: the hope is that
			# another node starts an election... and fences this one.
			self.s_slaveHb.stopService()

			log.emerg("This is an unrecoverable error: FENCE ME !")
			# Role checks are skipped because there is no master anymore
			self.panic(noCheck=True)
Esempio n. 4
0
 def slaveWatchdogFailed(reason):
     """Handle a failed master heartbeat check: stop the slave heartbeat and force panic."""
     failure_text = reason.getErrorMessage()
     log.emerg("Master Heartbeat checks failed: %s." % (failure_text))

     # Stopping our own heartbeat is the way to signal a problem to the master.
     # Reaching this point means the master is gone, so nothing guarantees
     # that panic mode will spread to the other nodes: the hope is that
     # another node starts an election... and fences this one.
     self.s_slaveHb.stopService()

     log.emerg("This is an unrecoverable error: FENCE ME !")
     # Role checks are skipped because there is no master anymore
     self.panic(noCheck=True)
Esempio n. 5
0
	def _sendError(self, reason):
		"""
		React to a network heartbeat failure.

		Once the retry counter has reached MAX_RETRY, ask for panic mode
		through an Agent. Before that, increment the counter and schedule
		a new run of the heartbeat two seconds later.
		"""
		# The whole failure is logged so that the origin of the error can be traced
		log.err("Netheartbeat failure: %s" % (reason))

		if self.retry >= self.MAX_RETRY:
			log.emerg("Too many retry. Asking master to engage panic mode.")
			# Engage panic mode, then drop the agent whatever the outcome
			rpc=Agent()
			query=rpc.panic()
			query.addErrback(log.err)
			query.addBoth(lambda _: rpc.disconnect())
			return

		log.warn("Restarting network heartbeat within a few seconds...")
		# The counter is reset at each election (or panic recovery)
		self.retry+=1
		reactor.callLater(2, self._run, self._proto)
Esempio n. 6
0
 def _sendError(self, reason):
     """
     React to a network heartbeat failure.

     Once the retry counter has reached MAX_RETRY, ask for panic mode
     through an Agent. Before that, increment the counter and schedule
     a new run of the heartbeat two seconds later.
     """
     # The whole failure is logged so that the origin of the error can be traced
     log.err("Netheartbeat failure: %s" % (reason))

     if self.retry >= self.MAX_RETRY:
         log.emerg("Too many retry. Asking master to engage panic mode.")
         # Engage panic mode, then drop the agent whatever the outcome
         rpc = Agent()
         query = rpc.panic()
         query.addErrback(log.err)
         query.addBoth(lambda _: rpc.disconnect())
         return

     log.warn("Restarting network heartbeat within a few seconds...")
     # The counter is reset at each election (or panic recovery)
     self.retry += 1
     reactor.callLater(2, self._run, self._proto)
Esempio n. 7
0
    def updateMasterStatus(self, msg):
        """
        Process a master heartbeat message.

        The first heartbeat seen while no master is known defines the master.
        Afterwards, heartbeats from nodes missing from self.status are dropped.
        An active node discards its own heartbeat and triggers an election
        when it hears another master (entering panic too if that master is in
        panic state). Otherwise the message must come from the known master;
        its status and state are then saved together with the reception time.
        """
        sender = msg.node

        if self.master is None:
            self.master = sender
            log.info("Found master at %s." % (self.master))
        elif sender not in self.status:
            # The cluster members are known: reject messages from outsiders
            log.warn("Received master heartbeat from unknown node %s." %
                     (sender))
            return

        # Checks done when this node is the active master
        if self.role == MasterService.RL_ACTIVE:
            if self.master == sender:
                return  # Our own master heartbeat: nothing to do

            # Usecase #8: partition ended with many master
            log.warn(
                "Received another master's heartbeat from %s ! Trying to recover from partition..."
                % (sender))
            self.triggerElection().addErrback(log.err)

            # Follow the concurrent master into panic mode
            if msg.state == MasterService.ST_PANIC:
                log.warn(
                    "Concurrent master is in panic mode, so we should be too."
                )
                self.panic()
            return

        # Checks done when this node is not the active master
        if self.master != sender:
            log.warn("Received master heartbeat from a wrong master %s !" %
                     (sender))
            return

        # Report the switch to panic mode decided by the master
        if not self.isInPanic() and msg.state == MasterService.ST_PANIC:
            log.emerg("SYSTEM FAILURE: Panic mode has been engaged by master.")

        # Mirror the active master's status and state, and note when it was seen
        self.status, self.state = msg.status, msg.state
        self.masterLastSeen = int(time.time())
Esempio n. 8
0
File: master.py Progetto: nagius/cxm
	def updateMasterStatus(self, msg):
		"""
		Process a master heartbeat message.

		The first heartbeat seen while no master is known defines the master.
		Afterwards, heartbeats from nodes missing from self.status are dropped.
		An active node discards its own heartbeat and triggers an election
		when it hears another master (entering panic too if that master is in
		panic state). Otherwise the message must come from the known master;
		its status and state are then saved together with the reception time.
		"""
		sender=msg.node

		if self.master is None:
			self.master=sender
			log.info("Found master at %s." % (self.master))
		elif sender not in self.status:
			# The cluster members are known: reject messages from outsiders
			log.warn("Received master heartbeat from unknown node %s." % (sender))
			return

		# Checks done when this node is the active master
		if self.role == MasterService.RL_ACTIVE:
			if self.master == sender:
				return		# Our own master heartbeat: nothing to do

			# Usecase #8: partition ended with many master
			log.warn("Received another master's heartbeat from %s ! Trying to recover from partition..." % (sender))
			self.triggerElection().addErrback(log.err)

			# Follow the concurrent master into panic mode
			if msg.state == MasterService.ST_PANIC:
				log.warn("Concurrent master is in panic mode, so we should be too.")
				self.panic()
			return

		# Checks done when this node is not the active master
		if self.master != sender:
			log.warn("Received master heartbeat from a wrong master %s !" % (sender))
			return

		# Report the switch to panic mode decided by the master
		if not self.isInPanic() and msg.state == MasterService.ST_PANIC:
			log.emerg("SYSTEM FAILURE: Panic mode has been engaged by master.")

		# Mirror the active master's status and state, and note when it was seen
		self.status, self.state=msg.status, msg.state
		self.masterLastSeen=int(time.time())
Esempio n. 9
0
    def panic(self, noCheck=False):
        """
        Engage panic mode.
        Use noCheck=True if you want to panic whatever the cluster role.

        Behaviour depends on the current state and role:
        - state already panic: only log it;
        - active role, or noCheck=True: set the state to panic and stop
          l_masterDog if it is running;
        - voting role: record the request in panicRequested;
        - passive role: ask the master to panic through an Agent, falling
          back to a local panic(True) if that query fails;
        - any other role: raise RPCRefusedError.
        """
        def panicFailed(reason):
            # The master could not be reached: panic locally, ignoring the role
            log.emerg("Panic query failed: %s." % (reason.getErrorMessage()))
            self.panic(True)

        if self.state == MasterService.ST_PANIC:
            log.emerg("Panic mode already engaged.")

        elif self.role == MasterService.RL_ACTIVE or noCheck:
            log.emerg("SYSTEM FAILURE: Panic mode engaged.")
            log.emerg(
                "This is a critical error. You should bring your ass over here, right now."
            )
            log.emerg(
                "Please check logs and be sure of what you're doing before re-engaging normal mode."
            )
            self.state = MasterService.ST_PANIC

            # TODO + stop LB
            if self.l_masterDog.running:
                self.l_masterDog.stop()

        elif self.role == MasterService.RL_VOTING:
            # No master during election stage: waiting next master
            log.warn("Panic mode requested during election stage: delaying.")
            self.panicRequested = True

        elif self.role == MasterService.RL_PASSIVE:
            log.warn("I'm slave: asking master to engage panic mode...")

            agent = Agent()
            d = agent.panic()
            d.addErrback(panicFailed)
            d.addErrback(log.err)
            # Always release the agent once the query is over, whatever the outcome
            d.addBoth(lambda _: agent.disconnect())

        else:  # RL_ALONE or RL_JOINING or RL_LEAVING
            log.warn(
                "I'm not in a running state (master or slave). Cannot engage panic mode."
            )
            raise RPCRefusedError("Not in running state")
Esempio n. 10
0
File: master.py Progetto: nagius/cxm
	def panic(self, noCheck=False):
		"""
		Engage panic mode.
		Use noCheck=True if you want to panic whatever the cluster role.

		Behaviour depends on the current state and role:
		- state already panic: only log it;
		- active role, or noCheck=True: set the state to panic and stop
		  l_masterDog if it is running;
		- voting role: record the request in panicRequested;
		- passive role: ask the master to panic through an Agent, falling
		  back to a local panic(True) if that query fails;
		- any other role: raise RPCRefusedError.
		"""

		def panicFailed(reason):
			# The master could not be reached: panic locally, ignoring the role
			log.emerg("Panic query failed: %s." % (reason.getErrorMessage()))
			self.panic(True)

		if self.state == MasterService.ST_PANIC:
			log.emerg("Panic mode already engaged.")

		elif self.role == MasterService.RL_ACTIVE or noCheck:
			log.emerg("SYSTEM FAILURE: Panic mode engaged.")
			log.emerg("This is a critical error. You should bring your ass over here, right now.")
			log.emerg("Please check logs and be sure of what you're doing before re-engaging normal mode.")
			self.state=MasterService.ST_PANIC

			# TODO + stop LB
			if self.l_masterDog.running:
				self.l_masterDog.stop()

		elif self.role == MasterService.RL_VOTING:
			# No master during election stage: waiting next master
			log.warn("Panic mode requested during election stage: delaying.")
			self.panicRequested=True

		elif self.role == MasterService.RL_PASSIVE:
			log.warn("I'm slave: asking master to engage panic mode...")

			agent=Agent()
			d=agent.panic()
			d.addErrback(panicFailed)
			d.addErrback(log.err)
			# Always release the agent once the query is over, whatever the outcome
			d.addBoth(lambda _: agent.disconnect())

		else: # RL_ALONE or RL_JOINING or RL_LEAVING
			log.warn("I'm not in a running state (master or slave). Cannot engage panic mode.")
			raise RPCRefusedError("Not in running state")
Esempio n. 11
0
File: master.py Progetto: nagius/cxm
		def recoverFailed(reason):
			"""Handle a failed recovery: log the error message and request panic mode."""
			# Build one formatted string instead of passing the message as a second argument
			log.emerg("Recovery failed: %s." % (reason.getErrorMessage()))
			self.panic()
Esempio n. 12
0
File: master.py Progetto: nagius/cxm
		def masterWatchdogFailed(reason):
			"""Handle a failed slave heartbeat check: log the error message and request panic mode."""
			failureText=reason.getErrorMessage()
			log.emerg("Slave heartbeat checks failed: %s." % (failureText))
			# Default panic(): the role decides how panic mode is engaged
			self.panic()
Esempio n. 13
0
File: master.py Progetto: nagius/cxm
		def panicFailed(reason):
			"""Handle a failed panic query: log the error message and panic whatever the role."""
			failureText=reason.getErrorMessage()
			log.emerg("Panic query failed: %s." % (failureText))
			# Role checks are skipped since the panic query itself failed
			self.panic(noCheck=True)
Esempio n. 14
0
 def recoverFailed(reason):
     """Handle a failed recovery: log the error message and request panic mode."""
     # Build one formatted string instead of passing the message as a second argument
     log.emerg("Recovery failed: %s." % (reason.getErrorMessage()))
     self.panic()
Esempio n. 15
0
 def masterWatchdogFailed(reason):
     """Handle a failed slave heartbeat check: log the error message and request panic mode."""
     failure_text = reason.getErrorMessage()
     log.emerg("Slave heartbeat checks failed: %s." % (failure_text))
     # Default panic(): the role decides how panic mode is engaged
     self.panic()
Esempio n. 16
0
 def panicFailed(reason):
     """Handle a failed panic query: log the error message and panic whatever the role."""
     failure_text = reason.getErrorMessage()
     log.emerg("Panic query failed: %s." % (failure_text))
     # Role checks are skipped since the panic query itself failed
     self.panic(noCheck=True)