Example #1
0
File: master.py Project: nagius/cxm
	def __init__(self):
		"""
		Initialise the node as standalone (RL_ALONE) in normal state, create
		the heartbeat and RPC sub-services and the failover watchdogs, and
		reset all election bookkeeping.
		"""
		# Current role of this node, and current cluster error status
		self.role = MasterService.RL_ALONE
		self.state = MasterService.ST_NORMAL

		# Name of the active master, and timestamp used for master failover
		self.master = None
		self.masterLastSeen = 0

		# Whole cluster status
		self.status = {}

		# Local node and sub-services (order kept: the services receive self)
		self.localNode = Node(DNSCache.getInstance().name)
		self.disk = DiskHeartbeat()
		self.s_slaveHb = SlaveHearbeatService(self)
		self.s_masterHb = MasterHeartbeatService(self)
		self.s_rpc = RPCService(self)

		# Watchdogs for failover (created here, not started)
		self.l_slaveDog = task.LoopingCall(self.checkMasterHeartbeat)
		self.l_masterDog = task.LoopingCall(self.checkSlaveHeartbeats)

		# Election stuff
		# All received votes
		self.ballotBox = None
		# Election name, None if no pending election
		self.currentElection = None
		# IDelayedCall used to trigger countVotes()
		self.f_tally = None
		# Timestamp for debugging elections
		self.lastTallyDate = 0
		# True if panic is requested during election
		self.panicRequested = False
Example #2
0
File: master.py Project: nagius/cxm
class MasterService(Service):
	"""
	Service holding the cluster state of the local node.

	The role of the node (self.role) and the cluster error state
	(self.state) take their values from the RL_* and ST_* constants
	declared below; the TM_* constants are the election and failover timeouts.
	"""

	# Possible master roles (for self.role)
	RL_ACTIVE  = "active"			# Active master, aka master
	RL_PASSIVE = "passive"  		# Passive master, aka slave
	RL_JOINING = "joining"			# When trying to connect to the cluster
	RL_LEAVING = "leaving"			# When trying to leave the cluster
	RL_VOTING  = "voting"			# During election stage
	RL_ALONE   = "alone"			# Before joining

	# Possible states, aka error mode (for self.state)
	ST_NORMAL    = "normal" 		# Normal operations
	ST_RECOVERY  = "recovery"		# When a failed node is being recovered
	ST_PANIC     = "panic"			# "I don't do anything" mode

	# Elections and failover timeouts
	# NOTE: the durations quoted below assume core.cfg['TIMER'] is 3 sec;
	# TM_MASTER and TM_SLAVE scale with the configured value.
	TM_TALLY	= 1					# Records vote for 1 sec
	TM_WATCHDOG	= core.cfg['TIMER']	# Check for failure every 3 sec
	TM_MASTER	= TM_WATCHDOG*2		# Re-elect master if no response within 6 sec
	TM_SLAVE	= TM_WATCHDOG*3		# Trigger failover if no response within 9 sec (master + tally + rounding)

	def __init__(self):
		"""
		Initialise the node as standalone (RL_ALONE) in normal state, create
		the heartbeat and RPC sub-services and the failover watchdogs, and
		reset all election bookkeeping.
		"""
		# Current role of this node, and current cluster error status
		self.role = MasterService.RL_ALONE
		self.state = MasterService.ST_NORMAL

		# Name of the active master, and timestamp used for master failover
		self.master = None
		self.masterLastSeen = 0

		# Whole cluster status
		self.status = {}

		# Local node and sub-services (order kept: the services receive self)
		self.localNode = Node(DNSCache.getInstance().name)
		self.disk = DiskHeartbeat()
		self.s_slaveHb = SlaveHearbeatService(self)
		self.s_masterHb = MasterHeartbeatService(self)
		self.s_rpc = RPCService(self)

		# Watchdogs for failover (created here, not started)
		self.l_slaveDog = task.LoopingCall(self.checkMasterHeartbeat)
		self.l_masterDog = task.LoopingCall(self.checkSlaveHeartbeats)

		# Election stuff
		# All received votes
		self.ballotBox = None
		# Election name, None if no pending election
		self.currentElection = None
		# IDelayedCall used to trigger countVotes()
		self.f_tally = None
		# Timestamp for debugging elections
		self.lastTallyDate = 0
		# True if panic is requested during election
		self.panicRequested = False

	def startService(self):
		"""
		Start the service: log the daemon version, open the UDP port on
		which cluster messages are received (they are handed over to
		dispatchMessage) and schedule joinCluster() two seconds later.
		"""
		Service.startService(self)

		# Welcome message
		log.info("Starting cxmd version", meta.version)

		# Listen for cluster messages; the port is kept to be closed in stopService()
		udpPort = core.cfg['UDP_PORT']
		listener = UDPListener(self.dispatchMessage)
		self._messagePort = reactor.listenUDP(udpPort, listener)

		# Joining the cluster is delayed by 2 seconds
		reactor.callLater(2, self.joinCluster)

	def stopService(self):
		"""
		Stop the service: close the message port, stop the RPC service,
		leave the cluster and finally stop the reactor.

		Return the deferred of leaveCluster(), to which the reactor shutdown
		is chained, or an already fired deferred if the service is not running.
		"""
		# Nothing to do if the service is not running
		if not self.running:
			return defer.succeed(None)

		def shutdownReactor(_):
			log.info("Stopping daemon...")
			if not reactor._stopped:
				reactor.stop()

		Service.stopService(self)

		# Stop receiving cluster messages
		self._messagePort.stopListening()
		rpcStopping = self.s_rpc.stopService()
		rpcStopping.addErrback(log.err)

		# Cleanly leave cluster, then stop the reactor even if there are errors
		leaving = self.leaveCluster()
		leaving.addErrback(log.err)
		leaving.addBoth(shutdownReactor)
		return leaving
	
	def panic(self, noCheck=False):
		"""
		Engage panic mode, according to the current role of this node:

		 - panic already engaged: only logged, nothing else is done.
		 - active master, or noCheck=True: panic mode is engaged locally
		   and the master watchdog is stopped if it is running.
		 - voting: the request is recorded in self.panicRequested.
		 - passive: the master is asked to panic; if that query fails,
		   panic mode is forced locally with noCheck=True.
		 - any other role: RPCRefusedError is raised.

		Use noCheck=True if you want to panic whatever the cluster role.
		"""

		def forceLocalPanic(failure):
			# The master could not be asked to panic: panic locally
			log.emerg("Panic query failed: %s." % (failure.getErrorMessage()))
			self.panic(True)

		if self.state == MasterService.ST_PANIC:
			log.emerg("Panic mode already engaged.")
			return

		if self.role == MasterService.RL_ACTIVE or noCheck:
			log.emerg("SYSTEM FAILURE: Panic mode engaged.")
			log.emerg("This is a critical error. You should bring your ass over here, right now.")
			log.emerg("Please check logs and be sure of what you're doing before re-engaging normal mode.")
			self.state = MasterService.ST_PANIC

			# TODO + stop LB
			if self.l_masterDog.running:
				self.l_masterDog.stop()
			return

		if self.role == MasterService.RL_VOTING:
			# No master during election stage: waiting next master
			log.warn("Panic mode requested during election stage: delaying.")
			self.panicRequested = True
			return

		if self.role == MasterService.RL_PASSIVE:
			log.warn("I'm slave: asking master to engage panic mode...")

			query = Agent().panic()
			query.addErrback(forceLocalPanic)
			query.addErrback(log.err)
			return

		# RL_ALONE or RL_JOINING or RL_LEAVING
		log.warn("I'm not in a running state (master or slave). Cannot engage panic mode.")
		raise RPCRefusedError("Not in running state")


	# Properties accessors
	###########################################################################

	def getStatus(self):
		""" Return the whole cluster status dict (the object itself, not a copy). """
		return self.status
	
	def getState(self):
		""" Return the current cluster error state (one of the ST_* constants). """
		return self.state
	
	def getLocalNode(self):
		""" Return the Node object created in __init__ for the local host. """
		return self.localNode

	def getActiveMaster(self):
		""" Return the name of the active master (None as long as none is set). """
		return self.master
	
	def getNodesList(self):
		""" Return the keys of the cluster status dict (presumably the node names - to be confirmed). """
		return self.status.keys()

	def isActive(self):
		""" Return True if this node is the active master (role RL_ACTIVE). """
		return self.role == MasterService.RL_ACTIVE

	def isInPanic(self):
		""" Return True if panic mode is engaged (state ST_PANIC). """
		return self.state == MasterService.ST_PANIC

	# Messages handlers
	###########################################################################

	def dispatchMessage(self, data, host):
		dispatcher = {
			"slavehb" : self.updateNodeStatus,
			"masterhb" : self.updateMasterStatus,
			"voterequest" : self.voteForNewMaster,
			"voteresponse" : self.recordVote,
		}

		try:
			msg=MessageHelper.get(data, host)
			log.debugd("Received", msg)
			dispatcher[msg.type()](msg)
		except (MessageError, KeyError), e:
			log.err("Bad message from %s : %s , %s" % (host,data,e))
		except IDontCareException:
			pass # Discard useless messages