Example #1
0
    def __init__(self):
        """
        Set up this node's initial cluster state.

        Creates the heartbeat and RPC helper objects, the two failover
        watchdog LoopingCalls (created here; this method does not start
        them) and the election bookkeeping fields.
        """
        self.role = MasterService.RL_ALONE  # Current role of this node
        self.state = MasterService.ST_NORMAL  # Current cluster error status
        self.master = None  # Name of the active master
        self.masterLastSeen = 0  # Timestamp for master failover
        self.status = dict()  # Whole cluster status
        # Local node, built from the name held by the DNSCache singleton
        self.localNode = Node(DNSCache.getInstance().name)
        self.disk = DiskHeartbeat()
        # Helper services: each one is handed this instance, so the
        # attributes above are already set when they are constructed.
        self.s_slaveHb = SlaveHearbeatService(self)
        self.s_masterHb = MasterHeartbeatService(self)
        self.s_rpc = RPCService(self)

        # Watchdogs for failover
        self.l_slaveDog = task.LoopingCall(self.checkMasterHeartbeat)
        self.l_masterDog = task.LoopingCall(self.checkSlaveHeartbeats)

        # Election Stuff
        self.ballotBox = None  # All received votes
        self.currentElection = None  # Election name, None if no pending election
        self.f_tally = None  # IDelayedCall used to trigger countVotes()
        self.lastTallyDate = 0  # Timestamp for debugging elections
        self.panicRequested = False  # True if panic is requested during election
Example #2
0
File: master.py Project: nagius/cxm
	def __init__(self):
		"""
		Set up this node's initial cluster state.

		Creates the heartbeat and RPC helper objects, the two failover
		watchdog LoopingCalls (created here; this method does not start
		them) and the election bookkeeping fields.
		"""
		self.role			= MasterService.RL_ALONE		# Current role of this node
		self.state			= MasterService.ST_NORMAL		# Current cluster error status
		self.master			= None							# Name of the active master
		self.masterLastSeen	= 0								# Timestamp for master failover
		self.status			= dict()						# Whole cluster status
		# Local node, built from the name held by the DNSCache singleton
		self.localNode		= Node(DNSCache.getInstance().name)
		self.disk			= DiskHeartbeat()
		# Helper services: each one is handed this instance, so the
		# attributes above are already set when they are constructed.
		self.s_slaveHb		= SlaveHearbeatService(self)
		self.s_masterHb		= MasterHeartbeatService(self)
		self.s_rpc			= RPCService(self) 

		# Watchdogs for failover
		self.l_slaveDog		= task.LoopingCall(self.checkMasterHeartbeat)
		self.l_masterDog	= task.LoopingCall(self.checkSlaveHeartbeats)

		# Election Stuff
		self.ballotBox 			= None		# All received votes
		self.currentElection	= None		# Election name, None if no pending election
		self.f_tally			= None		# IDelayedCall used to trigger countVotes()
		self.lastTallyDate		= 0			# Timestamp for debugging elections
		self.panicRequested		= False		# True if panic is requested during election
Example #3
0
#!/usr/bin/env python

import logging
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util
import handlers

from rpc import RPCService
from namespaces import *

# Instantiate RPCService with (public name, namespace class) pairs. The
# instance is not bound to a name, so the call is made for its side effects;
# presumably it registers the namespaces for RPC dispatch -- TODO confirm.
# NOTE(review): `info`, `person` and `session` are not imported explicitly;
# they are expected to come from `from namespaces import *` -- confirm that
# the package really exports them.
RPCService([("info", info.InfoNamespace), ("person", person.PersonNamespace),
            ("session", session.SessionNamespace)])

# Module-level WSGI application and its routing table. The RPC route
# captures two purely alphabetic path segments and passes them to
# handlers.RPCHandler.
# NOTE(review): debug=True is hard-coded -- confirm this is intended outside
# of development.
application = webapp.WSGIApplication(
    [('/', handlers.MainHandler), ('/test', handlers.RPCFormHandler),
     (r'/rpc/([a-zA-Z]+)/([a-zA-Z]+)', handlers.RPCHandler)],
    debug=True)


def main():
    """Serve the module-level WSGI `application` through util.run_wsgi_app."""
    util.run_wsgi_app(application)


# Only run the application when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
Example #4
0
File: master.py Project: nagius/cxm
class MasterService(Service):
	"""
	Cluster membership service of a cxmd node.

	Keeps track of this node's role and of the cluster error state, listens
	for cluster messages on UDP and dispatches them to their handlers, and
	provides the panic-mode entry point.

	The watchdog callbacks, the message handlers and joinCluster() /
	leaveCluster() referenced by the methods below are expected to be
	provided elsewhere on this class.
	"""

	# Possible master roles (for self.role)
	RL_ACTIVE  = "active"			# Active master, aka master
	RL_PASSIVE = "passive"  		# Passive master, aka slave
	RL_JOINING = "joining"			# When trying to connect to the cluster
	RL_LEAVING = "leaving"			# When trying to leave the cluster
	RL_VOTING  = "voting"			# During election stage
	RL_ALONE   = "alone"			# Before joining

	# Possible states, aka error mode (for self.state)
	ST_NORMAL    = "normal" 		# Normal operations
	ST_RECOVERY  = "recovery"		# When a failed node is being recovered
	ST_PANIC     = "panic"			# "I don't do anything" mode

	# Elections and failover timeouts.
	# TM_WATCHDOG comes from the configuration; the durations quoted in the
	# comments below assume TIMER is 3 seconds.
	TM_TALLY	= 1					# Records vote for 1 sec
	TM_WATCHDOG	= core.cfg['TIMER']	# Check for failure every TIMER sec (3 sec)
	TM_MASTER	= TM_WATCHDOG*2		# Re-elect master if no response within 6 sec
	TM_SLAVE	= TM_WATCHDOG*3		# Trigger failover if no response within 9 sec (master + tally + rounding)

	def __init__(self):
		"""
		Set up this node's initial cluster state.

		Creates the heartbeat and RPC helper objects, the two failover
		watchdog LoopingCalls (created here; this method does not start
		them) and the election bookkeeping fields.
		"""
		self.role			= MasterService.RL_ALONE		# Current role of this node
		self.state			= MasterService.ST_NORMAL		# Current cluster error status
		self.master			= None							# Name of the active master
		self.masterLastSeen	= 0								# Timestamp for master failover
		self.status			= dict()						# Whole cluster status
		self.localNode		= Node(DNSCache.getInstance().name)
		self.disk			= DiskHeartbeat()
		# Helper services: each one is handed this instance, so the
		# attributes above are already set when they are constructed.
		self.s_slaveHb		= SlaveHearbeatService(self)
		self.s_masterHb		= MasterHeartbeatService(self)
		self.s_rpc			= RPCService(self)

		# Watchdogs for failover
		self.l_slaveDog		= task.LoopingCall(self.checkMasterHeartbeat)
		self.l_masterDog	= task.LoopingCall(self.checkSlaveHeartbeats)

		# Election Stuff
		self.ballotBox 			= None		# All received votes
		self.currentElection	= None		# Election name, None if no pending election
		self.f_tally			= None		# IDelayedCall used to trigger countVotes()
		self.lastTallyDate		= 0			# Timestamp for debugging elections
		self.panicRequested		= False		# True if panic is requested during election

	def startService(self):
		"""
		Start the service.

		Logs the daemon version, opens the UDP port on which cluster
		messages are received (every datagram goes to dispatchMessage) and
		schedules joinCluster() to run 2 seconds later.
		"""
		Service.startService(self)

		# Print welcome message
		log.info("Starting cxmd version", meta.version)

		self._messagePort=reactor.listenUDP(core.cfg['UDP_PORT'], UDPListener(self.dispatchMessage))
		reactor.callLater(2, self.joinCluster)

	def stopService(self):
		"""
		Stop the service and shut the daemon down.

		Stops listening for cluster messages, stops the RPC service, then
		leaves the cluster. The reactor is stopped once the leave attempt
		has finished, whether it succeeded or not.

		Returns the Deferred of the leave operation, or an already-fired
		Deferred when the service is not running.
		"""
		def stopReactor(result):
			log.info("Stopping daemon...")
			# reactor.stop() must not be called twice
			if not reactor._stopped:
				reactor.stop()

		if self.running:
			Service.stopService(self)

			# Stop receiving cluster messages
			self._messagePort.stopListening()
			self.s_rpc.stopService().addErrback(log.err)

			# Cleanly leave cluster
			d = self.leaveCluster()
			d.addErrback(log.err)
			d.addBoth(stopReactor) # Even if there are errors
			return d
		else:
			return defer.succeed(None)

	def panic(self, noCheck=False):
		"""
		Engage panic mode.
		Use noCheck=True if you want to panic whatever the cluster role.

		Depending on the current role:
		  - active master (or noCheck): switch to ST_PANIC and stop the
		    master watchdog;
		  - voting: remember the request in self.panicRequested;
		  - passive: forward the request through an Agent, and panic
		    locally (noCheck) if that request fails;
		  - any other role: refuse.

		Raises RPCRefusedError when the node is neither master, slave nor
		voting and noCheck is not set.
		"""

		def panicFailed(reason):
			# The master could not be asked: force panic mode locally
			log.emerg("Panic query failed: %s." % (reason.getErrorMessage()))
			self.panic(True)

		if self.state == MasterService.ST_PANIC:
			log.emerg("Panic mode already engaged.")

		elif self.role == MasterService.RL_ACTIVE or noCheck:
			log.emerg("SYSTEM FAILURE: Panic mode engaged.")
			log.emerg("This is a critical error. You should bring your ass over here, right now.")
			log.emerg("Please check logs and be sure of what you're doing before re-engaging normal mode.")
			self.state=MasterService.ST_PANIC

			# TODO + stop LB
			if self.l_masterDog.running:
				self.l_masterDog.stop()

		elif self.role == MasterService.RL_VOTING:
			# No master during election stage: waiting next master
			log.warn("Panic mode requested during election stage: delaying.")
			self.panicRequested=True

		elif self.role == MasterService.RL_PASSIVE:
			log.warn("I'm slave: asking master to engage panic mode...")

			agent=Agent()
			d=agent.panic()
			d.addErrback(panicFailed)
			d.addErrback(log.err)

		else: # RL_ALONE or RL_JOINING or RL_LEAVING
			log.warn("I'm not in a running state (master or slave). Cannot engage panic mode.")
			raise RPCRefusedError("Not in running state")


	# Properties accessors
	###########################################################################

	def getStatus(self):
		"""Return the whole cluster status mapping (self.status)."""
		return self.status

	def getState(self):
		"""Return the current error state (one of the ST_* constants)."""
		return self.state

	def getLocalNode(self):
		"""Return the Node object created for the local host."""
		return self.localNode

	def getActiveMaster(self):
		"""Return the name of the active master, or None if not set."""
		return self.master

	def getNodesList(self):
		"""Return the keys of the cluster status mapping."""
		return self.status.keys()

	def isActive(self):
		"""Return True if this node is the active master."""
		return self.role == MasterService.RL_ACTIVE

	def isInPanic(self):
		"""Return True if panic mode is engaged."""
		return self.state == MasterService.ST_PANIC

	# Messages handlers
	###########################################################################

	def dispatchMessage(self, data, host):
		"""
		Decode a received cluster message and pass it to its handler.

		data and host are passed unchanged to MessageHelper.get(); the
		handler is chosen from the message type.

		A MessageError or KeyError (unknown message type, but also a
		KeyError raised by the handler itself) is logged as a bad message.
		IDontCareException is silently discarded.
		"""
		dispatcher = {
			"slavehb" : self.updateNodeStatus,
			"masterhb" : self.updateMasterStatus,
			"voterequest" : self.voteForNewMaster,
			"voteresponse" : self.recordVote,
		}

		try:
			msg=MessageHelper.get(data, host)
			log.debugd("Received", msg)
			dispatcher[msg.type()](msg)
		except (MessageError, KeyError) as e:
			# "as" form: valid on Python 2.6+ and Python 3
			log.err("Bad message from %s : %s , %s" % (host,data,e))
		except IDontCareException:
			pass # Discard useless messages
Example #5
0
class MasterService(Service):
    """
    Cluster membership service of a cxmd node.

    Keeps track of this node's role and of the cluster error state, listens
    for cluster messages on UDP and dispatches them to their handlers, and
    provides the panic-mode entry point.

    The watchdog callbacks, the message handlers and joinCluster() /
    leaveCluster() referenced by the methods below are expected to be
    provided elsewhere on this class.
    """

    # Possible master roles (for self.role)
    RL_ACTIVE = "active"  # Active master, aka master
    RL_PASSIVE = "passive"  # Passive master, aka slave
    RL_JOINING = "joining"  # When trying to connect to the cluster
    RL_LEAVING = "leaving"  # When trying to leave the cluster
    RL_VOTING = "voting"  # During election stage
    RL_ALONE = "alone"  # Before joining

    # Possible states, aka error mode (for self.state)
    ST_NORMAL = "normal"  # Normal operations
    ST_RECOVERY = "recovery"  # When a failed node is being recovered
    ST_PANIC = "panic"  # "I don't do anything" mode

    # Elections and failover timeouts.
    # TM_WATCHDOG comes from the configuration; the durations quoted in the
    # comments below assume TIMER is 3 seconds.
    TM_TALLY = 1  # Records vote for 1 sec
    TM_WATCHDOG = core.cfg['TIMER']  # Check for failure every TIMER sec (3 sec)
    TM_MASTER = TM_WATCHDOG * 2  # Re-elect master if no response within 6 sec
    TM_SLAVE = TM_WATCHDOG * 3  # Trigger failover if no response within 9 sec (master + tally + rounding)

    def __init__(self):
        """
        Set up this node's initial cluster state.

        Creates the heartbeat and RPC helper objects, the two failover
        watchdog LoopingCalls (created here; this method does not start
        them) and the election bookkeeping fields.
        """
        self.role = MasterService.RL_ALONE  # Current role of this node
        self.state = MasterService.ST_NORMAL  # Current cluster error status
        self.master = None  # Name of the active master
        self.masterLastSeen = 0  # Timestamp for master failover
        self.status = dict()  # Whole cluster status
        self.localNode = Node(DNSCache.getInstance().name)
        self.disk = DiskHeartbeat()
        # Helper services: each one is handed this instance, so the
        # attributes above are already set when they are constructed.
        self.s_slaveHb = SlaveHearbeatService(self)
        self.s_masterHb = MasterHeartbeatService(self)
        self.s_rpc = RPCService(self)

        # Watchdogs for failover
        self.l_slaveDog = task.LoopingCall(self.checkMasterHeartbeat)
        self.l_masterDog = task.LoopingCall(self.checkSlaveHeartbeats)

        # Election Stuff
        self.ballotBox = None  # All received votes
        self.currentElection = None  # Election name, None if no pending election
        self.f_tally = None  # IDelayedCall used to trigger countVotes()
        self.lastTallyDate = 0  # Timestamp for debugging elections
        self.panicRequested = False  # True if panic is requested during election

    def startService(self):
        """
        Start the service.

        Logs the daemon version, opens the UDP port on which cluster
        messages are received (every datagram goes to dispatchMessage) and
        schedules joinCluster() to run 2 seconds later.
        """
        Service.startService(self)

        # Print welcome message
        log.info("Starting cxmd version", meta.version)

        self._messagePort = reactor.listenUDP(
            core.cfg['UDP_PORT'], UDPListener(self.dispatchMessage))
        reactor.callLater(2, self.joinCluster)

    def stopService(self):
        """
        Stop the service and shut the daemon down.

        Stops listening for cluster messages, stops the RPC service, then
        leaves the cluster. The reactor is stopped once the leave attempt
        has finished, whether it succeeded or not.

        Returns the Deferred of the leave operation, or an already-fired
        Deferred when the service is not running.
        """
        def stopReactor(result):
            log.info("Stopping daemon...")
            # reactor.stop() must not be called twice
            if not reactor._stopped:
                reactor.stop()

        if self.running:
            Service.stopService(self)

            # Stop receiving cluster messages
            self._messagePort.stopListening()
            self.s_rpc.stopService().addErrback(log.err)

            # Cleanly leave cluster
            d = self.leaveCluster()
            d.addErrback(log.err)
            d.addBoth(stopReactor)  # Even if there are errors
            return d
        else:
            return defer.succeed(None)

    def panic(self, noCheck=False):
        """
        Engage panic mode.
        Use noCheck=True if you want to panic whatever the cluster role.

        Depending on the current role:
          - active master (or noCheck): switch to ST_PANIC and stop the
            master watchdog;
          - voting: remember the request in self.panicRequested;
          - passive: forward the request through an Agent, and panic
            locally (noCheck) if that request fails;
          - any other role: refuse.

        Raises RPCRefusedError when the node is neither master, slave nor
        voting and noCheck is not set.
        """
        def panicFailed(reason):
            # The master could not be asked: force panic mode locally
            log.emerg("Panic query failed: %s." % (reason.getErrorMessage()))
            self.panic(True)

        if self.state == MasterService.ST_PANIC:
            log.emerg("Panic mode already engaged.")

        elif self.role == MasterService.RL_ACTIVE or noCheck:
            log.emerg("SYSTEM FAILURE: Panic mode engaged.")
            log.emerg(
                "This is a critical error. You should bring your ass over here, right now."
            )
            log.emerg(
                "Please check logs and be sure of what you're doing before re-engaging normal mode."
            )
            self.state = MasterService.ST_PANIC

            # TODO + stop LB
            if self.l_masterDog.running:
                self.l_masterDog.stop()

        elif self.role == MasterService.RL_VOTING:
            # No master during election stage: waiting next master
            log.warn("Panic mode requested during election stage: delaying.")
            self.panicRequested = True

        elif self.role == MasterService.RL_PASSIVE:
            log.warn("I'm slave: asking master to engage panic mode...")

            agent = Agent()
            d = agent.panic()
            d.addErrback(panicFailed)
            d.addErrback(log.err)

        else:  # RL_ALONE or RL_JOINING or RL_LEAVING
            log.warn(
                "I'm not in a running state (master or slave). Cannot engage panic mode."
            )
            raise RPCRefusedError("Not in running state")

    # Properties accessors
    ###########################################################################

    def getStatus(self):
        """Return the whole cluster status mapping (self.status)."""
        return self.status

    def getState(self):
        """Return the current error state (one of the ST_* constants)."""
        return self.state

    def getLocalNode(self):
        """Return the Node object created for the local host."""
        return self.localNode

    def getActiveMaster(self):
        """Return the name of the active master, or None if not set."""
        return self.master

    def getNodesList(self):
        """Return the keys of the cluster status mapping."""
        return self.status.keys()

    def isActive(self):
        """Return True if this node is the active master."""
        return self.role == MasterService.RL_ACTIVE

    def isInPanic(self):
        """Return True if panic mode is engaged."""
        return self.state == MasterService.ST_PANIC

    # Messages handlers
    ###########################################################################

    def dispatchMessage(self, data, host):
        """
        Decode a received cluster message and pass it to its handler.

        data and host are passed unchanged to MessageHelper.get(); the
        handler is chosen from the message type.

        A MessageError or KeyError (unknown message type, but also a
        KeyError raised by the handler itself) is logged as a bad message.
        IDontCareException is silently discarded.
        """
        dispatcher = {
            "slavehb": self.updateNodeStatus,
            "masterhb": self.updateMasterStatus,
            "voterequest": self.voteForNewMaster,
            "voteresponse": self.recordVote,
        }

        try:
            msg = MessageHelper.get(data, host)
            log.debugd("Received", msg)
            dispatcher[msg.type()](msg)
        except (MessageError, KeyError) as e:
            # "as" form: valid on Python 2.6+ and Python 3
            log.err("Bad message from %s : %s , %s" % (host, data, e))
        except IDontCareException:
            pass  # Discard useless messages