Example #1
0
File: master.py Project: nagius/cxm
	def joinCluster(self):
		"""
		Join the cluster, creating it if no master is known.

		When self.master is None, this node becomes the active master: it
		must be listed in ALLOWED_NODES and the heartbeat disk must not be
		in use, otherwise startup fails. When a master is known, the role is
		set to RL_JOINING and a "register" request is sent to that master
		over a PB connection; the role becomes RL_PASSIVE once accepted.

		Exceptions raised synchronously during startup are logged and the
		service is stopped.
		"""

		def startHeartbeats():
			# Started whatever the role of this node
			self._startSlave()
			self.s_rpc.startService()

			# Only the active master starts the master side
			if self.role == MasterService.RL_ACTIVE:
				self._startMaster() 

		def joinRefused(reason):
			# Only refusals are handled here: trap() re-raises any other
			# failure, which then reaches the next errback of the chain.
			reason.trap(NodeRefusedError, RPCRefusedError)
			log.err("Join to cluster %s failed: Master %s has refused me: %s" % 
				(core.cfg['CLUSTER_NAME'], self.master, reason.getErrorMessage()))
			self.stopService()

		def joinAccepted(result):
			# The result of the remote call is not used
			self.role=MasterService.RL_PASSIVE
			log.info("Join successfull, I'm now part of cluster %s." % (core.cfg['CLUSTER_NAME']))
			startHeartbeats()
			
		def masterConnected(obj):
			# obj is the root object handed over by factory.getRootObject()
			d = obj.callRemote("register",DNSCache.getInstance().name)
			d.addCallbacks(joinAccepted,joinRefused)
			d.addErrback(log.err)
			# rpcConnector is assigned in the "Passive master" branch below,
			# before this callback is attached. Disconnect in every case.
			d.addBoth(lambda _: rpcConnector.disconnect())
			return d

		try:
			if self.master is None:
				# New active master
				if DNSCache.getInstance().name not in core.cfg['ALLOWED_NODES']:
					log.warn("I'm not allowed to create a new cluster. Exiting.")
					raise Exception("Cluster creation not allowed")

				if DiskHeartbeat.is_in_use():
					log.err("Heartbeat disk is in use but we are alone !")
					raise Exception("Heartbeat disk already in use")

				log.info("No master found. I'm now the new master of %s." % (core.cfg['CLUSTER_NAME']))
				self.role=MasterService.RL_ACTIVE
				self.master=DNSCache.getInstance().name
				# Initial status record of the master itself
				self.status[self.master]={'timestamp': 0, 'offset': 0, 'vms': []}
				self.disk.make_slot(DNSCache.getInstance().name)
				startHeartbeats()

			else:
				# Passive master
				self.role=MasterService.RL_JOINING
				log.info("Trying to join cluster %s..." % (core.cfg['CLUSTER_NAME']))

				factory = pb.PBClientFactory()
				rpcConnector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'], factory)
				d = factory.getRootObject()
				d.addCallback(masterConnected)
				# NOTE(review): a failure to reach the master is only logged
				# here and the role is left at RL_JOINING - confirm intended.
				d.addErrback(log.err)
		except Exception, e:
			# Covers the two explicit raises above as well as any other
			# synchronous error of the startup sequence.
			log.err("Startup failed: %s. Shutting down." % (e))
			self.stopService()
Example #2
0
File: master.py Project: nagius/cxm
	def countVotes(self):
		"""
		Tally the votes of the current election and take the resulting role.

		Only a warning is logged when the node is not in RL_VOTING. If the
		ballot box is missing or empty, panic mode is engaged instead of
		electing anybody. Otherwise the entry stored under the highest ballot
		key designates the new master, and this node becomes RL_ACTIVE or
		RL_PASSIVE accordingly. A panic requested while the election was
		running is engaged once the election is over.
		"""
		if self.role != MasterService.RL_VOTING:
			log.warn("Tally triggered but it's not election time !")
			return

		# isinstance() also accepts dict subclasses, which an exact type
		# comparison would wrongly report as "no vote received".
		if not isinstance(self.ballotBox, dict) or not self.ballotBox:
			log.emerg("No vote received ! There is a critical network failure.")
			self.panic(True) # noCheck=True because role is not consistent
			return

		# Select election winner
		self.currentElection=None
		self.lastTallyDate=int(time.time())
		self.master=self.ballotBox[max(self.ballotBox)]
		log.info("New master is %s." % (self.master))
		self._startSlave()

		if self.master == DNSCache.getInstance().name:
			log.info("I'm the new master.")
			self.role=MasterService.RL_ACTIVE
			self._startMaster()
		else:
			self.role=MasterService.RL_PASSIVE

		# Panic was postponed until the election is over
		if self.panicRequested:
			log.warn("Engaging panic mode requested during election stage.")
			self.panicRequested=False
			self.panic()
Example #3
0
    def countVotes(self):
        """
        Tally the votes of the current election and take the resulting role.

        Only a warning is logged when the node is not in RL_VOTING. If the
        ballot box is missing or empty, panic mode is engaged instead of
        electing anybody. Otherwise the entry stored under the highest ballot
        key designates the new master, and this node becomes RL_ACTIVE or
        RL_PASSIVE accordingly. A panic requested while the election was
        running is engaged once the election is over.
        """
        if self.role != MasterService.RL_VOTING:
            log.warn("Tally triggered but it's not election time !")
            return

        # isinstance() also accepts dict subclasses, which an exact type
        # comparison would wrongly report as "no vote received".
        if not isinstance(self.ballotBox, dict) or not self.ballotBox:
            log.emerg(
                "No vote received ! There is a critical network failure.")
            self.panic(True)  # noCheck=True because role is not consistent
            return

        # Select election winner
        self.currentElection = None
        self.lastTallyDate = int(time.time())
        self.master = self.ballotBox[max(self.ballotBox)]
        log.info("New master is %s." % (self.master))
        self._startSlave()

        if self.master == DNSCache.getInstance().name:
            log.info("I'm the new master.")
            self.role = MasterService.RL_ACTIVE
            self._startMaster()
        else:
            self.role = MasterService.RL_PASSIVE

        # Panic was postponed until the election is over
        if self.panicRequested:
            log.warn("Engaging panic mode requested during election stage.")
            self.panicRequested = False
            self.panic()
Example #4
0
	def rand():
		"""
		Build an integer from a random draw and the last octet of the local IP.

		The value is a random number between 1 and 99 multiplied by 1000,
		plus the fourth octet of the address given by DNSCache.
		Warning: as only the last octet is used, the result is unique only
		if you are on a /24 (or above) network.
		"""

		octets=DNSCache.getInstance().ip.split(".")
		lastOctet=octets[3]
		draw=random.randint(1,99)
		return draw*1000+int(lastOctet)
Example #5
0
    def rand():
        """
        Build an integer from a random draw and the last octet of the local IP.

        The value is a random number between 1 and 99 multiplied by 1000,
        plus the fourth octet of the address given by DNSCache.
        Warning: as only the last octet is used, the result is unique only
        if you are on a /24 (or above) network.
        """

        octets = DNSCache.getInstance().ip.split(".")
        last_octet = octets[3]
        draw = random.randint(1, 99)
        return draw * 1000 + int(last_octet)
Example #6
0
	def startProtocol(self):
		"""
		Tune the socket, then look up the address to use.

		With no destination (self.dest is None) the socket is switched to
		broadcast mode and the broadcast address is requested from DNSCache;
		otherwise the destination name is looked up. Once the lookup
		succeeds, the result is stored in self._ip and self.d_onStart is
		fired with this protocol. Lookup failures are logged.
		"""
		sock=self.transport.socket

		# Set IP TOS field to Minimize-Delay
		sock.setsockopt(socket.IPPROTO_IP, socket.IP_TOS, 0x10)

		if self.dest is not None:
			lookup=DNSCache.getInstance().get_by_name(self.dest)
		else:
			# Enable broadcast
			sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, True)
			lookup=DNSCache.getInstance().get_bcast()

		def addressResolved(address):
			self._ip=address
			self.d_onStart.callback(self)

		lookup.addCallback(addressResolved)
		lookup.addErrback(log.err)
Example #7
0
    def startProtocol(self):
        """
        Tune the socket, then look up the address to use.

        With no destination (self.dest is None) the socket is switched to
        broadcast mode and the broadcast address is requested from DNSCache;
        otherwise the destination name is looked up. Once the lookup
        succeeds, the result is stored in self._ip and self.d_onStart is
        fired with this protocol. Lookup failures are logged.
        """
        sock = self.transport.socket

        # Set IP TOS field to Minimize-Delay
        sock.setsockopt(socket.IPPROTO_IP, socket.IP_TOS, 0x10)

        if self.dest is not None:
            lookup = DNSCache.getInstance().get_by_name(self.dest)
        else:
            # Enable broadcast
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, True)
            lookup = DNSCache.getInstance().get_bcast()

        def address_resolved(address):
            self._ip = address
            self.d_onStart.callback(self)

        lookup.addCallback(address_resolved)
        lookup.addErrback(log.err)
Example #8
0
	def startService(self):
		"""
		Start the service together with its network and disk heartbeats.

		A NetHeartbeat is created and started, then a LoopingCall invokes
		disk.write_ts with our node name at an interval of 1. The Deferred
		of that loop is returned; if the loop fails, the error is logged and
		the service stops itself.
		"""
		Service.startService(self)
		log.info("Starting slave heartbeats...")

		# Network side
		self._hb = NetHeartbeat(self.forgeSlaveHeartbeat, self._master.getActiveMaster())
		self._hb.start()

		# Disk side
		self._call = task.LoopingCall(self._master.disk.write_ts, DNSCache.getInstance().name)
		loopDone=self._call.start(1)

		def onDiskFailure(failure):
			log.err("Disk heartbeat failure: %s." % (failure.getErrorMessage()))
			# Stop slave heartbeat to tell master we have a problem
			self.stopService()

		loopDone.addErrback(onDiskFailure)
		return loopDone
Example #9
0
    def unregisterNode(self, name):
        """
        Remove the node called `name` from the cluster.

        This is allowed even in panic mode.

        Raises RPCRefusedError when this node is not the active master, and
        NodeRefusedError when `name` is not a known node or is the master
        itself.
        """
        is_master = self.role == MasterService.RL_ACTIVE
        if not is_master:
            log.warn("I'm not master. Cannot unregister %s." % name)
            raise RPCRefusedError("Not master")

        is_known = name in self.status
        if not is_known:
            log.warn("Unknown node %s try to quit the cluster." % name)
            raise NodeRefusedError("Unknown node " + name)

        # The master cannot remove itself through this entry point
        is_self = name == DNSCache.getInstance().name
        if is_self:
            log.warn("I'm the master. Cannot self unregister.")
            raise NodeRefusedError("Cannot unregister master")

        self._unregister(name)
Example #10
0
File: master.py Project: nagius/cxm
	def unregisterNode(self, name):
		"""
		Remove the node called `name` from the cluster.

		This is allowed even in panic mode.

		Raises RPCRefusedError when this node is not the active master, and
		NodeRefusedError when `name` is not a known node or is the master
		itself.
		"""
		isMaster = self.role == MasterService.RL_ACTIVE
		if not isMaster:
			log.warn("I'm not master. Cannot unregister %s." % name)
			raise RPCRefusedError("Not master")

		isKnown = name in self.status
		if not isKnown:
			log.warn("Unknown node %s try to quit the cluster." % name)
			raise NodeRefusedError("Unknown node "+name)

		# The master cannot remove itself through this entry point
		isSelf = name == DNSCache.getInstance().name
		if isSelf:
			log.warn("I'm the master. Cannot self unregister.")
			raise NodeRefusedError("Cannot unregister master")

		self._unregister(name)
Example #11
0
File: master.py Project: nagius/cxm
	def leaveCluster(self):
		"""
		Leave the cluster; what is done depends on the role held so far.

		- Active master: remove our own record, trigger an election if other
		  nodes remain, then stop the master side.
		- Passive node: send an "unregister" request to the master over a
		  PB connection, which is closed afterwards.
		- Any other role: nothing more to do.

		The role is switched to RL_LEAVING in every case.
		Returns a Deferred.
		"""
		# Stop slave hearbeat and watchdog
		self._stopSlave()

		formerRole=self.role
		self.role=MasterService.RL_LEAVING

		if formerRole == MasterService.RL_ACTIVE:
			# Self-delete our own record
			self._unregister(DNSCache.getInstance().name)

			if len(self.status) > 0:
				# New election only if there is at least one node
				done=self.triggerElection()
				done.addErrback(log.err)
			else:
				log.warn("I'm the last node, shutting down cluster.")
				done=defer.succeed(None)

			# Stop master hearbeat when vote request has been sent
			done.addBoth(lambda _: self._stopMaster())
			return done

		if formerRole == MasterService.RL_PASSIVE:
			factory = pb.PBClientFactory()
			connector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'], factory)

			def unregisterFromMaster(root):
				request = root.callRemote("unregister",DNSCache.getInstance().name)
				request.addErrback(log.err)
				# Close the connection whatever the outcome
				request.addBoth(lambda _: connector.disconnect())
				return request

			done = factory.getRootObject()
			done.addCallback(unregisterFromMaster)
			return done

		# RL_ALONE or RL_JOINING or RL_VOTING
		if formerRole == MasterService.RL_VOTING:
			# Others nodes will re-trigger an election if we win this one
			log.warn("Quitting cluster during election stage !")

		return defer.succeed(None)
Example #12
0
    def leaveCluster(self):
        """
        Leave the cluster; what is done depends on the role held so far.

        - Active master: remove our own record, trigger an election if other
          nodes remain, then stop the master side.
        - Passive node: send an "unregister" request to the master over a
          PB connection, which is closed afterwards.
        - Any other role: nothing more to do.

        The role is switched to RL_LEAVING in every case.
        Returns a Deferred.
        """
        # Stop slave hearbeat and watchdog
        self._stopSlave()

        former_role = self.role
        self.role = MasterService.RL_LEAVING

        if former_role == MasterService.RL_ACTIVE:
            # Self-delete our own record
            self._unregister(DNSCache.getInstance().name)

            if len(self.status) > 0:
                # New election only if there is at least one node
                done = self.triggerElection()
                done.addErrback(log.err)
            else:
                log.warn("I'm the last node, shutting down cluster.")
                done = defer.succeed(None)

            # Stop master hearbeat when vote request has been sent
            done.addBoth(lambda _: self._stopMaster())
            return done

        if former_role == MasterService.RL_PASSIVE:
            factory = pb.PBClientFactory()
            connector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'],
                                           factory)

            def unregister_from_master(root):
                request = root.callRemote("unregister",
                                          DNSCache.getInstance().name)
                request.addErrback(log.err)
                # Close the connection whatever the outcome
                request.addBoth(lambda _: connector.disconnect())
                return request

            done = factory.getRootObject()
            done.addCallback(unregister_from_master)
            return done

        # RL_ALONE or RL_JOINING or RL_VOTING
        if former_role == MasterService.RL_VOTING:
            # Others nodes will re-trigger an election if we win this one
            log.warn("Quitting cluster during election stage !")

        return defer.succeed(None)
Example #13
0
File: master.py Project: nagius/cxm
	def __init__(self):
		"""
		Initialise the node as RL_ALONE / ST_NORMAL with no known master.

		Also creates the local Node, the disk heartbeat, the heartbeat and
		RPC sub-services, the two failover watchdog loops and the election
		bookkeeping attributes.
		"""
		# Current role of this node
		self.role = MasterService.RL_ALONE
		# Current cluster error status
		self.state = MasterService.ST_NORMAL
		# Name of the active master
		self.master = None
		# Timestamp for master failover
		self.masterLastSeen = 0
		# Whole cluster status
		self.status = {}

		self.localNode = Node(DNSCache.getInstance().name)
		self.disk = DiskHeartbeat()
		self.s_slaveHb = SlaveHearbeatService(self)
		self.s_masterHb = MasterHeartbeatService(self)
		self.s_rpc = RPCService(self)

		# Watchdogs for failover
		self.l_slaveDog = task.LoopingCall(self.checkMasterHeartbeat)
		self.l_masterDog = task.LoopingCall(self.checkSlaveHeartbeats)

		# Election Stuff
		# All received votes
		self.ballotBox = None
		# Election name, none if no pending election
		self.currentElection = None
		# IDelayedCall used to trigger countVotes()
		self.f_tally = None
		# Timestamp for debugging elections
		self.lastTallyDate = 0
		# True if panic is requested during election
		self.panicRequested = False
Example #14
0
    def __init__(self):
        """
        Initialise the node as RL_ALONE / ST_NORMAL with no known master.

        Also creates the local Node, the disk heartbeat, the heartbeat and
        RPC sub-services, the two failover watchdog loops and the election
        bookkeeping attributes.
        """
        # Current role of this node
        self.role = MasterService.RL_ALONE
        # Current cluster error status
        self.state = MasterService.ST_NORMAL
        # Name of the active master
        self.master = None
        # Timestamp for master failover
        self.masterLastSeen = 0
        # Whole cluster status
        self.status = {}

        self.localNode = Node(DNSCache.getInstance().name)
        self.disk = DiskHeartbeat()
        self.s_slaveHb = SlaveHearbeatService(self)
        self.s_masterHb = MasterHeartbeatService(self)
        self.s_rpc = RPCService(self)

        # Watchdogs for failover
        self.l_slaveDog = task.LoopingCall(self.checkMasterHeartbeat)
        self.l_masterDog = task.LoopingCall(self.checkSlaveHeartbeats)

        # Election Stuff
        # All received votes
        self.ballotBox = None
        # Election name, none if no pending election
        self.currentElection = None
        # IDelayedCall used to trigger countVotes()
        self.f_tally = None
        # Timestamp for debugging elections
        self.lastTallyDate = 0
        # True if panic is requested during election
        self.panicRequested = False
Example #15
0
File: master.py Project: nagius/cxm
			raise RPCRefusedError("Panic mode engaged")

		if self.role != MasterService.RL_ACTIVE:
			log.warn("I'm not master. Cannot register %s." % (name))
			raise RPCRefusedError("Not master")

		if name not in core.cfg['ALLOWED_NODES']:
			log.warn("Node %s not allowed to join this cluster. Refusing." % (name))
			raise NodeRefusedError("Node not allowed to join this cluster.")

		if name in self.status:
			log.warn("Node %s is already joined ! Cannot re-join." % (name))
			raise NodeRefusedError("Node already in cluster")

		# Check if hostname is valid
		d=DNSCache.getInstance().add(name)
		d.addCallbacks(validHostname, invalidHostname)
		
		return d
			

	def _unregister(self, name):
		try:
			del self.status[name]
		except:
			pass

		try:
			self.disk.erase_slot(name)
		except DiskHeartbeatError, e:
			log.warn("Cannot erase slot: %s. You may have to reformat hearbeat disk." % (e))
Example #16
0
	def __init__(self, host=None):
		"""
		Bind this object to a node.

		With no `host`, the local name known to DNSCache is used; otherwise
		`host` is passed to DNSCache.get_by_ip() and the result is stored.
		"""
		dns=DNSCache.getInstance()
		if host is not None:
			self.node=dns.get_by_ip(host)
		else:
			self.node=dns.name
Example #17
0
 def __init__(self, host=None):
     """
     Bind this object to a node.

     With no `host`, the local name known to DNSCache is used; otherwise
     `host` is passed to DNSCache.get_by_ip() and the result is stored.
     """
     dns = DNSCache.getInstance()
     if host is not None:
         self.node = dns.get_by_ip(host)
     else:
         self.node = dns.name
Example #18
0
File: master.py Project: nagius/cxm
		def masterConnected(obj):
			"""
			Send a "register" request carrying our node name through `obj`.

			joinAccepted / joinRefused handle the answer, any remaining
			failure is logged, and rpcConnector is disconnected in every
			case. Returns the Deferred of the whole chain.
			"""
			def closeConnection(_):
				return rpcConnector.disconnect()

			registration = obj.callRemote("register",DNSCache.getInstance().name)
			registration.addCallbacks(joinAccepted,joinRefused)
			registration.addErrback(log.err)
			registration.addBoth(closeConnection)
			return registration
Example #19
0
        if self.role != MasterService.RL_ACTIVE:
            log.warn("I'm not master. Cannot register %s." % (name))
            raise RPCRefusedError("Not master")

        if name not in core.cfg['ALLOWED_NODES']:
            log.warn("Node %s not allowed to join this cluster. Refusing." %
                     (name))
            raise NodeRefusedError("Node not allowed to join this cluster.")

        if name in self.status:
            log.warn("Node %s is already joined ! Cannot re-join." % (name))
            raise NodeRefusedError("Node already in cluster")

        # Check if hostname is valid
        d = DNSCache.getInstance().add(name)
        d.addCallbacks(validHostname, invalidHostname)

        return d

    def _unregister(self, name):
        try:
            del self.status[name]
        except:
            pass

        try:
            self.disk.erase_slot(name)
        except DiskHeartbeatError, e:
            log.warn(
                "Cannot erase slot: %s. You may have to reformat hearbeat disk."
Example #20
0
File: master.py Project: nagius/cxm
		def masterConnected(obj):
			"""
			Send an "unregister" request carrying our node name through `obj`.

			A failure of the request is logged, and rpcConnector is
			disconnected in every case. Returns the Deferred of the chain.
			"""
			def closeConnection(_):
				return rpcConnector.disconnect()

			request = obj.callRemote("unregister",DNSCache.getInstance().name)
			request.addErrback(log.err)
			request.addBoth(closeConnection)
			return request
Example #21
0
 def masterConnected(obj):
     """
     Send a "register" request carrying our node name through `obj`.

     joinAccepted / joinRefused handle the answer, any remaining failure
     is logged, and rpcConnector is disconnected in every case.
     Returns the Deferred of the whole chain.
     """
     def close_connection(_):
         return rpcConnector.disconnect()

     registration = obj.callRemote("register", DNSCache.getInstance().name)
     registration.addCallbacks(joinAccepted, joinRefused)
     registration.addErrback(log.err)
     registration.addBoth(close_connection)
     return registration
Example #22
0
    def joinCluster(self):
        """
        Join the cluster, creating it if no master is known.

        When self.master is None, this node becomes the active master: it
        must be listed in ALLOWED_NODES and the heartbeat disk must not be
        in use, otherwise startup fails. When a master is known, the role is
        set to RL_JOINING and a "register" request is sent to that master
        over a PB connection; the role becomes RL_PASSIVE once accepted.

        Exceptions raised synchronously during startup are logged and the
        service is stopped.
        """
        def startHeartbeats():
            # Started whatever the role of this node
            self._startSlave()
            self.s_rpc.startService()

            # Only the active master starts the master side
            if self.role == MasterService.RL_ACTIVE:
                self._startMaster()

        def joinRefused(reason):
            # Only refusals are handled here: trap() re-raises any other
            # failure, which then reaches the next errback of the chain.
            reason.trap(NodeRefusedError, RPCRefusedError)
            log.err("Join to cluster %s failed: Master %s has refused me: %s" %
                    (core.cfg['CLUSTER_NAME'], self.master,
                     reason.getErrorMessage()))
            self.stopService()

        def joinAccepted(result):
            # The result of the remote call is not used
            self.role = MasterService.RL_PASSIVE
            log.info("Join successfull, I'm now part of cluster %s." %
                     (core.cfg['CLUSTER_NAME']))
            startHeartbeats()

        def masterConnected(obj):
            # obj is the root object handed over by factory.getRootObject()
            d = obj.callRemote("register", DNSCache.getInstance().name)
            d.addCallbacks(joinAccepted, joinRefused)
            d.addErrback(log.err)
            # rpcConnector is assigned in the "Passive master" branch below,
            # before this callback is attached. Disconnect in every case.
            d.addBoth(lambda _: rpcConnector.disconnect())
            return d

        try:
            if self.master is None:
                # New active master
                if DNSCache.getInstance(
                ).name not in core.cfg['ALLOWED_NODES']:
                    log.warn(
                        "I'm not allowed to create a new cluster. Exiting.")
                    raise Exception("Cluster creation not allowed")

                if DiskHeartbeat.is_in_use():
                    log.err("Heartbeat disk is in use but we are alone !")
                    raise Exception("Heartbeat disk already in use")

                log.info("No master found. I'm now the new master of %s." %
                         (core.cfg['CLUSTER_NAME']))
                self.role = MasterService.RL_ACTIVE
                self.master = DNSCache.getInstance().name
                # Initial status record of the master itself
                self.status[self.master] = {
                    'timestamp': 0,
                    'offset': 0,
                    'vms': []
                }
                self.disk.make_slot(DNSCache.getInstance().name)
                startHeartbeats()

            else:
                # Passive master
                self.role = MasterService.RL_JOINING
                log.info("Trying to join cluster %s..." %
                         (core.cfg['CLUSTER_NAME']))

                factory = pb.PBClientFactory()
                rpcConnector = reactor.connectTCP(self.master,
                                                  core.cfg['TCP_PORT'],
                                                  factory)
                d = factory.getRootObject()
                d.addCallback(masterConnected)
                # NOTE(review): a failure to reach the master is only logged
                # here and the role is left at RL_JOINING - confirm intended.
                d.addErrback(log.err)
        except Exception, e:
            # Covers the two explicit raises above as well as any other
            # synchronous error of the startup sequence.
            log.err("Startup failed: %s. Shutting down." % (e))
            self.stopService()
Example #23
0
 def masterConnected(obj):
     """
     Send an "unregister" request carrying our node name through `obj`.

     A failure of the request is logged, and rpcConnector is disconnected
     in every case. Returns the Deferred of the chain.
     """
     def close_connection(_):
         return rpcConnector.disconnect()

     request = obj.callRemote("unregister", DNSCache.getInstance().name)
     request.addErrback(log.err)
     request.addBoth(close_connection)
     return request