def joinCluster(self):
    """Create the cluster as active master, or ask the known master to admit us.

    When self.master is None this node checks that it is listed in
    core.cfg['ALLOWED_NODES'] and that the heartbeat disk is not in use,
    then takes the RL_ACTIVE role. Otherwise it switches to RL_JOINING and
    calls the remote "register" method on self.master over a PB connection.

    Any exception raised synchronously inside the try block is logged and
    followed by self.stopService().
    """

    def startHeartbeats():
        # Slave side and RPC service are always started; the master side is
        # started only when this node holds the active role.
        self._startSlave()
        self.s_rpc.startService()
        if self.role == MasterService.RL_ACTIVE:
            self._startMaster()

    def joinRefused(reason):
        # trap() re-raises any failure that is not one of these two types,
        # so other errors fall through to the next errback in the chain.
        reason.trap(NodeRefusedError, RPCRefusedError)
        log.err("Join to cluster %s failed: Master %s has refused me: %s" %
            (core.cfg['CLUSTER_NAME'], self.master, reason.getErrorMessage()))
        self.stopService()

    def joinAccepted(result):
        # The result of the remote call is not used.
        self.role=MasterService.RL_PASSIVE
        log.info("Join successfull, I'm now part of cluster %s." % (core.cfg['CLUSTER_NAME']))
        startHeartbeats()

    def masterConnected(obj):
        # obj is the remote root object delivered by factory.getRootObject().
        d = obj.callRemote("register",DNSCache.getInstance().name)
        d.addCallbacks(joinAccepted,joinRefused)
        d.addErrback(log.err)
        # rpcConnector is bound later in the enclosing scope (else branch);
        # the connection is closed whatever the outcome of the call.
        d.addBoth(lambda _: rpcConnector.disconnect())
        return d

    try:
        if self.master is None:
            # New active master
            if DNSCache.getInstance().name not in core.cfg['ALLOWED_NODES']:
                log.warn("I'm not allowed to create a new cluster. Exiting.")
                raise Exception("Cluster creation not allowed")

            if DiskHeartbeat.is_in_use():
                log.err("Heartbeat disk is in use but we are alone !")
                raise Exception("Heartbeat disk already in use")

            log.info("No master found. I'm now the new master of %s." % (core.cfg['CLUSTER_NAME']))
            self.role=MasterService.RL_ACTIVE
            self.master=DNSCache.getInstance().name
            # Seed our own entry in the cluster status table.
            self.status[self.master]={'timestamp': 0, 'offset': 0, 'vms': []}
            self.disk.make_slot(DNSCache.getInstance().name)
            startHeartbeats()
        else:
            # Passive master
            self.role=MasterService.RL_JOINING
            log.info("Trying to join cluster %s..." % (core.cfg['CLUSTER_NAME']))

            factory = pb.PBClientFactory()
            rpcConnector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'], factory)
            d = factory.getRootObject()
            d.addCallback(masterConnected)
            # NOTE(review): a failure reaching this errback is only logged;
            # the role stays RL_JOINING and stopService() is not called here.
            d.addErrback(log.err)
    except Exception, e:
        log.err("Startup failed: %s. Shutting down." % (e))
        self.stopService()
def countVotes(self):
    """Tally the ballot box and take the role decided by the election.

    Does nothing (besides a warning) unless the current role is RL_VOTING.
    When no usable vote was collected, panic mode is engaged with
    noCheck=True. Otherwise the node recorded under the highest ballot key
    becomes self.master, the slave side is started, and this node becomes
    RL_ACTIVE (starting the master side) or RL_PASSIVE. A panic requested
    while the election was running is honoured at the end.
    """
    if self.role != MasterService.RL_VOTING:
        log.warn("Tally triggered but it's not election time !")
        return

    # isinstance() also accepts dict subclasses; a missing (None) or empty
    # ballot box both mean that no vote was received.
    if not isinstance(self.ballotBox, dict) or not self.ballotBox:
        log.emerg("No vote received ! There is a critical network failure.")
        self.panic(True)  # noCheck=True because role is not consistent
        return

    # Select election winner: the entry stored under the highest key.
    self.currentElection = None
    self.lastTallyDate = int(time.time())
    self.master = self.ballotBox[max(self.ballotBox)]
    log.info("New master is %s." % (self.master))
    self._startSlave()

    if self.master == DNSCache.getInstance().name:
        log.info("I'm the new master.")
        self.role = MasterService.RL_ACTIVE
        self._startMaster()
    else:
        self.role = MasterService.RL_PASSIVE

    if self.panicRequested:
        log.warn("Engaging panic mode requested during election stage.")
        # Clear the flag before panicking so the request is served only once.
        self.panicRequested = False
        self.panic()
def countVotes(self):
    """Close the election: pick the winner and switch to the resulting role.

    A warning is logged and nothing else happens when the role is not
    RL_VOTING. With no usable ballot the node panics (noCheck=True).
    Otherwise self.master is set to the value stored under the highest
    ballot key, the slave side is started, and the role becomes RL_ACTIVE
    (master side started too) or RL_PASSIVE. A pending panic request is
    then executed.
    """
    if self.role != MasterService.RL_VOTING:
        log.warn("Tally triggered but it's not election time !")
        return

    # Use isinstance() rather than an exact type comparison so that dict
    # subclasses are accepted; an empty mapping is falsy.
    has_votes = isinstance(self.ballotBox, dict) and bool(self.ballotBox)
    if not has_votes:
        log.emerg(
            "No vote received ! There is a critical network failure.")
        self.panic(True)  # noCheck=True because role is not consistent
        return

    # Select election winner
    self.currentElection = None
    self.lastTallyDate = int(time.time())
    # Iterating a dict yields its keys, so max() needs no .keys() call.
    self.master = self.ballotBox[max(self.ballotBox)]
    log.info("New master is %s." % (self.master))
    self._startSlave()

    if self.master == DNSCache.getInstance().name:
        log.info("I'm the new master.")
        self.role = MasterService.RL_ACTIVE
        self._startMaster()
    else:
        self.role = MasterService.RL_PASSIVE

    if self.panicRequested:
        log.warn("Engaging panic mode requested during election stage.")
        self.panicRequested = False
        self.panic()
def rand():
    """Return a random integer tagged with the local host's last IP field.

    The value is random.randint(1, 99) * 1000 plus the fourth dot-separated
    field of DNSCache.getInstance().ip.

    Warning: as the original author notes, the result is only meant to be
    unique between hosts that differ in that last field (a /24 network).
    """
    last_field = DNSCache.getInstance().ip.split(".")[3]
    thousands = random.randint(1,99)
    return thousands*1000+int(last_field)
def rand():
    """Build a random integer whose low part is the host's last IP field.

    Computed as random.randint(1, 99) * 1000 + int(fourth field of the
    address reported by DNSCache.getInstance().ip).

    Warning: uniqueness between hosts relies on that last field differing,
    i.e. on all nodes sharing a /24 network (per the original note).
    """
    fields = DNSCache.getInstance().ip.split(".")
    host_part = fields[3]
    random_part = random.randint(1, 99) * 1000
    return random_part + int(host_part)
def startProtocol(self):
    """Configure the socket and resolve the address to send to.

    Sets the IP TOS field, then looks up either the broadcast address
    (when self.dest is None, after enabling SO_BROADCAST) or the address
    of self.dest. Once resolved, the address is stored in self._ip and
    self.d_onStart is fired with this object. Lookup errors go to log.err.
    """
    sock = self.transport.socket

    # Set IP TOS field to Minimize-Delay
    sock.setsockopt(socket.IPPROTO_IP, socket.IP_TOS, 0x10)

    if self.dest is not None:
        lookup = DNSCache.getInstance().get_by_name(self.dest)
    else:
        # Enable broadcast
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, True)
        lookup = DNSCache.getInstance().get_bcast()

    def _store_ip(address):
        self._ip = address
        self.d_onStart.callback(self)

    lookup.addCallback(_store_ip)
    lookup.addErrback(log.err)
def startProtocol(self):
    """Prepare the transport socket and resolve the destination address.

    The socket gets its IP TOS field set; with no explicit destination
    (self.dest is None) broadcasting is enabled and the broadcast address
    is requested, otherwise self.dest is resolved by name. The resolved
    value is saved as self._ip before self.d_onStart is called back with
    self. Failures of the lookup are passed to log.err.
    """
    def _on_resolved(resolved):
        self._ip = resolved
        self.d_onStart.callback(self)

    raw_socket = self.transport.socket

    # Set IP TOS field to Minimize-Delay
    raw_socket.setsockopt(socket.IPPROTO_IP, socket.IP_TOS, 0x10)

    broadcast_mode = self.dest is None
    if broadcast_mode:
        # Enable broadcast
        raw_socket.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, True)
        pending = DNSCache.getInstance().get_bcast()
    else:
        pending = DNSCache.getInstance().get_by_name(self.dest)

    pending.addCallback(_on_resolved)
    pending.addErrback(log.err)
def startService(self):
    """Start the network heartbeat and the periodic disk timestamp write.

    A NetHeartbeat built from self.forgeSlaveHeartbeat and the active
    master is started, then self._master.disk.write_ts is scheduled every
    second with the local node name. If that loop fails, the error is
    logged and this service is stopped.

    Returns the Deferred produced by the looping call's start().
    """
    Service.startService(self)
    log.info("Starting slave heartbeats...")

    active_master = self._master.getActiveMaster()
    self._hb = NetHeartbeat(self.forgeSlaveHeartbeat, active_master)
    self._hb.start()

    def _on_disk_failure(failure):
        log.err("Disk heartbeat failure: %s." % (failure.getErrorMessage()))
        # Stop slave heartbeat to tell master we have a problem
        self.stopService()

    self._call = task.LoopingCall(self._master.disk.write_ts, DNSCache.getInstance().name)
    loop_finished = self._call.start(1)
    loop_finished.addErrback(_on_disk_failure)
    return loop_finished
def unregisterNode(self, name):
    """Remove node `name` from the cluster on its request.

    Raises:
        RPCRefusedError: this node does not hold the RL_ACTIVE role.
        NodeRefusedError: `name` is not in self.status, or `name` is this
            node's own name.
    """
    # Can unregister node even if in panic mode
    acting_as_master = self.role == MasterService.RL_ACTIVE
    if not acting_as_master:
        log.warn("I'm not master. Cannot unregister %s." % (name))
        raise RPCRefusedError("Not master")

    is_member = name in self.status
    if not is_member:
        log.warn("Unknown node %s try to quit the cluster." % (name))
        raise NodeRefusedError("Unknown node " + name)

    own_name = DNSCache.getInstance().name
    if name == own_name:
        log.warn("I'm the master. Cannot self unregister.")
        raise NodeRefusedError("Cannot unregister master")

    self._unregister(name)
def unregisterNode(self, name):
    """Handle a request from node `name` to leave the cluster.

    The request is honoured only when this node is the active master,
    `name` is a known member, and `name` is not the master itself.

    Raises:
        RPCRefusedError: the local role is not RL_ACTIVE.
        NodeRefusedError: `name` is unknown or designates this node.
    """
    # Can unregister node even if in panic mode
    if self.role == MasterService.RL_ACTIVE:
        if name in self.status:
            if name == DNSCache.getInstance().name:
                log.warn("I'm the master. Cannot self unregister.")
                raise NodeRefusedError("Cannot unregister master")
            self._unregister(name)
        else:
            log.warn("Unknown node %s try to quit the cluster." % (name))
            raise NodeRefusedError("Unknown node "+name)
    else:
        log.warn("I'm not master. Cannot unregister %s." % (name))
        raise RPCRefusedError("Not master")
def leaveCluster(self):
    """Leave the cluster in the way required by the role held on entry.

    The slave side is stopped and the role set to RL_LEAVING first. Then:
      * RL_ACTIVE: our own record is removed; an election is triggered if
        other nodes remain, and the master side is stopped afterwards.
      * RL_PASSIVE: the remote "unregister" method is called on self.master.
      * any other role: nothing more is done.

    Returns:
        The Deferred built by the branch that was taken.
    """

    def masterConnected(obj):
        # obj is the remote root object delivered by getRootObject().
        d = obj.callRemote("unregister",DNSCache.getInstance().name)
        d.addErrback(log.err)
        # rpcConnector is bound later in the enclosing scope (passive branch).
        d.addBoth(lambda _: rpcConnector.disconnect())
        return d

    # Stop slave heartbeat and watchdog
    self._stopSlave()

    # Remember the role we had: the branches below depend on it, while
    # self.role is overwritten right away.
    previousRole=self.role
    self.role=MasterService.RL_LEAVING

    if previousRole == MasterService.RL_ACTIVE:
        # Self-delete our own record
        self._unregister(DNSCache.getInstance().name)

        if len(self.status) <= 0:
            log.warn("I'm the last node, shutting down cluster.")
            d=defer.succeed(None)
        else:
            # New election only if there is at least one node
            d=self.triggerElection()
            d.addErrback(log.err)

        # Stop master heartbeat when vote request has been sent
        # NOTE(review): nesting reconstructed from a collapsed source line;
        # confirm this applies to both sub-branches above.
        d.addBoth(lambda _: self._stopMaster())

    elif previousRole == MasterService.RL_PASSIVE:
        rpcFactory = pb.PBClientFactory()
        rpcConnector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'], rpcFactory)
        d = rpcFactory.getRootObject()
        d.addCallback(masterConnected)

    else:  # RL_ALONE or RL_JOINING or RL_VOTING
        if previousRole == MasterService.RL_VOTING:
            # Others nodes will re-trigger an election if we win this one
            log.warn("Quitting cluster during election stage !")
        d=defer.succeed(None)

    return d
def leaveCluster(self):
    """Quit the cluster, acting on the role this node had when called.

    After stopping the slave side and switching to RL_LEAVING:
      * a former active master deletes its own record, triggers an election
        when at least one node remains, then stops the master side;
      * a former passive node calls the remote "unregister" on self.master;
      * any other former role needs no further action.

    Returns:
        The Deferred created by the selected branch.
    """

    def masterConnected(obj):
        # Ask the master (remote root object) to drop our registration.
        d = obj.callRemote("unregister", DNSCache.getInstance().name)
        d.addErrback(log.err)
        # Close the connection whatever the outcome; rpcConnector comes
        # from the enclosing scope and is assigned in the passive branch.
        d.addBoth(lambda _: rpcConnector.disconnect())
        return d

    # Stop slave heartbeat and watchdog
    self._stopSlave()

    # Keep the former role for the dispatch below before overwriting it.
    previousRole = self.role
    self.role = MasterService.RL_LEAVING

    if previousRole == MasterService.RL_ACTIVE:
        # Self-delete our own record
        self._unregister(DNSCache.getInstance().name)

        if len(self.status) <= 0:
            log.warn("I'm the last node, shutting down cluster.")
            d = defer.succeed(None)
        else:
            # New election only if there is at least one node
            d = self.triggerElection()
            d.addErrback(log.err)

        # Stop master heartbeat when vote request has been sent
        # NOTE(review): nesting reconstructed from a collapsed source line;
        # confirm this is meant to run for both sub-branches above.
        d.addBoth(lambda _: self._stopMaster())

    elif previousRole == MasterService.RL_PASSIVE:
        rpcFactory = pb.PBClientFactory()
        rpcConnector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'],
                                          rpcFactory)
        d = rpcFactory.getRootObject()
        d.addCallback(masterConnected)

    else:  # RL_ALONE or RL_JOINING or RL_VOTING
        if previousRole == MasterService.RL_VOTING:
            # Others nodes will re-trigger an election if we win this one
            log.warn("Quitting cluster during election stage !")
        d = defer.succeed(None)

    return d
def __init__(self):
    """Initialise node state, heartbeat services, watchdogs and election data."""
    # --- Node and cluster state ---
    # Current role of this node
    self.role = MasterService.RL_ALONE
    # Current cluster error status
    self.state = MasterService.ST_NORMAL
    # Name of the active master
    self.master = None
    # Timestamp for master failover
    self.masterLastSeen = 0
    # Whole cluster status
    self.status = {}

    # --- Local node, heartbeat disk and sub-services ---
    self.localNode = Node(DNSCache.getInstance().name)
    self.disk = DiskHeartbeat()
    self.s_slaveHb = SlaveHearbeatService(self)
    self.s_masterHb = MasterHeartbeatService(self)
    self.s_rpc = RPCService(self)

    # --- Watchdogs for failover ---
    self.l_slaveDog = task.LoopingCall(self.checkMasterHeartbeat)
    self.l_masterDog = task.LoopingCall(self.checkSlaveHeartbeats)

    # --- Election stuff ---
    # All received votes
    self.ballotBox = None
    # Election name, None if no pending election
    self.currentElection = None
    # IDelayedCall used to trigger countVotes()
    self.f_tally = None
    # Timestamp for debugging elections
    self.lastTallyDate = 0
    # True if panic is requested during election
    self.panicRequested = False
raise RPCRefusedError("Panic mode engaged") if self.role != MasterService.RL_ACTIVE: log.warn("I'm not master. Cannot register %s." % (name)) raise RPCRefusedError("Not master") if name not in core.cfg['ALLOWED_NODES']: log.warn("Node %s not allowed to join this cluster. Refusing." % (name)) raise NodeRefusedError("Node not allowed to join this cluster.") if name in self.status: log.warn("Node %s is already joined ! Cannot re-join." % (name)) raise NodeRefusedError("Node already in cluster") # Check if hostname is valid d=DNSCache.getInstance().add(name) d.addCallbacks(validHostname, invalidHostname) return d def _unregister(self, name): try: del self.status[name] except: pass try: self.disk.erase_slot(name) except DiskHeartbeatError, e: log.warn("Cannot erase slot: %s. You may have to reformat hearbeat disk." % (e))
def __init__(self, host=None):
    """Record which node this object refers to in self.node.

    With no `host`, the local name from DNSCache is used; otherwise the
    value returned by DNSCache's get_by_ip(host) is stored.
    """
    cache = DNSCache.getInstance()
    if host is not None:
        self.node=cache.get_by_ip(host)
        return
    self.node=cache.name
def __init__(self, host=None):
    """Set self.node from `host`, defaulting to the local node name.

    `host` is passed to DNSCache's get_by_ip(); when it is None the
    cache's own `name` attribute is used instead.
    """
    dns = DNSCache.getInstance()
    self.node = dns.name if host is None else dns.get_by_ip(host)
def masterConnected(obj):
    """Call the master's remote "register" method with our node name.

    `obj` must provide callRemote(). The outcome is routed to joinAccepted
    or joinRefused, leftover errors go to log.err, and rpcConnector is
    disconnected in every case. Returns the Deferred of the remote call.
    """
    def _hang_up(_unused):
        return rpcConnector.disconnect()

    registration = obj.callRemote("register",DNSCache.getInstance().name)
    registration.addCallbacks(joinAccepted,joinRefused)
    registration.addErrback(log.err)
    registration.addBoth(_hang_up)
    return registration
if self.role != MasterService.RL_ACTIVE: log.warn("I'm not master. Cannot register %s." % (name)) raise RPCRefusedError("Not master") if name not in core.cfg['ALLOWED_NODES']: log.warn("Node %s not allowed to join this cluster. Refusing." % (name)) raise NodeRefusedError("Node not allowed to join this cluster.") if name in self.status: log.warn("Node %s is already joined ! Cannot re-join." % (name)) raise NodeRefusedError("Node already in cluster") # Check if hostname is valid d = DNSCache.getInstance().add(name) d.addCallbacks(validHostname, invalidHostname) return d def _unregister(self, name): try: del self.status[name] except: pass try: self.disk.erase_slot(name) except DiskHeartbeatError, e: log.warn( "Cannot erase slot: %s. You may have to reformat hearbeat disk."
def masterConnected(obj):
    """Call the master's remote "unregister" method with our node name.

    `obj` must provide callRemote(). Errors are passed to log.err and
    rpcConnector is disconnected whatever the outcome. Returns the
    Deferred of the remote call.
    """
    def _hang_up(_unused):
        return rpcConnector.disconnect()

    removal = obj.callRemote("unregister",DNSCache.getInstance().name)
    removal.addErrback(log.err)
    removal.addBoth(_hang_up)
    return removal
def masterConnected(obj):
    """Ask the remote root object `obj` to register this node.

    The reply is handled by joinAccepted / joinRefused; any failure still
    pending afterwards is logged with log.err. The RPC connection
    (rpcConnector) is closed at the end of the chain. Returns the Deferred
    created by callRemote().
    """
    def _close_connection(_ignored):
        return rpcConnector.disconnect()

    reply = obj.callRemote("register", DNSCache.getInstance().name)
    reply.addCallbacks(joinAccepted, joinRefused)
    reply.addErrback(log.err)
    reply.addBoth(_close_connection)
    return reply
def joinCluster(self):
    """Found a new cluster as active master, or register with the known one.

    With self.master unset, the node must be listed in
    core.cfg['ALLOWED_NODES'] and the heartbeat disk must be unused; it
    then becomes RL_ACTIVE. With self.master set, the node enters
    RL_JOINING and calls the remote "register" method on that master.

    Exceptions raised synchronously in the try block are logged and the
    service is stopped.
    """

    def startHeartbeats():
        # The master side is started only for the active role; the slave
        # side and the RPC service are started in every case.
        self._startSlave()
        self.s_rpc.startService()
        if self.role == MasterService.RL_ACTIVE:
            self._startMaster()

    def joinRefused(reason):
        # Failures of any other type are re-raised by trap() and reach the
        # next errback of the chain.
        reason.trap(NodeRefusedError, RPCRefusedError)
        log.err("Join to cluster %s failed: Master %s has refused me: %s" %
                (core.cfg['CLUSTER_NAME'], self.master,
                 reason.getErrorMessage()))
        self.stopService()

    def joinAccepted(result):
        # The value returned by the remote call is ignored.
        self.role = MasterService.RL_PASSIVE
        log.info("Join successfull, I'm now part of cluster %s." %
                 (core.cfg['CLUSTER_NAME']))
        startHeartbeats()

    def masterConnected(obj):
        # obj is the remote root object delivered by factory.getRootObject().
        d = obj.callRemote("register", DNSCache.getInstance().name)
        d.addCallbacks(joinAccepted, joinRefused)
        d.addErrback(log.err)
        # Always drop the connection; rpcConnector is assigned in the
        # enclosing scope before this callback can run.
        d.addBoth(lambda _: rpcConnector.disconnect())
        return d

    try:
        if self.master is None:
            # New active master
            if DNSCache.getInstance().name not in core.cfg['ALLOWED_NODES']:
                log.warn(
                    "I'm not allowed to create a new cluster. Exiting.")
                raise Exception("Cluster creation not allowed")

            if DiskHeartbeat.is_in_use():
                log.err("Heartbeat disk is in use but we are alone !")
                raise Exception("Heartbeat disk already in use")

            log.info("No master found. I'm now the new master of %s." %
                     (core.cfg['CLUSTER_NAME']))
            self.role = MasterService.RL_ACTIVE
            self.master = DNSCache.getInstance().name
            # Initial status entry for ourselves.
            self.status[self.master] = {
                'timestamp': 0,
                'offset': 0,
                'vms': []
            }
            self.disk.make_slot(DNSCache.getInstance().name)
            startHeartbeats()
        else:
            # Passive master
            self.role = MasterService.RL_JOINING
            log.info("Trying to join cluster %s..." %
                     (core.cfg['CLUSTER_NAME']))

            factory = pb.PBClientFactory()
            rpcConnector = reactor.connectTCP(self.master,
                                              core.cfg['TCP_PORT'], factory)
            d = factory.getRootObject()
            d.addCallback(masterConnected)
            # NOTE(review): failures arriving here are only logged; the
            # role remains RL_JOINING and the service is not stopped.
            d.addErrback(log.err)
    except Exception, e:
        log.err("Startup failed: %s. Shutting down." % (e))
        self.stopService()
def masterConnected(obj):
    """Ask the remote root object `obj` to unregister this node.

    Failures of the remote call are logged with log.err, and the RPC
    connection (rpcConnector) is closed afterwards in all cases. Returns
    the Deferred created by callRemote().
    """
    def _close_connection(_ignored):
        return rpcConnector.disconnect()

    reply = obj.callRemote("unregister", DNSCache.getInstance().name)
    reply.addErrback(log.err)
    reply.addBoth(_close_connection)
    return reply