Beispiel #1
0
Datei: vm.py Projekt: nagius/cxm
	def __init__(self,vmname, id=-1, ram=None, vcpu=None):
		"""Instantiate a VM object, with the optional ram and vcpu metrics.

		The VM configuration file is searched in core.cfg['VMCONF_DIR'], first
		as "<vmname>" then as "<vmname>.cfg", and executed as Python code to
		populate self.config. If neither file can be read, a warning is logged
		(unless core.cfg['QUIET'] is set) and self.config stays empty.

		Each entry of the 'disk' configuration list is matched against
		self.diskre: group 1 becomes the key and group 2 the value stored in
		self.devices. Entries that cannot be parsed are reported and skipped.
		"""
		self.name=vmname
		self.id=id
		self.__ram=ram
		self.__vcpu=vcpu
		self.config=dict()
		self.metrics=None
		self.devices=dict()

		# NOTE: the configuration file is executed as Python code, so it must
		# come from a trusted location.
		try:
			try:
				execfile("%s/%s" % (core.cfg['VMCONF_DIR'],vmname) ,dict(),self.config)
			except IOError:
				# Fall back to the same name with a ".cfg" extension
				execfile("%s/%s.cfg" % (core.cfg['VMCONF_DIR'],vmname) ,dict(),self.config)
		except IOError:
			if not core.cfg['QUIET']:
				log.warn("Missing configuration file: %s" % (vmname))
			
		log.debug("[VM]", vmname, self.config)

		# Get devices from config file (no 'disk' entry means no device).
		# Using get() keeps unrelated KeyErrors from being silently swallowed.
		for disk in self.config.get('disk', []):
			try:
				# Search once and reuse the match for both groups
				match=self.diskre.search(disk)
				self.devices[match.group(1)]=match.group(2)
			except (AttributeError, IndexError, TypeError):
				# No match (None), missing group or non-string entry
				if not core.cfg['QUIET']:
					log.warn("Bad disk input for %s: %s" % (self.name, disk))
Beispiel #2
0
    def checkSlaveHeartbeats(self):
        # Checks slaves timestamps only if we are active master
        if self.role != MasterService.RL_ACTIVE:
            return

        # No failover in panic mode
        if self.state == MasterService.ST_PANIC:
            return

        # No more failover if a recovery is running
        if self.state == MasterService.ST_RECOVERY:
            return

        # No failover if we are alone
        if len(self.status) <= 1:
            return

        # Check net heartbeat
        netFailed = Set()
        for name, values in self.status.items():
            if values['timestamp'] == 0:
                # Do nothing if first heartbeat has not been received yet
                continue

            if values['timestamp'] + MasterService.TM_SLAVE <= int(
                    time.time()):
                log.warn("Net heartbeat lost for %s." % (name))
                netFailed.add(name)

        # Get diskhearbeat timestamps
        try:
            tsDisk = self.disk.get_all_ts()
        except Exception, e:
            log.err("Diskheartbeat read failed: %s." % (e))
            raise
Beispiel #3
0
    def countVotes(self):
        """Tally the received votes and switch to the resulting role.

        Only a warning is logged when no election is running. With an empty
        (or invalid) ballot box the node enters panic mode. Otherwise the vote
        stored under the highest key of the ballot box designates the new
        master: the slave services are started and, if this node is the
        winner, the master services too. A panic requested during the election
        is engaged once the new role is set.
        """
        if self.role != MasterService.RL_VOTING:
            log.warn("Tally triggered but it's not election time !")
            return

        # isinstance() also accepts dict subclasses, unlike a type() comparison
        if not isinstance(self.ballotBox, dict) or not self.ballotBox:
            log.emerg(
                "No vote received ! There is a critical network failure.")
            self.panic(True)  # noCheck=True because role is not consistent
            return

        # Select election winner
        self.currentElection = None
        self.lastTallyDate = int(time.time())
        # Iterating a dict yields its keys: no need to build the key list
        self.master = self.ballotBox[max(self.ballotBox)]
        log.info("New master is %s." % (self.master))
        self._startSlave()

        if self.master == DNSCache.getInstance().name:
            log.info("I'm the new master.")
            self.role = MasterService.RL_ACTIVE
            self._startMaster()
        else:
            self.role = MasterService.RL_PASSIVE

        # A panic asked during the election was postponed until now
        if self.panicRequested:
            log.warn("Engaging panic mode requested during election stage.")
            self.panicRequested = False
            self.panic()
Beispiel #4
0
	def countVotes(self):
		"""Tally the received votes and switch to the resulting role.

		Only a warning is logged when no election is running. With an empty
		(or invalid) ballot box the node enters panic mode. Otherwise the vote
		stored under the highest key of the ballot box designates the new
		master: the slave services are started and, if this node is the
		winner, the master services too. A panic requested during the election
		is engaged once the new role is set.
		"""
		if self.role != MasterService.RL_VOTING:
			log.warn("Tally triggered but it's not election time !")
			return

		# isinstance() also accepts dict subclasses, unlike a type() comparison
		if not isinstance(self.ballotBox, dict) or not self.ballotBox:
			log.emerg("No vote received ! There is a critical network failure.")
			self.panic(True) # noCheck=True because role is not consistent
			return

		# Select election winner
		self.currentElection=None
		self.lastTallyDate=int(time.time())
		# Iterating a dict yields its keys: no need to build the key list
		self.master=self.ballotBox[max(self.ballotBox)]
		log.info("New master is %s." % (self.master))
		self._startSlave()

		if self.master == DNSCache.getInstance().name:
			log.info("I'm the new master.")
			self.role=MasterService.RL_ACTIVE
			self._startMaster()
		else:
			self.role=MasterService.RL_PASSIVE

		# A panic asked during the election was postponed until now
		if self.panicRequested:
			log.warn("Engaging panic mode requested during election stage.")
			self.panicRequested=False
			self.panic()
Beispiel #5
0
	def checkSlaveHeartbeats(self):
		# Checks slaves timestamps only if we are active master
		if self.role != MasterService.RL_ACTIVE:
			return

		# No failover in panic mode
		if self.state == MasterService.ST_PANIC:
			return

		# No more failover if a recovery is running
		if self.state == MasterService.ST_RECOVERY:
			return

		# No failover if we are alone
		if len(self.status) <= 1:
			return

		# Check net heartbeat
		netFailed=Set()
		for name, values in self.status.items():
			if values['timestamp'] == 0:
				# Do nothing if first heartbeat has not been received yet
				continue

			if values['timestamp']+MasterService.TM_SLAVE <= int(time.time()):
				log.warn("Net heartbeat lost for %s." % (name))
				netFailed.add(name)

		# Get diskhearbeat timestamps
		try:
			tsDisk=self.disk.get_all_ts()
		except Exception, e:
			log.err("Diskheartbeat read failed: %s." % (e))
			raise
Beispiel #6
0
    def _startMaster(self):
        """Start the master heartbeat and schedule the slaves watchdog.

        If the state inherited from the previous master is "recovery", it is
        forced back to normal so that failover can run again. The watchdog
        loop is started 2 seconds later, unless it is already running; a
        failure of that loop engages panic mode.
        """
        def onWatchdogFailure(failure):
            log.emerg("Slave heartbeat checks failed: %s." %
                      (failure.getErrorMessage()))
            self.panic()

        def launchWatchdog():
            if self.l_masterDog.running:
                return
            loop = self.l_masterDog.start(MasterService.TM_WATCHDOG)
            loop.addErrback(onWatchdogFailure)
            # Last resort: log whatever the previous errback let through
            loop.addErrback(log.err)

        # Start master heartbeat
        self.s_masterHb.startService()

        # Check state of previous master
        recovering = self.state == MasterService.ST_RECOVERY
        if recovering:
            log.warn(
                "Previous master was recovering something: re-enabling failover."
            )
            # Force normal mode to re-run failover
            self.state = MasterService.ST_NORMAL

        # Start master's watchdog for slaves failover
        reactor.callLater(2, launchWatchdog)
Beispiel #7
0
 def __init__(self):
     """Load and compile every template of the 'templates' directory.

     The directory is looked up two levels above the directory containing
     this file. Each compiled template is stored under its key both in
     self._compiled and in self._partials. A template that fails to compile
     is logged and the error is re-raised.
     """
     here = os.path.abspath(os.path.dirname(__file__))
     root = os.path.dirname(os.path.dirname(here))
     name = os.path.join(root, 'templates')
     
     # presumably fills self.templates with (path, source) pairs - see _load_templates
     self._load_templates(name)
     self._renderer = pybars.Compiler()
     self._compiled = {}
     self._partials = {}
     
     for k, v in self.templates.iteritems():
         path, source = v
         source = unicode(source)
         
         try:
             compiled = self._renderer.compile(source)
         except Exception as e:
             logs.warn("[%s] template compiler error (%s): %s" % (self, path, e))
             raise
         
         # Every template is also usable as a partial
         self._compiled[k] = compiled
         self._partials[k] = compiled
Beispiel #8
0
	def joinCluster(self):
		"""Join the cluster, or create it when no master is known.

		When self.master is None, this node becomes the active master,
		provided it is listed in ALLOWED_NODES and the heartbeat disk is not
		already in use. Otherwise the node connects to the master over TCP and
		calls its remote "register" method; heartbeats are started once the
		master has accepted the registration.

		Any exception raised during startup is logged and the service is
		stopped.
		"""

		def startHeartbeats():
			# Slave heartbeat and RPC service run on every node
			self._startSlave()
			self.s_rpc.startService()

			# Master services only when we are the active master
			if self.role == MasterService.RL_ACTIVE:
				self._startMaster() 

		def joinRefused(reason):
			# Other kinds of failure are passed on to the next errback
			reason.trap(NodeRefusedError, RPCRefusedError)
			log.err("Join to cluster %s failed: Master %s has refused me: %s" % 
				(core.cfg['CLUSTER_NAME'], self.master, reason.getErrorMessage()))
			self.stopService()

		def joinAccepted(result):
			self.role=MasterService.RL_PASSIVE
			log.info("Join successfull, I'm now part of cluster %s." % (core.cfg['CLUSTER_NAME']))
			startHeartbeats()
			
		def masterConnected(obj):
			# obj is the remote root object: ask the master to register us
			d = obj.callRemote("register",DNSCache.getInstance().name)
			d.addCallbacks(joinAccepted,joinRefused)
			d.addErrback(log.err)
			# rpcConnector is assigned below, before this callback can run
			d.addBoth(lambda _: rpcConnector.disconnect())
			return d

		try:
			if self.master is None:
				# New active master
				if DNSCache.getInstance().name not in core.cfg['ALLOWED_NODES']:
					log.warn("I'm not allowed to create a new cluster. Exiting.")
					raise Exception("Cluster creation not allowed")

				if DiskHeartbeat.is_in_use():
					log.err("Heartbeat disk is in use but we are alone !")
					raise Exception("Heartbeat disk already in use")

				log.info("No master found. I'm now the new master of %s." % (core.cfg['CLUSTER_NAME']))
				self.role=MasterService.RL_ACTIVE
				self.master=DNSCache.getInstance().name
				# Register ourselves with an empty status record
				self.status[self.master]={'timestamp': 0, 'offset': 0, 'vms': []}
				self.disk.make_slot(DNSCache.getInstance().name)
				startHeartbeats()

			else:
				# Passive master
				self.role=MasterService.RL_JOINING
				log.info("Trying to join cluster %s..." % (core.cfg['CLUSTER_NAME']))

				factory = pb.PBClientFactory()
				rpcConnector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'], factory)
				d = factory.getRootObject()
				d.addCallback(masterConnected)
				d.addErrback(log.err)
		except Exception, e:
			# Covers the refusals raised above as well as any unexpected error
			log.err("Startup failed: %s. Shutting down." % (e))
			self.stopService()
Beispiel #9
0
	def _unregister(self, name):
		try:
			del self.status[name]
		except:
			pass

		try:
			self.disk.erase_slot(name)
		except DiskHeartbeatError, e:
			log.warn("Cannot erase slot: %s. You may have to reformat hearbeat disk." % (e))
Beispiel #10
0
	def checkMasterHeartbeat(self):
		"""Trigger an election when the master heartbeat has been lost.

		Only a passive node performs this check, and it does so even in panic
		mode. Returns the result of triggerElection() when the last master
		heartbeat is older than MasterService.TM_MASTER seconds, else None.
		"""
		# Master failover only if we are a slave (panic mode does not prevent it)
		if self.role == MasterService.RL_PASSIVE:
			# Usecase #7: master lost
			deadline=self.masterLastSeen+MasterService.TM_MASTER
			if deadline <= int(time.time()):
				log.warn("Broadcast heartbeat lost, master has disappeared.")
				return self.triggerElection()
Beispiel #11
0
    def checkMasterHeartbeat(self):
        """Trigger an election when the master heartbeat has been lost.

        Only a passive node performs this check, and it does so even in panic
        mode. Returns the result of triggerElection() when the last master
        heartbeat is older than MasterService.TM_MASTER seconds, else None.
        """
        # Master failover only if we are a slave (panic mode does not prevent it)
        if self.role == MasterService.RL_PASSIVE:
            # Usecase #7: master lost
            deadline = self.masterLastSeen + MasterService.TM_MASTER
            if deadline <= int(time.time()):
                log.warn("Broadcast heartbeat lost, master has disappeared.")
                return self.triggerElection()
Beispiel #12
0
    def _stopMaster(self):
        """Stop the master heartbeat and the slaves watchdog.

        A last heartbeat is forced before the heartbeat service is stopped.
        Stopping while a recovery is running is allowed but logged.
        """
        recovering = self.state == MasterService.ST_RECOVERY
        if recovering:
            # Recovery will be re-run by next master (VM on current host may be lost)
            # but first recovery may be still running (cannot kill threads)
            log.warn("Stopping master during a recovery process !")

        # Send a last heartbeat before stopping
        heartbeat = self.s_masterHb
        heartbeat.forcePulse()
        stopping = heartbeat.stopService()
        stopping.addErrback(log.err)

        watchdog = self.l_masterDog
        if watchdog.running:
            watchdog.stop()
Beispiel #13
0
    def _unregister(self, name):
        try:
            del self.status[name]
        except:
            pass

        try:
            self.disk.erase_slot(name)
        except DiskHeartbeatError, e:
            log.warn(
                "Cannot erase slot: %s. You may have to reformat hearbeat disk."
                % (e))
Beispiel #14
0
	def _stopMaster(self):
		"""Stop the master heartbeat and the slaves watchdog.

		A last heartbeat is forced before the heartbeat service is stopped.
		Stopping while a recovery is running is allowed but logged.
		"""
		recovering=(self.state == MasterService.ST_RECOVERY)
		if recovering:
			# Recovery will be re-run by next master (VM on current host may be lost)
			# but first recovery may be still running (cannot kill threads)
			log.warn("Stopping master during a recovery process !")

		# Send a last heartbeat before stopping
		heartbeat=self.s_masterHb
		heartbeat.forcePulse()
		stopping=heartbeat.stopService()
		stopping.addErrback(log.err)

		watchdog=self.l_masterDog
		if watchdog.running:
			watchdog.stop()
Beispiel #15
0
def sync_stat(api, stat, reason=""):
    """Log the migration state and push it to the status API.

    api -- target handed to sync(); nothing is sent when it is None or empty
    stat -- value written to /status/migrateState
    reason -- value written to /status/message (defaults to "")
    """
    logs.info("\x1b[0;32m status:%s reason:%s \x1b[0m", stat, reason)
    # Identity test for None: `== None` can be hijacked by a custom __eq__
    if api is None or api == '':
        logs.warn("sync state api is nil")
        return
    # Two "replace" operations, one per status field
    patch = [
        {"op": "replace", "path": "/status/migrateState", "value": stat},
        {"op": "replace", "path": "/status/message", "value": reason}
    ]
    sync(api, patch)

# sync_stat('http://10.213.44.128:12808/tidb/api/v1/tidbs/006-xinyang1', 'Dumping')
Beispiel #16
0
 def _transform_stamps(self, stamps):
     if stamps is None:
         stamps = []
     
     ret = []
     
     for stamp in stamps:
         try:
             ret.append(HTTPStamp().importStamp(stamp).dataExport())
         except Exception:
             logs.warn(utils.getFormattedException())
     
     return ret
Beispiel #17
0
	def _sendError(self, reason):
		"""Handle a network heartbeat failure.

		The failure is always logged. While fewer than MAX_RETRY retries have
		been made, the heartbeat is rescheduled 2 seconds later; otherwise the
		master is asked, through an Agent, to engage panic mode.
		"""
		# Log all stacktrace to view the origin of this error
		log.err("Netheartbeat failure: %s" % (reason))

		if self.retry < self.MAX_RETRY:
			log.warn("Restarting network heartbeat within a few seconds...")
			self.retry+=1	# Will be reset at each election (or panic recovery)
			reactor.callLater(2, self._run, self._proto)
			return

		# Retries exhausted: engage panic mode
		log.emerg("Too many retry. Asking master to engage panic mode.")
		panicAgent=Agent()
		request=panicAgent.panic()
		request.addErrback(log.err)
		request.addBoth(lambda x: panicAgent.disconnect())
Beispiel #18
0
    def voteForNewMaster(self, msg):
        """Handle a vote request and send our own vote.

        Requests coming from a node absent from self.status are ignored. A
        request received during an election aborts that election (its tally
        is cancelled). A leaving node records the election but does not vote.
        Otherwise heartbeats are stopped, the role switches to voting, the
        tally is scheduled MasterService.TM_TALLY seconds later and the vote
        is sent through a UDP port opened for the occasion.
        """
        # Elections accepted even if in panic mode

        def sendVote(result):
            log.info("Sending our vote...")
            result.sendMessage()
            # `port` is assigned below, before this callback can run
            port.stopListening()

        # Check origin of message
        if msg.node not in self.status:
            log.warn("Received vote request from unknown node %s." %
                     (msg.node))
            return

        # Discard current election if there is a new one
        if self.role == MasterService.RL_VOTING:
            log.warn("Previous election aborded: new vote request received.")
            try:
                self.f_tally.cancel()
            except Exception:
                # Best effort: the tally may already have run or been
                # cancelled. Only regular errors are ignored.
                pass

        log.info("Vote request received from %s." % (msg.node))
        self.currentElection = msg.election

        # Discard vote request if we are leaving
        if self.role == MasterService.RL_LEAVING:
            log.info("Vote request ignored: we are leaving this cluster.")
            return

        # Stop heartbeating
        self._stopSlave()
        if self.role == MasterService.RL_ACTIVE:
            self._stopMaster()

        # Prepare election
        self.role = MasterService.RL_VOTING
        self.ballotBox = dict()
        self.f_tally = reactor.callLater(
            MasterService.TM_TALLY,
            self.countVotes)  # Timeout of election stage

        # Send our vote
        d = Deferred()
        port = reactor.listenUDP(
            0,
            UDPSender(
                d, lambda: MessageVoteResponse().forge(self.currentElection)))
        d.addCallback(sendVote)
        d.addErrback(log.err)
Beispiel #19
0
 def _sendError(self, reason):
     """Handle a network heartbeat failure.

     The failure is always logged. While fewer than MAX_RETRY retries have
     been made, the heartbeat is rescheduled 2 seconds later; otherwise the
     master is asked, through an Agent, to engage panic mode.
     """
     # Log all stacktrace to view the origin of this error
     log.err("Netheartbeat failure: %s" % (reason))

     if self.retry < self.MAX_RETRY:
         log.warn("Restarting network heartbeat within a few seconds...")
         self.retry += 1  # Will be reset at each election (or panic recovery)
         reactor.callLater(2, self._run, self._proto)
         return

     # Retries exhausted: engage panic mode
     log.emerg("Too many retry. Asking master to engage panic mode.")
     panic_agent = Agent()
     request = panic_agent.panic()
     request.addErrback(log.err)
     request.addBoth(lambda x: panic_agent.disconnect())
Beispiel #20
0
    def migrate(self, vmname, src_hostname, dst_hostname):
        """Live migration of specified VM from src to dst.

		All params are strings.

		Raise a NotInClusterError if src or dst are not part of cluster.
		Raise a NotRunningVmError if vm is not started on src or 
		a RunningVmError if vm is already started on dst.
		"""

        # Security checks
        if not self.is_in_cluster(src_hostname):
            raise NotInClusterError(src_hostname)

        if not self.is_in_cluster(dst_hostname):
            raise NotInClusterError(dst_hostname)

        dst_node = self.get_node(dst_hostname)
        src_node = self.get_node(src_hostname)

        if not src_node.is_vm_started(vmname):
            raise NotRunningVmError(src_node.get_hostname(), vmname)

        if dst_node.is_vm_started(vmname):
            raise RunningVmError(dst_node.get_hostname(), vmname)

            # Resources checks
        used_ram = src_node.get_vm(vmname).get_ram()
        free_ram = dst_node.metrics.get_free_ram()
        if used_ram > free_ram:
            raise NotEnoughRamError(dst_node.get_hostname(), "need " + str(used_ram) + "M, has " + str(free_ram) + "M.")

            # Take care of proper migration
        dst_node.activate_lv(vmname)
        src_node.migrate(vmname, dst_node)  # Could raise xen.xm.XenAPI.Failure (not SystemExit)
        src_node.deactivate_lv(vmname)
        src_node.disable_vm_autostart(vmname)
        dst_node.enable_vm_autostart(vmname)

        if core.cfg["POST_MIGRATION_HOOK"]:
            # Run post migration script in background without error handling
            try:
                # Double fork with all filehandles closed, if not, run() hang
                src_node.run(
                    "(%s %s %s %s 2>&- >&- <&- &)&"
                    % (core.cfg["POST_MIGRATION_HOOK"], vmname, src_node.get_hostname(), dst_node.get_hostname())
                )
            except Exception, e:
                log.warn("Post-migration hook failed : %s" % (e))
Beispiel #21
0
	def updateNodeStatus(self, msg):
		"""Record the heartbeat of a slave in self.status.

		Heartbeats are only accepted by the active master and only from nodes
		already present in self.status. The stored record holds the reception
		time, its offset with the timestamp of the message, and the VM list
		carried by the message.
		"""
		role=self.role
		if role != MasterService.RL_ACTIVE:
			# Some slave HB could reach us during election...
			if role == MasterService.RL_PASSIVE:
				log.warn("Received slave heartbeat from %s while we're not master." % (msg.node))
			return

		# Check origin of message
		if msg.node not in self.status:
			log.warn("Received slave heartbeat from unknown node %s." % (msg.node))
			return

		receivedAt=int(time.time())
		record={
			'timestamp': receivedAt,
			'offset': receivedAt-msg.ts,
			'vms': msg.vms,
		}
		self.status[msg.node]=record
Beispiel #22
0
		def instantiationFailed(reason):
			"""Retry the cluster instantiation without the unreachable nodes.

			Only InstantiationError failures are handled. The keys of the
			mapping carried by the error are taken as the names of the failed
			nodes; they are removed from the node list and a new cluster
			instance is requested for the remaining ones, chained to
			startRecover.
			"""
			reason.trap(InstantiationError)

			unreachable=reason.value.value.keys()
			log.warn("Can't connect to", ", ".join(unreachable))

			# Keep only the nodes that are still reachable
			survivors=self.getNodesList()
			for node in unreachable:
				survivors.remove(node)

			# Re-instantiate cluster without nodes in error
			retry=XenCluster.getDeferInstance(survivors)
			retry.addCallbacks(startRecover)
			return retry
Beispiel #23
0
        def instantiationFailed(reason):
            """Retry the cluster instantiation without the unreachable nodes.

            Only InstantiationError failures are handled. The keys of the
            mapping carried by the error are taken as the names of the failed
            nodes; they are removed from the node list and a new cluster
            instance is requested for the remaining ones, chained to
            startRecover.
            """
            reason.trap(InstantiationError)

            unreachable = reason.value.value.keys()
            log.warn("Can't connect to", ", ".join(unreachable))

            # Keep only the nodes that are still reachable
            survivors = self.getNodesList()
            for node in unreachable:
                survivors.remove(node)

            # Re-instantiate cluster without nodes in error
            retry = XenCluster.getDeferInstance(survivors)
            retry.addCallbacks(startRecover)
            return retry
Beispiel #24
0
	def migrate(self, vmname, src_hostname, dst_hostname):
		"""Live migration of specified VM from src to dst.

		All params are strings.

		Raise a NotInClusterError if src or dst are not part of cluster.
		Raise a NotRunningVmError if vm is not started on src or 
		a RunningVmError if vm is already started on dst.
		"""

		# Security checks
		if not self.is_in_cluster(src_hostname):
			raise NotInClusterError(src_hostname)

		if not self.is_in_cluster(dst_hostname):
			raise NotInClusterError(dst_hostname)

		dst_node=self.get_node(dst_hostname)
		src_node=self.get_node(src_hostname)

		if not src_node.is_vm_started(vmname):
			raise NotRunningVmError(src_node.get_hostname(), vmname)
		
		if dst_node.is_vm_started(vmname):
			raise RunningVmError(dst_node.get_hostname(), vmname)

		# Resources checks
		used_ram=src_node.get_vm(vmname).get_ram()
		free_ram=dst_node.metrics.get_free_ram()
		if used_ram>free_ram:
			raise NotEnoughRamError(dst_node.get_hostname(),"need "+str(used_ram)+"M, has "+str(free_ram)+"M.")

		# Take care of proper migration
		dst_node.activate_lv(vmname)
		src_node.migrate(vmname,dst_node) # Could raise xen.xm.XenAPI.Failure (not SystemExit)
		src_node.deactivate_lv(vmname)
		src_node.disable_vm_autostart(vmname)
		dst_node.enable_vm_autostart(vmname)

		if core.cfg['POST_MIGRATION_HOOK']:
			# Run post migration script in background without error handling
			try:
				# Double fork with all filehandles closed, if not, run() hang
				src_node.run("(%s %s %s %s 2>&- >&- <&- &)&" % (core.cfg['POST_MIGRATION_HOOK'], vmname, src_node.get_hostname(), dst_node.get_hostname()))
			except Exception, e:
				log.warn("Post-migration hook failed : %s" % (e))
Beispiel #25
0
    def recoverFromPanic(self):
        """Leave panic mode and trigger a new election.

        Raise a RPCRefusedError if the node is not in panic mode or is not
        the active master. Returns the result of triggerElection().
        """
        if not self.isInPanic():
            log.warn("I'm not in panic. Cannot recover anything.")
            raise RPCRefusedError("Not in panic mode")
        elif self.role != MasterService.RL_ACTIVE:
            # Only master can do recovery
            log.warn("I'm not master. Cannot recover from panic.")
            raise RPCRefusedError("Not master")

        # Back to normal mode
        log.info("Recovering from panic mode. Back to normals operations.")
        self.state = MasterService.ST_NORMAL
        self.s_masterHb.forcePulse()
        return self.triggerElection()
Beispiel #26
0
	def recoverFromPanic(self):
		"""Leave panic mode and trigger a new election.

		Raise a RPCRefusedError if the node is not in panic mode or is not
		the active master. Returns the result of triggerElection().
		"""
		if not self.isInPanic():
			log.warn("I'm not in panic. Cannot recover anything.")
			raise RPCRefusedError("Not in panic mode")
		elif self.role != MasterService.RL_ACTIVE:
			# Only master can do recovery
			log.warn("I'm not master. Cannot recover from panic.")
			raise RPCRefusedError("Not master")

		# Back to normal mode
		log.info("Recovering from panic mode. Back to normals operations.")
		self.state=MasterService.ST_NORMAL
		self.s_masterHb.forcePulse()
		return self.triggerElection()
Beispiel #27
0
 def process_user(self, user, categories=None):
     """Generate and save the collage images of `user` for each category.

     `categories` defaults to self._categories. For every category a stamp
     collection is fetched with a time slice limited to 100, the entities of
     the stamps are shuffled and cut down to 30, and each image produced by
     the collage of that category is saved as
     "collage-<screen_name>-<category>-<width>x<height>.jpg".

     The whole run is attempted up to 3 times. A failed attempt is logged
     and followed by an exponential back-off (2s, then 4s) before the next
     one; nothing is raised when every attempt fails.
     """
     assert user is not None
     
     if categories is None:
         categories = self._categories
     
     max_attempts = 3
     
     for attempt in range(1, max_attempts + 1):
         try:
             for category in categories:
                 ts = {
                     'user_id' : user.user_id, 
                     'scope'   : 'user'
                 }
                 
                 # 'app' is filtered as a subcategory, 'default' is not filtered
                 if category != 'default':
                     if category == 'app':
                         ts['subcategory'] = 'app'
                     else:
                         ts['category']    = category
                 
                 ts['limit'] = 100
                 collage     = self._collages[category]
                 stamp_slice = HTTPTimeSlice().dataImport(ts).exportTimeSlice()
                 stamps      = self.api.getStampCollection(stamp_slice)
                 entities    = [s.entity for s in stamps]
                 entities    = utils.shuffle(entities)[:30]
                 
                 logs.info("creating collage for user '%s' w/ category '%s' and %d entities" % (user.screen_name, category, len(entities)))
                 images = collage.generate_from_user(user, entities)
                 
                 for image in images:
                     filename = "collage-%s-%s-%sx%s.jpg" % (user.screen_name, category, image.size[0], image.size[1])
                     
                     self.save_image(image, filename)
             
             break
         except Exception as e:
             logs.warn("unexpected error processing user %s: %s" % (str(user), e))
             logs.warn(utils.getFormattedException())
             
             # Back off only when another attempt follows: sleeping after
             # the last failure would just delay the caller
             if attempt < max_attempts:
                 time.sleep(2 ** attempt)
Beispiel #28
0
	def voteForNewMaster(self, msg):
		"""Handle a vote request and send our own vote.

		Requests coming from a node absent from self.status are ignored. A
		request received during an election aborts that election (its tally
		is cancelled). A leaving node records the election but does not vote.
		Otherwise heartbeats are stopped, the role switches to voting, the
		tally is scheduled MasterService.TM_TALLY seconds later and the vote
		is sent through a UDP port opened for the occasion.
		"""
		# Elections accepted even if in panic mode

		def sendVote(result):
			log.info("Sending our vote...")
			result.sendMessage()
			# `port` is assigned below, before this callback can run
			port.stopListening()

		# Check origin of message
		if msg.node not in self.status:
			log.warn("Received vote request from unknown node %s." % (msg.node))
			return

		# Discard current election if there is a new one
		if self.role == MasterService.RL_VOTING:
			log.warn("Previous election aborded: new vote request received.")
			try:
				self.f_tally.cancel()
			except Exception:
				# Best effort: the tally may already have run or been
				# cancelled. Only regular errors are ignored.
				pass

		log.info("Vote request received from %s." % (msg.node))
		self.currentElection=msg.election

		# Discard vote request if we are leaving
		if self.role == MasterService.RL_LEAVING:
			log.info("Vote request ignored: we are leaving this cluster.")
			return

		# Stop heartbeating
		self._stopSlave()
		if self.role == MasterService.RL_ACTIVE:
			self._stopMaster()

		# Prepare election
		self.role=MasterService.RL_VOTING
		self.ballotBox=dict()
		self.f_tally=reactor.callLater(MasterService.TM_TALLY, self.countVotes) # Timeout of election stage

		# Send our vote
		d = Deferred()
		port = reactor.listenUDP(0, UDPSender(d, lambda: MessageVoteResponse().forge(self.currentElection)))
		d.addCallback(sendVote)
		d.addErrback(log.err)
Beispiel #29
0
	def leaveCluster(self):
		"""Leave the cluster according to our current role.

		The slave services are stopped and the role becomes "leaving". An
		active master deletes its own record, triggers an election if other
		nodes remain, then stops its master services. A passive node asks the
		master to unregister it through RPC. In any other role nothing more
		is done. Returns a deferred in every case.
		"""
		def onMasterConnected(root):
			request = root.callRemote("unregister",DNSCache.getInstance().name)
			request.addErrback(log.err)
			# rpcConnector is assigned below, before this callback can run
			request.addBoth(lambda _: rpcConnector.disconnect())
			return request

		# Stop slave heartbeat and watchdog
		self._stopSlave()

		previousRole=self.role
		self.role=MasterService.RL_LEAVING

		if previousRole == MasterService.RL_ACTIVE:
			# Self-delete our own record
			self._unregister(DNSCache.getInstance().name)

			if len(self.status) > 0:
				# New election only if there is at least one node
				d=self.triggerElection()
				d.addErrback(log.err)
			else:
				log.warn("I'm the last node, shutting down cluster.")
				d=defer.succeed(None)

			# Stop master heartbeat when vote request has been sent
			d.addBoth(lambda _: self._stopMaster())
			return d

		if previousRole == MasterService.RL_PASSIVE:
			rpcFactory = pb.PBClientFactory()
			rpcConnector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'], rpcFactory)
			d = rpcFactory.getRootObject()
			d.addCallback(onMasterConnected)
			return d

		# RL_ALONE or RL_JOINING or RL_VOTING
		if previousRole == MasterService.RL_VOTING:
			# Others nodes will re-trigger an election if we win this one
			log.warn("Quitting cluster during election stage !")

		return defer.succeed(None)
Beispiel #30
0
    def leaveCluster(self):
        """Leave the cluster according to our current role.

        The slave services are stopped and the role becomes "leaving". An
        active master deletes its own record, triggers an election if other
        nodes remain, then stops its master services. A passive node asks the
        master to unregister it through RPC. In any other role nothing more
        is done. Returns a deferred in every case.
        """
        def onMasterConnected(root):
            request = root.callRemote("unregister", DNSCache.getInstance().name)
            request.addErrback(log.err)
            # rpcConnector is assigned below, before this callback can run
            request.addBoth(lambda _: rpcConnector.disconnect())
            return request

        # Stop slave heartbeat and watchdog
        self._stopSlave()

        previousRole = self.role
        self.role = MasterService.RL_LEAVING

        if previousRole == MasterService.RL_ACTIVE:
            # Self-delete our own record
            self._unregister(DNSCache.getInstance().name)

            if len(self.status) > 0:
                # New election only if there is at least one node
                d = self.triggerElection()
                d.addErrback(log.err)
            else:
                log.warn("I'm the last node, shutting down cluster.")
                d = defer.succeed(None)

            # Stop master heartbeat when vote request has been sent
            d.addBoth(lambda _: self._stopMaster())
            return d

        if previousRole == MasterService.RL_PASSIVE:
            rpcFactory = pb.PBClientFactory()
            rpcConnector = reactor.connectTCP(self.master,
                                              core.cfg['TCP_PORT'], rpcFactory)
            d = rpcFactory.getRootObject()
            d.addCallback(onMasterConnected)
            return d

        # RL_ALONE or RL_JOINING or RL_VOTING
        if previousRole == MasterService.RL_VOTING:
            # Others nodes will re-trigger an election if we win this one
            log.warn("Quitting cluster during election stage !")

        return defer.succeed(None)
Beispiel #31
0
	def fence(self, hostname):
		"""
		Fence the given node. 

		You have to make a fencing script that will use iLo, IPMI or other such fencing device.
		See FENCE_CMD in configuration file.

		Raise a FenceNodeError if the fence fail of if DISABLE_FENCING is True.
		"""
		if core.cfg['DISABLE_FENCING']:
			raise FenceNodeError(self.get_hostname(), "Fencing disabled by configuration", hostname)

		if self.get_hostname() == hostname:
			log.warn("Node is self-fencing !")
			log.warn("\"Chérie ça va trancher.\"")

		try:
			self.run(core.cfg['FENCE_CMD'] + " " + hostname)
		except ShellError, e:
			raise FenceNodeError(self.get_hostname(), e.value, hostname)
Beispiel #32
0
    def fence(self, hostname):
        """
		Fence the given node. 

		You have to make a fencing script that will use iLo, IPMI or other such fencing device.
		See FENCE_CMD in configuration file.

		Raise a FenceNodeError if the fence fail of if DISABLE_FENCING is True.
		"""
        if core.cfg['DISABLE_FENCING']:
            raise FenceNodeError(self.get_hostname(),
                                 "Fencing disabled by configuration", hostname)

        if self.get_hostname() == hostname:
            log.warn("Node is self-fencing !")
            log.warn("\"Chérie ça va trancher.\"")

        try:
            self.run(core.cfg['FENCE_CMD'] + " " + hostname)
        except ShellError, e:
            raise FenceNodeError(self.get_hostname(), e.value, hostname)
Beispiel #33
0
def revive_tombstoned_entities(entity_id):
    """
    Undo the tombstoning of every entity that is tombstoned to the given entity.

    The todos and stamps currently attached to the given entity are looked up in the seed collections to find the
    entity they originally belonged to, and are handed back to that entity. Every entity touched along the way is
    finally merged through the API.
    """

    entities_by_id = {}
    entity_db = MongoEntityCollection.MongoEntityCollection()

    def restored(original_id):
        # Reuse an entity that was already revived, otherwise load it and clear its tombstone
        if original_id in entities_by_id:
            return entities_by_id[original_id]
        loaded = entity_db.getEntity(original_id)
        return clear_tombstone_id(loaded, entity_db, entities_by_id)

    for tombstoned in entity_db.getEntitiesByTombstoneId(entity_id):
        clear_tombstone_id(tombstoned, entity_db, entities_by_id)

    # Hand the todos back to their original entity
    todo_db = MongoTodoCollection.MongoTodoCollection()
    todo_seed_db = MongoTodoCollection.MongoSeedTodoCollection()
    for todo_id in todo_db.getTodoIdsFromEntityId(entity_id):
        source_id = todo_seed_db.getEntityIdForTodo(todo_id)
        if source_id is None:
            logs.warn('Could not find entity for seed todo: ' + todo_id)
            continue
        todo_db.updateTodoEntity(todo_id, restored(source_id).minimize())

    # Hand the stamps back to their original entity
    stamp_db = MongoStampCollection.MongoStampCollection()
    stamp_seed_db = MongoStampCollection.MongoSeedStampCollection()
    for stamp_id in stamp_db.getStampIdsForEntity(entity_id):
        source_id = stamp_seed_db.getStamp(stamp_id).entity.entity_id
        stamp_db.updateStampEntity(stamp_id, restored(source_id).minimize())

    api = globalMongoStampedAPI()
    for revived in entities_by_id.itervalues():
        api.mergeEntity(revived)
Beispiel #34
0
def _buildMap(template, stamps):
    """
    Render ``template`` with the JSON export of ``stamps`` and mark the response cacheable for ten minutes.

    Stamps containing a "deleted" key are left out. A stamp that fails to convert is logged and skipped.
    Any other failure is logged and turned into ``Http404``.
    """
    try:
        result = []

        for stamp in stamps:
            try:
                if "deleted" not in stamp:
                    result.append(HTTPStamp().importSchema(stamp).dataExport())
            except Exception:
                # One broken stamp must not break the whole map; unlike a bare except this
                # still lets SystemExit / KeyboardInterrupt through.
                logs.warn(utils.getFormattedException())

        result = json.dumps(result, sort_keys=True)

        response = render_to_response(template, {"stamps": result})

        # Expires has to be an HTTP-date in IMF-fixdate form; ctime() only yields the
        # obsolete asctime layout that senders are not supposed to generate.
        expires = datetime.utcnow() + timedelta(minutes=10)
        response["Expires"] = expires.strftime("%a, %d %b %Y %H:%M:%S GMT")
        response["Cache-Control"] = "max-age=600"

        return response

    except Exception as e:
        logs.warning("Error: %s" % e)
        raise Http404
Beispiel #35
0
    def _elasticsearch(self):
        """
        Open a pyes connection and return the client.

        On EC2 the servers are the stack nodes whose roles contain "search"; None is returned when the stack
        info or such nodes are missing. Elsewhere localhost is used. The connection is attempted up to five
        times, one second apart, and the last failure is re-raised.
        """
        try:
            import pyes
        except:
            # NOTE(review): a failed import is only printed; the code below will then fail on the
            # unbound name 'pyes' after exhausting its retries.
            utils.printException()

        es_port = 9200
        attempts_left = 5

        if not libs.ec2_utils.is_ec2():
            es_servers = "%s:%d" % ("localhost", es_port)
        else:
            stack = libs.ec2_utils.get_stack()

            if stack is None:
                logs.warn("error: unable to find stack info")
                return None

            search_nodes = [node for node in stack.nodes if "search" in node.roles]
            es_servers = [str("%s:%d" % (node.private_ip_address, es_port)) for node in search_nodes]

            if not es_servers:
                logs.warn("error: no elasticsearch servers found")
                return None

        while True:
            try:
                client = pyes.ES(es_servers)
                details = client.collect_info()
                utils.log("[%s] pyes: %s" % (self, pformat(details)))

                return client
            except Exception:
                attempts_left -= 1
                if attempts_left <= 0:
                    raise

                utils.printException()
                time.sleep(1)
Beispiel #36
0
    def updateNodeStatus(self, msg):
        """
        Record a slave heartbeat in self.status.

        The heartbeat is ignored unless we are the active master and msg.node is a known node.
        The stored entry holds the reception time, the offset to the sender's timestamp and its VM list.
        """
        if self.role != MasterService.RL_ACTIVE:
            # Some slave HB could reach us during election...
            if self.role == MasterService.RL_PASSIVE:
                log.warn("Received slave heartbeat from %s while we're not master." % (msg.node))
            return

        # Refuse heartbeats from nodes that are not part of the cluster
        if msg.node not in self.status:
            log.warn("Received slave heartbeat from unknown node %s." % (msg.node))
            return

        received_at = int(time.time())
        entry = {
            'timestamp': received_at,
            'offset': received_at - msg.ts,
            'vms': msg.vms
        }
        self.status[msg.node] = entry
Beispiel #37
0
	def _startMaster(self):
		"""
		Start the master duties: the master heartbeat service now, the slaves watchdog two seconds later.
		A state left in recovery mode is reset to normal so that failover runs again.
		"""

		def onWatchdogFailure(failure):
			log.emerg("Slave heartbeat checks failed: %s." % (failure.getErrorMessage()))
			self.panic()

		def launchWatchdog():
			# Nothing to do if the looping call is already running
			if self.l_masterDog.running:
				return

			watchdog=self.l_masterDog.start(MasterService.TM_WATCHDOG)
			watchdog.addErrback(onWatchdogFailure)
			watchdog.addErrback(log.err)

		# Master heartbeat first
		self.s_masterHb.startService()

		# A recovery state found here is treated as left over from the previous master
		if self.state == MasterService.ST_RECOVERY:
			log.warn("Previous master was recovering something: re-enabling failover.")
			# Force normal mode to re-run failover
			self.state=MasterService.ST_NORMAL

		# Delayed start of the watchdog in charge of slaves failover
		reactor.callLater(2, launchWatchdog)
Beispiel #38
0
class UDPListener(DatagramProtocol):
    def __init__(self, onReceive):
        self.c_onReceive = onReceive

    def datagramReceived(self, data, (host, port)):
        try:
            if USE_ZLIB:
                (crc, zip) = data.split(',', 1)
                data = zlib.decompress(zip)
                if int(crc) != zlib.adler32(data):
                    raise Exception("Data from %s is corrupted." % (host))

            msg = json.loads(data)
        except Exception, e:
            log.warn("Error parsing message: %s" % (e))
Beispiel #39
0
    def updateMasterStatus(self, msg):
        """
        Handle a master heartbeat.

        The first heartbeat seen defines the master. An active master discards its own heartbeat and triggers
        an election (and possibly panic mode) when another master shows up. Otherwise, heartbeats from the
        known master refresh the local copy of the cluster status and state.
        """
        if self.master is None:
            self.master = msg.node
            log.info("Found master at %s." % (self.master))
        elif msg.node not in self.status:
            # Check origin of message if we known the cluster members
            log.warn("Received master heartbeat from unknown node %s." % (msg.node))
            return

        # Active master's checks
        if self.role == MasterService.RL_ACTIVE:
            if self.master == msg.node:
                return  # Discard our own master heartbeat

            # Usecase #8: partition ended with many master
            log.warn("Received another master's heartbeat from %s ! Trying to recover from partition..." % (msg.node))
            self.triggerElection().addErrback(log.err)

            # Propagate panic mode from another master
            if msg.state == MasterService.ST_PANIC:
                log.warn("Concurrent master is in panic mode, so we should be too.")
                self.panic()
            return

        # Passive master's checks
        if self.master != msg.node:
            log.warn("Received master heartbeat from a wrong master %s !" % (msg.node))
            return

        # Check error mode change to panic
        if not self.isInPanic() and msg.state == MasterService.ST_PANIC:
            log.emerg("SYSTEM FAILURE: Panic mode has been engaged by master.")

        # Keep a backup of the active master's state and status
        self.status = msg.status
        self.state = msg.state
        self.masterLastSeen = int(time.time())
Beispiel #40
0
	def updateMasterStatus(self, msg):
		"""
		Process a heartbeat sent by a master.

		When no master is known yet, the sender becomes our master. As active master, our own heartbeat is
		dropped and a foreign one starts an election (panic mode is propagated). As other roles, heartbeats
		from the known master update the saved status, state and last-seen time.
		"""

		if self.master is None:
			self.master=msg.node
			log.info("Found master at %s." % (self.master))
		elif msg.node not in self.status:
			# Check origin of message if we known the cluster members
			log.warn("Received master heartbeat from unknown node %s." % (msg.node))
			return

		# Active master's checks
		if self.role == MasterService.RL_ACTIVE:
			if self.master != msg.node:
				# Usecase #8: partition ended with many master
				log.warn("Received another master's heartbeat from %s ! Trying to recover from partition..." % (msg.node))
				self.triggerElection().addErrback(log.err)

				# Propagate panic mode from another master
				if msg.state == MasterService.ST_PANIC:
					log.warn("Concurrent master is in panic mode, so we should be too.")
					self.panic()

			# Either it was our own heartbeat (discarded) or the election has been triggered
			return

		# Passive master's checks
		if self.master != msg.node:
			log.warn("Received master heartbeat from a wrong master %s !" % (msg.node))
			return

		# Check error mode change to panic
		if not self.isInPanic() and msg.state == MasterService.ST_PANIC:
			log.emerg("SYSTEM FAILURE: Panic mode has been engaged by master.")

		# Keep a backup of the active master's state and status
		self.status=msg.status
		self.state=msg.state
		self.masterLastSeen=int(time.time())
Beispiel #41
0
	def recordVote(self, msg):
		"""
		Store the ballot carried by msg in the ballot box.

		The vote is discarded (with a warning) when the sender is unknown, when we are not in
		voting role, or when it belongs to another election.
		"""
		voter=msg.node

		if voter not in self.status:
			# Check origin of message
			log.warn("Vote received from unknown node %s." % (voter))
		elif self.role != MasterService.RL_VOTING:
			log.warn("Vote received from %s but it's not election time !" % (voter))
		elif self.currentElection != msg.election:
			log.warn("Vote received for another election from %s. Discarding." % (voter))
		else:
			self.ballotBox[msg.ballot]=voter
Beispiel #42
0
	def unregisterNode(self, name):
		"""
		Remove the node 'name' from the cluster (allowed even in panic mode).

		Raise a RPCRefusedError if we are not the active master.
		Raise a NodeRefusedError if the node is unknown or is the master itself.
		"""

		if self.role != MasterService.RL_ACTIVE:
			log.warn("I'm not master. Cannot unregister %s." % (name))
			raise RPCRefusedError("Not master")
		elif name not in self.status:
			log.warn("Unknown node %s try to quit the cluster." % (name))
			raise NodeRefusedError("Unknown node "+name)
		elif name == DNSCache.getInstance().name:
			log.warn("I'm the master. Cannot self unregister.")
			raise NodeRefusedError("Cannot unregister master")
		else:
			self._unregister(name)
Beispiel #43
0
    def unregisterNode(self, name):
        """
        Unregister the node called name. This is not blocked by panic mode.

        Raises RPCRefusedError when we are not the active master, and NodeRefusedError when the node
        is unknown or when it is the master itself.
        """
        if self.role != MasterService.RL_ACTIVE:
            log.warn("I'm not master. Cannot unregister %s." % (name))
            raise RPCRefusedError("Not master")

        known = name in self.status
        if not known:
            log.warn("Unknown node %s try to quit the cluster." % (name))
            raise NodeRefusedError("Unknown node " + name)

        local_name = DNSCache.getInstance().name
        if name == local_name:
            log.warn("I'm the master. Cannot self unregister.")
            raise NodeRefusedError("Cannot unregister master")

        self._unregister(name)
Beispiel #44
0
 def render(self, context):
     try:
         if self._context_variable is None:
             context_dict = self._simplify_context(context)
         else:
             context_dict = self._context_variable.resolve(context)
         
         result = self._library.render(self._name, context_dict)
         if len(result.strip()) == 0:
             logs.warn("%s.render warning empty result (%s)" % (self, self._name))
         
         return result
     except Exception, e:
         logs.warn("%s.render error (%s): %s" % (self, self._name, e))
         logs.warn(utils.getFormattedException())
         
         return ''
Beispiel #45
0
    def recordVote(self, msg):
        """
        Put the ballot of msg.node in the ballot box.

        Votes from unknown nodes, votes received outside of the voting role and votes for another
        election are only logged.
        """
        sender = msg.node

        # Check origin of message
        if sender not in self.status:
            log.warn("Vote received from unknown node %s." % (sender))
            return

        if self.role != MasterService.RL_VOTING:
            log.warn("Vote received from %s but it's not election time !" % (sender))
            return

        if self.currentElection != msg.election:
            log.warn("Vote received for another election from %s. Discarding." % (sender))
            return

        self.ballotBox[msg.ballot] = sender
Beispiel #46
0
    def panic(self, noCheck=False):
        """
        Engage panic mode.
        Use noCheck=True if you want to panic whatever the cluster role.

        An active master (or any role with noCheck) switches to panic state and stops its watchdog.
        During an election the request is delayed; a slave forwards it to the master.
        Raise a RPCRefusedError in any other role.
        """
        def forwardFailed(failure):
            # The master could not be asked: engage panic mode locally
            log.emerg("Panic query failed: %s." % (failure.getErrorMessage()))
            self.panic(True)

        if self.state == MasterService.ST_PANIC:
            log.emerg("Panic mode already engaged.")
            return

        if self.role == MasterService.RL_ACTIVE or noCheck:
            log.emerg("SYSTEM FAILURE: Panic mode engaged.")
            log.emerg("This is a critical error. You should bring your ass over here, right now.")
            log.emerg("Please check logs and be sure of what you're doing before re-engaging normal mode.")
            self.state = MasterService.ST_PANIC

            # TODO + stop LB
            if self.l_masterDog.running:
                self.l_masterDog.stop()
            return

        if self.role == MasterService.RL_VOTING:
            # No master during election stage: waiting next master
            log.warn("Panic mode requested during election stage: delaying.")
            self.panicRequested = True
            return

        if self.role == MasterService.RL_PASSIVE:
            log.warn("I'm slave: asking master to engage panic mode...")

            master = Agent()
            query = master.panic()
            query.addErrback(forwardFailed)
            query.addErrback(log.err)
            return

        # RL_ALONE or RL_JOINING or RL_LEAVING
        log.warn("I'm not in a running state (master or slave). Cannot engage panic mode.")
        raise RPCRefusedError("Not in running state")
Beispiel #47
0
	def panic(self, noCheck=False):
		"""
		Engage panic mode.
		Use noCheck=True if you want to panic whatever the cluster role.

		Depending on the role, panic mode is engaged locally (active master or noCheck), delayed
		(election stage) or requested to the master (slave).
		Raise a RPCRefusedError if the node is in none of these roles.
		"""

		def onQueryFailure(failure):
			# Asking the master failed: panic by ourself, whatever our role
			log.emerg("Panic query failed: %s." % (failure.getErrorMessage()))
			self.panic(True)

		alreadyInPanic=(self.state == MasterService.ST_PANIC)

		if alreadyInPanic:
			log.emerg("Panic mode already engaged.")

		elif self.role == MasterService.RL_ACTIVE or noCheck:
			for line in ("SYSTEM FAILURE: Panic mode engaged.",
					"This is a critical error. You should bring your ass over here, right now.",
					"Please check logs and be sure of what you're doing before re-engaging normal mode."):
				log.emerg(line)
			self.state=MasterService.ST_PANIC

			# TODO + stop LB
			if self.l_masterDog.running:
				self.l_masterDog.stop()

		elif self.role == MasterService.RL_VOTING:
			# No master during election stage: waiting next master
			log.warn("Panic mode requested during election stage: delaying.")
			self.panicRequested=True

		elif self.role == MasterService.RL_PASSIVE:
			log.warn("I'm slave: asking master to engage panic mode...")

			master=Agent()
			query=master.panic()
			query.addErrback(onQueryFailure)
			query.addErrback(log.err)

		else: # RL_ALONE or RL_JOINING or RL_LEAVING
			log.warn("I'm not in a running state (master or slave). Cannot engage panic mode.")
			raise RPCRefusedError("Not in running state")
Beispiel #48
0
import vars_init
vars_init.init(targets)

import vars, state, builder, jwack
from logs import warn, err

try:
    if vars_init.is_toplevel:
        builder.start_stdin_log_reader(status=opt.status, details=opt.details,
            pretty=opt.pretty, color=opt.color,
            debug_locks=opt.debug_locks, debug_pids=opt.debug_pids)
    for t in targets:
        if os.path.exists(t):
            f = state.File(name=t)
            if not f.is_generated:
                warn('%s: exists and not marked as generated; not redoing.\n'
                     % f.nicename())
    state.rollback()
    
    j = atoi(opt.jobs or 1)
    if j < 1 or j > 1000:
        err('invalid --jobs value: %r\n' % opt.jobs)
    jwack.setup(j)
    try:
        assert(state.is_flushed())
        retcode = builder.main(targets, lambda t: (True, True))
        assert(state.is_flushed())
    finally:
        try:
            state.rollback()
        finally:
            try:
Beispiel #49
0
    def joinCluster(self):
        def startHeartbeats():
            self._startSlave()
            self.s_rpc.startService()

            if self.role == MasterService.RL_ACTIVE:
                self._startMaster()

        def joinRefused(reason):
            reason.trap(NodeRefusedError, RPCRefusedError)
            log.err("Join to cluster %s failed: Master %s has refused me: %s" %
                    (core.cfg['CLUSTER_NAME'], self.master,
                     reason.getErrorMessage()))
            self.stopService()

        def joinAccepted(result):
            self.role = MasterService.RL_PASSIVE
            log.info("Join successfull, I'm now part of cluster %s." %
                     (core.cfg['CLUSTER_NAME']))
            startHeartbeats()

        def masterConnected(obj):
            d = obj.callRemote("register", DNSCache.getInstance().name)
            d.addCallbacks(joinAccepted, joinRefused)
            d.addErrback(log.err)
            d.addBoth(lambda _: rpcConnector.disconnect())
            return d

        try:
            if self.master is None:
                # New active master
                if DNSCache.getInstance(
                ).name not in core.cfg['ALLOWED_NODES']:
                    log.warn(
                        "I'm not allowed to create a new cluster. Exiting.")
                    raise Exception("Cluster creation not allowed")

                if DiskHeartbeat.is_in_use():
                    log.err("Heartbeat disk is in use but we are alone !")
                    raise Exception("Heartbeat disk already in use")

                log.info("No master found. I'm now the new master of %s." %
                         (core.cfg['CLUSTER_NAME']))
                self.role = MasterService.RL_ACTIVE
                self.master = DNSCache.getInstance().name
                self.status[self.master] = {
                    'timestamp': 0,
                    'offset': 0,
                    'vms': []
                }
                self.disk.make_slot(DNSCache.getInstance().name)
                startHeartbeats()

            else:
                # Passive master
                self.role = MasterService.RL_JOINING
                log.info("Trying to join cluster %s..." %
                         (core.cfg['CLUSTER_NAME']))

                factory = pb.PBClientFactory()
                rpcConnector = reactor.connectTCP(self.master,
                                                  core.cfg['TCP_PORT'],
                                                  factory)
                d = factory.getRootObject()
                d.addCallback(masterConnected)
                d.addErrback(log.err)
        except Exception, e:
            log.err("Startup failed: %s. Shutting down." % (e))
            self.stopService()
Beispiel #50
0
 def invalidHostname(reason):
     """Log the refusal of the node 'name' (taken from the enclosing scope) and raise a NodeRefusedError."""
     log.warn("Node %s has an invalid name. Refusing." % (name))
     cause = reason.getErrorMessage()
     raise NodeRefusedError(cause)
Beispiel #51
0
	def start_vms(self, vmnames):
		"""
		Start the specified list of VM on the cluster, one after the other.
		Nodes are choosen with a best-fit decreasing algorithm, so the cluster will not 
		be balanced, but optimized for full-load.
		This function is error-proof: if a vm start to fail, it will try to start others vms 
		and report all errors only at the end.

		vmnames - (List of String) VM hostnames 

		Raise a MultipleError if one of many errors are detected.
		"""
		
		assert type(vmnames) == list, "Param 'vmnames' should be a list."

		# Get nodes
		pool=self.get_nodes()

		# Sort VMs to be started by ram
		vms=[ VM(name) for name in vmnames ]
		vms.sort(key=lambda x: x.get_start_ram(), reverse=True)
		
		failed=dict()
		for vm in vms:
			selected_node=None

			# Check if vm is already started somewhere
			nodes=self.search_vm_started(vm.name)
			if(len(nodes)>0):
				log.info("%s is already started on %s." % (vm.name, ", ".join([n.get_hostname() for n in nodes])))
				continue
	
			# Sort nodes by free ram
			pool.sort(key=lambda x: x.metrics.get_free_ram(False))
			for node in pool:
				if node.metrics.get_free_ram() >= vm.get_start_ram():
					selected_node=node
					break # Select first node with enough space

			if selected_node is None:
				# Not enough room for this one
				failed[vm.name]=NotEnoughRamError("this cluster", "Cannot start "+vm.name)
				continue  # Next !

			log.info("Starting", vm.name, "on", selected_node.get_hostname())

			# Start the vm 
			try:
				self.activate_vm(node,vm.name)

				try:
					selected_node.start(vm.name)
				except SystemExit, e:
					# SystemExit are raised by Xen when xm_create fail
					node.deactivate_lv(vm.name)
					failed[vm.name]=XenError(node.get_hostname(), str(e))
				except Exception, e:
					node.deactivate_lv(vm.name)
					failed[vm.name]=e
				else:
					try:
						selected_node.enable_vm_autostart(vm.name)
					except Exception, e:
						# Don't report failure as an error, autostart link is not important
						log.warn("Cannot enable autostart for %s : %s" % (vm.name, e))
Beispiel #52
0
    def registerNode(self, name):
        def validHostname(result):
            try:
                self.disk.make_slot(name)
            except DiskHeartbeatError, e:
                raise NodeRefusedError("Disk heartbeat failure: %s" % (e))

            self.status[name] = {'timestamp': 0, 'offset': 0, 'vms': []}
            log.info("Node %s has joined the cluster." % (name))

        def invalidHostname(reason):
            log.warn("Node %s has an invalid name. Refusing." % (name))
            raise NodeRefusedError(reason.getErrorMessage())

        if self.isInPanic():
            log.warn("I'm in panic. Cannot register %s." % (name))
            raise RPCRefusedError("Panic mode engaged")

        if self.role != MasterService.RL_ACTIVE:
            log.warn("I'm not master. Cannot register %s." % (name))
            raise RPCRefusedError("Not master")

        if name not in core.cfg['ALLOWED_NODES']:
            log.warn("Node %s not allowed to join this cluster. Refusing." %
                     (name))
            raise NodeRefusedError("Node not allowed to join this cluster.")

        if name in self.status:
            log.warn("Node %s is already joined ! Cannot re-join." % (name))
            raise NodeRefusedError("Node already in cluster")
Beispiel #53
0
class XenCluster:

	"""This class is used to perform action on the xen cluster."""

	def __init__(self, nodes):
		"""This should be private. Use getDeferInstance() instead.

		nodes - (dict) Node objects indexed by hostname
		"""

		assert isinstance(nodes, dict), "Param 'nodes' should be a dict."
		self.nodes=nodes
		
	@staticmethod
	def getDeferInstance(nodeslist=None):
		"""Instantiate a XenCluster object and associated Nodes.

		This function open SSH and XenAPI connections to all actives nodes.
		It take a (string) list of node's hostname as optionnal argument, if not given, 
		the list will fetched from cxm'master.

		Return a deferred that will be fired when all nodes are ready.
		If a node is not online, the deferred will fail with an InstantiationError.
		"""

		log.info("Loading cluster...")
		nodes=dict()

		def instantiate(results, hostnames):
			# DeferredList keeps the order of its input, so results line up with hostnames.
			# (Looking names up in 'nodeslist' would fail when the list came from the master.)
			failedNodes=dict()
			for hostname, (success, value) in zip(hostnames, results):
				if not success:
					failedNodes[hostname]=value.getErrorMessage()

			if len(failedNodes)>0:
				raise InstantiationError(failedNodes)

			return XenCluster(nodes)

		def add_node(result, hostname):
			nodes[hostname]=result

		def create_nodes(result):
			# One Node per hostname, each built in its own thread
			hostnames=list(result)
			ds=list()
			for hostname in hostnames:
				d=threads.deferToThread(Node, hostname)
				d.addCallback(add_node, hostname)
				ds.append(d)

			dl=defer.DeferredList(ds, consumeErrors=1)
			dl.addCallback(instantiate, hostnames)
			return dl
			
		def failed(reason):
			raise Exception("Can't connect to local master: %s" % reason.getErrorMessage())

		if not nodeslist:
			agent=Agent()
			d=agent.getNodesList()
			d.addCallback(create_nodes)
			d.addErrback(failed)
			return d
		else:
			return create_nodes(nodeslist)

	def disconnect(self):
		"""Close all connections."""
		for node in self.get_nodes():
			node.disconnect()

	def get_nodes(self):
		"""Fetch the current actives nodes.

		Return a list of Node object.
		"""
		return self.nodes.values()

	def get_node(self,hostname):
		"""Return the Node object of the specified hostname.
	
		Raise a NotInClusterError if the given hostname is not a clusters's node.
		"""
		try:
			return self.nodes[hostname]
		except KeyError:
			raise NotInClusterError(hostname)

	def get_local_node(self):
		"""Return the Node object of the local node.
		
		Raise a NotInClusterError if the local node is not a clusters's node.
		"""
		return self.get_node(socket.gethostname())

	def get_load(self):
		"""
		Return the global load of the cluster, in percentage.
		This load is computed using ram capacities.
		If load is higher than 100%, cluster is overloaded and cannot do failover.

		The value is returned through a deferred.
		"""

		def computeLoad(result):
			# The load is computed without the bigger node
			# and so take in account a failure of one node.
			try:
				return (sum(result['used'])*100)/(sum(result['total'])-max(result['total']))
			except ZeroDivisionError:
				# Just one node: cannot do failover
				return 100

		d=self.get_ram_details()
		d.addCallback(computeLoad)
		return d

	def get_ram_details(self):
		"""Return a dict of list with the free, used, and total ram of the cluster. Units: MB

		The dict is returned through a deferred; the failure of any node is re-raised.
		"""

		def getValues(node):
			return node.get_metrics().get_ram_infos()

		def appendValues(results):
			used=list()
			free=list()
			total=list()
			for success, result in results:
				if success:
					used.append(result['used'])
					free.append(result['free'])
					total.append(result['total'])
				else:
					raise result
			
			return { 'total': total, 'free':free, 'used':used }

		ds=list()
		for node in self.get_nodes():
			d=threads.deferToThread(getValues, node)
			ds.append(d)

		dl=defer.DeferredList(ds, consumeErrors=True)
		dl.addCallback(appendValues)
		return dl

	def get_vm_started(self):
		"""Return the number of vm started in the cluster (through a deferred)."""

		def computeNumber(results):
			nb=0
			for success, result in results:
				if success:
					nb += result
				else:
					raise result
			
			return nb

		ds=list()
		for node in self.get_nodes():
			d=threads.deferToThread(lambda x: x.get_vm_started(), node)
			ds.append(d)

		dl=defer.DeferredList(ds, consumeErrors=True)
		dl.addCallback(computeNumber)
		return dl

	def shutdown_all(self, hard=False):
		"""Shutdown all running vm on the cluster.

		If 'hard' is True, do a hard shutdown (destroy).
		Return a DeferredList fired when every node has been processed.
		"""
		ds=list()
		for node in self.get_nodes():
			d=threads.deferToThread(lambda x: x.shutdown_all(hard), node)
			ds.append(d)

		dl=defer.DeferredList(ds, consumeErrors=True)
		return dl

	def is_in_cluster(self, hostname):
		"""Return True if the specified hostname is a node of the cluster."""
		return hostname in self.nodes

	def search_vm_started(self,vmname):
		"""Search where the specified vm hostname is running.

		Return a list of Node where the VM is running.
		"""
		started=list()
		for node in self.get_nodes():
			if node.is_vm_started(vmname):
				started.append(node)

		return started

	def search_vm_autostart(self,vmname):
		"""Search where the specified vm hostname has an autostart link.

		Return a list of Node where the autostart link is present.
		"""
		enabled=list()
		for node in self.get_nodes():
			if node.is_vm_autostart_enabled(vmname):
				enabled.append(node)

		return enabled

	def activate_vm(self,selected_node,vmname):
		"""Activate all the LVM logicals volumes of the specified VM exclusively on the selected node.

		selected_node - (Node) Node where to activate the LVs
		vmname - (String) hostname of the vm

		Raise a RunningVmError if the VM is running.
		"""
		for node in self.get_nodes():
			if node.is_vm_started(vmname):
				raise RunningVmError(node.get_hostname(), vmname) 
			else:
				node.deactivate_lv(vmname)

		selected_node.activate_lv(vmname)
				
	def start_vm(self, node, vmname, console):
		"""Start the specified VM on the given node.
		If there is not enough ram on the given node, the VM will be started 
		on the node with the highest free ram and the autostart link will be updated accordingly.

		node - (Node) Selected host
		vmname - (String) VM hostname 
		console - (boolean) Attach console to the domain

		Raise a NotEnoughRamError if no node has enough free ram.
		"""

		# Set only when the VM has to be moved away from the requested node
		old_node=None

		# Resources checks
		needed_ram=VM(vmname).get_ram()
		free_ram=node.metrics.get_free_ram()
		if needed_ram>free_ram: 
			# Not enough ram, switching to another node
			old_node=node

			# Get the node with the highest free ram (first fit increasing algorithm)
			pool=sorted(self.get_nodes(), key=lambda x: x.metrics.get_free_ram(), reverse=True)
			node=pool[0]

			# Last resources checks
			free_ram=node.metrics.get_free_ram()
			if needed_ram>free_ram:
				raise NotEnoughRamError(node.get_hostname(),"need "+str(needed_ram)+"M, has "+str(free_ram)+"M.")

			log.info(" -> Not enough ram, starting it on %s." % node.get_hostname())

		# Start the VM
		self.activate_vm(node,vmname)
		try:
			node.start(vmname)
		except Exception:
			node.deactivate_lv(vmname)
			# Bare raise keeps the original traceback
			raise

		# Update autostart link only if another node has been selected
		if old_node is not None:
			old_node.disable_vm_autostart(vmname)
			node.enable_vm_autostart(vmname)

		# Attach to the console without forking
		if console:
			if node.is_local_node():
				node.get_vm(vmname).attach_console()
			else:
				log.warn("Cannot attach console when using remote Xen-API.")
Beispiel #54
0
			raise
		except NotInClusterError:
			# Next step of recovery process
			pass
		except Exception, e:
			log.err("Cannot get the VMs back:", e)

		if partial_failure:
			# Cannot recover, node still alive
			return False

		# Check if VM are still alive
		if len(vm_list)>0:
			for node in self.get_nodes():
				if node.ping(vm_list):
					log.warn("Some VM on %s are still alive !" % (name))
					return False
				
			log.warn("All VM on %s are dead. Fencing now !" % (name))
		else:
			log.warn("No VM running on %s. Fencing now !" % (name))
		self.get_local_node().fence(name)

		# Remove fenced node from current cluster instance
		if name in self.nodes.keys():
			del self.nodes[name]

		log.info("Restarting dead VM from %s on healthy nodes..." % (name))
		self.start_vms(vm_list)

		return True
Beispiel #55
0
	def processEnded(self, reason):
		"""Log the death of the inotify process and ask the reactor to stop.

		reason - failure object whose 'value' is reported in the log
		"""
		log.warn("Inotify has died: %s" % (reason.value))
		try:
			reactor.stop()
		except Exception:
			# Best effort: a failing stop request is ignored, but SystemExit and
			# KeyboardInterrupt are no longer swallowed as with a bare except.
			pass
Beispiel #56
0
def warn_override(name):
    """Emit the warning telling that ``name`` was modified and is being skipped."""
    message = '%s - you modified it; skipping\n' % name
    warn(message)
Beispiel #57
0
 def NEWsearchUsers(self, authUserId, query, limit=0, relationship=None):
     """
     Search users whose name or screen name matches ``query``.

     The query is lowercased and filtered through ``self._valid_re``; an empty query yields ``[]``.
     ``relationship`` may be 'followers' or 'following' to restrict the search to those users of
     ``authUserId``; any other value raises StampedInvalidRelationshipError.
     An exact screen-name match comes first, followed by the elasticsearch hits, scored with a boost
     based on the number of stamps and friends. ``limit`` is passed as the search size.
     Returns ``[]`` when the search results cannot be processed.
     """
     query = query.lower()
     query = self._valid_re.sub('', query)
     
     if len(query) == 0:
         return []
     
     users  = []
     seen   = set()
     domain = None
     
     if relationship is not None:
         if relationship == 'followers':
             domain = self.followers_collection.getFollowers(authUserId)
         elif relationship == 'following':
             domain = self.friends_collection.getFriends(authUserId)
         else:
             raise StampedInvalidRelationshipError("invalid relationship")
         
         domain = set(domain)
         
         if not domain:
             # Nobody is in the requested relationship, so nothing can match. An empty domain
             # previously skipped the ids filter and returned unrestricted results.
             return []
     
     try:
         user = self.getUserByScreenName(query)
         if user is not None and (domain is None or user.user_id in domain):
             seen.add(user.user_id)
             users.append(user)
     except Exception as e:
         logs.warning("Exact user match not found for '%s': %s" % (query, e))
     
     q = StringQuery(query, default_operator="AND", search_fields=[ "name", "screen_name" ])
     q = CustomScoreQuery(q, lang="mvel", script="""
         ns = doc.?num_stamps.value;
         ns = (ns != null) ? log(ns) : 0;
         nf = doc.?num_friends.value;
         nf = (nf != null) ? log(nf) : 0;
         return _score + ns / 4.0 + nf / 8.0
     """)
     
     if domain is not None:
         q = FilteredQuery(q, IdsFilter('user', list(domain)))
     
     results = self.api._elasticsearch.search(q, 
                                              indexes = [ 'users' ], 
                                              doc_types = [ 'user' ], 
                                              size = limit)
     
     utils.log(pformat(results))
     try:
         user_ids = [ hit['_id'] for hit in results['hits']['hits'] ]
         id_user  = dict((found.user_id, found) for found in self.lookupUsers(user_ids))
         
         # Keep the search order and skip the users already returned
         for user_id in user_ids:
             if user_id not in seen:
                 seen.add(user_id)
                 users.append(id_user[user_id])
     except Exception:
         logs.warn("received invalid results from pyes")
         logs.warn(pformat(results))
         return []
     
     return users
Beispiel #58
0
 def processEnded(self, reason):
     """Log the death of the inotify process, then ask the reactor to stop.

     reason - failure object whose 'value' is reported in the log
     """
     log.warn("Inotify has died: %s" % (reason.value))
     try:
         reactor.stop()
     except Exception:
         # Best effort: a failing stop request is ignored, but SystemExit and
         # KeyboardInterrupt are no longer swallowed as with a bare except.
         pass