Beispiel #1
0
    def recover(self, name, vm_list, partial_failure):
        """
		Try to recover a node from a failure, by migrating or re-starting vm.

		name - (String) Name of the failed node.
		vm_list - (List of strings) List of the vm names running on the failed node.
		partial_failure - (bool) True if contact with failed node is not fully lost.

		Return True if recover is successfull
		Return False if recover cannot be done (i.e. service still alive)
		Raise an Exception if something goes wrong.
		"""

        assert type(name) == str, "Param 'name' should be a string."
        assert type(vm_list) == list, "Param 'vm_list' should be a list."
        assert type(partial_failure) == bool, "Param 'partial_failure' should be a bool."

        log.info("Trying to recover", name, "...")

        try:
            # Try to get VM back on alive nodes
            # If a vm is paused, eject will fail
            self.emergency_eject(self.get_node(name))
            log.info("VM from %s successfully migrated on healthy nodes." % (name))

            # Eject successfull, fence the node
            try:
                log.info("Fencing useless node %s ..." % (name))
                self.get_local_node().fence(name)
            except Exception, e:
                # If fencing fail, this is not a big deal, VM are alive
                log.err("Fencing of %s failed:" % (name), e)

            return True  # Succeeded !
Beispiel #2
0
	def checkSlaveHeartbeats(self):
		"""
		Look for slaves whose heartbeat is lost. Only acts on the active master.

		Does nothing when this node is not the active master, when the service
		is in panic or recovery state, or when self.status holds at most one
		entry. Otherwise collects in netFailed the names of the nodes whose
		last net heartbeat timestamp plus MasterService.TM_SLAVE is not in the
		future, then reads the disk heartbeat timestamps.

		Re-raise (after logging) any exception from self.disk.get_all_ts().

		NOTE(review): only the beginning of this method is visible in this
		excerpt; how netFailed and tsDisk are used afterwards is not shown.
		"""
		# Checks slaves timestamps only if we are active master
		if self.role != MasterService.RL_ACTIVE:
			return

		# No failover in panic mode
		if self.state == MasterService.ST_PANIC:
			return

		# No more failover if a recovery is running
		if self.state == MasterService.ST_RECOVERY:
			return

		# No failover if we are alone
		if len(self.status) <= 1:
			return

		# Check net heartbeat: collect the names of the nodes considered lost
		netFailed=Set()
		for name, values in self.status.items():
			if values['timestamp'] == 0:
				# Do nothing if first heartbeat has not been received yet
				continue

			# Lost when the last timestamp is at least TM_SLAVE behind the current time
			if values['timestamp']+MasterService.TM_SLAVE <= int(time.time()):
				log.warn("Net heartbeat lost for %s." % (name))
				netFailed.add(name)

		# Get disk heartbeat timestamps
		try:
			tsDisk=self.disk.get_all_ts()
		except Exception, e:
			# Log the failure here, then let the caller deal with it
			log.err("Diskheartbeat read failed: %s." % (e))
			raise
Beispiel #3
0
	def recover(self, name, vm_list, partial_failure):
		"""
		Try to recover a node from a failure, by migrating or re-starting vm.

		name - (String) Name of the failed node.
		vm_list - (List of strings) List of the vm names running on the failed node.
		partial_failure - (bool) True if contact with failed node is not fully lost.

		Return True if recover is successfull
		Return False if recover cannot be done (i.e. service still alive)
		Raise an Exception if something goes wrong.
		"""

		assert type(name) == str, "Param 'name' should be a string."
		assert type(vm_list) == list, "Param 'vm_list' should be a list."
		assert type(partial_failure) == bool, "Param 'partial_failure' should be a bool."

		log.info("Trying to recover", name, "...")
		
		try:
			# Try to get VM back on alive nodes
			# If a vm is paused, eject will fail
			self.emergency_eject(self.get_node(name))
			log.info("VM from %s successfully migrated on healthy nodes." % (name))

			# Eject successfull, fence the node
			try:
				log.info("Fencing useless node %s ..." % (name))
				self.get_local_node().fence(name)
			except Exception, e:
				# If fencing fail, this is not a big deal, VM are alive
				log.err("Fencing of %s failed:" % (name), e)

			return True # Succeeded !
Beispiel #4
0
    def checkSlaveHeartbeats(self):
        """
        Look for slaves whose heartbeat is lost. Only acts on the active master.

        Does nothing when this node is not the active master, when the service
        is in panic or recovery state, or when self.status holds at most one
        entry. Otherwise collects in netFailed the names of the nodes whose
        last net heartbeat timestamp plus MasterService.TM_SLAVE is not in the
        future, then reads the disk heartbeat timestamps.

        Re-raise (after logging) any exception from self.disk.get_all_ts().

        NOTE(review): only the beginning of this method is visible in this
        excerpt; how netFailed and tsDisk are used afterwards is not shown.
        """
        # Checks slaves timestamps only if we are active master
        if self.role != MasterService.RL_ACTIVE:
            return

        # No failover in panic mode
        if self.state == MasterService.ST_PANIC:
            return

        # No more failover if a recovery is running
        if self.state == MasterService.ST_RECOVERY:
            return

        # No failover if we are alone
        if len(self.status) <= 1:
            return

        # Check net heartbeat: collect the names of the nodes considered lost
        netFailed = Set()
        for name, values in self.status.items():
            if values['timestamp'] == 0:
                # Do nothing if first heartbeat has not been received yet
                continue

            # Lost when the last timestamp is at least TM_SLAVE behind the current time
            if values['timestamp'] + MasterService.TM_SLAVE <= int(
                    time.time()):
                log.warn("Net heartbeat lost for %s." % (name))
                netFailed.add(name)

        # Get disk heartbeat timestamps
        try:
            tsDisk = self.disk.get_all_ts()
        except Exception, e:
            # Log the failure here, then let the caller deal with it
            log.err("Diskheartbeat read failed: %s." % (e))
            raise
Beispiel #5
0
	def joinCluster(self):
		"""
		Start as the new active master, or join the cluster of self.master.

		If self.master is None, this node becomes the active master: it must be
		listed in core.cfg['ALLOWED_NODES'] and the heartbeat disk must not be
		in use. Otherwise it connects to self.master and calls the remote
		"register" method with its own name.

		Any exception raised during startup is caught and logged, then
		self.stopService() is called.
		"""

		def startHeartbeats():
			# Slave heartbeat and RPC service are always started;
			# the master part only when we are the active master.
			self._startSlave()
			self.s_rpc.startService()

			if self.role == MasterService.RL_ACTIVE:
				self._startMaster()

		def joinRefused(reason):
			# Errback of the "register" call: only handles refusals,
			# trap() is given the two refusal error types.
			reason.trap(NodeRefusedError, RPCRefusedError)
			log.err("Join to cluster %s failed: Master %s has refused me: %s" %
				(core.cfg['CLUSTER_NAME'], self.master, reason.getErrorMessage()))
			self.stopService()

		def joinAccepted(result):
			# Callback of the "register" call: we are now a passive member
			self.role=MasterService.RL_PASSIVE
			log.info("Join successfull, I'm now part of cluster %s." % (core.cfg['CLUSTER_NAME']))
			startHeartbeats()

		def masterConnected(obj):
			# Callback of factory.getRootObject(): register to the master.
			# rpcConnector is assigned below, before this callback is attached.
			d = obj.callRemote("register",DNSCache.getInstance().name)
			d.addCallbacks(joinAccepted,joinRefused)
			d.addErrback(log.err)
			# Close the connection whatever the outcome
			d.addBoth(lambda _: rpcConnector.disconnect())
			return d

		try:
			if self.master is None:
				# New active master
				if DNSCache.getInstance().name not in core.cfg['ALLOWED_NODES']:
					log.warn("I'm not allowed to create a new cluster. Exiting.")
					raise Exception("Cluster creation not allowed")

				if DiskHeartbeat.is_in_use():
					log.err("Heartbeat disk is in use but we are alone !")
					raise Exception("Heartbeat disk already in use")

				log.info("No master found. I'm now the new master of %s." % (core.cfg['CLUSTER_NAME']))
				self.role=MasterService.RL_ACTIVE
				self.master=DNSCache.getInstance().name
				# Register ourself in the status table, with no heartbeat received yet
				self.status[self.master]={'timestamp': 0, 'offset': 0, 'vms': []}
				self.disk.make_slot(DNSCache.getInstance().name)
				startHeartbeats()

			else:
				# Passive master
				self.role=MasterService.RL_JOINING
				log.info("Trying to join cluster %s..." % (core.cfg['CLUSTER_NAME']))

				factory = pb.PBClientFactory()
				rpcConnector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'], factory)
				d = factory.getRootObject()
				d.addCallback(masterConnected)
				d.addErrback(log.err)
		except Exception, e:
			log.err("Startup failed: %s. Shutting down." % (e))
			self.stopService()
Beispiel #6
0
        def recoverSucceeded(result, name):
            """
            Handle the return code of XenCluster.recover().

            result - True on success, False if the failure may be a partition.
            name - Name of the node the recovery was run for.
            """
            if not result:
                log.err("Partial failure, cannot recover", name)
                return

            # Recovery done: the node can be forgotten
            log.info("Successfully recovered node %s." % name)
            self._unregister(name)
Beispiel #7
0
		def recoverSucceeded(result, name):
			"""
			Handle the return code of XenCluster.recover().

			result - True on success, False if the failure may be a partition.
			name - Name of the node the recovery was run for.
			"""
			if not result:
				log.err("Partial failure, cannot recover", name)
				return

			# Recovery done: the node can be forgotten
			log.info("Successfully recovered node %s." % name)
			self._unregister(name)
Beispiel #8
0
    def dispatchMessage(self, data, host):
        dispatcher = {
            "slavehb": self.updateNodeStatus,
            "masterhb": self.updateMasterStatus,
            "voterequest": self.voteForNewMaster,
            "voteresponse": self.recordVote,
        }

        try:
            msg = MessageHelper.get(data, host)
            log.debugd("Received", msg)
            dispatcher[msg.type()](msg)
        except (MessageError, KeyError), e:
            log.err("Bad message from %s : %s , %s" % (host, data, e))
Beispiel #9
0
 def _sendError(self, reason):
     """
     Handle a network heartbeat failure.

     Restart the heartbeat in 2 seconds while self.retry is below
     self.MAX_RETRY; otherwise ask the master to engage panic mode.
     """
     # Log all stacktrace to view the origin of this error
     log.err("Netheartbeat failure: %s" % reason)

     retries_exhausted = self.retry >= self.MAX_RETRY
     if not retries_exhausted:
         log.warn("Restarting network heartbeat within a few seconds...")
         # Will be reset at each election (or panic recovery)
         self.retry += 1
         reactor.callLater(2, self._run, self._proto)
         return

     log.emerg("Too many retry. Asking master to engage panic mode.")
     # Engage panic mode, then disconnect the agent whatever the outcome
     panic_agent = Agent()
     pending = panic_agent.panic()
     pending.addErrback(log.err)
     pending.addBoth(lambda _unused: panic_agent.disconnect())
Beispiel #10
0
	def dispatchMessage(self, data, host):
		dispatcher = {
			"slavehb" : self.updateNodeStatus,
			"masterhb" : self.updateMasterStatus,
			"voterequest" : self.voteForNewMaster,
			"voteresponse" : self.recordVote,
		}

		try:
			msg=MessageHelper.get(data, host)
			log.debugd("Received", msg)
			dispatcher[msg.type()](msg)
		except (MessageError, KeyError), e:
			log.err("Bad message from %s : %s , %s" % (host,data,e))
Beispiel #11
0
	def _sendError(self, reason):
		"""
		Handle a network heartbeat failure.

		Restart the heartbeat in 2 seconds while self.retry is below
		self.MAX_RETRY; otherwise ask the master to engage panic mode.
		"""
		# Log all stacktrace to view the origin of this error
		log.err("Netheartbeat failure: %s" % reason)

		retries_exhausted = self.retry >= self.MAX_RETRY
		if not retries_exhausted:
			log.warn("Restarting network heartbeat within a few seconds...")
			# Will be reset at each election (or panic recovery)
			self.retry += 1
			reactor.callLater(2, self._run, self._proto)
			return

		log.emerg("Too many retry. Asking master to engage panic mode.")
		# Engage panic mode, then disconnect the agent whatever the outcome
		panic_agent = Agent()
		pending = panic_agent.panic()
		pending.addErrback(log.err)
		pending.addBoth(lambda _unused: panic_agent.disconnect())
Beispiel #12
0
	def startService(self):
		"""
		Start the service once the VM configuration repo is known to be clean.

		Return the object returned by self.agent.ping(), with the mode
		callbacks attached; self.spawnInotify() runs after either of them.
		Raise an Exception if "svn status" prints anything.
		"""
		def cluster(result):
			log.info("Starting in cluster mode.")

		def standalone(reason):
			# Ping failed: drop the agent reference
			log.info("Starting in standalone mode.")
			self.agent = None

		Service.startService(self)

		# Any output of "svn status" (stderr included) means the repo is dirty
		svn_cmd = "svn status " + core.cfg['VMCONF_DIR'] + " 2>&1"
		svn_output = self.node.run(svn_cmd).read()
		if svn_output:
			log.err("Your repo is not clean. Please check it : %s" % svn_output)
			raise Exception("SVN repo not clean")

		ping = self.agent.ping()
		ping.addCallbacks(cluster, standalone)
		ping.addBoth(lambda _: self.spawnInotify())
		ping.addErrback(log.err)
		return ping
Beispiel #13
0
def load_cfg():
	"""Load the global configuration file into the cfg dict."""

	type_map = {
		str:  "a string",
		bool: "a boolean",
		int:  "an integer",
		list: "a list",
	}

	try:
		execfile("/etc/xen/cxm.conf",dict(),cfg)

		# Check type of configuration entries
		for key in cfg_type.keys():
			if cfg[key]:
				assert type(cfg[key]) == cfg_type[key], "%s should be %s." % (key, type_map[cfg_type[key]])

	except Exception,e:
		log.err("Configuration file error:", e)
		sys.exit(e)
Beispiel #14
0
    def startService(self):
        """
        Start the service once the VM configuration repo is known to be clean.

        Return the object returned by self.agent.ping(), with the mode
        callbacks attached; self.spawnInotify() runs after either of them.
        Raise an Exception if "svn status" prints anything.
        """
        def cluster(result):
            log.info("Starting in cluster mode.")

        def standalone(reason):
            # Ping failed: drop the agent reference
            log.info("Starting in standalone mode.")
            self.agent = None

        Service.startService(self)

        # Any output of "svn status" (stderr included) means the repo is dirty
        svn_cmd = "svn status " + core.cfg['VMCONF_DIR'] + " 2>&1"
        svn_output = self.node.run(svn_cmd).read()
        if svn_output:
            log.err("Your repo is not clean. Please check it : %s" % svn_output)
            raise Exception("SVN repo not clean")

        ping = self.agent.ping()
        ping.addCallbacks(cluster, standalone)
        ping.addBoth(lambda _: self.spawnInotify())
        ping.addErrback(log.err)
        return ping
Beispiel #15
0
def load_cfg():
    """Load the global configuration file into the cfg dict."""

    type_map = {
        str: "a string",
        bool: "a boolean",
        int: "an integer",
        list: "a list",
    }

    try:
        execfile("/etc/xen/cxm.conf", dict(), cfg)

        # Check type of configuration entries
        for key in cfg_type.keys():
            if cfg[key]:
                assert type(cfg[key]) == cfg_type[key], "%s should be %s." % (
                    key, type_map[cfg_type[key]])

    except Exception, e:
        log.err("Configuration file error:", e)
        sys.exit(e)
Beispiel #16
0
			try:
				log.info("Fencing useless node %s ..." % (name))
				self.get_local_node().fence(name)
			except Exception, e:
				# If fencing fail, this is not a big deal, VM are alive
				log.err("Fencing of %s failed:" % (name), e)

			return True # Succeeded !
		except NotEnoughRamError:
			# Engage panic mode
			raise
		except NotInClusterError:
			# Next step of recovery process
			pass
		except Exception, e:
			log.err("Cannot get the VMs back:", e)

		if partial_failure:
			# Cannot recover, node still alive
			return False

		# Check if VM are still alive
		if len(vm_list)>0:
			for node in self.get_nodes():
				if node.ping(vm_list):
					log.warn("Some VM on %s are still alive !" % (name))
					return False
				
			log.warn("All VM on %s are dead. Fencing now !" % (name))
		else:
			log.warn("No VM running on %s. Fencing now !" % (name))
Beispiel #17
0
 def commitFailed(reason):
     """Log the error message carried by a failed SVN operation."""
     svn_error = reason.getErrorMessage()
     log.err("SVN failed: %s" % svn_error)
Beispiel #18
0
		def heartbeatFailed(reason):
			"""Log a disk heartbeat failure and stop the service."""
			failure_text = reason.getErrorMessage()
			log.err("Disk heartbeat failure: %s." % failure_text)
			# Stop slave heartbeat to tell master we have a problem
			self.stopService()
Beispiel #19
0
        return d

    def _unregister(self, name):
        try:
            del self.status[name]
        except:
            pass

        try:
            self.disk.erase_slot(name)
        except DiskHeartbeatError, e:
            log.warn(
                "Cannot erase slot: %s. You may have to reformat hearbeat disk."
                % (e))
        except Exception, e:
            log.err("Diskheartbeat failure: %s." % (e))
            self.panic()

        DNSCache.getInstance().delete(name)
        log.info("Node %s has been unregistered." % (name))

    def unregisterNode(self, name):
        """
        Check that node 'name' can be unregistered by this node.

        name - (String) Name of the node asking to quit the cluster.

        Raise RPCRefusedError if this node is not the active master.
        Raise NodeRefusedError if 'name' is not in self.status.

        NOTE(review): only the checks are visible in this excerpt; the rest
        of the method is not shown.
        """
        # Can unregister node even if in panic mode

        if self.role != MasterService.RL_ACTIVE:
            log.warn("I'm not master. Cannot unregister %s." % (name))
            raise RPCRefusedError("Not master")

        if name not in self.status:
            log.warn("Unknown node %s try to quit the cluster." % (name))
            raise NodeRefusedError("Unknown node " + name)
Beispiel #20
0
#!/usr/bin/env python2
import sys, os

import vars_init
vars_init.init_no_state()

import paths
from logs import err

if len(sys.argv[1:]) != 1:
    err('%s: exactly one argument expected.\n' % sys.argv[0])
    sys.exit(1)

want = sys.argv[1]
if not want:
    err('cannot build the empty target ("").\n')
    sys.exit(204)

abswant = os.path.abspath(want)
for dodir, dofile, basedir, basename, ext in paths.possible_do_files(abswant):
    dopath = os.path.join('/', dodir, dofile)
    relpath = os.path.relpath(dopath, '.')
    exists = os.path.exists(dopath)
    assert ('\n' not in relpath)
    print relpath
    if exists:
        sys.exit(0)
sys.exit(1)  # no appropriate dofile found
Beispiel #21
0
    def joinCluster(self):
        """
        Start as the new active master, or join the cluster of self.master.

        If self.master is None, this node becomes the active master: it must be
        listed in core.cfg['ALLOWED_NODES'] and the heartbeat disk must not be
        in use. Otherwise it connects to self.master and calls the remote
        "register" method with its own name.

        Any exception raised during startup is caught and logged, then
        self.stopService() is called.
        """
        def startHeartbeats():
            # Slave heartbeat and RPC service are always started;
            # the master part only when we are the active master.
            self._startSlave()
            self.s_rpc.startService()

            if self.role == MasterService.RL_ACTIVE:
                self._startMaster()

        def joinRefused(reason):
            # Errback of the "register" call: only handles refusals,
            # trap() is given the two refusal error types.
            reason.trap(NodeRefusedError, RPCRefusedError)
            log.err("Join to cluster %s failed: Master %s has refused me: %s" %
                    (core.cfg['CLUSTER_NAME'], self.master,
                     reason.getErrorMessage()))
            self.stopService()

        def joinAccepted(result):
            # Callback of the "register" call: we are now a passive member
            self.role = MasterService.RL_PASSIVE
            log.info("Join successfull, I'm now part of cluster %s." %
                     (core.cfg['CLUSTER_NAME']))
            startHeartbeats()

        def masterConnected(obj):
            # Callback of factory.getRootObject(): register to the master.
            # rpcConnector is assigned below, before this callback is attached.
            d = obj.callRemote("register", DNSCache.getInstance().name)
            d.addCallbacks(joinAccepted, joinRefused)
            d.addErrback(log.err)
            # Close the connection whatever the outcome
            d.addBoth(lambda _: rpcConnector.disconnect())
            return d

        try:
            if self.master is None:
                # New active master
                if DNSCache.getInstance(
                ).name not in core.cfg['ALLOWED_NODES']:
                    log.warn(
                        "I'm not allowed to create a new cluster. Exiting.")
                    raise Exception("Cluster creation not allowed")

                if DiskHeartbeat.is_in_use():
                    log.err("Heartbeat disk is in use but we are alone !")
                    raise Exception("Heartbeat disk already in use")

                log.info("No master found. I'm now the new master of %s." %
                         (core.cfg['CLUSTER_NAME']))
                self.role = MasterService.RL_ACTIVE
                self.master = DNSCache.getInstance().name
                # Register ourself in the status table, with no heartbeat received yet
                self.status[self.master] = {
                    'timestamp': 0,
                    'offset': 0,
                    'vms': []
                }
                self.disk.make_slot(DNSCache.getInstance().name)
                startHeartbeats()

            else:
                # Passive master
                self.role = MasterService.RL_JOINING
                log.info("Trying to join cluster %s..." %
                         (core.cfg['CLUSTER_NAME']))

                factory = pb.PBClientFactory()
                rpcConnector = reactor.connectTCP(self.master,
                                                  core.cfg['TCP_PORT'],
                                                  factory)
                d = factory.getRootObject()
                d.addCallback(masterConnected)
                d.addErrback(log.err)
        except Exception, e:
            log.err("Startup failed: %s. Shutting down." % (e))
            self.stopService()
Beispiel #22
0
#关闭系统的报错
os.close(sys.stderr.fileno())

if __name__ == '__main__':
    try:
        # 打印版权信息,注释这条前大家加下群吧,或收藏一下我的博客也可以,谢谢大家(〃'▽'〃)
        copyright.main()
        # 各项准备工作检查
        try:
            p = pyaudio.PyAudio()
            print(
                '\033[1;32m     ##################   声卡驱动加载成功!  ################### \033[0m'
            )
        except:
            err = logs.err()
            print(
                '\033[1;31m     ############  声卡驱动加载失败!请检查声卡驱动 ############# \033[0m'
            )
            exit()
        try:
            # 普通情况下直接init就可以了
            # 但我的声卡播放出来声卡有些怪异,所以这里调下频率来解决问题
            pygame.mixer.init(frequency=15500, size=-16, channels=4)
            print(
                '\033[1;32m     ##################   播放功能加载成功!  ################### \033[0m'
            )
        except:
            err = logs.err()
            print(
                '\033[1;31m     ############  播放功能加载失败!请检查声卡驱动 ############# \033[0m'
Beispiel #23
0
    else:
        f = me = None
        debug2('redo-ifchange: not adding depends.\n')
    jwack.setup(1)
    try:
        targets = sys.argv[1:]
        if f:
            for t in targets:
                f.add_dep('m', t)
            f.save()
            state.commit()
        rv = builder.main(targets, should_build)
    finally:
        try:
            state.rollback()
        finally:
            try:
                jwack.force_return_tokens()
            except Exception, e:
                traceback.print_exc(100, sys.stderr)
                err('unexpected error: %r\n' % e)
                rv = 1
except KeyboardInterrupt:
    if vars_init.is_toplevel:
        builder.await_log_reader()
    sys.exit(200)
state.commit()
if vars_init.is_toplevel:
    builder.await_log_reader()
sys.exit(rv)
Beispiel #24
0
#!/usr/bin/env python2
import sys, os
import state
from logs import err

# Usage: one target followed by at least one dependency.
if len(sys.argv[1:]) < 2:
    err('%s: at least 2 arguments expected.\n' % sys.argv[0])
    sys.exit(1)

target = sys.argv[1]
deps = sys.argv[2:]

# The target must not be listed among its own dependencies.
for d in deps:
    assert (d != target)

me = state.File(name=target)

# Build the known dependencies of our primary target.  This *does* require
# grabbing locks.
os.environ['REDO_NO_OOB'] = '1'
argv = ['redo-ifchange'] + deps
# Run redo-ifchange and wait for it; exit with its result if it is non-zero.
rv = os.spawnvp(os.P_WAIT, argv[0], argv)
if rv:
    sys.exit(rv)

# We know our caller already owns the lock on target, so we don't have to
# acquire another one; tell redo-ifchange about that.  Also, REDO_NO_OOB
# persists from up above, because we don't want to do OOB now either.
# (Actually it's most important for the primary target, since it's the one
# who initiated the OOB in the first place.)
os.environ['REDO_UNLOCKED'] = '1'
Beispiel #25
0
            try:
                log.info("Fencing useless node %s ..." % (name))
                self.get_local_node().fence(name)
            except Exception, e:
                # If fencing fail, this is not a big deal, VM are alive
                log.err("Fencing of %s failed:" % (name), e)

            return True  # Succeeded !
        except NotEnoughRamError:
            # Engage panic mode
            raise
        except NotInClusterError:
            # Next step of recovery process
            pass
        except Exception, e:
            log.err("Cannot get the VMs back:", e)

        if partial_failure:
            # Cannot recover, node still alive
            return False

            # Check if VM are still alive
        if len(vm_list) > 0:
            for node in self.get_nodes():
                if node.ping(vm_list):
                    log.warn("Some VM on %s are still alive !" % (name))
                    return False

            log.warn("All VM on %s are dead. Fencing now !" % (name))
        else:
            log.warn("No VM running on %s. Fencing now !" % (name))
Beispiel #26
0
		def joinRefused(reason):
			"""Log that the master refused our join request and stop the service."""
			# Only the two refusal error types are handled here
			reason.trap(NodeRefusedError, RPCRefusedError)
			details = (core.cfg['CLUSTER_NAME'], self.master, reason.getErrorMessage())
			log.err("Join to cluster %s failed: Master %s has refused me: %s" % details)
			self.stopService()
Beispiel #27
0
#!/usr/bin/env python2
import sys, os

import vars_init
vars_init.init([])

import state, vars
from logs import err

if len(sys.argv[1:]) != 0:
    err('%s: no arguments expected.\n' % sys.argv[0])
    sys.exit(1)

cwd = os.getcwd()
for f in state.files():
    if f.is_generated and f.read_stamp() != state.STAMP_MISSING:
        print state.relpath(os.path.join(vars.BASE, f.name), cwd)
Beispiel #28
0
		def commitFailed(reason):
			"""Log the error message carried by a failed SVN operation."""
			svn_error = reason.getErrorMessage()
			log.err("SVN failed: %s" % svn_error)
Beispiel #29
0
#!/usr/bin/env python2
import sys, os
import vars, state
from logs import err, debug2

# This command takes no argument: the data to stamp comes from stdin.
if len(sys.argv) > 1:
    err('%s: no arguments expected.\n' % sys.argv[0])
    sys.exit(1)

# Refuse to read from a terminal (file descriptor 0 is stdin).
if os.isatty(0):
    err('%s: you must provide the data to stamp on stdin\n' % sys.argv[0])
    sys.exit(1)

# hashlib is only available in python 2.5 or higher, but the 'sha' module
# produces a DeprecationWarning in python 2.6 or higher.  We want to support
# python 2.4 and above without any stupid warnings, so let's try using hashlib
# first, and downgrade if it fails.
try:
    import hashlib
except ImportError:
    import sha
    sh = sha.sha()
else:
    sh = hashlib.sha1()

# Feed stdin to the hash in reads of at most 4096 bytes; an empty read
# (end of input) ends the loop.
while 1:
    b = os.read(0, 4096)
    sh.update(b)
    if not b: break

# Hex digest of everything read from stdin.
csum = sh.hexdigest()
Beispiel #30
0
		
		return d
			

	def _unregister(self, name):
		try:
			del self.status[name]
		except:
			pass

		try:
			self.disk.erase_slot(name)
		except DiskHeartbeatError, e:
			log.warn("Cannot erase slot: %s. You may have to reformat hearbeat disk." % (e))
		except Exception, e:
			log.err("Diskheartbeat failure: %s." % (e))
			self.panic()

		DNSCache.getInstance().delete(name)
		log.info("Node %s has been unregistered." % (name))

	def unregisterNode(self, name):
		"""
		Check that node 'name' can be unregistered by this node.

		name - (String) Name of the node asking to quit the cluster.

		Raise RPCRefusedError if this node is not the active master.
		Raise NodeRefusedError if 'name' is not in self.status.

		NOTE(review): only the checks are visible in this excerpt; the rest
		of the method is not shown.
		"""
		# Can unregister node even if in panic mode

		if self.role != MasterService.RL_ACTIVE:
			log.warn("I'm not master. Cannot unregister %s." % (name))
			raise RPCRefusedError("Not master")

		if name not in self.status:
			log.warn("Unknown node %s try to quit the cluster." % (name))
			raise NodeRefusedError("Unknown node "+name)
Beispiel #31
0
try:
    if vars_init.is_toplevel:
        builder.start_stdin_log_reader(status=opt.status, details=opt.details,
            pretty=opt.pretty, color=opt.color,
            debug_locks=opt.debug_locks, debug_pids=opt.debug_pids)
    for t in targets:
        if os.path.exists(t):
            f = state.File(name=t)
            if not f.is_generated:
                warn('%s: exists and not marked as generated; not redoing.\n'
                     % f.nicename())
    state.rollback()
    
    j = atoi(opt.jobs or 1)
    if j < 1 or j > 1000:
        err('invalid --jobs value: %r\n' % opt.jobs)
    jwack.setup(j)
    try:
        assert(state.is_flushed())
        retcode = builder.main(targets, lambda t: (True, True))
        assert(state.is_flushed())
    finally:
        try:
            state.rollback()
        finally:
            try:
                jwack.force_return_tokens()
            except Exception, e:
                traceback.print_exc(100, sys.stderr)
                err('unexpected error: %r\n' % e)
                retcode = 1
Beispiel #32
0
 def joinRefused(reason):
     """Log that the master refused our join request and stop the service."""
     # Only the two refusal error types are handled here
     reason.trap(NodeRefusedError, RPCRefusedError)
     details = (core.cfg['CLUSTER_NAME'], self.master,
                reason.getErrorMessage())
     log.err("Join to cluster %s failed: Master %s has refused me: %s" %
             details)
     self.stopService()