def __init__(self, vmname, id=-1, ram=None, vcpu=None):
    """Instantiate a VM object, with the optional ram and vcpu metrics.

    The configuration is read from ``VMCONF_DIR/<vmname>``, falling back to
    ``VMCONF_DIR/<vmname>.cfg``; a missing file only produces a warning
    (unless QUIET is set). Each entry of the ``disk`` list of that
    configuration is matched against ``self.diskre``: group 1 becomes the
    key and group 2 the value stored in ``self.devices``.
    """
    self.name = vmname
    self.id = id
    self.__ram = ram
    self.__vcpu = vcpu
    self.config = dict()
    self.metrics = None
    self.devices = dict()

    # NOTE(review): execfile() runs the configuration file as Python code,
    # so the configuration directory must only contain trusted files.
    try:
        try:
            execfile("%s/%s" % (core.cfg['VMCONF_DIR'], vmname), dict(), self.config)
        except IOError:
            execfile("%s/%s.cfg" % (core.cfg['VMCONF_DIR'], vmname), dict(), self.config)
    except IOError:
        if not core.cfg['QUIET']:
            log.warn("Missing configuration file: %s" % (vmname))

    log.debug("[VM]", vmname, self.config)

    # Get devices from config file
    try:
        for disk in self.config['disk']:
            try:
                # Search once and reuse the match for both groups
                found = self.diskre.search(disk)
                self.devices[found.group(1)] = found.group(2)
            except Exception:
                # Malformed entry (no match, wrong type...): report and skip.
                # SystemExit/KeyboardInterrupt are no longer swallowed here.
                if not core.cfg['QUIET']:
                    log.warn("Bad disk input for %s: %s" % (self.name, disk))
    except KeyError:
        # No 'disk' entry in the configuration: no device to register
        pass
def checkSlaveHeartbeats(self):
    """Look for slaves whose network heartbeat expired, then read the disk heartbeats.

    Nothing is done unless this node is the active master, is neither in
    panic nor in recovery state, and knows more than one node. A failure
    while reading the disk heartbeat timestamps is logged and re-raised.
    """
    # Checks slaves timestamps only if we are active master
    if self.role != MasterService.RL_ACTIVE:
        return

    # No failover in panic mode, and no more failover if a recovery is running
    if self.state in (MasterService.ST_PANIC, MasterService.ST_RECOVERY):
        return

    # No failover if we are alone
    if len(self.status) <= 1:
        return

    # Check net heartbeat
    netFailed = Set()
    for name, values in self.status.items():
        lastSeen = values['timestamp']
        # Do nothing if first heartbeat has not been received yet
        if lastSeen != 0 and lastSeen + MasterService.TM_SLAVE <= int(time.time()):
            log.warn("Net heartbeat lost for %s." % (name))
            netFailed.add(name)

    # Get diskhearbeat timestamps
    # NOTE(review): netFailed and tsDisk are not used further in the code visible here.
    try:
        tsDisk = self.disk.get_all_ts()
    except Exception as e:
        log.err("Diskheartbeat read failed: %s." % (e))
        raise
def countVotes(self):
    """Tally the received votes and take the role decided by the election.

    The winner is the node stored under the highest ballot of
    ``self.ballotBox``. When no usable ballot exists, panic mode is engaged
    instead. A panic requested while the election was running is honoured
    once the new role has been set.
    """
    if self.role != MasterService.RL_VOTING:
        log.warn("Tally triggered but it's not election time !")
        return

    # isinstance() also accepts dict subclasses, which the former exact type
    # comparison wrongly handled as "no vote received".
    if not isinstance(self.ballotBox, dict) or not self.ballotBox:
        log.emerg(
            "No vote received ! There is a critical network failure.")
        self.panic(True)  # noCheck=True because role is not consistent
        return

    # Select election winner
    self.currentElection = None
    self.lastTallyDate = int(time.time())
    self.master = self.ballotBox[max(self.ballotBox)]
    log.info("New master is %s." % (self.master))
    self._startSlave()

    if self.master == DNSCache.getInstance().name:
        log.info("I'm the new master.")
        self.role = MasterService.RL_ACTIVE
        self._startMaster()
    else:
        self.role = MasterService.RL_PASSIVE

    if self.panicRequested:
        log.warn("Engaging panic mode requested during election stage.")
        self.panicRequested = False
        self.panic()
def countVotes(self):
    """Tally the received votes and take the role decided by the election.

    The winner is the node stored under the highest ballot of
    ``self.ballotBox``. When no usable ballot exists, panic mode is engaged
    instead. A panic requested while the election was running is honoured
    once the new role has been set.
    """
    if self.role != MasterService.RL_VOTING:
        log.warn("Tally triggered but it's not election time !")
        return

    # isinstance() also accepts dict subclasses, which the former exact type
    # comparison wrongly handled as "no vote received".
    if not isinstance(self.ballotBox, dict) or not self.ballotBox:
        log.emerg("No vote received ! There is a critical network failure.")
        self.panic(True)  # noCheck=True because role is not consistent
        return

    # Select election winner
    self.currentElection = None
    self.lastTallyDate = int(time.time())
    self.master = self.ballotBox[max(self.ballotBox)]
    log.info("New master is %s." % (self.master))
    self._startSlave()

    if self.master == DNSCache.getInstance().name:
        log.info("I'm the new master.")
        self.role = MasterService.RL_ACTIVE
        self._startMaster()
    else:
        self.role = MasterService.RL_PASSIVE

    if self.panicRequested:
        log.warn("Engaging panic mode requested during election stage.")
        self.panicRequested = False
        self.panic()
def checkSlaveHeartbeats(self):
    """Look for slaves whose network heartbeat expired, then read the disk heartbeats.

    Nothing is done unless this node is the active master, is neither in
    panic nor in recovery state, and knows more than one node. A failure
    while reading the disk heartbeat timestamps is logged and re-raised.
    """
    # Checks slaves timestamps only if we are active master
    if self.role != MasterService.RL_ACTIVE:
        return

    # No failover in panic mode, and no more failover if a recovery is running
    if self.state in (MasterService.ST_PANIC, MasterService.ST_RECOVERY):
        return

    # No failover if we are alone
    if len(self.status) <= 1:
        return

    # Check net heartbeat
    netFailed = Set()
    for name, values in self.status.items():
        lastSeen = values['timestamp']
        # Do nothing if first heartbeat has not been received yet
        if lastSeen != 0 and lastSeen + MasterService.TM_SLAVE <= int(time.time()):
            log.warn("Net heartbeat lost for %s." % (name))
            netFailed.add(name)

    # Get diskhearbeat timestamps
    # NOTE(review): netFailed and tsDisk are not used further in the code visible here.
    try:
        tsDisk = self.disk.get_all_ts()
    except Exception as e:
        log.err("Diskheartbeat read failed: %s." % (e))
        raise
def _startMaster(self):
    """Start the master heartbeat and schedule the slaves watchdog two seconds later.

    If the state inherited from the previous master is "recovery", it is
    forced back to normal so that failover can run again.
    """
    def onWatchdogFailure(reason):
        log.emerg("Slave heartbeat checks failed: %s." %
                  (reason.getErrorMessage()))
        self.panic()

    def launchWatchdog():
        # Nothing to do when the watchdog is already running
        if self.l_masterDog.running:
            return
        watching = self.l_masterDog.start(MasterService.TM_WATCHDOG)
        watching.addErrback(onWatchdogFailure)
        watching.addErrback(log.err)

    # Start master heartbeat
    self.s_masterHb.startService()

    # Check state of previous master
    if self.state == MasterService.ST_RECOVERY:
        log.warn(
            "Previous master was recovering something: re-enabling failover."
        )
        # Force normal mode to re-run failover
        self.state = MasterService.ST_NORMAL

    # Start master's watchdog for slaves failover
    reactor.callLater(2, launchWatchdog)
def __init__(self):
    """Load the templates found in ``<root>/templates`` and compile each one with pybars.

    ``<root>`` is located two directory levels above the directory holding
    this file. Every compiled template is stored under its key in both
    ``self._compiled`` and ``self._partials``. A template that fails to
    compile is logged and the error is re-raised.
    """
    here = os.path.abspath(os.path.dirname(__file__))
    root = os.path.dirname(os.path.dirname(here))
    self._load_templates(os.path.join(root, 'templates'))

    self._renderer = pybars.Compiler()
    self._compiled = {}
    self._partials = {}

    for key, entry in self.templates.iteritems():
        # Each entry is a (path, source) pair; a dedicated name avoids
        # rebinding the directory path computed above.
        template_path, source = entry
        source = unicode(source)

        try:
            compiled = self._renderer.compile(source)
        except Exception as e:
            logs.warn("[%s] template compiler error (%s): %s" % (self, template_path, e))
            raise

        self._compiled[key] = compiled
        self._partials[key] = compiled
def joinCluster(self):
    """Join the cluster, either by creating it or by registering to its master.

    Without a known master, this node becomes the active master, provided
    it is listed in ALLOWED_NODES and the heartbeat disk is not in use.
    Otherwise it connects to the master and asks to be registered. Any
    exception raised during startup is logged and stops the service.
    """
    def startHeartbeats():
        self._startSlave()
        self.s_rpc.startService()
        if self.role == MasterService.RL_ACTIVE:
            self._startMaster()

    def joinRefused(reason):
        reason.trap(NodeRefusedError, RPCRefusedError)
        log.err("Join to cluster %s failed: Master %s has refused me: %s" %
                (core.cfg['CLUSTER_NAME'], self.master, reason.getErrorMessage()))
        self.stopService()

    def joinAccepted(result):
        self.role = MasterService.RL_PASSIVE
        log.info("Join successfull, I'm now part of cluster %s." % (core.cfg['CLUSTER_NAME']))
        startHeartbeats()

    def masterConnected(obj):
        registering = obj.callRemote("register", DNSCache.getInstance().name)
        registering.addCallbacks(joinAccepted, joinRefused)
        registering.addErrback(log.err)
        # rpcConnector is bound below, before this callback can fire
        registering.addBoth(lambda _: rpcConnector.disconnect())
        return registering

    try:
        if self.master is not None:
            # Passive master
            self.role = MasterService.RL_JOINING
            log.info("Trying to join cluster %s..." % (core.cfg['CLUSTER_NAME']))

            factory = pb.PBClientFactory()
            rpcConnector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'], factory)
            connecting = factory.getRootObject()
            connecting.addCallback(masterConnected)
            connecting.addErrback(log.err)
        else:
            # New active master
            if DNSCache.getInstance().name not in core.cfg['ALLOWED_NODES']:
                log.warn("I'm not allowed to create a new cluster. Exiting.")
                raise Exception("Cluster creation not allowed")

            if DiskHeartbeat.is_in_use():
                log.err("Heartbeat disk is in use but we are alone !")
                raise Exception("Heartbeat disk already in use")

            log.info("No master found. I'm now the new master of %s." % (core.cfg['CLUSTER_NAME']))
            self.role = MasterService.RL_ACTIVE
            self.master = DNSCache.getInstance().name
            self.status[self.master] = {'timestamp': 0, 'offset': 0, 'vms': []}
            self.disk.make_slot(DNSCache.getInstance().name)
            startHeartbeats()
    except Exception as e:
        log.err("Startup failed: %s. Shutting down." % (e))
        self.stopService()
def _unregister(self, name):
    """Forget node `name`: drop its status record and erase its heartbeat disk slot.

    An unknown node is silently accepted. A failure while erasing the disk
    slot is only logged as a warning.
    """
    try:
        del self.status[name]
    except KeyError:
        # Node already absent from the status table. Only this expected
        # error is ignored, instead of everything as a bare except did.
        pass

    try:
        self.disk.erase_slot(name)
    except DiskHeartbeatError as e:
        log.warn("Cannot erase slot: %s. You may have to reformat hearbeat disk." % (e))
def checkMasterHeartbeat(self):
    """Trigger an election when the master heartbeat has not been seen for TM_MASTER seconds.

    Only a passive node performs this check. Returns the result of
    ``triggerElection()`` when an election is started, None otherwise.
    """
    # Master failover is still possible even if in panic mode

    # Master failover only if we are a slave
    if self.role != MasterService.RL_PASSIVE:
        return

    # Usecase #7: master lost
    deadline = self.masterLastSeen + MasterService.TM_MASTER
    if deadline > int(time.time()):
        return

    log.warn("Broadcast heartbeat lost, master has disappeared.")
    return self.triggerElection()
def checkMasterHeartbeat(self):
    """Trigger an election when the master heartbeat has not been seen for TM_MASTER seconds.

    Only a passive node performs this check. Returns the result of
    ``triggerElection()`` when an election is started, None otherwise.
    """
    # Master failover is still possible even if in panic mode

    # Master failover only if we are a slave
    if self.role != MasterService.RL_PASSIVE:
        return

    # Usecase #7: master lost
    deadline = self.masterLastSeen + MasterService.TM_MASTER
    if deadline > int(time.time()):
        return

    log.warn("Broadcast heartbeat lost, master has disappeared.")
    return self.triggerElection()
def _stopMaster(self):
    """Stop master duties: send a last heartbeat, stop the heartbeat service and the watchdog."""
    if self.state == MasterService.ST_RECOVERY:
        # Recovery will be re-run by next master (VM on current host may be lost)
        # but first recovery may be still running (cannot kill threads)
        log.warn("Stopping master during a recovery process !")

    # Send a last heartbeat before stopping
    heartbeat = self.s_masterHb
    heartbeat.forcePulse()
    stopping = heartbeat.stopService()
    stopping.addErrback(log.err)

    watchdog = self.l_masterDog
    if watchdog.running:
        watchdog.stop()
def _unregister(self, name):
    """Forget node `name`: drop its status record and erase its heartbeat disk slot.

    An unknown node is silently accepted. A failure while erasing the disk
    slot is only logged as a warning.
    """
    try:
        del self.status[name]
    except KeyError:
        # Node already absent from the status table. Only this expected
        # error is ignored, instead of everything as a bare except did.
        pass

    try:
        self.disk.erase_slot(name)
    except DiskHeartbeatError as e:
        log.warn(
            "Cannot erase slot: %s. You may have to reformat hearbeat disk."
            % (e))
def sync_stat(api, stat, reason=""):
    """Report a migration state and its message to `api`.

    Builds two "replace" operations, one for ``/status/migrateState`` (set
    to `stat`) and one for ``/status/message`` (set to `reason`), and hands
    them to ``sync``. Nothing is sent when `api` is None or empty.
    """
    logs.info("\x1b[0;32m status:%s reason:%s \x1b[0m", stat, reason)

    # Identity test for None: `== None` depends on the operand's __eq__
    if api is None or api == '':
        logs.warn("sync state api is nil")
        return

    patch = [
        {"op": "replace", "path": "/status/migrateState", "value": stat},
        {"op": "replace", "path": "/status/message", "value": reason},
    ]
    sync(api, patch)

# Example:
# sync_stat('http://10.213.44.128:12808/tidb/api/v1/tidbs/006-xinyang1', 'Dumping')
def _transform_stamps(self, stamps):
    """Return the HTTP export of every stamp that can be converted.

    `stamps` may be None, which is handled like an empty list. A stamp
    whose conversion raises is logged and left out of the result.
    """
    pending = [] if stamps is None else stamps
    exported = []

    for item in pending:
        try:
            converted = HTTPStamp().importStamp(item).dataExport()
            exported.append(converted)
        except Exception:
            logs.warn(utils.getFormattedException())

    return exported
def _sendError(self, reason):
    """Handle a network heartbeat failure.

    While fewer than MAX_RETRY retries were made, the heartbeat is
    rescheduled two seconds later. Beyond that, the master is asked to
    engage panic mode.
    """
    # Log all stacktrace to view the origin of this error
    log.err("Netheartbeat failure: %s" % (reason))

    if self.retry < self.MAX_RETRY:
        log.warn("Restarting network heartbeat within a few seconds...")
        self.retry += 1  # Will be resetted each elections (or panic recovery)
        reactor.callLater(2, self._run, self._proto)
        return

    log.emerg("Too many retry. Asking master to engage panic mode.")
    # Engage panic mode
    panicAgent = Agent()
    request = panicAgent.panic()
    request.addErrback(log.err)
    request.addBoth(lambda _: panicAgent.disconnect())
def voteForNewMaster(self, msg):
    """Answer a vote request: enter the election stage and send our vote.

    Requests from unknown nodes are discarded. A running election is
    aborted in favour of the new one. A node that is leaving the cluster
    ignores the request. Otherwise heartbeats are stopped, the ballot box
    is reset, the tally is scheduled after TM_TALLY and the vote is sent.
    """
    # Elections accepted even if in panic mode
    def sendVote(result):
        log.info("Sending our vote...")
        result.sendMessage()
        port.stopListening()

    # Check origin of message
    if msg.node not in self.status:
        log.warn("Received vote request from unknown node %s." % (msg.node))
        return

    # Discard current election if there is a new one
    if self.role == MasterService.RL_VOTING:
        log.warn("Previous election aborded: new vote request received.")
        try:
            self.f_tally.cancel()
        except Exception:
            # Best effort: a tally that cannot be cancelled is ignored.
            # Unlike a bare except, SystemExit/KeyboardInterrupt go through.
            pass

    log.info("Vote request received from %s." % (msg.node))
    self.currentElection = msg.election

    # Discard vote request if we are leaving
    if self.role == MasterService.RL_LEAVING:
        log.info("Vote request ignored: we are leaving this cluster.")
        return

    # Stop heartbeating
    self._stopSlave()
    if self.role == MasterService.RL_ACTIVE:
        self._stopMaster()

    # Prepare election
    self.role = MasterService.RL_VOTING
    self.ballotBox = dict()
    self.f_tally = reactor.callLater(
        MasterService.TM_TALLY, self.countVotes)  # Timout of election stage

    # Send our vote
    d = Deferred()
    port = reactor.listenUDP(
        0,
        UDPSender(
            d, lambda: MessageVoteResponse().forge(self.currentElection)))
    d.addCallback(sendVote)
    d.addErrback(log.err)
def _sendError(self, reason):
    """Handle a network heartbeat failure.

    While fewer than MAX_RETRY retries were made, the heartbeat is
    rescheduled two seconds later. Beyond that, the master is asked to
    engage panic mode.
    """
    # Log all stacktrace to view the origin of this error
    log.err("Netheartbeat failure: %s" % (reason))

    if self.retry < self.MAX_RETRY:
        log.warn("Restarting network heartbeat within a few seconds...")
        self.retry += 1  # Will be resetted each elections (or panic recovery)
        reactor.callLater(2, self._run, self._proto)
        return

    log.emerg("Too many retry. Asking master to engage panic mode.")
    # Engage panic mode
    panicAgent = Agent()
    request = panicAgent.panic()
    request.addErrback(log.err)
    request.addBoth(lambda _: panicAgent.disconnect())
def migrate(self, vmname, src_hostname, dst_hostname):
    """Live migration of specified VM from src to dst.

    All params are strings.

    Raise a NotInClusterError if src or dst are not part of cluster.
    Raise a NotRunningVmError if vm is not started on src or a RunningVmError if vm is already started on dst.
    Raise a NotEnoughRamError if dst has less free ram than the vm uses.
    """
    # Security checks
    for hostname in (src_hostname, dst_hostname):
        if not self.is_in_cluster(hostname):
            raise NotInClusterError(hostname)

    dst_node = self.get_node(dst_hostname)
    src_node = self.get_node(src_hostname)

    if not src_node.is_vm_started(vmname):
        raise NotRunningVmError(src_node.get_hostname(), vmname)

    if dst_node.is_vm_started(vmname):
        raise RunningVmError(dst_node.get_hostname(), vmname)

    # Resources checks
    used_ram = src_node.get_vm(vmname).get_ram()
    free_ram = dst_node.metrics.get_free_ram()
    if used_ram > free_ram:
        shortage = "need " + str(used_ram) + "M, has " + str(free_ram) + "M."
        raise NotEnoughRamError(dst_node.get_hostname(), shortage)

    # Take care of proper migration
    dst_node.activate_lv(vmname)
    src_node.migrate(vmname, dst_node)  # Could raise xen.xm.XenAPI.Failure (not SystemExit)
    src_node.deactivate_lv(vmname)
    src_node.disable_vm_autostart(vmname)
    dst_node.enable_vm_autostart(vmname)

    if not core.cfg["POST_MIGRATION_HOOK"]:
        return

    # Run post migration script in background without error handling
    try:
        # Double fork with all filehandles closed, if not, run() hang
        hook_args = (core.cfg["POST_MIGRATION_HOOK"], vmname,
                     src_node.get_hostname(), dst_node.get_hostname())
        src_node.run("(%s %s %s %s 2>&- >&- <&- &)&" % hook_args)
    except Exception as e:
        log.warn("Post-migration hook failed : %s" % (e))
def updateNodeStatus(self, msg):
    """Record a slave heartbeat: reception time, clock offset and VM list of the sender.

    Heartbeats are ignored when this node is not the active master or when
    the sender is not a known node.
    """
    role = self.role
    if role != MasterService.RL_ACTIVE:
        # Some slave HB could reach us during election...
        if role == MasterService.RL_PASSIVE:
            log.warn("Received slave heartbeat from %s while we're not master." % (msg.node))
        return

    # Check origin of message
    if msg.node not in self.status:
        log.warn("Received slave heartbeat from unknown node %s." % (msg.node))
        return

    received = int(time.time())
    record = {'timestamp': received, 'offset': received - msg.ts, 'vms': msg.vms}
    self.status[msg.node] = record
def instantiationFailed(reason):
    """Errback: retry the cluster instantiation without the nodes that failed.

    Only InstantiationError failures are handled. Returns the deferred of
    the new instantiation, chained to ``startRecover``.
    """
    reason.trap(InstantiationError)
    unreachable = reason.value.value.keys()
    log.warn("Can't connect to", ", ".join(unreachable))

    # Delete failed nodes from cluster list
    remaining = self.getNodesList()
    for nodeName in unreachable:
        remaining.remove(nodeName)

    # Re-instanciate cluster without nodes in error
    retrying = XenCluster.getDeferInstance(remaining)
    retrying.addCallbacks(startRecover)
    return retrying
def instantiationFailed(reason):
    """Errback: retry the cluster instantiation without the nodes that failed.

    Only InstantiationError failures are handled. Returns the deferred of
    the new instantiation, chained to ``startRecover``.
    """
    reason.trap(InstantiationError)
    unreachable = reason.value.value.keys()
    log.warn("Can't connect to", ", ".join(unreachable))

    # Delete failed nodes from cluster list
    remaining = self.getNodesList()
    for nodeName in unreachable:
        remaining.remove(nodeName)

    # Re-instanciate cluster without nodes in error
    retrying = XenCluster.getDeferInstance(remaining)
    retrying.addCallbacks(startRecover)
    return retrying
def migrate(self, vmname, src_hostname, dst_hostname):
    """Live migration of specified VM from src to dst.

    All params are strings.

    Raise a NotInClusterError if src or dst are not part of cluster.
    Raise a NotRunningVmError if vm is not started on src or a RunningVmError if vm is already started on dst.
    Raise a NotEnoughRamError if dst has less free ram than the vm uses.
    """
    # Security checks
    for hostname in (src_hostname, dst_hostname):
        if not self.is_in_cluster(hostname):
            raise NotInClusterError(hostname)

    dst_node = self.get_node(dst_hostname)
    src_node = self.get_node(src_hostname)

    if not src_node.is_vm_started(vmname):
        raise NotRunningVmError(src_node.get_hostname(), vmname)

    if dst_node.is_vm_started(vmname):
        raise RunningVmError(dst_node.get_hostname(), vmname)

    # Resources checks
    used_ram = src_node.get_vm(vmname).get_ram()
    free_ram = dst_node.metrics.get_free_ram()
    if used_ram > free_ram:
        shortage = "need " + str(used_ram) + "M, has " + str(free_ram) + "M."
        raise NotEnoughRamError(dst_node.get_hostname(), shortage)

    # Take care of proper migration
    dst_node.activate_lv(vmname)
    src_node.migrate(vmname, dst_node)  # Could raise xen.xm.XenAPI.Failure (not SystemExit)
    src_node.deactivate_lv(vmname)
    src_node.disable_vm_autostart(vmname)
    dst_node.enable_vm_autostart(vmname)

    if not core.cfg['POST_MIGRATION_HOOK']:
        return

    # Run post migration script in background without error handling
    try:
        # Double fork with all filehandles closed, if not, run() hang
        hook_args = (core.cfg['POST_MIGRATION_HOOK'], vmname,
                     src_node.get_hostname(), dst_node.get_hostname())
        src_node.run("(%s %s %s %s 2>&- >&- <&- &)&" % hook_args)
    except Exception as e:
        log.warn("Post-migration hook failed : %s" % (e))
def recoverFromPanic(self):
    """Leave panic mode and trigger a new election.

    Raise a RPCRefusedError if we are not in panic mode or not the master.
    Returns the result of ``triggerElection()``.
    """
    if not self.isInPanic():
        log.warn("I'm not in panic. Cannot recover anything.")
        raise RPCRefusedError("Not in panic mode")

    # Only master can do recovery
    isMaster = self.role == MasterService.RL_ACTIVE
    if not isMaster:
        log.warn("I'm not master. Cannot recover from panic.")
        raise RPCRefusedError("Not master")

    # Back to normal mode
    log.info("Recovering from panic mode. Back to normals operations.")
    self.state = MasterService.ST_NORMAL
    self.s_masterHb.forcePulse()

    return self.triggerElection()
def recoverFromPanic(self):
    """Leave panic mode and trigger a new election.

    Raise a RPCRefusedError if we are not in panic mode or not the master.
    Returns the result of ``triggerElection()``.
    """
    if not self.isInPanic():
        log.warn("I'm not in panic. Cannot recover anything.")
        raise RPCRefusedError("Not in panic mode")

    # Only master can do recovery
    isMaster = self.role == MasterService.RL_ACTIVE
    if not isMaster:
        log.warn("I'm not master. Cannot recover from panic.")
        raise RPCRefusedError("Not master")

    # Back to normal mode
    log.info("Recovering from panic mode. Back to normals operations.")
    self.state = MasterService.ST_NORMAL
    self.s_masterHb.forcePulse()

    return self.triggerElection()
def process_user(self, user, categories=None):
    """Generate and save the collage images of `user` for each category.

    `categories` defaults to ``self._categories``. For every category up to
    100 stamps are fetched, 30 shuffled entities are kept, and each image
    returned by the category's collage is saved. The whole run is attempted
    up to three times; each failure is logged and followed by a sleep of
    2, 4 then 8 seconds.
    """
    assert user is not None

    if categories is None:
        categories = self._categories

    attempt = 0
    while attempt < 3:
        try:
            for category in categories:
                ts = {
                    'user_id' : user.user_id,
                    'scope'   : 'user'
                }

                if category == 'app':
                    ts['subcategory'] = 'app'
                elif category != 'default':
                    ts['category'] = category

                ts['limit'] = 100
                collage = self._collages[category]

                stamp_slice = HTTPTimeSlice().dataImport(ts).exportTimeSlice()
                stamps = self.api.getStampCollection(stamp_slice)
                entities = utils.shuffle([s.entity for s in stamps])[:30]

                logs.info("creating collage for user '%s' w/ category '%s' and %d entities" % (user.screen_name, category, len(entities)))

                for image in collage.generate_from_user(user, entities):
                    width, height = image.size[0], image.size[1]
                    filename = "collage-%s-%s-%sx%s.jpg" % (user.screen_name, category, width, height)
                    self.save_image(image, filename)

            # Every category went through: no further attempt needed
            break
        except Exception as e:
            logs.warn("unexpected error processing user %s: %s" % (str(user), e))
            logs.warn(utils.getFormattedException())

            attempt += 1
            time.sleep(2 ** attempt)
def voteForNewMaster(self, msg):
    """Answer a vote request: enter the election stage and send our vote.

    Requests from unknown nodes are discarded. A running election is
    aborted in favour of the new one. A node that is leaving the cluster
    ignores the request. Otherwise heartbeats are stopped, the ballot box
    is reset, the tally is scheduled after TM_TALLY and the vote is sent.
    """
    # Elections accepted even if in panic mode
    def sendVote(result):
        log.info("Sending our vote...")
        result.sendMessage()
        port.stopListening()

    # Check origin of message
    if msg.node not in self.status:
        log.warn("Received vote request from unknown node %s." % (msg.node))
        return

    # Discard current election if there is a new one
    if self.role == MasterService.RL_VOTING:
        log.warn("Previous election aborded: new vote request received.")
        try:
            self.f_tally.cancel()
        except Exception:
            # Best effort: a tally that cannot be cancelled is ignored.
            # Unlike a bare except, SystemExit/KeyboardInterrupt go through.
            pass

    log.info("Vote request received from %s." % (msg.node))
    self.currentElection = msg.election

    # Discard vote request if we are leaving
    if self.role == MasterService.RL_LEAVING:
        log.info("Vote request ignored: we are leaving this cluster.")
        return

    # Stop heartbeating
    self._stopSlave()
    if self.role == MasterService.RL_ACTIVE:
        self._stopMaster()

    # Prepare election
    self.role = MasterService.RL_VOTING
    self.ballotBox = dict()
    self.f_tally = reactor.callLater(MasterService.TM_TALLY, self.countVotes)  # Timout of election stage

    # Send our vote
    d = Deferred()
    port = reactor.listenUDP(0, UDPSender(d, lambda: MessageVoteResponse().forge(self.currentElection)))
    d.addCallback(sendVote)
    d.addErrback(log.err)
def leaveCluster(self):
    """Leave the cluster and return the deferred tracking that departure.

    An active master unregisters itself, triggers an election when other
    nodes remain, then stops its master duties. A passive node asks the
    master to unregister it. In any other role nothing is sent.
    """
    def masterConnected(obj):
        unregistering = obj.callRemote("unregister", DNSCache.getInstance().name)
        unregistering.addErrback(log.err)
        # rpcConnector is bound below, before this callback can fire
        unregistering.addBoth(lambda _: rpcConnector.disconnect())
        return unregistering

    # Stop slave hearbeat and watchdog
    self._stopSlave()

    previousRole, self.role = self.role, MasterService.RL_LEAVING

    if previousRole == MasterService.RL_ACTIVE:
        # Self-delete our own record
        self._unregister(DNSCache.getInstance().name)

        if len(self.status) <= 0:
            log.warn("I'm the last node, shutting down cluster.")
            leaving = defer.succeed(None)
        else:
            # New election only if there is at least one node
            leaving = self.triggerElection()
            leaving.addErrback(log.err)

        # Stop master hearbeat when vote request has been sent
        leaving.addBoth(lambda _: self._stopMaster())
        return leaving

    if previousRole == MasterService.RL_PASSIVE:
        rpcFactory = pb.PBClientFactory()
        rpcConnector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'], rpcFactory)
        leaving = rpcFactory.getRootObject()
        leaving.addCallback(masterConnected)
        return leaving

    # RL_ALONE or RL_JOINING or RL_VOTING
    if previousRole == MasterService.RL_VOTING:
        # Others nodes will re-trigger an election if we win this one
        log.warn("Quitting cluster during election stage !")
    return defer.succeed(None)
def leaveCluster(self):
    """Leave the cluster and return the deferred tracking that departure.

    An active master unregisters itself, triggers an election when other
    nodes remain, then stops its master duties. A passive node asks the
    master to unregister it. In any other role nothing is sent.
    """
    def masterConnected(obj):
        unregistering = obj.callRemote("unregister",
                                       DNSCache.getInstance().name)
        unregistering.addErrback(log.err)
        # rpcConnector is bound below, before this callback can fire
        unregistering.addBoth(lambda _: rpcConnector.disconnect())
        return unregistering

    # Stop slave hearbeat and watchdog
    self._stopSlave()

    previousRole, self.role = self.role, MasterService.RL_LEAVING

    if previousRole == MasterService.RL_ACTIVE:
        # Self-delete our own record
        self._unregister(DNSCache.getInstance().name)

        if len(self.status) <= 0:
            log.warn("I'm the last node, shutting down cluster.")
            leaving = defer.succeed(None)
        else:
            # New election only if there is at least one node
            leaving = self.triggerElection()
            leaving.addErrback(log.err)

        # Stop master hearbeat when vote request has been sent
        leaving.addBoth(lambda _: self._stopMaster())
        return leaving

    if previousRole == MasterService.RL_PASSIVE:
        rpcFactory = pb.PBClientFactory()
        rpcConnector = reactor.connectTCP(self.master,
                                          core.cfg['TCP_PORT'], rpcFactory)
        leaving = rpcFactory.getRootObject()
        leaving.addCallback(masterConnected)
        return leaving

    # RL_ALONE or RL_JOINING or RL_VOTING
    if previousRole == MasterService.RL_VOTING:
        # Others nodes will re-trigger an election if we win this one
        log.warn("Quitting cluster during election stage !")
    return defer.succeed(None)
def fence(self, hostname):
    """
    Fence the given node.

    The fencing itself is delegated to the script configured as FENCE_CMD,
    which is run with the hostname as its argument; that script is expected
    to drive iLo, IPMI or any other fencing device.

    Raise a FenceNodeError if the fence fails or if DISABLE_FENCING is True.
    """
    if core.cfg['DISABLE_FENCING']:
        raise FenceNodeError(self.get_hostname(), "Fencing disabled by configuration", hostname)

    fencing_ourselves = self.get_hostname() == hostname
    if fencing_ourselves:
        log.warn("Node is self-fencing !")

    log.warn("\"Chérie ça va trancher.\"")

    try:
        command = core.cfg['FENCE_CMD'] + " " + hostname
        self.run(command)
    except ShellError as e:
        raise FenceNodeError(self.get_hostname(), e.value, hostname)
def revive_tombstoned_entities(entity_id):
    """
    Undo the tombstoning of every entity that is tombstoned to the given entity.

    The todos and stamps currently attached to the given entity are looked up
    in their seed collections to find the entity they originally belonged to,
    and are pointed back at that entity. Every revived entity is finally
    passed to the API's mergeEntity.
    """
    entity_db = MongoEntityCollection.MongoEntityCollection()
    entities_by_id = {}

    def original_entity(original_entity_id):
        # Reuse an entity that was already revived, otherwise load it and
        # clear its tombstone.
        if original_entity_id in entities_by_id:
            return entities_by_id[original_entity_id]
        loaded = entity_db.getEntity(original_entity_id)
        return clear_tombstone_id(loaded, entity_db, entities_by_id)

    for tombstoned in entity_db.getEntitiesByTombstoneId(entity_id):
        clear_tombstone_id(tombstoned, entity_db, entities_by_id)

    # Give todos back to their original entity
    todo_db = MongoTodoCollection.MongoTodoCollection()
    todo_seed_db = MongoTodoCollection.MongoSeedTodoCollection()
    for todo_id in todo_db.getTodoIdsFromEntityId(entity_id):
        original_entity_id = todo_seed_db.getEntityIdForTodo(todo_id)
        if original_entity_id is None:
            logs.warn('Could not find entity for seed todo: ' + todo_id)
            continue
        restored = original_entity(original_entity_id)
        todo_db.updateTodoEntity(todo_id, restored.minimize())

    # Give stamps back to their original entity
    stamp_db = MongoStampCollection.MongoStampCollection()
    stamp_seed_db = MongoStampCollection.MongoSeedStampCollection()
    for stamp_id in stamp_db.getStampIdsForEntity(entity_id):
        seed_stamp = stamp_seed_db.getStamp(stamp_id)
        restored = original_entity(seed_stamp.entity.entity_id)
        stamp_db.updateStampEntity(stamp_id, restored.minimize())

    api = globalMongoStampedAPI()
    for revived in entities_by_id.itervalues():
        api.mergeEntity(revived)
def _buildMap(template, stamps):
    """Render `template` with the JSON dump of the exported, non-deleted stamps.

    A stamp that cannot be exported is logged and skipped. The response is
    given ten-minute ``Expires`` and ``Cache-Control`` headers. Any other
    failure is logged and turned into an Http404.
    """
    try:
        result = []
        for stamp in stamps:
            try:
                if "deleted" not in stamp:
                    result.append(HTTPStamp().importSchema(stamp).dataExport())
            except Exception:
                # Skip the faulty stamp only. A bare except would also have
                # swallowed SystemExit and KeyboardInterrupt.
                logs.warn(utils.getFormattedException())

        result = json.dumps(result, sort_keys=True)
        response = render_to_response(template, {"stamps": result})
        response["Expires"] = (datetime.utcnow() + timedelta(minutes=10)).ctime()
        response["Cache-Control"] = "max-age=600"
        return response
    except Exception as e:
        logs.warning("Error: %s" % e)
        raise Http404
def _elasticsearch(self):
    """Return a ``pyes.ES`` client, or None when no search server can be located.

    On EC2 the servers are the stack nodes having the "search" role, reached
    on port 9200 through their private address; elsewhere localhost:9200 is
    used. The connection is attempted up to five times, one second apart,
    and the last error is re-raised.
    """
    try:
        import pyes
    except Exception:
        # NOTE(review): execution goes on without pyes, so every connection
        # attempt below then fails until the retries are exhausted.
        utils.printException()

    es_port = 9200
    retries = 5

    if libs.ec2_utils.is_ec2():
        stack = libs.ec2_utils.get_stack()

        if stack is None:
            logs.warn("error: unable to find stack info")
            return None

        # A real list, so that the emptiness test below does not depend on
        # filter()/map() returning lists.
        es_servers = [str("%s:%d" % (node.private_ip_address, es_port))
                      for node in stack.nodes if "search" in node.roles]

        if not es_servers:
            logs.warn("error: no elasticsearch servers found")
            return None
    else:
        es_servers = "%s:%d" % ("localhost", es_port)

    while True:
        try:
            es = pyes.ES(es_servers)
            info = es.collect_info()
            utils.log("[%s] pyes: %s" % (self, pformat(info)))
            return es
        except Exception:
            retries -= 1
            if retries <= 0:
                raise

            utils.printException()
            time.sleep(1)
def updateNodeStatus(self, msg):
    """Record a slave heartbeat: reception time, clock offset and VM list of the sender.

    Heartbeats are ignored when this node is not the active master or when
    the sender is not a known node.
    """
    role = self.role
    if role != MasterService.RL_ACTIVE:
        # Some slave HB could reach us during election...
        if role == MasterService.RL_PASSIVE:
            log.warn(
                "Received slave heartbeat from %s while we're not master."
                % (msg.node))
        return

    # Check origin of message
    if msg.node not in self.status:
        log.warn("Received slave heartbeat from unknown node %s." %
                 (msg.node))
        return

    received = int(time.time())
    record = {'timestamp': received, 'offset': received - msg.ts, 'vms': msg.vms}
    self.status[msg.node] = record
def _startMaster(self):
    """Start the master heartbeat and schedule the slaves watchdog two seconds later.

    If the state inherited from the previous master is "recovery", it is
    forced back to normal so that failover can run again.
    """
    def onWatchdogFailure(reason):
        log.emerg("Slave heartbeat checks failed: %s." % (reason.getErrorMessage()))
        self.panic()

    def launchWatchdog():
        # Nothing to do when the watchdog is already running
        if self.l_masterDog.running:
            return
        watching = self.l_masterDog.start(MasterService.TM_WATCHDOG)
        watching.addErrback(onWatchdogFailure)
        watching.addErrback(log.err)

    # Start master heartbeat
    self.s_masterHb.startService()

    # Check state of previous master
    if self.state == MasterService.ST_RECOVERY:
        log.warn("Previous master was recovering something: re-enabling failover.")
        # Force normal mode to re-run failover
        self.state = MasterService.ST_NORMAL

    # Start master's watchdog for slaves failover
    reactor.callLater(2, launchWatchdog)
class UDPListener(DatagramProtocol):
    """Datagram protocol decoding the JSON messages it receives.

    When USE_ZLIB is set, a datagram is expected as ``<adler32>,<zlib data>``
    and its checksum is verified before decoding.
    """

    def __init__(self, onReceive):
        # Callback supplied by the owner of this listener
        self.c_onReceive = onReceive

    def datagramReceived(self, data, addr):
        """Decode one datagram; a malformed or corrupted one is only logged.

        `addr` is the (host, port) pair of the sender. It is unpacked here
        because tuple parameters are not valid syntax on Python 3.
        """
        host, port = addr
        try:
            if USE_ZLIB:
                # `payload` rather than `zip`, which shadowed the builtin
                (crc, payload) = data.split(',', 1)
                data = zlib.decompress(payload)
                if int(crc) != zlib.adler32(data):
                    raise Exception("Data from %s is corrupted." % (host))
            # NOTE(review): msg is not used and c_onReceive is never called in
            # the code visible here.
            msg = json.loads(data)
        except Exception as e:
            log.warn("Error parsing message: %s" % (e))
def updateMasterStatus(self, msg):
    """Process a master heartbeat.

    The first heartbeat seen designates the master. An active master
    receiving a heartbeat from another master triggers an election (and
    panics if that other master is in panic mode). A passive node keeps a
    copy of the status and state sent by its own master and refreshes the
    time the master was last seen.
    """
    sender = msg.node

    if self.master is None:
        self.master = sender
        log.info("Found master at %s." % (self.master))
    elif sender not in self.status:
        # Check origin of message if we known the cluster members
        log.warn("Received master heartbeat from unknown node %s." %
                 (msg.node))
        return

    # Active master's checks
    if self.role == MasterService.RL_ACTIVE:
        # Our own master heartbeat is simply discarded
        if self.master != sender:
            # Usecase #8: partition ended with many master
            log.warn(
                "Received another master's heartbeat from %s ! Trying to recover from partition..."
                % (msg.node))
            self.triggerElection().addErrback(log.err)

            # Propagate panic mode from another master
            if msg.state == MasterService.ST_PANIC:
                log.warn(
                    "Concurrent master is in panic mode, so we should be too."
                )
                self.panic()
        return

    # Passive master's checks
    if self.master != sender:
        log.warn("Received master heartbeat from a wrong master %s !" %
                 (msg.node))
        return

    # Check error mode change to panic
    if not self.isInPanic() and msg.state == MasterService.ST_PANIC:
        log.emerg("SYSTEM FAILURE: Panic mode has been engaged by master.")

    # Keep a backup of the active master's state and status
    self.status = msg.status
    self.state = msg.state
    self.masterLastSeen = int(time.time())
def updateMasterStatus(self, msg):
    """Process a master heartbeat.

    The first heartbeat seen designates the master. An active master
    receiving a heartbeat from another master triggers an election (and
    panics if that other master is in panic mode). A passive node keeps a
    copy of the status and state sent by its own master and refreshes the
    time the master was last seen.
    """
    sender = msg.node

    if self.master is None:
        self.master = sender
        log.info("Found master at %s." % (self.master))
    elif sender not in self.status:
        # Check origin of message if we known the cluster members
        log.warn("Received master heartbeat from unknown node %s." % (msg.node))
        return

    # Active master's checks
    if self.role == MasterService.RL_ACTIVE:
        # Our own master heartbeat is simply discarded
        if self.master != sender:
            # Usecase #8: partition ended with many master
            log.warn("Received another master's heartbeat from %s ! Trying to recover from partition..." % (msg.node))
            self.triggerElection().addErrback(log.err)

            # Propagate panic mode from another master
            if msg.state == MasterService.ST_PANIC:
                log.warn("Concurrent master is in panic mode, so we should be too.")
                self.panic()
        return

    # Passive master's checks
    if self.master != sender:
        log.warn("Received master heartbeat from a wrong master %s !" % (msg.node))
        return

    # Check error mode change to panic
    if not self.isInPanic() and msg.state == MasterService.ST_PANIC:
        log.emerg("SYSTEM FAILURE: Panic mode has been engaged by master.")

    # Keep a backup of the active master's state and status
    self.status = msg.status
    self.state = msg.state
    self.masterLastSeen = int(time.time())
def recordVote(self, msg):
    """Store a received vote in the ballot box, keyed by its ballot.

    The vote is discarded when it comes from an unknown node, when no
    election is running, or when it belongs to another election.
    """
    voter = msg.node

    # Check origin of message
    if voter not in self.status:
        log.warn("Vote received from unknown node %s." % (msg.node))
        return

    electionRunning = self.role == MasterService.RL_VOTING
    if not electionRunning:
        log.warn("Vote received from %s but it's not election time !" % (msg.node))
        return

    if self.currentElection != msg.election:
        log.warn("Vote received for another election from %s. Discarding." % (msg.node))
        return

    self.ballotBox[msg.ballot] = voter
def unregisterNode(self, name):
    """Remove node `name` from the cluster on its request.

    Raise a RPCRefusedError if we are not the master.
    Raise a NodeRefusedError if the node is unknown or is the master itself.
    """
    # Can unregister node even if in panic mode
    isMaster = self.role == MasterService.RL_ACTIVE
    if not isMaster:
        log.warn("I'm not master. Cannot unregister %s." % (name))
        raise RPCRefusedError("Not master")

    knownNode = name in self.status
    if not knownNode:
        log.warn("Unknown node %s try to quit the cluster." % (name))
        raise NodeRefusedError("Unknown node "+name)

    if DNSCache.getInstance().name == name:
        log.warn("I'm the master. Cannot self unregister.")
        raise NodeRefusedError("Cannot unregister master")

    self._unregister(name)
def unregisterNode(self, name):
    """Remove node `name` from the cluster on its request.

    Raise a RPCRefusedError if we are not the master.
    Raise a NodeRefusedError if the node is unknown or is the master itself.
    """
    # Can unregister node even if in panic mode
    isMaster = self.role == MasterService.RL_ACTIVE
    if not isMaster:
        log.warn("I'm not master. Cannot unregister %s." % (name))
        raise RPCRefusedError("Not master")

    knownNode = name in self.status
    if not knownNode:
        log.warn("Unknown node %s try to quit the cluster." % (name))
        raise NodeRefusedError("Unknown node " + name)

    if DNSCache.getInstance().name == name:
        log.warn("I'm the master. Cannot self unregister.")
        raise NodeRefusedError("Cannot unregister master")

    self._unregister(name)
def render(self, context):
    """Render the template named ``self._name`` and return the resulting text.

    The data given to the library is the resolved context variable when one
    is set, otherwise the simplified `context`. An empty result is only
    reported with a warning. Any error is logged and '' is returned.
    """
    try:
        variable = self._context_variable
        if variable is not None:
            data = variable.resolve(context)
        else:
            data = self._simplify_context(context)

        output = self._library.render(self._name, data)

        is_blank = len(output.strip()) == 0
        if is_blank:
            logs.warn("%s.render warning empty result (%s)" % (self, self._name))

        return output
    except Exception as e:
        logs.warn("%s.render error (%s): %s" % (self, self._name, e))
        logs.warn(utils.getFormattedException())
        return ''
def recordVote(self, msg):
    """Store a received vote in the ballot box, keyed by its ballot.

    The vote is discarded when it comes from an unknown node, when no
    election is running, or when it belongs to another election.
    """
    voter = msg.node

    # Check origin of message
    if voter not in self.status:
        log.warn("Vote received from unknown node %s." % (msg.node))
        return

    electionRunning = self.role == MasterService.RL_VOTING
    if not electionRunning:
        log.warn("Vote received from %s but it's not election time !" %
                 (msg.node))
        return

    if self.currentElection != msg.election:
        log.warn(
            "Vote received for another election from %s. Discarding." %
            (msg.node))
        return

    self.ballotBox[msg.ballot] = voter
def panic(self, noCheck=False):
    """
    Engage panic mode.
    Use noCheck=True if you want to panic whatever the cluster role.

    What happens depends on the current role: the active master (or any node
    when noCheck is set) engages panic mode itself, a voting node records the
    request in self.panicRequested, and a slave asks the master to panic.

    Raise a RPCRefusedError in any other role.
    """
    def panicFailed(reason):
        # The master could not be asked: panic locally, whatever our role
        log.emerg("Panic query failed: %s." % (reason.getErrorMessage()))
        self.panic(True)

    if self.state == MasterService.ST_PANIC:
        log.emerg("Panic mode already engaged.")
        return

    role = self.role

    if noCheck or role == MasterService.RL_ACTIVE:
        log.emerg("SYSTEM FAILURE: Panic mode engaged.")
        log.emerg("This is a critical error. You should bring your ass over here, right now.")
        log.emerg("Please check logs and be sure of what you're doing before re-engaging normal mode.")
        self.state = MasterService.ST_PANIC

        # TODO + stop LB
        if self.l_masterDog.running:
            self.l_masterDog.stop()
        return

    if role == MasterService.RL_VOTING:
        # No master during election stage: waiting next master
        log.warn("Panic mode requested during election stage: delaying.")
        self.panicRequested = True
        return

    if role == MasterService.RL_PASSIVE:
        log.warn("I'm slave: asking master to engage panic mode...")
        query = Agent().panic()
        query.addErrback(panicFailed)
        query.addErrback(log.err)
        return

    # RL_ALONE or RL_JOINING or RL_LEAVING
    log.warn("I'm not in a running state (master or slave). Cannot engage panic mode.")
    raise RPCRefusedError("Not in running state")
def panic(self, noCheck=False):
    """
    Engage panic mode.
    Use noCheck=True if you want to panic whatever the cluster role.

    What happens depends on the current role: the active master (or any node
    when noCheck is set) engages panic mode itself, a voting node records the
    request in self.panicRequested, and a slave asks the master to panic.

    Raise a RPCRefusedError in any other role.
    """
    def panicFailed(reason):
        # The master could not be asked: panic locally, whatever our role
        log.emerg("Panic query failed: %s." % (reason.getErrorMessage()))
        self.panic(True)

    if self.state == MasterService.ST_PANIC:
        log.emerg("Panic mode already engaged.")
        return

    role = self.role

    if noCheck or role == MasterService.RL_ACTIVE:
        log.emerg("SYSTEM FAILURE: Panic mode engaged.")
        log.emerg("This is a critical error. You should bring your ass over here, right now.")
        log.emerg("Please check logs and be sure of what you're doing before re-engaging normal mode.")
        self.state = MasterService.ST_PANIC

        # TODO + stop LB
        if self.l_masterDog.running:
            self.l_masterDog.stop()
        return

    if role == MasterService.RL_VOTING:
        # No master during election stage: waiting next master
        log.warn("Panic mode requested during election stage: delaying.")
        self.panicRequested = True
        return

    if role == MasterService.RL_PASSIVE:
        log.warn("I'm slave: asking master to engage panic mode...")
        query = Agent().panic()
        query.addErrback(panicFailed)
        query.addErrback(log.err)
        return

    # RL_ALONE or RL_JOINING or RL_LEAVING
    log.warn("I'm not in a running state (master or slave). Cannot engage panic mode.")
    raise RPCRefusedError("Not in running state")
import vars_init vars_init.init(targets) import vars, state, builder, jwack from logs import warn, err try: if vars_init.is_toplevel: builder.start_stdin_log_reader(status=opt.status, details=opt.details, pretty=opt.pretty, color=opt.color, debug_locks=opt.debug_locks, debug_pids=opt.debug_pids) for t in targets: if os.path.exists(t): f = state.File(name=t) if not f.is_generated: warn('%s: exists and not marked as generated; not redoing.\n' % f.nicename()) state.rollback() j = atoi(opt.jobs or 1) if j < 1 or j > 1000: err('invalid --jobs value: %r\n' % opt.jobs) jwack.setup(j) try: assert(state.is_flushed()) retcode = builder.main(targets, lambda t: (True, True)) assert(state.is_flushed()) finally: try: state.rollback() finally: try:
def joinCluster(self):
    """Enter the cluster, either by creating it or by registering to its master.

    When self.master is None this node becomes the active master, provided
    it is listed in ALLOWED_NODES and the heartbeat disk is not in use.
    Otherwise it connects to the master and asks to be registered; the
    outcome is handled asynchronously by the nested callbacks.

    Any exception raised during startup is logged and the service is stopped.
    """
    def startHeartbeats():
        self._startSlave()
        self.s_rpc.startService()
        if self.role == MasterService.RL_ACTIVE:
            self._startMaster()

    def joinAccepted(result):
        self.role = MasterService.RL_PASSIVE
        log.info("Join successfull, I'm now part of cluster %s." % (core.cfg['CLUSTER_NAME']))
        startHeartbeats()

    def joinRefused(reason):
        # Only refusals are handled here, other failures go to the next errback
        reason.trap(NodeRefusedError, RPCRefusedError)
        log.err("Join to cluster %s failed: Master %s has refused me: %s" %
                (core.cfg['CLUSTER_NAME'], self.master, reason.getErrorMessage()))
        self.stopService()

    def masterConnected(obj):
        registration = obj.callRemote("register", DNSCache.getInstance().name)
        registration.addCallbacks(joinAccepted, joinRefused)
        registration.addErrback(log.err)
        # Whatever the outcome, the connection to the master is closed
        registration.addBoth(lambda _: rpcConnector.disconnect())
        return registration

    try:
        if self.master is not None:
            # Passive master
            self.role = MasterService.RL_JOINING
            log.info("Trying to join cluster %s..." % (core.cfg['CLUSTER_NAME']))

            factory = pb.PBClientFactory()
            rpcConnector = reactor.connectTCP(self.master, core.cfg['TCP_PORT'], factory)
            rootObject = factory.getRootObject()
            rootObject.addCallback(masterConnected)
            rootObject.addErrback(log.err)
        else:
            # New active master
            localName = DNSCache.getInstance().name

            if localName not in core.cfg['ALLOWED_NODES']:
                log.warn("I'm not allowed to create a new cluster. Exiting.")
                raise Exception("Cluster creation not allowed")

            if DiskHeartbeat.is_in_use():
                log.err("Heartbeat disk is in use but we are alone !")
                raise Exception("Heartbeat disk already in use")

            log.info("No master found. I'm now the new master of %s." % (core.cfg['CLUSTER_NAME']))
            self.role = MasterService.RL_ACTIVE
            self.master = localName
            self.status[self.master] = {'timestamp': 0, 'offset': 0, 'vms': []}
            self.disk.make_slot(localName)
            startHeartbeats()
    except Exception as e:
        log.err("Startup failed: %s. Shutting down." % (e))
        self.stopService()
def invalidHostname(reason):
    """Refuse a node whose name was rejected.

    'name' is not a parameter: it is read from the enclosing scope.
    Presumably used as a Twisted errback (reason looks like a Failure) - confirm.

    Raise a NodeRefusedError carrying the error message of 'reason'.
    """
    warning = "Node %s has an invalid name. Refusing." % (name)
    log.warn(warning)
    raise NodeRefusedError(reason.getErrorMessage())
def start_vms(self, vmnames): """ Start the specified list of VM on the cluster, one after the other. Nodes are choosen with a best-fit decreasing algorithm, so the cluster will not be balanced, but optimized for full-load. This function is error-proof: if a vm start to fail, it will try to start others vms and report all errors only at the end. vmnames - (List of String) VM hostnames Raise a MultipleError if one of many errors are detected. """ assert type(vmnames) == list, "Param 'vmnames' should be a list." # Get nodes pool=self.get_nodes() # Sort VMs to be started by ram vms=[ VM(name) for name in vmnames ] vms.sort(key=lambda x: x.get_start_ram(), reverse=True) failed=dict() for vm in vms: selected_node=None # Check if vm is already started somewhere nodes=self.search_vm_started(vm.name) if(len(nodes)>0): log.info("%s is already started on %s." % (vm.name, ", ".join([n.get_hostname() for n in nodes]))) continue # Sort nodes by free ram pool.sort(key=lambda x: x.metrics.get_free_ram(False)) for node in pool: if node.metrics.get_free_ram() >= vm.get_start_ram(): selected_node=node break # Select first node with enough space if selected_node is None: # Not enough room for this one failed[vm.name]=NotEnoughRamError("this cluster", "Cannot start "+vm.name) continue # Next ! log.info("Starting", vm.name, "on", selected_node.get_hostname()) # Start the vm try: self.activate_vm(node,vm.name) try: selected_node.start(vm.name) except SystemExit, e: # SystemExit are raised by Xen when xm_create fail node.deactivate_lv(vm.name) failed[vm.name]=XenError(node.get_hostname(), str(e)) except Exception, e: node.deactivate_lv(vm.name) failed[vm.name]=e else: try: selected_node.enable_vm_autostart(vm.name) except Exception, e: # Don't report failure as an error, autostart link is not important log.warn("Cannot enable autostart for %s : %s" % (vm.name, e))
def registerNode(self, name):
    """Check whether the node 'name' may join the cluster.

    name - (String) hostname of the joining node

    Raise a RPCRefusedError if we are in panic mode or not the active master.
    Raise a NodeRefusedError if the node is not in ALLOWED_NODES or has
    already joined.
    """
    # NOTE(review): the two callbacks below are defined but never attached to
    # anything in the visible code - presumably the hostname check that uses
    # them follows the last test; confirm against the full file.
    def validHostname(result):
        # Reserve the disk heartbeat slot before declaring the node a member
        try:
            self.disk.make_slot(name)
        except DiskHeartbeatError as e:
            raise NodeRefusedError("Disk heartbeat failure: %s" % (e))

        self.status[name] = {'timestamp': 0, 'offset': 0, 'vms': []}
        log.info("Node %s has joined the cluster." % (name))

    def invalidHostname(reason):
        log.warn("Node %s has an invalid name. Refusing." % (name))
        raise NodeRefusedError(reason.getErrorMessage())

    if self.isInPanic():
        log.warn("I'm in panic. Cannot register %s." % (name))
        raise RPCRefusedError("Panic mode engaged")

    iAmMaster = (self.role == MasterService.RL_ACTIVE)
    if not iAmMaster:
        log.warn("I'm not master. Cannot register %s." % (name))
        raise RPCRefusedError("Not master")

    isAllowed = name in core.cfg['ALLOWED_NODES']
    if not isAllowed:
        log.warn("Node %s not allowed to join this cluster. Refusing." % (name))
        raise NodeRefusedError("Node not allowed to join this cluster.")

    alreadyJoined = name in self.status
    if alreadyJoined:
        log.warn("Node %s is already joined ! Cannot re-join." % (name))
        raise NodeRefusedError("Node already in cluster")
class XenCluster:
    """This class is used to perform actions on the xen cluster."""

    def __init__(self, nodes):
        """This should be private. Use getDeferInstance() instead.

        nodes - (dict) Node objects indexed by hostname
        """
        assert isinstance(nodes, dict), "Param 'nodes' should be a dict."
        self.nodes = nodes

    @staticmethod
    def getDeferInstance(nodeslist=None):
        """Instantiate a XenCluster object and associated Nodes.

        One Node is built per hostname, each in its own thread.

        It takes a (string) list of node's hostnames as optional argument;
        if not given, the list is fetched from the local master.

        Return a deferred that will be fired with the XenCluster when all
        nodes are ready. If a node cannot be instantiated, the deferred
        fails with an InstantiationError holding the error message of every
        failed node, indexed by hostname.
        """
        log.info("Loading cluster...")
        nodes = dict()

        def add_node(node, hostname):
            nodes[hostname] = node

        def create_nodes(hostnames):
            # Frozen copy: the list is walked again once all nodes are built
            hostnames = list(hostnames)

            def instantiate(results):
                # DeferredList reports results in the order of its deferreds,
                # which is the order of 'hostnames'. Pairing them works for
                # both sources of hostnames; indexing 'nodeslist' failed with
                # a TypeError when the list came from the master.
                failedNodes = dict()
                for hostname, (success, result) in zip(hostnames, results):
                    if not success:
                        failedNodes[hostname] = result.getErrorMessage()

                if len(failedNodes) > 0:
                    raise InstantiationError(failedNodes)

                return XenCluster(nodes)

            ds = list()
            for hostname in hostnames:
                d = threads.deferToThread(Node, hostname)
                d.addCallback(add_node, hostname)
                ds.append(d)

            dl = defer.DeferredList(ds, consumeErrors=1)
            dl.addCallback(instantiate)
            return dl

        def failed(reason):
            raise Exception("Can't connect to local master: %s" % reason.getErrorMessage())

        if not nodeslist:
            agent = Agent()
            d = agent.getNodesList()
            d.addCallback(create_nodes)
            # NOTE(review): being added after create_nodes, this errback also
            # receives node instantiation failures and reports them as a
            # master connection problem - confirm this is intended.
            d.addErrback(failed)
            return d
        else:
            return create_nodes(nodeslist)

    def disconnect(self):
        """Close all connections."""
        for node in self.get_nodes():
            node.disconnect()

    def get_nodes(self):
        """Fetch the current active nodes.

        Return a list of Node objects.
        """
        return list(self.nodes.values())

    def get_node(self, hostname):
        """Return the Node object of the specified hostname.

        Raise a NotInClusterError if the given hostname is not a cluster's node.
        """
        try:
            return self.nodes[hostname]
        except KeyError:
            raise NotInClusterError(hostname)

    def get_local_node(self):
        """Return the Node object of the local node.

        Raise a NotInClusterError if the local node is not a cluster's node.
        """
        return self.get_node(socket.gethostname())

    def get_load(self):
        """
        Return a deferred fired with the global load of the cluster, in percentage.
        This load is computed using ram capacities.
        If load is higher than 100%, cluster is overloaded and cannot do failover.
        """
        def computeLoad(result):
            # The load is computed without the bigger node
            # and so takes into account a failure of one node.
            try:
                return (sum(result['used']) * 100) / (sum(result['total']) - max(result['total']))
            except ZeroDivisionError:
                # Just one node: cannot do failover
                return 100

        d = self.get_ram_details()
        d.addCallback(computeLoad)
        return d

    def get_ram_details(self):
        """Return a deferred fired with a dict of lists holding the free, used
        and total ram of each node of the cluster.

        Units: MB
        """
        def getValues(node):
            return node.get_metrics().get_ram_infos()

        def appendValues(results):
            used = list()
            free = list()
            total = list()
            for success, result in results:
                if success:
                    used.append(result['used'])
                    free.append(result['free'])
                    total.append(result['total'])
                else:
                    # One failing node makes the whole query fail
                    raise result
            return {'total': total, 'free': free, 'used': used}

        ds = list()
        for node in self.get_nodes():
            d = threads.deferToThread(getValues, node)
            ds.append(d)

        dl = defer.DeferredList(ds, consumeErrors=True)
        dl.addCallback(appendValues)
        return dl

    def get_vm_started(self):
        """Return a deferred fired with the number of vm started in the cluster."""
        def computeNumber(results):
            nb = 0
            for success, result in results:
                if success:
                    nb += result
                else:
                    # One failing node makes the whole query fail
                    raise result
            return nb

        ds = list()
        for node in self.get_nodes():
            d = threads.deferToThread(lambda x: x.get_vm_started(), node)
            ds.append(d)

        dl = defer.DeferredList(ds, consumeErrors=True)
        dl.addCallback(computeNumber)
        return dl

    def shutdown_all(self, hard=False):
        """Shutdown all running vm on the cluster.

        If 'hard' is True, do a hard shutdown (destroy).

        Return a DeferredList with one entry per node.
        """
        ds = list()
        for node in self.get_nodes():
            d = threads.deferToThread(lambda x: x.shutdown_all(hard), node)
            ds.append(d)

        dl = defer.DeferredList(ds, consumeErrors=True)
        return dl

    def is_in_cluster(self, hostname):
        """Return True if the specified hostname is a node of the cluster."""
        return hostname in self.nodes

    def search_vm_started(self, vmname):
        """Search where the specified vm hostname is running.

        Return a list of Node where the VM is running.
        """
        return [node for node in self.get_nodes() if node.is_vm_started(vmname)]

    def search_vm_autostart(self, vmname):
        """Search where the specified vm hostname has an autostart link.

        Return a list of Node where the autostart link is present.
        """
        return [node for node in self.get_nodes() if node.is_vm_autostart_enabled(vmname)]

    def activate_vm(self, selected_node, vmname):
        """Activate all the LVM logical volumes of the specified VM exclusively on the selected node.

        selected_node - (Node) Node where to activate the LVs
        vmname - (String) hostname of the vm

        Raise a RunningVmError if the VM is running.
        """
        # Exclusive activation: every node releases the LVs first
        for node in self.get_nodes():
            if node.is_vm_started(vmname):
                raise RunningVmError(node.get_hostname(), vmname)
            else:
                node.deactivate_lv(vmname)

        selected_node.activate_lv(vmname)

    def start_vm(self, node, vmname, console):
        """Start the specified VM on the given node.

        If there is not enough ram on the given node, the VM will be started
        on the node with the highest free ram and the autostart link will be
        updated accordingly.

        node - (Node) Selected host
        vmname - (String) VM hostname
        console - (boolean) Attach console to the domain

        Raise a NotEnoughRamError if no node has enough free ram.
        """
        # Resources checks
        needed_ram = VM(vmname).get_ram()
        free_ram = node.metrics.get_free_ram()

        # Stays None unless another node than the requested one is selected
        old_node = None

        if needed_ram > free_ram:
            # Not enough ram, switching to another node
            old_node = node

            # Get the node with the highest free ram
            pool = sorted(self.get_nodes(), key=lambda x: x.metrics.get_free_ram(), reverse=True)
            node = pool[0]

            # Last resources checks
            free_ram = node.metrics.get_free_ram()
            if needed_ram > free_ram:
                raise NotEnoughRamError(node.get_hostname(), "need "+str(needed_ram)+"M, has "+str(free_ram)+"M.")

            log.info(" -> Not enough ram, starting it on %s." % node.get_hostname())

        # Start the VM
        self.activate_vm(node, vmname)
        try:
            node.start(vmname)
        except Exception:
            # Release the LVs, then let the original error (and its traceback) through
            node.deactivate_lv(vmname)
            raise

        # Update autostart link only if another node has been selected
        if old_node is not None:
            old_node.disable_vm_autostart(vmname)
            node.enable_vm_autostart(vmname)

        # Attach to the console without forking
        if console:
            if node.is_local_node():
                node.get_vm(vmname).attach_console()
            else:
                log.warn("Cannot attach console when using remote Xen-API.")
raise except NotInClusterError: # Next step of recovery process pass except Exception, e: log.err("Cannot get the VMs back:", e) if partial_failure: # Cannot recover, node still alive return False # Check if VM are still alive if len(vm_list)>0: for node in self.get_nodes(): if node.ping(vm_list): log.warn("Some VM on %s are still alive !" % (name)) return False log.warn("All VM on %s are dead. Fencing now !" % (name)) else: log.warn("No VM running on %s. Fencing now !" % (name)) self.get_local_node().fence(name) # Remove fenced node from current cluster instance if name in self.nodes.keys(): del self.nodes[name] log.info("Restarting dead VM from %s on healthy nodes..." % (name)) self.start_vms(vm_list) return True
def processEnded(self, reason):
    """Log the death of the inotify process and stop the reactor.

    reason - object describing why the process ended; only its 'value'
             attribute is read, for the log message
    """
    log.warn("Inotify has died: %s" % (reason.value))

    # Best effort: stopping may fail, e.g. if the reactor is not running.
    # Only regular errors are ignored; the former bare 'except:' also
    # swallowed KeyboardInterrupt and SystemExit.
    try:
        reactor.stop()
    except Exception:
        pass
def warn_override(name):
    """Warn that 'name' is skipped because it was modified by the user."""
    message = '%s - you modified it; skipping\n' % name
    warn(message)
def NEWsearchUsers(self, authUserId, query, limit=0, relationship=None):
    """Search users by name or screen name, optionally within a relationship.

    authUserId   - id of the user whose followers / friends restrict the search
    query        - search text; it is lowercased and everything matched by
                   self._valid_re is removed
    limit        - passed as 'size' to the search backend
    relationship - None (no restriction), 'followers' or 'following'

    Return a list of users without duplicates: the exact screen name match
    first (if any), then the search hits in result order. Return [] when the
    cleaned query is empty, when the relationship contains nobody, or when
    the search results cannot be processed.

    Raise a StampedInvalidRelationshipError for any other relationship.
    """
    query = self._valid_re.sub('', query.lower())

    if len(query) == 0:
        return []

    users = []
    seen = set()

    # None means "no restriction"; a set means "only these user ids"
    domain = None

    if relationship is not None:
        if relationship == 'followers':
            domain = self.followers_collection.getFollowers(authUserId)
        elif relationship == 'following':
            domain = self.friends_collection.getFriends(authUserId)
        else:
            raise StampedInvalidRelationshipError("invalid relationship")

        domain = set(domain)

        if not domain:
            # Nobody to search among. The former 'if domain:' test skipped
            # the filter here and returned hits from all users instead.
            return []

    # An exact screen name match always comes first
    try:
        user = self.getUserByScreenName(query)

        if user is not None and (domain is None or user.user_id in domain):
            seen.add(user.user_id)
            users.append(user)
    except Exception as e:
        logs.warning("Exact user match not found for '%s': %s" % (query, e))

    q = StringQuery(query, default_operator="AND", search_fields=[ "name", "screen_name" ])

    # Boost the text score with the (log of the) number of stamps and friends
    q = CustomScoreQuery(q, lang="mvel", script="""
        ns = doc.?num_stamps.value;
        ns = (ns != null) ? log(ns) : 0;
        nf = doc.?num_friends.value;
        nf = (nf != null) ? log(nf) : 0;
        return _score + ns / 4.0 + nf / 8.0
    """)

    if domain is not None:
        q = FilteredQuery(q, IdsFilter('user', list(domain)))

    results = self.api._elasticsearch.search(q, indexes = [ 'users' ], doc_types = [ 'user' ], size = limit)
    utils.log(pformat(results))

    try:
        user_ids = [hit['_id'] for hit in results['hits']['hits']]
        id_user = dict((found.user_id, found) for found in self.lookupUsers(user_ids))

        # Keep the order of the hits, skipping the exact match already added
        for user_id in user_ids:
            if user_id not in seen:
                seen.add(user_id)
                users.append(id_user[user_id])
    except Exception:
        logs.warn("received invalid results from pyes")
        logs.warn(pformat(results))
        return []

    return users