def startManage(self):
    """Run one management cycle and re-arm the timer if another cycle is due.

    Steps, in order:

    1. Call ``manage()`` on the requirement box, the site box and the
       integration box.
    2. Build a ``MachineStatus(required, running)`` entry for every machine
       type that is either running somewhere or currently required.
    3. Let the broker decide, let the site box adjust the decision for
       service machines, and convert the (relative) decision into absolute
       machine counts by adding the number of running machines.
    4. Apply the decision, dump the machine registry and write the JSON log.
    5. Increase the iteration counter and start the management timer again
       unless ``autoRun`` is off or ``maximumManageIterations`` is reached.
    """
    logger.info("----------------------------------")
    logger.info("Management cycle triggered")
    logger.info("Time: %s" % datetime.today().strftime("%Y-%m-%d %H:%M:%S"))

    # regular management
    self.reqBox.manage()
    self.siteBox.manage()
    self.intBox.manage()

    # scaling
    mReq = self.reqBox.getMachineTypeRequirement()
    logger.info("Current requirement: %s" % mReq)

    siteInfo = self.siteBox.siteInformation
    runningBySite = self.siteBox.runningMachinesCount

    # contains a list of all machine types merged
    runningOverall = summarize_dicts(list(runningBySite.values()))

    machStat = dict()
    for (key_, value_) in runningOverall.items():
        machStat[key_] = MachineStatus(mReq.get(key_, 0), value_)
    # machine types that are required but not running anywhere yet
    for key_ in mReq:
        if key_ not in machStat:
            machStat[key_] = MachineStatus(mReq.get(key_, 0), 0)

    decision = self.broker.decide(machStat, siteInfo.values())
    # Service machines may modify site decision(s).
    decision = self.siteBox.modServiceMachineDecision(decision)

    logger.info("Decision: %s" % decision)
    logger.debug(runningBySite)

    # make machine counts absolute, as they come in relative from the broker
    for (ksite, vmach) in decision.items():
        logger.debug("vmatch=%s" % vmach)
        for kmach in vmach:
            # A machine type that is not running at this site yet counts as 0.
            # (The former default of [] made "int += list" raise TypeError.)
            decision[ksite][kmach] += runningBySite[ksite].get(kmach, 0)
            logger.debug("decision[ksite][kmach]=%s" % decision[ksite][kmach])

    logger.info("Absolute Decision: %s" % decision)
    self.siteBox.applyMachineDecision(decision)

    logger.info(self.mr.getMachineOverview())
    MachineRegistryLogger.dump(self.mr.machines)
    log = JsonLog()
    log.writeLog()

    self.manageIterations += 1

    lastIteration = False
    if self.maximumManageIterations is not None:
        lastIteration = self.maximumManageIterations <= self.manageIterations

    if self.autoRun is True and lastIteration is False:
        self.startManagementTimer()
def startManage(self):
    """Run one management cycle and re-arm the timer if another cycle is due.

    The requirement, site and integration boxes are managed first. Afterwards
    the broker is asked for a scaling decision based on the required and the
    running machine counts per machine type. The site box may adjust that
    decision for service machines. The decision arrives relative to the
    current state and is turned into absolute counts before it is applied.
    Finally the machine registry is dumped, the JSON log is written and the
    management timer is started again if ``autoRun`` is set and
    ``maximumManageIterations`` has not been reached.
    """
    logger.info("----------------------------------")
    logger.info("Management cycle triggered")
    logger.info("Time: %s" % datetime.today().strftime("%Y-%m-%d %H:%M:%S"))

    # regular management
    self.reqBox.manage()
    self.siteBox.manage()
    self.intBox.manage()

    # scaling
    mReq = self.reqBox.getMachineTypeRequirement()
    logger.info("Current requirement: %s" % mReq)

    siteInfo = self.siteBox.siteInformation
    runningBySite = self.siteBox.runningMachinesCount

    # contains a list of all machine types merged
    runningOverall = summarize_dicts(list(runningBySite.values()))

    machStat = dict()
    for (key_, value_) in runningOverall.items():
        machStat[key_] = MachineStatus(mReq.get(key_, 0), value_)
    # machine types that are required but not running anywhere yet
    for key_ in mReq:
        if key_ not in machStat:
            machStat[key_] = MachineStatus(mReq.get(key_, 0), 0)

    decision = self.broker.decide(machStat, siteInfo.values())
    # Service machines may modify site decision(s).
    decision = self.siteBox.modServiceMachineDecision(decision)

    logger.info("Decision: %s" % decision)

    # make machine counts absolute, as they come in relative from the broker
    for (ksite, vmach) in decision.items():
        for kmach in vmach:
            # Missing machine types count as 0 running machines; the former
            # default of [] raised TypeError when added to an integer.
            decision[ksite][kmach] += runningBySite[ksite].get(kmach, 0)

    logger.info("Absolute Decision: %s" % decision)
    self.siteBox.applyMachineDecision(decision)

    logger.info(self.mr.getMachineOverview())
    MachineRegistryLogger.dump(self.mr.machines)
    log = JsonLog()
    log.writeLog()

    self.manageIterations += 1

    lastIteration = False
    if self.maximumManageIterations is not None:
        lastIteration = self.maximumManageIterations <= self.manageIterations

    if self.autoRun is True and lastIteration is False:
        self.startManagementTimer()
def manage(self):
    """Synchronise machine registry states with the 1and1 cloud site.

    Run once per cycle. The method

    * fetches the machines known to 1and1 and returns silently if that
      request raises,
    * walks over all registry machines of this site, removes those that
      1and1 no longer lists and drives the state changes for machines in
      status booting, working / pending disintegration, disintegrating,
      disintegrated and down (integrating machines are skipped here),
    * registers every 1and1 machine that is not in the registry yet as a
      booting machine,
    * writes the current machine counts to the logger and the JSON log.

    If a call that changes something at 1and1 (network assignment, power
    on/off, delete) raises, the registry loop is aborted for this cycle.

    :return:
    """
    try:
        oao_machines = self.getOneAndOneMachines()
    except Exception:
        # best effort: without an answer from 1and1 nothing can be managed
        return

    # loop over all machines in machine registry
    for mid in self.mr.getMachines(self.siteName):
        machine = self.mr.machines[mid]

        # remove the corresponding machine from the 1and1 machine list;
        # whatever is left afterwards is unknown to the registry
        try:
            oao_machine = oao_machines.pop(machine[self.reg_site_server_id])
        except KeyError:
            self.mr.removeMachine(mid)
            continue

        status = machine[self.mr.regStatus]

        # check for status which is handled by integration adapter
        if status in [self.mr.statusIntegrating]:
            continue

        # manage machine in status booting
        if status == self.mr.statusBooting:
            oao_state = oao_machine[self.status][self.state]
            # if the 1and1 machine is in status powered off
            if oao_state == self.state_powered_off:
                # check if a private network is assigned
                # if not then assign the right network
                if self.network not in oao_machine:
                    try:
                        self.assignPrivateNetwork(mid=mid)
                    except Exception:
                        break
                # if the private network is assigned to the 1and1 machine, add it to the machine registry
                elif self.reg_site_server_network not in machine:
                    # NOTE(review): this reads the network from the registry entry
                    # ("machine"), not from "oao_machine" - confirm this is intended.
                    machine[self.reg_site_server_network] = machine[self.network]
                # if everything is done, start the machine
                else:
                    try:
                        self.modifyMachineStatus(mid=mid, action=self.command_power_on)
                    except Exception:
                        break
                    machine[self.reg_site_server_status] = self.state_powering_on
            # if the 1and1 machine is powered on, update the ip address, state and the condor name
            # at the end update the machine status in the machine registry
            elif oao_state == self.state_powered_on:
                machine[self.reg_site_server_status] = self.state_powered_on
                machine[self.reg_site_server_ip] = oao_machine[self.ips][0][self.ip]
                machine[self.reg_site_server_condor_name] = self.getCondorName(mid=mid)
                self.mr.updateMachineStatus(mid=mid, newStatus=self.mr.statusUp)

        # manage machine in status working or pending disintegration
        elif status == self.mr.statusWorking or status == self.mr.statusPendingDisintegration:
            oao_state = oao_machine[self.status][self.state]
            # if the 1and1 machine is powered on and the current time lies between
            # the configured end time and start time, move the machine to disintegrating
            if oao_state == self.state_powered_on:
                start_time = datetime.datetime.strptime(self.getConfig(self.configTimeStart), "%H:%M").time()
                stop_time = datetime.datetime.strptime(self.getConfig(self.configTimeEnd), "%H:%M").time()
                current_time = datetime.datetime.now().time()
                if stop_time < current_time < start_time:
                    self.mr.updateMachineStatus(mid=mid, newStatus=self.mr.statusDisintegrating)
            # if the 1and1 machine is powering off or powered off, move it to disintegrating
            elif oao_state in [self.state_powering_off, self.state_powered_off]:
                machine[self.reg_site_server_status] = oao_state
                self.mr.updateMachineStatus(mid=mid, newStatus=self.mr.statusDisintegrating)

        # manage machine in status disintegrating
        elif status == self.mr.statusDisintegrating:
            # if the machine is still powered on, shut it off
            if oao_machine[self.status][self.state] == self.state_powered_on:
                try:
                    self.modifyMachineStatus(mid=mid, action=self.command_power_off)
                except Exception:
                    break
                machine[self.reg_site_server_status] = self.state_powering_off

        # manage machine in status disintegrated
        elif status == self.mr.statusDisintegrated:
            # if the 1and1 machine is powered off, set it to status down
            if oao_machine[self.status][self.state] == self.state_powered_off:
                machine[self.reg_site_server_status] = self.state_powered_off
                self.mr.updateMachineStatus(mid=mid, newStatus=self.mr.statusDown)

        # manage machine in status down
        elif status == self.mr.statusDown:
            # if the 1and1 machine is powered off, and the delete option is enabled, delete the 1and1 machine
            if oao_machine[self.status][self.state] == self.state_powered_off:
                if self.getConfig(self.configDelete) is True:
                    try:
                        self.modifyMachineStatus(mid=mid, action=self.command_delete)
                    except Exception:
                        break
                    machine[self.reg_site_server_status] = self.state_deleting

    # add all machines remaining in machine list from 1&1
    for oao_machine in oao_machines:
        # check if machine is already in machine registry
        if oao_machine in [machine[self.reg_site_server_id] for machine in
                           self.mr.getMachines(self.siteName).values()]:
            continue

        # create new machine in machine registry
        mid = self.mr.newMachine()
        new_machine = self.mr.machines[mid]
        oao_entry = oao_machines[oao_machine]

        # set some machine specific entries in machine registry
        new_machine[self.mr.regSite] = self.siteName
        new_machine[self.mr.regSiteType] = self.siteType
        # list(...) keeps this working on Python 3, where keys() is a view
        # that cannot be indexed; it matches the lookup used for logging below.
        new_machine[self.mr.regMachineType] = list(self.getConfig(self.configMachines).keys())[0]  # machineType
        new_machine[self.reg_site_server_name] = oao_entry[self.name]
        new_machine[self.reg_site_server_id] = oao_entry[self.id]
        new_machine[self.reg_site_server_status] = oao_entry[self.status][self.state]
        new_machine[self.reg_site_server_datacenter] = oao_entry[self.datacenter][self.id]
        new_machine[self.reg_site_server_network] = \
            self.getIDs(key=self.datacenter, value=oao_entry[self.datacenter][self.id])[2]
        new_machine[self.reg_site_server_condor_name] = ""

        self.mr.updateMachineStatus(mid, self.mr.statusBooting)

    # add current amounts of machines to Json log file
    self.logger.info("Current machines running at %s: %d" % (self.siteName, self.runningMachinesCount[
        list(self.getConfig(self.configMachines).keys())[0]]))  # ["vm-default"]))
    json_log = JsonLog()
    json_log.addItem(self.siteName, "machines_requested",
                     int(len(self.getSiteMachines(status=self.mr.statusBooting)) +
                         len(self.getSiteMachines(status=self.mr.statusUp)) +
                         len(self.getSiteMachines(status=self.mr.statusIntegrating))))
    json_log.addItem(self.siteName, "condor_nodes",
                     len(self.getSiteMachines(status=self.mr.statusWorking)))
    json_log.addItem(self.siteName, "condor_nodes_draining",
                     len(self.getSiteMachines(status=self.mr.statusPendingDisintegration)))
def manage(self):
    # type: () -> None
    """Update machine states from the Freiburg batch job lists.

    Booting = Freiburg batch job for machine was submitted
    Up      = Freiburg batch job is running, VM is Booting,
              HTCondorIntegrationAdapter switches this to "integrating" and "working".
    Disintegrated & Down

    HTCondorIntegrationAdapter is responsible for handling Integrating, Working,
    PendingDisintegration, Disintegrating.

    Running jobs that belong to no registered machine are added to the
    registry with status "up"; the node counts go to the JSON log at the end.
    """
    # Each job list falls back to an empty dict when it is None or when
    # reading it raises ValueError.
    try:
        running_jobs = self.__runningJobs
    except ValueError:
        running_jobs = None
    if running_jobs is None:
        running_jobs = {}

    try:
        completed_jobs = self.__completedJobs
    except ValueError:
        completed_jobs = None
    if completed_jobs is None:
        completed_jobs = {}

    try:
        idle_jobs = self.__idleJobs
    except ValueError:
        idle_jobs = None
    if idle_jobs is None:
        idle_jobs = {}

    adapter_states = (self.mr.statusIntegrating, self.mr.statusWorking,
                      self.mr.statusPendingDisintegration, self.mr.statusDisintegrating)

    site_machines = self.getSiteMachines()
    for machine_id in site_machines:
        entry = site_machines[machine_id]
        job_id = entry[self.regMachineJobId]

        # Status handled by Integration Adapter
        if entry[self.mr.regStatus] in adapter_states:
            try:
                running_jobs.pop(job_id)
            except (KeyError, AttributeError):
                # AttributeError: running_jobs is Empty
                # KeyError: job_id not in running_jobs
                pass
            else:
                continue

        if entry[self.mr.regStatus] == self.mr.statusDown:
            if job_id in completed_jobs or self.mr.calcLastStateChange(machine_id) > 24 * 60 * 60:
                # Remove machines, which are:
                # 1. finished in ROCED & Freiburg // 2. Finished for more than 1 day [= job history purge time]
                self.mr.removeMachine(machine_id)
                continue
            if job_id in running_jobs:
                # ROCED machine down, but job still running
                running_jobs.pop(job_id)
                if self.mr.calcLastStateChange(machine_id) > 5 * 60:
                    self.__cancelFreiburgMachines(job_id)
                continue
        elif job_id in completed_jobs:
            # Machines which failed to boot/died/got canceled (return code != 0) -> down
            # A machine MAY fail to boot with return code 0 or we just missed some states -> regular shutdown
            if entry[self.mr.regStatus] == self.mr.statusBooting:
                self.logger.info("VM (%s) failed to boot!" % job_id)
            elif completed_jobs[job_id] != "0":
                self.logger.info("VM (%s) died!" % job_id)
            else:
                self.logger.debug("VM (%s) died with status 0!" % job_id)
            self.mr.updateMachineStatus(machine_id, self.mr.statusDown)

        # The status is read again on purpose: it may just have been changed above.
        if site_machines[machine_id][self.mr.regStatus] == self.mr.statusBooting:
            if job_id in running_jobs:
                # batch job running: machine -> up
                self.mr.updateMachineStatus(machine_id, self.mr.statusUp)
                running_jobs.pop(job_id)
            elif not (job_id in idle_jobs or job_id in completed_jobs):
                # Machine disappeared. If the machine later appears again, it will be added automatically.
                self.mr.updateMachineStatus(machine_id, self.mr.statusDown)

    # All remaining unaccounted batch jobs
    for job_id in running_jobs:
        machine_id = self.mr.newMachine()
        new_entry = self.mr.machines[machine_id]
        # TODO: try to identify machine type, using cores & wall-time
        new_entry[self.mr.regSite] = self.siteName
        new_entry[self.mr.regSiteType] = self.siteType
        new_entry[self.mr.regMachineType] = self.__default_machine
        new_entry[self.regMachineJobId] = job_id
        new_entry[self.reg_site_server_condor_name] = self.__getCondorName(job_id)
        self.mr.updateMachineStatus(machine_id, self.mr.statusUp)

    self.logger.info("Machines using resources (Freiburg): %d" % self.cloudOccupyingMachinesCount)

    with JsonLog() as jsonLog:
        working = self.getSiteMachines(status=self.mr.statusWorking)
        jsonLog.addItem(self.siteName, "condor_nodes", len(working))

        draining = [
            machine_id
            for machine_id in self.getSiteMachines(status=self.mr.statusPendingDisintegration)
            if HTCondor.calcDrainStatus(machine_id)[1] is True
        ]
        jsonLog.addItem(self.siteName, "condor_nodes_draining", len(draining))

        requested = len(self.getSiteMachines(status=self.mr.statusBooting))
        requested = requested + len(self.getSiteMachines(status=self.mr.statusUp))
        requested = requested + len(self.getSiteMachines(status=self.mr.statusIntegrating))
        jsonLog.addItem(self.siteName, "machines_requested", requested)
def manage(self):
    """Managing machine states, run once per cycle

    This function takes care of the machine status and manages state changes:
      booting -> up
      disintegrating -> disintegrated

    It uses machine states in OpenStack and the machine registry machine
    states to trigger state changes. Machines that OpenStack lists but the
    registry does not know are added to the registry afterwards, and the
    machine counts are written to the logger and the JSON log.

    :return:
    """
    nova_machines = self.__getNovaMachines()

    # Look for each machine in machine registry and perform necessary status change(s).
    #
    # In the process we delete each machine that's in the machine registry from nova_machines.
    # As a result, nova_machines then contains a list of MISSING machines.
    #
    # -> Add these machines to the machines registry.
    #    This can happen, if (somehow) machines boot up at OpenStack without being requested...
    for mid in self.mr.getMachines(self.siteName):
        # machine not listed in OpenStack -> remove from machine registry
        # (an empty nova_machines contains no mid either, so one test suffices)
        if mid not in nova_machines:
            self.mr.removeMachine(mid)
            continue

        machine = self.mr.machines[mid]
        nova_status = nova_machines[mid][self.reg_site_server_status]

        # check if condor name is set
        if self.reg_site_server_condor_name not in machine:
            # Store the name on the machine's own entry. The former code wrote
            # to self.mr.machines[<key>], which added a bogus top-level entry
            # and left the machine without a condor name.
            machine[self.reg_site_server_condor_name] = mid

        # if machine is in error state (or shut off), move it to disintegrating
        if nova_status in [self.reg_site_server_status_error,
                           self.reg_site_server_status_shutoff]:
            machine[self.reg_site_server_status] = self.reg_site_server_status_error
            self.mr.updateMachineStatus(mid, self.mr.statusDisintegrating)

        # Read the registry status only now: the step above may have changed it.
        reg_status = self.mr.machines[mid][self.mr.regStatus]

        # status handled by Integration Adapter
        if reg_status in [self.mr.statusIntegrating, self.mr.statusWorking,
                          self.mr.statusPendingDisintegration]:
            del nova_machines[mid]

        # if status = down, machine was terminated at OpenStack -> remove from machine registry
        elif reg_status == self.mr.statusDown:
            self.mr.removeMachine(mid)
            continue

        # check if machine could be started correctly
        elif reg_status == self.mr.statusBooting:
            # they started correctly when OpenStack state changes to active
            if nova_status == self.reg_site_server_status_active:
                self.mr.updateMachineStatus(mid, self.mr.statusUp)
                self.mr.machines[mid][self.reg_site_server_status] = nova_status
            if mid in nova_machines:
                del nova_machines[mid]

        # check if machines is disintegrating
        elif reg_status == self.mr.statusDisintegrating:
            # check if machine is in status active (OpenStack status), if so, send stop command
            if nova_status == self.reg_site_server_status_active:
                self.__openstackStopMachine(mid)
            # if machine is in status shutoff (OpenStack), update to disintegrated
            if nova_status == self.reg_site_server_status_shutoff:
                self.mr.updateMachineStatus(mid, self.mr.statusDisintegrated)
            if mid in nova_machines:
                del nova_machines[mid]

    # add running nova machines and information to machine registry
    for mid in nova_machines:
        if mid not in self.mr.getMachines(self.siteName):
            new = self.mr.newMachine(mid)
            new_machine = self.mr.machines[new]
            new_machine[self.mr.regSite] = self.siteName
            new_machine[self.mr.regSiteType] = self.siteType
            # TODO: handle different machine types
            new_machine[self.mr.regMachineType] = self._machineType
            new_machine[self.reg_site_server_id] = nova_machines[mid][self.reg_site_server_id]
            new_machine[self.reg_site_server_status] = nova_machines[mid][self.reg_site_server_status]
            new_machine[self.reg_site_server_name] = mid
            new_machine[self.reg_site_server_condor_name] = mid

            # Use the id returned by newMachine() - the same key all entries
            # above were stored under - for the status update as well.
            if nova_machines[mid][self.reg_site_server_status] == self.reg_site_server_status_error:
                self.mr.updateMachineStatus(new, self.mr.statusDisintegrating)
            else:
                self.mr.updateMachineStatus(new, self.mr.statusWorking)

    if self.getConfig(self.configUseTime):
        self.__openstackTimeDepStopMachine()

    ###
    # Write Json log file:
    # requested machines, nodes, draining nodes.
    ###
    # list(...) is required on Python 3, where keys() cannot be indexed.
    first_machine_type = list(self.getConfig(self.configMachines).keys())[0]
    self.logger.info("Current machines running at %s: %d" %
                     (self.siteName, self.runningMachinesCount[first_machine_type]))
    json_log = JsonLog()
    json_log.addItem(
        self.siteName, "machines_requested",
        int(len(self.getSiteMachines(status=self.mr.statusBooting)) +
            len(self.getSiteMachines(status=self.mr.statusUp)) +
            len(self.getSiteMachines(status=self.mr.statusIntegrating))))
    json_log.addItem(
        self.siteName, "condor_nodes",
        len(self.getSiteMachines(status=self.mr.statusWorking)))
    json_log.addItem(
        self.siteName, "condor_nodes_draining",
        len(self.getSiteMachines(status=self.mr.statusPendingDisintegration)))
def manage(self):
    # type: () -> None
    """Manages status changes of machines by checking jobs in Freiburg.

    Booting = Freiburg batch job for machine was submitted
    Up      = Freiburg batch job is running, VM is Booting,
              IntegrationAdapter switches this to "integrating" and "working".
    Disintegrated & Down

    IntegrationAdapter is responsible for handling Integrating, Working,
    PendingDisintegration, Disintegrating.

    A booting machine whose batch job is in none of the job lists is only set
    to "down" after it has been missing for several consecutive cycles; the
    per-machine counter is kept in ``self.vanishedVMs``.
    """
    # Number of consecutive cycles a booting machine's job may be missing
    # before the machine is declared down.
    max_missing_cycles = 5

    # Getting List of running, completed and idle Machines from the MOAB XML output:
    try:
        frJobs = self.__getJobs
    except ValueError:
        frJobs = None
    if frJobs is None:
        frJobs = [{}, {}, {}, {}]

    # the jobs in frJobs[1] are merged into the idle jobs
    frJobsIdle = merge_dicts(frJobs[0], frJobs[1])
    frJobsRunning = frJobs[2]
    frJobsCompleted = frJobs[3]

    mr = self.getSiteMachines()
    for mid in mr:
        batchJobId = mr[mid][self.regMachineJobId]

        # Status handled by Integration Adapter
        if mr[mid][self.mr.regStatus] in [self.mr.statusIntegrating,
                                          self.mr.statusWorking,
                                          self.mr.statusPendingDisintegration,
                                          self.mr.statusDisintegrating]:
            try:
                frJobsRunning.pop(batchJobId)
                self.logger.debug(
                    'Removing batch-job %s from list of running Jobs' %
                    mr[mid][self.regMachineJobId])
                continue
            except (KeyError, AttributeError, IndexError):
                # AttributeError: frJobsRunning is Empty
                # KeyError: batchJobId not in frJobsRunning
                self.logger.debug(
                    'Matching between machine registry entry %s and batch-job ID (%s) failed during removal of machines with ignorable states.'
                    % (mid, mr[mid][self.regMachineJobId]))

        # Machines which failed to boot/died/got canceled (return code != 0) -> down
        # A machine MAY fail to boot with return code 0 or we just missed some states -> regular shutdown
        if mr[mid][self.mr.regStatus] != self.mr.statusDown:
            if batchJobId in frJobsCompleted:
                if mr[mid][self.mr.regStatus] == self.mr.statusBooting:
                    self.logger.info("VM (%s) failed to boot!" % batchJobId)
                else:
                    if frJobsCompleted[batchJobId] != "0":
                        self.logger.info("VM (%s) died!" % batchJobId)
                    else:
                        self.logger.debug("VM (%s) died with status 0!" % batchJobId)
                self.mr.updateMachineStatus(mid, self.mr.statusDown)
        elif batchJobId in frJobsCompleted or self.mr.calcLastStateChange(mid) > 24 * 60 * 60:
            # Remove machines, which are:
            # 1. finished in ROCED & Freiburg // 2. Finished for more than 1 day [= job history purge time]
            self.mr.removeMachine(mid)
            continue
        elif batchJobId in frJobsRunning:
            # ROCED machine down, but job still running
            try:
                frJobsRunning.pop(batchJobId)
                self.logger.debug(
                    'Removing batch-job (%s) from list of running Jobs' % batchJobId)
            except (KeyError, AttributeError, IndexError):
                self.logger.debug(
                    'Matching between machine registry entry %s and batch-job ID (%s) failed during removal of down machines with still alive MOAB job.'
                    % (mid, mr[mid][self.regMachineJobId]))
            if self.mr.calcLastStateChange(mid) > 5 * 60:
                self.__cancelFreiburgMachines(batchJobId)
            continue

        if mr[mid][self.mr.regStatus] == self.mr.statusBooting:
            # batch job running: machine -> up
            if batchJobId in frJobsRunning:
                # pop() instead of del: most machines never vanished and have
                # no counter, in which case del raised KeyError.
                self.vanishedVMs.pop(mid, None)
                self.mr.updateMachineStatus(mid, self.mr.statusUp)
                frJobsRunning.pop(batchJobId)
            # Machine disappeared: its job is neither idle, running nor completed.
            elif batchJobId not in frJobsIdle and batchJobId not in frJobsCompleted:
                missing_cycles = self.vanishedVMs.get(mid, 0)
                self.logger.info(
                    'Corresponding MOAB-job (%s) for machine %s was not found (%s retry) '
                    % (mr[mid][self.regMachineJobId], mid, missing_cycles))
                missing_cycles += 1
                if missing_cycles >= max_missing_cycles:
                    # the message used to claim "3 cycles" although the limit is 5
                    self.logger.debug(
                        "Corresponding Moab-job %s for machine %s was not found for %d cycles"
                        % (mr[mid][self.regMachineJobId], mid, max_missing_cycles))
                    self.mr.updateMachineStatus(mid, self.mr.statusDown)
                    self.vanishedVMs.pop(mid, None)
                else:
                    self.vanishedVMs[mid] = missing_cycles
            else:
                # job is idle or completed -> the machine is accounted for again
                self.vanishedVMs.pop(mid, None)

    # All remaining unaccounted batch jobs
    for batchJobId in frJobsRunning:
        mid = self.mr.newMachine()
        # TODO: try to identify machine type, using cores & wall-time
        self.mr.machines[mid][self.mr.regSite] = self.siteName
        self.mr.machines[mid][self.mr.regSiteType] = self.siteType
        self.mr.machines[mid][self.mr.regMachineType] = self.__default_machine
        self.mr.machines[mid][self.regMachineJobId] = batchJobId
        self.mr.machines[mid][self.reg_site_server_node_name] = self.__getVMName(batchJobId)
        self.mr.updateMachineStatus(mid, self.mr.statusUp)

    self.logger.info("Machines using resources (Freiburg): %d" % self.cloudOccupyingMachinesCount)

    with JsonLog() as jsonLog:
        jsonLog.addItem(
            self.siteName, "nodes",
            len(self.getSiteMachines(status=self.mr.statusWorking)))
        jsonLog.addItem(
            self.siteName, "nodes_draining",
            len([
                mid for mid in self.getSiteMachines(status=self.mr.statusPendingDisintegration)
                if self.mr.machines[mid][self.mr.regMachineBusy] is True
            ]))
        jsonLog.addItem(
            self.siteName, "machines_requested",
            len(self.getSiteMachines(status=self.mr.statusBooting)) +
            len(self.getSiteMachines(status=self.mr.statusUp)) +
            len(self.getSiteMachines(status=self.mr.statusIntegrating)))
def manage(self):
    # type: () -> None
    """Manages status changes of machines by checking jobs in Freiburg.

    Booting = Freiburg batch job for machine was submitted
    Up      = Freiburg batch job is running, VM is Booting,
              IntegrationAdapter switches this to "integrating" and "working".
    Disintegrated & Down

    IntegrationAdapter is responsible for handling Integrating, Working,
    PendingDisintegration, Disintegrating.

    Running and idle MOAB jobs that match no registered machine are added to
    the registry at the end: running jobs with an IP as "up", all others as
    "booting".
    """
    # Both job sources fall back to empty dicts when they are None or when
    # reading them raises ValueError.
    try:
        jobs = self.moabJobs
    except ValueError:
        jobs = None
    if jobs is None:
        frJobsIdle = {}
        frJobsRunning = {}
    else:
        frJobsIdle = jobs['jobsIdle']
        frJobsRunning = jobs['jobsRunning']

    try:
        frJobsCompleted = self.completedMoabJobs
    except ValueError:
        frJobsCompleted = None
    if frJobsCompleted is None:
        frJobsCompleted = {}

    mr = self.getSiteMachines()
    self.logger.info(
        "Number of site machines before any magic happens: {}".format(len(mr)))
    self.logger.debug("Currently registered machines:")

    for mid in mr:
        batchJobId = mr[mid][self.regMachineJobId]

        # First check if machine is idle moab job:
        if batchJobId in frJobsIdle:
            # debug output goes through the logger; the former Python 2
            # "print" statements are a syntax error on Python 3
            self.logger.debug('before {}'.format(len(frJobsIdle)))
            self.logger.info('Found idle job {}. Continuing'.format(batchJobId))
            frJobsIdle.pop(batchJobId, None)
            self.logger.info('Removed {}.'.format(batchJobId))
            self.logger.debug('after {}'.format(len(frJobsIdle)))
            continue

        # Status handled by Integration Adapter
        # NOTE(review): the nesting of this branch was reconstructed from
        # whitespace-collapsed source - confirm that a machine without a
        # running job is really meant to be removed here.
        if mr[mid][self.mr.regStatus] in [self.mr.statusIntegrating,
                                          self.mr.statusWorking,
                                          self.mr.statusPendingDisintegration,
                                          self.mr.statusDisintegrating]:
            try:
                ip = frJobsRunning[batchJobId]['IP']
            except (KeyError, TypeError):
                # no running job (or no IP entry) for this machine
                self.logger.warning("VM with no IP. JobId=%s" % batchJobId)
                self.logger.warning("Couldn't update machine IP. Removing")
                self.mr.removeMachine(mid)
                continue
            if ip != '':
                self.mr.updateMachineIp(mid, ip)
            self.logger.debug(
                "Moab Job ID: {}. IP: {}. Status: {}.".format(
                    batchJobId, ip, mr[mid][self.mr.regStatus]))
            # the lookup above succeeded, so the job is present
            frJobsRunning.pop(batchJobId)
            continue

        # Machines which failed to boot/died/got canceled (return code != 0) -> down
        # A machine MAY fail to boot with return code 0 or we just missed some states -> regular shutdown
        if mr[mid][self.mr.regStatus] != self.mr.statusDown:
            if batchJobId in frJobsCompleted:
                if mr[mid][self.mr.regStatus] == self.mr.statusBooting:
                    self.logger.info("VM (%s) failed to boot!" % batchJobId)
                else:
                    if frJobsCompleted[batchJobId] != "0":
                        self.logger.info("VM (%s) died!" % batchJobId)
                    else:
                        self.logger.debug("VM (%s) died with status 0!" % batchJobId)
                self.mr.updateMachineStatus(mid, self.mr.statusDown)
        elif batchJobId in frJobsCompleted or self.mr.calcLastStateChange(mid) > 24 * 60 * 60:
            # Remove machines, which are:
            # 1. finished in ROCED & Freiburg // 2. Finished for more than 1 day [= job history purge time]
            self.mr.removeMachine(mid)
            continue
        elif batchJobId in frJobsRunning:
            # ROCED machine down, but job still running
            frJobsRunning.pop(batchJobId)
            if self.mr.calcLastStateChange(mid) > 5 * 60:
                self.__cancelFreiburgMachines(batchJobId)
            continue

        if mr[mid][self.mr.regStatus] == self.mr.statusBooting:
            # batch job running: machine -> up (once it has an IP)
            if batchJobId in frJobsRunning:
                self.logger.debug("Job is running! {}".format(frJobsRunning))
                try:
                    ip = frJobsRunning[batchJobId]['IP']
                except (KeyError, TypeError):
                    ip = ''
                    self.logger.warning("VM with no IP. JobId=%s" % batchJobId)
                if ip != '':
                    self.mr.updateMachineStatus(mid, self.mr.statusUp)
                    self.mr.updateMachineIp(mid, ip)
                    self.logger.debug("IP={}".format(ip))
                # the job is accounted for, with or without an IP
                frJobsRunning.pop(batchJobId)
            # Machine disappeared. If the machine later appears again, it will be added automatically.
            elif batchJobId not in frJobsIdle and batchJobId not in frJobsCompleted:
                self.mr.updateMachineStatus(mid, self.mr.statusDown)

    # All remaining unaccounted batch jobs
    for batchJobId in frJobsRunning:
        self.logger.info("remaining batchJobId={}".format(batchJobId))
        mid = self.mr.newMachine()
        # TODO: try to identify machine type, using cores & wall-time
        self.mr.machines[mid][self.mr.regSite] = self.siteName
        self.mr.machines[mid][self.mr.regSiteType] = self.siteType
        self.mr.machines[mid][self.mr.regMachineType] = self.__default_machine
        self.mr.machines[mid][self.regMachineJobId] = batchJobId
        self.mr.machines[mid][self.reg_site_server_node_name] = self.__getVMName(batchJobId)
        try:
            ip = frJobsRunning[batchJobId]['IP']
        except (KeyError, TypeError):
            ip = ''
            self.logger.warning("VM with no IP. JobId=%s" % batchJobId)
        if ip != '':
            self.mr.updateMachineStatus(mid, self.mr.statusUp)
            self.mr.updateMachineIp(mid, ip)
            self.logger.debug("IP=" + str(ip))
        else:
            self.mr.updateMachineStatus(mid, self.mr.statusBooting)

    # idle jobs without a registered machine are tracked as booting machines
    for batchJobId in frJobsIdle:
        self.logger.info("remaining batchJobId (idling)=" + str(batchJobId))
        mid = self.mr.newMachine()
        self.mr.machines[mid][self.mr.regSite] = self.siteName
        self.mr.machines[mid][self.mr.regSiteType] = self.siteType
        self.mr.machines[mid][self.mr.regMachineType] = self.__default_machine
        self.mr.machines[mid][self.regMachineJobId] = batchJobId
        self.mr.machines[mid][self.reg_site_server_node_name] = self.__getVMName(batchJobId)
        self.mr.updateMachineStatus(mid, self.mr.statusBooting)

    self.logger.info("Machines using resources (Freiburg): %d" % self.cloudOccupyingMachinesCount)

    with JsonLog() as jsonLog:
        jsonLog.addItem(
            self.siteName, "nodes",
            len(self.getSiteMachines(status=self.mr.statusWorking)))
        jsonLog.addItem(
            self.siteName, "nodes_draining",
            len(self.getSiteMachines(status=self.mr.statusPendingDisintegration)))
        jsonLog.addItem(
            self.siteName, "machines_requested",
            len(self.getSiteMachines(status=self.mr.statusBooting)) +
            len(self.getSiteMachines(status=self.mr.statusUp)) +
            len(self.getSiteMachines(status=self.mr.statusIntegrating)))
def manage(self):
    """Synchronise machine registry states with the EC2 cloud site.

    Run once per cycle. Every registry machine of this site is matched
    against the instances reported by ``getEC2Machines()``; matched instances
    are removed from the status dict so that only instances unknown to the
    registry remain. Those are registered as booting machines afterwards,
    unless they are configured as service machines. Disintegrating machines
    are collected and stopped, disintegrated machines that are missing from
    the status dict are collected, terminated and set to down.

    :return:
    """
    # get machines from EC2
    ec2_machines_status, ec2_machines_list = self.getEC2Machines()
    machines_to_stop = list()
    machines_to_terminate = list()

    # if something fails while receiving response from EC2 a type "None" will be returned
    if ec2_machines_status is None:
        return

    for mid in self.mr.getMachines(self.siteName):
        machine = self.mr.machines[mid]
        status = machine[self.mr.regStatus]

        # check for status which is handled by integration adapter
        if status in [self.mr.statusUp, self.mr.statusIntegrating,
                      self.mr.statusWorking, self.mr.statusPendingDisintegration]:
            # pop() instead of del: an instance that is missing from the EC2
            # answer must not abort the whole cycle with a KeyError
            ec2_machines_status.pop(machine[self.reg_site_server_id], None)

        # down
        # once the instance is no longer listed at EC2, drop the machine from the registry
        elif status == self.mr.statusDown:
            if machine[self.reg_site_server_id] not in ec2_machines_list:
                self.mr.removeMachine(mid)
                self.cleanupEC2()

        elif status == self.mr.statusDisintegrated:
            if machine[self.reg_site_server_id] not in ec2_machines_status:
                machines_to_terminate.append(mid)
                self.mr.updateMachineStatus(mid, self.mr.statusDown)
            else:
                del ec2_machines_status[machine[self.reg_site_server_id]]

        elif status == self.mr.statusDisintegrating:
            if machine[self.reg_site_server_id] in ec2_machines_status:
                machines_to_stop.append(mid)
            ec2_machines_status.pop(machine[self.reg_site_server_id], None)

        # TODO: use this status transition from up to integrating instead of the one used in
        # integration adapter.onEvent

        # booting -> up
        # check if machine status booting
        elif status == self.mr.statusBooting:
            server_id = machine[self.reg_site_server_id]
            if server_id not in ec2_machines_status:
                continue
            self.mr.updateMachineStatus(mid, self.mr.statusUp)

            instance_state = ec2_machines_status[server_id][self.ec2_instance_status]["Status"]
            if instance_state == "initializing":
                pass
            elif instance_state == "ok":
                self.mr.updateMachineStatus(mid, self.mr.statusUp)
            # the instance is known now - keep it out of the "unknown instances" pass below
            # NOTE(review): nesting reconstructed from collapsed source - confirm
            # the removal is meant to happen for every instance state.
            ec2_machines_status.pop(server_id, None)

    self.terminateEC2Machine(self.stop, machines_to_stop)
    self.terminateEC2Machine(self.terminate, machines_to_terminate)

    # add all instances remaining in the EC2 status dict
    service_ids = self.getConfig(self.configServiceIDs)
    for machine in ec2_machines_status:
        # if machine is listed in the service machine section, skip it!
        if machine in service_ids:
            continue
        # create new machine in machine registry
        mid = self.mr.newMachine()
        self.mr.machines[mid][self.mr.regSite] = self.siteName
        self.mr.machines[mid][self.mr.regSiteType] = self.siteType
        self.mr.machines[mid][self.mr.regMachineType] = self.ec2  # machineType
        self.mr.machines[mid][self.reg_site_server_id] = machine
        self.mr.machines[mid][self.reg_site_server_condor_name] = machine
        self.mr.updateMachineStatus(mid, self.mr.statusBooting)

    # add current amounts of machines to Json log file
    self.logger.info("Current machines running at %s: %d" %
                     (self.siteName, self.runningMachinesCount[self._machineType]))
    json_log = JsonLog()
    json_log.addItem(
        self.siteName, "machines_requested",
        int(len(self.getSiteMachines(status=self.mr.statusBooting)) +
            len(self.getSiteMachines(status=self.mr.statusUp)) +
            len(self.getSiteMachines(status=self.mr.statusIntegrating))))
    json_log.addItem(
        self.siteName, "condor_nodes",
        len(self.getSiteMachines(status=self.mr.statusWorking)))
    json_log.addItem(
        self.siteName, "condor_nodes_draining",
        len(self.getSiteMachines(status=self.mr.statusPendingDisintegration)))
def manage(self):
    """Synchronise the machine registry with OpenStack; run once per cycle.

    The machines reported by OpenStack (nova) are compared with this site's
    entries in the machine registry and the resulting state changes are
    applied:

        booting        -> up             (nova reports the server as active)
        disintegrating -> disintegrated  (nova reports the server as shut off)
        any status     -> disintegrating (nova reports error or shut off)

    Registry machines that nova does not list, or whose registry status is
    ``statusDown``, are removed from the registry. Nova machines that the
    registry does not know are added to it. Afterwards the optional
    time-dependent stop is triggered (if ``configUseTime`` is set) and the
    current machine counts are written to the JSON log.

    :return: None
    """
    status_key = self.reg_site_server_status
    nova_machines = self.__getNovaMachines()

    # Look at each machine in the machine registry and perform the necessary
    # status change(s). Handled machines are deleted from nova_machines, so
    # what remains afterwards are machines missing from the registry (this
    # can happen if machines boot up at OpenStack without being requested).
    #
    # Iterate over a snapshot: removeMachine() is called inside the loop.
    for mid in list(self.mr.getMachines(self.siteName)):
        # machine not listed in OpenStack -> remove from machine registry
        # (this also covers an empty nova_machines collection)
        if mid not in nova_machines:
            self.mr.removeMachine(mid)
            continue

        # Make sure the condor name is set on the machine's own entry.
        if self.reg_site_server_condor_name not in self.mr.machines[mid]:
            self.mr.machines[mid][self.reg_site_server_condor_name] = mid

        nova_state = nova_machines[mid][status_key]

        # if machine is in error state (or shut off), move it to disintegrating
        if nova_state in (self.reg_site_server_status_error,
                          self.reg_site_server_status_shutoff):
            self.mr.machines[mid][status_key] = self.reg_site_server_status_error
            self.mr.updateMachineStatus(mid, self.mr.statusDisintegrating)

        # Read the registry status only now: the update above may have
        # changed it.
        reg_status = self.mr.machines[mid][self.mr.regStatus]

        # status handled by Integration Adapter
        if reg_status in (self.mr.statusIntegrating,
                          self.mr.statusWorking,
                          self.mr.statusPendingDisintegration):
            del nova_machines[mid]

        # status = down: machine was terminated at OpenStack -> remove it
        # NOTE(review): mid stays in nova_machines here, so if nova still
        # lists the server it is presumably re-registered by the loop
        # below - confirm that this is intended.
        elif reg_status == self.mr.statusDown:
            self.mr.removeMachine(mid)
            continue

        # check if machine could be started correctly
        elif reg_status == self.mr.statusBooting:
            # it started correctly once the OpenStack state is "active"
            if nova_state == self.reg_site_server_status_active:
                self.mr.updateMachineStatus(mid, self.mr.statusUp)
                self.mr.machines[mid][status_key] = nova_state
            nova_machines.pop(mid, None)

        # check if machine is disintegrating
        elif reg_status == self.mr.statusDisintegrating:
            # still active at OpenStack -> send stop command
            if nova_state == self.reg_site_server_status_active:
                self.__openstackStopMachine(mid)
            # shut off at OpenStack -> update to disintegrated
            if nova_state == self.reg_site_server_status_shutoff:
                self.mr.updateMachineStatus(mid, self.mr.statusDisintegrated)
            nova_machines.pop(mid, None)

    # add running nova machines and their information to the machine registry
    for mid in nova_machines:
        if mid not in self.mr.getMachines(self.siteName):
            new = self.mr.newMachine(mid)
            entry = self.mr.machines[new]
            nova_entry = nova_machines[mid]

            entry[self.mr.regSite] = self.siteName
            entry[self.mr.regSiteType] = self.siteType
            # TODO: handle different machine types
            entry[self.mr.regMachineType] = self._machineType
            entry[self.reg_site_server_id] = nova_entry[self.reg_site_server_id]
            entry[status_key] = nova_entry[status_key]
            entry[self.reg_site_server_name] = mid
            entry[self.reg_site_server_condor_name] = mid

            # Use the id returned by newMachine() for the status update as
            # well, so the update always targets the entry created above.
            if nova_entry[status_key] == self.reg_site_server_status_error:
                self.mr.updateMachineStatus(new, self.mr.statusDisintegrating)
            else:
                self.mr.updateMachineStatus(new, self.mr.statusWorking)

    if self.getConfig(self.configUseTime):
        self.__openstackTimeDepStopMachine()

    ###
    # Write Json log file:
    # requested machines, nodes, draining nodes.
    ###
    # list(...) keeps the "first configured machine type" lookup working on
    # Python 3, where dict.keys() returns a view that cannot be indexed.
    first_machine_type = list(self.getConfig(self.configMachines).keys())[0]
    self.logger.info("Current machines running at %s: %d" %
                     (self.siteName, self.runningMachinesCount[first_machine_type]))

    requested = (len(self.getSiteMachines(status=self.mr.statusBooting)) +
                 len(self.getSiteMachines(status=self.mr.statusUp)) +
                 len(self.getSiteMachines(status=self.mr.statusIntegrating)))

    json_log = JsonLog()
    json_log.addItem(self.siteName, "machines_requested", requested)
    json_log.addItem(self.siteName, "condor_nodes",
                     len(self.getSiteMachines(status=self.mr.statusWorking)))
    json_log.addItem(self.siteName, "condor_nodes_draining",
                     len(self.getSiteMachines(status=self.mr.statusPendingDisintegration)))