def runningMachinesCount(self):
    """Return dictionary with number of machines running at Freiburg.

    Depending on the config file this may account for draining slots
    (claimed|retiring = working vs. claimed|idle = offline).

    The number of running machines needs to be recalculated when accounting
    for draining slots. Claimed but retiring slots are still being counted as
    working slots and thus contribute to the number of running machines
    -> remove idle draining slots from running machines and recalculate the
    actual number of running machines.

    :return {machine_type: integer, ...}:
    """
    # Fall back to base method if required.
    # NOTE(review): the base attribute is read without being called --
    # presumably it is a property whose decorator is outside the visible
    # source; confirm.
    if self.getConfig(self.configIgnoreDrainingMachines) is True:
        return super(FreiburgSiteAdapter, self).runningMachinesCount

    runningMachines = self.runningMachines
    machineConfig = self.getConfig(self.ConfigMachines)
    runningMachinesCount = dict()
    for machineType in runningMachines:
        machineIds = runningMachines[machineType]
        # Number of drained slots (idle and not accepting new jobs -> not usable).
        nDrainedSlots = sum(HTCondor.calcDrainStatus(mid)[0] for mid in machineIds)
        nCores = machineConfig[machineType]["cores"]
        nMachines = len(machineIds)
        # Every started group of nCores drained slots removes one machine.
        # Little trick: floor division with negative values rounds away from
        # zero (-9 // 4 = -3), which gives the ceiling after negating again.
        nDrainedMachines = -(-nDrainedSlots // nCores)
        runningMachinesCount[machineType] = nMachines - nDrainedMachines
        if nDrainedSlots != 0:
            # Log the positive number of drained slots.
            self.logger.debug("%s: running: %d, drained slots: %d"
                              " -> recalculated running machines count: %s"
                              % (machineType, nMachines, nDrainedSlots,
                                 runningMachinesCount[machineType]))
    return runningMachinesCount
def runningMachinesCount(self):
    """Return dictionary with number of machines running at Freiburg.

    Depending on the config file this may account for draining slots
    (claimed|retiring = working vs. claimed|idle = offline).

    The number of running machines needs to be recalculated when accounting
    for draining slots. Claimed but retiring slots are still being counted as
    working slots and thus contribute to the number of running machines
    -> remove idle draining slots from running machines and recalculate the
    actual number of running machines.

    :return {machine_type: integer, ...}:
    """
    # Fall back to base method if required.
    # NOTE(review): the base attribute is read without being called --
    # presumably it is a property whose decorator is outside the visible
    # source; confirm.
    if self.getConfig(self.configIgnoreDrainingMachines) is True:
        return super(FreiburgSiteAdapter, self).runningMachinesCount

    runningMachines = self.runningMachines
    machineConfig = self.getConfig(self.ConfigMachines)
    runningMachinesCount = dict()
    for machineType in runningMachines:
        machineIds = runningMachines[machineType]
        # Number of drained slots (idle and not accepting new jobs -> not usable).
        nDrainedSlots = sum(HTCondor.calcDrainStatus(mid)[0] for mid in machineIds)
        nCores = machineConfig[machineType]["cores"]
        nMachines = len(machineIds)
        # Every started group of nCores drained slots removes one machine.
        # Little trick: floor division with negative values rounds away from
        # zero (-9 // 4 = -3), which gives the ceiling after negating again.
        nDrainedMachines = -(-nDrainedSlots // nCores)
        runningMachinesCount[machineType] = nMachines - nDrainedMachines
        if nDrainedSlots != 0:
            # Log the positive number of drained slots.
            self.logger.debug("%s: running: %d, drained slots: %d"
                              " -> recalculated running machines count: %s"
                              % (machineType, nMachines, nDrainedSlots,
                                 runningMachinesCount[machineType]))
    return runningMachinesCount
def manage(self):
    # type: () -> None
    """Manages status changes of machines by checking jobs in Freiburg.

    Booting = Freiburg batch job for machine was submitted
    Up      = Freiburg batch job is running, VM is Booting,
              HTCondorIntegrationAdapter switches this to "integrating" and "working".
    Disintegrated & Down

    HTCondorIntegrationAdapter is responsible for handling
    Integrating, Working, PendingDisintegration, Disintegrating
    """
    # Fetch the three batch job collections. Both a ValueError and a None
    # result fall back to "no jobs".
    # NOTE(review): presumably ValueError is raised when the job list cannot
    # be retrieved -- confirm against the job list accessors.
    try:
        frJobsRunning = self.__runningJobs
    except ValueError:
        frJobsRunning = None
    if frJobsRunning is None:
        frJobsRunning = {}

    try:
        frJobsCompleted = self.__completedJobs
    except ValueError:
        frJobsCompleted = None
    if frJobsCompleted is None:
        frJobsCompleted = {}

    try:
        frJobsIdle = self.__idleJobs
    except ValueError:
        frJobsIdle = None
    if frJobsIdle is None:
        frJobsIdle = {}

    # States which are handled by the integration adapter.
    integrationStates = (self.mr.statusIntegrating,
                         self.mr.statusWorking,
                         self.mr.statusPendingDisintegration,
                         self.mr.statusDisintegrating)

    mr = self.getSiteMachines()
    for mid in mr:
        batchJobId = mr[mid][self.regMachineJobId]

        # Status handled by Integration Adapter: only mark the job as accounted.
        if mr[mid][self.mr.regStatus] in integrationStates:
            try:
                frJobsRunning.pop(batchJobId)
            except (KeyError, AttributeError):
                # KeyError: batchJobId not in frJobsRunning
                # AttributeError: frJobsRunning does not support pop()
                pass
            else:
                continue

        # Machines which failed to boot/died/got canceled (return code != 0) -> down
        # A machine MAY fail to boot with return code 0 or we just missed some
        # states -> regular shutdown
        if mr[mid][self.mr.regStatus] != self.mr.statusDown:
            if batchJobId in frJobsCompleted:
                if mr[mid][self.mr.regStatus] == self.mr.statusBooting:
                    self.logger.info("VM (%s) failed to boot!" % batchJobId)
                elif frJobsCompleted[batchJobId] != "0":
                    self.logger.info("VM (%s) died!" % batchJobId)
                else:
                    self.logger.debug("VM (%s) died with status 0!" % batchJobId)
                self.mr.updateMachineStatus(mid, self.mr.statusDown)
        elif batchJobId in frJobsCompleted or self.mr.calcLastStateChange(mid) > 24 * 60 * 60:
            # Remove machines, which are:
            # 1. finished in ROCED & Freiburg // 2. Finished for more than 1 day
            #    [= job history purge time]
            self.mr.removeMachine(mid)
            continue
        elif batchJobId in frJobsRunning:
            # ROCED machine down, but job still running
            frJobsRunning.pop(batchJobId)
            if self.mr.calcLastStateChange(mid) > 5 * 60:
                self.__cancelFreiburgMachines(batchJobId)
            continue

        # The status is read again on purpose: it may have been changed above.
        if mr[mid][self.mr.regStatus] == self.mr.statusBooting:
            if batchJobId in frJobsRunning:
                # batch job running: machine -> up
                self.mr.updateMachineStatus(mid, self.mr.statusUp)
                frJobsRunning.pop(batchJobId)
            elif batchJobId not in frJobsIdle and batchJobId not in frJobsCompleted:
                # Machine disappeared. If the machine later appears again, it
                # will be added automatically.
                self.mr.updateMachineStatus(mid, self.mr.statusDown)

    # All remaining unaccounted batch jobs
    for batchJobId in frJobsRunning:
        mid = self.mr.newMachine()
        machine = self.mr.machines[mid]
        # TODO: try to identify machine type, using cores & wall-time
        machine[self.mr.regSite] = self.siteName
        machine[self.mr.regSiteType] = self.siteType
        machine[self.mr.regMachineType] = self.__default_machine
        machine[self.regMachineJobId] = batchJobId
        machine[self.reg_site_server_condor_name] = self.__getCondorName(batchJobId)
        self.mr.updateMachineStatus(mid, self.mr.statusUp)

    self.logger.info("Machines using resources (Freiburg): %d" % self.cloudOccupyingMachinesCount)

    with JsonLog() as jsonLog:
        jsonLog.addItem(self.siteName, "condor_nodes",
                        len(self.getSiteMachines(status=self.mr.statusWorking)))
        # Only machines pending disintegration which report a draining status.
        jsonLog.addItem(self.siteName, "condor_nodes_draining",
                        len([mid for mid in
                             self.getSiteMachines(status=self.mr.statusPendingDisintegration)
                             if HTCondor.calcDrainStatus(mid)[1] is True]))
        jsonLog.addItem(self.siteName, "machines_requested",
                        len(self.getSiteMachines(status=self.mr.statusBooting)) +
                        len(self.getSiteMachines(status=self.mr.statusUp)) +
                        len(self.getSiteMachines(status=self.mr.statusIntegrating)))
def terminateMachines(self, machineType, count):
    """Terminate machines in Freiburg.

    Working machines are untouched by default, but they may get put into drain
    mode if the configuration is set accordingly.

    :param machineType: machine type passed on to getSiteMachines as filter
    :param count: maximum number of machines taken from the candidate list
    :return: None
    """
    # Read the configuration flag once for the whole call.
    drainWorkingMachines = self.getConfig(self.configDrainWorkingMachines) is True

    # booting machines, sorted by request time (newest first).
    bootingMachines = self.getSiteMachines(self.mr.statusBooting, machineType)
    try:
        bootingMachines = sorted(bootingMachines.items(),
                                 key=lambda machine_: machine_[1][self.mr.regStatusLastUpdate],
                                 reverse=True)
    except KeyError:
        bootingMachines = []

    if drainWorkingMachines:
        # Running machines, sorted by load. These machines are put into drain mode.
        # NOTE(review): the original comment says "idle first", but
        # reverse=True puts the highest calcMachineLoad value first -- confirm
        # which order is intended.
        workingMachines = merge_dicts(
            self.getSiteMachines(self.mr.statusIntegrating, machineType),
            self.getSiteMachines(self.mr.statusWorking, machineType),
            self.getSiteMachines(self.mr.statusPendingDisintegration, machineType))
        try:
            workingMachines = sorted(workingMachines.items(),
                                     key=lambda machine_: HTCondor.calcMachineLoad(machine_[0]),
                                     reverse=True)
        except KeyError:
            workingMachines = []
        # Merge lists: booting machines are always considered first.
        machinesToRemove = bootingMachines + workingMachines
    else:
        machinesToRemove = bootingMachines

    # needed amount of machines
    machinesToRemove = machinesToRemove[0:count]

    # list of batch job ids to terminate/drain
    idsToTerminate = []
    idsToDrain = []
    idsRemoved = []
    idsInvalidated = []

    for mid, machine in machinesToRemove:
        if machine[self.mr.regStatus] == self.mr.statusBooting:
            # booting machines can be terminated immediately
            idsToTerminate.append(machine[self.regMachineJobId])
        elif drainWorkingMachines:
            # machines which already report a draining status are skipped
            if HTCondor.calcDrainStatus(mid)[1] is True:
                continue
            # working machines should be set to drain mode
            idsToDrain.append(machine[self.regMachineJobId])

    self.logger.debug("Machines to terminate (%d): %s" % (len(idsToTerminate), ", ".join(idsToTerminate)))
    if idsToTerminate:
        idsRemoved, idsInvalidated = self.__cancelFreiburgMachines(idsToTerminate)

    self.logger.debug("Machines to drain (%d): %s" % (len(idsToDrain), ", ".join(idsToDrain)))
    if idsToDrain:
        for mid, machine in self.getSiteMachines().items():
            if machine[self.regMachineJobId] in idsToDrain:
                HTCondor.drainMachine(mid)

    # Removed and invalidated batch jobs both end up as "down" machines.
    idsCancelled = idsRemoved + idsInvalidated
    if len(idsCancelled) > 0:
        # update status
        for mid, machine in self.getSiteMachines().items():
            if machine[self.regMachineJobId] in idsCancelled:
                self.mr.updateMachineStatus(mid, self.mr.statusDown)
def manage(self):
    # type: () -> None
    """Manages status changes of machines by checking jobs in Freiburg.

    Booting = Freiburg batch job for machine was submitted
    Up      = Freiburg batch job is running, VM is Booting,
              HTCondorIntegrationAdapter switches this to "integrating" and "working".
    Disintegrated & Down

    HTCondorIntegrationAdapter is responsible for handling
    Integrating, Working, PendingDisintegration, Disintegrating
    """
    # Fetch the three batch job collections. Both a ValueError and a None
    # result fall back to "no jobs".
    # NOTE(review): presumably ValueError is raised when the job list cannot
    # be retrieved -- confirm against the job list accessors.
    try:
        frJobsRunning = self.__runningJobs
    except ValueError:
        frJobsRunning = None
    if frJobsRunning is None:
        frJobsRunning = {}

    try:
        frJobsCompleted = self.__completedJobs
    except ValueError:
        frJobsCompleted = None
    if frJobsCompleted is None:
        frJobsCompleted = {}

    try:
        frJobsIdle = self.__idleJobs
    except ValueError:
        frJobsIdle = None
    if frJobsIdle is None:
        frJobsIdle = {}

    # States which are handled by the integration adapter.
    integrationStates = (self.mr.statusIntegrating,
                         self.mr.statusWorking,
                         self.mr.statusPendingDisintegration,
                         self.mr.statusDisintegrating)

    mr = self.getSiteMachines()
    for mid in mr:
        batchJobId = mr[mid][self.regMachineJobId]

        # Status handled by Integration Adapter: only mark the job as accounted.
        if mr[mid][self.mr.regStatus] in integrationStates:
            try:
                frJobsRunning.pop(batchJobId)
            except (KeyError, AttributeError):
                # KeyError: batchJobId not in frJobsRunning
                # AttributeError: frJobsRunning does not support pop()
                pass
            else:
                continue

        # Machines which failed to boot/died/got canceled (return code != 0) -> down
        # A machine MAY fail to boot with return code 0 or we just missed some
        # states -> regular shutdown
        if mr[mid][self.mr.regStatus] != self.mr.statusDown:
            if batchJobId in frJobsCompleted:
                if mr[mid][self.mr.regStatus] == self.mr.statusBooting:
                    self.logger.info("VM (%s) failed to boot!" % batchJobId)
                elif frJobsCompleted[batchJobId] != "0":
                    self.logger.info("VM (%s) died!" % batchJobId)
                else:
                    self.logger.debug("VM (%s) died with status 0!" % batchJobId)
                self.mr.updateMachineStatus(mid, self.mr.statusDown)
        elif batchJobId in frJobsCompleted or self.mr.calcLastStateChange(mid) > 24 * 60 * 60:
            # Remove machines, which are:
            # 1. finished in ROCED & Freiburg // 2. Finished for more than 1 day
            #    [= job history purge time]
            self.mr.removeMachine(mid)
            continue
        elif batchJobId in frJobsRunning:
            # ROCED machine down, but job still running
            frJobsRunning.pop(batchJobId)
            if self.mr.calcLastStateChange(mid) > 5 * 60:
                self.__cancelFreiburgMachines(batchJobId)
            continue

        # The status is read again on purpose: it may have been changed above.
        if mr[mid][self.mr.regStatus] == self.mr.statusBooting:
            if batchJobId in frJobsRunning:
                # batch job running: machine -> up
                self.mr.updateMachineStatus(mid, self.mr.statusUp)
                frJobsRunning.pop(batchJobId)
            elif batchJobId not in frJobsIdle and batchJobId not in frJobsCompleted:
                # Machine disappeared. If the machine later appears again, it
                # will be added automatically.
                self.mr.updateMachineStatus(mid, self.mr.statusDown)

    # All remaining unaccounted batch jobs
    for batchJobId in frJobsRunning:
        mid = self.mr.newMachine()
        machine = self.mr.machines[mid]
        # TODO: try to identify machine type, using cores & wall-time
        machine[self.mr.regSite] = self.siteName
        machine[self.mr.regSiteType] = self.siteType
        machine[self.mr.regMachineType] = self.__default_machine
        machine[self.regMachineJobId] = batchJobId
        machine[self.reg_site_server_condor_name] = self.__getCondorName(batchJobId)
        self.mr.updateMachineStatus(mid, self.mr.statusUp)

    self.logger.info("Machines using resources (Freiburg): %d" % self.cloudOccupyingMachinesCount)

    with JsonLog() as jsonLog:
        jsonLog.addItem(self.siteName, "condor_nodes",
                        len(self.getSiteMachines(status=self.mr.statusWorking)))
        # Only machines pending disintegration which report a draining status.
        jsonLog.addItem(self.siteName, "condor_nodes_draining",
                        len([mid for mid in
                             self.getSiteMachines(status=self.mr.statusPendingDisintegration)
                             if HTCondor.calcDrainStatus(mid)[1] is True]))
        jsonLog.addItem(self.siteName, "machines_requested",
                        len(self.getSiteMachines(status=self.mr.statusBooting)) +
                        len(self.getSiteMachines(status=self.mr.statusUp)) +
                        len(self.getSiteMachines(status=self.mr.statusIntegrating)))
def terminateMachines(self, machineType, count):
    """Terminate machines in Freiburg.

    Working machines are untouched by default, but they may get put into drain
    mode if the configuration is set accordingly.

    :param machineType: machine type passed on to getSiteMachines as filter
    :param count: maximum number of machines taken from the candidate list
    :return: None
    """
    # Read the configuration flag once for the whole call.
    drainWorkingMachines = self.getConfig(self.configDrainWorkingMachines) is True

    # booting machines, sorted by request time (newest first).
    bootingMachines = self.getSiteMachines(self.mr.statusBooting, machineType)
    try:
        bootingMachines = sorted(bootingMachines.items(),
                                 key=lambda machine_: machine_[1][self.mr.regStatusLastUpdate],
                                 reverse=True)
    except KeyError:
        bootingMachines = []

    if drainWorkingMachines:
        # Running machines, sorted by load. These machines are put into drain mode.
        # NOTE(review): the original comment says "idle first", but
        # reverse=True puts the highest calcMachineLoad value first -- confirm
        # which order is intended.
        workingMachines = merge_dicts(
            self.getSiteMachines(self.mr.statusIntegrating, machineType),
            self.getSiteMachines(self.mr.statusWorking, machineType),
            self.getSiteMachines(self.mr.statusPendingDisintegration, machineType))
        try:
            workingMachines = sorted(workingMachines.items(),
                                     key=lambda machine_: HTCondor.calcMachineLoad(machine_[0]),
                                     reverse=True)
        except KeyError:
            workingMachines = []
        # Merge lists: booting machines are always considered first.
        machinesToRemove = bootingMachines + workingMachines
    else:
        machinesToRemove = bootingMachines

    # needed amount of machines
    machinesToRemove = machinesToRemove[0:count]

    # list of batch job ids to terminate/drain
    idsToTerminate = []
    idsToDrain = []
    idsRemoved = []
    idsInvalidated = []

    for mid, machine in machinesToRemove:
        if machine[self.mr.regStatus] == self.mr.statusBooting:
            # booting machines can be terminated immediately
            idsToTerminate.append(machine[self.regMachineJobId])
        elif drainWorkingMachines:
            # machines which already report a draining status are skipped
            if HTCondor.calcDrainStatus(mid)[1] is True:
                continue
            # working machines should be set to drain mode
            idsToDrain.append(machine[self.regMachineJobId])

    self.logger.debug("Machines to terminate (%d): %s" % (len(idsToTerminate), ", ".join(idsToTerminate)))
    if idsToTerminate:
        idsRemoved, idsInvalidated = self.__cancelFreiburgMachines(idsToTerminate)

    self.logger.debug("Machines to drain (%d): %s" % (len(idsToDrain), ", ".join(idsToDrain)))
    if idsToDrain:
        for mid, machine in self.getSiteMachines().items():
            if machine[self.regMachineJobId] in idsToDrain:
                HTCondor.drainMachine(mid)

    # Removed and invalidated batch jobs both end up as "down" machines.
    idsCancelled = idsRemoved + idsInvalidated
    if len(idsCancelled) > 0:
        # update status
        for mid, machine in self.getSiteMachines().items():
            if machine[self.regMachineJobId] in idsCancelled:
                self.mr.updateMachineStatus(mid, self.mr.statusDown)