def __init__(self, config):
    """
    Initialize

    Copies the AgentStatusWatcher settings and the agent team name from
    config onto the instance and creates the Dashboard, ResourceControl
    and WMStatsReader objects.
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    # task types grouped by the kind of threshold they share
    self.tasksCPU = ['Processing', 'Production']
    self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
    self.minCPUSlots = 50
    self.minIOSlots = 25

    # every remaining setting but the team name lives in this section
    watcherCfg = config.AgentStatusWatcher

    # dashboard url and the metric columns to query
    self.dashboard = watcherCfg.dashboard
    self.siteStatusMetric = watcherCfg.siteStatusMetric
    self.cpuBoundMetric = watcherCfg.cpuBoundMetric
    self.ioBoundMetric = watcherCfg.ioBoundMetric
    self.ssb = Dashboard(self.dashboard)

    # pending/running percentages
    self.pendingSlotsSitePercent = watcherCfg.pendingSlotsSitePercent
    self.pendingSlotsTaskPercent = watcherCfg.pendingSlotsTaskPercent
    self.runningExpressPercent = watcherCfg.runningExpressPercent
    self.runningRepackPercent = watcherCfg.runningRepackPercent

    # sites forced to down (optional setting)
    self.forceSiteDown = getattr(watcherCfg, 'forceSiteDown', [])

    # agent team (for dynamic threshold) and queueParams (drain mode)
    self.teamName = config.Agent.teamName
    self.agentsNumByTeam = getattr(watcherCfg, 'defaultAgentsNumByTeam', 5)

    # only SSB sites
    self.onlySSB = watcherCfg.onlySSB

    # tier mode: on whenever the configuration has a Tier0Feeder section
    self.tier0Mode = hasattr(config, "Tier0Feeder")
    self.t1SitesCores = watcherCfg.t1SitesCores

    # switch this component on/off (optional setting, on by default)
    self.enabled = getattr(watcherCfg, 'enabled', True)

    # resource control and wmstats connections
    self.resourceControl = ResourceControl(config=self.config)
    self.centralCouchDBReader = WMStatsReader(watcherCfg.centralWMStatsURL)
def __init__(self, config):
    """
    Initialize

    Stores the configuration, copies the AgentStatusWatcher settings onto
    the instance and builds the Dashboard, ResourceControl and
    WMStatsReader helpers.
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    self.tasksCPU = ['Processing', 'Production']
    self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
    self.minCPUSlots, self.minIOSlots = 50, 25

    watcher = config.AgentStatusWatcher

    # dashboard url and metric columns, copied verbatim from the config
    for option in ('dashboard', 'siteStatusMetric', 'cpuBoundMetric', 'ioBoundMetric'):
        setattr(self, option, getattr(watcher, option))
    self.ssb = Dashboard(self.dashboard)

    # pending and running percentages, copied verbatim from the config
    for option in ('pendingSlotsSitePercent', 'pendingSlotsTaskPercent',
                   'runningExpressPercent', 'runningRepackPercent'):
        setattr(self, option, getattr(watcher, option))

    # sites forced to down
    self.forceSiteDown = getattr(watcher, 'forceSiteDown', [])

    # agent team (for dynamic threshold) and queueParams (drain mode)
    self.teamName = config.Agent.teamName
    self.agentsNumByTeam = getattr(watcher, 'defaultAgentsNumByTeam', 5)

    # only SSB sites
    self.onlySSB = watcher.onlySSB

    # tier mode
    self.tier0Mode = hasattr(config, "Tier0Feeder")
    self.t1SitesCores = watcher.t1SitesCores

    # switch this component on/off
    self.enabled = getattr(watcher, 'enabled', True)

    # set resource control
    self.resourceControl = ResourceControl(config=self.config)

    # wmstats connection
    self.centralCouchDBReader = WMStatsReader(watcher.centralWMStatsURL)
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """

    def __init__(self, config):
        """
        Initialize

        Reads the AgentStatusWatcher settings and the agent team name from
        config and creates the Dashboard, ResourceControl and WMStatsReader
        objects used by the other methods.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # task types sharing the CPU-bound / IO-bound thresholds
        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        # lower bounds used when computing slots and pending slots
        self.minCPUSlots = 50
        self.minIOSlots = 25

        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between
        resource control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor classAd fetch)
            5. Change site thresholds when needed (and task thresholds)
        Sites from SSB are validated with PhEDEx node names

        Any exception raised during the cycle is logged and swallowed here.
        The parameters argument is not used.
        """
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            # first, update site status
            ssbSiteStatus = self.getSiteStatus()
            self.checkStatusChanges(sitesRC, ssbSiteStatus)

            # now fetch site slots thresholds
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
                return

            logging.debug("Info from SSB: %s", sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view for agents and teams and store the number of
        agents of this agent's team in self.agentsNumByTeam. In drain mode
        the value is forced to 1; when the view cannot be read the current
        value is kept.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception:
            # best effort: fall through with the empty dict and keep the current value
            logging.error("WMStats is not available or is unresponsive.")

        if not agentsByTeam:
            logging.warning("agentInfo couch view is not available, use default value %s", self.agentsNumByTeam)
        else:
            self.agentsNumByTeam = agentsByTeam.get(self.teamName, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d", self.agentsNumByTeam)

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get CPU bound and IO bound slots from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name.
        """
        ssbCpuSlots = self.ssb.getMetric(self.cpuBoundMetric)
        ssbIoSlots = self.ssb.getMetric(self.ioBoundMetric)
        return self.thresholdsByVOName(ssbCpuSlots, ssbIoSlots)

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Note that sites listed in forceSiteDown are removed from infoSSB
        (the dict is modified in place).
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            # drop it so that the SSB state below cannot override the forced state
            infoSSB.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', site)
                    self.updateSiteState(site, 'Down')

        # normally set all the others
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site, infoRC[site]['state'],
                             infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])

    def checkSlotsChanges(self, infoRC, infoSSB):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        The task level thresholds of every site present in both infoRC and
        infoSSB are checked as well. The slot values stored in infoSSB are
        adjusted in place.
        """
        logging.debug("Settings for site and task pending slots: %s%% and %s%%", self.pendingSlotsSitePercent,
                      self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            if self.tier0Mode and site.startswith('T1_'):
                # T1 cores utilization for Tier0: take t1SitesCores percent of
                # the slots. Multiply before dividing and truncate, so the
                # result is an integer slot count regardless of whether "/"
                # is integer or true division.
                infoSSB[site]['slotsCPU'] = int(infoSSB[site]['slotsCPU'] * self.t1SitesCores / 100)
                infoSSB[site]['slotsIO'] = int(infoSSB[site]['slotsIO'] * self.t1SitesCores / 100)
            else:
                # round very small sites to the bare minimum
                infoSSB[site]['slotsCPU'] = max(infoSSB[site]['slotsCPU'], self.minCPUSlots)
                infoSSB[site]['slotsIO'] = max(infoSSB[site]['slotsIO'], self.minIOSlots)
            CPUBound = infoSSB[site]['slotsCPU']
            IOBound = infoSSB[site]['slotsIO']

            sitePending = max(int(CPUBound / self.agentsNumByTeam * self.pendingSlotsSitePercent / 100),
                              self.minCPUSlots)

            # update site slots, if needed
            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.info("Updating %s site thresholds for pend/runn: %d/%d", site, sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(site, pendingJobSlots=sitePending,
                                                        runningJobSlots=CPUBound)

            # now handle the task level thresholds
            self.checkTaskSlotsChanges(site, CPUBound, IOBound)

    def thresholdsByVOName(self, infoCpu, infoIo):
        """
        _thresholdsByVOName_

        Creates a dictionary with CPU and IO slots keyed by the site name.
        If any of the thresholds is missing or has an invalid value, the whole
        site thresholds is skipped.

        :param infoCpu: iterable of dicts with 'VOName' and 'Value' keys (CPU slots)
        :param infoIo: iterable of dicts with 'VOName' and 'Value' keys (IO slots)
        :return: dict like {site: {'slotsCPU': int, 'slotsIO': int}}
        """
        ssbSiteSlots = {}
        for entry in infoCpu:
            if entry['Value'] is None:
                logging.warning('Site %s has invalid CPU thresholds in SSB. Taking no action', entry['VOName'])
            else:
                ssbSiteSlots[entry['VOName']] = {'slotsCPU': int(entry['Value'])}

        # then iterate over the IO slots
        for entry in infoIo:
            if entry['Value'] is None:
                logging.warning('Site %s has invalid IO thresholds in SSB. Taking no action', entry['VOName'])
            elif entry['VOName'] not in ssbSiteSlots:
                # no (valid) CPU entry for this site: skip it instead of raising KeyError
                logging.warning('Site %s does not have CPU thresholds in SSB. Taking no action', entry['VOName'])
            else:
                ssbSiteSlots[entry['VOName']]['slotsIO'] = int(entry['Value'])

        # Before proceeding, remove sites without both metrics. Iterate over a
        # snapshot of the keys since entries are removed inside the loop.
        for site in list(ssbSiteSlots):
            if len(ssbSiteSlots[site]) != 2:
                logging.warning("Site: %s has incomplete SSB metrics, see %s", site, ssbSiteSlots[site])
                ssbSiteSlots.pop(site)

        return ssbSiteSlots

    def getSiteStatus(self):
        """
        _getSiteStatus_

        Fetch site state from SSB and map it to agent state.

        Returns a dict like {site: {'state': agentState}}. Entries with an
        unknown status are logged and left out; for duplicated sites only
        the first entry is used.
        """
        ssbState = self.ssb.getMetric(self.siteStatusMetric)

        ssbSiteState = {}
        for site in ssbState:
            voname = site['VOName']
            status = site['Status']
            if voname not in ssbSiteState:
                statusAgent = self.getState(str(status))
                if not statusAgent:
                    logging.error("Unknown status '%s' for site %s, please check SSB", status, voname)
                else:
                    ssbSiteState[voname] = {'state': statusAgent}
            else:
                logging.warning('I have a duplicated status entry in SSB for %s', voname)

        return ssbSiteState

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state.
        Returns None for a state that is not known.
        """
        ssb2agent = {'enabled': 'Normal',
                     'drain': 'Draining',
                     'disabled': 'Down',
                     # 'test' state behaviour varies between production and tier0 agents
                     'test': 'Normal' if self.tier0Mode else "Draining"}
        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        A failure is logged and not propagated to the caller.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())

    def checkTaskSlotsChanges(self, siteName, CPUBound, IOBound):
        """
        _checkTaskSlotsChanges_

        Update the CPU and IOBound slots for a given site.

        The pending slots of the first task threshold returned by resource
        control are compared against the expected value; when they differ,
        or when the site has no task thresholds at all, the thresholds of
        all the CPU and IO task types are (re)inserted. In tier0 mode the
        Express and Repack thresholds are always inserted.
        """
        siteTaskSlots = self.resourceControl.thresholdBySite(siteName)
        taskCPUPending = max(int(CPUBound / self.agentsNumByTeam * self.pendingSlotsTaskPercent / 100),
                             self.minCPUSlots)
        taskIOPending = max(int(IOBound / self.agentsNumByTeam * self.pendingSlotsTaskPercent / 100),
                            self.minIOSlots)

        updateTasks = False
        if not siteTaskSlots:
            # Nothing to compare against. Treat it as out-of-date rather than
            # failing on siteTaskSlots[0], which would abort the whole cycle.
            updateTasks = True
        else:
            # only the first threshold is inspected; all of them are written together below
            firstTask = siteTaskSlots[0]
            if firstTask['task_type'] in self.tasksCPU and firstTask['task_pending_slots'] != taskCPUPending:
                updateTasks = True
            elif firstTask['task_type'] in self.tasksIO and firstTask['task_pending_slots'] != taskIOPending:
                updateTasks = True

        if updateTasks:
            logging.info("Updating %s CPU tasks thresholds for pend/runn: %d/%d", siteName,
                         taskCPUPending, CPUBound)
            self.resourceControl.insertThreshold(siteName, taskType=self.tasksCPU, maxSlots=CPUBound,
                                                 pendingSlots=taskCPUPending)
            logging.info("Updating %s IO tasks thresholds for pend/runn: %d/%d", siteName,
                         taskIOPending, IOBound)
            self.resourceControl.insertThreshold(siteName, taskType=self.tasksIO, maxSlots=IOBound,
                                                 pendingSlots=taskIOPending)

        if self.tier0Mode:
            # Set task thresholds for Tier0
            logging.debug("Updating %s Express and Repack task thresholds.", siteName)
            expressSlots = int(CPUBound * self.runningExpressPercent / 100)
            pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName, 'Express', expressSlots, pendingExpress)

            repackSlots = int(CPUBound * self.runningRepackPercent / 100)
            pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName, 'Repack', repackSlots, pendingRepack)
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """

    def __init__(self, config):
        """
        Initialize

        Reads the AgentStatusWatcher settings and the agent team name from
        config and creates the Dashboard, ResourceControl and WMStatsReader
        objects used by the other methods.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # task types sharing the CPU-bound / IO-bound thresholds
        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        # lower bounds used when computing slots and pending slots
        self.minCPUSlots = 50
        self.minIOSlots = 25

        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between
        resource control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor classAd fetch)
            5. Change site thresholds when needed (and task thresholds)
        Sites from SSB are validated with PhEDEx node names

        Any exception raised during the cycle is logged and swallowed here.
        The parameters argument is not used.
        """
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            # first, update site status
            ssbSiteStatus = self.getSiteStatus()
            self.checkStatusChanges(sitesRC, ssbSiteStatus)

            # now fetch site slots thresholds
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
                return

            logging.debug("Info from SSB: %s", sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view for agents and teams and store the number of
        agents of this agent's team in self.agentsNumByTeam. In drain mode
        the value is forced to 1; when the view cannot be read the current
        value is kept.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception:
            # best effort: fall through with the empty dict and keep the current value
            logging.error("WMStats is not available or is unresponsive.")

        if not agentsByTeam:
            logging.warning("agentInfo couch view is not available, use default value %s", self.agentsNumByTeam)
        else:
            self.agentsNumByTeam = agentsByTeam.get(self.teamName, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d", self.agentsNumByTeam)

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get CPU bound and IO bound slots from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name.
        """
        ssbCpuSlots = self.ssb.getMetric(self.cpuBoundMetric)
        ssbIoSlots = self.ssb.getMetric(self.ioBoundMetric)
        return self.thresholdsByVOName(ssbCpuSlots, ssbIoSlots)

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Note that sites listed in forceSiteDown are removed from infoSSB
        (the dict is modified in place).
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            # drop it so that the SSB state below cannot override the forced state
            infoSSB.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', site)
                    self.updateSiteState(site, 'Down')

        # normally set all the others
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site, infoRC[site]['state'],
                             infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])

    def checkSlotsChanges(self, infoRC, infoSSB):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        The task level thresholds of every site present in both infoRC and
        infoSSB are checked as well. The slot values stored in infoSSB are
        adjusted in place.
        """
        logging.debug("Settings for site and task pending slots: %s%% and %s%%", self.pendingSlotsSitePercent,
                      self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            if self.tier0Mode and site.startswith('T1_'):
                # T1 cores utilization for Tier0: take t1SitesCores percent of
                # the slots. Multiply before dividing and truncate, so the
                # result is an integer slot count regardless of whether "/"
                # is integer or true division.
                infoSSB[site]['slotsCPU'] = int(infoSSB[site]['slotsCPU'] * self.t1SitesCores / 100)
                infoSSB[site]['slotsIO'] = int(infoSSB[site]['slotsIO'] * self.t1SitesCores / 100)
            else:
                # round very small sites to the bare minimum
                infoSSB[site]['slotsCPU'] = max(infoSSB[site]['slotsCPU'], self.minCPUSlots)
                infoSSB[site]['slotsIO'] = max(infoSSB[site]['slotsIO'], self.minIOSlots)
            CPUBound = infoSSB[site]['slotsCPU']
            IOBound = infoSSB[site]['slotsIO']

            sitePending = max(int(CPUBound / self.agentsNumByTeam * self.pendingSlotsSitePercent / 100),
                              self.minCPUSlots)

            # update site slots, if needed
            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.info("Updating %s site thresholds for pend/runn: %d/%d", site, sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(site, pendingJobSlots=sitePending,
                                                        runningJobSlots=CPUBound)

            # now handle the task level thresholds
            self.checkTaskSlotsChanges(site, CPUBound, IOBound)

    def thresholdsByVOName(self, infoCpu, infoIo):
        """
        _thresholdsByVOName_

        Creates a dictionary with CPU and IO slots keyed by the site name.
        If any of the thresholds is missing or has an invalid value, the whole
        site thresholds is skipped.

        :param infoCpu: iterable of dicts with 'VOName' and 'Value' keys (CPU slots)
        :param infoIo: iterable of dicts with 'VOName' and 'Value' keys (IO slots)
        :return: dict like {site: {'slotsCPU': int, 'slotsIO': int}}
        """
        ssbSiteSlots = {}
        for entry in infoCpu:
            if entry['Value'] is None:
                logging.warning('Site %s has invalid thresholds in SSB. Taking no action', entry['VOName'])
                continue
            ssbSiteSlots[entry['VOName']] = {'slotsCPU': int(entry['Value'])}

        # then iterate over the IO slots
        for entry in infoIo:
            if entry['VOName'] not in ssbSiteSlots:
                logging.warning('Site %s does not have CPU thresholds in SSB. Taking no action', entry['VOName'])
                continue
            if entry['Value'] is None:
                logging.warning('Site %s has invalid thresholds in SSB. Taking no action', entry['VOName'])
                ssbSiteSlots.pop(entry['VOName'], None)
                continue
            ssbSiteSlots[entry['VOName']]['slotsIO'] = int(entry['Value'])

        # A site with a CPU entry but no IO entry at all is still incomplete
        # at this point; drop it so that callers can rely on both keys being
        # present. Iterate over a snapshot since entries are removed.
        for site in list(ssbSiteSlots):
            if 'slotsIO' not in ssbSiteSlots[site]:
                logging.warning('Site %s does not have IO thresholds in SSB. Taking no action', site)
                ssbSiteSlots.pop(site)

        return ssbSiteSlots

    def getSiteStatus(self):
        """
        _getSiteStatus_

        Fetch site state from SSB and map it to agent state.

        Returns a dict like {site: {'state': agentState}}. Entries with an
        unknown status are logged and left out; for duplicated sites only
        the first entry is used.
        """
        ssbState = self.ssb.getMetric(self.siteStatusMetric)

        ssbSiteState = {}
        for site in ssbState:
            voname = site['VOName']
            status = site['Status']
            if voname not in ssbSiteState:
                statusAgent = self.getState(str(status))
                if not statusAgent:
                    logging.error("Unknown status '%s' for site %s, please check SSB", status, voname)
                else:
                    ssbSiteState[voname] = {'state': statusAgent}
            else:
                logging.warning('I have a duplicated status entry in SSB for %s', voname)

        return ssbSiteState

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state.
        Returns None for a state that is not known.
        """
        ssb2agent = {'enabled': 'Normal',
                     'drain': 'Draining',
                     'disabled': 'Down',
                     # 'test' state behaviour varies between production and tier0 agents
                     'test': 'Normal' if self.tier0Mode else "Draining"}
        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        A failure is logged and not propagated to the caller.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())

    def checkTaskSlotsChanges(self, siteName, CPUBound, IOBound):
        """
        _checkTaskSlotsChanges_

        Update the CPU and IOBound slots for a given site.

        The pending slots of the first task threshold returned by resource
        control are compared against the expected value; when they differ,
        or when the site has no task thresholds at all, the thresholds of
        all the CPU and IO task types are (re)inserted. In tier0 mode the
        Express and Repack thresholds are always inserted.
        """
        siteTaskSlots = self.resourceControl.thresholdBySite(siteName)
        taskCPUPending = max(int(CPUBound / self.agentsNumByTeam * self.pendingSlotsTaskPercent / 100),
                             self.minCPUSlots)
        taskIOPending = max(int(IOBound / self.agentsNumByTeam * self.pendingSlotsTaskPercent / 100),
                            self.minIOSlots)

        updateTasks = False
        if not siteTaskSlots:
            # Nothing to compare against. Treat it as out-of-date rather than
            # failing on siteTaskSlots[0], which would abort the whole cycle.
            updateTasks = True
        else:
            # only the first threshold is inspected; all of them are written together below
            firstTask = siteTaskSlots[0]
            if firstTask['task_type'] in self.tasksCPU and firstTask['task_pending_slots'] != taskCPUPending:
                updateTasks = True
            elif firstTask['task_type'] in self.tasksIO and firstTask['task_pending_slots'] != taskIOPending:
                updateTasks = True

        if updateTasks:
            logging.info("Updating %s CPU tasks thresholds for pend/runn: %d/%d", siteName,
                         taskCPUPending, CPUBound)
            self.resourceControl.insertThreshold(siteName, taskType=self.tasksCPU, maxSlots=CPUBound,
                                                 pendingSlots=taskCPUPending)
            logging.info("Updating %s IO tasks thresholds for pend/runn: %d/%d", siteName,
                         taskIOPending, IOBound)
            self.resourceControl.insertThreshold(siteName, taskType=self.tasksIO, maxSlots=IOBound,
                                                 pendingSlots=taskIOPending)

        if self.tier0Mode:
            # Set task thresholds for Tier0
            logging.debug("Updating %s Express and Repack task thresholds.", siteName)
            expressSlots = int(CPUBound * self.runningExpressPercent / 100)
            pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName, 'Express', expressSlots, pendingExpress)

            repackSlots = int(CPUBound * self.runningRepackPercent / 100)
            pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName, 'Repack', repackSlots, pendingRepack)
def __init__(self, config):
    '''
    Initialise the RESTModel, drop its POST methods, create the Dashboard
    and SAM clients and register the GET methods.
    '''
    RESTModel.__init__(self, config)
    del self.methods['POST']
    validator = Validator({'dbi': self.dbi})

    dashboardSettings = {'endpoint': self.config.services.dashboard,
                         'cachepath': self.config.services.cachepath,
                         'logger': self}
    self.dashboard = Dashboard(dict=dashboardSettings)

    samSettings = {'endpoint': self.config.services.sam,
                   'cachepath': self.config.services.cachepath,
                   'cert': config.services.hostcert,
                   'key': config.services.hostkey,
                   'logger': self}
    self.samtests = SAM(dict=samSettings)

    def schemeAndName(*extra):
        # a new list on every call, so that no two methods share one
        return [validator.validate_scheme, validator.validate_name] + [check for check in extra]

    def describe(args, call, version, validation=None):
        # the 'validation' key is only present for methods that have checks
        spec = {'args': args, 'call': call, 'version': version}
        if validation is not None:
            spec['validation'] = validation
        return spec

    getMethods = {}
    getMethods['list'] = describe(['name', 'scheme'], self.list, 2, schemeAndName())
    getMethods['status'] = describe(['name'], self.status, 2, schemeAndName())
    getMethods['software'] = describe(['name'], self.software, 2, schemeAndName())
    getMethods['resource_element'] = describe(['name', 'type'], self.resource_element, 2,
                                              schemeAndName(validator.validate_resource_type))
    getMethods['resource_pledge'] = describe(['name', 'quarter'], self.resource_pledge, 2,
                                             schemeAndName(validator.validate_quarter))
    getMethods['pledge_history'] = describe(['name'], self.pledge_history, 1)
    getMethods['contacts'] = describe(['name', 'role'], self.contacts, 2,
                                      schemeAndName(validator.validate_role))
    getMethods['groups'] = describe(['name'], self.groups, 1)
    getMethods['links'] = describe(['name'], self.links, 1, schemeAndName())
    getMethods['associations'] = describe(['parent', 'child', 'scheme'], self.associations, 1,
                                          [validator.validate_scheme,
                                           validator.validate_associations])
    getMethods['names'] = describe(['name', 'scheme', 'limit'], self.names, 1,
                                   [validator.validate_scheme,
                                    validator.validate_limit_scheme,
                                    validator.validate_name])
    self.methods['GET'] = getMethods
class Get(RESTModel):
    '''
    Get: Get data related to the sites known to SiteDB
    '''

    def __init__(self, config):
        '''
        Initialise the RESTModel and add some methods to it.

        The POST methods are removed, the Dashboard and SAM clients are
        created and the GET methods are registered together with their
        argument names, version and validation functions.
        '''
        RESTModel.__init__(self, config)

        del self.methods['POST']

        validator = Validator({'dbi': self.dbi})

        self.dashboard = Dashboard(dict={
                        'endpoint': self.config.services.dashboard,
                        'cachepath': self.config.services.cachepath,
                        'logger': self})

        self.samtests = SAM(dict={
                        'endpoint': self.config.services.sam,
                        'cachepath': self.config.services.cachepath,
                        'cert': config.services.hostcert,
                        'key': config.services.hostkey,
                        'logger': self})

        self.methods['GET'] = {
            'list': {'args': ['name', 'scheme'],
                     'call': self.list,
                     'version': 2,
                     'validation': [validator.validate_scheme,
                                    validator.validate_name]},
            'status': {'args': ['name'],
                       'call': self.status,
                       'version': 2,
                       'validation': [validator.validate_scheme,
                                      validator.validate_name]},
            'software': {'args': ['name'],
                         'call': self.software,
                         'version': 2,
                         'validation': [validator.validate_scheme,
                                        validator.validate_name]},
            'resource_element': {'args': ['name', 'type'],
                                 'call': self.resource_element,
                                 'version': 2,
                                 'validation': [validator.validate_scheme,
                                                validator.validate_name,
                                                validator.validate_resource_type]},
            'resource_pledge': {'args': ['name', 'quarter'],
                                'call': self.resource_pledge,
                                'version': 2,
                                'validation': [validator.validate_scheme,
                                               validator.validate_name,
                                               validator.validate_quarter]},
            'pledge_history': {'args': ['name'],
                               'call': self.pledge_history,
                               'version': 1},
            'contacts': {'args': ['name', 'role'],
                         'call': self.contacts,
                         'version': 2,
                         'validation': [validator.validate_scheme,
                                        validator.validate_name,
                                        validator.validate_role]},
            'groups': {'args': ['name'],
                       'call': self.groups,
                       'version': 1},
            'links': {'args': ['name'],
                      'call': self.links,
                      'version': 1,
                      'validation': [validator.validate_scheme,
                                     validator.validate_name]},
            'associations': {'args': ['parent', 'child', 'scheme'],
                             'call': self.associations,
                             'version': 1,
                             'validation': [validator.validate_scheme,
                                            validator.validate_associations]},
            'names': {'args': ['name', 'scheme', 'limit'],
                      'call': self.names,
                      'version': 1,
                      'validation': [validator.validate_scheme,
                                     validator.validate_limit_scheme,
                                     validator.validate_name]}}

    def list(self, *args, **kwargs):
        """
        Return a list of sites matching name in the chosen format

        Args: name='T%', scheme='cms_name'

        Returns a dict with the bind variables used ('binds') and the
        matching rows ('sitelist'). Every name is matched as a prefix.
        """
        params = self.sanitise_input(args, kwargs, 'list')
        binds = [{'name': n + '%'} for n in self.makelist(params['name'])]

        scheme = params['scheme']
        if scheme == 'resource':
            sql = """select * from siteinfo_v2 where id in (
                        select site from resource_element_v2
                        where fqdn like :name)"""
        elif scheme == 'lcg_name':
            # TODO: this needs a schema change and a refactor...
            sql = """select * from siteinfo_v2 where id in(select SITE_CMS_NAME_MAP.SITE_ID from SAM_NAME
                        join SAM_CMS_NAME_MAP on SAM_CMS_NAME_MAP.SAM_ID = SAM_NAME.id
                        join SITE_CMS_NAME_MAP on SITE_CMS_NAME_MAP.CMS_NAME_ID = SAM_CMS_NAME_MAP.CMS_NAME_ID
                        where SAM_NAME.NAME like :name)"""
        else:
            # NOTE(review): the scheme is interpolated into the statement as a
            # column name (it cannot be a bind variable). This presumably
            # relies on validator.validate_scheme only letting known column
            # names through - confirm.
            sql = "select * from siteinfo_v2 where %s like :name" % scheme

        result = self.dbi.processData(sql, binds)
        data = self.formatDict(result)
        return {'binds': binds, 'sitelist': data}

    def status(self, *args, **kwargs):
        """
        return the status of a given site, as reported by the dashboard client
        Args: name
        """
        params = self.sanitise_input(args, kwargs, 'status')
        return self.dashboard.getStatus(name=params['name'])

    def software(self, *args, **kwargs):
        """
        Return a list of software installed at the site as reported by SAM
        tests and it's pin status.

        Args: names

        Returns a list with one dict per CE of the site, keyed by the CE
        fqdn, holding the installed releases ('installed'), the pinned
        releases grouped by architecture ('pinned') and the manual install
        flag ('manual'). The flag is False when the CE has no row for it.

        TODO: add in pin status
        """
        params = self.sanitise_input(args, kwargs, 'software')
        celist = self.resource_element(name=params['name'], type='CE')
        sw = []
        pinsql = """select release, arch from pinned_releases
                        where ce_id = (select id from resource_element_v2 where fqdn = :ce)"""
        mansql = """select MANUALINSTALL from resource_element_v2
                        where fqdn = :ce and RESTYPE='CE'"""
        for ce in celist['resource_element']:
            fqdn = ce['fqdn']

            # pinned releases, grouped by architecture
            result = self.dbi.processData(pinsql, {'ce': fqdn})
            sorted_pins = {}
            for pin in self.formatDict(result):
                sorted_pins.setdefault(pin['arch'], []).append(pin['release'])

            # manual install flag; guard against an empty result instead of
            # failing with IndexError
            result = self.dbi.processData(mansql, {'ce': fqdn})
            rows = self.formatDict(result)
            manual = bool(rows and rows[0]['manualinstall'])

            installed = self.samtests.getCMSSWInstalls(fqdn)
            sw.append({fqdn: {'installed': installed,
                              'pinned': sorted_pins,
                              'manual': manual}})
        return sw

    def resource_element(self, *args, **kwargs):
        """
        Return the names of a resource element of _type_ for _site_

        Args: name, type

        Returns a dict with the matching rows ('resource_element') and the
        bind variables used ('binds'). Every name is matched as a prefix.
        """
        params = self.sanitise_input(args, kwargs, 'resource_element')
        sql = """select resource_element_v2.fqdn, resource_element_v2.restype, siteinfo_v2.cms_name
                    from resource_element_v2
                    join siteinfo_v2 on siteinfo_v2.id = resource_element_v2.site
                    where siteinfo_v2.cms_name like :name and restype like :type"""
        binds = [{'name': n + '%', 'type': params['type']}
                 for n in self.makelist(params['name'])]
        result = self.dbi.processData(sql, binds)
        return {'resource_element': self.formatDict(result),
                'binds': binds}

    def resource_pledge(self, *args, **kwargs):
        """
        Return the pledged resources available at _site_ during _quarter_

        Args: names, quarter

        On success returns a dict with the pledge rows ('resource_pledge'),
        the per-resource sums over those rows ('resource_totals') and the
        bind variables used ('binds'). On failure the error is logged and a
        dict describing the exception is returned instead.
        """
        params = self.sanitise_input(args, kwargs, 'resource_pledge')
        sql = """select siteinfo_v2.cms_name, max(PLEDGEQUARTER) quarter_pledged,
                    cpu, job_slots, disk_store, tape_store, wan_store,
                    local_store, national_bandwidth, opn_bandwidth
                from resource_pledge
                join siteinfo_v2 on siteinfo_v2.id = RESOURCE_PLEDGE.site
                where siteinfo_v2.cms_name like :site
                and PLEDGEQUARTER <= :quarter
                and pledgedate in (
                    select max(RESOURCE_PLEDGE.pledgedate) from RESOURCE_PLEDGE
                    join siteinfo_v2 on siteinfo_v2.id = RESOURCE_PLEDGE.site
                    where siteinfo_v2.cms_name like :site
                    and PLEDGEQUARTER <= :quarter
                    group by cms_name
                )
                group by siteinfo_v2.cms_name, cpu, job_slots, disk_store,
                    tape_store, wan_store, local_store, national_bandwidth,
                    opn_bandwidth
                order by siteinfo_v2.cms_name, max(PLEDGEQUARTER) desc"""
        summed_fields = ('job_slots', 'local_store', 'wan_store', 'disk_store',
                         'tape_store', 'national_bandwidth', 'opn_bandwidth', 'cpu')
        data = {}
        # defined outside the try block so that the error path can always report it
        binds = []
        try:
            for n in self.makelist(params['name']):
                binds.append({'site': n + '%', 'quarter': params['quarter']})
            result = self.dbi.processData(sql, binds)
            pledges = self.formatDict(result)
            data['resource_pledge'] = pledges

            # Sum every resource over all the rows. Done field by field so
            # that no rows yields zeros and a single row yields the same shape
            # as several rows (a reduce() without initial value fails on an
            # empty list and hands back the raw row for a single one).
            totals = {}
            for field in summed_fields:
                totals[field] = sum(row.get(field, 0) for row in pledges)
            data['resource_totals'] = totals
            data['binds'] = binds
        except Exception as e:
            self.exception("Could not get resource_pledge for input: %s" % (params,))
            # NOTE: the misspelt "execeptiontype" key is kept on purpose,
            # clients may already depend on it
            data = {"exception": e,
                    "message": "Could not get resource_pledge",
                    "execeptiontype": str(type(e)).split("'")[1],
                    'binds': binds}
        return data