예제 #1
0
    def __init__(self, config):
        """
        Build the worker from the agent configuration.

        Copies the AgentStatusWatcher settings onto the instance and creates
        the Dashboard, ResourceControl and WMStatsReader objects.

        :param config: agent configuration; the AgentStatusWatcher and Agent
            sections are read here. 'forceSiteDown', 'defaultAgentsNumByTeam'
            and 'enabled' are optional and default to [], 5 and True.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # task types per slot kind and the minimum slot values for each kind
        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        watcherConfig = config.AgentStatusWatcher

        # dashboard url and the metric columns to query
        self.dashboard = watcherConfig.dashboard
        self.siteStatusMetric = watcherConfig.siteStatusMetric
        self.cpuBoundMetric = watcherConfig.cpuBoundMetric
        self.ioBoundMetric = watcherConfig.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # percentages applied to pending and running slots
        self.pendingSlotsSitePercent = watcherConfig.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcherConfig.pendingSlotsTaskPercent
        self.runningExpressPercent = watcherConfig.runningExpressPercent
        self.runningRepackPercent = watcherConfig.runningRepackPercent

        # sites forced to down (optional setting)
        self.forceSiteDown = getattr(watcherConfig, 'forceSiteDown', [])

        # team of this agent and the default number of agents per team
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(watcherConfig,
                                       'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = watcherConfig.onlySSB

        # tier0 mode is on when the configuration has a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = watcherConfig.t1SitesCores

        # switch this component on/off (optional setting)
        self.enabled = getattr(watcherConfig, 'enabled', True)

        # resource control and wmstats connections
        self.resourceControl = ResourceControl(config=self.config)
        self.centralCouchDBReader = WMStatsReader(
            watcherConfig.centralWMStatsURL)
예제 #2
0
    def __init__(self, config):
        """
        Build the worker from the agent configuration.

        Copies the AgentStatusWatcher settings onto the instance and creates
        the Dashboard, ResourceControl and WMStatsReader objects.

        :param config: agent configuration; the AgentStatusWatcher and Agent
            sections are read here. 'forceSiteDown', 'defaultAgentsNumByTeam'
            and 'enabled' are optional and default to [], 5 and True.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # task types per slot kind and the minimum slot values for each kind
        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        watcherConfig = config.AgentStatusWatcher

        # dashboard url and the metric columns to query
        self.dashboard = watcherConfig.dashboard
        self.siteStatusMetric = watcherConfig.siteStatusMetric
        self.cpuBoundMetric = watcherConfig.cpuBoundMetric
        self.ioBoundMetric = watcherConfig.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # percentages applied to pending and running slots
        self.pendingSlotsSitePercent = watcherConfig.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcherConfig.pendingSlotsTaskPercent
        self.runningExpressPercent = watcherConfig.runningExpressPercent
        self.runningRepackPercent = watcherConfig.runningRepackPercent

        # sites forced to down (optional setting)
        self.forceSiteDown = getattr(watcherConfig, 'forceSiteDown', [])

        # team of this agent and the default number of agents per team
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(watcherConfig, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = watcherConfig.onlySSB

        # tier0 mode is on when the configuration has a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = watcherConfig.t1SitesCores

        # switch this component on/off (optional setting)
        self.enabled = getattr(watcherConfig, 'enabled', True)

        # resource control and wmstats connections
        self.resourceControl = ResourceControl(config=self.config)
        self.centralCouchDBReader = WMStatsReader(watcherConfig.centralWMStatsURL)
예제 #3
0
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """
    def __init__(self, config):
        """
        Build the worker from the agent configuration.

        Copies the AgentStatusWatcher settings onto the instance and creates
        the Dashboard, ResourceControl and WMStatsReader objects.

        :param config: agent configuration; the AgentStatusWatcher and Agent
            sections are read here. 'forceSiteDown', 'defaultAgentsNumByTeam'
            and 'enabled' are optional and default to [], 5 and True.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # task types per slot kind; the minimums are the floor applied to the
        # computed slot numbers
        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        watcherConfig = config.AgentStatusWatcher

        # dashboard url and the metric columns to query
        self.dashboard = watcherConfig.dashboard
        self.siteStatusMetric = watcherConfig.siteStatusMetric
        self.cpuBoundMetric = watcherConfig.cpuBoundMetric
        self.ioBoundMetric = watcherConfig.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # percentages applied to pending and running slots
        self.pendingSlotsSitePercent = watcherConfig.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcherConfig.pendingSlotsTaskPercent
        self.runningExpressPercent = watcherConfig.runningExpressPercent
        self.runningRepackPercent = watcherConfig.runningRepackPercent

        # sites forced to down (optional setting)
        self.forceSiteDown = getattr(watcherConfig, 'forceSiteDown', [])

        # team of this agent and the default number of agents per team,
        # used until getAgentsByTeam() refreshes it
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(watcherConfig,
                                       'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = watcherConfig.onlySSB

        # tier0 mode is on when the configuration has a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = watcherConfig.t1SitesCores

        # switch this component on/off (optional setting)
        self.enabled = getattr(watcherConfig, 'enabled', True)

        # resource control and wmstats connections
        self.resourceControl = ResourceControl(config=self.config)
        self.centralCouchDBReader = WMStatsReader(
            watcherConfig.centralWMStatsURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        _algorithm_

        Run one update cycle. Does nothing when the component is disabled.
        Otherwise:
            1. read the site slots from the resource control database
            2. read the site status from SSB and apply state changes
            3. read the CPU/IO slot metrics from SSB; stop if none came back
            4. refresh the number of agents in this team
            5. apply site and task threshold changes
        Any exception raised along the way is logged and swallowed, so the
        cycle is simply retried on the next call.

        :param parameters: not used by this method
        """
        if not self.enabled:
            logging.info(
                "This component is not enabled in the configuration. Doing nothing."
            )
            return

        try:
            rcSites = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", rcSites)

            # site states go first
            siteStates = self.getSiteStatus()
            self.checkStatusChanges(rcSites, siteStates)

            # then the slot thresholds
            ssbSlots = self.getInfoFromSSB()
            if ssbSlots:
                logging.debug("Info from SSB: %s", ssbSlots)

                # number of agents in the same team (not in DrainMode)
                self.getAgentsByTeam()

                # update whatever differs from the database
                self.checkSlotsChanges(rcSites, ssbSlots)
            else:
                logging.error(
                    "One or more of the SSB metrics is down. Please contact the Dashboard team."
                )
                return
        except Exception as err:
            logging.error("Error occurred, will retry later:")
            logging.error(str(err))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info(
            "Resource control cycle finished updating site state and thresholds."
        )

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Refresh self.agentsNumByTeam.

        In drain mode the value is forced to 1. Otherwise the agents-by-team
        view is read from WMStats (agents in drain filtered out) and the
        entry for self.teamName is used; when the view is empty, unavailable
        or has no entry for this team, the current value is kept.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        try:
            teamCounts = self.centralCouchDBReader.agentsByTeam(
                filterDrain=True)
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")
            teamCounts = {}

        if teamCounts:
            self.agentsNumByTeam = teamCounts.get(self.teamName,
                                                  self.agentsNumByTeam)
            logging.debug(
                "Agents connected to the same team (not in DrainMode): %d",
                self.agentsNumByTeam)
        else:
            logging.warning(
                "agentInfo couch view is not available, use default value %s",
                self.agentsNumByTeam)
        return

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name.
        """
        ssbCpuSlots = self.ssb.getMetric(self.cpuBoundMetric)
        ssbIoSlots = self.ssb.getMetric(self.ioBoundMetric)

        ssbSiteSlots = self.thresholdsByVOName(ssbCpuSlots, ssbIoSlots)

        return ssbSiteSlots

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            infoSSB.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down',
                                 site)
                    self.updateSiteState(site, 'Down')

        # normally set all the others
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site,
                             infoRC[site]['state'], infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])
        return

    def checkSlotsChanges(self, infoRC, infoSSB):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then updates the task level too.
        """
        logging.debug(
            "Settings for site and task pending slots: %s%% and %s%%",
            self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            if self.tier0Mode and site.startswith('T1_'):
                # T1 cores utilization for Tier0
                infoSSB[site]['slotsCPU'] *= self.t1SitesCores / 100
                infoSSB[site]['slotsIO'] *= self.t1SitesCores / 100
            else:
                # round very small sites to the bare minimum
                infoSSB[site]['slotsCPU'] = max(infoSSB[site]['slotsCPU'],
                                                self.minCPUSlots)
                infoSSB[site]['slotsIO'] = max(infoSSB[site]['slotsIO'],
                                               self.minIOSlots)
            CPUBound = infoSSB[site]['slotsCPU']
            IOBound = infoSSB[site]['slotsIO']

            sitePending = max(
                int(CPUBound / self.agentsNumByTeam *
                    self.pendingSlotsSitePercent / 100), self.minCPUSlots)

            # update site slots, if needed
            if infoRC[site]['running_slots'] != CPUBound or infoRC[site][
                    'pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.info(
                    "Updating %s site thresholds for pend/runn: %d/%d", site,
                    sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(
                    site,
                    pendingJobSlots=sitePending,
                    runningJobSlots=CPUBound)

            # now handle the task level thresholds
            self.checkTaskSlotsChanges(site, CPUBound, IOBound)

    def thresholdsByVOName(self, infoCpu, infoIo):
        """
        _thresholdsByVOName_

        Creates a dictionary with CPU and IO slots keyed by the site name.
        If any of the thresholds is missing or has an invalid value, the whole
        site thresholds is skipped.

        :param infoCpu: iterable of dicts with 'VOName' and 'Value' keys
        :param infoIo: iterable of dicts with 'VOName' and 'Value' keys
        :return: {site: {'slotsCPU': int, 'slotsIO': int}} holding only the
            sites for which both values are available
        """
        ssbSiteSlots = {}
        for entry in infoCpu:
            if entry['Value'] is None:
                logging.warning(
                    'Site %s has invalid CPU thresholds in SSB. Taking no action',
                    entry['VOName'])
            else:
                ssbSiteSlots[entry['VOName']] = {
                    'slotsCPU': int(entry['Value'])
                }

        # then iterate over the IO slots
        for entry in infoIo:
            if entry['Value'] is None:
                logging.warning(
                    'Site %s has invalid IO thresholds in SSB. Taking no action',
                    entry['VOName'])
            else:
                # setdefault: a site without a valid CPU entry must not raise
                # KeyError here; it is dropped as incomplete below
                siteSlots = ssbSiteSlots.setdefault(entry['VOName'], {})
                siteSlots['slotsIO'] = int(entry['Value'])

        # Before proceeding, remove sites without both metrics. Iterate over
        # a snapshot of the keys because entries are removed inside the loop.
        for site in list(ssbSiteSlots):
            if len(ssbSiteSlots[site]) != 2:
                logging.warning("Site: %s has incomplete SSB metrics, see %s",
                                site, ssbSiteSlots[site])
                ssbSiteSlots.pop(site)

        return ssbSiteSlots

    def getSiteStatus(self):
        """
        _getSiteStatus_

        Fetch site state from SSB and map it to agent state
        """
        ssbState = self.ssb.getMetric(self.siteStatusMetric)

        ssbSiteState = {}
        for site in ssbState:
            voname = site['VOName']
            status = site['Status']
            if voname not in ssbSiteState:
                statusAgent = self.getState(str(status))
                if not statusAgent:
                    logging.error(
                        "Unknown status '%s' for site %s, please check SSB",
                        status, voname)
                else:
                    ssbSiteState[voname] = {'state': statusAgent}
            else:
                logging.warning(
                    'I have a duplicated status entry in SSB for %s', voname)

        return ssbSiteState

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state
        """
        ssb2agent = {
            'enabled': 'Normal',
            'drain': 'Draining',
            'disabled': 'Down',
            'test': 'Draining'
        }
        # 'test' state behaviour varies between production and tier0 agents
        ssb2agent['test'] = 'Normal' if self.tier0Mode else "Draining"

        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())
        return

    def checkTaskSlotsChanges(self, siteName, CPUBound, IOBound):
        """
        _checkTaskSlotsChanges_

        Update the CPU and IOBound slots for a given site.
        """
        siteTaskSlots = self.resourceControl.thresholdBySite(siteName)
        taskCPUPending = max(
            int(CPUBound / self.agentsNumByTeam *
                self.pendingSlotsTaskPercent / 100), self.minCPUSlots)
        taskIOPending = max(
            int(IOBound / self.agentsNumByTeam * self.pendingSlotsTaskPercent /
                100), self.minIOSlots)

        updateTasks = False
        if siteTaskSlots[0]['task_type'] in self.tasksCPU and siteTaskSlots[0][
                'task_pending_slots'] != taskCPUPending:
            updateTasks = True
        elif siteTaskSlots[0]['task_type'] in self.tasksIO and siteTaskSlots[
                0]['task_pending_slots'] != taskIOPending:
            updateTasks = True

        if updateTasks:
            logging.info(
                "Updating %s CPU tasks thresholds for pend/runn: %d/%d",
                siteName, taskCPUPending, CPUBound)
            self.resourceControl.insertThreshold(siteName,
                                                 taskType=self.tasksCPU,
                                                 maxSlots=CPUBound,
                                                 pendingSlots=taskCPUPending)
            logging.info(
                "Updating %s IO tasks thresholds for pend/runn: %d/%d",
                siteName, taskIOPending, IOBound)
            self.resourceControl.insertThreshold(siteName,
                                                 taskType=self.tasksIO,
                                                 maxSlots=IOBound,
                                                 pendingSlots=taskIOPending)

        if self.tier0Mode:
            # Set task thresholds for Tier0
            logging.debug("Updating %s Express and Repack task thresholds.",
                          siteName)
            expressSlots = int(CPUBound * self.runningExpressPercent / 100)
            pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent /
                                 100)
            self.resourceControl.insertThreshold(siteName, 'Express',
                                                 expressSlots, pendingExpress)

            repackSlots = int(CPUBound * self.runningRepackPercent / 100)
            pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent /
                                100)
            self.resourceControl.insertThreshold(siteName, 'Repack',
                                                 repackSlots, pendingRepack)
예제 #4
0
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """

    def __init__(self, config):
        """
        Build the worker from the agent configuration.

        Copies the AgentStatusWatcher settings onto the instance and creates
        the Dashboard, ResourceControl and WMStatsReader objects.

        :param config: agent configuration; the AgentStatusWatcher and Agent
            sections are read here. 'forceSiteDown', 'defaultAgentsNumByTeam'
            and 'enabled' are optional and default to [], 5 and True.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # task types per slot kind; the minimums are the floor applied to the
        # computed slot numbers
        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        watcherConfig = config.AgentStatusWatcher

        # dashboard url and the metric columns to query
        self.dashboard = watcherConfig.dashboard
        self.siteStatusMetric = watcherConfig.siteStatusMetric
        self.cpuBoundMetric = watcherConfig.cpuBoundMetric
        self.ioBoundMetric = watcherConfig.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # percentages applied to pending and running slots
        self.pendingSlotsSitePercent = watcherConfig.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcherConfig.pendingSlotsTaskPercent
        self.runningExpressPercent = watcherConfig.runningExpressPercent
        self.runningRepackPercent = watcherConfig.runningRepackPercent

        # sites forced to down (optional setting)
        self.forceSiteDown = getattr(watcherConfig, 'forceSiteDown', [])

        # team of this agent and the default number of agents per team,
        # used until getAgentsByTeam() refreshes it
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(watcherConfig, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = watcherConfig.onlySSB

        # tier0 mode is on when the configuration has a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = watcherConfig.t1SitesCores

        # switch this component on/off (optional setting)
        self.enabled = getattr(watcherConfig, 'enabled', True)

        # resource control and wmstats connections
        self.resourceControl = ResourceControl(config=self.config)
        self.centralCouchDBReader = WMStatsReader(watcherConfig.centralWMStatsURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        _algorithm_

        Run one update cycle. Does nothing when the component is disabled.
        Otherwise:
            1. read the site slots from the resource control database
            2. read the site status from SSB and apply state changes
            3. read the CPU/IO slot metrics from SSB; stop if none came back
            4. refresh the number of agents in this team
            5. apply site and task threshold changes
        Any exception raised along the way is logged and swallowed, so the
        cycle is simply retried on the next call.

        :param parameters: not used by this method
        """
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            rcSites = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", rcSites)

            # site states go first
            siteStates = self.getSiteStatus()
            self.checkStatusChanges(rcSites, siteStates)

            # then the slot thresholds
            ssbSlots = self.getInfoFromSSB()
            if ssbSlots:
                logging.debug("Info from SSB: %s", ssbSlots)

                # number of agents in the same team (not in DrainMode)
                self.getAgentsByTeam()

                # update whatever differs from the database
                self.checkSlotsChanges(rcSites, ssbSlots)
            else:
                logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
                return
        except Exception as err:
            logging.error("Error occurred, will retry later:")
            logging.error(str(err))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Refresh self.agentsNumByTeam.

        In drain mode the value is forced to 1. Otherwise the agents-by-team
        view is read from WMStats (agents in drain filtered out) and the
        entry for self.teamName is used; when the view is empty, unavailable
        or has no entry for this team, the current value is kept.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        try:
            teamCounts = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")
            teamCounts = {}

        if teamCounts:
            self.agentsNumByTeam = teamCounts.get(self.teamName, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d", self.agentsNumByTeam)
        else:
            logging.warning("agentInfo couch view is not available, use default value %s", self.agentsNumByTeam)
        return

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name.
        """
        ssbCpuSlots = self.ssb.getMetric(self.cpuBoundMetric)
        ssbIoSlots = self.ssb.getMetric(self.ioBoundMetric)

        ssbSiteSlots = self.thresholdsByVOName(ssbCpuSlots, ssbIoSlots)

        return ssbSiteSlots

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            infoSSB.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', site)
                    self.updateSiteState(site, 'Down')

        # normally set all the others
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site, infoRC[site]['state'], infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])
        return

    def checkSlotsChanges(self, infoRC, infoSSB):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then updates the task level too.
        """
        logging.debug("Settings for site and task pending slots: %s%% and %s%%", self.pendingSlotsSitePercent,
                      self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            if self.tier0Mode and site.startswith('T1_'):
                # T1 cores utilization for Tier0
                infoSSB[site]['slotsCPU'] *= self.t1SitesCores / 100
                infoSSB[site]['slotsIO'] *= self.t1SitesCores / 100
            else:
                # round very small sites to the bare minimum
                infoSSB[site]['slotsCPU'] = max(infoSSB[site]['slotsCPU'], self.minCPUSlots)
                infoSSB[site]['slotsIO'] = max(infoSSB[site]['slotsIO'], self.minIOSlots)
            CPUBound = infoSSB[site]['slotsCPU']
            IOBound = infoSSB[site]['slotsIO']

            sitePending = max(int(CPUBound / self.agentsNumByTeam * self.pendingSlotsSitePercent / 100),
                              self.minCPUSlots)

            # update site slots, if needed
            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.info("Updating %s site thresholds for pend/runn: %d/%d", site, sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(site, pendingJobSlots=sitePending,
                                                        runningJobSlots=CPUBound)

            # now handle the task level thresholds
            self.checkTaskSlotsChanges(site, CPUBound, IOBound)

    def thresholdsByVOName(self, infoCpu, infoIo):
        """
        _thresholdsByVOName_

        Creates a dictionary with CPU and IO slots keyed by the site name.
        If any of the thresholds is missing or has an invalid value, the whole
        site thresholds is skipped.

        :param infoCpu: iterable of dicts with 'VOName' and 'Value' keys
        :param infoIo: iterable of dicts with 'VOName' and 'Value' keys
        :return: {site: {'slotsCPU': int, 'slotsIO': int}} holding only the
            sites for which both values are available
        """
        ssbSiteSlots = {}
        for entry in infoCpu:
            if entry['Value'] is None:
                logging.warning('Site %s has invalid thresholds in SSB. Taking no action', entry['VOName'])
                continue
            ssbSiteSlots[entry['VOName']] = {'slotsCPU': int(entry['Value'])}

        # then iterate over the IO slots
        for entry in infoIo:
            if entry['VOName'] not in ssbSiteSlots:
                logging.warning('Site %s does not have CPU thresholds in SSB. Taking no action', entry['VOName'])
                continue
            if entry['Value'] is None:
                logging.warning('Site %s has invalid thresholds in SSB. Taking no action', entry['VOName'])
                ssbSiteSlots.pop(entry['VOName'], None)
                continue
            ssbSiteSlots[entry['VOName']]['slotsIO'] = int(entry['Value'])

        # A site that has CPU slots but never showed up in the IO metric is
        # still incomplete; drop it so callers can rely on both keys.
        incomplete = [site for site, slots in ssbSiteSlots.items() if 'slotsIO' not in slots]
        for site in incomplete:
            logging.warning('Site %s does not have IO thresholds in SSB. Taking no action', site)
            del ssbSiteSlots[site]

        return ssbSiteSlots

    def getSiteStatus(self):
        """
        _getSiteStatus_

        Fetch site state from SSB and map it to agent state
        """
        ssbState = self.ssb.getMetric(self.siteStatusMetric)

        ssbSiteState = {}
        for site in ssbState:
            voname = site['VOName']
            status = site['Status']
            if voname not in ssbSiteState:
                statusAgent = self.getState(str(status))
                if not statusAgent:
                    logging.error("Unknown status '%s' for site %s, please check SSB", status, voname)
                else:
                    ssbSiteState[voname] = {'state': statusAgent}
            else:
                logging.warning('I have a duplicated status entry in SSB for %s', voname)

        return ssbSiteState

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state
        """
        ssb2agent = {'enabled': 'Normal',
                     'drain': 'Draining',
                     'disabled': 'Down',
                     'test': 'Draining'}
        # 'test' state behaviour varies between production and tier0 agents
        ssb2agent['test'] = 'Normal' if self.tier0Mode else "Draining"

        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())
        return

    def checkTaskSlotsChanges(self, siteName, CPUBound, IOBound):
        """
        _checkTaskSlotsChanges_

        Refresh the task thresholds of a site when they are out of date.

        The pending slots expected for CPU and IO tasks are computed from
        CPUBound/IOBound, the number of agents in the team and the task
        pending percentage, and never go below minCPUSlots/minIOSlots.
        Only the first threshold returned by resource control is compared
        with the expected value; on a mismatch both the CPU and the IO task
        thresholds are written again. In tier0 mode the Express and Repack
        thresholds are written on every call.
        """
        siteTaskSlots = self.resourceControl.thresholdBySite(siteName)

        def pendingFor(runningSlots, minimumSlots):
            # portion of the running slots attributed to one agent of the team,
            # scaled by the pending percentage and bounded from below
            agentShare = runningSlots / self.agentsNumByTeam * self.pendingSlotsTaskPercent / 100
            return max(int(agentShare), minimumSlots)

        taskCPUPending = pendingFor(CPUBound, self.minCPUSlots)
        taskIOPending = pendingFor(IOBound, self.minIOSlots)

        # the first threshold acts as the indicator for the whole site
        firstSlot = siteTaskSlots[0]
        cpuOutdated = (firstSlot['task_type'] in self.tasksCPU and
                       firstSlot['task_pending_slots'] != taskCPUPending)
        updateTasks = cpuOutdated or (firstSlot['task_type'] in self.tasksIO and
                                      firstSlot['task_pending_slots'] != taskIOPending)

        if updateTasks:
            logging.info("Updating %s CPU tasks thresholds for pend/runn: %d/%d",
                         siteName, taskCPUPending, CPUBound)
            self.resourceControl.insertThreshold(siteName,
                                                 taskType=self.tasksCPU,
                                                 maxSlots=CPUBound,
                                                 pendingSlots=taskCPUPending)
            logging.info("Updating %s IO tasks thresholds for pend/runn: %d/%d",
                         siteName, taskIOPending, IOBound)
            self.resourceControl.insertThreshold(siteName,
                                                 taskType=self.tasksIO,
                                                 maxSlots=IOBound,
                                                 pendingSlots=taskIOPending)

        if not self.tier0Mode:
            return

        # Tier0 only: Express and Repack get a fraction of the CPU bound slots
        logging.debug("Updating %s Express and Repack task thresholds.", siteName)
        for taskName, percentName in (('Express', 'runningExpressPercent'),
                                      ('Repack', 'runningRepackPercent')):
            runningSlots = int(CPUBound * getattr(self, percentName) / 100)
            pendingSlots = int(runningSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName, taskName, runningSlots, pendingSlots)
예제 #5
0
 def __init__(self, config):
     '''
     Set up the underlying RESTModel, remove its POST methods and publish
     the GET methods together with the validators applied to each of them.
     '''
     RESTModel.__init__(self, config)

     del self.methods['POST']

     validator = Validator({'dbi':self.dbi})

     # both helpers receive this object as their 'logger'
     dashboard_settings = {
               'endpoint': self.config.services.dashboard,
               'cachepath': self.config.services.cachepath,
               'logger': self}
     self.dashboard = Dashboard(dict = dashboard_settings)
     sam_settings = {
               'endpoint': self.config.services.sam,
               'cachepath': self.config.services.cachepath,
               'cert': config.services.hostcert,
               'key': config.services.hostkey,
               'logger': self}
     self.samtests = SAM(dict = sam_settings)

     def exposed(args, call, version, validation=None):
         # Describe one GET method; the 'validation' key is only present
         # for methods that declare validators.
         description = {'args': args, 'call': call, 'version': version}
         if validation is not None:
             description['validation'] = validation
         return description

     get_api = {}
     get_api['list'] = exposed(['name', 'scheme'], self.list, 2,
                               [validator.validate_scheme,
                                validator.validate_name])
     get_api['status'] = exposed(['name'], self.status, 2,
                                 [validator.validate_scheme,
                                  validator.validate_name])
     get_api['software'] = exposed(['name'], self.software, 2,
                                   [validator.validate_scheme,
                                    validator.validate_name])
     get_api['resource_element'] = exposed(['name', 'type'],
                                           self.resource_element, 2,
                                           [validator.validate_scheme,
                                            validator.validate_name,
                                            validator.validate_resource_type])
     get_api['resource_pledge'] = exposed(['name', 'quarter'],
                                          self.resource_pledge, 2,
                                          [validator.validate_scheme,
                                           validator.validate_name,
                                           validator.validate_quarter])
     get_api['pledge_history'] = exposed(['name'], self.pledge_history, 1)
     get_api['contacts'] = exposed(['name', 'role'], self.contacts, 2,
                                   [validator.validate_scheme,
                                    validator.validate_name,
                                    validator.validate_role])
     get_api['groups'] = exposed(['name'], self.groups, 1)
     get_api['links'] = exposed(['name'], self.links, 1,
                                [validator.validate_scheme,
                                 validator.validate_name])
     get_api['associations'] = exposed(['parent', 'child', 'scheme'],
                                       self.associations, 1,
                                       [validator.validate_scheme,
                                        validator.validate_associations])
     get_api['names'] = exposed(['name', 'scheme', 'limit'], self.names, 1,
                                [validator.validate_scheme,
                                 validator.validate_limit_scheme,
                                 validator.validate_name])

     self.methods['GET'] = get_api
예제 #6
0
class Get(RESTModel):
    '''
    Get: Get data related to the sites known to SiteDB
     
    '''

    def __init__(self, config):
        '''
        Set up the underlying RESTModel, remove its POST methods and publish
        the GET methods together with the validators applied to each of them.
        '''
        RESTModel.__init__(self, config)

        del self.methods['POST']

        validator = Validator({'dbi':self.dbi})

        # both helpers receive this object as their 'logger'
        dashboard_settings = {
                  'endpoint': self.config.services.dashboard,
                  'cachepath': self.config.services.cachepath,
                  'logger': self}
        self.dashboard = Dashboard(dict = dashboard_settings)
        sam_settings = {
                  'endpoint': self.config.services.sam,
                  'cachepath': self.config.services.cachepath,
                  'cert': config.services.hostcert,
                  'key': config.services.hostkey,
                  'logger': self}
        self.samtests = SAM(dict = sam_settings)

        def exposed(args, call, version, validation=None):
            # Describe one GET method; the 'validation' key is only present
            # for methods that declare validators.
            description = {'args': args, 'call': call, 'version': version}
            if validation is not None:
                description['validation'] = validation
            return description

        get_api = {}
        get_api['list'] = exposed(['name', 'scheme'], self.list, 2,
                                  [validator.validate_scheme,
                                   validator.validate_name])
        get_api['status'] = exposed(['name'], self.status, 2,
                                    [validator.validate_scheme,
                                     validator.validate_name])
        get_api['software'] = exposed(['name'], self.software, 2,
                                      [validator.validate_scheme,
                                       validator.validate_name])
        get_api['resource_element'] = exposed(['name', 'type'],
                                              self.resource_element, 2,
                                              [validator.validate_scheme,
                                               validator.validate_name,
                                               validator.validate_resource_type])
        get_api['resource_pledge'] = exposed(['name', 'quarter'],
                                             self.resource_pledge, 2,
                                             [validator.validate_scheme,
                                              validator.validate_name,
                                              validator.validate_quarter])
        get_api['pledge_history'] = exposed(['name'], self.pledge_history, 1)
        get_api['contacts'] = exposed(['name', 'role'], self.contacts, 2,
                                      [validator.validate_scheme,
                                       validator.validate_name,
                                       validator.validate_role])
        get_api['groups'] = exposed(['name'], self.groups, 1)
        get_api['links'] = exposed(['name'], self.links, 1,
                                   [validator.validate_scheme,
                                    validator.validate_name])
        get_api['associations'] = exposed(['parent', 'child', 'scheme'],
                                          self.associations, 1,
                                          [validator.validate_scheme,
                                           validator.validate_associations])
        get_api['names'] = exposed(['name', 'scheme', 'limit'], self.names, 1,
                                   [validator.validate_scheme,
                                    validator.validate_limit_scheme,
                                    validator.validate_name])

        self.methods['GET'] = get_api
        
    def list(self, *args, **kwargs):
        """
        Return a list of sites matching name in the chosen format
        Args: name='T%', scheme='cms_name'
        """
        input = self.sanitise_input(args, kwargs, 'list')
        binds = []
        
        for n in self.makelist(input['name']): 
            binds.append({'name': n + '%'})
        sql = ""
        if input['scheme'] == 'resource':
            sql = """select * from siteinfo_v2 where id in (
                    select site from resource_element_v2 where fqdn like :name')"""
        elif input['scheme'] == 'lcg_name':
            # TODO: this needs a schema change and a refactor...
            sql = """select * from siteinfo_v2 where id in(select SITE_CMS_NAME_MAP.SITE_ID from SAM_NAME
  join SAM_CMS_NAME_MAP on SAM_CMS_NAME_MAP.SAM_ID = SAM_NAME.id
  join SITE_CMS_NAME_MAP on SITE_CMS_NAME_MAP.CMS_NAME_ID = SAM_CMS_NAME_MAP.CMS_NAME_ID
where SAM_NAME.NAME like :name)"""
        else:
            sql = "select * from siteinfo_v2 where %s like :name" % input['scheme']
            
        result = self.dbi.processData(sql, binds)
        data = self.formatDict(result)
        
        return {'binds': binds, 'sitelist':data}
         
    def status(self, *args, **kwargs):
        """
        Look up the status of the requested site through the dashboard helper
        Args: name
        """
        params = self.sanitise_input(args, kwargs, 'status')
        site_name = params['name']

        return self.dashboard.getStatus(name=site_name)
    
    def software(self, *args, **kwargs):
        """
        Return, for every CE of the requested site(s), the software installed
        as reported by the SAM tests, the releases pinned per architecture
        and the value of the manual install flag.

        Args: names

        TODO: add in pin status
        """
        params = self.sanitise_input(args, kwargs, 'software')
        celist = self.resource_element(name=params['name'], type='CE')
        pinsql = """select release, arch from pinned_releases 
        where ce_id = (select id from resource_element_v2 where fqdn = :ce)"""
        mansql = """select MANUALINSTALL from resource_element_v2 
        where fqdn = :ce and RESTYPE='CE'"""

        report = []
        for ce in celist['resource_element']:
            fqdn = ce['fqdn']

            # group the pinned releases by architecture
            pins_by_arch = {}
            for pin in self.formatDict(self.dbi.processData(pinsql, {'ce': fqdn})):
                pins_by_arch.setdefault(pin['arch'], []).append(pin['release'])

            manual_rows = self.formatDict(self.dbi.processData(mansql, {'ce': fqdn}))
            manual = bool(manual_rows[0]['manualinstall'])

            installed = self.samtests.getCMSSWInstalls(fqdn)
            report.append({fqdn: {'installed': installed,
                                  'pinned': pins_by_arch,
                                  'manual': manual}})
        return report
    
    def resource_element(self, *args, **kwargs):
        """
        Look up the resource elements of _type_ that belong to _site_
        Args: name, type

        Every requested name is matched as a prefix of the site's cms_name.
        The result holds the formatted rows under 'resource_element' and the
        bind variables used for the query under 'binds'.
        """
        params = self.sanitise_input(args, kwargs, 'resource_element')
        sql ="""select resource_element_v2.fqdn, 
                        resource_element_v2.restype,
                        siteinfo_v2.cms_name
                        from resource_element_v2
                        join siteinfo_v2 on siteinfo_v2.id = resource_element_v2.site
                    where siteinfo_v2.cms_name like :name 
                    and restype like :type"""
        # one set of bind variables per requested site name
        binds = [{'name': site + '%', 'type' : params['type']}
                 for site in self.makelist(params['name'])]
        rows = self.dbi.processData(sql, binds)

        return {'resource_element': self.formatDict(rows), 'binds': binds}
    
    def resource_pledge(self, *args, **kwargs):
        """
        Return the pledged resources available at _site_ during _quarter_
        Args: names, quarter
        """
        input = self.sanitise_input(args, kwargs, 'resource_pledge')
        sql = """select
    siteinfo_v2.cms_name, max(PLEDGEQUARTER) quarter_pledged,
    cpu, job_slots, disk_store, tape_store, wan_store, local_store, 
    national_bandwidth, opn_bandwidth
from resource_pledge
 join siteinfo_v2 on siteinfo_v2.id = RESOURCE_PLEDGE.site
where siteinfo_v2.cms_name like :site and PLEDGEQUARTER <= :quarter
and pledgedate in (
    select
        max(RESOURCE_PLEDGE.pledgedate)
        from RESOURCE_PLEDGE 
        join siteinfo_v2 on siteinfo_v2.id = RESOURCE_PLEDGE.site
    where siteinfo_v2.cms_name like :site and PLEDGEQUARTER <= :quarter

    group by cms_name
)
group by siteinfo_v2.cms_name, cpu, job_slots, disk_store, tape_store, wan_store, local_store, 
    national_bandwidth, opn_bandwidth

order by siteinfo_v2.cms_name, max(PLEDGEQUARTER) desc"""
        
        data = {}
        try:
            binds = []
            for n in self.makelist(input['name']): 
                    binds.append({'site': n + '%','quarter': input['quarter']})
            result = self.dbi.processData(sql, binds)
            data['resource_pledge'] = self.formatDict(result)
            
            def red_fun(x, y):
                d = {}
                d['job_slots'] = x.get('job_slots', 0) + y.get('job_slots', 0)
                d['local_store'] = x.get('local_store', 0) + y.get('local_store', 0)
                d['wan_store'] = x.get('wan_store', 0) + y.get('wan_store', 0)
                d['disk_store'] = x.get('disk_store', 0) + y.get('disk_store', 0)
                d['tape_store'] = x.get('tape_store', 0) + y.get('tape_store', 0)
                d['national_bandwidth'] = x.get('national_bandwidth', 0) + y.get('national_bandwidth', 0)
                d['opn_bandwidth'] = x.get('opn_bandwidth', 0) + y.get('opn_bandwidth', 0)
                d['cpu'] = x.get('cpu', 0) + y.get('cpu', 0)
                return d
            
            data['resource_totals'] = reduce(red_fun, data['resource_pledge'])
            data['binds'] = binds
        except Exception, e:
            self.exception("Could not get resource_pledge for input:" % input)
            data = {"exception": e, 
                    "message": "Could not get resource_pledge",
                    "execeptiontype": str(type(e)).split("'")[1],
                    'binds': binds}
        return data