예제 #1
0
    def testPossibleSites(self):
        """
        Workqueue element data location check (same as workRestrictions)

        One element whitelisting T1_IT_CNAF and T2_DE_DESY is mutated step by
        step (input block, then parent block, then pileup dataset), so every
        check below builds on the state left by the previous one.
        """
        inputBlock = "/MY/BLOCK/NAME#73e99a52"
        parentBlock = "/MY/BLOCK2/NAME#002590494c06"
        pileupDataset = "/MY/DATASET/NAME"

        # MonteCarlo-like element: nothing but a site whitelist
        ele = WorkQueueElement(SiteWhitelist=["T1_IT_CNAF", "T2_DE_DESY"])

        def expectSites(expected):
            # single place where the element under test is evaluated
            self.assertEqual(possibleSites(ele), expected)

        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])

        # --- input block only ---
        # block without any location
        ele['Inputs'] = {inputBlock: []}
        expectSites([])
        # block located only at sites outside the whitelist
        ele['Inputs'] = {inputBlock: ["T1_US_FNAL", "T2_CH_CERN"]}
        expectSites([])
        # block located at one whitelisted site
        ele['Inputs'] = {inputBlock: ["T1_US_FNAL", "T2_CH_CERN", "T2_DE_DESY"]}
        expectSites(["T2_DE_DESY"])

        # --- input block plus parent block ---
        # parent without any location
        ele['Inputs'] = {inputBlock: ["T1_US_FNAL", "T2_CH_CERN", "T2_DE_DESY"]}
        ele['ParentFlag'] = True
        ele['ParentData'] = {parentBlock: []}
        expectSites([])
        # parent located where the input block is not
        ele['ParentData'] = {parentBlock: ["T1_IT_CNAF"]}
        expectSites([])
        # parent sharing a whitelisted site with the input block
        ele['ParentData'] = {parentBlock: ["T1_US_FNAL", "T2_DE_DESY"]}
        expectSites(["T2_DE_DESY"])

        # --- input block, parent block and pileup dataset ---
        # pileup without any location
        ele['Inputs'] = {inputBlock: ["T1_US_FNAL", "T2_CH_CERN", "T2_DE_DESY"]}
        ele['ParentData'] = {parentBlock: ["T2_DE_DESY"]}
        ele['PileupData'] = {pileupDataset: []}
        expectSites([])
        # pileup located where input/parent are not
        ele['PileupData'] = {pileupDataset: ["T1_IT_CNAF", "T2_CH_CERN"]}
        expectSites([])
        # pileup sharing the common whitelisted site
        ele['PileupData'] = {pileupDataset: ["T1_IT_CNAF", "T2_DE_DESY"]}
        expectSites(["T2_DE_DESY"])
예제 #2
0
    def testPossibleSitesLocationFlags(self):
        """
        Workqueue element data location check, using the input and PU data location flags

        The same element is reused throughout; the NoInputUpdate and
        NoPileupUpdate flags are toggled between checks, so each assertion
        depends on the state left by the previous steps.
        """
        inputBlock = "/MY/BLOCK/NAME#73e99a52"
        parentBlock = "/MY/BLOCK2/NAME#002590494c06"
        pileupDataset = "/MY/DATASET/NAME"

        ele = WorkQueueElement(SiteWhitelist=["T1_IT_CNAF", "T2_DE_DESY"])

        def expectSites(expected):
            # single place where the element under test is evaluated
            self.assertEqual(possibleSites(ele), expected)

        # --- input block only ---
        # no location, input flag on
        ele['Inputs'] = {inputBlock: []}
        ele['NoInputUpdate'] = True
        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])
        # one matching location, input flag on
        ele['Inputs'] = {inputBlock: ["T1_IT_CNAF", "T2_CH_CERN"]}
        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])
        # one matching location, only the pileup flag on
        ele['NoInputUpdate'] = False
        ele['NoPileupUpdate'] = True
        expectSites(["T1_IT_CNAF"])
        # one matching location, both flags on
        ele['NoInputUpdate'] = True
        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])

        # --- input block plus parent block without location ---
        # both flags on
        ele['ParentFlag'] = True
        ele['ParentData'] = {parentBlock: []}
        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])
        # only the input flag on
        ele['NoPileupUpdate'] = False
        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])
        # only the pileup flag on
        ele['NoInputUpdate'] = False
        ele['NoPileupUpdate'] = True
        expectSites([])

        # --- input block, parent block and pileup dataset without location ---
        # only the pileup flag on
        ele['Inputs'] = {inputBlock: ["T1_US_FNAL", "T2_CH_CERN", "T2_DE_DESY"]}
        ele['ParentData'] = {parentBlock: ["T2_DE_DESY"]}
        ele['PileupData'] = {pileupDataset: []}
        expectSites(["T2_DE_DESY"])
        # both flags on
        ele['NoInputUpdate'] = True
        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])
        # only the input flag on
        ele['NoPileupUpdate'] = False
        expectSites([])
예제 #3
0
def getGlobalSiteStatusSummary(elements, status=None, dataLocality=False):
    """
    _getGlobalSiteStatusSummary_

    Given a dict with workqueue elements keyed by status, such as this format:
    {u'Canceled': [{u'Inputs': {}, u'Jobs': 18,...}, {u'Jobs': 11,...}],
     u'Running': [{'Priority': 190000,..}, ...]}

    Creates a summary of jobs and number of wq elements in each status
    distributed among the candidate sites. There are 2 job distributions:
     *) unique top level jobs per site and per status (the jobs of an
        element are split equally among all of its sites)
     *) possible top level jobs per site and per status (every site is
        credited with all the jobs of the element)

    :param elements: dict keyed by status; each value is a list of documents
        holding the element under the
        'WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement' key
    :param status: a single status name, or a list/tuple of them. Any other
        value (including None or an empty one) summarises every status
        present in `elements`
    :param dataLocality: when True the candidate sites come from
        possibleSites(); otherwise they are SiteWhitelist minus SiteBlacklist
    :return: tuple (uniqueJobsSummary, possibleJobsSummary), both shaped as
        {status: {site: {'Jobs': ..., 'NumElems': ...}}}. Elements without
        any candidate site are accounted under the 'NoPossibleSite' key
    """
    try:
        stringTypes = basestring  # Python 2: matches both str and unicode
    except NameError:
        stringTypes = (str, bytes)  # Python 3: basestring no longer exists

    if status and isinstance(status, stringTypes):
        activeStatus = [status]
    elif status and isinstance(status, (list, tuple)):
        activeStatus = status
    else:
        activeStatus = list(elements)

    wqeKey = 'WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement'
    uniqueJobsSummary = {}
    possibleJobsSummary = {}

    for st in activeStatus:
        uniqueBySite = uniqueJobsSummary.setdefault(st, {})
        possibleBySite = possibleJobsSummary.setdefault(st, {})
        for elem in elements.get(st, []):
            elem = elem[wqeKey]
            if dataLocality:
                commonSites = possibleSites(elem)
            else:
                commonSites = list(set(elem['SiteWhitelist']) - set(elem['SiteBlacklist']))

            if commonSites:
                # plain `/`: floors on Python 2 when 'Jobs' is an int (unless the
                # module enables true division), yields a float on Python 3
                jobsPerSite = elem['Jobs'] / len(commonSites)
            else:
                # nowhere to run: book the whole element under a placeholder site
                commonSites = ['NoPossibleSite']
                jobsPerSite = elem['Jobs']

            for site in commonSites:
                uniqueEntry = uniqueBySite.setdefault(site, {'Jobs': 0, 'NumElems': 0})
                possibleEntry = possibleBySite.setdefault(site, {'Jobs': 0, 'NumElems': 0})

                uniqueEntry['Jobs'] += jobsPerSite
                uniqueEntry['NumElems'] += 1
                possibleEntry['Jobs'] += elem['Jobs']
                possibleEntry['NumElems'] += 1

    return uniqueJobsSummary, possibleJobsSummary
    def testPossibleSitesLocationFlags(self):
        """
        Workqueue element data location check, using the input and PU data location flags

        The same element is reused throughout; the NoInputUpdate and
        NoPileupUpdate flags are toggled between checks, so each assertion
        depends on the state left by the previous steps.
        """
        inputBlock = "/MY/BLOCK/NAME#73e99a52"
        parentBlock = "/MY/BLOCK2/NAME#002590494c06"
        pileupDataset = "/MY/DATASET/NAME"

        ele = WorkQueueElement(SiteWhitelist=["T1_IT_CNAF", "T2_DE_DESY"])

        def expectSites(expected):
            # single place where the element under test is evaluated
            self.assertEqual(possibleSites(ele), expected)

        # --- input block only ---
        # no location, input flag on
        ele['Inputs'] = {inputBlock: []}
        ele['NoInputUpdate'] = True
        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])
        # one matching location, input flag on
        ele['Inputs'] = {inputBlock: ["T1_IT_CNAF", "T2_CH_CERN"]}
        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])
        # one matching location, only the pileup flag on
        ele['NoInputUpdate'] = False
        ele['NoPileupUpdate'] = True
        expectSites(["T1_IT_CNAF"])
        # one matching location, both flags on
        ele['NoInputUpdate'] = True
        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])

        # --- input block plus parent block without location ---
        # both flags on
        ele['ParentFlag'] = True
        ele['ParentData'] = {parentBlock: []}
        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])
        # only the input flag on
        ele['NoPileupUpdate'] = False
        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])
        # only the pileup flag on
        ele['NoInputUpdate'] = False
        ele['NoPileupUpdate'] = True
        expectSites([])

        # --- input block, parent block and pileup dataset without location ---
        # only the pileup flag on
        ele['Inputs'] = {inputBlock: ["T1_US_FNAL", "T2_CH_CERN", "T2_DE_DESY"]}
        ele['ParentData'] = {parentBlock: ["T2_DE_DESY"]}
        ele['PileupData'] = {pileupDataset: []}
        expectSites(["T2_DE_DESY"])
        # both flags on
        ele['NoInputUpdate'] = True
        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])
        # only the input flag on
        ele['NoPileupUpdate'] = False
        expectSites([])
    def getPossibleSitesByRequest(self, requestName):
        """
        Collect every site on which any element of the given request could run.

        :param requestName: key looked up in self.wqResultsByRequest
        :return: None when the request is unknown, otherwise a set with the
            union of possibleSites() over all elements of the request
        """
        if requestName not in self.wqResultsByRequest:
            return None

        # The union covers all the possible sites of the request.
        # TODO: handle the case where different blocks are located at different sites
        allSites = set()
        for wqe in self.wqResultsByRequest[requestName]['Elements']:
            allSites.update(possibleSites(wqe))
        return allSites
    def getPossibleSitesByRequest(self, requestName):
        """
        Return the union of possible sites over all elements of a request.

        :param requestName: key looked up in self.wqResultsByRequest
        :return: None when the request is unknown, otherwise a set of sites
            (empty when the request has no elements)
        """
        knownRequests = self.wqResultsByRequest
        if requestName not in knownRequests:
            return None

        # This includes all the possible sites of the request.
        # TODO: handle the case where different blocks are located at different sites
        perElementSites = [possibleSites(wqe) for wqe in knownRequests[requestName]['Elements']]
        return set().union(*perElementSites)
    def testPossibleSites(self):
        """
        Workqueue element data location check (same as workRestrictions)

        One element whitelisting T1_IT_CNAF and T2_DE_DESY is mutated step by
        step (input block, then parent block, then pileup dataset), so every
        check below builds on the state left by the previous one.
        """
        inputBlock = "/MY/BLOCK/NAME#73e99a52"
        parentBlock = "/MY/BLOCK2/NAME#002590494c06"
        pileupDataset = "/MY/DATASET/NAME"

        # MonteCarlo-like element: nothing but a site whitelist
        ele = WorkQueueElement(SiteWhitelist=["T1_IT_CNAF", "T2_DE_DESY"])

        def expectSites(expected):
            # single place where the element under test is evaluated
            self.assertEqual(possibleSites(ele), expected)

        expectSites(["T1_IT_CNAF", "T2_DE_DESY"])

        # --- input block only ---
        # block without any location
        ele['Inputs'] = {inputBlock: []}
        expectSites([])
        # block located only at sites outside the whitelist
        ele['Inputs'] = {inputBlock: ["T1_US_FNAL", "T2_CH_CERN"]}
        expectSites([])
        # block located at one whitelisted site
        ele['Inputs'] = {inputBlock: ["T1_US_FNAL", "T2_CH_CERN", "T2_DE_DESY"]}
        expectSites(["T2_DE_DESY"])

        # --- input block plus parent block ---
        # parent without any location
        ele['Inputs'] = {inputBlock: ["T1_US_FNAL", "T2_CH_CERN", "T2_DE_DESY"]}
        ele['ParentFlag'] = True
        ele['ParentData'] = {parentBlock: []}
        expectSites([])
        # parent located where the input block is not
        ele['ParentData'] = {parentBlock: ["T1_IT_CNAF"]}
        expectSites([])
        # parent sharing a whitelisted site with the input block
        ele['ParentData'] = {parentBlock: ["T1_US_FNAL", "T2_DE_DESY"]}
        expectSites(["T2_DE_DESY"])

        # --- input block, parent block and pileup dataset ---
        # pileup without any location
        ele['Inputs'] = {inputBlock: ["T1_US_FNAL", "T2_CH_CERN", "T2_DE_DESY"]}
        ele['ParentData'] = {parentBlock: ["T2_DE_DESY"]}
        ele['PileupData'] = {pileupDataset: []}
        expectSites([])
        # pileup located where input/parent are not
        ele['PileupData'] = {pileupDataset: ["T1_IT_CNAF", "T2_CH_CERN"]}
        expectSites([])
        # pileup sharing the common whitelisted site
        ele['PileupData'] = {pileupDataset: ["T1_IT_CNAF", "T2_DE_DESY"]}
        expectSites(["T2_DE_DESY"])
예제 #8
0
def printElementsSummary(reqName, elements, queueUrl):
    """
    Print, for each element in 'Available' status, its site intersection
    and its common data location, followed by an assignment hint when the
    site intersection is empty.

    :param reqName: request name, used only in the header line
    :param elements: iterable of workqueue elements; each one is indexed by
        'Status' and exposes an `id` attribute
    :param queueUrl: queue URL, used only in the header line
    """
    print("Summary for %s and request %s" % (queueUrl, reqName))
    for wqe in elements:
        if wqe['Status'] != "Available":
            # only elements still waiting for resources are reported
            continue

        siteIntersection = possibleSites(wqe)
        dataLocation = commonDataLocation(wqe)
        summaryLine = "  Element '%s' has the following site intersection: %s, with common data location: %s" \
                      % (wqe.id, siteIntersection, dataLocation)
        print(summaryLine)
        printDataLocation(wqe)

        if siteIntersection:
            # the element can already run somewhere, no hint needed
            continue
        if dataLocation:
            print("    this workflow has to be assigned to: %s" % dataLocation)
        else:
            print("    this workflow has to be assigned with AAA flags enabled according to input/PU location")
            print("    this workflow has to be assigned with AAA flags enabled according to input/PU location")
    def elementsWithHigherPriorityInSameSites(self,
                                              requestName,
                                              returnFormat="dict"):
        """
        Find the elements of other requests that compete with `requestName`.

        An element competes when its request has a priority greater than or
        equal to the one of `requestName`, it was not created after the
        first element of `requestName`, and (when `requestName` has any
        possible site) it shares at least one possible site with it.

        :param requestName: key looked up in self.wqResultsByRequest
        :param returnFormat: "list" returns the elements ordered by
            descending priority and then ascending creation time; "dict"
            groups them by 'RequestName' into WorkQueueElementResult objects
        :return: None for an unknown request or an unrecognised
            returnFormat, otherwise the list or the dict described above
        """
        if requestName not in self.wqResultsByRequest:
            return None

        target = self.wqResultsByRequest[requestName]
        minPriority = target['Priority']
        cutoffTime = target['Elements'][0]['CreationTime']
        targetSites = self.getPossibleSitesByRequest(requestName)

        competitors = []
        for otherName in self.wqResultsByRequest:
            # the request never competes with itself
            if otherName == requestName:
                continue
            otherRequest = self.wqResultsByRequest[otherName]
            if not (otherRequest['Priority'] >= minPriority):
                continue
            for candidate in otherRequest['Elements']:
                if candidate['CreationTime'] > cutoffTime:
                    continue
                # without site information every older element counts
                if not targetSites or targetSites.intersection(possibleSites(candidate)):
                    competitors.append(candidate)

        # two stable sorts: creation time first, then priority, which
        # leaves the elements ordered by priority with ties by timestamp
        byAge = sorted(competitors, key=lambda wqe: wqe['CreationTime'])
        ranked = sorted(byAge, key=lambda wqe: wqe['Priority'], reverse=True)

        if returnFormat == "list":
            return ranked
        if returnFormat == "dict":
            groupedByRequest = defaultdict(list)
            for wqe in ranked:
                groupedByRequest[wqe['RequestName']].append(wqe)
            for reqKey, members in list(groupedByRequest.items()):
                groupedByRequest[reqKey] = WorkQueueElementResult(Elements=members)
            return groupedByRequest
        return None
    def elementsWithHigherPriorityInSameSites(self, requestName, returnFormat="dict"):
        """
        Find the elements of other requests that compete with `requestName`.

        An element competes when its request has a priority greater than or
        equal to the one of `requestName`, it was not created after the
        first element of `requestName`, and (when `requestName` has any
        possible site) it shares at least one possible site with it.

        :param requestName: key looked up in self.wqResultsByRequest
        :param returnFormat: "list" returns the elements ordered by
            descending priority and then ascending creation time; "dict"
            groups them by 'RequestName' into WorkQueueElementResult objects
        :return: None for an unknown request or an unrecognised
            returnFormat, otherwise the list or the dict described above
        """
        if requestName not in self.wqResultsByRequest:
            return None

        target = self.wqResultsByRequest[requestName]
        minPriority = target['Priority']
        cutoffTime = target['Elements'][0]['CreationTime']
        targetSites = self.getPossibleSitesByRequest(requestName)

        competitors = []
        for otherName in self.wqResultsByRequest:
            # the request never competes with itself
            if otherName == requestName:
                continue
            otherRequest = self.wqResultsByRequest[otherName]
            if not (otherRequest['Priority'] >= minPriority):
                continue
            for candidate in otherRequest['Elements']:
                if candidate['CreationTime'] > cutoffTime:
                    continue
                # without site information every older element counts
                if not targetSites or targetSites.intersection(possibleSites(candidate)):
                    competitors.append(candidate)

        # two stable sorts: creation time first, then priority, which
        # leaves the elements ordered by priority with ties by timestamp
        byAge = sorted(competitors, key=lambda wqe: wqe['CreationTime'])
        ranked = sorted(byAge, key=lambda wqe: wqe['Priority'], reverse=True)

        if returnFormat == "list":
            return ranked
        if returnFormat == "dict":
            groupedByRequest = defaultdict(list)
            for wqe in ranked:
                groupedByRequest[wqe['RequestName']].append(wqe)
            for reqKey, members in list(groupedByRequest.items()):
                groupedByRequest[reqKey] = WorkQueueElementResult(Elements=members)
            return groupedByRequest
        return None
예제 #11
0
    def availableWork(self,
                      thresholds,
                      siteJobCounts,
                      team=None,
                      wfs=None,
                      excludeWorkflows=None,
                      numElems=9999999):
        """
        Get work which is available to be run

        It will pull work until it reaches the number of elements configured (numElems).
        Since it's also used for calculating free resources, default it to "infinity"

        Note: this method will be called with no limit of work elements when it's simply
        calculating the resources available (based on what is in LQ), before it gets work
        from GQ

        :param thresholds: dictionary; keys are the site name, values are the
            maximum number of running jobs at that site
        :param siteJobCounts: dictionary-of-dictionaries; keys are the site name
            and task priorities, the value is the number of jobs running at that
            priority. It is updated in place for every element selected
        :param team: optional team, forwarded to the couch list function
        :param wfs: optional list of workflow names; they are queried in
            chunks of 20
        :param excludeWorkflows: optional list of request names to skip
        :param numElems: maximum number of elements to return
        :return: tuple (elements, thresholds, siteJobCounts)
        """
        self.logger.info("Getting up to %d available work from %s", numElems,
                         self.queueUrl)

        excludeWorkflows = excludeWorkflows or []
        elements = []

        # We used to pre-filter sites, looking to see if there are idle job slots
        # We don't do this anymore, as we may over-allocate
        # jobs to sites if the new jobs have a higher priority.

        # If there are no sites, punt early.
        if not thresholds:
            self.logger.error("No thresholds is set: Please check")
            return elements, thresholds, siteJobCounts

        options = {'include_docs': True,
                   'descending': True,
                   'resources': thresholds}
        if team:
            options['team'] = team
            self.logger.info("setting team to %s", team)
        if wfs:
            result = []
            # query in slices of 20 workflows per request
            for start in range(0, len(wfs), 20):
                options['wfs'] = wfs[start:start + 20]
                data = self.db.loadList('WorkQueue', 'workRestrictions',
                                        'availableByPriority', options)
                result.extend(json.loads(data))
        else:
            result = self.db.loadList('WorkQueue', 'workRestrictions',
                                      'availableByPriority', options)
            result = json.loads(result)
            if not result:
                self.logger.info(
                    """No available work in WQ or didn't pass workqueue restriction
                                    - check Pileup, site white list, etc""")
            # lazy %-args: the potentially large result is only rendered
            # when debug logging is enabled
            self.logger.debug("Available Work:\n %s \n for resources\n %s",
                              result, thresholds)

        # Iterate through the results; apply whitelist / blacklist / data
        # locality restrictions.  Only assign jobs if they are high enough
        # priority.
        sortedElements = []
        for doc in result:
            element = CouchWorkQueueElement.fromDocument(self.db, doc)
            # filter out the excluded workflows from the available work
            if element['RequestName'] not in excludeWorkflows:
                sortedElements.append(element)

        # two stable sorts: elements end up in priority first and timestamp order
        sortedElements.sort(key=lambda element: element['CreationTime'])
        sortedElements.sort(key=lambda x: x['Priority'], reverse=True)

        for element in sortedElements:
            if numElems <= 0:
                self.logger.info(
                    "Reached the maximum number of elements to be pulled: %d",
                    len(elements))
                break

            if not possibleSites(element):
                self.logger.info("No possible sites for %s with doc id %s",
                                 element['RequestName'], element.id)
                continue

            prio = element['Priority']
            possibleSite = None
            # materialise the keys: a dict view cannot be shuffled on Python 3
            sites = list(thresholds)
            random.shuffle(sites)
            for site in sites:
                if element.passesSiteRestriction(site):
                    # Count the number of jobs currently running of greater
                    # or equal priority
                    curJobCount = sum(
                        count
                        for jobPrio, count in siteJobCounts.get(site, {}).items()
                        if jobPrio >= prio)
                    self.logger.debug("Job Count: %s, site: %s thresholds: %s",
                                      curJobCount, site, thresholds[site])
                    if curJobCount < thresholds[site]:
                        possibleSite = site
                        break

            if possibleSite:
                numElems -= 1
                self.logger.debug("Possible site exists %s", str(possibleSite))
                elements.append(element)
                # book the jobs of this element against the chosen site
                prioCounts = siteJobCounts.setdefault(possibleSite, {})
                prioCounts[prio] = prioCounts.get(prio, 0) + \
                                   element['Jobs'] * element.get('blowupFactor', 1.0)
            else:
                self.logger.debug(
                    "No available resources for %s with doc id %s",
                    element['RequestName'], element.id)

        return elements, thresholds, siteJobCounts
예제 #12
0
    def availableWork(self, thresholds, siteJobCounts, teams=None, wfs=None,
                      excludeWorkflows=None, numElems=9999999):
        """
        Get work which is available to be run

        It will pull work until it reaches the number of elements configured (numElems).
        Since it's also used for calculating free resources, default it to "infinity"

        Note: this method will be called with no limit of work elements when it's simply
        calculating the resources available (based on what is in LQ), before it gets work
        from GQ

        :param thresholds: dictionary; keys are the site name, values are the
            maximum number of running jobs at that site
        :param siteJobCounts: dictionary-of-dictionaries; keys are the site name
            and task priorities, the value is the number of jobs running at that
            priority. It is updated in place for every element selected
        :param teams: optional teams, forwarded to the couch list function
        :param wfs: optional list of workflow names; they are queried in
            chunks of 20
        :param excludeWorkflows: optional list of request names to skip
        :param numElems: maximum number of elements to return
        :return: tuple (elements, thresholds, siteJobCounts)
        """
        self.logger.info("Getting up to %d available work from %s", numElems, self.queueUrl)

        excludeWorkflows = excludeWorkflows or []
        elements = []

        # We used to pre-filter sites, looking to see if there are idle job slots
        # We don't do this anymore, as we may over-allocate
        # jobs to sites if the new jobs have a higher priority.

        # If there are no sites, punt early.
        if not thresholds:
            self.logger.error("No thresholds is set: Please check")
            return elements, thresholds, siteJobCounts

        options = {'include_docs': True, 'descending': True, 'resources': thresholds}
        if teams:
            options['teams'] = teams
            # lazy %-args: eager "%s" % teams raises TypeError when teams is
            # a tuple whose length is not 1
            self.logger.info("setting teams %s", teams)
        if wfs:
            result = []
            # query in slices of 20 workflows per request
            for start in range(0, len(wfs), 20):
                options['wfs'] = wfs[start:start + 20]
                data = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
                result.extend(json.loads(data))
        else:
            result = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
            result = json.loads(result)
            if not result:
                self.logger.info("""No available work in WQ or didn't pass workqueue restriction
                                    - check Pileup, site white list, etc""")
            # the potentially large result is only rendered when debug is enabled
            self.logger.debug("Available Work:\n %s \n for resources\n %s", result, thresholds)

        # Iterate through the results; apply whitelist / blacklist / data
        # locality restrictions.  Only assign jobs if they are high enough
        # priority.
        sortedElements = []
        for doc in result:
            element = CouchWorkQueueElement.fromDocument(self.db, doc)
            # filter out the excluded workflows from the available work
            if element['RequestName'] not in excludeWorkflows:
                sortedElements.append(element)

        # two stable sorts: elements end up in priority first and timestamp order
        sortedElements.sort(key=lambda element: element['CreationTime'])
        sortedElements.sort(key=lambda x: x['Priority'], reverse=True)

        for element in sortedElements:
            if numElems <= 0:
                self.logger.info("Reached the maximum number of elements to be pulled: %d", len(elements))
                break

            if not possibleSites(element):
                self.logger.info("No possible sites for %s with doc id %s", element['RequestName'], element.id)
                continue

            prio = element['Priority']
            possibleSite = None
            # materialise the keys: a dict view cannot be shuffled on Python 3
            sites = list(thresholds)
            random.shuffle(sites)
            for site in sites:
                if element.passesSiteRestriction(site):
                    # Count the number of jobs currently running of greater or equal priority
                    curJobCount = sum(count for jobPrio, count in siteJobCounts.get(site, {}).items()
                                      if jobPrio >= prio)
                    self.logger.debug("Job Count: %s, site: %s thresholds: %s", curJobCount, site, thresholds[site])
                    if curJobCount < thresholds[site]:
                        possibleSite = site
                        break

            if possibleSite:
                numElems -= 1
                self.logger.debug("Possible site exists %s", str(possibleSite))
                elements.append(element)
                # book the jobs of this element against the chosen site
                prioCounts = siteJobCounts.setdefault(possibleSite, {})
                prioCounts[prio] = prioCounts.get(prio, 0) + \
                                   element['Jobs'] * element.get('blowupFactor', 1.0)
            else:
                self.logger.info("No available resources for %s with doc id %s", element['RequestName'], element.id)

        return elements, thresholds, siteJobCounts
def getGlobalSiteStatusSummary(elements, status=None, dataLocality=False):
    """
    _getGlobalSiteStatusSummary_

    Build a per-status, per-site summary of jobs and workqueue elements.

    `elements` is a dictionary keyed by workqueue status; each value is a list
    of documents from which the workqueue element is read under the
    'WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement' key.

    Two summaries are produced, both shaped as
    {status: {site: {'sum_jobs': <int>, 'num_elem': <int>}}}:
     *) unique jobs: the element jobs are split equally among its sites
        (rounded up per site)
     *) possible jobs: every site is credited with all the element jobs

    Elements without any usable site are accounted under 'NoPossibleSite'.

    :param elements: dictionary of workqueue elements keyed by status
    :param status: a single status (string) or a list/tuple of statuses to
        summarise; when not provided, every status found in `elements` is used
    :param dataLocality: when True, the sites of an element are those returned
        by `possibleSites`; otherwise SiteWhitelist minus SiteBlacklist
    :return: a tuple of (uniqueJobsSummary, possibleJobsSummary)
    """
    wqeKey = 'WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement'

    # work out which statuses are going to be summarised
    if not status:
        statusList = list(elements)
    elif isinstance(status, (str, bytes)):
        statusList = [status]
    elif isinstance(status, (list, tuple)):
        statusList = status
    else:
        statusList = list(elements)

    uniqueJobsSummary = {}
    possibleJobsSummary = {}

    for state in statusList:
        perSiteUnique = {}
        perSitePossible = {}
        for wrapped in elements.get(state, []):
            wqe = wrapped[wqeKey]
            if dataLocality:
                sites = possibleSites(wqe)
            else:
                sites = list(set(wqe['SiteWhitelist']) - set(wqe['SiteBlacklist']))

            totalJobs = wqe['Jobs']
            numSites = len(sites)
            if numSites:
                share = totalJobs / numSites
            else:
                # nowhere to run: book the whole element under a placeholder site
                sites = ['NoPossibleSite']
                share = totalJobs

            for siteName in sites:
                uniqueEntry = perSiteUnique.setdefault(siteName, {'sum_jobs': 0, 'num_elem': 0})
                possibleEntry = perSitePossible.setdefault(siteName, {'sum_jobs': 0, 'num_elem': 0})

                uniqueEntry['sum_jobs'] += ceil(share)
                uniqueEntry['num_elem'] += 1
                possibleEntry['sum_jobs'] += ceil(totalJobs)
                possibleEntry['num_elem'] += 1

        uniqueJobsSummary.setdefault(state, {}).update(perSiteUnique)
        possibleJobsSummary.setdefault(state, {}).update(perSitePossible)

    return uniqueJobsSummary, possibleJobsSummary
예제 #14
0
    def availableWork(self,
                      thresholds,
                      siteJobCounts,
                      team=None,
                      excludeWorkflows=None,
                      numElems=9999999):
        """
        Get work - either from local or global queue - which is available to be run.

        Elements are retrieved in slices from the `availableByPriority` view (through
        the `workRestrictions` list function). Each element is accepted if at least
        one of its possible sites is present in `thresholds` and the jobs already
        counted for that site, at the same or higher priority, are below the site
        threshold. `siteJobCounts` is updated in place for every accepted element.

        :param thresholds: a dictionary key'ed by the site name, values representing the
            maximum number of jobs allowed at that site.
        :param siteJobCounts: a dictionary-of-dictionaries key'ed by the site name; value
            is a dictionary with the number of jobs running at a given priority.
        :param team: a string with the team name we want to pull work for
        :param excludeWorkflows: list of (aborted) workflows that should not be accepted
        :param numElems: integer with the maximum number of elements to be accepted (default
            to a very large number when pulling work from local queue, read unlimited)
        :return: a tuple with the elements accepted and an overview of job counts per site
        """
        excludeWorkflows = excludeWorkflows or []
        elements = []
        # If there are no sites, punt early.
        if not thresholds:
            self.logger.error("No thresholds is set: Please check")
            return elements, siteJobCounts

        self.logger.info("Current siteJobCounts:")
        for site, jobsByPrio in viewitems(siteJobCounts):
            self.logger.info("    %s : %s", site, jobsByPrio)

        self.logger.info("Getting up to %d available work from %s", numElems,
                         self.queueUrl)
        self.logger.info("  for team name: %s", team)
        self.logger.info("  with excludeWorkflows: %s", excludeWorkflows)
        self.logger.info("  for thresholds: %s", thresholds)

        # FIXME: magic numbers
        docsSliceSize = 1000
        options = {}
        options['include_docs'] = True
        options['descending'] = True
        options['resources'] = thresholds
        options['limit'] = docsSliceSize
        # FIXME: num_elem option can likely be deprecated, but it needs synchronization
        # between agents and global workqueue... for now, make sure it can return the slice size
        options['num_elem'] = docsSliceSize
        if team:
            options['team'] = team

        # Fetch workqueue elements in slices, using the CouchDB "limit" and "skip"
        # options for couch views. Conditions to stop this loop are:
        #  a) an empty slice is returned by Couch (thus all docs have already been
        #     retrieved)
        #  b) or, "numElems" elements have been accepted
        # NOTE: no hard upper bound on the number of documents scanned is enforced here
        numSkip = 0
        limitReached = False
        while not limitReached:
            self.logger.info("  with limit docs: %s, and skip first %s docs",
                             docsSliceSize, numSkip)
            options['skip'] = numSkip

            result = self.db.loadList('WorkQueue', 'workRestrictions',
                                      'availableByPriority', options)
            result = json.loads(result)
            if not result:
                self.logger.info(
                    "All the workqueue elements have been exhausted for: %s ",
                    self.queueUrl)
                break
            self.logger.info(
                "Retrieved %d elements from workRestrictions list for: %s",
                len(result), self.queueUrl)
            # update number of documents to skip in the next cycle
            numSkip += docsSliceSize

            # Convert python dictionary into Couch WQE objects, skipping aborted workflows
            # And sort them by creation time and priority, such that highest priority and
            # oldest elements come first in the list
            sortedElements = []
            for doc in result:
                element = CouchWorkQueueElement.fromDocument(self.db, doc)
                # make sure not to acquire work for aborted or force-completed workflows
                if element['RequestName'] in excludeWorkflows:
                    msg = "Skipping aborted/force-completed workflow: %s, work id: %s"
                    self.logger.info(msg, element['RequestName'], element._id)
                else:
                    sortedElements.append(element)
            sortAvailableElements(sortedElements)

            for element in sortedElements:
                if numElems <= 0:
                    break
                commonSites = possibleSites(element)
                prio = element['Priority']
                # shuffle list of common sites all the time to give everyone the same chance
                random.shuffle(commonSites)
                possibleSite = None
                for site in commonSites:
                    if site in thresholds:
                        # Count the number of jobs currently running of greater priority, if they
                        # are less than the site thresholds, then accept this element
                        curJobCount = sum(count
                                          for jobPrio, count in viewitems(siteJobCounts.get(site, {}))
                                          if jobPrio >= prio)
                        # lazy logger arguments: nothing is formatted unless debug is enabled
                        self.logger.debug("Job Count: %s, site: %s thresholds: %s",
                                          curJobCount, site, thresholds[site])
                        if curJobCount < thresholds[site]:
                            possibleSite = site
                            break

                if possibleSite:
                    self.logger.info(
                        "Accepting workflow: %s, with prio: %s, element id: %s, for site: %s",
                        element['RequestName'], prio, element.id, possibleSite)
                    numElems -= 1
                    elements.append(element)
                    siteJobCounts.setdefault(possibleSite, {})
                    siteJobCounts[possibleSite][prio] = siteJobCounts[possibleSite].setdefault(prio, 0) + \
                                                        element['Jobs'] * element.get('blowupFactor', 1.0)
                else:
                    self.logger.debug(
                        "No available resources for %s with doc id %s",
                        element['RequestName'], element.id)

            # Checked once per slice (and not only at the top of the element loop), such
            # that no extra slice is fetched from Couch when the very last element of
            # this slice was the one filling up the quota.
            if numElems <= 0:
                self.logger.info(
                    "Reached maximum number of elements to be accepted, "
                    "configured to: %s, from queue: %s",
                    len(elements), self.queueUrl)
                limitReached = True

        self.logger.info(
            "And %d elements passed location and siteJobCounts restrictions for: %s",
            len(elements), self.queueUrl)
        return elements, siteJobCounts
예제 #15
0
    def calculateAvailableWork(self, thresholds, siteJobCounts):
        """
        Trimmed down flavour of the `availableWork` method, meant only to work out
        how much work is already sitting in the local workqueue.

        All the elements are retrieved with a single call and each of them is
        accounted against the first of its (shuffled) possible sites that is
        known to `thresholds` and still below its threshold. `siteJobCounts`
        is updated in place.

        :param thresholds: a dictionary key'ed by the site name, values representing the
            maximum number of jobs allowed at that site.
        :param siteJobCounts: a dictionary-of-dictionaries key'ed by the site name; value
            is a dictionary with the number of jobs running at a given priority.
        :return: a tuple with the elements accepted and an overview of job counts per site
        """
        # NOTE: this method can be less verbose as well
        elements = []
        # Without any site there is nothing to be matched
        if not thresholds:
            self.logger.error("No thresholds is set: Please check")
            return elements, siteJobCounts

        self.logger.info("Calculating available work from queue %s",
                         self.queueUrl)

        options = {'include_docs': True,
                   'descending': True,
                   'resources': thresholds,
                   'num_elem': 9999999}  # magic number!
        rawDocs = self.db.loadList('WorkQueue', 'workRestrictions',
                                   'availableByPriority', options)
        docs = json.loads(rawDocs)
        self.logger.info(
            "Retrieved %d elements from workRestrictions list for: %s",
            len(docs), self.queueUrl)

        # Build the Couch WQE objects out of the plain dictionaries and let
        # sortAvailableElements order them in place
        sortedElements = [CouchWorkQueueElement.fromDocument(self.db, doc) for doc in docs]
        sortAvailableElements(sortedElements)

        for wqe in sortedElements:
            candidateSites = possibleSites(wqe)
            prio = wqe['Priority']
            # reshuffle the candidates for every element, giving all sites the same chance
            random.shuffle(candidateSites)

            chosenSite = None
            for candidate in candidateSites:
                if candidate not in thresholds:
                    continue
                # jobs already counted at this site with the same or higher priority;
                # the element is taken if they are below the site threshold
                jobsAtSite = viewitems(siteJobCounts.get(candidate, {}))
                curJobCount = sum(count for jobPrio, count in jobsAtSite if jobPrio >= prio)
                self.logger.debug("Job Count: %s, site: %s thresholds: %s",
                                  curJobCount, candidate, thresholds[candidate])
                if curJobCount < thresholds[candidate]:
                    chosenSite = candidate
                    break

            if not chosenSite:
                self.logger.debug(
                    "No available resources for %s with localdoc id %s",
                    wqe['RequestName'], wqe.id)
                continue

            self.logger.debug(
                "Meant to accept workflow: %s, with prio: %s, element id: %s, for site: %s",
                wqe['RequestName'], prio, wqe.id, chosenSite)
            elements.append(wqe)
            jobsByPrio = siteJobCounts.setdefault(chosenSite, {})
            jobsByPrio[prio] = jobsByPrio.setdefault(prio, 0) + \
                               wqe['Jobs'] * wqe.get('blowupFactor', 1.0)

        self.logger.info(
            "And %d elements passed location and siteJobCounts restrictions for: %s",
            len(elements), self.queueUrl)
        return elements, siteJobCounts
예제 #16
0
    def availableWork(self, thresholds, siteJobCounts, team=None, wfs=None,
                      excludeWorkflows=None, numElems=9999999):
        """
        Get work which is available to be run

        An element is accepted when one of its possible sites is present in
        `thresholds` and the jobs already counted for that site, at the same or
        higher priority, are below the site threshold. `siteJobCounts` is updated
        in place for every accepted element.

        Note: this method will be called with no limit of work elements when it's simply
        calculating the resources available (based on what is in LQ), before it gets work
        from GQ

        :param thresholds: a dictionary key'ed by the site name, values are the
            maximum number of running jobs at that site.
        :param siteJobCounts: a dictionary-of-dictionaries; keys are the site name
            and task priorities. The value is the number of jobs running at that priority.
        :param team: a string with the team name we want to pull work for
        :param wfs: optional list of workflow names; when provided, the view is
            queried in batches of 20 workflow names
        :param excludeWorkflows: list of workflows that should not be accepted
        :param numElems: integer passed to the list function as `num_elem`
            (defaults to a very large number, read "infinity")
        :return: a tuple with the elements accepted, the thresholds and the job
            counts per site
        """
        self.logger.info("Getting up to %d available work from %s", numElems, self.queueUrl)
        self.logger.info("  for team name: %s", team)
        self.logger.info("  for wfs: %s", wfs)
        self.logger.info("  with excludeWorkflows: %s", excludeWorkflows)
        self.logger.info("  for thresholds: %s", thresholds)

        excludeWorkflows = excludeWorkflows or []
        elements = []
        sortedElements = []

        # We used to pre-filter sites, looking to see if there are idle job slots
        # We don't do this anymore, as we may over-allocate
        # jobs to sites if the new jobs have a higher priority.

        # If there are no sites, punt early.
        if not thresholds:
            self.logger.error("No thresholds is set: Please check")
            return elements, thresholds, siteJobCounts

        options = {}
        options['include_docs'] = True
        options['descending'] = True
        options['num_elem'] = numElems
        options['resources'] = thresholds
        if team:
            options['team'] = team
        if wfs:
            result = []
            # `range` (and not `xrange`) such that it also runs under python3
            for i in range(0, len(wfs), 20):
                options['wfs'] = wfs[i:i + 20]
                data = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
                result.extend(json.loads(data))
        else:
            result = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
            result = json.loads(result)
            if not result:
                self.logger.info("No available work or it did not pass work/data restrictions for: %s ",
                                 self.queueUrl)
            else:
                self.logger.info("Retrieved %d elements from workRestrictions list for: %s",
                                 len(result), self.queueUrl)

        # Iterate through the results; apply whitelist / blacklist / data
        # locality restrictions.  Only assign jobs if they are high enough
        # priority.
        for i in result:
            element = CouchWorkQueueElement.fromDocument(self.db, i)
            # make sure not to acquire work for aborted or force-completed workflows
            if element['RequestName'] in excludeWorkflows:
                msg = "Skipping aborted/force-completed workflow: %s, work id: %s"
                self.logger.info(msg, element['RequestName'], element._id)
            else:
                sortedElements.append(element)
        # sort elements to get them in priority first and timestamp order
        # (two stable sorts: the second one keeps the creation time order among equal priorities)
        sortedElements.sort(key=lambda element: element['CreationTime'])
        sortedElements.sort(key=lambda x: x['Priority'], reverse=True)

        # random.shuffle needs a mutable sequence, while python3 dict.keys() is only
        # a view; make a real list out of the site names
        sites = list(thresholds)
        self.logger.info("Current siteJobCounts:")
        for site, jobsByPrio in siteJobCounts.items():
            self.logger.info("    %s : %s", site, jobsByPrio)

        for element in sortedElements:
            commonSites = possibleSites(element)
            prio = element['Priority']
            possibleSite = None
            # shuffle the sites for every element, such that the first match is random
            random.shuffle(sites)
            for site in sites:
                if site in commonSites:
                    # Count the number of jobs currently running of greater priority
                    curJobCount = sum(count
                                      for jobPrio, count in siteJobCounts.get(site, {}).items()
                                      if jobPrio >= prio)
                    # lazy logger arguments: nothing is formatted unless debug is enabled
                    self.logger.debug("Job Count: %s, site: %s thresholds: %s",
                                      curJobCount, site, thresholds[site])
                    if curJobCount < thresholds[site]:
                        possibleSite = site
                        break

            if possibleSite:
                elements.append(element)
                siteJobCounts.setdefault(possibleSite, {})
                siteJobCounts[possibleSite][prio] = siteJobCounts[possibleSite].setdefault(prio, 0) + \
                                                    element['Jobs'] * element.get('blowupFactor', 1.0)
            else:
                self.logger.debug("No available resources for %s with doc id %s", element['RequestName'], element.id)

        self.logger.info("And %d elements passed location and siteJobCounts restrictions for: %s",
                         len(elements), self.queueUrl)
        return elements, thresholds, siteJobCounts