Example #1
    def getFilesSortedByLocation(self, lumisPerJob):
        """
        _getFilesSortedByLocation_

        Retrieves a list of available files and sorts them by location.
        If the fileset is closed, resume the splitting. Otherwise, check whether
        there are enough lumis at each of these locations; if there are not enough
        to satisfy the desired lumis_per_job splitting parameter, skip those
        files until a later cycle.
        :param lumisPerJob: number of lumi sections desired in the splitting
        :return: a dictionary of files, keyed by a frozenset of locations
        """
        lDict = self.sortByLocation()
        if not self.loadRunLumi:
            return lDict  # then it's a DataStruct/CRAB splitting

        checkMinimumWork = self.checkForAmountOfWork()

        # first, check whether we have enough files to reach the desired lumis_per_job
        for sites in list(lDict):  # snapshot of the keys, since entries may be popped below
            fileLumis = self.loadRunLumi.execute(files=lDict[sites])
            if checkMinimumWork:
                # fileLumis has a format like {230: {1: [1]}, 232: {1: [2]}, 304: {1: [3]}, 306: {1: [4]}}
                availableLumisPerLocation = [runL for fileItem in fileLumis.values() for runL in fileItem.values()]

                if lumisPerJob > len(flattenList(availableLumisPerLocation)):
                    # then we don't split these files for the moment
                    lDict.pop(sites)
                    continue
            for f in lDict[sites]:
                lumiDict = fileLumis.get(f['id'], {})
                for run in lumiDict.keys():
                    f.addRun(run=Run(run, *lumiDict[run]))

        return lDict
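A minimal sketch of the flattenList helper these examples rely on, together with an illustration of the lumi-counting step above. The helper body is an assumption consistent with the tests in the next two examples, and the fileLumis values are made up:

def flattenList(doubleList):
    """Assumed behavior: flatten one level of nesting, e.g. a list of lists."""
    return [item for innerList in doubleList for item in innerList]

# Made-up values, in the format shown in the comment inside the method above
fileLumis = {230: {1: [1]}, 232: {1: [2]}, 304: {1: [3]}, 306: {1: [4, 5]}}
availableLumisPerLocation = [runL for fileItem in fileLumis.values() for runL in fileItem.values()]
# availableLumisPerLocation == [[1], [2], [3], [4, 5]]
print(len(flattenList(availableLumisPerLocation)))  # prints 5: total lumis available at this location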
Example #2

    def testFlattenList(self):
        """
        Test the flattenList function (returns a flat list out
        of a list of lists)
        """
        doubleList = [list(range(1, 4)), list(range(10, 11)), list(range(15, 18))]
        flatList = flattenList(doubleList)
        self.assertEqual(len(flatList), 7)
        self.assertEqual(set(flatList), set([1, 2, 3, 10, 15, 16, 17]))
Example #3

    def testFlattenList(self):
        """
        Test the flattenList function (returns a flat list out
        of a list of lists)
        """
        doubleList = [range(1, 4), range(10, 11), range(15, 18)]
        flatList = flattenList(doubleList)
        self.assertEqual(len(flatList), 7)
        self.assertEqual(set(flatList), set([1, 2, 3, 10, 15, 16, 17]))
Example #4
    def checkStatus(self):
        """
        _checkStatus_

        Run the BossAir track() function (self-contained)
        and then check for jobs that have timed out.
        """

        runningJobs = self.bossAir.track()

        if len(runningJobs) < 1:
            # Then we have no jobs
            return

        if not self.timeouts:
            # Then we've set ourselves to have no timeouts
            # Get out and stay out
            return

        # Look for jobs that need to be killed
        jobsToKill = defaultdict(list)

        # Now check for timeouts
        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)
            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero",
                              job['id'])
                continue
            if timeout and statusTime:
                if time.time() - float(statusTime) > float(timeout):
                    # Timeout status is used by JobTracker to fail jobs in WMBS database
                    logging.info(
                        "Killing job %i because it has exceeded timeout for status '%s'",
                        job['id'], globalState)
                    job['status'] = 'Timeout'
                    jobsToKill[globalState].append(job)

        timeOutCodeMap = {"Running": 71304, "Pending": 71305, "Error": 71306}
        # We need to show that the jobs are in state timeout
        # and then kill them.
        jobsToKillList = flattenList(listvalues(jobsToKill))
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=jobsToKillList)
        for preJobStatus in jobsToKill:
            eCode = timeOutCodeMap.get(
                preJobStatus, 71307
            )  # it shouldn't have 71307 (states should be among Running, Pending, Error)
            self.bossAir.kill(jobs=jobsToKill[preJobStatus],
                              killMsg=WM_JOB_ERROR_CODES[eCode],
                              errorCode=eCode)
        myThread.transaction.commit()

        return
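For illustration only: jobsToKill maps a job state to a list of job dictionaries, so flattening its values produces the single flat list that the bulk bossAir.update() call expects. The flattenList body and the job dictionaries below are assumptions for the sake of the example:

def flattenList(doubleList):
    """Assumed behavior: flatten one level of nesting."""
    return [item for innerList in doubleList for item in innerList]

jobsToKill = {"Running": [{'id': 1, 'status': 'Timeout'}, {'id': 2, 'status': 'Timeout'}],
              "Pending": [{'id': 3, 'status': 'Timeout'}]}
jobsToKillList = flattenList(jobsToKill.values())
print(jobsToKillList)  # one flat list with the three job dictionaries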
Example #5
    def checkStatus(self):
        """
        _checkStatus_

        Run the BossAir track() function (self-contained)
        and then check for jobs that have timed out.
        """


        runningJobs = self.bossAir.track()

        if len(runningJobs) < 1:
            # Then we have no jobs
            return

        if not self.timeouts:
            # Then we've set ourselves to have no timeouts
            # Get out and stay out
            return

        # Look for jobs that need to be killed
        jobsToKill = defaultdict(list)

        # Now check for timeouts
        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)
            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero", job['id'])
                continue
            if timeout and statusTime:
                if time.time() - float(statusTime) > float(timeout):
                    # Timeout status is used by JobTracker to fail jobs in WMBS database
                    logging.info("Killing job %i because it has exceeded timeout for status '%s'", job['id'], globalState)
                    job['status'] = 'Timeout'
                    jobsToKill[globalState].append(job)

        timeOutCodeMap = {"Running": 71304, "Pending": 71305, "Error": 71306}
        # We need to show that the jobs are in state timeout
        # and then kill them.
        jobsToKillList = flattenList(jobsToKill.values())
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=jobsToKillList)
        for preJobStatus in jobsToKill:
            eCode = timeOutCodeMap.get(preJobStatus, 71307) # it shouldn't have 71307 (states should be among Running, Pending, Error)
            self.bossAir.kill(jobs=jobsToKill[preJobStatus], killMsg=WM_JOB_ERROR_CODES[eCode], errorCode=eCode)
        myThread.transaction.commit()

        return
Example #6
    def getFilesSortedByLocation(self, lumisPerJob):
        """
        _getFilesSortedByLocation_

        Retrieves a list of available files and sorts them by location.
        If the fileset is closed, resume the splitting. Otherwise, check whether
        there are enough lumis at each of these locations; if there are not enough
        to satisfy the desired lumis_per_job splitting parameter, skip those
        files until a later cycle.
        :param lumisPerJob: number of lumi sections desired in the splitting
        :return: a dictionary of files, keyed by a frozenset of locations
        """
        lDict = self.sortByLocation()
        if not self.loadRunLumi:
            return lDict  # then it's a DataStruct/CRAB splitting

        checkMinimumWork = self.checkForAmountOfWork()

        # first, check whether we have enough files to reach the desired lumis_per_job
        for sites in list(lDict):  # snapshot of the keys, since entries may be popped below
            fileLumis = self.loadRunLumi.execute(files=lDict[sites])
            if not fileLumis:
                logging.warning("Empty fileLumis dict for workflow %s, subs %s.",
                                self.subscription.workflowName(), self.subscription['id'])
            if checkMinimumWork:
                # fileLumis has a format like {230: {1: [1]}, 232: {1: [2]}, 304: {1: [3]}, 306: {1: [4]}}
                availableLumisPerLocation = [runL for fileItem in fileLumis.values() for runL in fileItem.values()]

                if lumisPerJob > len(flattenList(availableLumisPerLocation)):
                    # then we don't split these files for the moment
                    lDict.pop(sites)
                    continue
            for f in lDict[sites]:
                lumiDict = fileLumis.get(f['id'], {})
                for run in lumiDict.keys():
                    f.addRun(run=Run(run, *lumiDict[run]))

        return lDict
Example #7
def getWMBSInfo(config):
    """
    Print a summary of workflows, workqueue elements, WMBS jobs, subscriptions,
    and DBS/PhEDEx file status known to this agent.
    """
    connectToDB()
    myThread = threading.currentThread()
    formatter = DBFormatter(logging, myThread.dbi)

    workflows = formatter.formatDict(myThread.dbi.processData(knownWorkflows))
    workflows = [wf['name'] for wf in workflows]
    print("\n*** WORKFLOWS: found %d distinct workflows in this agent." %
          len(workflows))
    workflowsDict = fetchWorkflowsSpec(config, workflows)
    printWfStatus(workflows, workflowsDict)

    for st in ('Available', 'Negotiating', 'Acquired', 'Running'):
        print(
            "\n*** WORKQUEUE: elements still marked as %s in LQ workqueue / workqueue_inbox."
            % st)
        checkLocalWQStatus(config, st)

    for st in ("Acquired", "Running"):
        print("\n*** WORKQUEUE: elements still marked as %s in GQ workqueue." %
              st)
        checkGlobalWQStatus(config, st)

    workflows = formatter.formatDict(myThread.dbi.processData(incompleteWfs))
    workflows = [wf['name'] for wf in workflows]
    print("\n*** WORKFLOWS: there are %d distinct workflows not completed." %
          len(workflows))
    printWfStatus(workflows, workflowsDict)

    wfsNotInjected = flattenList(
        formatter.format(myThread.dbi.processData(workflowsNotInjected)))
    print("\n*** WORKFLOWS: found %d workflows not fully injected." %
          len(wfsNotInjected))
    printWfStatus(wfsNotInjected, workflowsDict)

    jobsByState = formatter.formatDict(
        myThread.dbi.processData(jobCountByState))
    print("\n*** WMBS: amount of wmbs jobs in each status:\n%s" % jobsByState)
    # IF we have executing jobs in wmbs and nothing in condor, then investigate the wfs
    if 'executing' in [item['name'] for item in jobsByState]:
        wfsJobCount = formatter.formatDict(
            myThread.dbi.processData(workflowsExecuting))
        print("\n*** WMBS: %d workflows with executing jobs in wmbs:" %
              len(wfsJobCount))
        workflows = [wf['name'] for wf in wfsJobCount]
        printWfStatus(workflows, workflowsDict)

    unfinishedSubs = formatter.formatDict(
        myThread.dbi.processData(unfinishedSubscriptions))
    unfinishedSubs = [wf['wfname'] for wf in unfinishedSubs]
    print("\n*** SUBSCRIPTIONS: subscriptions not finished: %d" %
          len(unfinishedSubs))
    printWfStatus(unfinishedSubs, workflowsDict)

    filesAvailable = formatter.formatDict(
        myThread.dbi.processData(filesAvailWMBS))
    print(
        "\n*** SUBSCRIPTIONS: found %d files available in WMBS (waiting for job creation):\n%s"
        % (len(filesAvailable), filesAvailable))

    filesAcquired = formatter.formatDict(
        myThread.dbi.processData(filesAcqWMBS))
    print(
        "\n*** SUBSCRIPTIONS: found %d files acquired in WMBS (waiting for jobs to finish):\n%s"
        % (len(filesAcquired), filesAcquired))

    blocksopenDBS = formatter.formatDict(
        myThread.dbi.processData(blocksOpenDBS))
    print("\n*** DBS: found %d blocks open in DBS." % len(blocksopenDBS),
          end="")
    print(" Printing the first 20 blocks only:\n%s" % blocksopenDBS[:20])

    filesnotinDBS = flattenList(
        formatter.format(myThread.dbi.processData(filesNotInDBS)))
    print("\n*** DBS: found %d files not uploaded to DBS.\n" %
          len(filesnotinDBS))
    getDsetAndWf(filesnotinDBS, workflowsDict)

    filesnotinPhedex = flattenList(
        formatter.format(myThread.dbi.processData(filesNotInPhedex)))
    print(
        "\n*** PHEDEX: found %d files not injected in PhEDEx, with valid block id (recoverable)."
        % len(filesnotinPhedex))
    getDsetAndWf(filesnotinPhedex, workflowsDict)

    filesnotinPhedexNull = flattenList(
        formatter.format(myThread.dbi.processData(filesNotInPhedexNull)))
    print(
        "\n*** PHEDEX: found %d files not injected in PhEDEx, with valid block id (unrecoverable)."
        % len(filesnotinPhedexNull))
    getDsetAndWf(filesnotinPhedexNull, workflowsDict)
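For illustration only: in this script flattenList is applied to formatter.format(...), which is assumed here to return the raw query result as a list of rows, each row itself a list of column values, so flattening a single-column result yields a plain list of workflow names. The rows and the flattenList body below are invented for the example:

def flattenList(doubleList):
    """Assumed behavior: flatten one level of nesting."""
    return [item for innerList in doubleList for item in innerList]

rows = [['workflow_A'], ['workflow_B'], ['workflow_C']]  # invented single-column query result
wfsNotInjected = flattenList(rows)
print(wfsNotInjected)  # ['workflow_A', 'workflow_B', 'workflow_C']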
Example #8

    def _wfParse(self):
        """
        Workflow description parser. Given a document template representing all the
        keyNames to be searched and a workflow description to search in recursively,
        returns all the fields that it can find, aggregated according to the rules below:
            * if the number of found key instances is 0 - sets the default value from
              the template.
            * if the number of found key instances is 1 - sets the value found in the
              workflow description and converts it back to the form expected and described
              in the template (removes the outermost list used for value aggregation)
            * if the number of found key instances is > 1 - the values are aggregated
              according to the expected types and data structure defined in the
              template as follows:
                * bool: sets it to True if any of the values found was set to True
                * list: chains/flattens all the sub lists into a single list containing
                        all the values found
                * dict: aggregates/flattens all the key-value pairs from all the
                        dictionaries found into one big dictionary
                        WARNING: (if an inner keyName happens to be found in multiple
                                  dictionaries from the aggregated list of dictionaries
                                  it will be overwritten with the values from the last
                                  one to be merged into the finally constructed dictionary)!
                * str:  will be accumulated in a list containing all the values found
                        WARNING: (will change the expected structure of the field from
                                  a single string to a list of strings)!

        :param wfDescr:     Dictionary with the workflow description
        :param docTemplate: Document template in the form of a list of tuples as follows:
                            [('KeyName', DefaultValue, type),
                             ('KeyName', DefaultValue, type),
                             ...]
                            To be used for identifying the fields to be searched for
                            in the workflow description
        """

        # Convert the aggregated extDoc back to the original structure:
        for keyName, data in viewitems(self.extDoc):
            if len(data['values']) == 0:
                self.extDoc[keyName] = deepcopy(data['default'])
            elif len(data['values']) == 1:
                self.extDoc[keyName] = deepcopy(data['values'][0])
            elif len(data['values']) > 1:
                if data['type'] is bool:
                    self.extDoc[keyName] = any(data['values'])
                elif data['type'] is list:
                    self.extDoc[keyName] = list(
                        set(flattenList(data['values'])))
                    # WARNING: If this list happens to be constructed out of elements
                    #          which are instances of unhashable types (e.g. dict, list),
                    #          the set() call will raise a TypeError, but this is unlikely
                    #          to happen, see [1] - all the fields we fetch from the
                    #          nested structure of the Task/Step Chain dictionary are
                    #          of hashable types.
                    # [1] https://github.com/dmwm/WMCore/blob/ed40d33069bdddcd98ed5b8430d5ca6662e5941f/src/python/WMCore/WMSpec/StdSpecs/StdBase.py#L1189
                elif data['type'] is dict:
                    self.extDoc[keyName] = {}
                    for item in data['values']:
                        self.extDoc[keyName].update(item)
                elif (isinstance(data['type'], tuple) and (bytes in data['type'] or str in data['type'])) or \
                     (data['type'] is bytes or data['type'] is str):
                    data['values'] = list(set(data['values']))
                    if len(data['values']) == 1:
                        self.extDoc[keyName] = deepcopy(data['values'][0])
                    else:
                        self.extDoc[keyName] = deepcopy(data['values'])
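For illustration only, this is how keys found more than once end up aggregated according to the docstring above: 'list'-typed values are flattened with flattenList and de-duplicated with set(), while 'dict'-typed values are merged with later entries winning. All sample values and the flattenList body are invented:

def flattenList(doubleList):
    """Assumed behavior: flatten one level of nesting."""
    return [item for innerList in doubleList for item in innerList]

# 'list' type: flatten and de-duplicate (order not guaranteed because of set())
listValues = [['/Prim/Era-v1/AOD', '/Prim/Era-v1/MINIAOD'], ['/Prim/Era-v1/AOD']]
print(list(set(flattenList(listValues))))  # each dataset name appears exactly once

# 'dict' type: merge all dictionaries; a repeated inner key keeps the last value merged in
merged = {}
for item in [{'CMSSW_1': ['Task1']}, {'CMSSW_2': ['Task2']}, {'CMSSW_1': ['Task3']}]:
    merged.update(item)
print(merged)  # {'CMSSW_1': ['Task3'], 'CMSSW_2': ['Task2']}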
Example #9

def getWMBSInfo(config):
    """
    blah
    :return:
    """
    connectToDB()
    myThread = threading.currentThread()
    formatter = DBFormatter(logging, myThread.dbi)

    workflows = formatter.formatDict(myThread.dbi.processData(knownWorkflows))
    workflows = [wf['name'] for wf in workflows]
    print("\n*** WORKFLOWS: found %d distinct workflows in this agent." % len(workflows))
    workflowsDict = fetchWorkflowsSpec(config, workflows)
    printWfStatus(workflows, workflowsDict)

    for st in ('Available', 'Negotiating', 'Acquired', 'Running'):
        print("\n*** WORKQUEUE: elements still marked as %s in LQ workqueue / workqueue_inbox." % st)
        checkLocalWQStatus(config, st)

    for st in ("Acquired", "Running"):
        print("\n*** WORKQUEUE: elements still marked as %s in GQ workqueue." % st)
        checkGlobalWQStatus(config, st)

    workflows = formatter.formatDict(myThread.dbi.processData(incompleteWfs))
    workflows = [wf['name'] for wf in workflows]
    print("\n*** WORKFLOWS: there are %d distinct workflows not completed." % len(workflows))
    printWfStatus(workflows, workflowsDict)

    wfsNotInjected = flattenList(formatter.format(myThread.dbi.processData(workflowsNotInjected)))
    print("\n*** WORKFLOWS: found %d workflows not fully injected." % len(wfsNotInjected))
    printWfStatus(wfsNotInjected, workflowsDict)

    jobsByState = formatter.formatDict(myThread.dbi.processData(jobCountByState))
    print("\n*** WMBS: amount of wmbs jobs in each status:\n%s" % jobsByState)
    # IF we have executing jobs in wmbs and nothing in condor, then investigate the wfs
    if 'executing' in [item['name'] for item in jobsByState]:
        wfsJobCount = formatter.formatDict(myThread.dbi.processData(workflowsExecuting))
        print("\n*** WMBS: %d workflows with executing jobs in wmbs:" % len(wfsJobCount))
        workflows = [wf['name'] for wf in wfsJobCount]
        printWfStatus(workflows, workflowsDict)

    unfinishedSubs = formatter.formatDict(myThread.dbi.processData(unfinishedSubscriptions))
    unfinishedSubs = [wf['wfname'] for wf in unfinishedSubs]
    print("\n*** SUBSCRIPTIONS: subscriptions not finished: %d" % len(unfinishedSubs))
    printWfStatus(unfinishedSubs, workflowsDict)

    filesAvailable = formatter.formatDict(myThread.dbi.processData(filesAvailWMBS))
    print("\n*** SUBSCRIPTIONS: found %d files available in WMBS (waiting for job creation):\n%s" % (len(filesAvailable),
                                                                                                     filesAvailable))

    filesAcquired = formatter.formatDict(myThread.dbi.processData(filesAcqWMBS))
    print("\n*** SUBSCRIPTIONS: found %d files acquired in WMBS (waiting for jobs to finish):\n%s" % (len(filesAcquired),
                                                                                                      filesAcquired))

    blocksopenDBS = formatter.formatDict(myThread.dbi.processData(blocksOpenDBS))
    print("\n*** DBS: found %d blocks open in DBS." % len(blocksopenDBS), end="")
    print(" Printing the first 20 blocks only:\n%s" % blocksopenDBS[:20])

    filesnotinDBS = flattenList(formatter.format(myThread.dbi.processData(filesNotInDBS)))
    print("\n*** DBS: found %d files not uploaded to DBS.\n" % len(filesnotinDBS))
    getDsetAndWf(filesnotinDBS, workflowsDict)

    filesnotinPhedex = flattenList(formatter.format(myThread.dbi.processData(filesNotInPhedex)))
    print("\n*** PHEDEX: found %d files not injected in PhEDEx, with valid block id (recoverable)." % len(filesnotinPhedex))
    getDsetAndWf(filesnotinPhedex, workflowsDict)

    filesnotinPhedexNull = flattenList(formatter.format(myThread.dbi.processData(filesNotInPhedexNull)))
    print("\n*** PHEDEX: found %d files not injected in PhEDEx, with valid block id (unrecoverable)." % len(filesnotinPhedexNull))
    getDsetAndWf(filesnotinPhedexNull, workflowsDict)