def report(self, workflow, userdn, userproxy=None):
    res = {}
    self.logger.info("About to get output of workflow: %s. Getting status first." % workflow)
    statusRes = self.status(workflow, userdn, userproxy)[0]

    #load the lumimask
    rows = self.api.query(None, None, ID.sql, taskname=workflow)
    splitArgs = literal_eval(rows.next()[6].read())
    res['lumiMask'] = buildLumiMask(splitArgs['runs'], splitArgs['lumis'])

    #extract the finished jobs from filemetadata
    jobids = [x[1] for x in statusRes['jobList'] if x[0] in ['finished']]
    rows = self.api.query(None, None, GetFromPandaIds.sql, types='EDM', taskname=workflow,
                          jobids=','.join(map(str, jobids)), limit=len(jobids) * 100)
    res['runsAndLumis'] = {}
    for row in rows:
        res['runsAndLumis'][str(row[GetFromPandaIds.PANDAID])] = {
            'parents': row[GetFromPandaIds.PARENTS].read(),
            'runlumi': row[GetFromPandaIds.RUNLUMI].read(),
            'events': row[GetFromPandaIds.INEVENTS],
        }

    yield res
def algorithm(self, *args, **kwargs):
    """
    _algorithm_
    """
    myThread = threading.currentThread()

    periodicInterval = kwargs.get("periodic_harvest_interval", 0)
    periodicSibling = kwargs.get("periodic_harvest_sibling", False)
    dqmHarvestUnit = kwargs.get("dqmHarvestUnit", "byRun")
    runs = kwargs.get("runs", None)
    lumis = kwargs.get("lumis", None)

    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    releasePeriodicJobDAO = daoFactory(classname="JobSplitting.ReleasePeriodicJob")
    periodicSiblingCompleteDAO = daoFactory(classname="JobSplitting.PeriodicSiblingComplete")

    fileset = self.subscription.getFileset()
    fileset.load()

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    if periodicInterval and periodicInterval > 0:
        # Trigger the Periodic Job if
        #  * it is the first job OR
        #  * the last job ended more than periodicInterval seconds ago
        triggerJob = releasePeriodicJobDAO.execute(subscription=self.subscription["id"],
                                                   period=periodicInterval)
        if triggerJob:
            myThread.logger.debug("Creating Periodic harvesting job")
            self.createJobsLocationWise(fileset, False, dqmHarvestUnit, goodRunList)

    elif not fileset.open:
        # Trigger the EndOfRun job if
        #  * (same as Periodic to not have JobCreator go nuts and stop after the first iteration)
        #  * there is no Periodic sibling subscription OR
        #  * the Periodic sibling subscription is complete
        triggerJob = releasePeriodicJobDAO.execute(subscription=self.subscription["id"],
                                                   period=3600)
        if triggerJob and periodicSibling:
            triggerJob = periodicSiblingCompleteDAO.execute(subscription=self.subscription["id"])
        if triggerJob:
            myThread.logger.debug("Creating EndOfRun harvesting job")
            self.createJobsLocationWise(fileset, True, dqmHarvestUnit, goodRunList)

    return
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs
    """
    avgEventsPerJob = int(kwargs.get('events_per_job', 5000))
    eventLimit = int(kwargs.get('max_events_per_lumi', 20000))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
    ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
    collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)
    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))
    deterministicPileup = kwargs.get('deterministicPileup', False)
    eventsPerLumiInDataset = 0

    if deterministicPileup and self.package == 'WMCore.WMBS':
        getJobNumber = self.daoFactory(classname="Jobs.GetNumberOfJobsPerWorkflow")
        jobNumber = getJobNumber.execute(workflow=self.subscription.getWorkflow().id)
        self.nJobs = jobNumber

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            collectionName = kwargs.get('collectionName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')
            logging.info('Creating jobs for ACDC fileset %s' % filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
        except Exception as ex:
            msg = "Exception while trying to load goodRunList\n"
            if ignoreACDC:
                msg += "Ditching goodRunList\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                goodRunList = {}
            else:
                msg += "Refusing to create any jobs.\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                return
def report(self, workflow, userdn, userproxy=None):
    res = {}
    self.logger.info("About to compute report of workflow: %s. Getting status first." % workflow)
    statusRes = self.status(workflow, userdn, userproxy)[0]

    #load the lumimask
    rows = self.api.query(None, None, ID.sql, taskname=workflow)
    splitArgs = literal_eval(rows.next()[6].read())
    res['lumiMask'] = buildLumiMask(splitArgs['runs'], splitArgs['lumis'])
    self.logger.info("Lumi mask was: %s" % res['lumiMask'])

    #extract the finished jobs from filemetadata
    jobids = [x[1] for x in statusRes['jobList'] if x[0] in ['finished', 'transferring']]
    rows = self.api.query(None, None, GetFromTaskAndType.sql, filetype='EDM', taskname=workflow)
    res['runsAndLumis'] = {}
    for row in rows:
        if row[GetFromTaskAndType.PANDAID] in jobids:
            res['runsAndLumis'][str(row[GetFromTaskAndType.PANDAID])] = {
                'parents': row[GetFromTaskAndType.PARENTS].read(),
                'runlumi': row[GetFromTaskAndType.RUNLUMI].read(),
                'events': row[GetFromTaskAndType.INEVENTS],
            }
    # Count the EDM files actually collected, not the keys of res
    self.logger.info("Got %s edm files for workflow %s" % (len(res['runsAndLumis']), workflow))

    yield res
def algorithm(self, *args, **kwargs):
    """
    _algorithm_
    """
    myThread = threading.currentThread()

    periodicInterval = kwargs.get("periodic_harvest_interval", 0)
    periodicSibling = kwargs.get("periodic_harvest_sibling", False)
    dqmHarvestUnit = kwargs.get("dqmHarvestUnit", "byRun")
    runs = kwargs.get("runs", None)
    lumis = kwargs.get("lumis", None)
    runWhitelist = set(kwargs.get('runWhitelist', []))
    runBlacklist = set(kwargs.get('runBlacklist', []))
    goodRunList = runWhitelist.difference(runBlacklist)

    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    releasePeriodicJobDAO = daoFactory(classname="JobSplitting.ReleasePeriodicJob")
    periodicSiblingCompleteDAO = daoFactory(classname="JobSplitting.PeriodicSiblingComplete")

    fileset = self.subscription.getFileset()
    fileset.load()

    lumiMask = {}
    if runs and lumis:
        lumiMask = buildLumiMask(runs, lumis)

    if periodicInterval and periodicInterval > 0:
        # Trigger the Periodic Job if
        #  * it is the first job OR
        #  * the last job ended more than periodicInterval seconds ago
        triggerJob = releasePeriodicJobDAO.execute(subscription=self.subscription["id"],
                                                   period=periodicInterval)
        if triggerJob:
            myThread.logger.debug("Creating Periodic harvesting job")
            self.createJobsLocationWise(fileset, False, dqmHarvestUnit, lumiMask, goodRunList)

    elif not fileset.open:
        # Trigger the EndOfRun job if
        #  * (same as Periodic to not have JobCreator go nuts and stop after the first iteration)
        #  * there is no Periodic sibling subscription OR
        #  * the Periodic sibling subscription is complete
        triggerJob = releasePeriodicJobDAO.execute(subscription=self.subscription["id"],
                                                   period=3600)
        if triggerJob and periodicSibling:
            triggerJob = periodicSiblingCompleteDAO.execute(subscription=self.subscription["id"])
        if triggerJob:
            myThread.logger.debug("Creating EndOfRun harvesting job")
            self.createJobsLocationWise(fileset, True, dqmHarvestUnit, lumiMask, goodRunList)

    return
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs
    """
    myThread = threading.currentThread()

    lumisPerJob = int(kwargs.get('lumis_per_job', 1))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
    ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
    collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)
    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            collectionName = kwargs.get('collectionName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')
            logging.info('Creating jobs for ACDC fileset %s' % filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
        except Exception as ex:
            msg = "Exception while trying to load goodRunList\n"
            if ignoreACDC:
                msg += "Ditching goodRunList\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                goodRunList = {}
            else:
                msg += "Refusing to create any jobs.\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                return
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs
    """
    myThread = threading.currentThread()

    lumisPerJob = int(kwargs.get('lumis_per_job', 1))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
    ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
    collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            collectionName = kwargs.get('collectionName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')
            logging.info('Creating jobs for ACDC fileset %s' % filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
        except Exception as ex:
            msg = "Exception while trying to load goodRunList\n"
            if ignoreACDC:
                msg += "Ditching goodRunList\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                goodRunList = {}
            else:
                msg += "Refusing to create any jobs.\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                return
def testBuildLumiMask(self):
    from WMCore.WMSpec.WMTask import buildLumiMask

    runs = ['3', '4']
    lumis = ['1,4,23,45', '5,84,234,445']
    expected = {'3': [[1, 4], [23, 45]], '4': [[5, 84], [234, 445]]}

    #working
    self.assertEqual(buildLumiMask(runs, lumis), expected, "buildLumiMask")

    #number of runs different than number of lumis
    runs = ['3']
    lumis = ['1,4,23,45', '5,84,234,445']
    self.assertRaises(ValueError, buildLumiMask, runs, lumis)

    #wrong format of the number of lumis
    runs = ['3', '4']
    lumis = ['1,4,23,45', '5,84,234']
    self.assertRaises(ValueError, buildLumiMask, runs, lumis)
def testBuildLumiMask(self):
    from WMCore.WMSpec.WMTask import buildLumiMask

    runs = ['3', '4']
    lumis = ['1,4,23,45', '5,84,234,445']
    expected = {'3': [[1, 4], [23, 45]], '4': [[5, 84], [234, 445]]}

    # working
    self.assertEqual(buildLumiMask(runs, lumis), expected, "buildLumiMask")

    # number of runs different than number of lumis
    runs = ['3']
    lumis = ['1,4,23,45', '5,84,234,445']
    self.assertRaises(ValueError, buildLumiMask, runs, lumis)

    # wrong format of the number of lumis
    runs = ['3', '4']
    lumis = ['1,4,23,45', '5,84,234']
    self.assertRaises(ValueError, buildLumiMask, runs, lumis)
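# The two tests above pin down the behaviour expected of buildLumiMask, which is imported
# from WMCore.WMSpec.WMTask. As a hypothetical reference only (not the actual WMCore
# implementation), a minimal sketch consistent with those tests could look like this:
def buildLumiMaskSketch(runs, lumis):
    """Build {run: [[first, last], ...]} from parallel lists of runs and comma-separated lumi edges."""
    if len(runs) != len(lumis):
        raise ValueError("runs and lumis lists must have the same length")
    lumiMask = {}
    for run, lumiString in zip(runs, lumis):
        edges = [int(lumi) for lumi in lumiString.split(',')]
        if len(edges) % 2 != 0:
            raise ValueError("Lumi edges must come in (first, last) pairs: %s" % lumiString)
        lumiMask[str(run)] = [edges[i:i + 2] for i in range(0, len(edges), 2)]
    return lumiMask

# Example (matches the first assertion above):
#   buildLumiMaskSketch(['3', '4'], ['1,4,23,45', '5,84,234,445'])
#   -> {'3': [[1, 4], [23, 45]], '4': [[5, 84], [234, 445]]}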
def report(self, workflow, userdn, userproxy=None):
    res = {}
    self.logger.info("About to get output of workflow: %s. Getting status first." % workflow)
    statusRes = self.status(workflow, userdn, userproxy)[0]

    #load the lumimask
    rows = self.api.query(None, None, ID.sql, taskname = workflow)
    splitArgs = literal_eval(rows.next()[6].read())
    res['lumiMask'] = buildLumiMask(splitArgs['runs'], splitArgs['lumis'])

    #extract the finished jobs from filemetadata
    jobids = [x[1] for x in statusRes['jobList'] if x[0] in ['finished']]
    rows = self.api.query(None, None, GetFromPandaIds.sql, types='EDM', taskname=workflow,
                          jobids=','.join(map(str, jobids)), limit=len(jobids) * 100)
    res['runsAndLumis'] = {}
    for row in rows:
        res['runsAndLumis'][str(row[GetFromPandaIds.PANDAID])] = {
            'parents' : row[GetFromPandaIds.PARENTS].read(),
            'runlumi' : row[GetFromPandaIds.RUNLUMI].read(),
            'events' : row[GetFromPandaIds.INEVENTS],
        }

    yield res
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs
    """
    myThread = threading.currentThread()

    lumisPerJob = int(kwargs.get('lumis_per_job', 1))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
    ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
    collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)
    deterministicPileup = kwargs.get('deterministicPileup', False)
    eventsPerLumiInDataset = 0

    if deterministicPileup and self.package == 'WMCore.WMBS':
        getJobNumber = self.daoFactory(classname="Jobs.GetNumberOfJobsPerWorkflow")
        jobNumber = getJobNumber.execute(workflow=self.subscription.getWorkflow().id)
        self.nJobs = jobNumber

    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            collectionName = kwargs.get('collectionName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')
            logging.info('Creating jobs for ACDC fileset %s' % filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
        except Exception as ex:
            msg = "Exception while trying to load goodRunList\n"
            if ignoreACDC:
                msg += "Ditching goodRunList\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                goodRunList = {}
            else:
                msg += "Refusing to create any jobs.\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                return
def report(self, workflow, userdn, usedbs):
    """
    Computes the report for workflow. If usedbs is True, also query DBS and return
    information about the input and output datasets.
    """

    def _compactLumis(datasetInfo):
        """
        Helper function that converts runLumis divided per file
        (result of listDatasetFileDetails) to an aggregated result.
        """
        lumilist = {}
        for file, info in datasetInfo.iteritems():
            for run, lumis in info['Lumis'].iteritems():
                lumilist.setdefault(str(run), []).extend(lumis)
        return lumilist

    res = {}
    self.logger.info("About to compute report of workflow: %s with usedbs=%s. Getting status first." % (workflow, usedbs))
    statusRes = self.status(workflow, userdn)[0]

    #get the information we need from the taskdb / initialize variables
    row = self.api.query(None, None, self.Task.ID_sql, taskname=workflow).next()
    row = self.Task.ID_tuple(*row)
    inputDataset = row.input_dataset
    outputDatasets = literal_eval(row.output_dataset.read() if row.output_dataset else 'None')
    dbsUrl = row.dbs_url

    #load the lumimask
    splitArgs = literal_eval(row.split_args.read())
    res['lumiMask'] = buildLumiMask(splitArgs['runs'], splitArgs['lumis'])
    self.logger.info("Lumi mask was: %s" % res['lumiMask'])

    #extract the finished jobs from filemetadata
    jobids = [x[1] for x in statusRes['jobList'] if x[0] in ['finished']]
    rows = self.api.query(None, None, self.FileMetaData.GetFromTaskAndType_sql,
                          filetype='EDM,TFILE,POOLIN', taskname=workflow)
    res['runsAndLumis'] = {}
    for row in rows:
        if row[GetFromTaskAndType.PANDAID] in jobids:
            if str(row[GetFromTaskAndType.PANDAID]) not in res['runsAndLumis']:
                res['runsAndLumis'][str(row[GetFromTaskAndType.PANDAID])] = []
            res['runsAndLumis'][str(row[GetFromTaskAndType.PANDAID])].append({
                'parents': row[GetFromTaskAndType.PARENTS].read(),
                'runlumi': row[GetFromTaskAndType.RUNLUMI].read(),
                'events': row[GetFromTaskAndType.INEVENTS],
                'type': row[GetFromTaskAndType.TYPE],
            })
    self.logger.info("Got %s edm files for workflow %s" % (len(res['runsAndLumis']), workflow))

    if usedbs:
        if not outputDatasets:
            raise ExecutionError("Cannot find any information about the output datasets names. You can try to execute 'crab report' with --dbs=no")
        try:
            #load the input dataset's lumilist
            dbs = DBSReader(dbsUrl)
            inputDetails = dbs.listDatasetFileDetails(inputDataset)
            res['dbsInLumilist'] = _compactLumis(inputDetails)
            self.logger.info("Aggregated input lumilist: %s" % res['dbsInLumilist'])
            #load the output datasets' lumilist
            res['dbsNumEvents'] = 0
            res['dbsNumFiles'] = 0
            res['dbsOutLumilist'] = {}
            dbs = DBSReader("https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader") #We can only publish here with DBS3
            outLumis = []
            for outputDataset in outputDatasets:
                outputDetails = dbs.listDatasetFileDetails(outputDataset)
                outLumis.append(_compactLumis(outputDetails))
                res['dbsNumEvents'] += sum(x['NumberOfEvents'] for x in outputDetails.values())
                res['dbsNumFiles'] += sum(len(x['Parents']) for x in outputDetails.values())
            outLumis = LumiList(runsAndLumis=outLumis).compactList
            for run, lumis in outLumis.iteritems():
                res['dbsOutLumilist'][run] = reduce(lambda x1, x2: x1 + x2,
                                                    map(lambda x: range(x[0], x[1] + 1), lumis))
            self.logger.info("Aggregated output lumilist: %s" % res['dbsOutLumilist'])
        except Exception as ex:
            msg = "Failed to contact DBS: %s" % str(ex)
            self.logger.exception(msg)
            raise ExecutionError("Exception while contacting DBS. Cannot get the input/output lumi lists. You can try to execute 'crab report' with --dbs=no")

    yield res
def algorithm(self, *args, **kwargs): """ _algorithm_ Split up all the available files such that each job will process a maximum of 'files_per_job'. If the 'files_per_job' parameters is not passed in jobs will process a maximum of 10 files. """ filesPerJob = int(kwargs.get("files_per_job", 10)) jobsPerGroup = int(kwargs.get("jobs_per_group", 0)) totalFiles = int(kwargs.get("total_files", 0)) runs = kwargs.get('runs', None) lumis = kwargs.get('lumis', None) runBoundaries = kwargs.get("respect_run_boundaries", False) getParents = kwargs.get("include_parents", False) filesInJob = 0 timePerEvent, sizePerEvent, memoryRequirement = \ self.getPerformanceParameters(kwargs.get('performance', {})) goodRunList = {} if runs and lumis: goodRunList = buildLumiMask(runs, lumis) #Get a dictionary of sites, files lDict = self.sortByLocation() locationDict = {} for key in lDict: newlist = [] for f in lDict[key]: if runs and lumis: ## Skip this file is it has no runs. if len(f['runs']) == 0: continue f['lumiCount'] = 0 f['runs'] = sorted(f['runs']) for run in f['runs']: run.lumis.sort() f['lumiCount'] += len(run.lumis) f['lowestRun'] = f['runs'][0] ## Skip this file is it has no lumis. if f['lumiCount'] == 0: continue ## Do average event per lumi calculation. f['avgEvtsPerLumi'] = round(float(f['events']) / f['lumiCount']) newlist.append(f) locationDict[key] = sorted(newlist, key = lambda f: f['lfn']) ## Make a list with all the files, sorting them by LFN. Remove from the list all ## the files filtered out by the lumi-mask (if there is one). files = [] for filesPerLocSet in locationDict.values(): for f in filesPerLocSet: files.append(f) if len(files): files = sorted(files, key = lambda f: f['lfn']) if runs and lumis: skippedFiles = [] for f in files: skipFile = True for run in f['runs']: if not isGoodRun(goodRunList, run.run): continue for lumi in run: if not isGoodLumi(goodRunList, run.run, lumi): continue skipFile = False if skipFile: skippedFiles.append(f) for f in skippedFiles: files.remove(f) ## Keep only the first totalFiles files. Remove the other files from the locationDict. 
if totalFiles > 0 and totalFiles < len(files): removedFiles = files[totalFiles:] files = files[:totalFiles] for f in removedFiles: for locSet in locationDict.keys(): if f in locationDict[locSet]: locationDict[locSet].remove(f) for locSet in locationDict.keys(): #Now we have all the files in a certain location set fileList = locationDict[locSet] filesInJob = 0 jobsInGroup = 0 self.newGroup() if len(fileList) == 0: continue jobRun = None for f in fileList: if getParents: parentLFNs = self.findParent(lfn = f['lfn']) for lfn in parentLFNs: parent = File(lfn = lfn) f['parents'].add(parent) fileRun = f.get('minrun', None) createNewJob = False if filesInJob == 0 or filesInJob == filesPerJob or (runBoundaries and fileRun != jobRun): createNewJob = True if runs and lumis: for run in f['runs']: if not isGoodRun(goodRunList, run.run): continue firstLumi = None lastLumi = None for lumi in run: if not isGoodLumi(goodRunList, run.run, lumi): if firstLumi != None and lastLumi != None: self.currentJob['mask'].addRunAndLumis(run = run.run, lumis = [firstLumi, lastLumi]) addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = addedEvents * timePerEvent runAddedSize = addedEvents * sizePerEvent self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize) firstLumi = None lastLumi = None continue if lastLumi != None and lumi != lastLumi + 1: self.currentJob['mask'].addRunAndLumis(run = run.run, lumis = [firstLumi, lastLumi]) addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = addedEvents * timePerEvent runAddedSize = addedEvents * sizePerEvent self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize) firstLumi = None lastLumi = None if createNewJob: if jobsPerGroup: if jobsInGroup > jobsPerGroup: self.newGroup() jobsInGroup = 0 self.newJob(name = self.getJobName()) self.currentJob.addResourceEstimates(memory = memoryRequirement) filesInJob = 0 jobsInGroup += 1 jobRun = fileRun createNewJob = False self.currentJob.addFile(f) filesInJob += 1 if firstLumi == None: firstLumi = lumi lastLumi = lumi if self.currentJob and not f in self.currentJob['input_files']: self.currentJob.addFile(f) filesInJob += 1 if firstLumi != None and lastLumi != None: self.currentJob['mask'].addRunAndLumis(run = run.run, lumis = [firstLumi, lastLumi]) addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = addedEvents * timePerEvent runAddedSize = addedEvents * sizePerEvent self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize) firstLumi = None lastLumi = None else: if createNewJob: if jobsPerGroup: if jobsInGroup > jobsPerGroup: self.newGroup() jobsInGroup = 0 self.newJob(name = self.getJobName()) self.currentJob.addResourceEstimates(memory = memoryRequirement) filesInJob = 0 jobsInGroup += 1 jobRun = fileRun self.currentJob.addFile(f) filesInJob += 1 fileTime = f['events'] * timePerEvent fileSize = f['events'] * sizePerEvent self.currentJob.addResourceEstimates(jobTime = fileTime, disk = fileSize) return
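# The splitting loops above filter runs and lumis against goodRunList via isGoodRun and
# isGoodLumi, which the snippets import from WMCore rather than define here. As a sketch
# of the assumed behaviour only (hypothetical names, not the real WMCore helpers), given
# the {run: [[first, last], ...]} mask produced by buildLumiMask:
def isGoodRunSketch(goodRunList, run):
    """Accept every run when no mask is set, otherwise require the run to appear in the mask."""
    if not goodRunList:
        return True
    return str(run) in goodRunList

def isGoodLumiSketch(goodRunList, run, lumi):
    """Accept a lumi when no mask is set or it falls inside one of the run's [first, last] ranges."""
    if not goodRunList:
        return True
    return any(first <= lumi <= last for first, last in goodRunList.get(str(run), []))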
def algorithm(self, *args, **kwargs): """ _algorithm_ Split files into a number of lumis per job Allow a flag to determine if we split files between jobs """ avgEventsPerJob = int(kwargs.get('events_per_job', 5000)) jobLimit = int(kwargs.get('job_limit', 0)) jobTimeLimit = int( kwargs.get('job_time_limit', self.defaultJobTimeLimit)) totalEvents = int(kwargs.get('total_events', 0)) splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', False)) self.collectionName = kwargs.get('collectionName', None) splitOnRun = kwargs.get('splitOnRun', True) getParents = kwargs.get('include_parents', False) runWhitelist = kwargs.get('runWhitelist', []) runs = kwargs.get('runs', None) lumis = kwargs.get('lumis', None) applyLumiCorrection = bool(kwargs.get('applyLumiCorrection', False)) deterministicPileup = kwargs.get('deterministicPileup', False) timePerEvent, sizePerEvent, memoryRequirement = \ self.getPerformanceParameters(kwargs.get('performance', {})) eventsPerLumiInDataset = 0 if avgEventsPerJob <= 0: msg = "events_per_job parameter must be positive. Its value is: %d" % avgEventsPerJob raise RuntimeError(msg) if self.package == 'WMCore.WMBS': self.loadRunLumi = self.daoFactory( classname="Files.GetBulkRunLumi") if deterministicPileup: getJobNumber = self.daoFactory( classname="Jobs.GetNumberOfJobsPerWorkflow") self.nJobs = getJobNumber.execute( workflow=self.subscription.getWorkflow().id) logging.info( 'Creating jobs in DeterministicPileup mode for %s', self.subscription.workflowName()) goodRunList = {} if runs and lumis: goodRunList = buildLumiMask(runs, lumis) # If we have runLumi info, we need to load it from couch if self.collectionName: try: from WMCore.ACDC.DataCollectionService import DataCollectionService couchURL = kwargs.get('couchURL') couchDB = kwargs.get('couchDB') filesetName = kwargs.get('filesetName') logging.info('Creating jobs for ACDC fileset %s', filesetName) dcs = DataCollectionService(couchURL, couchDB) goodRunList = dcs.getLumiWhitelist(self.collectionName, filesetName) except Exception as ex: msg = "Exception while trying to load goodRunList. " msg += "Refusing to create any jobs.\nDetails: %s" % str(ex) logging.exception(msg) return lDict = self.getFilesSortedByLocation(avgEventsPerJob) if not lDict: logging.info( "There are not enough events/files to be splitted. 
Trying again next cycle" ) return locationDict = {} for key in lDict.keys(): newlist = [] # First we need to load the data if self.loadRunLumi: fileLumis = self.loadRunLumi.execute(files=lDict[key]) if not fileLumis: logging.warning( "Empty fileLumis dict for workflow %s, subs %s.", self.subscription.workflowName(), self.subscription['id']) for f in lDict[key]: lumiDict = fileLumis.get(f['id'], {}) for run in lumiDict.keys(): f.addRun(run=Run(run, *lumiDict[run])) for f in lDict[key]: if len(f['runs']) == 0: continue f['runs'] = sorted(f['runs']) f['lumiCount'] = 0 for run in f['runs']: run.lumis.sort() f['lumiCount'] += len(run.lumis) f['lowestRun'] = f['runs'][0] # Do average event per lumi calculation if f['lumiCount']: f['avgEvtsPerLumi'] = round( float(f['events']) / f['lumiCount']) if deterministicPileup: # We assume that all lumis are equal in the dataset eventsPerLumiInDataset = f['avgEvtsPerLumi'] else: # No lumis in the file, ignore it continue newlist.append(f) locationDict[key] = sorted(newlist, key=operator.itemgetter('lowestRun')) totalJobs = 0 lastLumi = None firstLumi = None lastRun = None lumisInJob = 0 totalAvgEventCount = 0 currentJobAvgEventCount = 0 stopTask = False self.lumiChecker = LumiChecker(applyLumiCorrection) for location in locationDict: # For each location, we need a new jobGroup self.newGroup() stopJob = True for f in locationDict[location]: if getParents: parentLFNs = self.findParent(lfn=f['lfn']) for lfn in parentLFNs: parent = File(lfn=lfn) f['parents'].add(parent) lumisInJobInFile = 0 updateSplitOnJobStop = False failNextJob = False # If estimated job time is higher the job time limit (condor limit) # and it's only one lumi then ditch that lumi timePerLumi = f['avgEvtsPerLumi'] * timePerEvent if timePerLumi > jobTimeLimit and f['lumiCount'] == 1: failNextJob = True stopJob = True lumisPerJob = 1 elif splitOnFile: # Then we have to split on every boundary stopJob = True # Check the average number of events per lumi in this file # Adapt the lumis per job to match the target conditions if f['avgEvtsPerLumi']: # If there are events in the file ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi'] lumisPerJob = max(int(math.floor(ratio)), 1) else: # Zero event file, then the ratio goes to infinity. 
Computers don't like that lumisPerJob = f['lumiCount'] else: # Analyze how many events does this job already has # Check how many we want as target, include as many lumi sections as possible updateSplitOnJobStop = True eventsRemaining = max( avgEventsPerJob - currentJobAvgEventCount, 0) if f['avgEvtsPerLumi']: lumisAllowed = int( math.floor( float(eventsRemaining) / f['avgEvtsPerLumi'])) else: lumisAllowed = f['lumiCount'] lumisPerJob = max(lumisInJob + lumisAllowed, 1) for run in f['runs']: if not isGoodRun(goodRunList=goodRunList, run=run.run): # Then skip this one continue if len(runWhitelist) > 0 and not run.run in runWhitelist: # Skip due to run whitelist continue firstLumi = None if splitOnRun and run.run != lastRun: # Then we need to kill this job and get a new one stopJob = True # Now loop over the lumis for lumi in run: if (not isGoodLumi(goodRunList, run=run.run, lumi=lumi) or self.lumiChecker.isSplitLumi( run.run, lumi, f)): # Kill the chain of good lumis # Skip this lumi if firstLumi != None and firstLumi != lumi: self.currentJob['mask'].addRunAndLumis( run=run.run, lumis=[firstLumi, lastLumi]) eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = eventsAdded * timePerEvent runAddedSize = eventsAdded * sizePerEvent self.currentJob.addResourceEstimates( jobTime=runAddedTime, disk=runAddedSize) firstLumi = None lastLumi = None continue # You have to kill the lumi chain if they're not continuous if lastLumi and not lumi == lastLumi + 1: self.currentJob['mask'].addRunAndLumis( run=run.run, lumis=[firstLumi, lastLumi]) eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = eventsAdded * timePerEvent runAddedSize = eventsAdded * sizePerEvent self.currentJob.addResourceEstimates( jobTime=runAddedTime, disk=runAddedSize) firstLumi = None lastLumi = None if firstLumi is None: # Set the first lumi in the run firstLumi = lumi # If we're full, end the job if lumisInJob == lumisPerJob: stopJob = True # Actually do the new job creation if stopJob: if firstLumi != None and lastLumi != None and lastRun != None: self.currentJob['mask'].addRunAndLumis( run=lastRun, lumis=[firstLumi, lastLumi]) eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = eventsAdded * timePerEvent runAddedSize = eventsAdded * sizePerEvent self.currentJob.addResourceEstimates( jobTime=runAddedTime, disk=runAddedSize) msg = None if failNextJob: msg = "File %s has a single lumi %s, in run %s " % ( f['lfn'], lumi, run.run) msg += "with too many events %d and it woud take %d sec to run" \ % (f['events'], timePerLumi) self.lumiChecker.closeJob(self.currentJob) self.newJob(name=self.getJobName(), failedJob=failNextJob, failedReason=msg) if deterministicPileup: skipEvents = ( self.nJobs - 1) * lumisPerJob * eventsPerLumiInDataset self.currentJob.addBaggageParameter( "skipPileupEvents", skipEvents) self.currentJob.addResourceEstimates( memory=memoryRequirement) failNextJob = False firstLumi = lumi lumisInJob = 0 lumisInJobInFile = 0 currentJobAvgEventCount = 0 totalJobs += 1 if jobLimit and totalJobs > jobLimit: msg = "Job limit of {0} jobs exceeded.".format( jobLimit) raise RuntimeError(msg) # Add the file to new jobs self.currentJob.addFile(f) if updateSplitOnJobStop: # Then we were carrying from a previous file # Reset calculations for this file updateSplitOnJobStop = False if f['avgEvtsPerLumi']: ratio = float( avgEventsPerJob) / f['avgEvtsPerLumi'] lumisPerJob = max(int(math.floor(ratio)), 1) else: lumisPerJob = f['lumiCount'] lumisInJob += 1 
lumisInJobInFile += 1 lastLumi = lumi stopJob = False lastRun = run.run totalAvgEventCount += f['avgEvtsPerLumi'] if self.currentJob and not f in self.currentJob[ 'input_files']: self.currentJob.addFile(f) # We stop here if there are more total events than requested. if totalEvents > 0 and totalAvgEventCount >= totalEvents: stopTask = True break if firstLumi != None and lastLumi != None: # Add this run to the mask self.currentJob['mask'].addRunAndLumis( run=run.run, lumis=[firstLumi, lastLumi]) eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = eventsAdded * timePerEvent runAddedSize = eventsAdded * sizePerEvent self.currentJob.addResourceEstimates( jobTime=runAddedTime, disk=runAddedSize) firstLumi = None lastLumi = None if stopTask: break if not splitOnFile: currentJobAvgEventCount += f[ 'avgEvtsPerLumi'] * lumisInJobInFile if stopTask: break if stopTask: break self.lumiChecker.closeJob(self.currentJob) self.lumiChecker.fixInputFiles() return
def algorithm(self, *args, **kwargs): """ _algorithm_ Split files into a number of lumis per job Allow a flag to determine if we split files between jobs """ avgEventsPerJob = int(kwargs.get('events_per_job', 5000)) eventLimit = int(kwargs.get('max_events_per_lumi', 20000)) totalEvents = int(kwargs.get('total_events', 0)) splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True)) ignoreACDC = bool(kwargs.get('ignore_acdc_except', False)) collectionName = kwargs.get('collectionName', None) splitOnRun = kwargs.get('splitOnRun', True) getParents = kwargs.get('include_parents', False) runWhitelist = kwargs.get('runWhitelist', []) runs = kwargs.get('runs', None) lumis = kwargs.get('lumis', None) timePerEvent, sizePerEvent, memoryRequirement = \ self.getPerformanceParameters(kwargs.get('performance', {})) deterministicPileup = kwargs.get('deterministicPileup', False) eventsPerLumiInDataset = 0 if deterministicPileup and self.package == 'WMCore.WMBS': getJobNumber = self.daoFactory(classname = "Jobs.GetNumberOfJobsPerWorkflow") jobNumber = getJobNumber.execute(workflow = self.subscription.getWorkflow().id) self.nJobs = jobNumber goodRunList = {} if runs and lumis: goodRunList = buildLumiMask(runs, lumis) # If we have runLumi info, we need to load it from couch if collectionName: try: from WMCore.ACDC.DataCollectionService import DataCollectionService couchURL = kwargs.get('couchURL') couchDB = kwargs.get('couchDB') filesetName = kwargs.get('filesetName') collectionName = kwargs.get('collectionName') owner = kwargs.get('owner') group = kwargs.get('group') logging.info('Creating jobs for ACDC fileset %s' % filesetName) dcs = DataCollectionService(couchURL, couchDB) goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group) except Exception as ex: msg = "Exception while trying to load goodRunList\n" if ignoreACDC: msg += "Ditching goodRunList\n" msg += str(ex) msg += str(traceback.format_exc()) logging.error(msg) goodRunList = {} else: msg += "Refusing to create any jobs.\n" msg += str(ex) msg += str(traceback.format_exc()) logging.error(msg) return lDict = self.sortByLocation() locationDict = {} # First we need to load the data if self.package == 'WMCore.WMBS': loadRunLumi = self.daoFactory(classname = "Files.GetBulkRunLumi") for key in lDict.keys(): newlist = [] # First we need to load the data if self.package == 'WMCore.WMBS': fileLumis = loadRunLumi.execute(files = lDict[key]) for f in lDict[key]: lumiDict = fileLumis.get(f['id'], {}) for run in lumiDict.keys(): f.addRun(run = Run(run, *lumiDict[run])) for f in lDict[key]: if len(f['runs']) == 0: continue f['runs'] = sorted(f['runs']) f['lumiCount'] = 0 for run in f['runs']: run.lumis.sort() f['lumiCount'] += len(run.lumis) f['lowestRun'] = f['runs'][0] #Do average event per lumi calculation if f['lumiCount']: f['avgEvtsPerLumi'] = round(float(f['events'])/f['lumiCount']) if deterministicPileup: # We assume that all lumis are equal in the dataset eventsPerLumiInDataset = f['avgEvtsPerLumi'] else: #No lumis in the file, ignore it continue newlist.append(f) locationDict[key] = sorted(newlist, key=operator.itemgetter('lowestRun')) totalJobs = 0 lastLumi = None firstLumi = None lastRun = None lumisInJob = 0 totalAvgEventCount = 0 currentJobAvgEventCount = 0 stopTask = False for location in locationDict: # For each location, we need a new jobGroup self.newGroup() stopJob = True for f in locationDict[location]: if getParents: parentLFNs = self.findParent(lfn = f['lfn']) for lfn in parentLFNs: parent = File(lfn = lfn) 
f['parents'].add(parent) lumisInJobInFile = 0 updateSplitOnJobStop = False failNextJob = False #If the number of events per lumi is higher than the limit #and it's only one lumi then ditch that lumi if f['avgEvtsPerLumi'] > eventLimit and f['lumiCount'] == 1: failNextJob = True stopJob = True lumisPerJob = 1 elif splitOnFile: # Then we have to split on every boundary stopJob = True #Check the average number of events per lumi in this file #Adapt the lumis per job to match the target conditions if f['avgEvtsPerLumi']: #If there are events in the file ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi'] lumisPerJob = max(int(math.floor(ratio)), 1) else: #Zero event file, then the ratio goes to infinity. Computers don't like that lumisPerJob = f['lumiCount'] else: #Analyze how many events does this job already has #Check how many we want as target, include as many lumi sections as possible updateSplitOnJobStop = True eventsRemaining = max(avgEventsPerJob - currentJobAvgEventCount, 0) if f['avgEvtsPerLumi']: lumisAllowed = int(math.floor(float(eventsRemaining) / f['avgEvtsPerLumi'])) else: lumisAllowed = f['lumiCount'] lumisPerJob = max(lumisInJob + lumisAllowed, 1) for run in f['runs']: if not isGoodRun(goodRunList = goodRunList, run = run.run): # Then skip this one continue if len(runWhitelist) > 0 and not run.run in runWhitelist: # Skip due to run whitelist continue firstLumi = None if splitOnRun and run.run != lastRun: # Then we need to kill this job and get a new one stopJob = True # Now loop over the lumis for lumi in run: if not isGoodLumi(goodRunList, run = run.run, lumi = lumi): # Kill the chain of good lumis # Skip this lumi if firstLumi != None and firstLumi != lumi: self.currentJob['mask'].addRunAndLumis(run = run.run, lumis = [firstLumi, lastLumi]) eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = eventsAdded * timePerEvent runAddedSize = eventsAdded * sizePerEvent self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize) firstLumi = None lastLumi = None continue # You have to kill the lumi chain if they're not continuous if lastLumi and not lumi == lastLumi + 1: self.currentJob['mask'].addRunAndLumis(run = run.run, lumis = [firstLumi, lastLumi]) eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = eventsAdded * timePerEvent runAddedSize = eventsAdded * sizePerEvent self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize) firstLumi = None lastLumi = None if firstLumi == None: # Set the first lumi in the run firstLumi = lumi # If we're full, end the job if lumisInJob == lumisPerJob: stopJob = True # Actually do the new job creation if stopJob: if firstLumi != None and lastLumi != None and lastRun != None: self.currentJob['mask'].addRunAndLumis(run = lastRun, lumis = [firstLumi, lastLumi]) eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = eventsAdded * timePerEvent runAddedSize = eventsAdded * sizePerEvent self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize) msg = None if failNextJob: msg = "File %s has too many events (%d) in %d lumi(s)" % (f['lfn'], f['events'], f['lumiCount']) self.newJob(name = self.getJobName(), failedJob = failNextJob, failedReason = msg) if deterministicPileup: self.currentJob.addBaggageParameter("skipPileupEvents", (self.nJobs - 1) * lumisPerJob * eventsPerLumiInDataset) self.currentJob.addResourceEstimates(memory = memoryRequirement) failNextJob = False firstLumi = lumi lumisInJob = 0 
lumisInJobInFile = 0 currentJobAvgEventCount = 0 totalJobs += 1 # Add the file to new jobs self.currentJob.addFile(f) if updateSplitOnJobStop: #Then we were carrying from a previous file #Reset calculations for this file updateSplitOnJobStop = False if f['avgEvtsPerLumi']: ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi'] lumisPerJob = max(int(math.floor(ratio)), 1) else: lumisPerJob = f['lumiCount'] lumisInJob += 1 lumisInJobInFile += 1 lastLumi = lumi stopJob = False lastRun = run.run totalAvgEventCount += f['avgEvtsPerLumi'] if self.currentJob and not f in self.currentJob['input_files']: self.currentJob.addFile(f) # We stop here if there are more total events than requested. if totalEvents > 0 and totalAvgEventCount >= totalEvents: stopTask = True break if firstLumi != None and lastLumi != None: # Add this run to the mask self.currentJob['mask'].addRunAndLumis(run = run.run, lumis = [firstLumi, lastLumi]) eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = eventsAdded * timePerEvent runAddedSize = eventsAdded * sizePerEvent self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize) firstLumi = None lastLumi = None if stopTask: break if not splitOnFile: currentJobAvgEventCount += f['avgEvtsPerLumi'] * lumisInJobInFile if stopTask: break if stopTask: break return
def report(self, workflow, userdn, usedbs):
    """
    Computes the report for workflow. If usedbs is True, also query DBS and return
    information about the input and output datasets.
    """

    def _compactLumis(datasetInfo):
        """
        Helper function that converts runLumis divided per file
        (result of listDatasetFileDetails) to an aggregated result.
        """
        lumilist = {}
        for dummyfile, info in datasetInfo.iteritems():
            for run, lumis in info['Lumis'].iteritems():
                lumilist.setdefault(str(run), []).extend(lumis)
        return lumilist

    res = {}
    self.logger.info("About to compute report of workflow: %s with usedbs=%s. Getting status first." % (workflow, usedbs))
    statusRes = self.status(workflow, userdn)[0]

    #get the information we need from the taskdb / initialize variables
    row = next(self.api.query(None, None, self.Task.ID_sql, taskname=workflow))
    row = self.Task.ID_tuple(*row)
    inputDataset = row.input_dataset
    outputDatasets = literal_eval(row.output_dataset.read() if row.output_dataset else 'None')
    dbsUrl = row.dbs_url

    #load the lumimask
    splitArgs = literal_eval(row.split_args.read())
    res['lumiMask'] = buildLumiMask(splitArgs['runs'], splitArgs['lumis'])
    self.logger.info("Lumi mask was: %s" % res['lumiMask'])

    #extract the finished jobs from filemetadata
    jobids = [x[1] for x in statusRes['jobList'] if x[0] in ['finished']]
    rows = self.api.query(None, None, self.FileMetaData.GetFromTaskAndType_sql,
                          filetype='EDM,TFILE,POOLIN', taskname=workflow)
    res['runsAndLumis'] = {}
    for row in rows:
        if row[GetFromTaskAndType.PANDAID] in jobids:
            if str(row[GetFromTaskAndType.PANDAID]) not in res['runsAndLumis']:
                res['runsAndLumis'][str(row[GetFromTaskAndType.PANDAID])] = []
            res['runsAndLumis'][str(row[GetFromTaskAndType.PANDAID])].append({
                'parents': row[GetFromTaskAndType.PARENTS].read(),
                'runlumi': row[GetFromTaskAndType.RUNLUMI].read(),
                'events': row[GetFromTaskAndType.INEVENTS],
                'type': row[GetFromTaskAndType.TYPE],
                'lfn': row[GetFromTaskAndType.LFN],
            })
    self.logger.info("Got %s edm files for workflow %s" % (len(res['runsAndLumis']), workflow))

    if usedbs:
        if not outputDatasets:
            raise ExecutionError("Cannot find any information about the output datasets names. You can try to execute 'crab report' with --dbs=no")
        try:
            #load the input dataset's lumilist
            dbs = DBSReader(dbsUrl)
            inputDetails = dbs.listDatasetFileDetails(inputDataset)
            res['dbsInLumilist'] = _compactLumis(inputDetails)
            self.logger.info("Aggregated input lumilist: %s" % res['dbsInLumilist'])
            #load the output datasets' lumilist
            res['dbsNumEvents'] = 0
            res['dbsNumFiles'] = 0
            res['dbsOutLumilist'] = {}
            dbs = DBSReader("https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader") #We can only publish here with DBS3
            outLumis = []
            for outputDataset in outputDatasets:
                outputDetails = dbs.listDatasetFileDetails(outputDataset)
                outLumis.append(_compactLumis(outputDetails))
                res['dbsNumEvents'] += sum(x['NumberOfEvents'] for x in outputDetails.values())
                res['dbsNumFiles'] += sum(len(x['Parents']) for x in outputDetails.values())
            outLumis = LumiList(runsAndLumis=outLumis).compactList
            for run, lumis in outLumis.iteritems():
                res['dbsOutLumilist'][run] = reduce(lambda x1, x2: x1 + x2,
                                                    map(lambda x: range(x[0], x[1] + 1), lumis))
            self.logger.info("Aggregated output lumilist: %s" % res['dbsOutLumilist'])
        except Exception as ex:
            msg = "Failed to contact DBS: %s" % str(ex)
            self.logger.exception(msg)
            raise ExecutionError("Exception while contacting DBS. Cannot get the input/output lumi lists. You can try to execute 'crab report' with --dbs=no")

    yield res
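# The reduce/map one-liner in the report above expands a compact lumi list such as
# [[1, 3], [7, 8]] into a flat list of lumi numbers ([1, 2, 3, 7, 8]). A hypothetical,
# more explicit equivalent (Python 2 range semantics, as in the snippet) for illustration:
def expandLumiRanges(lumiRanges):
    """Flatten [[first, last], ...] lumi ranges into a list of individual lumi numbers."""
    flat = []
    for first, last in lumiRanges:
        flat.extend(range(first, last + 1))
    return flat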
def algorithm(self, *args, **kwargs): """ _algorithm_ Split files into a number of lumis per job Allow a flag to determine if we split files between jobs """ myThread = threading.currentThread() lumisPerJob = int(kwargs.get('lumis_per_job', 1)) totalLumis = int(kwargs.get('total_lumis', 0)) splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True)) ignoreACDC = bool(kwargs.get('ignore_acdc_except', False)) collectionName = kwargs.get('collectionName', None) splitOnRun = kwargs.get('splitOnRun', True) getParents = kwargs.get('include_parents', False) runWhitelist = kwargs.get('runWhitelist', []) runs = kwargs.get('runs', None) lumis = kwargs.get('lumis', None) deterministicPileup = kwargs.get('deterministicPileup', False) applyLumiCorrection = bool(kwargs.get('applyLumiCorrection', False)) eventsPerLumiInDataset = 0 if deterministicPileup and self.package == 'WMCore.WMBS': getJobNumber = self.daoFactory(classname = "Jobs.GetNumberOfJobsPerWorkflow") jobNumber = getJobNumber.execute(workflow = self.subscription.getWorkflow().id) self.nJobs = jobNumber timePerEvent, sizePerEvent, memoryRequirement = \ self.getPerformanceParameters(kwargs.get('performance', {})) goodRunList = {} if runs and lumis: goodRunList = buildLumiMask(runs, lumis) # If we have runLumi info, we need to load it from couch if collectionName: try: from WMCore.ACDC.DataCollectionService import DataCollectionService couchURL = kwargs.get('couchURL') couchDB = kwargs.get('couchDB') filesetName = kwargs.get('filesetName') collectionName = kwargs.get('collectionName') logging.info('Creating jobs for ACDC fileset %s' % filesetName) dcs = DataCollectionService(couchURL, couchDB) goodRunList = dcs.getLumiWhitelist(collectionName, filesetName) except Exception as ex: msg = "Exception while trying to load goodRunList\n" if ignoreACDC: msg += "Ditching goodRunList\n" msg += str(ex) msg += str(traceback.format_exc()) logging.error(msg) goodRunList = {} else: msg += "Refusing to create any jobs.\n" msg += str(ex) msg += str(traceback.format_exc()) logging.error(msg) return lDict = self.sortByLocation() locationDict = {} # First we need to load the data if self.package == 'WMCore.WMBS': loadRunLumi = self.daoFactory(classname = "Files.GetBulkRunLumi") for key in lDict.keys(): newlist = [] # First we need to load the data if self.package == 'WMCore.WMBS': fileLumis = loadRunLumi.execute(files = lDict[key]) for f in lDict[key]: lumiDict = fileLumis.get(f['id'], {}) for run in lumiDict.keys(): f.addRun(run = Run(run, *lumiDict[run])) for f in lDict[key]: # if hasattr(f, 'loadData'): # f.loadData() if len(f['runs']) == 0: continue f['lumiCount'] = 0 f['runs'] = sorted(f['runs']) for run in f['runs']: run.lumis.sort() f['lumiCount'] += len(run.lumis) f['lowestRun'] = f['runs'][0] # Do average event per lumi calculation if f['lumiCount']: f['avgEvtsPerLumi'] = round(float(f['events']) / f['lumiCount']) if deterministicPileup: # We assume that all lumis are equal in the dataset eventsPerLumiInDataset = f['avgEvtsPerLumi'] else: # No lumis in the file, ignore it continue newlist.append(f) locationDict[key] = sorted(newlist, key = operator.itemgetter('lowestRun')) # Split files into jobs with each job containing # EXACTLY lumisPerJob number of lumis (except for maybe the last one) totalJobs = 0 lastLumi = None firstLumi = None stopJob = True stopTask = False lastRun = None lumisInJob = 0 lumisInTask = 0 self.lumiChecker = LumiChecker(applyLumiCorrection) for location in locationDict.keys(): # For each location, we need a new jobGroup 
self.newGroup() stopJob = True for f in locationDict[location]: if getParents: parentLFNs = self.findParent(lfn = f['lfn']) for lfn in parentLFNs: parent = File(lfn = lfn) f['parents'].add(parent) if splitOnFile: # Then we have to split on every boundary stopJob = True for run in f['runs']: if not isGoodRun(goodRunList = goodRunList, run = run.run): # Then skip this one continue if len(runWhitelist) > 0 and not run.run in runWhitelist: # Skip due to run whitelist continue firstLumi = None if splitOnRun and run.run != lastRun: # Then we need to kill this job and get a new one stopJob = True # Now loop over the lumis for lumi in run: if (not isGoodLumi(goodRunList, run = run.run, lumi = lumi) or self.lumiChecker.isSplitLumi(run.run, lumi, f)): # splitLumi checks if the lumi is split across jobs # Kill the chain of good lumis # Skip this lumi if firstLumi != None and firstLumi != lumi: self.currentJob['mask'].addRunAndLumis(run = run.run, lumis = [firstLumi, lastLumi]) addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = addedEvents * timePerEvent runAddedSize = addedEvents * sizePerEvent self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize) firstLumi = None lastLumi = None continue # You have to kill the lumi chain if they're not continuous if lastLumi and not lumi == lastLumi + 1: self.currentJob['mask'].addRunAndLumis(run = run.run, lumis = [firstLumi, lastLumi]) addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = addedEvents * timePerEvent runAddedSize = addedEvents * sizePerEvent self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize) firstLumi = None lastLumi = None if firstLumi == None: # Set the first lumi in the run firstLumi = lumi # If we're full, end the job if lumisInJob == lumisPerJob: stopJob = True # Actually do the new job creation if stopJob: if firstLumi != None and lastLumi != None and lastRun != None: self.currentJob['mask'].addRunAndLumis(run = lastRun, lumis = [firstLumi, lastLumi]) addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = addedEvents * timePerEvent runAddedSize = addedEvents * sizePerEvent self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize) self.lumiChecker.closeJob(self.currentJob) # before creating a new job add the lumis of the current one to the checker self.newJob(name = self.getJobName()) self.currentJob.addResourceEstimates(memory = memoryRequirement) if deterministicPileup: self.currentJob.addBaggageParameter("skipPileupEvents", (self.nJobs - 1) * lumisPerJob * eventsPerLumiInDataset) firstLumi = lumi lumisInJob = 0 totalJobs += 1 # Add the file to new jobs self.currentJob.addFile(f) lumisInJob += 1 lumisInTask += 1 lastLumi = lumi stopJob = False lastRun = run.run if self.currentJob and not f in self.currentJob['input_files']: self.currentJob.addFile(f) if totalLumis > 0 and lumisInTask >= totalLumis: stopTask = True break if firstLumi != None and lastLumi != None: # Add this run to the mask self.currentJob['mask'].addRunAndLumis(run = run.run, lumis = [firstLumi, lastLumi]) addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = addedEvents * timePerEvent runAddedSize = addedEvents * sizePerEvent self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize) firstLumi = None lastLumi = None if stopTask: break if stopTask: break if stopTask: break self.lumiChecker.closeJob(self.currentJob) self.lumiChecker.fixInputFiles() return
def algorithm(self, *args, **kwargs): """ _algorithm_ Split up all the available files such that each job will process a maximum of 'files_per_job'. If the 'files_per_job' parameters is not passed in jobs will process a maximum of 10 files. """ filesPerJob = int(kwargs.get("files_per_job", 10)) jobsPerGroup = int(kwargs.get("jobs_per_group", 0)) totalFiles = int(kwargs.get("total_files", 0)) runs = kwargs.get('runs', None) lumis = kwargs.get('lumis', None) runBoundaries = kwargs.get("respect_run_boundaries", False) getParents = kwargs.get("include_parents", False) filesInJob = 0 timePerEvent, sizePerEvent, memoryRequirement = \ self.getPerformanceParameters(kwargs.get('performance', {})) goodRunList = {} if runs and lumis: goodRunList = buildLumiMask(runs, lumis) #Get a dictionary of sites, files lDict = self.sortByLocation() locationDict = {} for key in lDict: newlist = [] for f in lDict[key]: if runs and lumis: ## Skip this file is it has no runs. if len(f['runs']) == 0: continue f['lumiCount'] = 0 f['runs'] = sorted(f['runs']) for run in f['runs']: run.lumis.sort() f['lumiCount'] += len(run.lumis) f['lowestRun'] = f['runs'][0] ## Skip this file is it has no lumis. if f['lumiCount'] == 0: continue ## Do average event per lumi calculation. f['avgEvtsPerLumi'] = round( float(f['events']) / f['lumiCount']) newlist.append(f) locationDict[key] = sorted(newlist, key=lambda f: f['lfn']) ## Make a list with all the files, sorting them by LFN. Remove from the list all ## the files filtered out by the lumi-mask (if there is one). files = [] for filesPerLocSet in locationDict.values(): for f in filesPerLocSet: files.append(f) if len(files): files = sorted(files, key=lambda f: f['lfn']) if runs and lumis: skippedFiles = [] for f in files: skipFile = True for run in f['runs']: if not isGoodRun(goodRunList, run.run): continue for lumi in run: if not isGoodLumi(goodRunList, run.run, lumi): continue skipFile = False if skipFile: skippedFiles.append(f) for f in skippedFiles: files.remove(f) ## Keep only the first totalFiles files. Remove the other files from the locationDict. 
if totalFiles > 0 and totalFiles < len(files): removedFiles = files[totalFiles:] files = files[:totalFiles] for f in removedFiles: for locSet in locationDict.keys(): if f in locationDict[locSet]: locationDict[locSet].remove(f) for locSet in locationDict.keys(): #Now we have all the files in a certain location set fileList = locationDict[locSet] filesInJob = 0 jobsInGroup = 0 self.newGroup() if len(fileList) == 0: continue jobRun = None for f in fileList: if getParents: parentLFNs = self.findParent(lfn=f['lfn']) for lfn in parentLFNs: parent = File(lfn=lfn) f['parents'].add(parent) fileRun = f.get('minrun', None) createNewJob = False if filesInJob == 0 or filesInJob == filesPerJob or ( runBoundaries and fileRun != jobRun): createNewJob = True if runs and lumis: for run in f['runs']: if not isGoodRun(goodRunList, run.run): continue firstLumi = None lastLumi = None for lumi in run: if not isGoodLumi(goodRunList, run.run, lumi): if firstLumi != None and lastLumi != None: self.currentJob['mask'].addRunAndLumis( run=run.run, lumis=[firstLumi, lastLumi]) addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = addedEvents * timePerEvent runAddedSize = addedEvents * sizePerEvent self.currentJob.addResourceEstimates( jobTime=runAddedTime, disk=runAddedSize) firstLumi = None lastLumi = None continue if lastLumi != None and lumi != lastLumi + 1: self.currentJob['mask'].addRunAndLumis( run=run.run, lumis=[firstLumi, lastLumi]) addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = addedEvents * timePerEvent runAddedSize = addedEvents * sizePerEvent self.currentJob.addResourceEstimates( jobTime=runAddedTime, disk=runAddedSize) firstLumi = None lastLumi = None if createNewJob: if jobsPerGroup: if jobsInGroup > jobsPerGroup: self.newGroup() jobsInGroup = 0 self.newJob(name=self.getJobName()) self.currentJob.addResourceEstimates( memory=memoryRequirement) filesInJob = 0 jobsInGroup += 1 jobRun = fileRun createNewJob = False self.currentJob.addFile(f) filesInJob += 1 if firstLumi == None: firstLumi = lumi lastLumi = lumi if self.currentJob and not f in self.currentJob[ 'input_files']: self.currentJob.addFile(f) filesInJob += 1 if firstLumi != None and lastLumi != None: self.currentJob['mask'].addRunAndLumis( run=run.run, lumis=[firstLumi, lastLumi]) addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi']) runAddedTime = addedEvents * timePerEvent runAddedSize = addedEvents * sizePerEvent self.currentJob.addResourceEstimates( jobTime=runAddedTime, disk=runAddedSize) firstLumi = None lastLumi = None else: if createNewJob: if jobsPerGroup: if jobsInGroup > jobsPerGroup: self.newGroup() jobsInGroup = 0 self.newJob(name=self.getJobName()) self.currentJob.addResourceEstimates( memory=memoryRequirement) filesInJob = 0 jobsInGroup += 1 jobRun = fileRun self.currentJob.addFile(f) filesInJob += 1 fileTime = f['events'] * timePerEvent fileSize = f['events'] * sizePerEvent self.currentJob.addResourceEstimates(jobTime=fileTime, disk=fileSize) return
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs
    """
    avgEventsPerJob = int(kwargs.get('events_per_job', 5000))
    eventLimit = int(kwargs.get('max_events_per_lumi', 20000))
    totalEvents = int(kwargs.get('total_events', 0))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
    ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
    collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)
    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))
    capJobTime = kwargs.get('capJobTime', None)
    capJobDisk = kwargs.get('capJobDisk', None)
    deterministicPileup = kwargs.get('deterministicPileup', False)
    eventsPerLumiInDataset = 0

    if deterministicPileup and self.package == 'WMCore.WMBS':
        getJobNumber = self.daoFactory(classname = "Jobs.GetNumberOfJobsPerWorkflow")
        jobNumber = getJobNumber.execute(workflow = self.subscription.getWorkflow().id)
        self.nJobs = jobNumber

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            collectionName = kwargs.get('collectionName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')

            logging.info('Creating jobs for ACDC fileset %s' % filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
        except Exception as ex:
            msg = "Exception while trying to load goodRunList\n"
            if ignoreACDC:
                msg += "Ditching goodRunList\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                goodRunList = {}
            else:
                msg += "Refusing to create any jobs.\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                return

    lDict = self.sortByLocation()
    locationDict = {}

    # First we need to load the data
    if self.package == 'WMCore.WMBS':
        loadRunLumi = self.daoFactory(classname = "Files.GetBulkRunLumi")

    for key in lDict.keys():
        newlist = []
        # First we need to load the data
        if self.package == 'WMCore.WMBS':
            fileLumis = loadRunLumi.execute(files = lDict[key])
            for f in lDict[key]:
                lumiDict = fileLumis.get(f['id'], {})
                for run in lumiDict.keys():
                    f.addRun(run = Run(run, *lumiDict[run]))

        for f in lDict[key]:
            if len(f['runs']) == 0:
                continue
            f['runs'] = sorted(f['runs'])
            f['lumiCount'] = 0
            for run in f['runs']:
                run.lumis.sort()
                f['lumiCount'] += len(run.lumis)
            f['lowestRun'] = f['runs'][0]

            # Do average event per lumi calculation
            if f['lumiCount']:
                f['avgEvtsPerLumi'] = round(float(f['events']) / f['lumiCount'])
                if deterministicPileup:
                    # We assume that all lumis are equal in the dataset
                    eventsPerLumiInDataset = f['avgEvtsPerLumi']
            else:
                # No lumis in the file, ignore it
                continue
            newlist.append(f)

        locationDict[key] = sorted(newlist, key=operator.itemgetter('lowestRun'))

    totalJobs = 0
    lastLumi = None
    firstLumi = None
    lastRun = None
    lumisInJob = 0
    totalAvgEventCount = 0
    currentJobAvgEventCount = 0
    stopTask = False

    for location in locationDict:
        # For each location, we need a new jobGroup
        self.newGroup()
        stopJob = True
        for f in locationDict[location]:
            if getParents:
                parentLFNs = self.findParent(lfn = f['lfn'])
                for lfn in parentLFNs:
                    parent = File(lfn = lfn)
                    f['parents'].add(parent)

            lumisInJobInFile = 0
            updateSplitOnJobStop = False
            failNextJob = False
            # If the number of events per lumi is higher than the limit
            # and it's only one lumi then ditch that lumi
            if f['avgEvtsPerLumi'] > eventLimit and f['lumiCount'] == 1:
                failNextJob = True
                stopJob = True
                lumisPerJob = 1
            elif splitOnFile:
                # Then we have to split on every boundary
                stopJob = True
                # Check the average number of events per lumi in this file
                # Adapt the lumis per job to match the target conditions
                if f['avgEvtsPerLumi']:
                    # If there are events in the file
                    ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi']
                    lumisPerJob = max(int(math.floor(ratio)), 1)
                else:
                    # Zero event file, then the ratio goes to infinity. Computers don't like that
                    lumisPerJob = f['lumiCount']
            else:
                # Analyze how many events this job already has
                # Check how many we want as target, include as many lumi sections as possible
                updateSplitOnJobStop = True
                eventsRemaining = max(avgEventsPerJob - currentJobAvgEventCount, 0)
                if f['avgEvtsPerLumi']:
                    lumisAllowed = int(math.floor(float(eventsRemaining) / f['avgEvtsPerLumi']))
                else:
                    lumisAllowed = f['lumiCount']
                lumisPerJob = max(lumisInJob + lumisAllowed, 1)

            for run in f['runs']:
                if not isGoodRun(goodRunList = goodRunList, run = run.run):
                    # Then skip this one
                    continue
                if len(runWhitelist) > 0 and not run.run in runWhitelist:
                    # Skip due to run whitelist
                    continue
                firstLumi = None

                if splitOnRun and run.run != lastRun:
                    # Then we need to kill this job and get a new one
                    stopJob = True

                # Now loop over the lumis
                for lumi in run:
                    if not isGoodLumi(goodRunList, run = run.run, lumi = lumi):
                        # Kill the chain of good lumis
                        # Skip this lumi
                        if firstLumi != None and firstLumi != lumi:
                            self.currentJob['mask'].addRunAndLumis(run = run.run, lumis = [firstLumi, lastLumi])
                            eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                            runAddedTime = eventsAdded * timePerEvent
                            runAddedSize = eventsAdded * sizePerEvent
                            self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize)
                            if capJobTime or capJobDisk:
                                self.currentJob.capResourceEstimates(jobTime = capJobTime, disk = capJobDisk)
                            firstLumi = None
                            lastLumi = None
                        continue

                    # You have to kill the lumi chain if they're not continuous
                    if lastLumi and not lumi == lastLumi + 1:
                        self.currentJob['mask'].addRunAndLumis(run = run.run, lumis = [firstLumi, lastLumi])
                        eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                        runAddedTime = eventsAdded * timePerEvent
                        runAddedSize = eventsAdded * sizePerEvent
                        self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize)
                        if capJobTime or capJobDisk:
                            self.currentJob.capResourceEstimates(jobTime = capJobTime, disk = capJobDisk)
                        firstLumi = None
                        lastLumi = None

                    if firstLumi == None:
                        # Set the first lumi in the run
                        firstLumi = lumi

                    # If we're full, end the job
                    if lumisInJob == lumisPerJob:
                        stopJob = True

                    # Actually do the new job creation
                    if stopJob:
                        if firstLumi != None and lastLumi != None and lastRun != None:
                            self.currentJob['mask'].addRunAndLumis(run = lastRun, lumis = [firstLumi, lastLumi])
                            eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                            runAddedTime = eventsAdded * timePerEvent
                            runAddedSize = eventsAdded * sizePerEvent
                            self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize)
                            if capJobTime or capJobDisk:
                                self.currentJob.capResourceEstimates(jobTime = capJobTime, disk = capJobDisk)
                        msg = None
                        if failNextJob:
                            msg = "File %s has too many events (%d) in %d lumi(s)" % (f['lfn'], f['events'], f['lumiCount'])
                        self.newJob(name = self.getJobName(), failedJob = failNextJob, failedReason = msg)
                        if deterministicPileup:
                            self.currentJob.addBaggageParameter("skipPileupEvents", (self.nJobs - 1) * lumisPerJob * eventsPerLumiInDataset)
                        self.currentJob.addResourceEstimates(memory = memoryRequirement)
                        failNextJob = False
                        firstLumi = lumi
                        lumisInJob = 0
                        lumisInJobInFile = 0
                        currentJobAvgEventCount = 0
                        totalJobs += 1

                        # Add the file to new jobs
                        self.currentJob.addFile(f)

                        if updateSplitOnJobStop:
                            # Then we were carrying from a previous file
                            # Reset calculations for this file
                            updateSplitOnJobStop = False
                            if f['avgEvtsPerLumi']:
                                ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi']
                                lumisPerJob = max(int(math.floor(ratio)), 1)
                            else:
                                lumisPerJob = f['lumiCount']

                    lumisInJob += 1
                    lumisInJobInFile += 1
                    lastLumi = lumi
                    stopJob = False
                    lastRun = run.run
                    totalAvgEventCount += f['avgEvtsPerLumi']

                    if self.currentJob and not f in self.currentJob['input_files']:
                        self.currentJob.addFile(f)

                    # We stop here if there are more total events than requested.
                    if totalEvents > 0 and totalAvgEventCount >= totalEvents:
                        stopTask = True
                        break

                if firstLumi != None and lastLumi != None:
                    # Add this run to the mask
                    self.currentJob['mask'].addRunAndLumis(run = run.run, lumis = [firstLumi, lastLumi])
                    eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                    runAddedTime = eventsAdded * timePerEvent
                    runAddedSize = eventsAdded * sizePerEvent
                    self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize)
                    if capJobTime or capJobDisk:
                        self.currentJob.capResourceEstimates(jobTime = capJobTime, disk = capJobDisk)
                    firstLumi = None
                    lastLumi = None

                if stopTask:
                    break

            if not splitOnFile:
                currentJobAvgEventCount += f['avgEvtsPerLumi'] * lumisInJobInFile

            if stopTask:
                break

        if stopTask:
            break

    return
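# Illustrative only: the event-aware target used above, pulled out into a small
# standalone helper so the arithmetic is easy to follow. avgEventsPerJob and
# avgEvtsPerLumi mirror the variables in the algorithm; the helper itself is not
# part of the splitter.
import math

def targetLumisPerJob(avgEventsPerJob, avgEvtsPerLumi, lumiCount):
    """Return how many lumis fit in one job of roughly avgEventsPerJob events."""
    if avgEvtsPerLumi:
        return max(int(math.floor(float(avgEventsPerJob) / avgEvtsPerLumi)), 1)
    # zero-event file: take every lumi, since the ratio would diverge
    return lumiCount

# e.g. a 5000-event target with ~400 events per lumi gives 12 lumis per job
assert targetLumisPerJob(5000, 400, 30) == 12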
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs
    """
    lumisPerJob = int(kwargs.get('lumis_per_job', 1))
    totalLumis = int(kwargs.get('total_lumis', 0))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', False))
    self.collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)
    deterministicPileup = kwargs.get('deterministicPileup', False)
    applyLumiCorrection = bool(kwargs.get('applyLumiCorrection', False))
    eventsPerLumiInDataset = 0

    if self.package == 'WMCore.WMBS':
        self.loadRunLumi = self.daoFactory(classname="Files.GetBulkRunLumi")
        if deterministicPileup:
            getJobNumber = self.daoFactory(classname="Jobs.GetNumberOfJobsPerWorkflow")
            self.nJobs = getJobNumber.execute(workflow=self.subscription.getWorkflow().id)

    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if self.collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')

            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(self.collectionName, filesetName)
        except Exception as ex:
            msg = "Exception while trying to load goodRunList. "
            msg += "Refusing to create any jobs.\nDetails: %s" % str(ex)
            logging.exception(msg)
            return

    lDict = self.getFilesSortedByLocation(lumisPerJob)
    if not lDict:
        logging.info("There are not enough lumis/files to be split. Trying again next cycle")
        return

    locationDict = {}
    for key in lDict.keys():
        newlist = []
        for f in lDict[key]:
            # if hasattr(f, 'loadData'):
            #     f.loadData()
            if len(f['runs']) == 0:
                continue
            f['lumiCount'] = 0
            f['runs'] = sorted(f['runs'])
            for run in f['runs']:
                run.lumis.sort()
                f['lumiCount'] += len(run.lumis)
            f['lowestRun'] = f['runs'][0]

            # Do average event per lumi calculation
            if f['lumiCount']:
                f['avgEvtsPerLumi'] = round(float(f['events']) / f['lumiCount'])
                if deterministicPileup:
                    # We assume that all lumis are equal in the dataset
                    eventsPerLumiInDataset = f['avgEvtsPerLumi']
            else:
                # No lumis in the file, ignore it
                continue
            newlist.append(f)

        locationDict[key] = sorted(newlist, key=operator.itemgetter('lowestRun'))

    # Split files into jobs with each job containing
    # EXACTLY lumisPerJob number of lumis (except for maybe the last one)
    totalJobs = 0
    lastLumi = None
    firstLumi = None
    stopJob = True
    stopTask = False
    lastRun = None
    lumisInJob = 0
    lumisInTask = 0
    self.lumiChecker = LumiChecker(applyLumiCorrection)

    for location in locationDict.keys():
        # For each location, we need a new jobGroup
        self.newGroup()
        stopJob = True
        for f in locationDict[location]:
            if getParents:
                parentLFNs = self.findParent(lfn=f['lfn'])
                for lfn in parentLFNs:
                    parent = File(lfn=lfn)
                    f['parents'].add(parent)

            if splitOnFile:
                # Then we have to split on every boundary
                stopJob = True

            for run in f['runs']:
                if not isGoodRun(goodRunList=goodRunList, run=run.run):
                    # Then skip this one
                    continue
                if len(runWhitelist) > 0 and not run.run in runWhitelist:
                    # Skip due to run whitelist
                    continue
                firstLumi = None

                if splitOnRun and run.run != lastRun:
                    # Then we need to kill this job and get a new one
                    stopJob = True

                # Now loop over the lumis
                for lumi in run:
                    # splitLumi checks if the lumi is split across jobs
                    if (not isGoodLumi(goodRunList, run=run.run, lumi=lumi) or
                            self.lumiChecker.isSplitLumi(run.run, lumi, f)):
                        # Kill the chain of good lumis
                        # Skip this lumi
                        if firstLumi != None and firstLumi != lumi:
                            self.currentJob['mask'].addRunAndLumis(run=run.run, lumis=[firstLumi, lastLumi])
                            addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                            runAddedTime = addedEvents * timePerEvent
                            runAddedSize = addedEvents * sizePerEvent
                            self.currentJob.addResourceEstimates(jobTime=runAddedTime, disk=runAddedSize)
                            firstLumi = None
                            lastLumi = None
                        continue

                    # You have to kill the lumi chain if they're not continuous
                    if lastLumi and not lumi == lastLumi + 1:
                        self.currentJob['mask'].addRunAndLumis(run=run.run, lumis=[firstLumi, lastLumi])
                        addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                        runAddedTime = addedEvents * timePerEvent
                        runAddedSize = addedEvents * sizePerEvent
                        self.currentJob.addResourceEstimates(jobTime=runAddedTime, disk=runAddedSize)
                        firstLumi = None
                        lastLumi = None

                    if firstLumi is None:
                        # Set the first lumi in the run
                        firstLumi = lumi

                    # If we're full, end the job
                    if lumisInJob == lumisPerJob:
                        stopJob = True

                    # Actually do the new job creation
                    if stopJob:
                        if firstLumi != None and lastLumi != None and lastRun != None:
                            self.currentJob['mask'].addRunAndLumis(run=lastRun, lumis=[firstLumi, lastLumi])
                            addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                            runAddedTime = addedEvents * timePerEvent
                            runAddedSize = addedEvents * sizePerEvent
                            self.currentJob.addResourceEstimates(jobTime=runAddedTime, disk=runAddedSize)
                        # before creating a new job add the lumis of the current one to the checker
                        self.lumiChecker.closeJob(self.currentJob)
                        self.newJob(name=self.getJobName())
                        self.currentJob.addResourceEstimates(memory=memoryRequirement)
                        if deterministicPileup:
                            skipEvents = (self.nJobs - 1) * lumisPerJob * eventsPerLumiInDataset
                            self.currentJob.addBaggageParameter("skipPileupEvents", skipEvents)
                        firstLumi = lumi
                        lumisInJob = 0
                        totalJobs += 1

                        # Add the file to new jobs
                        self.currentJob.addFile(f)

                    lumisInJob += 1
                    lumisInTask += 1
                    lastLumi = lumi
                    stopJob = False
                    lastRun = run.run

                    if self.currentJob and not f in self.currentJob['input_files']:
                        self.currentJob.addFile(f)

                    if totalLumis > 0 and lumisInTask >= totalLumis:
                        stopTask = True
                        break

                if firstLumi != None and lastLumi != None:
                    # Add this run to the mask
                    self.currentJob['mask'].addRunAndLumis(run=run.run, lumis=[firstLumi, lastLumi])
                    addedEvents = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                    runAddedTime = addedEvents * timePerEvent
                    runAddedSize = addedEvents * sizePerEvent
                    self.currentJob.addResourceEstimates(jobTime=runAddedTime, disk=runAddedSize)
                    firstLumi = None
                    lastLumi = None

                if stopTask:
                    break

            if stopTask:
                break

        if stopTask:
            break

    self.lumiChecker.closeJob(self.currentJob)
    self.lumiChecker.fixInputFiles()
    return
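# Illustrative only: how the job mask above ends up holding [firstLumi, lastLumi]
# ranges. The loop breaks the chain whenever a lumi is masked out, already used by
# another job, or not contiguous; this standalone sketch reproduces the grouping
# for the lumis of a single run.
def contiguousRanges(lumiList):
    """Collapse a sorted list of lumi numbers into inclusive [first, last] ranges."""
    ranges = []
    first = last = None
    for lumi in lumiList:
        if first is None:
            first = last = lumi
        elif lumi == last + 1:
            last = lumi
        else:
            ranges.append([first, last])
            first = last = lumi
    if first is not None:
        ranges.append([first, last])
    return ranges

# e.g. good lumis 1-3 and 7-8 of a run become two mask entries
assert contiguousRanges([1, 2, 3, 7, 8]) == [[1, 3], [7, 8]]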
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs
    """
    avgEventsPerJob = int(kwargs.get('events_per_job', 5000))
    jobLimit = int(kwargs.get('job_limit', 0))
    jobTimeLimit = int(kwargs.get('job_time_limit', self.defaultJobTimeLimit))
    totalEvents = int(kwargs.get('total_events', 0))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', False))
    self.collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)
    applyLumiCorrection = bool(kwargs.get('applyLumiCorrection', False))
    deterministicPileup = kwargs.get('deterministicPileup', False)
    allowCreationFailure = kwargs.get('allowCreationFailure', True)
    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))
    eventsPerLumiInDataset = 0

    if avgEventsPerJob <= 0:
        msg = "events_per_job parameter must be positive. Its value is: %d" % avgEventsPerJob
        raise RuntimeError(msg)

    if self.package == 'WMCore.WMBS':
        self.loadRunLumi = self.daoFactory(classname="Files.GetBulkRunLumi")
        if deterministicPileup:
            getJobNumber = self.daoFactory(classname="Jobs.GetNumberOfJobsPerWorkflow")
            self.nJobs = getJobNumber.execute(workflow=self.subscription.getWorkflow().id)
            logging.info('Creating jobs in DeterministicPileup mode for %s',
                         self.subscription.workflowName())

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if self.collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')

            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(self.collectionName, filesetName)
        except Exception as ex:
            msg = "Exception while trying to load goodRunList. "
            msg += "Refusing to create any jobs.\nDetails: %s" % str(ex)
            logging.exception(msg)
            return

    lDict = self.getFilesSortedByLocation(avgEventsPerJob)
    if not lDict:
        logging.info("There are not enough events/files to be split. Trying again next cycle")
        return

    locationDict = {}
    for key in lDict.keys():
        newlist = []
        # First we need to load the data
        if self.loadRunLumi:
            fileLumis = self.loadRunLumi.execute(files=lDict[key])
            if not fileLumis:
                logging.warning("Empty fileLumis dict for workflow %s, subs %s.",
                                self.subscription.workflowName(), self.subscription['id'])
            for f in lDict[key]:
                lumiDict = fileLumis.get(f['id'], {})
                for run in lumiDict.keys():
                    f.addRun(run=Run(run, *lumiDict[run]))

        for f in lDict[key]:
            if len(f['runs']) == 0:
                continue
            f['runs'] = sorted(f['runs'])
            f['lumiCount'] = 0
            for run in f['runs']:
                run.lumis.sort()
                f['lumiCount'] += len(run.lumis)
            f['lowestRun'] = f['runs'][0]

            # Do average event per lumi calculation
            if f['lumiCount']:
                f['avgEvtsPerLumi'] = round(float(f['events']) / f['lumiCount'])
                if deterministicPileup:
                    # We assume that all lumis are equal in the dataset
                    eventsPerLumiInDataset = f['avgEvtsPerLumi']
            else:
                # No lumis in the file, ignore it
                continue
            newlist.append(f)

        locationDict[key] = sorted(newlist, key=operator.itemgetter('lowestRun'))

    totalJobs = 0
    lastLumi = None
    firstLumi = None
    lastRun = None
    lumisInJob = 0
    totalAvgEventCount = 0
    currentJobAvgEventCount = 0
    stopTask = False
    self.lumiChecker = LumiChecker(applyLumiCorrection)

    for location in locationDict:
        # For each location, we need a new jobGroup
        self.newGroup()
        stopJob = True
        for f in locationDict[location]:
            if getParents:
                parentLFNs = self.findParent(lfn=f['lfn'])
                for lfn in parentLFNs:
                    parent = File(lfn=lfn)
                    f['parents'].add(parent)

            lumisInJobInFile = 0
            updateSplitOnJobStop = False
            failNextJob = False
            # If the estimated job time is higher than the job time limit (condor limit)
            # and it's only one lumi then ditch that lumi
            timePerLumi = f['avgEvtsPerLumi'] * timePerEvent
            if timePerLumi > jobTimeLimit and f['lumiCount'] == 1:
                lumisPerJob = 1
                stopJob = True
                if allowCreationFailure:
                    failNextJob = True
            elif splitOnFile:
                # Then we have to split on every boundary
                stopJob = True
                # Check the average number of events per lumi in this file
                # Adapt the lumis per job to match the target conditions
                if f['avgEvtsPerLumi']:
                    # If there are events in the file
                    ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi']
                    lumisPerJob = max(int(math.floor(ratio)), 1)
                else:
                    # Zero event file, then the ratio goes to infinity. Computers don't like that
                    lumisPerJob = f['lumiCount']
            else:
                # Analyze how many events this job already has
                # Check how many we want as target, include as many lumi sections as possible
                updateSplitOnJobStop = True
                eventsRemaining = max(avgEventsPerJob - currentJobAvgEventCount, 0)
                if f['avgEvtsPerLumi']:
                    lumisAllowed = int(math.floor(float(eventsRemaining) / f['avgEvtsPerLumi']))
                else:
                    lumisAllowed = f['lumiCount']
                lumisPerJob = max(lumisInJob + lumisAllowed, 1)

            for run in f['runs']:
                if not isGoodRun(goodRunList=goodRunList, run=run.run):
                    # Then skip this one
                    continue
                if len(runWhitelist) > 0 and not run.run in runWhitelist:
                    # Skip due to run whitelist
                    continue
                firstLumi = None

                if splitOnRun and run.run != lastRun:
                    # Then we need to kill this job and get a new one
                    stopJob = True

                # Now loop over the lumis
                for lumi in run:
                    if (not isGoodLumi(goodRunList, run=run.run, lumi=lumi) or
                            self.lumiChecker.isSplitLumi(run.run, lumi, f)):
                        # Kill the chain of good lumis
                        # Skip this lumi
                        if firstLumi != None and firstLumi != lumi:
                            self.currentJob['mask'].addRunAndLumis(run=run.run, lumis=[firstLumi, lastLumi])
                            eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                            runAddedTime = eventsAdded * timePerEvent
                            runAddedSize = eventsAdded * sizePerEvent
                            self.currentJob.addResourceEstimates(jobTime=runAddedTime, disk=runAddedSize)
                            firstLumi = None
                            lastLumi = None
                        continue

                    # You have to kill the lumi chain if they're not continuous
                    if lastLumi and not lumi == lastLumi + 1:
                        self.currentJob['mask'].addRunAndLumis(run=run.run, lumis=[firstLumi, lastLumi])
                        eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                        runAddedTime = eventsAdded * timePerEvent
                        runAddedSize = eventsAdded * sizePerEvent
                        self.currentJob.addResourceEstimates(jobTime=runAddedTime, disk=runAddedSize)
                        firstLumi = None
                        lastLumi = None

                    if firstLumi is None:
                        # Set the first lumi in the run
                        firstLumi = lumi

                    # If we're full, end the job
                    if lumisInJob == lumisPerJob:
                        stopJob = True

                    # Actually do the new job creation
                    if stopJob:
                        if firstLumi != None and lastLumi != None and lastRun != None:
                            self.currentJob['mask'].addRunAndLumis(run=lastRun, lumis=[firstLumi, lastLumi])
                            eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                            runAddedTime = eventsAdded * timePerEvent
                            runAddedSize = eventsAdded * sizePerEvent
                            self.currentJob.addResourceEstimates(jobTime=runAddedTime, disk=runAddedSize)
                        msg = None
                        if failNextJob:
                            msg = "File %s has a single lumi %s, in run %s " % (f['lfn'], lumi, run.run)
                            msg += "with too many events %d and it would take %d sec to run" \
                                   % (f['events'], timePerLumi)
                        self.lumiChecker.closeJob(self.currentJob)
                        self.newJob(name=self.getJobName(), failedJob=failNextJob, failedReason=msg)
                        if deterministicPileup:
                            skipEvents = (self.nJobs - 1) * lumisPerJob * eventsPerLumiInDataset
                            self.currentJob.addBaggageParameter("skipPileupEvents", skipEvents)
                        self.currentJob.addResourceEstimates(memory=memoryRequirement)
                        failNextJob = False
                        firstLumi = lumi
                        lumisInJob = 0
                        lumisInJobInFile = 0
                        currentJobAvgEventCount = 0
                        totalJobs += 1

                        if jobLimit and totalJobs > jobLimit:
                            msg = "Job limit of {0} jobs exceeded.".format(jobLimit)
                            raise RuntimeError(msg)

                        # Add the file to new jobs
                        self.currentJob.addFile(f)

                        if updateSplitOnJobStop:
                            # Then we were carrying from a previous file
                            # Reset calculations for this file
                            updateSplitOnJobStop = False
                            if f['avgEvtsPerLumi']:
                                ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi']
                                lumisPerJob = max(int(math.floor(ratio)), 1)
                            else:
                                lumisPerJob = f['lumiCount']

                    lumisInJob += 1
                    lumisInJobInFile += 1
                    lastLumi = lumi
                    stopJob = False
                    lastRun = run.run
                    totalAvgEventCount += f['avgEvtsPerLumi']

                    if self.currentJob and not f in self.currentJob['input_files']:
                        self.currentJob.addFile(f)

                    # We stop here if there are more total events than requested.
                    if totalEvents > 0 and totalAvgEventCount >= totalEvents:
                        stopTask = True
                        break

                if firstLumi != None and lastLumi != None:
                    # Add this run to the mask
                    self.currentJob['mask'].addRunAndLumis(run=run.run, lumis=[firstLumi, lastLumi])
                    eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                    runAddedTime = eventsAdded * timePerEvent
                    runAddedSize = eventsAdded * sizePerEvent
                    self.currentJob.addResourceEstimates(jobTime=runAddedTime, disk=runAddedSize)
                    firstLumi = None
                    lastLumi = None

                if stopTask:
                    break

            if not splitOnFile:
                currentJobAvgEventCount += f['avgEvtsPerLumi'] * lumisInJobInFile

            if stopTask:
                break

        if stopTask:
            break

    self.lumiChecker.closeJob(self.currentJob)
    self.lumiChecker.fixInputFiles()
    return
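# Illustrative only: the per-lumi wall-clock check and the resource estimate added
# for each [firstLumi, lastLumi] block, mirroring the expressions above. All numbers
# here are invented; timePerEvent and sizePerEvent normally come from
# getPerformanceParameters(), and their units follow whatever that estimator uses.
timePerEvent = 12.0
sizePerEvent = 500.0
avgEvtsPerLumi = 300
jobTimeLimit = 48 * 3600  # e.g. a batch-system wall-clock limit, in seconds

# a single-lumi file whose one lumi alone exceeds the limit stops splitting and,
# when allowCreationFailure is set, is handed to a job created as failed
timePerLumi = avgEvtsPerLumi * timePerEvent
singleLumiTooLong = timePerLumi > jobTimeLimit

# estimate added when the block [firstLumi, lastLumi] is appended to the job mask
firstLumi, lastLumi = 11, 25
eventsAdded = (lastLumi - firstLumi + 1) * avgEvtsPerLumi
runAddedTime = eventsAdded * timePerEvent
runAddedSize = eventsAdded * sizePerEvent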