def __init__(self, config):
    """
    Initialise class members

    Stores the configuration, builds the WMBS DAO factory from the current
    thread's logger and database interface, normalises the retry limits into
    a dictionary keyed by job type, reads the optional ErrorHandler settings
    (falling back to the values given below) and creates the DAOs and the
    ACDC data collection service handle.

    :param config: agent configuration; ``ErrorHandler.maxRetries``,
        ``ACDC.couchurl`` and ``ACDC.database`` are read unconditionally
    :raises ErrorHandlerException: if the retry limits are a dictionary
        without a 'default' entry
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    currentThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=currentThread.logger,
                                 dbinterface=currentThread.dbi)
    self.changeState = ChangeState(self.config)

    handlerConf = self.config.ErrorHandler

    # A bare (non-dict) value becomes the limit of the 'default' job type
    retryLimits = handlerConf.maxRetries
    if isinstance(retryLimits, dict):
        self.maxRetries = retryLimits
    else:
        self.maxRetries = {'default': retryLimits}
    if 'default' not in self.maxRetries:
        raise ErrorHandlerException('Max retries for the default job type must be specified')

    # Optional settings, each with its fallback value
    self.maxProcessSize = getattr(handlerConf, 'maxProcessSize', 250)
    self.exitCodes = getattr(handlerConf, 'failureExitCodes', [])
    self.maxFailTime = getattr(handlerConf, 'maxFailTime', 32 * 3600)
    self.readFWJR = getattr(handlerConf, 'readFWJR', False)
    self.passCodes = getattr(handlerConf, 'passExitCodes', [])

    # DAOs used by the poller
    self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
    self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
    self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

    self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                database=config.ACDC.database)
def _getDBSBlock(self, match, wmspec):
    """
    Get DBS info for this block

    For an ACDC match the file list of the chunk is fetched from the ACDC
    data collection service; otherwise the block is looked up in DBS, with
    parent information when the task's parent processing flag is set.

    :param match: work element; its 'Inputs', 'ACDC', 'Dbs' and 'TaskName'
        entries are read here
    :param wmspec: workload spec providing the owner details and the task
    :return: tuple ``(blockName, blockInfo)``; for ACDC ``blockInfo`` is a
        dictionary holding only a "Files" entry
    :raises IndexError: if ``match['Inputs']`` is empty
    """
    # TODO: Allow more than one
    # list(...)[0] rather than .keys()[0]: dict views cannot be indexed on
    # Python 3, and an empty mapping still raises IndexError as before.
    blockName = list(match['Inputs'])[0]

    if match['ACDC']:
        acdcInfo = match['ACDC']
        acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
        # NOTE(review): the returned collection is not used below; the call
        # is kept because it may fail early for an unknown collection -
        # confirm whether it is still needed.
        acdc.getDataCollection(acdcInfo['collection'])
        splitedBlockName = ACDCBlock.splitBlockName(blockName)
        owner = wmspec.getOwner()
        fileLists = acdc.getChunkFiles(acdcInfo['collection'],
                                       acdcInfo['fileset'],
                                       splitedBlockName['Offset'],
                                       splitedBlockName['NumOfFiles'],
                                       user=owner.get("name"),
                                       group=owner.get("group"))
        return blockName, {"Files": fileLists}

    dbs = get_dbs(match['Dbs'])
    if wmspec.getTask(match['TaskName']).parentProcessingFlag():
        dbsBlockDict = dbs.getFileBlockWithParents(blockName)
    else:
        dbsBlockDict = dbs.getFileBlock(blockName)
    return blockName, dbsBlockDict[blockName]
def setUp(self):
    """
    setup for test.

    Prepares logging, the database connection and the WMBS schema, sets up
    the couch databases used by the test, registers one location and stores
    the DAOs, work directory, WMAgent config file and ACDC service handle
    on the test instance.
    """
    super(ErrorHandlerTest, self).setUp()
    testThread = threading.currentThread()

    init = TestInitCouchApp(__file__)
    self.testInit = init
    init.setLogging()
    init.setDatabaseConnection()
    init.setSchema(customModules=["WMCore.WMBS"], useDefault=False)
    init.setupCouch("errorhandler_t", "GroupUser", "ACDC")
    init.setupCouch("errorhandler_t_jd/jobs", "JobDump")
    init.setupCouch("errorhandler_t_jd/fwjrs", "FWJRDump")

    factory = DAOFactory(package="WMCore.WMBS",
                         logger=testThread.logger,
                         dbinterface=testThread.dbi)
    self.daofactory = factory
    self.getJobs = factory(classname="Jobs.GetAllJobs")
    self.setJobTime = factory(classname="Jobs.SetStateTime")

    # Register the single location the test refers to
    factory(classname="Locations.New").execute(siteName="malpaquet", pnn="T2_CH_CERN")

    self.testDir = init.generateWorkDir()
    self.configFile = EmulatorSetup.setupWMAgentConfig()
    self.nJobs = 10

    self.dataCS = DataCollectionService(url=init.couchUrl,
                                        database="errorhandler_t")
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs

    This block parses the splitting parameters from ``kwargs`` and resolves
    the good run/lumi list, either from the 'runs'/'lumis' arguments or,
    when 'collectionName' is given, from the ACDC data collection service.

    :param kwargs: splitting parameters; 'events_per_job' and
        'max_events_per_lumi' must be convertible to int
    :return: None. When loading the ACDC whitelist fails and
        'ignore_acdc_except' is false, the method returns right after
        logging the error; when it is true the whitelist is reset to an
        empty dictionary instead.
    :raises ValueError: if an integer parameter cannot be converted
    """
    # NOTE(review): several of the values parsed below are not consumed
    # inside this block; they are kept so argument validation is unchanged.
    avgEventsPerJob = int(kwargs.get('events_per_job', 5000))
    eventLimit = int(kwargs.get('max_events_per_lumi', 20000))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
    ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
    collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)
    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))
    deterministicPileup = kwargs.get('deterministicPileup', False)
    eventsPerLumiInDataset = 0

    if deterministicPileup and self.package == 'WMCore.WMBS':
        getJobNumber = self.daoFactory(classname="Jobs.GetNumberOfJobsPerWorkflow")
        self.nJobs = getJobNumber.execute(workflow=self.subscription.getWorkflow().id)

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')

            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
        except Exception as ex:
            # "as" form: the "except Exception, ex" spelling is a syntax
            # error on Python 3.
            msg = "Exception while trying to load goodRunList\n"
            if ignoreACDC:
                msg += "Ditching goodRunList\n"
            else:
                msg += "Refusing to create any jobs.\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            if not ignoreACDC:
                return
            goodRunList = {}
def _getDBSBlock(self, match, wmspec):
    """
    Get DBS info for this block

    For an ACDC match the file list of the chunk is fetched from the ACDC
    data collection service. Otherwise the block is looked up in DBS (with
    parents when the task's parent processing flag is set) and, when the
    spec's location data source flag is set, its 'StorageElements' entry is
    replaced by the storage elements of the sites listed for the block in
    ``match['Inputs']``.

    :param match: work element; its 'Inputs', 'ACDC', 'Dbs' and 'TaskName'
        entries are read here
    :param wmspec: workload spec providing owner details, task and flags
    :return: tuple ``(blockName, blockInfo)``
    :raises IndexError: if ``match['Inputs']`` is empty
    """
    # TODO: Allow more than one
    # list(...)[0] rather than .keys()[0]: dict views cannot be indexed on
    # Python 3, and an empty mapping still raises IndexError as before.
    blockName = list(match['Inputs'])[0]

    if match['ACDC']:
        acdcInfo = match['ACDC']
        acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
        # NOTE(review): the returned collection is not used below; the call
        # is kept because it may fail early for an unknown collection -
        # confirm whether it is still needed.
        acdc.getDataCollection(acdcInfo['collection'])
        splitedBlockName = ACDCBlock.splitBlockName(blockName)
        owner = wmspec.getOwner()
        fileLists = acdc.getChunkFiles(acdcInfo['collection'],
                                       acdcInfo['fileset'],
                                       splitedBlockName['Offset'],
                                       splitedBlockName['NumOfFiles'],
                                       user=owner.get("name"),
                                       group=owner.get("group"))
        return blockName, {"Files": fileLists}

    dbs = get_dbs(match['Dbs'])
    if wmspec.getTask(match['TaskName']).parentProcessingFlag():
        dbsBlockDict = dbs.getFileBlockWithParents(blockName)
    else:
        dbsBlockDict = dbs.getFileBlock(blockName)
    blockInfo = dbsBlockDict[blockName]

    if wmspec.locationDataSourceFlag():
        # TODO: Allow more than one
        # blockName is the first key, so this is the first value of
        # match['Inputs'] (formerly spelled .values()[0]).
        storageElements = set()
        for cmsSite in match['Inputs'][blockName]:
            storageElements.update(self.SiteDB.cmsNametoSE(cmsSite))
        blockInfo['StorageElements'] = list(storageElements)

    return blockName, blockInfo
def validBlocks(self, task):
    """
    Return blocks that pass the input data restriction

    When self.data is set and ACDCBlock.splitBlockName returns a truthy
    value for its first key, a single entry describing that chunk is
    returned. Otherwise the ACDC fileset is chunked through
    acdc.chunkFileset (using chunkSize) and one entry per chunk is returned.

    :param task: task providing the ACDC section via getInputACDC()
    :return: list of dictionaries with the keys 'Name', 'NumberOfFiles',
        'NumberOfEvents', 'NumberOfLumis', 'ACDC' and 'Sites'
    :raises WorkQueueWMSpecError: if the task has no ACDC section
    """
    validBlocks = []
    # TODO take the chunk size from parameter
    chunkSize = 200

    acdcInfo = task.getInputACDC()
    if not acdcInfo:
        raise WorkQueueWMSpecError(self.wmspec,
                                   'No acdc section for %s' % task.getPathName())
    acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])

    if self.data:
        # next(iter(...)) rather than .keys()[0]: dict views cannot be
        # indexed on Python 3. self.data is non-empty here.
        blockName = next(iter(self.data))
        acdcBlockSplit = ACDCBlock.splitBlockName(blockName)
    else:
        # if self.data is not passed, assume the data is the input dataset
        # from the spec
        blockName = None
        acdcBlockSplit = False

    owner = self.wmspec.getOwner()
    user = owner.get("name")
    group = owner.get("group")

    if acdcBlockSplit:
        block = acdc.getChunkInfo(acdcInfo['collection'],
                                  acdcBlockSplit['TaskName'],
                                  acdcBlockSplit['Offset'],
                                  acdcBlockSplit['NumOfFiles'],
                                  user=user,
                                  group=group)
        dbsBlock = {}
        dbsBlock['Name'] = blockName
        dbsBlock['NumberOfFiles'] = block['files']
        dbsBlock['NumberOfEvents'] = block['events']
        dbsBlock['NumberOfLumis'] = block['lumis']
        dbsBlock['ACDC'] = acdcInfo
        dbsBlock["Sites"] = sitesFromStorageEelements(block["locations"])
        validBlocks.append(dbsBlock)
    else:
        acdcBlocks = acdc.chunkFileset(acdcInfo['collection'],
                                       acdcInfo['fileset'],
                                       chunkSize,
                                       user=user,
                                       group=group)
        for block in acdcBlocks:
            dbsBlock = {}
            dbsBlock['Name'] = ACDCBlock.name(self.wmspec.name(),
                                              acdcInfo["fileset"],
                                              block['offset'], block['files'])
            dbsBlock['NumberOfFiles'] = block['files']
            dbsBlock['NumberOfEvents'] = block['events']
            dbsBlock['NumberOfLumis'] = block['lumis']
            dbsBlock["Sites"] = sitesFromStorageEelements(block["locations"])
            dbsBlock['ACDC'] = acdcInfo
            validBlocks.append(dbsBlock)

    return validBlocks
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs

    This block parses the splitting parameters from ``kwargs`` and resolves
    the good run/lumi list, either from the 'runs'/'lumis' arguments or,
    when 'collectionName' is given, from the ACDC data collection service.

    :param kwargs: splitting parameters; 'lumis_per_job' must be
        convertible to int
    :return: None. When loading the ACDC whitelist fails and
        'ignore_acdc_except' is false, the method returns right after
        logging the error; when it is true the whitelist is reset to an
        empty dictionary instead.
    :raises ValueError: if 'lumis_per_job' cannot be converted to int
    """
    # NOTE(review): several of the values parsed below are not consumed
    # inside this block; they are kept so argument validation is unchanged.
    myThread = threading.currentThread()

    lumisPerJob = int(kwargs.get('lumis_per_job', 1))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
    ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
    collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)
    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')

            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
        except Exception as ex:
            # "as" form: the "except Exception, ex" spelling is a syntax
            # error on Python 3.
            msg = "Exception while trying to load goodRunList\n"
            if ignoreACDC:
                msg += "Ditching goodRunList\n"
            else:
                msg += "Refusing to create any jobs.\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            if not ignoreACDC:
                return
            goodRunList = {}
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs

    This block parses the splitting parameters from ``kwargs`` and resolves
    the good run/lumi list, either from the 'runs'/'lumis' arguments or,
    when 'collectionName' is given, from the ACDC data collection service.

    :param kwargs: splitting parameters; 'lumis_per_job' must be
        convertible to int
    :return: None. When loading the ACDC whitelist fails and
        'ignore_acdc_except' is false, the method returns right after
        logging the error; when it is true the whitelist is reset to an
        empty dictionary instead.
    :raises ValueError: if 'lumis_per_job' cannot be converted to int
    """
    # NOTE(review): several of the values parsed below are not consumed
    # inside this block; they are kept so argument validation is unchanged.
    myThread = threading.currentThread()

    lumisPerJob = int(kwargs.get('lumis_per_job', 1))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
    ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
    collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')

            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
        except Exception as ex:
            # "as" form: the "except Exception, ex" spelling is a syntax
            # error on Python 3.
            msg = "Exception while trying to load goodRunList\n"
            if ignoreACDC:
                msg += "Ditching goodRunList\n"
            else:
                msg += "Refusing to create any jobs.\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            if not ignoreACDC:
                return
            goodRunList = {}
def testFailedJobsUniqueWf(self):
    """
    Performance test of failedJobs with all failed jobs belonging
    to the same workflow and the same task name

    Builds one job configuration per index in range(1, 5000), all for
    workflow 'wf1' and task '/wf1/task1', and passes the list to failedJobs.
    """
    jobConfigs = [self.jobConfig('wf1', '/wf1/task1', jobIndex, 'lfn1')
                  for jobIndex in range(1, 5000)]

    service = DataCollectionService(url=self.testInit.couchUrl,
                                    database="wmcore-acdc-datacollectionsvc")
    service.failedJobs(jobConfigs)
def createResubmitSpec(self, serverUrl, couchDB):
    """
    _createResubmitSpec_

    Create a bogus resubmit workload.

    Builds a "TestWorkload" with a single "reco" processing task, records
    one failed job holding two files in the ACDC database and truncates the
    workload at its top level task under the name "Resubmit_TestWorkload".

    :param serverUrl: couch server URL used for ACDC and for truncate()
    :param couchDB: couch database name used for ACDC and for truncate()
    :return: the truncated workload helper
    """
    self.site = "cmssrm.fnal.gov"
    workload = WMWorkloadHelper(WMWorkload("TestWorkload"))
    recoTask = workload.newTask("reco")
    workload.setOwnerDetails(name="evansde77", group="DMWM")

    # first task uses the input dataset
    recoTask.addInputDataset(primary="PRIMARY", processed="processed-v1", tier="TIER1")
    recoTask.data.input.splitting.algorithm = "File"
    recoTask.setTaskType("Processing")
    recoStep = recoTask.makeStep("cmsRun1")
    recoStep.setStepType("CMSSW")
    recoTask.applyTemplates()
    recoStepHelper = recoStep.getTypeHelper()
    recoStepHelper.addOutputModule("outputRECO",
                                   primaryDataset="PRIMARY",
                                   processedDataset="processed-v2",
                                   dataTier="TIER2",
                                   lfnBase="/store/dunkindonuts",
                                   mergedLFNBase="/store/kfc")

    acdcService = DataCollectionService(url=serverUrl, database=couchDB)

    def makeFile(firstLumi, secondLumi):
        """Create a 1024-event file at self.site holding run 1 with two lumis"""
        wmFile = WMFile(lfn=makeUUID(), size=1024, events=1024)
        wmFile.setLocation([self.site])
        wmFile.addRun(Run(1, firstLumi, secondLumi))
        return wmFile

    fileA = makeFile(1, 2)
    fileB = makeFile(3, 4)

    # The failed job that gets recorded in ACDC
    failedJob = Job()
    failedJob["task"] = workload.getTask("reco").getPathName()
    failedJob["workflow"] = workload.name()
    failedJob["location"] = self.site
    failedJob["owner"] = "evansde77"
    failedJob["group"] = "DMWM"
    failedJob.addFile(fileA)
    failedJob.addFile(fileB)

    acdcService.failedJobs([failedJob])

    topTask = workload.getTopLevelTask()[0]
    workload.truncate("Resubmit_TestWorkload", topTask.getPathName(),
                      serverUrl, couchDB)

    return workload
def validBlocks(self, task):
    """
    Return blocks that pass the input data restriction

    When self.data is set and ACDCBlock.splitBlockName returns a truthy
    value for its first key, a single entry describing that chunk is
    returned. Otherwise the ACDC fileset is chunked through
    acdc.chunkFileset (using chunkSize) and one entry per chunk is returned.

    :param task: task providing the ACDC section via getInputACDC()
    :return: list of dictionaries with the keys 'Name', 'NumberOfFiles',
        'NumberOfEvents', 'NumberOfLumis', 'ACDC' and 'Sites'
    :raises WorkQueueWMSpecError: if the task has no ACDC section
    """
    validBlocks = []
    # TODO take the chunk size from parameter
    chunkSize = 200

    acdcInfo = task.getInputACDC()
    if not acdcInfo:
        raise WorkQueueWMSpecError(self.wmspec,
                                   'No acdc section for %s' % task.getPathName())
    acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])

    # Name of the first entry in self.data, if any. next(iter(...)) is used
    # because dict views cannot be indexed with [0] on Python 3.
    blockName = next(iter(self.data)) if self.data else None
    if blockName is not None:
        acdcBlockSplit = ACDCBlock.splitBlockName(blockName)
    else:
        # if self.data is not passed, assume the data is the input dataset
        # from the spec
        acdcBlockSplit = False

    owner = self.wmspec.getOwner()
    ownerArgs = {"user": owner.get("name"), "group": owner.get("group")}

    if acdcBlockSplit:
        block = acdc.getChunkInfo(acdcInfo['collection'],
                                  acdcBlockSplit['TaskName'],
                                  acdcBlockSplit['Offset'],
                                  acdcBlockSplit['NumOfFiles'],
                                  **ownerArgs)
        dbsBlock = {}
        dbsBlock['Name'] = blockName
        dbsBlock['NumberOfFiles'] = block['files']
        dbsBlock['NumberOfEvents'] = block['events']
        dbsBlock['NumberOfLumis'] = block['lumis']
        dbsBlock['ACDC'] = acdcInfo
        dbsBlock["Sites"] = sitesFromStorageEelements(block["locations"])
        validBlocks.append(dbsBlock)
    else:
        acdcBlocks = acdc.chunkFileset(acdcInfo['collection'],
                                       acdcInfo['fileset'],
                                       chunkSize,
                                       **ownerArgs)
        for block in acdcBlocks:
            dbsBlock = {}
            dbsBlock['Name'] = ACDCBlock.name(self.wmspec.name(),
                                              acdcInfo["fileset"],
                                              block['offset'], block['files'])
            dbsBlock['NumberOfFiles'] = block['files']
            dbsBlock['NumberOfEvents'] = block['events']
            dbsBlock['NumberOfLumis'] = block['lumis']
            dbsBlock["Sites"] = sitesFromStorageEelements(block["locations"])
            dbsBlock['ACDC'] = acdcInfo
            validBlocks.append(dbsBlock)

    return validBlocks
def main():
    """
    Debug script: fetch the files of one hard-coded ACDC chunk and report
    which of them contain the wanted lumis.

    The process exits with status 1 right after printing the elapsed time.
    """
    start = time.time()
    # blockName = match['Inputs'].keys()[0]
    blockName = "/acdc/vlimant_ACDC0_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_190218_145226_481/:pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222:SUS-RunIIFall18wmLHEGS-00025_0/0/31055"
    # acdcInfo = match['ACDC']
    acdcInfo = {
        "database": "acdcserver",
        "fileset": "/pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222/SUS-RunIIFall18wmLHEGS-00025_0",
        "collection": "pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222",
        "server": "https://cmsweb.cern.ch/couchdb",
    }

    acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
    splitedBlockName = ACDCBlock.splitBlockName(blockName)
    print("Splitted block name: %s" % splitedBlockName)

    fileLists = acdc.getChunkFiles(acdcInfo['collection'],
                                   acdcInfo['fileset'],
                                   splitedBlockName['Offset'],
                                   splitedBlockName['NumOfFiles'])
    print("Retrieved %d unique files from the ACDCServer" % len(fileLists))

    block = {"Files": fileLists}

    wantedLumis = {252052, 240646}
    for fileObj in fileLists:
        for run in fileObj['runs']:
            runLumis = set(run.json()['Lumis'].keys())
            matched = wantedLumis.intersection(runLumis)
            if matched:
                print("File: %s with events: %s, contains these lumis: %s" % (fileObj['lfn'], fileObj['events'], matched))

    # with open("chunkfiles.json", 'w') as fo:
    #     json.dump(block, fo)

    end = time.time()
    print("Spent %s secs running so far" % (end - start))
    sys.exit(1)

    # NOTE(review): everything below is unreachable because of the
    # sys.exit(1) call above; it is kept as found.
    ### Now doing the WMBSHelper stuff
    reqUrl = "https://cmsweb.cern.ch/couchdb/reqmgr_workload_cache"
    requestName = "vlimant_ACDC0_task_HIG-RunIIFall17wmLHEGS-01122__v1_T_180808_130708_5376"

    wmspec = WMWorkloadHelper()
    wmspec.loadSpecFromCouch(reqUrl, requestName)
    taskName = "HIG-RunIIFall17DRPremix-00788_0"
    mask = None
    cacheDir = "/data/srv/wmagent/v1.1.14.patch6/install/wmagent/WorkQueueManager/cache"
    # wmbsHelper = WMBSHelper(wmspec, match['TaskName'], blockName, mask, self.params['CacheDir'])
    wmbsHelper = WMBSHelper(wmspec, taskName, blockName, mask, cacheDir)
    sub, numFilesAdded = wmbsHelper.createSubscriptionAndAddFiles(block=block)
def validBlocks(self, task):
    """
    Return blocks that pass the input data restriction according
    to the splitting algorithm

    When self.data is set and ACDCBlock.splitBlockName returns a truthy
    value for its first key, a single entry describing that chunk is
    returned. Otherwise the work is delegated to the splitting function
    registered for self.args['SplittingAlgo'] (self.defaultAlgo when none
    is registered).

    :param task: task providing the ACDC section and the trust-sitelists flag
    :return: list of block dictionaries
    :raises WorkQueueWMSpecError: if the task has no ACDC section or the
        splitting algorithm is listed in self.unsupportedAlgos
    """
    acdcInfo = task.getInputACDC()
    if not acdcInfo:
        raise WorkQueueWMSpecError(self.wmspec,
                                   'No acdc section for %s' % task.getPathName())
    acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])

    if self.data:
        # next(iter(...)) rather than .keys()[0]: dict views cannot be
        # indexed on Python 3. self.data is non-empty here.
        blockName = next(iter(self.data))
        acdcBlockSplit = ACDCBlock.splitBlockName(blockName)
    else:
        # if self.data is not passed, assume the data is the input dataset
        # from the spec
        blockName = None
        acdcBlockSplit = False

    if not acdcBlockSplit:
        splittingAlgo = self.args['SplittingAlgo']
        if splittingAlgo in self.unsupportedAlgos:
            raise WorkQueueWMSpecError(self.wmspec,
                                       'ACDC is not supported for %s' % splittingAlgo)
        splittingFunc = self.defaultAlgo
        if splittingAlgo in self.algoMapping:
            splittingFunc = self.algoMapping[splittingAlgo]
        return splittingFunc(acdc, acdcInfo, task)

    owner = self.wmspec.getOwner()
    block = acdc.getChunkInfo(acdcInfo['collection'],
                              acdcBlockSplit['TaskName'],
                              acdcBlockSplit['Offset'],
                              acdcBlockSplit['NumOfFiles'],
                              user=owner.get("name"),
                              group=owner.get("group"))
    dbsBlock = {}
    dbsBlock['Name'] = blockName
    dbsBlock['NumberOfFiles'] = block['files']
    dbsBlock['NumberOfEvents'] = block['events']
    dbsBlock['NumberOfLumis'] = block['lumis']
    dbsBlock['ACDC'] = acdcInfo
    if task.getTrustSitelists():
        dbsBlock["Sites"] = self.sites
    else:
        # TODO remove this line when all DBS origin_site_name is converted to PNN
        block["locations"] = self.siteDB.checkAndConvertSENameToPNN(block["locations"])
        # upto this
        dbsBlock["Sites"] = self.siteDB.PNNstoPSNs(block["locations"])

    return [dbsBlock]
def testFailedJobsScrambledWf(self):
    """
    Performance test of failedJobs where jobs belong to 10 different
    workflows and 3 different tasks

    The workflow and task of each job are derived from its index modulo
    10 and modulo 3 respectively.
    """
    jobConfigs = [
        self.jobConfig("wf%d" % (jobIndex % 10),
                       "/wf%d/task%d" % (jobIndex % 10, jobIndex % 3),
                       jobIndex,
                       '/file/name/lfn1')
        for jobIndex in range(1, 5000)
    ]

    service = DataCollectionService(url=self.testInit.couchUrl,
                                    database="wmcore-acdc-datacollectionsvc")
    service.failedJobs(jobConfigs)
def main():
    """
    Debug script: fetch the file information of one hard-coded ACDC fileset,
    merge it, and report the files containing the wanted lumis together
    with the total and unique lumi counts.

    The process exits with status 1 after printing the elapsed time.
    """
    start = time.time()
    # acdcInfo = match['ACDC']
    acdcInfo = {
        "database": "acdcserver",
        "fileset": "/pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222/SUS-RunIIFall18wmLHEGS-00025_0",
        "collection": "pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222",
        "server": "https://cmsweb.cern.ch/couchdb",
    }

    service = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
    # acdcFileList = dcs.getProductionACDCInfo(acdcInfo['collection'], acdcInfo['fileset'])

    rawFiles = service._getFilesetInfo(acdcInfo['collection'], acdcInfo['fileset'])
    print("%s" % pformat(rawFiles[0]))
    mergedFiles = mergeFilesInfo(rawFiles)

    # Keep only the fields of interest; lumis come from the first run entry
    acdcFileList = [{"lfn": entry["lfn"],
                     "first_event": entry["first_event"],
                     "lumis": entry["runs"][0]["lumis"],
                     "events": entry["events"]}
                    for entry in mergedFiles]
    #print("Data retrieved:\n%s" % pformat(acdcFileList))
    print("Retrieved %d files from the ACDCServer" % len(acdcFileList))

    allLumis = []
    wantedLumis = {252052, 240646}
    for fileInfo in acdcFileList:
        fileLumis = fileInfo['lumis']
        allLumis.extend(fileLumis)
        if wantedLumis.intersection(set(fileLumis)):
            print("File: %s with events: %s, contains these lumis: %s" % (fileInfo['lfn'], fileInfo['events'], fileLumis))

    print("Total amount of lumis: %d, where unique are: %d" % (len(allLumis), len(set(allLumis))))

    # with open("chunkfiles.json", 'w') as fo:
    #     json.dump(block, fo)

    end = time.time()
    print("Spent %s secs running so far" % (end - start))
    sys.exit(1)
def __init__(self, config):
    """
    Initialise class members

    Stores the configuration, builds the WMBS DAO factory from the current
    thread's logger and database interface, reads the retry limit and the
    optional maximum process size (250 when absent), creates the job DAO
    and the ACDC data collection service handle, and initialises alerts.

    :param config: agent configuration; ``ErrorHandler.maxRetries``,
        ``ACDC.couchurl`` and ``ACDC.database`` are read unconditionally
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    workerThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=workerThread.logger,
                                 dbinterface=workerThread.dbi)
    self.changeState = ChangeState(self.config)

    handlerConf = self.config.ErrorHandler
    self.maxRetries = handlerConf.maxRetries
    self.maxProcessSize = getattr(handlerConf, 'maxProcessSize', 250)

    self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")

    self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                database=config.ACDC.database)

    # initialize the alert framework (if available - config.Alert present)
    #    self.sendAlert will be then be available
    self.initAlerts(compName="ErrorHandler")
def setUp(self):
    """
    setup for test.

    Prepares logging, the database connection (requested with
    destroyAllDatabase=True) and the WMBS schema, sets up the couch
    databases used by the test, registers one location and stores the DAOs,
    work directory and ACDC service handle on the test instance.
    """
    testThread = threading.currentThread()

    init = TestInitCouchApp(__file__)
    self.testInit = init
    init.setLogging()
    init.setDatabaseConnection(destroyAllDatabase=True)
    init.setSchema(customModules=["WMCore.WMBS"], useDefault=False)
    init.setupCouch("errorhandler_t", "GroupUser", "ACDC")
    init.setupCouch("errorhandler_t_jd/jobs", "JobDump")
    init.setupCouch("errorhandler_t_jd/fwjrs", "FWJRDump")

    factory = DAOFactory(package="WMCore.WMBS",
                         logger=testThread.logger,
                         dbinterface=testThread.dbi)
    self.daofactory = factory
    self.getJobs = factory(classname="Jobs.GetAllJobs")
    self.setJobTime = factory(classname="Jobs.SetStateTime")

    # Register the single location the test refers to
    factory(classname="Locations.New").execute(siteName="malpaquet", seName="malpaquet")

    self.testDir = init.generateWorkDir()
    self.nJobs = 10

    self.dataCS = DataCollectionService(url=init.couchUrl,
                                        database="errorhandler_t")
def __init__(self, config):
    """
    Initialise class members

    Stores the configuration, builds the WMBS DAO factory from the current
    thread's logger and database interface, normalises the retry limits into
    a dictionary keyed by job type, reads the optional ErrorHandler settings
    and creates the DAOs and the ACDC data collection service handle.

    :param config: agent configuration; ``ErrorHandler.maxRetries``,
        ``ACDC.couchurl`` and ``ACDC.database`` are read unconditionally
    :raises ErrorHandlerException: if the retry limits are a dictionary
        without a "default" entry
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    pollerThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=pollerThread.logger,
                                 dbinterface=pollerThread.dbi)
    self.changeState = ChangeState(self.config)

    handlerConf = self.config.ErrorHandler

    # A bare (non-dict) value becomes the limit of the "default" job type
    self.maxRetries = handlerConf.maxRetries
    if not isinstance(self.maxRetries, dict):
        self.maxRetries = {"default": self.maxRetries}
    if "default" not in self.maxRetries:
        raise ErrorHandlerException("Max retries for the default job type must be specified")

    # (attribute, configuration option, fallback). The table is rebuilt on
    # every call, so each instance gets its own fallback lists.
    optionalSettings = (
        ("maxProcessSize", "maxProcessSize", 250),
        ("exitCodes", "failureExitCodes", []),
        ("maxFailTime", "maxFailTime", 32 * 3600),
        ("readFWJR", "readFWJR", False),
        ("passCodes", "passExitCodes", []),
    )
    for attrName, optionName, fallback in optionalSettings:
        setattr(self, attrName, getattr(handlerConf, optionName, fallback))

    self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
    self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
    self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

    self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                database=config.ACDC.database)
def validBlocks(self, task):
    """
    Return blocks that pass the input data restriction according
    to the splitting algorithm

    When self.data is set and ACDCBlock.splitBlockName returns a truthy
    value for its first key, a single entry describing that chunk is
    returned. Otherwise the work is delegated to the splitting function
    registered for self.args['SplittingAlgo'] (self.defaultAlgo when none
    is registered).

    :param task: task providing the ACDC section and the input location flag
    :return: list of block dictionaries
    :raises WorkQueueWMSpecError: if the task has no ACDC section or the
        splitting algorithm is listed in self.unsupportedAlgos
    """
    validBlocks = []

    acdcInfo = task.getInputACDC()
    if not acdcInfo:
        raise WorkQueueWMSpecError(self.wmspec,
                                   'No acdc section for %s' % task.getPathName())
    acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])

    if self.data:
        # next(iter(...)) rather than .keys()[0]: dict views cannot be
        # indexed on Python 3. self.data is non-empty here.
        blockName = next(iter(self.data))
        acdcBlockSplit = ACDCBlock.splitBlockName(blockName)
    else:
        # if self.data is not passed, assume the data is the input dataset
        # from the spec
        blockName = None
        acdcBlockSplit = False

    if acdcBlockSplit:
        owner = self.wmspec.getOwner()
        block = acdc.getChunkInfo(acdcInfo['collection'],
                                  acdcBlockSplit['TaskName'],
                                  acdcBlockSplit['Offset'],
                                  acdcBlockSplit['NumOfFiles'],
                                  user=owner.get("name"),
                                  group=owner.get("group"))
        dbsBlock = {}
        dbsBlock['Name'] = blockName
        dbsBlock['NumberOfFiles'] = block['files']
        dbsBlock['NumberOfEvents'] = block['events']
        dbsBlock['NumberOfLumis'] = block['lumis']
        dbsBlock['ACDC'] = acdcInfo
        if task.inputLocationFlag():
            dbsBlock["Sites"] = self.sites
        else:
            # TODO remove this line when all DBS origin_site_name is converted to PNN
            block["locations"] = self.siteDB.checkAndConvertSENameToPNN(block["locations"])
            # upto this
            dbsBlock["Sites"] = self.siteDB.PNNstoPSNs(block["locations"])
        validBlocks.append(dbsBlock)
    else:
        splittingAlgo = self.args['SplittingAlgo']
        if splittingAlgo in self.unsupportedAlgos:
            raise WorkQueueWMSpecError(self.wmspec,
                                       'ACDC is not supported for %s' % splittingAlgo)
        splittingFunc = self.defaultAlgo
        if splittingAlgo in self.algoMapping:
            splittingFunc = self.algoMapping[splittingAlgo]
        validBlocks = splittingFunc(acdc, acdcInfo, task)

    return validBlocks
def __init__(self, config):
    """
    Initialise class members

    Stores the configuration, builds the WMBS DAO factory from the current
    thread's logger and database interface, normalises the retry limits into
    a dictionary keyed by job type, reads the optional ErrorHandler
    settings, creates the DAOs and the ACDC data collection service handle,
    initialises alerts and appends WMJobPermanentSystemErrors to the
    failure exit codes.

    :param config: agent configuration; ``ErrorHandler.maxRetries``,
        ``ACDC.couchurl`` and ``ACDC.database`` are read unconditionally
    :raises ErrorHandlerException: if the retry limits are a dictionary
        without a 'default' entry
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    myThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)
    self.changeState = ChangeState(self.config)

    handlerConf = self.config.ErrorHandler

    # A bare (non-dict) value becomes the limit of the 'default' job type.
    # isinstance() also accepts dict subclasses, which the former
    # type(...) != dict comparison wrapped a second time.
    self.maxRetries = handlerConf.maxRetries
    if not isinstance(self.maxRetries, dict):
        self.maxRetries = {'default': self.maxRetries}
    if 'default' not in self.maxRetries:
        raise ErrorHandlerException('Max retries for the default job type must be specified')

    self.maxProcessSize = getattr(handlerConf, 'maxProcessSize', 250)
    # Copy the configured codes: they are extended below, and extending the
    # configuration's own list would grow it on every instantiation.
    self.exitCodes = list(getattr(handlerConf, 'failureExitCodes', []))
    self.maxFailTime = getattr(handlerConf, 'maxFailTime', 32 * 3600)
    self.readFWJR = getattr(handlerConf, 'readFWJR', False)
    self.passCodes = getattr(handlerConf, 'passExitCodes', [])

    self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
    self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
    self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

    self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                database=config.ACDC.database)

    # initialize the alert framework (if available - config.Alert present)
    #    self.sendAlert will be then be available
    self.initAlerts(compName="ErrorHandler")

    # Some exit codes imply an immediate failure, non-configurable
    self.exitCodes.extend(WMJobPermanentSystemErrors)
def lumiListFromACDC(couchURL=None, couchDB=None, filesetName=None, collectionName=None):
    """
    Load the lumi whitelist of an ACDC fileset.

    :param couchURL: URL handed to DataCollectionService
    :param couchDB: database name handed to DataCollectionService
    :param filesetName: ACDC fileset to look up
    :param collectionName: ACDC collection holding the fileset
    :return: the result of DataCollectionService.getLumilistWhitelist, or
        None if creating the service or the lookup raised an exception
        (the exception is logged together with its traceback)
    """
    from WMCore.ACDC.DataCollectionService import DataCollectionService

    goodRunList = None
    try:
        logging.info('Creating jobs for ACDC fileset %s', filesetName)
        dcs = DataCollectionService(couchURL, couchDB)
        goodRunList = dcs.getLumilistWhitelist(collectionName, filesetName)
    except Exception as ex:
        # str(ex) instead of calling the __str__ dunder directly
        msg = "Exception while trying to load goodRunList. "
        msg += "Refusing to create any jobs.\nDetails: %s" % str(ex)
        logging.exception(msg)

    return goodRunList
def lumiListFromACDC(couchURL=None, couchDB=None, filesetName=None, collectionName=None):
    """
    Fetch the lumi whitelist of an ACDC fileset from the data collection
    service.

    :param couchURL: URL handed to DataCollectionService
    :param couchDB: database name handed to DataCollectionService
    :param filesetName: ACDC fileset to look up
    :param collectionName: ACDC collection holding the fileset
    :return: the whitelist returned by getLumilistWhitelist, or None when
        an exception was raised (it is logged with its traceback)
    """
    from WMCore.ACDC.DataCollectionService import DataCollectionService

    try:
        logging.info('Creating jobs for ACDC fileset %s', filesetName)
        service = DataCollectionService(couchURL, couchDB)
        return service.getLumilistWhitelist(collectionName, filesetName)
    except Exception as ex:
        msg = "Exception while trying to load goodRunList. " + \
              "Refusing to create any jobs.\nDetails: %s" % str(ex)
        logging.exception(msg)

    return None
def validBlocks(self, task):
    """
    Return blocks that pass the input data restriction according
    to the splitting algorithm

    When self.data is set and ACDCBlock.splitBlockName returns a truthy
    value for its first key, a single entry describing that chunk is
    returned. Otherwise the work is delegated to the splitting function
    registered for self.args['SplittingAlgo'] (self.defaultAlgo when none
    is registered).

    :param task: task providing the ACDC section and the trust-sitelists info
    :return: list of block dictionaries
    :raises WorkQueueWMSpecError: if the task has no ACDC section or the
        splitting algorithm is listed in self.unsupportedAlgos
    """
    acdcInfo = task.getInputACDC()
    if not acdcInfo:
        raise WorkQueueWMSpecError(self.wmspec,
                                   'No acdc section for %s' % task.getPathName())
    acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])

    # if self.data is not passed, assume the data is the input dataset
    # from the spec
    acdcBlockSplit = ACDCBlock.splitBlockName(next(iter(self.data))) if self.data else False

    if not acdcBlockSplit:
        algoName = self.args['SplittingAlgo']
        if algoName in self.unsupportedAlgos:
            raise WorkQueueWMSpecError(self.wmspec,
                                       'ACDC is not supported for %s' % algoName)
        splittingFunc = self.defaultAlgo
        if algoName in self.algoMapping:
            splittingFunc = self.algoMapping[algoName]
        return splittingFunc(acdc, acdcInfo, task)

    chunk = acdc.getChunkInfo(acdcInfo['collection'],
                              acdcBlockSplit['TaskName'],
                              acdcBlockSplit['Offset'],
                              acdcBlockSplit['NumOfFiles'])
    dbsBlock = {
        'Name': next(iter(self.data)),
        'NumberOfFiles': chunk['files'],
        'NumberOfEvents': chunk['events'],
        'NumberOfLumis': chunk['lumis'],
        'ACDC': acdcInfo,
    }
    if task.getTrustSitelists().get('trustlists'):
        dbsBlock["Sites"] = self.sites
    else:
        dbsBlock["Sites"] = self.cric.PNNstoPSNs(chunk["locations"])

    return [dbsBlock]
def validBlocks(self, task):
    """
    Return blocks that pass the input data restriction according
    to the splitting algorithm

    When self.data is set and ACDCBlock.splitBlockName returns a truthy
    value for its first key, a single entry describing that chunk is
    returned. Otherwise the work is delegated to the splitting function
    registered for self.args['SplittingAlgo'] (self.defaultAlgo when none
    is registered).

    :param task: task providing the ACDC section and the trust-sitelists info
    :return: list of block dictionaries
    :raises WorkQueueWMSpecError: if the task has no ACDC section or the
        splitting algorithm is listed in self.unsupportedAlgos
    """
    validBlocks = []

    acdcInfo = task.getInputACDC()
    if not acdcInfo:
        raise WorkQueueWMSpecError(self.wmspec,
                                   'No acdc section for %s' % task.getPathName())
    acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])

    if self.data:
        # next(iter(...)) rather than .keys()[0]: dict views cannot be
        # indexed on Python 3. self.data is non-empty here.
        blockName = next(iter(self.data))
        acdcBlockSplit = ACDCBlock.splitBlockName(blockName)
    else:
        # if self.data is not passed, assume the data is the input dataset
        # from the spec
        blockName = None
        acdcBlockSplit = False

    if acdcBlockSplit:
        block = acdc.getChunkInfo(acdcInfo['collection'],
                                  acdcBlockSplit['TaskName'],
                                  acdcBlockSplit['Offset'],
                                  acdcBlockSplit['NumOfFiles'])
        dbsBlock = {}
        dbsBlock['Name'] = blockName
        dbsBlock['NumberOfFiles'] = block['files']
        dbsBlock['NumberOfEvents'] = block['events']
        dbsBlock['NumberOfLumis'] = block['lumis']
        dbsBlock['ACDC'] = acdcInfo
        if task.getTrustSitelists().get('trustlists'):
            dbsBlock["Sites"] = self.sites
        else:
            dbsBlock["Sites"] = self.siteDB.PNNstoPSNs(block["locations"])
        validBlocks.append(dbsBlock)
    else:
        splittingAlgo = self.args['SplittingAlgo']
        if splittingAlgo in self.unsupportedAlgos:
            raise WorkQueueWMSpecError(self.wmspec,
                                       'ACDC is not supported for %s' % splittingAlgo)
        splittingFunc = self.defaultAlgo
        if splittingAlgo in self.algoMapping:
            splittingFunc = self.algoMapping[splittingAlgo]
        validBlocks = splittingFunc(acdc, acdcInfo, task)

    return validBlocks
def main():
    """
    Debug script: load the file information of one hard-coded ACDC fileset,
    merge it, print the files that contain the wanted lumis and the total
    versus unique lumi counts.

    The process exits with status 1 after printing the elapsed time.
    """
    start = time.time()
    # acdcInfo = match['ACDC']
    acdcInfo = {"database": "acdcserver",
                "fileset": "/pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222/SUS-RunIIFall18wmLHEGS-00025_0",
                "collection": "pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222",
                "server": "https://cmsweb.cern.ch/couchdb"}
    collection = acdcInfo['collection']
    fileset = acdcInfo['fileset']

    dcs = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
    # acdcFileList = dcs.getProductionACDCInfo(acdcInfo['collection'], acdcInfo['fileset'])

    files = dcs._getFilesetInfo(collection, fileset)
    print("%s" % pformat(files[0]))
    files = mergeFilesInfo(files)

    acdcFileList = []
    for merged in files:
        # lumis are taken from the first run entry of each merged file
        acdcFileList.append({"lfn": merged["lfn"],
                             "first_event": merged["first_event"],
                             "lumis": merged["runs"][0]["lumis"],
                             "events": merged["events"]})
    #print("Data retrieved:\n%s" % pformat(acdcFileList))
    print("Retrieved %d files from the ACDCServer" % len(acdcFileList))

    wantedLumis = {252052, 240646}
    listLumis = []
    for summary in acdcFileList:
        listLumis.extend(summary['lumis'])
        if not wantedLumis.intersection(set(summary['lumis'])):
            continue
        print("File: %s with events: %s, contains these lumis: %s" % (summary['lfn'], summary['events'], summary['lumis']))

    totalLumis = len(listLumis)
    uniqueLumis = len(set(listLumis))
    print("Total amount of lumis: %d, where unique are: %d" % (totalLumis, uniqueLumis))

    # with open("chunkfiles.json", 'w') as fo:
    #     json.dump(block, fo)

    end = time.time()
    print("Spent %s secs running so far" % (end - start))
    sys.exit(1)
def test06UploadACDC(self):
    """
    Upload an ACDC collection for a previously injected request.

    Walks the reqmgr request list in reverse and takes the first request
    whose name contains 'RequestCancellation_t'; the test is skipped when
    there is none. The shared request parameters are then pointed at that
    request, and the ACDC collection plus the documents read from the JSON
    fixture are written to the 'wmagent_acdc' couch database.
    """
    cls = self.__class__

    # get previous request we can piggyback on
    requestNames = (entry['WMCore.RequestManager.DataStructs.Request.Request']['RequestName']
                    for entry in reversed(cls.reqmgr.getRequest()))
    originalName = next((name for name in requestNames if 'RequestCancellation_t' in name), None)
    if originalName is None:
        raise nose.SkipTest("no suitable request in reqmgr to resubmit")

    params = cls.requestParams
    params['OriginalRequestName'] = originalName
    params['InitialTaskPath'] = params['InitialTaskPath'] % originalName
    couchUrl = cls.endpoint + '/couchdb'
    params['ACDCServer'] = couchUrl

    # create and upload acdc
    service = DataCollectionService(url=couchUrl, database='wmagent_acdc')
    service.createCollection(originalName, 'integration', 'DMWM')

    # The fixture refers to the request it was recorded for; substitute the
    # name of the request picked above before parsing it.
    fixturePath = os.path.join(getTestBase(), '..', 'data', 'ACDC',
                               'linacre_ACDC_ReReco13JulCosmics_120809_130020_117_120823_200309_5735.json')
    with open(fixturePath) as infile:
        rawJson = infile.read()
    rawJson = rawJson.replace('linacre_ACDC_ReReco13JulCosmics_120809_130020_117_120823_200309_5735',
                              originalName)
    acdcDocs = loads(rawJson)

    Database('wmagent_acdc', couchUrl).commit(acdcDocs)
def lumiListFromACDC(couchURL=None, couchDB=None, filesetName=None, collectionName=None, ignoreACDC=False):
    """
    Load the lumi whitelist of an ACDC fileset.

    :param couchURL: first argument handed to DataCollectionService
    :param couchDB: second argument handed to DataCollectionService
    :param filesetName: fileset passed to getLumilistWhitelist
    :param collectionName: collection passed to getLumilistWhitelist
    :param ignoreACDC: selects what happens when loading fails, see below
    :return: the value returned by getLumilistWhitelist on success. On any
        exception the error is logged and an empty dict is returned if
        ignoreACDC is true, otherwise None (callers must check for it).
    """
    try:
        logging.info('Creating jobs for ACDC fileset %s', filesetName)
        service = DataCollectionService(couchURL, couchDB)
        return service.getLumilistWhitelist(collectionName, filesetName)
    except Exception as ex:
        header = "Exception while trying to load goodRunList\n"
        details = str(ex) + str(traceback.format_exc())
        if not ignoreACDC:
            # An error condition - the caller has to check for None
            logging.error(header + "Refusing to create any jobs.\n" + details)
            return None
        # Logic can go in main function?
        logging.error(header + "Ditching goodRunList\n" + details)
        return {}
def test06UploadACDC(self):
    """
    Create an ACDC collection for an earlier request and upload its documents.

    The reqmgr request list is scanned backwards for a request whose name
    contains 'RequestCancellation_t'; without one the test is skipped.
    The class-level request parameters are updated for that request, and the
    JSON fixture (with the recorded request name swapped for the selected
    one) is committed to the 'wmagent_acdc' couch database.
    """
    testClass = self.__class__
    requestKey = 'WMCore.RequestManager.DataStructs.Request.Request'

    # get previous request we can piggyback on
    for candidate in reversed(testClass.reqmgr.getRequest()):
        candidateName = candidate[requestKey]['RequestName']
        if 'RequestCancellation_t' not in candidateName:
            continue
        testClass.requestParams['OriginalRequestName'] = candidateName
        break
    else:
        raise nose.SkipTest("no suitable request in reqmgr to resubmit")

    requestParams = testClass.requestParams
    originalRequest = requestParams['OriginalRequestName']
    requestParams['InitialTaskPath'] = requestParams['InitialTaskPath'] % originalRequest
    requestParams['ACDCServer'] = testClass.endpoint + '/couchdb'

    # create and upload acdc
    acdcService = DataCollectionService(url=testClass.endpoint + '/couchdb',
                                        database='wmagent_acdc')
    acdcService.createCollection(originalRequest, 'integration', 'DMWM')

    recordedName = 'linacre_ACDC_ReReco13JulCosmics_120809_130020_117_120823_200309_5735'
    jsonFile = os.path.join(getTestBase(), '..', 'data', 'ACDC', recordedName + '.json')
    with open(jsonFile) as infile:
        # swap the recorded request name for the one selected above
        acdcJson = loads(infile.read().replace(recordedName, originalRequest))

    acdcDatabase = Database('wmagent_acdc', testClass.endpoint + '/couchdb')
    acdcDatabase.commit(acdcJson)
def __init__(self, config):
    """
    Initialise class members.

    :param config: component configuration; the ErrorHandler and ACDC
        sections are read here, plus General.ReqMgr2ServiceURL when no
        Tier0Feeder section is present.

    With a Tier0Feeder section, maxRetries is taken from the ErrorHandler
    section and no ReqMgrAux client is created. Otherwise a ReqMgrAux client
    is created and maxRetries is not assigned in this method.
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    currentThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=currentThread.logger,
                                 dbinterface=currentThread.dbi)
    self.changeState = ChangeState(self.config)

    if hasattr(self.config, "Tier0Feeder"):
        self.reqAuxDB = None
        self.maxRetries = self.config.ErrorHandler.maxRetries
    else:
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

    self.exitCodesNoRetry = []

    # Optional ErrorHandler settings and their fallback values
    handlerConfig = self.config.ErrorHandler
    self.maxProcessSize = getattr(handlerConfig, 'maxProcessSize', 250)
    self.maxFailTime = getattr(handlerConfig, 'maxFailTime', 32 * 3600)
    self.readFWJR = getattr(handlerConfig, 'readFWJR', False)
    self.passCodes = getattr(handlerConfig, 'passExitCodes', [])

    # DAO objects, created in the order (attribute name, DAO class name)
    for attrName, daoName in (("getJobs", "Jobs.GetAllJobs"),
                              ("idLoad", "Jobs.LoadFromIDWithType"),
                              ("loadAction", "Jobs.LoadForErrorHandler")):
        setattr(self, attrName, self.daoFactory(classname=daoName))

    self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                database=config.ACDC.database)

    # NOTE(review): setupComponentParam is defined elsewhere; presumably it
    # fills in the remaining parameters (e.g. maxRetries) - confirm.
    self.setupComponentParam()

    return
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    An event base splitting algorithm.  All available files are split into a
    set number of events per job.

    The code in this block parses the splitting parameters and, when a
    'collectionName' is supplied, loads the ACDC file list from couch.

    Recognised keyword arguments: events_per_job (default 100),
    events_per_lumi (default: events_per_job), include_parents,
    lheInputFiles, performance, collectionName, couchURL, couchDB,
    filesetName, owner, group.

    :return: None. If loading the ACDC information fails, the error is
        logged and the method returns without creating anything.
    """
    eventsPerJob = int(kwargs.get("events_per_job", 100))
    eventsPerLumi = int(kwargs.get("events_per_lumi", eventsPerJob))
    getParents = kwargs.get("include_parents", False)
    lheInput = kwargs.get("lheInputFiles", False)
    collectionName = kwargs.get('collectionName', None)
    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))
    acdcFileList = []

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            # Imported lazily; an import failure is handled like any other
            # loading error below.
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')

            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            acdcFileList = dcs.getProductionACDCInfo(collectionName, filesetName, owner, group)
        except Exception as ex:
            # "except X as y" is valid on Python 2.6+ and Python 3, unlike
            # the comma form.
            msg = "Exception while trying to load goodRunList\n"
            msg += "Refusing to create any jobs.\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            return
def _getDBSBlock(self, match, wmspec):
    """
    Get DBS info for this block.

    :param match: dictionary with the keys 'Inputs' and 'ACDC', plus 'Dbs'
        and 'TaskName' when 'ACDC' is empty
    :param wmspec: workload spec; getOwner(), getTask() and
        locationDataSourceFlag() are used
    :return: tuple (blockName, blockInfo). For ACDC input blockInfo is
        {"Files": <chunk files>}; otherwise it is the DBS entry for the
        block, with 'StorageElements' added when the spec's
        locationDataSourceFlag() is set.
    """
    inputs = match['Inputs']
    # First (and only supported) input block. next(iter(...)) returns the
    # same key the former ``keys()[0]`` did and also works on Python 3.
    blockName = next(iter(inputs))  # TODO: Allow more than one

    if match['ACDC']:
        acdcInfo = match['ACDC']
        acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
        # NOTE(review): the result is not used below; the call is kept in
        # case it has side effects (e.g. raising for an unknown
        # collection) - confirm before removing.
        collection = acdc.getDataCollection(acdcInfo['collection'])
        splitedBlockName = ACDCBlock.splitBlockName(blockName)
        owner = wmspec.getOwner()
        fileLists = acdc.getChunkFiles(acdcInfo['collection'],
                                       acdcInfo['fileset'],
                                       splitedBlockName['Offset'],
                                       splitedBlockName['NumOfFiles'],
                                       user=owner.get("name"),
                                       group=owner.get("group"))
        return blockName, {"Files": fileLists}

    dbs = get_dbs(match['Dbs'])
    if wmspec.getTask(match['TaskName']).parentProcessingFlag():
        dbsBlockDict = dbs.getFileBlockWithParents(blockName)
    else:
        dbsBlockDict = dbs.getFileBlock(blockName)

    if wmspec.locationDataSourceFlag():
        blockInfo = dbsBlockDict[blockName]
        seElements = set()
        # inputs[blockName] is the value paired with the first key, i.e.
        # what ``values()[0]`` used to return.  TODO: Allow more than one
        for cmsSite in inputs[blockName]:
            seElements.update(self.SiteDB.cmsNametoSE(cmsSite))
        blockInfo['StorageElements'] = list(seElements)

    return blockName, dbsBlockDict[blockName]
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    An event base splitting algorithm.  All available files are split into a
    set number of events per job.

    This block reads the splitting parameters from kwargs and, if a
    'collectionName' is given, fetches the ACDC file list from couch.

    Keyword arguments read here: events_per_job (default 100),
    events_per_lumi (defaults to events_per_job), include_parents,
    lheInputFiles, performance, collectionName, couchURL, couchDB,
    filesetName, owner, group.

    :return: None. A failure while loading the ACDC information is logged
        and ends the method early.
    """
    eventsPerJob = int(kwargs.get("events_per_job", 100))
    eventsPerLumi = int(kwargs.get("events_per_lumi", eventsPerJob))
    getParents = kwargs.get("include_parents", False)
    lheInput = kwargs.get("lheInputFiles", False)
    collectionName = kwargs.get('collectionName', None)
    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))
    acdcFileList = []

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            # Local import: a failing import ends up in the handler below
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')

            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            acdcFileList = dcs.getProductionACDCInfo(collectionName,
                                                     filesetName,
                                                     owner,
                                                     group)
        except Exception as ex:
            # The comma form "except Exception, ex" does not parse on
            # Python 3; the "as" form works on 2.6+ and 3.
            msg = "Exception while trying to load goodRunList\n"
            msg += "Refusing to create any jobs.\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            return
def __init__(self, config):
    """
    Initialise class members.

    :param config: component configuration; the ErrorHandler and ACDC
        sections are read.
    :raises ErrorHandlerException: when ErrorHandler.maxRetries is a
        dictionary without a 'default' entry.
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    myThread = threading.currentThread()

    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)
    self.changeState = ChangeState(self.config)

    # maxRetries may be a plain value or a per-job-type dictionary;
    # normalise to a dictionary. isinstance also accepts dict subclasses.
    self.maxRetries = self.config.ErrorHandler.maxRetries
    if not isinstance(self.maxRetries, dict):
        self.maxRetries = {'default': self.maxRetries}
    if 'default' not in self.maxRetries:
        raise ErrorHandlerException('Max retries for the default job type must be specified')

    self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
    # Copy the configured list: it is extended below and the list object
    # held by the configuration must not be modified as a side effect.
    self.exitCodes = list(getattr(self.config.ErrorHandler, 'failureExitCodes', []))
    self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
    self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
    self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

    self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
    self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
    self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

    self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                database=config.ACDC.database)

    # initialize the alert framework (if available - config.Alert present)
    #    self.sendAlert will be then be available
    self.initAlerts(compName="ErrorHandler")

    # Some exit codes imply an immediate failure, non-configurable
    self.exitCodes.extend(WMJobPermanentSystemErrors)

    return
def __init__(self, config):
    """
    Initialise class members.

    :param config: component configuration; the ErrorHandler and ACDC
        sections are read, plus General.ReqMgr2ServiceURL and
        Agent.hostName when no Tier0Feeder section is present.
    :raises ErrorHandlerException: when the retry settings are a dictionary
        without a 'default' entry.

    With a Tier0Feeder section the retry limits come from the ErrorHandler
    section; otherwise they are read from the "MaxRetries" entry of the
    agent configuration returned by ReqMgrAux.
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    currentThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=currentThread.logger,
                                 dbinterface=currentThread.dbi)
    self.changeState = ChangeState(self.config)

    if hasattr(self.config, "Tier0Feeder"):
        self.reqAuxDB = None
        self.maxRetries = self.config.ErrorHandler.maxRetries
    else:
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
        self.maxRetries = agentConfig.get("MaxRetries")

    # A plain value is treated as the limit of the 'default' job type
    if not isinstance(self.maxRetries, dict):
        self.maxRetries = dict(default=self.maxRetries)
    if 'default' not in self.maxRetries:
        raise ErrorHandlerException('Max retries for the default job type must be specified')

    self.exitCodesNoRetry = []

    # Optional ErrorHandler settings and their fallback values
    handlerConfig = self.config.ErrorHandler
    self.maxProcessSize = getattr(handlerConfig, 'maxProcessSize', 250)
    self.maxFailTime = getattr(handlerConfig, 'maxFailTime', 32 * 3600)
    self.readFWJR = getattr(handlerConfig, 'readFWJR', False)
    self.passCodes = getattr(handlerConfig, 'passExitCodes', [])

    buildDAO = self.daoFactory
    self.getJobs = buildDAO(classname="Jobs.GetAllJobs")
    self.idLoad = buildDAO(classname="Jobs.LoadFromIDWithType")
    self.loadAction = buildDAO(classname="Jobs.LoadForErrorHandler")

    acdcConfig = config.ACDC
    self.dataCollection = DataCollectionService(url=acdcConfig.couchurl,
                                                database=acdcConfig.database)

    return
def __init__(self, config):
    """
    Initialise class members.

    :param config: component configuration; the ErrorHandler and ACDC
        sections are read here, plus General.ReqMgr2ServiceURL when no
        Tier0Feeder section is present.

    Without a Tier0Feeder section a ReqMgrAux client is created and
    maxRetries is left unset by this method; with one, reqAuxDB is None and
    maxRetries comes from the ErrorHandler section.
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    thisThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=thisThread.logger,
                                 dbinterface=thisThread.dbi)
    self.changeState = ChangeState(self.config)

    if not hasattr(self.config, "Tier0Feeder"):
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
    else:
        self.reqAuxDB = None
        self.maxRetries = self.config.ErrorHandler.maxRetries

    self.exitCodesNoRetry = []

    # Optional ErrorHandler settings: (attribute/config name, fallback)
    errorHandlerConfig = self.config.ErrorHandler
    for settingName, fallback in (('maxProcessSize', 250),
                                  ('maxFailTime', 32 * 3600),
                                  ('readFWJR', False)):
        setattr(self, settingName, getattr(errorHandlerConfig, settingName, fallback))
    # stored under a different name than its configuration key
    self.passCodes = getattr(errorHandlerConfig, 'passExitCodes', [])

    makeDAO = self.daoFactory
    self.getJobs = makeDAO(classname="Jobs.GetAllJobs")
    self.idLoad = makeDAO(classname="Jobs.LoadFromIDWithType")
    self.loadAction = makeDAO(classname="Jobs.LoadForErrorHandler")

    self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                database=config.ACDC.database)

    # NOTE(review): setupComponentParam is defined elsewhere; presumably it
    # sets the remaining parameters such as maxRetries - confirm.
    self.setupComponentParam()

    return
class AccountantWorker(WMConnectionBase): """ Class that actually does the work of parsing FWJRs for the Accountant Run through ProcessPool """ def __init__(self, config): """ __init__ Create all DAO objects that are used by this class. """ WMConnectionBase.__init__(self, "WMCore.WMBS") myThread = threading.currentThread() self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer", logger=myThread.logger, dbinterface=myThread.dbi) self.getOutputMapAction = self.daofactory( classname="Jobs.GetOutputMap") self.bulkAddToFilesetAction = self.daofactory( classname="Fileset.BulkAddByLFN") self.bulkParentageAction = self.daofactory( classname="Files.AddBulkParentage") self.getJobTypeAction = self.daofactory(classname="Jobs.GetType") self.getParentInfoAction = self.daofactory( classname="Files.GetParentInfo") self.setParentageByJob = self.daofactory( classname="Files.SetParentageByJob") self.setParentageByMergeJob = self.daofactory( classname="Files.SetParentageByMergeJob") self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi") self.setFileLocation = self.daofactory( classname="Files.SetLocationByLFN") self.setFileAddChecksum = self.daofactory( classname="Files.AddChecksumByLFN") self.addFileAction = self.daofactory(classname="Files.Add") self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput") self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk") self.getWorkflowSpec = self.daofactory( classname="Workflow.GetSpecAndNameFromTask") self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID") self.getFullJobInfo = self.daofactory( classname="Jobs.LoadForErrorHandler") self.getJobTaskNameAction = self.daofactory( classname="Jobs.GetFWJRTaskName") self.pnn_to_psn = self.daofactory( classname="Locations.GetPNNtoPSNMapping").execute() self.dbsStatusAction = self.dbsDaoFactory( classname="DBSBufferFiles.SetStatus") self.dbsParentStatusAction = self.dbsDaoFactory( classname="DBSBufferFiles.GetParentStatus") 
self.dbsChildrenAction = self.dbsDaoFactory( classname="DBSBufferFiles.GetChildren") self.dbsCreateFiles = self.dbsDaoFactory( classname="DBSBufferFiles.Add") self.dbsSetLocation = self.dbsDaoFactory( classname="DBSBufferFiles.SetLocationByLFN") self.dbsInsertLocation = self.dbsDaoFactory( classname="DBSBufferFiles.AddLocation") self.dbsSetChecksum = self.dbsDaoFactory( classname="DBSBufferFiles.AddChecksumByLFN") self.dbsSetRunLumi = self.dbsDaoFactory( classname="DBSBufferFiles.AddRunLumi") self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow") self.dbsLFNHeritage = self.dbsDaoFactory( classname="DBSBufferFiles.BulkHeritageParent") self.stateChanger = ChangeState(config) # Decide whether or not to attach jobReport to returned value self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False) # Store location for the specs for DBS self.specDir = getattr(config.JobAccountant, 'specDir', None) # maximum RAW EDM size for Repack output before data is put into Error dataset and skips PromptReco self.maxAllowedRepackOutputSize = getattr( config.JobAccountant, 'maxAllowedRepackOutputSize', 12 * 1024 * 1024 * 1024) # ACDC service self.dataCollection = DataCollectionService( url=config.ACDC.couchurl, database=config.ACDC.database) jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url'] jobDBName = config.JobStateMachine.couchDBName jobCouchdb = CouchServer(jobDBurl) self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName) self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL, appName="WMStatsAgent") # Hold data for later commital self.dbsFilesToCreate = [] self.wmbsFilesToBuild = [] self.wmbsMergeFilesToBuild = [] self.fileLocation = None self.mergedOutputFiles = [] self.listOfJobsToSave = [] self.listOfJobsToFail = [] self.filesetAssoc = [] self.parentageBinds = [] self.parentageBindsForMerge = [] self.jobsWithSkippedFiles = {} self.count = 0 self.datasetAlgoID = collections.deque(maxlen=1000) 
self.datasetAlgoPaths = collections.deque(maxlen=1000) self.dbsLocations = set() self.workflowIDs = collections.deque(maxlen=1000) self.workflowPaths = collections.deque(maxlen=1000) self.phedex = PhEDEx() self.locLists = self.phedex.getNodeMap() return def reset(self): """ _reset_ Reset all global vars between runs. """ self.dbsFilesToCreate = [] self.wmbsFilesToBuild = [] self.wmbsMergeFilesToBuild = [] self.fileLocation = None self.mergedOutputFiles = [] self.listOfJobsToSave = [] self.listOfJobsToFail = [] self.filesetAssoc = [] self.parentageBinds = [] self.parentageBindsForMerge = [] self.jobsWithSkippedFiles = {} gc.collect() return def loadJobReport(self, parameters): """ _loadJobReport_ Given a framework job report on disk, load it and return a FwkJobReport instance. If there is any problem loading or parsing the framework job report return None. """ # The jobReportPath may be prefixed with "file://" which needs to be # removed so it doesn't confuse the FwkJobReport() parser. jobReportPath = parameters.get("fwjr_path", None) if not jobReportPath: logging.error("Bad FwkJobReport Path: %s" % jobReportPath) return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty") jobReportPath = jobReportPath.replace("file://", "") if not os.path.exists(jobReportPath): logging.error("Bad FwkJobReport Path: %s" % jobReportPath) return self.createMissingFWKJR( parameters, 99999, 'Cannot find file in jobReport path: %s' % jobReportPath) if os.path.getsize(jobReportPath) == 0: logging.error("Empty FwkJobReport: %s" % jobReportPath) return self.createMissingFWKJR( parameters, 99998, 'jobReport of size 0: %s ' % jobReportPath) jobReport = Report() try: jobReport.load(jobReportPath) except Exception as ex: msg = "Error loading jobReport %s\n" % jobReportPath msg += str(ex) logging.error(msg) logging.debug("Failing job: %s\n" % parameters) return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport') if len(jobReport.listSteps()) == 0: 
logging.error("FwkJobReport with no steps: %s" % jobReportPath) return self.createMissingFWKJR( parameters, 99997, 'jobReport with no steps: %s ' % jobReportPath) return jobReport def isTaskExistInFWJR(self, jobReport, jobStatus): """ If taskName is not available in the FWJR, then tries to recover it getting data from the SQL database. """ if not jobReport.getTaskName(): logging.warning( "Trying to recover a corrupted FWJR for a %s job with job id %s" % (jobStatus, jobReport.getJobID())) jobInfo = self.getJobTaskNameAction.execute( jobId=jobReport.getJobID(), conn=self.getDBConn(), transaction=self.existingTransaction()) jobReport.setTaskName(jobInfo['taskName']) jobReport.save(jobInfo['fwjr_path']) if not jobReport.getTaskName(): msg = "Report to developers. Failed to recover corrupted fwjr for %s job id %s" % ( jobStatus, jobReport.getJobID()) raise AccountantWorkerException(msg) else: logging.info( "TaskName '%s' successfully recovered and added to fwjr id %s." % (jobReport.getTaskName(), jobReport.getJobID())) return def __call__(self, parameters): """ __call__ Handle a completed job. The parameters dictionary will contain the job ID and the path to the framework job report. 
""" returnList = [] self.reset() for job in parameters: logging.info("Handling %s" % job["fwjr_path"]) # Load the job and set the ID fwkJobReport = self.loadJobReport(job) fwkJobReport.setJobID(job['id']) jobSuccess = self.handleJob(jobID=job["id"], fwkJobReport=fwkJobReport) if self.returnJobReport: returnList.append({ 'id': job["id"], 'jobSuccess': jobSuccess, 'jobReport': fwkJobReport }) else: returnList.append({'id': job["id"], 'jobSuccess': jobSuccess}) self.count += 1 self.beginTransaction() # Now things done at the end of the job # Do what we can with WMBS files self.handleWMBSFiles(self.wmbsFilesToBuild, self.parentageBinds) # handle merge files separately since parentage need to set # separately to support robust merge self.handleWMBSFiles(self.wmbsMergeFilesToBuild, self.parentageBindsForMerge) # Create DBSBufferFiles self.createFilesInDBSBuffer() # Handle filesetAssoc if len(self.filesetAssoc) > 0: self.bulkAddToFilesetAction.execute( binds=self.filesetAssoc, conn=self.getDBConn(), transaction=self.existingTransaction()) # Move successful jobs to successful if len(self.listOfJobsToSave) > 0: idList = [x['id'] for x in self.listOfJobsToSave] outcomeBinds = [{ 'jobid': x['id'], 'outcome': x['outcome'] } for x in self.listOfJobsToSave] self.setBulkOutcome.execute(binds=outcomeBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) self.jobCompleteInput.execute( id=idList, lfnsToSkip=self.jobsWithSkippedFiles, conn=self.getDBConn(), transaction=self.existingTransaction()) self.stateChanger.propagate(self.listOfJobsToSave, "success", "complete") # If we have failed jobs, fail them if len(self.listOfJobsToFail) > 0: outcomeBinds = [{ 'jobid': x['id'], 'outcome': x['outcome'] } for x in self.listOfJobsToFail] self.setBulkOutcome.execute(binds=outcomeBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) self.stateChanger.propagate(self.listOfJobsToFail, "jobfailed", "complete") # Arrange WMBS parentage if len(self.parentageBinds) > 
0: self.setParentageByJob.execute( binds=self.parentageBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) if len(self.parentageBindsForMerge) > 0: self.setParentageByMergeJob.execute( binds=self.parentageBindsForMerge, conn=self.getDBConn(), transaction=self.existingTransaction()) # Straighten out DBS Parentage if len(self.mergedOutputFiles) > 0: self.handleDBSBufferParentage() if len(self.jobsWithSkippedFiles) > 0: self.handleSkippedFiles() self.commitTransaction(existingTransaction=False) return returnList def outputFilesetsForJob(self, outputMap, merged, moduleLabel): """ _outputFilesetsForJob_ Determine if the file should be placed in any other fileset. Note that this will not return the JobGroup output fileset as all jobs will have their output placed there. """ if moduleLabel not in outputMap: logging.info("Output module label missing from output map.") return [] outputFilesets = [] for outputFileset in outputMap[moduleLabel]: if merged == False and outputFileset["output_fileset"] != None: outputFilesets.append(outputFileset["output_fileset"]) else: if outputFileset["merged_output_fileset"] != None: outputFilesets.append( outputFileset["merged_output_fileset"]) return outputFilesets def addFileToDBS(self, jobReportFile, task, errorDataset=False): """ _addFileToDBS_ Add a file that was output from a job to the DBS buffer. 
""" datasetInfo = jobReportFile["dataset"] dbsFile = DBSBufferFile(lfn=jobReportFile["lfn"], size=jobReportFile["size"], events=jobReportFile["events"], checksums=jobReportFile["checksums"], status="NOTUPLOADED") dbsFile.setAlgorithm(appName=datasetInfo["applicationName"], appVer=datasetInfo["applicationVersion"], appFam=jobReportFile["module_label"], psetHash="GIBBERISH", configContent=jobReportFile.get('configURL')) if errorDataset: dbsFile.setDatasetPath( "/%s/%s/%s" % (datasetInfo["primaryDataset"] + "-Error", datasetInfo["processedDataset"], datasetInfo["dataTier"])) else: dbsFile.setDatasetPath( "/%s/%s/%s" % (datasetInfo["primaryDataset"], datasetInfo["processedDataset"], datasetInfo["dataTier"])) dbsFile.setValidStatus( validStatus=jobReportFile.get("validStatus", None)) dbsFile.setProcessingVer(ver=jobReportFile.get('processingVer', None)) dbsFile.setAcquisitionEra( era=jobReportFile.get('acquisitionEra', None)) dbsFile.setGlobalTag(globalTag=jobReportFile.get('globalTag', None)) #TODO need to find where to get the prep id dbsFile.setPrepID(prep_id=jobReportFile.get('prep_id', None)) dbsFile['task'] = task for run in jobReportFile["runs"]: newRun = Run(runNumber=run.run) newRun.extend(run.lumis) dbsFile.addRun(newRun) dbsFile.setLocation(pnn=list(jobReportFile["locations"])[0], immediateSave=False) self.dbsFilesToCreate.append(dbsFile) return def findDBSParents(self, lfn): """ _findDBSParents_ Find the parent of the file in DBS This is meant to be called recursively """ parentsInfo = self.getParentInfoAction.execute( [lfn], conn=self.getDBConn(), transaction=self.existingTransaction()) newParents = set() for parentInfo in parentsInfo: # This will catch straight to merge files that do not have redneck # parents. We will mark the straight to merge file from the job # as a child of the merged parent. 
if int(parentInfo["merged"]) == 1: newParents.add(parentInfo["lfn"]) elif parentInfo['gpmerged'] == None: continue # Handle the files that result from merge jobs that aren't redneck # children. We have to setup parentage and then check on whether or # not this file has any redneck children and update their parentage # information. elif int(parentInfo["gpmerged"]) == 1: newParents.add(parentInfo["gplfn"]) # If that didn't work, we've reached the great-grandparents # And we have to work via recursion else: parentSet = self.findDBSParents(lfn=parentInfo['gplfn']) for parent in parentSet: newParents.add(parent) return newParents def addFileToWMBS(self, jobType, fwjrFile, jobMask, task, jobID=None): """ _addFileToWMBS_ Add a file that was produced in a job to WMBS. """ fwjrFile["first_event"] = jobMask["FirstEvent"] if fwjrFile["first_event"] == None: fwjrFile["first_event"] = 0 if jobType == "Merge" and fwjrFile["module_label"] != "logArchive": setattr(fwjrFile["fileRef"], 'merged', True) fwjrFile["merged"] = True wmbsFile = self.createFileFromDataStructsFile(file=fwjrFile, jobID=jobID) if jobType == "Merge": self.wmbsMergeFilesToBuild.append(wmbsFile) else: self.wmbsFilesToBuild.append(wmbsFile) if fwjrFile["merged"]: self.addFileToDBS( fwjrFile, task, jobType == "Repack" and fwjrFile["size"] > self.maxAllowedRepackOutputSize) return wmbsFile def _mapLocation(self, fwkJobReport): for file in fwkJobReport.getAllFileRefs(): if file and hasattr(file, 'location'): file.location = self.phedex.getBestNodeName( file.location, self.locLists) def handleJob(self, jobID, fwkJobReport): """ _handleJob_ Figure out if a job was successful or not, handle it appropriately (parse FWJR, update WMBS) and return the success status as a boolean """ jobSuccess = fwkJobReport.taskSuccessful() outputMap = self.getOutputMapAction.execute( jobID=jobID, conn=self.getDBConn(), transaction=self.existingTransaction()) jobType = self.getJobTypeAction.execute( jobID=jobID, conn=self.getDBConn(), 
transaction=self.existingTransaction()) if jobSuccess: fileList = fwkJobReport.getAllFiles() # consistency check comparing outputMap to fileList # they should match except for some limited special cases outputModules = set([]) for fwjrFile in fileList: outputModules.add(fwjrFile['outputModule']) if set(outputMap.keys()) == outputModules: pass elif jobType == "LogCollect" and len( outputMap.keys()) == 0 and outputModules == set( ['LogCollect']): pass elif jobType == "Merge" and set(outputMap.keys()) == set([ 'Merged', 'MergedError', 'logArchive' ]) and outputModules == set(['Merged', 'logArchive']): pass elif jobType == "Merge" and set(outputMap.keys()) == set([ 'Merged', 'MergedError', 'logArchive' ]) and outputModules == set(['MergedError', 'logArchive']): pass elif jobType == "Express" and set( outputMap.keys()).difference(outputModules) == set( ['write_RAW']): pass else: failJob = True if jobType in ["Processing", "Production"]: cmsRunSteps = 0 for step in fwkJobReport.listSteps(): if step.startswith("cmsRun"): cmsRunSteps += 1 if cmsRunSteps > 1: failJob = False if failJob: jobSuccess = False logging.error( "Job %d , list of expected outputModules does not match job report, failing job", jobID) logging.debug("Job %d , expected outputModules %s", jobID, sorted(outputMap.keys())) logging.debug("Job %d , fwjr outputModules %s", jobID, sorted(outputModules)) fileList = fwkJobReport.getAllFilesFromStep( step='logArch1') else: logging.debug( "Job %d , list of expected outputModules does not match job report, accepted for multi-step CMSSW job", jobID) else: fileList = fwkJobReport.getAllFilesFromStep(step='logArch1') if jobSuccess: logging.info("Job %d , handle successful job", jobID) else: logging.warning("Job %d , bad jobReport, failing job", jobID) # make sure the task name is present in FWJR (recover from WMBS if needed) if len(fileList) > 0: if jobSuccess: self.isTaskExistInFWJR(fwkJobReport, "success") else: self.isTaskExistInFWJR(fwkJobReport, "failed") # 
special check for LogCollect jobs skipLogCollect = False if jobSuccess and jobType == "LogCollect": for fwjrFile in fileList: try: # this assumes there is only one file for LogCollect jobs, not sure what happend if that changes self.associateLogCollectToParentJobsInWMStats( fwkJobReport, fwjrFile["lfn"], fwkJobReport.getTaskName()) except Exception as ex: skipLogCollect = True logging.error( "Error occurred: associating log collect location, will try again\n %s" % str(ex)) break # now handle the job (unless the special LogCollect check failed) if not skipLogCollect: wmbsJob = Job(id=jobID) wmbsJob.load() outputID = wmbsJob.loadOutputID() wmbsJob.getMask() wmbsJob["fwjr"] = fwkJobReport if jobSuccess: wmbsJob["outcome"] = "success" else: wmbsJob["outcome"] = "failure" for fwjrFile in fileList: logging.debug("Job %d , register output %s", jobID, fwjrFile["lfn"]) wmbsFile = self.addFileToWMBS(jobType, fwjrFile, wmbsJob["mask"], jobID=jobID, task=fwkJobReport.getTaskName()) merged = fwjrFile['merged'] moduleLabel = fwjrFile["module_label"] if merged: self.mergedOutputFiles.append(wmbsFile) self.filesetAssoc.append({ "lfn": wmbsFile["lfn"], "fileset": outputID }) # LogCollect jobs have no output fileset if jobType == "LogCollect": pass # Repack jobs that wrote too large merged output skip output filesets elif jobType == "Repack" and merged and wmbsFile[ "size"] > self.maxAllowedRepackOutputSize: pass else: outputFilesets = self.outputFilesetsForJob( outputMap, merged, moduleLabel) for outputFileset in outputFilesets: self.filesetAssoc.append({ "lfn": wmbsFile["lfn"], "fileset": outputFileset }) # Check if the job had any skipped files, put them in ACDC containers # We assume full file processing (no job masks) if jobSuccess: skippedFiles = fwkJobReport.getAllSkippedFiles() if skippedFiles and jobType not in ['LogCollect', 'Cleanup']: self.jobsWithSkippedFiles[jobID] = skippedFiles # Only save once job is done, and we're sure we made it through okay 
self._mapLocation(wmbsJob['fwjr']) if jobSuccess: self.listOfJobsToSave.append(wmbsJob) else: self.listOfJobsToFail.append(wmbsJob) return jobSuccess def associateLogCollectToParentJobsInWMStats(self, fwkJobReport, logAchiveLFN, task): """ _associateLogCollectToParentJobsInWMStats_ Associate a logArchive output to its parent job """ inputFileList = fwkJobReport.getAllInputFiles() requestName = task.split('/')[1] keys = [] for inputFile in inputFileList: keys.append([requestName, inputFile["lfn"]]) resultRows = self.fwjrCouchDB.loadView( "FWJRDump", 'jobsByOutputLFN', options={"stale": "update_after"}, keys=keys)['rows'] if len(resultRows) > 0: #get data from wmbs parentWMBSJobIDs = [] for row in resultRows: parentWMBSJobIDs.append({"jobid": row["value"]}) #update Job doc in wmstats results = self.getJobInfoByID.execute(parentWMBSJobIDs) parentJobNames = [] if isinstance(results, list): for jobInfo in results: parentJobNames.append(jobInfo['name']) else: parentJobNames.append(results['name']) self.localWMStats.updateLogArchiveLFN(parentJobNames, logAchiveLFN) else: #TODO: if the couch db is consistent with DB this should be removed (checking resultRow > 0) #It need to be failed and retried. logging.error( "job report is missing for updating log archive mapping\n Input file list\n %s" % inputFileList) return def createMissingFWKJR(self, parameters, errorCode=999, errorDescription='Failure of unknown type'): """ _createMissingFWJR_ Create a missing FWJR if the report can't be found by the code in the path location. """ report = Report() report.addError("cmsRun1", 84, errorCode, errorDescription) report.data.cmsRun1.status = "Failed" return report def createFilesInDBSBuffer(self): """ _createFilesInDBSBuffer_ It does the actual job of creating things in DBSBuffer WARNING: This assumes all files in a job have the same final location """ if len(self.dbsFilesToCreate) == 0: # Whoops, nothing to do! 
return dbsFileTuples = [] dbsFileLoc = [] dbsCksumBinds = [] runLumiBinds = [] selfChecksums = None jobLocations = set() for dbsFile in self.dbsFilesToCreate: # Append a tuple in the format specified by DBSBufferFiles.Add # Also run insertDatasetAlgo assocID = None datasetAlgoPath = '%s:%s:%s:%s:%s:%s:%s:%s' % ( dbsFile['datasetPath'], dbsFile["appName"], dbsFile["appVer"], dbsFile["appFam"], dbsFile["psetHash"], dbsFile['processingVer'], dbsFile['acquisitionEra'], dbsFile['globalTag']) # First, check if this is in the cache if datasetAlgoPath in self.datasetAlgoPaths: for da in self.datasetAlgoID: if da['datasetAlgoPath'] == datasetAlgoPath: assocID = da['assocID'] break if not assocID: # Then we have to get it ourselves try: assocID = dbsFile.insertDatasetAlgo() self.datasetAlgoPaths.append(datasetAlgoPath) self.datasetAlgoID.append({ 'datasetAlgoPath': datasetAlgoPath, 'assocID': assocID }) except WMException: raise except Exception as ex: msg = "Unhandled exception while inserting datasetAlgo: %s\n" % datasetAlgoPath msg += str(ex) logging.error(msg) raise AccountantWorkerException(msg) # Associate the workflow to the file using the taskPath and the requestName # TODO: debug why it happens and then drop/recover these cases automatically taskPath = dbsFile.get('task') if not taskPath: msg = "Can't do workflow association, report this error to a developer.\n" msg += "DbsFile : %s" % str(dbsFile) raise AccountantWorkerException(msg) workflowName = taskPath.split('/')[1] workflowPath = '%s:%s' % (workflowName, taskPath) if workflowPath in self.workflowPaths: for wf in self.workflowIDs: if wf['workflowPath'] == workflowPath: workflowID = wf['workflowID'] break else: result = self.dbsGetWorkflow.execute( workflowName, taskPath, conn=self.getDBConn(), transaction=self.existingTransaction()) workflowID = result['id'] self.workflowPaths.append(workflowPath) self.workflowIDs.append({ 'workflowPath': workflowPath, 'workflowID': workflowID }) lfn = dbsFile['lfn'] 
selfChecksums = dbsFile['checksums'] jobLocation = dbsFile.getLocations()[0] jobLocations.add(jobLocation) dbsFileTuples.append((lfn, dbsFile['size'], dbsFile['events'], assocID, dbsFile['status'], workflowID)) dbsFileLoc.append({'lfn': lfn, 'pnn': jobLocation}) if dbsFile['runs']: runLumiBinds.append({'lfn': lfn, 'runs': dbsFile['runs']}) if selfChecksums: # If we have checksums we have to create a bind # For each different checksum for entry in selfChecksums.keys(): dbsCksumBinds.append({ 'lfn': lfn, 'cksum': selfChecksums[entry], 'cktype': entry }) try: diffLocation = jobLocations.difference(self.dbsLocations) for jobLocation in diffLocation: self.dbsInsertLocation.execute( siteName=jobLocation, conn=self.getDBConn(), transaction=self.existingTransaction()) self.dbsLocations.add(jobLocation) self.dbsCreateFiles.execute(files=dbsFileTuples, conn=self.getDBConn(), transaction=self.existingTransaction()) self.dbsSetLocation.execute(binds=dbsFileLoc, conn=self.getDBConn(), transaction=self.existingTransaction()) self.dbsSetChecksum.execute(bulkList=dbsCksumBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) if len(runLumiBinds) > 0: self.dbsSetRunLumi.execute( file=runLumiBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Got exception while inserting files into DBSBuffer!\n" msg += str(ex) logging.error(msg) logging.debug("Listing binds:") logging.debug("jobLocation: %s\n" % jobLocation) logging.debug("dbsFiles: %s\n" % dbsFileTuples) logging.debug("dbsFileLoc: %s\n" % dbsFileLoc) logging.debug("Checksum binds: %s\n" % dbsCksumBinds) logging.debug("RunLumi binds: %s\n" % runLumiBinds) raise AccountantWorkerException(msg) # Now that we've created those files, clear the list self.dbsFilesToCreate = [] return def handleWMBSFiles(self, wmbsFilesToBuild, parentageBinds): """ _handleWMBSFiles_ Do what can be done in bulk in bulk """ if len(wmbsFilesToBuild) == 0: # Nothing to do 
return runLumiBinds = [] fileCksumBinds = [] fileLocations = [] fileCreate = [] for wmbsFile in wmbsFilesToBuild: lfn = wmbsFile['lfn'] if lfn == None: continue selfChecksums = wmbsFile['checksums'] # by jobType add to different parentage relation # if it is the merge job, don't include the parentage on failed input files. # otherwise parentage is set for all input files. parentageBinds.append({'child': lfn, 'jobid': wmbsFile['jid']}) if wmbsFile['runs']: runLumiBinds.append({'lfn': lfn, 'runs': wmbsFile['runs']}) if len(wmbsFile.getLocations()) > 0: outpnn = wmbsFile.getLocations()[0] if self.pnn_to_psn.get(outpnn, None): fileLocations.append({'lfn': lfn, 'location': outpnn}) else: msg = "PNN doesn't exist in wmbs_location_sename table: %s (investigate)" % outpnn logging.error(msg) raise AccountantWorkerException(msg) if selfChecksums: # If we have checksums we have to create a bind # For each different checksum for entry in selfChecksums.keys(): fileCksumBinds.append({ 'lfn': lfn, 'cksum': selfChecksums[entry], 'cktype': entry }) fileCreate.append([ lfn, wmbsFile['size'], wmbsFile['events'], None, wmbsFile["first_event"], wmbsFile['merged'] ]) if len(fileCreate) == 0: return try: self.addFileAction.execute(files=fileCreate, conn=self.getDBConn(), transaction=self.existingTransaction()) if runLumiBinds: self.setFileRunLumi.execute( file=runLumiBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) self.setFileAddChecksum.execute( bulkList=fileCksumBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) self.setFileLocation.execute( lfn=fileLocations, location=self.fileLocation, conn=self.getDBConn(), transaction=self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Error while adding files to WMBS!\n" msg += str(ex) logging.error(msg) logging.debug("Printing binds: \n") logging.debug("FileCreate binds: %s\n" % fileCreate) logging.debug("Runlumi binds: %s\n" % runLumiBinds) logging.debug("Checksum binds: 
%s\n" % fileCksumBinds) logging.debug("FileLocation binds: %s\n" % fileLocations) raise AccountantWorkerException(msg) # Clear out finished files wmbsFilesToBuild = [] return def createFileFromDataStructsFile(self, file, jobID): """ _createFileFromDataStructsFile_ This function will create a WMBS File given a DataStructs file """ wmbsFile = File() wmbsFile.update(file) if isinstance(file["locations"], set): pnn = list(file["locations"])[0] elif isinstance(file["locations"], list): if len(file['locations']) > 1: logging.error( "Have more then one location for a file in job %i" % (jobID)) logging.error("Choosing location %s" % (file['locations'][0])) pnn = file["locations"][0] else: pnn = file["locations"] wmbsFile["locations"] = set() if pnn != None: wmbsFile.setLocation(pnn=pnn, immediateSave=False) wmbsFile['jid'] = jobID return wmbsFile def handleDBSBufferParentage(self): """ _handleDBSBufferParentage_ Handle all the DBSBuffer Parentage in bulk if you can """ outputLFNs = [f['lfn'] for f in self.mergedOutputFiles] bindList = [] for lfn in outputLFNs: newParents = self.findDBSParents(lfn=lfn) for parentLFN in newParents: bindList.append({'child': lfn, 'parent': parentLFN}) # Now all the parents should exist # Commit them to DBSBuffer logging.info("About to commit all DBSBuffer Heritage information") logging.info(len(bindList)) if len(bindList) > 0: try: self.dbsLFNHeritage.execute( binds=bindList, conn=self.getDBConn(), transaction=self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Error while trying to handle the DBS LFN heritage\n" msg += str(ex) msg += "BindList: %s" % bindList logging.error(msg) raise AccountantWorkerException(msg) return def handleSkippedFiles(self): """ _handleSkippedFiles_ Handle all the skipped files in bulk, the way it handles the skipped files imposes an important restriction: Skipped files should have been processed by a single job in the task and no job mask exists in it. 
This is suitable for jobs using ParentlessMergeBySize/FileBased/MinFileBased splitting algorithms. Here ACDC records and created and the file are moved to wmbs_sub_files_failed from completed. """ jobList = self.getFullJobInfo.execute( [{ 'jobid': x } for x in self.jobsWithSkippedFiles.keys()], fileSelection=self.jobsWithSkippedFiles, conn=self.getDBConn(), transaction=self.existingTransaction()) self.dataCollection.failedJobs(jobList, useMask=False) return
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them

    On every polling cycle the jobs found in the createfailed, submitfailed
    and jobfailed states are moved either to the matching cooloff state or
    to retrydone, based on their retry count and (when readFWJR is enabled)
    on the content of their framework job report. Jobs found in retrydone
    are then exhausted: their input files are failed and ACDC records are
    built for them.
    """

    def __init__(self, config):
        """
        Initialise class members

        Reads from config.ErrorHandler:
          maxRetries       - an int, or a dict of job type -> int which
                             must contain a 'default' key (mandatory)
          maxProcessSize   - number of jobs handled per transaction (250)
          failureExitCodes - exit codes that exhaust a job at once ([])
          maxFailTime      - wall time in seconds above which a failed job
                             is exhausted (32 hours)
          readFWJR         - whether to inspect the FWJR of failed jobs
                             (False)
          passExitCodes    - exit codes that skip the cooloff ([])
        and from config.ACDC: couchurl and database.

        Raises ErrorHandlerException if maxRetries has no 'default' entry.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries = self.config.ErrorHandler.maxRetries
        # isinstance (not an exact type comparison) so that dict subclasses
        # are accepted as a per-job-type mapping instead of being wrapped
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default' : self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.exitCodes = getattr(self.config.ErrorHandler, 'failureExitCodes', [])
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname = "Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname = "Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname = "Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        # initialize the alert framework (if available - config.Alert present)
        #    self.sendAlert will be then be available
        self.initAlerts(compName = "ErrorHandler")

        return

    def setup(self, parameters = None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion: move the jobs from retrydone to
        exhausted, fail their input files and build ACDC records for every
        job that is not a LogCollect or Cleanup job.
        """
        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        # Do not build ACDC for utilitarian job types
        jobList = [job for job in jobList if job['type'] not in ['LogCollect', 'Cleanup']]

        self.handleACDC(jobList)

        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries.

        jobList holds jobs currently in the '<state>failed' state, state is
        one of 'create', 'submit' or 'job'. Jobs below their allowed retry
        count go to '<state>cooloff'; the others, and every job when state
        is 'create', go to 'retrydone'. When readFWJR is set the cooloff
        candidates are re-sorted by readFWJRForErrors.
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed", len(jobList), state)
        retrydoneJobs = []
        cooloffJobs = []
        passJobs = []

        # Retries < max retry count
        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default'])
            # Retries < allowed max retry count
            if job['retry_count'] < allowedRetries and state != 'create':
                cooloffJobs.append(job)
            # Check if Retries >= allowed max retry count
            elif job['retry_count'] >= allowedRetries or state == 'create':
                retrydoneJobs.append(job)
                msg = "Stopping retries for job %d" % job['id']
                logging.debug(msg)
                self.sendAlert(4, msg = msg)
                logging.debug("JobInfo: %s", job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        if retrydoneJobs:
            self.changeState.propagate(retrydoneJobs, 'retrydone',
                                       '%sfailed' % state, updatesummary = True)
        if cooloffJobs:
            self.changeState.propagate(cooloffJobs, '%scooloff' % state,
                                       '%sfailed' % state, updatesummary = True)
        if passJobs:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works. The jobs are reloaded with
        their full metadata and their masks before being handed to the
        data collection service.
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs", len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs
        and determine those that can be retried
        and which must be retried without going through cooloff.
        Returns a triplet with cooloff, passed and exhausted jobs.

        A job whose report is missing, unreadable or raises while being
        inspected is sent to cooloff.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []

        for job in jobList:
            report = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error("No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff.",
                              job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error("Failed to find FWJR for job %i in location %s.\n Passing it to cooloff.",
                              job['id'], reportPath)
                cooloffJobs.append(job)
                continue
            try:
                report.load(reportPath)
                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                if startTime is None or stopTime is None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i", job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    msg = "Job %i exhausted after running on node for %i seconds" % (job['id'],
                                                                                      stopTime - startTime)
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                # Fetch the exit codes once; they feed both checks below
                exitCodes = report.getExitCodes()

                if any(code in self.exitCodes for code in exitCodes):
                    msg = "Job %i exhausted due to a bad exit code (%s)" % (job['id'], str(exitCodes))
                    logging.error(msg)
                    self.sendAlert(4, msg = msg)
                    exhaustJobs.append(job)
                    continue

                if any(code in self.passCodes for code in exitCodes):
                    msg = "Job %i restarted immediately due to an exit code (%s)" % (job['id'],
                                                                                    str(exitCodes))
                    # This message used to be built and then dropped
                    logging.debug(msg)
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                # Deliberate best effort: a broken report must not stop the
                # poller, the job simply goes through the normal cooloff
                logging.warning("Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs

    def handleRetryDoneJobs(self, jobList):
        """
        _handleRetryDoneJobs_

        Exhaust the given retrydone jobs inside a single transaction.
        """
        myThread = threading.currentThread()
        logging.debug("About to process %d retry done jobs", len(jobList))
        myThread.transaction.begin()
        self.exhaustJobs(jobList)
        myThread.transaction.commit()

        return

    def handleFailedJobs(self, jobList, state):
        """
        _handleFailedJobs_

        Process the retries of the given '<state>failed' jobs inside a
        single transaction.
        """
        myThread = threading.currentThread()
        logging.debug("About to process %d failures", len(jobList))
        myThread.transaction.begin()
        self.processRetries(jobList, state)
        myThread.transaction.commit()

        return

    def handleErrors(self):
        """
        Queries the DB for jobs in the createfailed, submitfailed and
        jobfailed states and processes their retries, then queries for the
        jobs in retrydone and exhausts them. Both passes work in slices of
        at most maxProcessSize jobs.
        """
        # Run over created, submitted and executed job failures
        failure_states = ['create', 'submit', 'job']
        for state in failure_states:
            idList = self.getJobs.execute(state = "%sfailed" % state)
            logging.info("Found %d failed jobs in state %sfailed", len(idList), state)
            while len(idList) > 0:
                tmpList = idList[:self.maxProcessSize]
                idList = idList[self.maxProcessSize:]
                jobList = self.loadJobsFromList(tmpList)
                self.handleFailedJobs(jobList, state)

        # Run over jobs done with retries
        idList = self.getJobs.execute(state = 'retrydone')
        logging.info("Found %d jobs done with all retries", len(idList))
        while len(idList) > 0:
            tmpList = idList[:self.maxProcessSize]
            idList = idList[self.maxProcessSize:]
            jobList = self.loadJobsFromList(tmpList)
            self.handleRetryDoneJobs(jobList)

        return

    def _loadJobsWithDAO(self, dao, idList):
        """
        Run the given job-loading DAO for idList and wrap every returned
        row into a Job object.
        """
        binds = [{"jobid": jobID} for jobID in idList]

        results = dao.execute(jobID = binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id = entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    def loadJobsFromList(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk
        """
        return self._loadJobsWithDAO(self.idLoad, idList)

    def loadJobsFromListFull(self, idList):
        """
        _loadJobsFromListFull_

        Load jobs in bulk.
        Include the full metadata.
        """
        return self._loadJobsWithDAO(self.loadAction, idList)

    def algorithm(self, parameters = None):
        """
        Performs the handleErrors method, looking for each type of failure
        And deal with it as desired.

        A WMException is re-raised after a best-effort rollback. A
        CouchConnectionError is logged and swallowed. Any other exception
        is logged, sent as an alert and re-raised as an
        ErrorHandlerException after rolling back an open transaction.
        """
        logging.debug("Running error handling algorithm")
        myThread = threading.currentThread()
        try:
            self.handleErrors()
        except WMException:
            try:
                myThread.transaction.rollback()
            except Exception as rollbackEx:
                # Best effort only: the original WMException is what the
                # caller must see, but do not hide the rollback failure
                logging.error("Failed to rollback transaction in ErrorHandler: %s", str(rollbackEx))
            raise
        except CouchConnectionError as ex:
            msg = "Caught CouchConnectionError exception in ErrorHandler\n"
            msg += "transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.exception(msg)
        except Exception as ex:
            msg = "Caught exception in ErrorHandler\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            msg += "\n\n"
            logging.error(msg)
            self.sendAlert(6, msg = msg)
            if getattr(myThread, 'transaction', None) is not None \
                    and getattr(myThread.transaction, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise ErrorHandlerException(msg)
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs

    The number of lumis put in a job is derived from the requested number
    of events per job and from the average number of events per lumi of
    each file (events / lumi count, rounded).

    Keyword arguments read here:
      events_per_job  - target events per job (default 5000); must be
                        positive
      job_limit       - if non-zero, a RuntimeError is raised once more
                        jobs than this have been created (default 0)
      job_time_limit  - seconds (default self.defaultJobTimeLimit); a file
                        with exactly one lumi whose estimated time exceeds
                        it produces a job created with failedJob=True
      total_events    - if positive, stop once the accumulated average
                        event count reaches it (default 0)
      halt_job_on_file_boundaries - start a new job on every file
                        (default False)
      collectionName, couchURL, couchDB, filesetName - when collectionName
                        is set, the lumi whitelist is loaded through
                        DataCollectionService.getLumiWhitelist
      splitOnRun      - start a new job whenever the run changes
                        (default True)
      include_parents - attach the parent files returned by
                        self.findParent to every file (default False)
      runWhitelist    - if not empty, runs outside it are skipped
      runs, lumis     - when both are given they are turned into the good
                        run list with buildLumiMask
      applyLumiCorrection - passed to LumiChecker (default False)
      deterministicPileup - add a skipPileupEvents baggage parameter to
                        every new job (default False)
      performance     - passed to self.getPerformanceParameters

    Returns nothing; jobs are created through self.newGroup/self.newJob.
    Returns early, without creating jobs, when the lumi whitelist cannot
    be loaded or when getFilesSortedByLocation returns nothing.

    Raises RuntimeError when events_per_job is not positive or when the
    job limit is exceeded.
    """

    avgEventsPerJob = int(kwargs.get('events_per_job', 5000))
    jobLimit = int(kwargs.get('job_limit', 0))
    jobTimeLimit = int(
        kwargs.get('job_time_limit', self.defaultJobTimeLimit))
    totalEvents = int(kwargs.get('total_events', 0))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', False))
    self.collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)
    applyLumiCorrection = bool(kwargs.get('applyLumiCorrection', False))
    deterministicPileup = kwargs.get('deterministicPileup', False)

    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))

    eventsPerLumiInDataset = 0

    if avgEventsPerJob <= 0:
        msg = "events_per_job parameter must be positive. Its value is: %d" % avgEventsPerJob
        raise RuntimeError(msg)

    if self.package == 'WMCore.WMBS':
        self.loadRunLumi = self.daoFactory(
            classname="Files.GetBulkRunLumi")
        if deterministicPileup:
            getJobNumber = self.daoFactory(
                classname="Jobs.GetNumberOfJobsPerWorkflow")
            self.nJobs = getJobNumber.execute(
                workflow=self.subscription.getWorkflow().id)
            logging.info(
                'Creating jobs in DeterministicPileup mode for %s',
                self.subscription.workflowName())

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if self.collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')

            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(self.collectionName,
                                               filesetName)
        except Exception as ex:
            # Without the whitelist no job is created at all in this call
            msg = "Exception while trying to load goodRunList. "
            msg += "Refusing to create any jobs.\nDetails: %s" % str(ex)
            logging.exception(msg)
            return

    lDict = self.getFilesSortedByLocation(avgEventsPerJob)
    if not lDict:
        logging.info(
            "There are not enough events/files to be splitted. Trying again next cycle"
        )
        return

    # Per location: the files that have at least one lumi, annotated with
    # lumiCount, lowestRun and avgEvtsPerLumi and sorted by lowest run
    locationDict = {}
    for key in lDict.keys():
        newlist = []
        # First we need to load the data
        if self.loadRunLumi:
            fileLumis = self.loadRunLumi.execute(files=lDict[key])
            if not fileLumis:
                logging.warning(
                    "Empty fileLumis dict for workflow %s, subs %s.",
                    self.subscription.workflowName(),
                    self.subscription['id'])
            for f in lDict[key]:
                lumiDict = fileLumis.get(f['id'], {})
                for run in lumiDict.keys():
                    f.addRun(run=Run(run, *lumiDict[run]))

        for f in lDict[key]:
            if len(f['runs']) == 0:
                continue
            f['runs'] = sorted(f['runs'])
            f['lumiCount'] = 0
            for run in f['runs']:
                run.lumis.sort()
                f['lumiCount'] += len(run.lumis)
            f['lowestRun'] = f['runs'][0]
            # Do average event per lumi calculation
            if f['lumiCount']:
                f['avgEvtsPerLumi'] = round(
                    float(f['events']) / f['lumiCount'])
                if deterministicPileup:
                    # We assume that all lumis are equal in the dataset
                    eventsPerLumiInDataset = f['avgEvtsPerLumi']
            else:
                # No lumis in the file, ignore it
                continue
            newlist.append(f)
        locationDict[key] = sorted(newlist,
                                   key=operator.itemgetter('lowestRun'))

    # State carried across files, runs and lumis while building the jobs.
    # firstLumi/lastLumi delimit the contiguous lumi range not yet written
    # to the mask of the current job.
    totalJobs = 0
    lastLumi = None
    firstLumi = None
    lastRun = None
    lumisInJob = 0
    totalAvgEventCount = 0
    currentJobAvgEventCount = 0
    stopTask = False
    self.lumiChecker = LumiChecker(applyLumiCorrection)
    for location in locationDict:

        # For each location, we need a new jobGroup
        self.newGroup()
        stopJob = True
        for f in locationDict[location]:

            if getParents:
                parentLFNs = self.findParent(lfn=f['lfn'])
                for lfn in parentLFNs:
                    parent = File(lfn=lfn)
                    f['parents'].add(parent)

            lumisInJobInFile = 0
            updateSplitOnJobStop = False
            failNextJob = False
            # If estimated job time is higher the job time limit (condor limit)
            # and it's only one lumi then ditch that lumi
            timePerLumi = f['avgEvtsPerLumi'] * timePerEvent
            if timePerLumi > jobTimeLimit and f['lumiCount'] == 1:
                failNextJob = True
                stopJob = True
                lumisPerJob = 1
            elif splitOnFile:
                # Then we have to split on every boundary
                stopJob = True
                # Check the average number of events per lumi in this file
                # Adapt the lumis per job to match the target conditions
                if f['avgEvtsPerLumi']:
                    # If there are events in the file
                    ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi']
                    lumisPerJob = max(int(math.floor(ratio)), 1)
                else:
                    # Zero event file, then the ratio goes to infinity. Computers don't like that
                    lumisPerJob = f['lumiCount']
            else:
                # Analyze how many events does this job already has
                # Check how many we want as target, include as many lumi sections as possible
                updateSplitOnJobStop = True
                eventsRemaining = max(
                    avgEventsPerJob - currentJobAvgEventCount, 0)
                if f['avgEvtsPerLumi']:
                    lumisAllowed = int(
                        math.floor(
                            float(eventsRemaining) / f['avgEvtsPerLumi']))
                else:
                    lumisAllowed = f['lumiCount']
                lumisPerJob = max(lumisInJob + lumisAllowed, 1)

            for run in f['runs']:
                if not isGoodRun(goodRunList=goodRunList, run=run.run):
                    # Then skip this one
                    continue
                if len(runWhitelist) > 0 and not run.run in runWhitelist:
                    # Skip due to run whitelist
                    continue
                firstLumi = None

                if splitOnRun and run.run != lastRun:
                    # Then we need to kill this job and get a new one
                    stopJob = True

                # Now loop over the lumis
                for lumi in run:
                    if (not isGoodLumi(goodRunList, run=run.run, lumi=lumi)
                            or self.lumiChecker.isSplitLumi(
                                run.run, lumi, f)):
                        # Kill the chain of good lumis
                        # Skip this lumi
                        if firstLumi != None and firstLumi != lumi:
                            self.currentJob['mask'].addRunAndLumis(
                                run=run.run, lumis=[firstLumi, lastLumi])
                            eventsAdded = ((lastLumi - firstLumi + 1) *
                                           f['avgEvtsPerLumi'])
                            runAddedTime = eventsAdded * timePerEvent
                            runAddedSize = eventsAdded * sizePerEvent
                            self.currentJob.addResourceEstimates(
                                jobTime=runAddedTime, disk=runAddedSize)
                            firstLumi = None
                            lastLumi = None
                        continue

                    # You have to kill the lumi chain if they're not continuous
                    if lastLumi and not lumi == lastLumi + 1:
                        self.currentJob['mask'].addRunAndLumis(
                            run=run.run, lumis=[firstLumi, lastLumi])
                        eventsAdded = ((lastLumi - firstLumi + 1) *
                                       f['avgEvtsPerLumi'])
                        runAddedTime = eventsAdded * timePerEvent
                        runAddedSize = eventsAdded * sizePerEvent
                        self.currentJob.addResourceEstimates(
                            jobTime=runAddedTime, disk=runAddedSize)
                        firstLumi = None
                        lastLumi = None

                    if firstLumi is None:
                        # Set the first lumi in the run
                        firstLumi = lumi

                    # If we're full, end the job
                    if lumisInJob == lumisPerJob:
                        stopJob = True
                    # Actually do the new job creation
                    if stopJob:
                        # Flush the pending lumi range into the job being
                        # closed; it belongs to the previous run (lastRun)
                        if firstLumi != None and lastLumi != None and lastRun != None:
                            self.currentJob['mask'].addRunAndLumis(
                                run=lastRun, lumis=[firstLumi, lastLumi])
                            eventsAdded = ((lastLumi - firstLumi + 1) *
                                           f['avgEvtsPerLumi'])
                            runAddedTime = eventsAdded * timePerEvent
                            runAddedSize = eventsAdded * sizePerEvent
                            self.currentJob.addResourceEstimates(
                                jobTime=runAddedTime, disk=runAddedSize)
                        msg = None
                        if failNextJob:
                            msg = "File %s has a single lumi %s, in run %s " % (
                                f['lfn'], lumi, run.run)
                            msg += "with too many events %d and it woud take %d sec to run" \
                                   % (f['events'], timePerLumi)
                        self.lumiChecker.closeJob(self.currentJob)
                        self.newJob(name=self.getJobName(),
                                    failedJob=failNextJob,
                                    failedReason=msg)
                        if deterministicPileup:
                            # NOTE(review): self.nJobs is only assigned in
                            # this method when self.package is
                            # 'WMCore.WMBS' - confirm it is set otherwise
                            skipEvents = (
                                self.nJobs -
                                1) * lumisPerJob * eventsPerLumiInDataset
                            self.currentJob.addBaggageParameter(
                                "skipPileupEvents", skipEvents)
                        self.currentJob.addResourceEstimates(
                            memory=memoryRequirement)
                        failNextJob = False
                        firstLumi = lumi
                        lumisInJob = 0
                        lumisInJobInFile = 0
                        currentJobAvgEventCount = 0
                        totalJobs += 1
                        if jobLimit and totalJobs > jobLimit:
                            msg = "Job limit of {0} jobs exceeded.".format(
                                jobLimit)
                            raise RuntimeError(msg)

                        # Add the file to new jobs
                        self.currentJob.addFile(f)

                        if updateSplitOnJobStop:
                            # Then we were carrying from a previous file
                            # Reset calculations for this file
                            updateSplitOnJobStop = False
                            if f['avgEvtsPerLumi']:
                                ratio = float(
                                    avgEventsPerJob) / f['avgEvtsPerLumi']
                                lumisPerJob = max(int(math.floor(ratio)), 1)
                            else:
                                lumisPerJob = f['lumiCount']

                    lumisInJob += 1
                    lumisInJobInFile += 1
                    lastLumi = lumi
                    stopJob = False
                    lastRun = run.run

                    totalAvgEventCount += f['avgEvtsPerLumi']

                    if self.currentJob and not f in self.currentJob[
                            'input_files']:
                        self.currentJob.addFile(f)

                    # We stop here if there are more total events than requested.
                    if totalEvents > 0 and totalAvgEventCount >= totalEvents:
                        stopTask = True
                        break

                if firstLumi != None and lastLumi != None:
                    # Add this run to the mask
                    self.currentJob['mask'].addRunAndLumis(
                        run=run.run, lumis=[firstLumi, lastLumi])
                    eventsAdded = ((lastLumi - firstLumi + 1) *
                                   f['avgEvtsPerLumi'])
                    runAddedTime = eventsAdded * timePerEvent
                    runAddedSize = eventsAdded * sizePerEvent
                    self.currentJob.addResourceEstimates(
                        jobTime=runAddedTime, disk=runAddedSize)
                    firstLumi = None
                    lastLumi = None

                if stopTask:
                    break

            if not splitOnFile:
                # Remember how many events this file contributed to the
                # job that is still open, for the next file's calculation
                currentJobAvgEventCount += f[
                    'avgEvtsPerLumi'] * lumisInJobInFile

            if stopTask:
                break

        if stopTask:
            break

    self.lumiChecker.closeJob(self.currentJob)
    self.lumiChecker.fixInputFiles()
    return
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs

    Keyword arguments read here:
      lumis_per_job   - number of lumis put in each job (default 1)
      total_lumis     - if positive, stop once this many lumis have been
                        assigned to jobs (default 0)
      halt_job_on_file_boundaries - start a new job on every file
                        (default False)
      collectionName, couchURL, couchDB, filesetName - when collectionName
                        is set, the lumi whitelist is loaded through
                        DataCollectionService.getLumiWhitelist
      splitOnRun      - start a new job whenever the run changes
                        (default True)
      include_parents - attach the parent files returned by
                        self.findParent to every file (default False)
      runWhitelist    - if not empty, runs outside it are skipped
      runs, lumis     - when both are given they are turned into the good
                        run list with buildLumiMask
      deterministicPileup - add a skipPileupEvents baggage parameter to
                        every new job (default False)
      applyLumiCorrection - passed to LumiChecker (default False)
      performance     - passed to self.getPerformanceParameters

    Returns nothing; jobs are created through self.newGroup/self.newJob.
    Returns early, without creating jobs, when the lumi whitelist cannot
    be loaded or when getFilesSortedByLocation returns nothing.
    """

    lumisPerJob = int(kwargs.get('lumis_per_job', 1))
    totalLumis = int(kwargs.get('total_lumis', 0))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', False))
    self.collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)
    deterministicPileup = kwargs.get('deterministicPileup', False)
    applyLumiCorrection = bool(kwargs.get('applyLumiCorrection', False))
    eventsPerLumiInDataset = 0

    if self.package == 'WMCore.WMBS':
        self.loadRunLumi = self.daoFactory(
            classname="Files.GetBulkRunLumi")
        if deterministicPileup:
            getJobNumber = self.daoFactory(
                classname="Jobs.GetNumberOfJobsPerWorkflow")
            self.nJobs = getJobNumber.execute(
                workflow=self.subscription.getWorkflow().id)

    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if self.collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')

            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(self.collectionName,
                                               filesetName)
        except Exception as ex:
            # Without the whitelist no job is created at all in this call
            msg = "Exception while trying to load goodRunList. "
            msg += "Refusing to create any jobs.\nDetails: %s" % str(ex)
            logging.exception(msg)
            return

    lDict = self.getFilesSortedByLocation(lumisPerJob)
    if not lDict:
        logging.info(
            "There are not enough lumis/files to be splitted. Trying again next cycle"
        )
        return

    # Per location: the files that have at least one lumi, annotated with
    # lumiCount, lowestRun and avgEvtsPerLumi and sorted by lowest run
    locationDict = {}
    for key in lDict.keys():
        newlist = []
        for f in lDict[key]:
            # if hasattr(f, 'loadData'):
            #    f.loadData()
            if len(f['runs']) == 0:
                continue
            f['lumiCount'] = 0
            f['runs'] = sorted(f['runs'])
            for run in f['runs']:
                run.lumis.sort()
                f['lumiCount'] += len(run.lumis)
            f['lowestRun'] = f['runs'][0]
            # Do average event per lumi calculation
            if f['lumiCount']:
                f['avgEvtsPerLumi'] = round(
                    float(f['events']) / f['lumiCount'])
                if deterministicPileup:
                    # We assume that all lumis are equal in the dataset
                    eventsPerLumiInDataset = f['avgEvtsPerLumi']
            else:
                # No lumis in the file, ignore it
                continue
            newlist.append(f)
        locationDict[key] = sorted(newlist,
                                   key=operator.itemgetter('lowestRun'))

    # Split files into jobs with each job containing
    # EXACTLY lumisPerJob number of lumis (except for maybe the last one)

    # State carried across files, runs and lumis while building the jobs.
    # firstLumi/lastLumi delimit the contiguous lumi range not yet written
    # to the mask of the current job.
    totalJobs = 0
    lastLumi = None
    firstLumi = None
    stopJob = True
    stopTask = False
    lastRun = None
    lumisInJob = 0
    lumisInTask = 0
    self.lumiChecker = LumiChecker(applyLumiCorrection)
    for location in locationDict.keys():

        # For each location, we need a new jobGroup
        self.newGroup()
        stopJob = True
        for f in locationDict[location]:

            if getParents:
                parentLFNs = self.findParent(lfn=f['lfn'])
                for lfn in parentLFNs:
                    parent = File(lfn=lfn)
                    f['parents'].add(parent)

            if splitOnFile:
                # Then we have to split on every boundary
                stopJob = True

            for run in f['runs']:
                if not isGoodRun(goodRunList=goodRunList, run=run.run):
                    # Then skip this one
                    continue
                if len(runWhitelist) > 0 and not run.run in runWhitelist:
                    # Skip due to run whitelist
                    continue
                firstLumi = None

                if splitOnRun and run.run != lastRun:
                    # Then we need to kill this job and get a new one
                    stopJob = True

                # Now loop over the lumis
                for lumi in run:
                    # splitLumi checks if the lumi is split across jobs
                    if (not isGoodLumi(goodRunList, run=run.run, lumi=lumi)
                            or self.lumiChecker.isSplitLumi(
                                run.run, lumi, f)):
                        # Kill the chain of good lumis
                        # Skip this lumi
                        if firstLumi != None and firstLumi != lumi:
                            self.currentJob['mask'].addRunAndLumis(
                                run=run.run, lumis=[firstLumi, lastLumi])
                            addedEvents = ((lastLumi - firstLumi + 1) *
                                           f['avgEvtsPerLumi'])
                            runAddedTime = addedEvents * timePerEvent
                            runAddedSize = addedEvents * sizePerEvent
                            self.currentJob.addResourceEstimates(
                                jobTime=runAddedTime, disk=runAddedSize)
                            firstLumi = None
                            lastLumi = None
                        continue

                    # You have to kill the lumi chain if they're not continuous
                    if lastLumi and not lumi == lastLumi + 1:
                        self.currentJob['mask'].addRunAndLumis(
                            run=run.run, lumis=[firstLumi, lastLumi])
                        addedEvents = ((lastLumi - firstLumi + 1) *
                                       f['avgEvtsPerLumi'])
                        runAddedTime = addedEvents * timePerEvent
                        runAddedSize = addedEvents * sizePerEvent
                        self.currentJob.addResourceEstimates(
                            jobTime=runAddedTime, disk=runAddedSize)
                        firstLumi = None
                        lastLumi = None

                    if firstLumi is None:
                        # Set the first lumi in the run
                        firstLumi = lumi

                    # If we're full, end the job
                    if lumisInJob == lumisPerJob:
                        stopJob = True
                    # Actually do the new job creation
                    if stopJob:
                        # Flush the pending lumi range into the job being
                        # closed; it belongs to the previous run (lastRun)
                        if firstLumi != None and lastLumi != None and lastRun != None:
                            self.currentJob['mask'].addRunAndLumis(
                                run=lastRun, lumis=[firstLumi, lastLumi])
                            addedEvents = ((lastLumi - firstLumi + 1) *
                                           f['avgEvtsPerLumi'])
                            runAddedTime = addedEvents * timePerEvent
                            runAddedSize = addedEvents * sizePerEvent
                            self.currentJob.addResourceEstimates(
                                jobTime=runAddedTime, disk=runAddedSize)
                        # before creating a new job add the lumis of the current one to the checker
                        self.lumiChecker.closeJob(self.currentJob)
                        self.newJob(name=self.getJobName())
                        self.currentJob.addResourceEstimates(
                            memory=memoryRequirement)
                        if deterministicPileup:
                            # NOTE(review): self.nJobs is only assigned in
                            # this method when self.package is
                            # 'WMCore.WMBS' - confirm it is set otherwise
                            skipEvents = (
                                self.nJobs -
                                1) * lumisPerJob * eventsPerLumiInDataset
                            self.currentJob.addBaggageParameter(
                                "skipPileupEvents", skipEvents)
                        firstLumi = lumi
                        lumisInJob = 0
                        totalJobs += 1

                        # Add the file to new jobs
                        self.currentJob.addFile(f)

                    lumisInJob += 1
                    lumisInTask += 1
                    lastLumi = lumi
                    stopJob = False
                    lastRun = run.run

                    if self.currentJob and not f in self.currentJob[
                            'input_files']:
                        self.currentJob.addFile(f)

                    # Stop once the requested total number of lumis is reached
                    if totalLumis > 0 and lumisInTask >= totalLumis:
                        stopTask = True
                        break

                if firstLumi != None and lastLumi != None:
                    # Add this run to the mask
                    self.currentJob['mask'].addRunAndLumis(
                        run=run.run, lumis=[firstLumi, lastLumi])
                    addedEvents = ((lastLumi - firstLumi + 1) *
                                   f['avgEvtsPerLumi'])
                    runAddedTime = addedEvents * timePerEvent
                    runAddedSize = addedEvents * sizePerEvent
                    self.currentJob.addResourceEstimates(
                        jobTime=runAddedTime, disk=runAddedSize)
                    firstLumi = None
                    lastLumi = None

                if stopTask:
                    break

            if stopTask:
                break

        if stopTask:
            break

    self.lumiChecker.closeJob(self.currentJob)
    self.lumiChecker.fixInputFiles()
    return
class ErrorHandlerTest(unittest.TestCase):
    """
    TestCase for TestErrorHandler module

    Each test creates a group of WMBS jobs, moves them into one of the
    failed states with ChangeState, runs the ErrorHandlerPoller and then
    checks which states the jobs ended up in.
    """

    def setUp(self):
        """
        Set up the database schema, the couch databases and the DAOs
        shared by all tests.
        """
        myThread = threading.currentThread()

        self.testInit = TestInitCouchApp(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase = True)
        self.testInit.setSchema(customModules = ["WMCore.WMBS"], useDefault = False)
        self.testInit.setupCouch("errorhandler_t", "GroupUser", "ACDC")
        self.testInit.setupCouch("errorhandler_t_jd/jobs", "JobDump")
        self.testInit.setupCouch("errorhandler_t_jd/fwjrs", "FWJRDump")

        self.daofactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.getJobs = self.daofactory(classname = "Jobs.GetAllJobs")
        self.setJobTime = self.daofactory(classname = "Jobs.SetStateTime")
        locationAction = self.daofactory(classname = "Locations.New")
        locationAction.execute(siteName = "malpaquet", seName = "malpaquet")
        self.testDir = self.testInit.generateWorkDir()
        self.nJobs = 10

        self.dataCS = DataCollectionService(url = self.testInit.couchUrl,
                                            database = "errorhandler_t")

        return

    def tearDown(self):
        """
        Database deletion
        """
        self.testInit.clearDatabase()
        self.testInit.delWorkDir()
        self.testInit.tearDownCouch()
        return

    def getConfig(self):
        """
        _getConfig_

        Build the configuration handed to ChangeState and ErrorHandlerPoller.
        """
        config = Configuration()

        # First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", self.testDir)
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        config.component_("ErrorHandler")
        # The log level of the component.
        config.ErrorHandler.logLevel = 'DEBUG'
        # The namespace of the component
        config.ErrorHandler.namespace = 'WMComponent.ErrorHandler.ErrorHandler'
        # maximum number of threads we want to deal
        # with messages per pool.
        config.ErrorHandler.maxThreads = 30
        # maximum number of retries we want for job
        config.ErrorHandler.maxRetries = 5
        # The poll interval at which to look for failed jobs
        config.ErrorHandler.pollInterval = 60

        # JobStateMachine
        config.component_('JobStateMachine')
        config.JobStateMachine.couchurl = os.getenv('COUCHURL', None)
        config.JobStateMachine.couchDBName = "errorhandler_t_jd"

        config.section_('ACDC')
        config.ACDC.couchurl = self.testInit.couchUrl
        config.ACDC.database = "errorhandler_t"

        return config

    def createWorkload(self, workloadName = 'Test', emulator = True):
        """
        _createTestWorkload_

        Creates a test workload for us to run on, hold the basic necessities.

        Both arguments are accepted for interface compatibility but are not
        used: the workload is always built from testWorkload("Tier1ReReco")
        and written below <testDir>/workloadTest.
        """
        workload = testWorkload("Tier1ReReco")

        # Add RequestManager stuff
        workload.data.request.section_('schema')
        workload.data.request.schema.Requestor = 'nobody'
        workload.data.request.schema.Group = 'testers'

        taskMaker = TaskMaker(workload, os.path.join(self.testDir, 'workloadTest'))
        taskMaker.skipSubscription = True
        taskMaker.processWorkload()

        return workload

    def createTestJobGroup(self, nJobs = 10, retry_count = 1,
                           workloadPath = 'test', fwjrPath = None,
                           workloadName = None):
        """
        Creates a group of several jobs

        nJobs        -- number of jobs to create in the group
        retry_count  -- value stored as 'retry_count' on every job
        workloadPath -- spec path given to the Workflow
        fwjrPath     -- value stored as 'fwjr_path' on every job
        workloadName -- name of the Workflow; a fresh UUID is generated for
                        each call when it is not given

        Returns the committed JobGroup.
        """
        # The default used to be makeUUID() in the signature, which is
        # evaluated only once when the method is defined, so every call
        # silently shared the same name.
        if workloadName is None:
            workloadName = makeUUID()

        myThread = threading.currentThread()
        myThread.transaction.begin()
        testWorkflow = Workflow(spec = workloadPath, owner = "cmsdataops", group = "cmsdataops",
                                name = workloadName, task="/TestWorkload/ReReco")
        testWorkflow.create()

        testWMBSFileset = Fileset(name = "TestFileset")
        testWMBSFileset.create()

        testSubscription = Subscription(fileset = testWMBSFileset,
                                        workflow = testWorkflow)
        testSubscription.create()

        testJobGroup = JobGroup(subscription = testSubscription)
        testJobGroup.create()

        testFile0 = File(lfn = "/this/is/a/parent", size = 1024, events = 10)
        testFile0.addRun(Run(10, *[12312]))
        testFile0.setLocation('malpaquet')

        testFileA = File(lfn = "/this/is/a/lfnA", size = 1024, events = 10,
                         first_event = 88, last_event = 99)
        testFileA.addRun(Run(10, *[12312, 12313]))
        testFileA.setLocation('malpaquet')

        testFileB = File(lfn = "/this/is/a/lfnB", size = 1024, events = 10,
                         first_event = 88, last_event = 99)
        testFileB.addRun(Run(10, *[12314, 12315, 12316]))
        testFileB.setLocation('malpaquet')

        testFile0.create()
        testFileA.create()
        testFileB.create()

        testFileA.addParent(lfn = "/this/is/a/parent")
        testFileB.addParent(lfn = "/this/is/a/parent")

        for i in range(0, nJobs):
            testJob = Job(name = makeUUID())
            testJob['retry_count'] = retry_count
            testJob['retry_max'] = 10
            testJob['mask'].addRunAndLumis(run = 10, lumis = [12312])
            testJob['mask'].addRunAndLumis(run = 10, lumis = [12314, 12316])
            testJob['mask']['FirstEvent'] = 100
            testJob['cache_dir'] = os.path.join(self.testDir, testJob['name'])
            testJob['fwjr_path'] = fwjrPath
            os.mkdir(testJob['cache_dir'])
            testJobGroup.add(testJob)
            testJob.create(group = testJobGroup)
            testJob.addFile(testFileA)
            testJob.addFile(testFileB)
            testJob.save()

        testJobGroup.commit()

        testSubscription.acquireFiles(files = [testFileA, testFileB])
        testSubscription.save()
        myThread.transaction.commit()

        return testJobGroup

    def testA_Create(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testCreate()

        Mimics creation of component and test jobs failed in create stage.
        """
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest',
                                    'TestWorkload', 'WMSandbox',
                                    'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath,
                                               workloadName = workloadName)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), self.nJobs)

        changer.propagate(testJobGroup.jobs, 'new', 'CreateCooloff')
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')

        # Now exhaust them
        for job in testJobGroup.jobs:
            job['retry_count'] = 6
            job.save()
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        # Check that it showed up in ACDC
        collection = self.dataCS.getDataCollection(workloadName)

        # Now look at what's inside
        self.assertTrue(len(collection['filesets']) > 0)
        for fileset in collection["filesets"]:
            counter = 0
            for f in fileset.listFiles():
                counter += 1
                self.assertTrue(f['lfn'] in ["/this/is/a/lfnA", "/this/is/a/lfnB"])
                self.assertEqual(f['events'], 10)
                self.assertEqual(f['size'], 1024)
                self.assertEqual(f['parents'], [u'/this/is/a/parent'])
                self.assertTrue(f['runs'][0]['lumis'] in [[12312], [12314, 12315, 12316]],
                                "Unknown lumi %s" % f['runs'][0]['lumis'])
                # NOTE(review): assertTrue treats its second argument as the
                # failure message, so these three only check truthiness.
                # They probably want assertEqual; confirm what ACDC returns
                # for these fields before tightening them.
                self.assertTrue(f['merged'], 1)
                self.assertTrue(f['first_event'], 88)
                self.assertTrue(f['last_event'], 99)
            self.assertEqual(counter, 20)
        return

    def testB_Submit(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testSubmit()

        Mimics creation of component and test jobs failed in submit stage.
        """
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest',
                                    'TestWorkload', 'WMSandbox',
                                    'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')

        idList = self.getJobs.execute(state = 'SubmitFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'SubmitFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return

    def testC_Jobs(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t.testJobs()

        Mimics creation of component and test jobs failed in execute stage.
        """
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest',
                                    'TestWorkload', 'WMSandbox',
                                    'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'JobCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return

    def testD_Exhausted(self):
        """
        _testExhausted_

        Test that the system can exhaust jobs correctly
        """
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest',
                                    'TestWorkload', 'WMSandbox',
                                    'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               retry_count = 5,
                                               workloadPath = workloadPath)

        config = self.getConfig()
        config.ErrorHandler.maxRetries = 1
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testSubscription = Subscription(id = 1) # You should only have one
        testSubscription.load()
        testSubscription.loadData()

        # Do we have files to start with?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'JobCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        # Did we fail the files?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
        self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)

    def testE_FailJobs(self):
        """
        _FailJobs_

        Test our ability to fail jobs based on the information in the FWJR
        """
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest',
                                    'TestWorkload', 'WMSandbox',
                                    'WMWorkload.pkl')

        fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                                "WMComponent_t/JobAccountant_t",
                                "fwjrs/badBackfillJobReport.pkl")

        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath,
                                               fwjrPath = fwjrPath)

        config = self.getConfig()
        config.ErrorHandler.readFWJR = True
        config.ErrorHandler.failureExitCodes = [8020]
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        # This should exhaust all jobs due to exit code
        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'JobCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        config.ErrorHandler.failureExitCodes = []
        config.ErrorHandler.maxFailTime = -10
        testErrorHandler2 = ErrorHandlerPoller(config)

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler2.algorithm(None)

        # This should exhaust all jobs due to timeout
        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'JobCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        config.ErrorHandler.maxFailTime = 24 * 3600
        config.ErrorHandler.passExitCodes = [8020]
        testErrorHandler3 = ErrorHandlerPoller(config)

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler3.algorithm(None)

        idList = self.getJobs.execute(state = 'Created')
        self.assertEqual(len(idList), self.nJobs)

        return

    def testZ_Profile(self):
        """
        _testProfile_

        Do a full profile of the poller
        """
        # The immediate return disables this test; everything below it is
        # never executed and is kept only as a template for manual profiling.
        return

        import cProfile, pstats

        nJobs = 1000

        testJobGroup = self.createTestJobGroup(nJobs = nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        startTime = time.time()
        #cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(), filename = "profStats.stat")
        testErrorHandler.algorithm()
        stopTime = time.time()

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs

    The number of lumis put in a job is derived from the average number of
    events per lumi of each file, so that jobs hold roughly
    'events_per_job' events.  Jobs are created through self.newGroup() and
    self.newJob(); nothing is returned.

    Recognised keyword arguments (with their defaults):

    events_per_job (5000)  -- target average number of events per job
    max_events_per_lumi (20000) -- a file holding a single lumi whose
        average events per lumi exceeds this gets its job created with
        failedJob = True
    total_events (0)       -- when > 0, stop creating work once the summed
        average event count reaches this value
    halt_job_on_file_boundaries (True) -- start a new job at every file
    splitOnRun (True)      -- start a new job whenever the run changes
    include_parents (False) -- attach the parents returned by
        self.findParent() to every file
    runWhitelist ([])      -- when non-empty, only these runs are used
    runs, lumis (None)     -- when both are set they are passed to
        buildLumiMask() to build the good run/lumi list
    collectionName, filesetName, couchURL, couchDB, owner, group
        -- when collectionName is set, the good run/lumi list is loaded
        from ACDC via DataCollectionService.getLumiWhitelist()
    ignore_acdc_except (False) -- if loading from ACDC raises: True means
        continue with an empty good run/lumi list, False means log the
        error and return without creating any job
    performance ({})       -- passed to self.getPerformanceParameters()
    deterministicPileup (False) -- add a 'skipPileupEvents' baggage
        parameter to every new job
    """
    avgEventsPerJob = int(kwargs.get('events_per_job', 5000))
    eventLimit = int(kwargs.get('max_events_per_lumi', 20000))
    totalEvents = int(kwargs.get('total_events', 0))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
    ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
    collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)
    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))
    deterministicPileup = kwargs.get('deterministicPileup', False)
    eventsPerLumiInDataset = 0

    if deterministicPileup and self.package == 'WMCore.WMBS':
        # self.nJobs feeds the skipPileupEvents computation further down
        getJobNumber = self.daoFactory(classname = "Jobs.GetNumberOfJobsPerWorkflow")
        jobNumber = getJobNumber.execute(workflow = self.subscription.getWorkflow().id)
        self.nJobs = jobNumber

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    # (this replaces any goodRunList built from runs/lumis above)
    if collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            collectionName = kwargs.get('collectionName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')
            logging.info('Creating jobs for ACDC fileset %s' % filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
        except Exception as ex:
            msg = "Exception while trying to load goodRunList\n"
            if ignoreACDC:
                # Carry on without any run/lumi restriction
                msg += "Ditching goodRunList\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                goodRunList = {}
            else:
                # Give up: no jobs are created at all
                msg += "Refusing to create any jobs.\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                return

    lDict = self.sortByLocation()
    locationDict = {}

    # First we need to load the data
    if self.package == 'WMCore.WMBS':
        loadRunLumi = self.daoFactory(classname = "Files.GetBulkRunLumi")

    # Build, per location, the list of usable files sorted by lowest run
    for key in lDict.keys():
        newlist = []
        # First we need to load the data
        if self.package == 'WMCore.WMBS':
            fileLumis = loadRunLumi.execute(files = lDict[key])
            for f in lDict[key]:
                lumiDict = fileLumis.get(f['id'], {})
                for run in lumiDict.keys():
                    f.addRun(run = Run(run, *lumiDict[run]))

        for f in lDict[key]:
            if len(f['runs']) == 0:
                # Files without run information are dropped
                continue
            f['runs'] = sorted(f['runs'])
            f['lumiCount'] = 0
            for run in f['runs']:
                run.lumis.sort()
                f['lumiCount'] += len(run.lumis)
            f['lowestRun'] = f['runs'][0]
            # Do average event per lumi calculation
            if f['lumiCount']:
                f['avgEvtsPerLumi'] = round(float(f['events'])/f['lumiCount'])
                if deterministicPileup:
                    # We assume that all lumis are equal in the dataset
                    eventsPerLumiInDataset = f['avgEvtsPerLumi']
            else:
                # No lumis in the file, ignore it
                continue
            newlist.append(f)

        locationDict[key] = sorted(newlist, key=operator.itemgetter('lowestRun'))

    # State carried across files, runs and lumis.  firstLumi/lastLumi
    # delimit the contiguous lumi range that has not yet been written to
    # the mask of the current job.
    totalJobs = 0
    lastLumi = None
    firstLumi = None
    lastRun = None
    lumisInJob = 0
    totalAvgEventCount = 0
    currentJobAvgEventCount = 0
    stopTask = False
    for location in locationDict:

        # For each location, we need a new jobGroup
        self.newGroup()
        stopJob = True
        for f in locationDict[location]:

            if getParents:
                parentLFNs = self.findParent(lfn = f['lfn'])
                for lfn in parentLFNs:
                    parent = File(lfn = lfn)
                    f['parents'].add(parent)

            lumisInJobInFile = 0
            updateSplitOnJobStop = False
            failNextJob = False
            # If the number of events per lumi is higher than the limit
            # and it's only one lumi then ditch that lumi
            if f['avgEvtsPerLumi'] > eventLimit and f['lumiCount'] == 1:
                failNextJob = True
                stopJob = True
                lumisPerJob = 1
            elif splitOnFile:
                # Then we have to split on every boundary
                stopJob = True
                # Check the average number of events per lumi in this file
                # Adapt the lumis per job to match the target conditions
                if f['avgEvtsPerLumi']:
                    # If there are events in the file
                    ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi']
                    lumisPerJob = max(int(math.floor(ratio)), 1)
                else:
                    # Zero event file, then the ratio goes to infinity. Computers don't like that
                    lumisPerJob = f['lumiCount']
            else:
                # Analyze how many events does this job already has
                # Check how many we want as target, include as many lumi sections as possible
                updateSplitOnJobStop = True
                eventsRemaining = max(avgEventsPerJob - currentJobAvgEventCount, 0)
                if f['avgEvtsPerLumi']:
                    lumisAllowed = int(math.floor(float(eventsRemaining) / f['avgEvtsPerLumi']))
                else:
                    lumisAllowed = f['lumiCount']
                lumisPerJob = max(lumisInJob + lumisAllowed, 1)

            for run in f['runs']:
                if not isGoodRun(goodRunList = goodRunList, run = run.run):
                    # Then skip this one
                    continue
                if len(runWhitelist) > 0 and not run.run in runWhitelist:
                    # Skip due to run whitelist
                    continue
                firstLumi = None

                if splitOnRun and run.run != lastRun:
                    # Then we need to kill this job and get a new one
                    stopJob = True

                # Now loop over the lumis
                for lumi in run:
                    if not isGoodLumi(goodRunList, run = run.run, lumi = lumi):
                        # Kill the chain of good lumis
                        # Skip this lumi
                        if firstLumi != None and firstLumi != lumi:
                            # Flush the pending range into the current job
                            self.currentJob['mask'].addRunAndLumis(run = run.run,
                                                                   lumis = [firstLumi, lastLumi])
                            eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                            runAddedTime = eventsAdded * timePerEvent
                            runAddedSize = eventsAdded * sizePerEvent
                            self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize)
                            firstLumi = None
                            lastLumi = None
                        continue

                    # You have to kill the lumi chain if they're not continuous
                    if lastLumi and not lumi == lastLumi + 1:
                        self.currentJob['mask'].addRunAndLumis(run = run.run,
                                                               lumis = [firstLumi, lastLumi])
                        eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                        runAddedTime = eventsAdded * timePerEvent
                        runAddedSize = eventsAdded * sizePerEvent
                        self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize)
                        firstLumi = None
                        lastLumi = None

                    if firstLumi == None:
                        # Set the first lumi in the run
                        firstLumi = lumi

                    # If we're full, end the job
                    if lumisInJob == lumisPerJob:
                        stopJob = True
                    # Actually do the new job creation
                    if stopJob:
                        if firstLumi != None and lastLumi != None and lastRun != None:
                            # Close the pending range on the job being
                            # finished, using the run it belongs to
                            self.currentJob['mask'].addRunAndLumis(run = lastRun,
                                                                   lumis = [firstLumi, lastLumi])
                            eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                            runAddedTime = eventsAdded * timePerEvent
                            runAddedSize = eventsAdded * sizePerEvent
                            self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize)
                        msg = None
                        if failNextJob:
                            msg = "File %s has too many events (%d) in %d lumi(s)" % (f['lfn'],
                                                                                    f['events'],
                                                                                    f['lumiCount'])
                        self.newJob(name = self.getJobName(), failedJob = failNextJob,
                                    failedReason = msg)
                        if deterministicPileup:
                            self.currentJob.addBaggageParameter("skipPileupEvents", (self.nJobs - 1) * lumisPerJob * eventsPerLumiInDataset)
                        self.currentJob.addResourceEstimates(memory = memoryRequirement)
                        failNextJob = False
                        firstLumi = lumi
                        lumisInJob = 0
                        lumisInJobInFile = 0
                        currentJobAvgEventCount = 0
                        totalJobs += 1

                        # Add the file to new jobs
                        self.currentJob.addFile(f)

                        if updateSplitOnJobStop:
                            # Then we were carrying from a previous file
                            # Reset calculations for this file
                            updateSplitOnJobStop = False
                            if f['avgEvtsPerLumi']:
                                ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi']
                                lumisPerJob = max(int(math.floor(ratio)), 1)
                            else:
                                lumisPerJob = f['lumiCount']

                    lumisInJob += 1
                    lumisInJobInFile += 1
                    lastLumi = lumi
                    stopJob = False
                    lastRun = run.run

                    totalAvgEventCount += f['avgEvtsPerLumi']

                    # A job continued from a previous file needs this
                    # file added to its inputs as well
                    if self.currentJob and not f in self.currentJob['input_files']:
                        self.currentJob.addFile(f)

                    # We stop here if there are more total events than requested.
                    if totalEvents > 0 and totalAvgEventCount >= totalEvents:
                        stopTask = True
                        break

                if firstLumi != None and lastLumi != None:
                    # Add this run to the mask
                    self.currentJob['mask'].addRunAndLumis(run = run.run,
                                                           lumis = [firstLumi, lastLumi])
                    eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                    runAddedTime = eventsAdded * timePerEvent
                    runAddedSize = eventsAdded * sizePerEvent
                    self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize)
                    firstLumi = None
                    lastLumi = None

                if stopTask:
                    break

            if not splitOnFile:
                # Remember how many events this file contributed to the
                # job that is still open, for the next file's budget
                currentJobAvgEventCount += f['avgEvtsPerLumi'] * lumisInJobInFile

            if stopTask:
                break

        if stopTask:
            break

    return
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them

    On every cycle the jobs found in the CreateFailed, SubmitFailed and
    JobFailed states are loaded and moved either to the matching cooloff
    state or, once their retry count reaches the configured maximum, to
    'exhausted'.  Exhausted jobs have their input files failed and are
    handed to the ACDC data collection service.
    """
    def __init__(self, config):
        """
        Initialise class members

        config -- component configuration; config.ErrorHandler.maxRetries,
                  config.ACDC.couchurl and config.ACDC.database are required,
                  config.ErrorHandler.maxProcessSize defaults to 250.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries = self.config.ErrorHandler.maxRetries
        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)

        self.getJobs = self.daoFactory(classname = "Jobs.GetAllJobs")

        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        # initialize the alert framework (if available - config.Alert present)
        #    self.sendAlert will be then be available
        self.initAlerts(compName = "ErrorHandler")

        return

    def setup(self, parameters):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Run one final pass of the algorithm before the component stops
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def processRetries(self, jobs, jobType):
        """
        Actually do the retries

        jobs    -- list of job objects currently in the '<jobType>failed' state
        jobType -- 'create', 'submit' or 'job'

        Jobs whose retry_count is below self.maxRetries go to
        '<jobType>cooloff'; all others go to 'exhausted' and have their
        input files failed.  Returns the list of exhausted jobs.
        """
        logging.info("Processing retries for %i failed jobs of type %s." % (len(jobs), jobType))
        exhaustJobs = []
        cooloffJobs = []

        for ajob in jobs:
            if ajob['retry_count'] < self.maxRetries:
                # Retries left: cool off and try again later
                cooloffJobs.append(ajob)
            else:
                # Retries >= max retry count
                exhaustJobs.append(ajob)
                msg = "Exhausting job %i" % ajob['id']
                logging.error(msg)
                self.sendAlert(6, msg = msg)
                logging.debug("JobInfo: %s" % ajob)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        self.changeState.propagate(exhaustJobs, 'exhausted',
                                   '%sfailed' % (jobType))
        self.changeState.propagate(cooloffJobs, '%scooloff' % (jobType),
                                   '%sfailed' % (jobType))

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in exhaustJobs:
            job.failInputFiles()

        return exhaustJobs

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Load the mask of every job and register the jobs as failed with
        the ACDC data collection service
        """
        logging.debug("Entering ACDC with %i jobs" % len(jobList))
        for job in jobList:
            job.getMask()
        self.dataCollection.failedJobs(jobList)
        return

    def splitJobList(self, jobList, jobType):
        """
        _splitJobList_

        Split up list of jobs into more manageable chunks if necessary

        Chunks hold at most self.maxProcessSize jobs and each chunk is
        processed inside its own transaction.
        """
        if not jobList:
            # Nothing to do
            return

        myThread = threading.currentThread()

        while jobList:
            # Loop over the list and handle it one chunk at a time
            tmpList = jobList[:self.maxProcessSize]
            jobList = jobList[self.maxProcessSize:]
            logging.debug("About to process %i errors" % len(tmpList))
            myThread.transaction.begin()
            exhaustList = self.processRetries(tmpList, jobType)
            self.handleACDC(jobList = exhaustList)
            myThread.transaction.commit()

        return

    def handleErrors(self):
        """
        Query the DB for jobs that failed during creation, submission or
        execution and process the retries for each of the three groups
        """
        createList = []
        submitList = []
        jobList = []

        # Run over created jobs
        idList = self.getJobs.execute(state = 'CreateFailed')
        logging.info("Found %s failed jobs failed during creation"
                     % len(idList))
        if len(idList) > 0:
            createList = self.loadJobsFromList(idList = idList)

        # Run over submitted jobs
        idList = self.getJobs.execute(state = 'SubmitFailed')
        logging.info("Found %s failed jobs failed during submit"
                     % len(idList))
        if len(idList) > 0:
            submitList = self.loadJobsFromList(idList = idList)

        # Run over executed jobs
        idList = self.getJobs.execute(state = 'JobFailed')
        logging.info("Found %s failed jobs failed during execution"
                     % len(idList))
        if len(idList) > 0:
            jobList = self.loadJobsFromList(idList = idList)

        self.splitJobList(jobList = createList, jobType = 'create')
        self.splitJobList(jobList = submitList, jobType = 'submit')
        self.splitJobList(jobList = jobList, jobType = 'job')

        return

    def loadJobsFromList(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk

        idList -- iterable of job ids
        Returns a list with one Job object per row returned by the DAO.
        """
        loadAction = self.daoFactory(classname = "Jobs.LoadForErrorHandler")

        binds = [{"jobid": jobID} for jobID in idList]

        results = loadAction.execute(jobID = binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id = entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    def _rollbackQuietly(self, myThread):
        """
        Best-effort rollback of the thread's transaction; a failure is
        logged and never raised
        """
        try:
            myThread.transaction.rollback()
        except Exception as rollbackEx:
            logging.debug("Could not roll back transaction: %s" % str(rollbackEx))
        return

    def algorithm(self, parameters = None):
        """
        Performs the handleErrors method, looking for each type of failure
        And deal with it as desired.

        A WMException is re-raised unchanged after a best-effort rollback.
        Any other exception is logged, sent as an alert and re-raised as an
        ErrorHandlerException.
        """
        logging.debug("Running error handling algorithm")
        myThread = threading.currentThread()
        try:
            self.handleErrors()
        except WMException:
            # The rollback lives in a helper so that an error raised by the
            # rollback itself is dealt with in another frame and the bare
            # raise below re-raises the original WMException.
            self._rollbackQuietly(myThread)
            raise
        except Exception as ex:
            msg = "Caught exception in ErrorHandler\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            msg += "\n\n"
            logging.error(msg)
            self.sendAlert(6, msg = msg)
            if getattr(myThread, 'transaction', None) is not None \
               and getattr(myThread.transaction, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise ErrorHandlerException(msg)
def __init__(self, config):
    """
    __init__

    Instantiate every DAO this class uses, connect to the services it
    talks to (ACDC, the FWJR couch database, WMStats, PhEDEx) and
    initialise the containers that hold data until it is committed.
    """
    WMConnectionBase.__init__(self, "WMCore.WMBS")

    currentThread = threading.currentThread()
    self.dbsDaoFactory = DAOFactory(package = "WMComponent.DBS3Buffer",
                                    logger = currentThread.logger,
                                    dbinterface = currentThread.dbi)

    # (attribute name, DAO class name) pairs built with self.daofactory,
    # created in this order
    wmbsActions = (
        ("getOutputMapAction", "Jobs.GetOutputMap"),
        ("bulkAddToFilesetAction", "Fileset.BulkAddByLFN"),
        ("bulkParentageAction", "Files.AddBulkParentage"),
        ("getJobTypeAction", "Jobs.GetType"),
        ("getParentInfoAction", "Files.GetParentInfo"),
        ("setParentageByJob", "Files.SetParentageByJob"),
        ("setParentageByMergeJob", "Files.SetParentageByMergeJob"),
        ("setFileRunLumi", "Files.AddRunLumi"),
        ("setFileLocation", "Files.SetLocationByLFN"),
        ("setFileAddChecksum", "Files.AddChecksumByLFN"),
        ("addFileAction", "Files.Add"),
        ("jobCompleteInput", "Jobs.CompleteInput"),
        ("setBulkOutcome", "Jobs.SetOutcomeBulk"),
        ("getWorkflowSpec", "Workflow.GetSpecAndNameFromTask"),
        ("getJobInfoByID", "Jobs.LoadFromID"),
        ("getFullJobInfo", "Jobs.LoadForErrorHandler"),
        ("getJobTaskNameAction", "Jobs.GetFWJRTaskName"),
    )
    for attrName, daoName in wmbsActions:
        setattr(self, attrName, self.daofactory(classname = daoName))

    # Same idea for the DAOs built with self.dbsDaoFactory
    dbsBufferActions = (
        ("dbsStatusAction", "DBSBufferFiles.SetStatus"),
        ("dbsParentStatusAction", "DBSBufferFiles.GetParentStatus"),
        ("dbsChildrenAction", "DBSBufferFiles.GetChildren"),
        ("dbsCreateFiles", "DBSBufferFiles.Add"),
        ("dbsSetLocation", "DBSBufferFiles.SetLocationByLFN"),
        ("dbsInsertLocation", "DBSBufferFiles.AddLocation"),
        ("dbsSetChecksum", "DBSBufferFiles.AddChecksumByLFN"),
        ("dbsSetRunLumi", "DBSBufferFiles.AddRunLumi"),
        ("dbsGetWorkflow", "ListWorkflow"),
        ("dbsLFNHeritage", "DBSBufferFiles.BulkHeritageParent"),
    )
    for attrName, daoName in dbsBufferActions:
        setattr(self, attrName, self.dbsDaoFactory(classname = daoName))

    self.stateChanger = ChangeState(config)

    # Whether the jobReport is attached to the returned value
    self.returnJobReport = getattr(config.JobAccountant,
                                   'returnReportFromWorker', False)
    # Where the specs for DBS are stored
    self.specDir = getattr(config.JobAccountant, 'specDir', None)

    # ACDC service
    self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                database = config.ACDC.database)

    # Couch database holding the framework job reports
    sanitizedJobDB = sanitizeURL(config.JobStateMachine.couchurl)
    jobDBurl = sanitizedJobDB['url']
    jobDBName = config.JobStateMachine.couchDBName
    jobCouchdb = CouchServer(jobDBurl)
    self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)

    self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL,
                                      appName="WMStatsAgent")

    # Data held back until it gets committed
    self.dbsFilesToCreate = []
    self.wmbsFilesToBuild = []
    self.wmbsMergeFilesToBuild = []
    self.fileLocation = None
    self.mergedOutputFiles = []
    self.listOfJobsToSave = []
    self.listOfJobsToFail = []
    self.filesetAssoc = []
    self.parentageBinds = []
    self.parentageBindsForMerge = []
    self.jobsWithSkippedFiles = {}
    self.count = 0

    # Bounded containers: each keeps at most 1000 entries
    self.datasetAlgoID = collections.deque(maxlen = 1000)
    self.datasetAlgoPaths = collections.deque(maxlen = 1000)
    self.dbsLocations = set()
    self.workflowIDs = collections.deque(maxlen = 1000)
    self.workflowPaths = collections.deque(maxlen = 1000)

    self.phedex = PhEDEx()
    self.locLists = self.phedex.getNodeMap()

    return
def testChunking(self):
    """
    _testChunking_

    Register four failed jobs whose input files live at different sets of
    locations, chunk the resulting ACDC fileset and verify that each chunk
    only groups files sharing the same set of locations. The metadata and
    the files read back from ACDC for every chunk are verified as well.
    """
    dcs = DataCollectionService(url=self.testInit.couchUrl,
                                database="wmcore-acdc-datacollectionsvc")

    fnalAndCern = ["cmssrm.fnal.gov", "castor.cern.ch"]
    fnalOnly = ["cmssrm.fnal.gov"]
    allThreeSites = ["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"]
    lumiPairs = [(1, 2), (3, 4), (5, 6)]

    def buildFile(sites, runNumber, lumis, **extraArgs):
        """Create a file with size 1024 and 1024 events holding a single run."""
        newFile = File(lfn=makeUUID(), size=1024, events=1024, **extraArgs)
        # hand over a private copy so the files never share a list object
        newFile.setLocation(list(sites))
        newFile.addRun(Run(runNumber, *lumis))
        return newFile

    def buildJob(inputFiles):
        """Attach the given files, in order, to a fresh minimal job."""
        newJob = self.getMinimalJob()
        for inputFile in inputFiles:
            newJob.addFile(inputFile)
        return newJob

    # Files A, B, C
    plainFiles = [buildFile(fnalAndCern, 1, lumis) for lumis in lumiPairs]
    jobA = buildJob(plainFiles)

    # Files D, E
    fnalFiles = [buildFile(fnalOnly, 2, lumis) for lumis in lumiPairs[:2]]
    jobB = buildJob(fnalFiles)

    # Files F, G, H: each one has a parent
    childFiles = [buildFile(allThreeSites, 3, lumis,
                            parents={"/some/parent/%s" % tag})
                  for tag, lumis in zip("FGH", lumiPairs)]
    jobC = buildJob(childFiles)

    # Files I, J, K: flagged as merged
    mergedFiles = [buildFile(fnalAndCern, 4, lumis, merged=True)
                   for lumis in lumiPairs]
    jobD = buildJob(mergedFiles)

    dcs.failedJobs([jobA, jobB, jobC, jobD])
    chunks = dcs.chunkFileset("ACDCTest", "/ACDCTest/reco", chunkSize=5)

    self.assertEqual(len(chunks), 4,
                     "Error: There should be four chunks: %s" % len(chunks))

    # Expected metadata, keyed by the number of files in the chunk
    goldenMetaData = {
        1: {"lumis": 2,
            "locations": ["castor.cern.ch", "cmssrm.fnal.gov"],
            "events": 1024},
        2: {"lumis": 4,
            "locations": ["cmssrm.fnal.gov"],
            "events": 2048},
        3: {"lumis": 6,
            "locations": ["castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"],
            "events": 3072},
        5: {"lumis": 10,
            "locations": ["castor.cern.ch", "cmssrm.fnal.gov"],
            "events": 5120},
    }

    # Six files share the FNAL+CERN location set; the one with the highest
    # LFN is expected alone in the single-file chunk, the rest in the
    # five-file chunk.
    fiveFileChunk = plainFiles + mergedFiles
    lastFile = max(fiveFileChunk, key=lambda candidate: candidate["lfn"])
    fiveFileChunk.remove(lastFile)

    # Expected files, keyed the same way as the metadata
    goldenFiles = {1: [lastFile],
                   2: fnalFiles,
                   3: childFiles,
                   5: fiveFileChunk}

    for chunk in chunks:
        nFiles = chunk["files"]
        chunkMetaData = dcs.getChunkInfo("ACDCTest", "/ACDCTest/reco",
                                         chunk["offset"], nFiles)

        for field in ("files", "lumis", "events", "locations"):
            self.assertEqual(chunkMetaData[field], chunk[field])

        self.assertTrue(nFiles in goldenMetaData, "Error: Extra chunk found.")
        expectedMeta = goldenMetaData[nFiles]
        self.assertEqual(chunk["lumis"], expectedMeta["lumis"],
                         "Error: Lumis in chunk is wrong.")
        self.assertEqual(chunk["locations"], expectedMeta["locations"],
                         "Error: Locations in chunk is wrong.")
        self.assertEqual(chunk["events"], expectedMeta["events"],
                         "Error: Events in chunk is wrong.")
        # every chunk size may only show up once
        del goldenMetaData[nFiles]

        chunkFiles = dcs.getChunkFiles("ACDCTest", "/ACDCTest/reco",
                                       chunk["offset"], nFiles)

        self.assertTrue(nFiles in goldenFiles, "Error: Extra chunk found.")
        goldenChunkFiles = goldenFiles[nFiles]
        self.assertEqual(len(chunkFiles), len(goldenChunkFiles))

        for chunkFile in chunkFiles:
            foundFile = next((golden for golden in goldenChunkFiles
                              if golden["lfn"] == chunkFile["lfn"]), None)

            self.assertIsNotNone(foundFile,
                                 "Error: Missing chunk file: %s, %s" % (chunkFiles, goldenChunkFiles))
            self.assertEqual(set(foundFile["parents"]), chunkFile["parents"],
                             "Error: File parents should match.")
            self.assertEqual(foundFile["merged"], chunkFile["merged"],
                             "Error: File merged status should match.")
            self.assertEqual(foundFile["locations"], chunkFile["locations"],
                             "Error: File locations should match.")
            self.assertEqual(foundFile["events"], chunkFile["events"])
            self.assertEqual(foundFile["size"], chunkFile["size"])
            self.assertEqual(len(foundFile["runs"]), len(chunkFile["runs"]),
                             "Error: Wrong number of runs.")

            for run in foundFile["runs"]:
                runMatch = any(chunkRun.run == run.run and chunkRun.lumis == run.lumis
                               for chunkRun in chunkFile["runs"])
                self.assertTrue(runMatch, "Error: Run information is wrong.")

        del goldenFiles[nFiles]

    expectedSingleChunk = {"offset": 0,
                           "files": 11,
                           "events": 11264,
                           "lumis": 22,
                           "locations": {"castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"}}
    singleChunk = dcs.singleChunkFileset("ACDCTest", "/ACDCTest/reco")
    self.assertEqual(singleChunk, expectedSingleChunk,
                     "Error: Single chunk metadata is wrong")

    return
def testGetLumiWhitelist(self):
    """
    _testGetLumiWhitelist_

    Check the lumi whitelist that ACDC builds from a set of failed jobs.

    The failed jobs carry the following lumis:
      Run 1: [1, 2, 3], [4, 6], [7], [9], [11, 12]
      Run 2: [5, 6, 7], [10, 11, 12], [15]
      Run 3: [20]

    and the whitelist is expected to come back as:
      {"1": [[1, 4], [6, 7], [9, 9], [11, 12]],
       "2": [[5, 7], [10, 12], [15, 15]],
       "3": [[20, 20]]}
    """
    dcs = DataCollectionService(url=self.testInit.couchUrl,
                                database="wmcore-acdc-datacollectionsvc")

    # One entry per failed job; every (run, lumi, ...) tuple becomes one
    # input file of that job.
    jobLayout = [
        [(1, 1, 2), (1, 3)],
        [(1, 4, 6)],
        [(1, 7)],
        [(1, 11, 12)],
        [(2, 5, 6, 7)],
        [(2, 10, 11, 12)],
        [(2, 15)],
        [(3, 20)],
        [(1, 9)],
    ]

    failedJobs = []
    for runSpecs in jobLayout:
        inputFiles = []
        for runSpec in runSpecs:
            inputFile = File(lfn=makeUUID(), size=1024, events=1024)
            inputFile.addRun(Run(*runSpec))
            inputFiles.append(inputFile)

        failedJob = Job()
        failedJob["task"] = "/ACDCTest/reco"
        failedJob["workflow"] = "ACDCTest"
        failedJob["location"] = "cmssrm.fnal.gov"
        failedJob["owner"] = "cmsdataops"
        failedJob["group"] = "cmsdataops"
        for inputFile in inputFiles:
            failedJob.addFile(inputFile)
        failedJobs.append(failedJob)

    dcs.failedJobs(failedJobs)
    whiteList = dcs.getLumiWhitelist("ACDCTest", "/ACDCTest/reco")

    expectedWhitelist = {"1": [[1, 4], [6, 7], [9, 9], [11, 12]],
                         "2": [[5, 7], [10, 12], [15, 15]],
                         "3": [[20, 20]]}

    self.assertEqual(len(whiteList.keys()), 3,
                     "Error: There should be 3 runs.")
    for runNumber in ("1", "2", "3"):
        self.assertEqual(whiteList[runNumber], expectedWhitelist[runNumber],
                         "Error: Whitelist for run %s is wrong." % runNumber)

    return
def testChunking(self):
    """
    _testChunking_

    Insert a workload and files that have several distinct sets of
    locations. Verify that the chunks are created correctly and that they
    only group files that have the same set of locations. Also verify that
    the chunks are pulled out of ACDC correctly.
    """
    dcs = DataCollectionService(url=self.testInit.couchUrl,
                                database="wmcore-acdc-datacollectionsvc")

    def getJob():
        """Return a job for the ACDCTest workflow, task /ACDCTest/reco."""
        job = Job()
        job["task"] = "/ACDCTest/reco"
        job["workflow"] = "ACDCTest"
        job["location"] = "cmssrm.fnal.gov"
        job["owner"] = "cmsdataops"
        job["group"] = "cmsdataops"
        return job

    def makeFile(locations, runNumber, lumis, **fileArgs):
        """Create a file with size 1024 and 1024 events holding a single run."""
        newFile = File(lfn=makeUUID(), size=1024, events=1024, **fileArgs)
        # pass a fresh list so files never share one list object
        newFile.setLocation(list(locations))
        newFile.addRun(Run(runNumber, *lumis))
        return newFile

    def makeJob(inputFiles):
        """Create a job and attach the given files in order."""
        job = getJob()
        for inputFile in inputFiles:
            job.addFile(inputFile)
        return job

    fnalAndCern = ["cmssrm.fnal.gov", "castor.cern.ch"]
    allThreeSites = ["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"]

    testFileA = makeFile(fnalAndCern, 1, (1, 2))
    testFileB = makeFile(fnalAndCern, 1, (3, 4))
    testFileC = makeFile(fnalAndCern, 1, (5, 6))
    testJobA = makeJob([testFileA, testFileB, testFileC])

    testFileD = makeFile(["cmssrm.fnal.gov"], 2, (1, 2))
    testFileE = makeFile(["cmssrm.fnal.gov"], 2, (3, 4))
    testJobB = makeJob([testFileD, testFileE])

    testFileF = makeFile(allThreeSites, 3, (1, 2), parents=set(["/some/parent/F"]))
    testFileG = makeFile(allThreeSites, 3, (3, 4), parents=set(["/some/parent/G"]))
    testFileH = makeFile(allThreeSites, 3, (5, 6), parents=set(["/some/parent/H"]))
    testJobC = makeJob([testFileF, testFileG, testFileH])

    testFileI = makeFile(fnalAndCern, 4, (1, 2), merged=True)
    testFileJ = makeFile(fnalAndCern, 4, (3, 4), merged=True)
    testFileK = makeFile(fnalAndCern, 4, (5, 6), merged=True)
    testJobD = makeJob([testFileI, testFileJ, testFileK])

    dcs.failedJobs([testJobA, testJobB, testJobC, testJobD])
    chunks = dcs.chunkFileset("ACDCTest", "/ACDCTest/reco", chunkSize=5)

    self.assertEqual(len(chunks), 4,
                     "Error: There should be four chunks: %s" % len(chunks))

    # Expected metadata, keyed by the number of files in the chunk
    goldenMetaData = {1: {"lumis": 2,
                          "locations": ["castor.cern.ch", "cmssrm.fnal.gov"],
                          "events": 1024},
                      2: {"lumis": 4,
                          "locations": ["cmssrm.fnal.gov"],
                          "events": 2048},
                      3: {"lumis": 6,
                          "locations": ["castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"],
                          "events": 3072},
                      5: {"lumis": 10,
                          "locations": ["castor.cern.ch", "cmssrm.fnal.gov"],
                          "events": 5120}}

    # Six files share the FNAL+CERN location set; the one with the highest
    # LFN is expected alone in the single-file chunk.
    testFiles = [testFileA, testFileB, testFileC, testFileI, testFileJ, testFileK]
    lastFile = max(testFiles, key=lambda candidate: candidate["lfn"])
    testFiles.remove(lastFile)

    goldenFiles = {1: [lastFile],
                   2: [testFileD, testFileE],
                   3: [testFileF, testFileG, testFileH],
                   5: testFiles}

    for chunk in chunks:
        chunkMetaData = dcs.getChunkInfo("ACDCTest", "/ACDCTest/reco",
                                         chunk["offset"], chunk["files"])

        for field in ("files", "lumis", "events", "locations"):
            self.assertEqual(chunkMetaData[field], chunk[field],
                             "Error: Metadata doesn't match.")

        self.assertIn(chunk["files"], goldenMetaData,
                      "Error: Extra chunk found.")
        self.assertEqual(chunk["lumis"], goldenMetaData[chunk["files"]]["lumis"],
                         "Error: Lumis in chunk is wrong.")
        self.assertEqual(chunk["locations"], goldenMetaData[chunk["files"]]["locations"],
                         "Error: Locations in chunk is wrong.")
        self.assertEqual(chunk["events"], goldenMetaData[chunk["files"]]["events"],
                         "Error: Events in chunk is wrong.")
        # every chunk size may only show up once
        del goldenMetaData[chunk["files"]]

        chunkFiles = dcs.getChunkFiles("ACDCTest", "/ACDCTest/reco",
                                       chunk["offset"], chunk["files"])

        self.assertIn(chunk["files"], goldenFiles,
                      "Error: Extra chunk found.")
        goldenChunkFiles = goldenFiles[chunk["files"]]
        self.assertEqual(len(chunkFiles), len(goldenChunkFiles))

        for chunkFile in chunkFiles:
            foundFile = None
            for goldenChunkFile in goldenChunkFiles:
                if chunkFile["lfn"] == goldenChunkFile["lfn"]:
                    foundFile = goldenChunkFile
                    break

            self.assertIsNotNone(foundFile,
                                 "Error: Missing chunk file: %s, %s" % (chunkFiles, goldenChunkFiles))
            self.assertEqual(foundFile["parents"], chunkFile["parents"],
                             "Error: File parents should match.")
            self.assertEqual(foundFile["merged"], chunkFile["merged"],
                             "Error: File merged status should match.")
            self.assertEqual(foundFile["locations"], chunkFile["locations"],
                             "Error: File locations should match.")
            # The messages below used to be copy-pasted "locations" texts,
            # which mislabelled events/size failures.
            self.assertEqual(foundFile["events"], chunkFile["events"],
                             "Error: File events should match: %s" % chunk["files"])
            self.assertEqual(foundFile["size"], chunkFile["size"],
                             "Error: File size should match.")
            self.assertEqual(len(foundFile["runs"]), len(chunkFile["runs"]),
                             "Error: Wrong number of runs.")

            for run in foundFile["runs"]:
                runMatch = False
                for chunkRun in chunkFile["runs"]:
                    if chunkRun.run == run.run and chunkRun.lumis == run.lumis:
                        runMatch = True
                        break

                self.assertTrue(runMatch, "Error: Run information is wrong.")

        del goldenFiles[chunk["files"]]

    singleChunk = dcs.singleChunkFileset("ACDCTest", "/ACDCTest/reco")
    self.assertEqual(singleChunk,
                     {"offset": 0,
                      "files": 11,
                      "events": 11264,
                      "lumis": 22,
                      "locations": set(["castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"])},
                     "Error: Single chunk metadata is wrong")

    return
def testGetLumiWhitelist(self):
    """
    _testGetLumiWhitelist_

    Check the lumi whitelist that ACDC builds from a set of failed jobs,
    both as a plain dictionary and as a LumiList.

    The failed jobs carry the following lumis:
      Run 1: [1, 2, 3], [4, 6], [7], [9], [11, 12]
      Run 2: [5, 6, 7], [10, 11, 12], [15]
      Run 3: [20]

    and the whitelist is expected to come back as:
      {"1": [[1, 4], [6, 7], [9, 9], [11, 12]],
       "2": [[5, 7], [10, 12], [15, 15]],
       "3": [[20, 20]]}
    """
    dcs = DataCollectionService(url=self.testInit.couchUrl,
                                database="wmcore-acdc-datacollectionsvc")

    # One entry per failed job; every (run, lumi, ...) tuple becomes one
    # input file of that job.
    jobLayout = [
        [(1, 1, 2), (1, 3)],
        [(1, 4, 6)],
        [(1, 7)],
        [(1, 11, 12)],
        [(2, 5, 6, 7)],
        [(2, 10, 11, 12)],
        [(2, 15)],
        [(3, 20)],
        [(1, 9)],
    ]

    failedJobs = []
    for runSpecs in jobLayout:
        inputFiles = []
        for runSpec in runSpecs:
            inputFile = File(lfn=makeUUID(), size=1024, events=1024)
            inputFile.addRun(Run(*runSpec))
            inputFiles.append(inputFile)

        failedJob = self.getMinimalJob()
        for inputFile in inputFiles:
            failedJob.addFile(inputFile)
        failedJobs.append(failedJob)

    dcs.failedJobs(failedJobs)
    whiteList = dcs.getLumiWhitelist("ACDCTest", "/ACDCTest/reco")

    expectedWhitelist = {"1": [[1, 4], [6, 7], [9, 9], [11, 12]],
                         "2": [[5, 7], [10, 12], [15, 15]],
                         "3": [[20, 20]]}

    self.assertEqual(len(whiteList.keys()), 3,
                     "Error: There should be 3 runs.")
    for runNumber in ("1", "2", "3"):
        self.assertEqual(whiteList[runNumber], expectedWhitelist[runNumber],
                         "Error: Whitelist for run %s is wrong." % runNumber)

    # Same content, this time retrieved as a LumiList object
    correctLumiList = LumiList(compactList={"1": [[1, 4], [6, 7], [9, 9], [11, 12]],
                                            "2": [[5, 7], [10, 12], [15, 15]],
                                            "3": [[20, 20]]})
    testLumiList = dcs.getLumilistWhitelist("ACDCTest", "/ACDCTest/reco")
    self.assertEqual(correctLumiList.getCMSSWString(),
                     testLumiList.getCMSSWString())

    return
def __init__(self, config):
    """
    __init__

    Create all DAO objects that are used by this class.

    Besides the WMBS and DBS3Buffer DAOs this also loads the PNN to PSN
    mapping, creates the state changer, the ACDC data collection service,
    the connection to the "<couchDBName>/fwjrs" couch database, the local
    WMStats writer, the PhEDEx node map and the empty containers that hold
    data until it is committed.

    :param config: agent configuration object. The JobAccountant, ACDC,
        JobStateMachine and TaskArchiver sections are read here.
    """
    WMConnectionBase.__init__(self, "WMCore.WMBS")
    # current_thread() is the non-deprecated spelling of currentThread()
    myThread = threading.current_thread()
    self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                    logger=myThread.logger,
                                    dbinterface=myThread.dbi)

    # WMBS DAOs.
    # NOTE(review): self.daofactory is not assigned in this method; presumably
    # it is provided by WMConnectionBase.__init__ above -- confirm.
    self.getOutputMapAction = self.daofactory(classname="Jobs.GetOutputMap")
    self.bulkAddToFilesetAction = self.daofactory(classname="Fileset.BulkAddByLFN")
    self.bulkParentageAction = self.daofactory(classname="Files.AddBulkParentage")
    self.getJobTypeAction = self.daofactory(classname="Jobs.GetType")
    self.getParentInfoAction = self.daofactory(classname="Files.GetParentInfo")
    self.setParentageByJob = self.daofactory(classname="Files.SetParentageByJob")
    self.setParentageByMergeJob = self.daofactory(classname="Files.SetParentageByMergeJob")
    self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi")
    self.setFileLocation = self.daofactory(classname="Files.SetLocationByLFN")
    self.setFileAddChecksum = self.daofactory(classname="Files.AddChecksumByLFN")
    self.addFileAction = self.daofactory(classname="Files.Add")
    self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput")
    self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk")
    self.getWorkflowSpec = self.daofactory(classname="Workflow.GetSpecAndNameFromTask")
    self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID")
    self.getFullJobInfo = self.daofactory(classname="Jobs.LoadForErrorHandler")
    self.getJobTaskNameAction = self.daofactory(classname="Jobs.GetFWJRTaskName")
    # Unlike the DAOs above, this one is executed right away and only its
    # result is kept.
    self.pnn_to_psn = self.daofactory(classname="Locations.GetPNNtoPSNMapping").execute()

    # DBS3Buffer DAOs
    self.dbsStatusAction = self.dbsDaoFactory(classname="DBSBufferFiles.SetStatus")
    self.dbsParentStatusAction = self.dbsDaoFactory(classname="DBSBufferFiles.GetParentStatus")
    self.dbsChildrenAction = self.dbsDaoFactory(classname="DBSBufferFiles.GetChildren")
    self.dbsCreateFiles = self.dbsDaoFactory(classname="DBSBufferFiles.Add")
    self.dbsSetLocation = self.dbsDaoFactory(classname="DBSBufferFiles.SetLocationByLFN")
    self.dbsInsertLocation = self.dbsDaoFactory(classname="DBSBufferFiles.AddLocation")
    self.dbsSetChecksum = self.dbsDaoFactory(classname="DBSBufferFiles.AddChecksumByLFN")
    self.dbsSetRunLumi = self.dbsDaoFactory(classname="DBSBufferFiles.AddRunLumi")
    self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow")
    self.dbsLFNHeritage = self.dbsDaoFactory(classname="DBSBufferFiles.BulkHeritageParent")

    self.stateChanger = ChangeState(config)

    # Decide whether or not to attach jobReport to returned value
    self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False)

    # Store location for the specs for DBS
    self.specDir = getattr(config.JobAccountant, 'specDir', None)

    # maximum RAW EDM size for Repack output before data is put into Error
    # dataset and skips PromptReco
    self.maxAllowedRepackOutputSize = getattr(config.JobAccountant,
                                              'maxAllowedRepackOutputSize',
                                              12 * 1024 * 1024 * 1024)

    # ACDC service
    self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                database=config.ACDC.database)

    # Couch database "<couchDBName>/fwjrs" on the sanitized JobStateMachine URL
    jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
    jobDBName = config.JobStateMachine.couchDBName
    jobCouchdb = CouchServer(jobDBurl)
    self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)

    self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL,
                                      appName="WMStatsAgent")

    # Hold data for later committal
    self.dbsFilesToCreate = []
    self.wmbsFilesToBuild = []
    self.wmbsMergeFilesToBuild = []
    self.fileLocation = None
    self.mergedOutputFiles = []
    self.listOfJobsToSave = []
    self.listOfJobsToFail = []
    self.filesetAssoc = []
    self.parentageBinds = []
    self.parentageBindsForMerge = []
    self.jobsWithSkippedFiles = {}
    self.count = 0

    # Bounded containers: each deque keeps at most the 1000 newest entries
    self.datasetAlgoID = collections.deque(maxlen=1000)
    self.datasetAlgoPaths = collections.deque(maxlen=1000)
    self.dbsLocations = set()
    self.workflowIDs = collections.deque(maxlen=1000)
    self.workflowPaths = collections.deque(maxlen=1000)

    self.phedex = PhEDEx()
    self.locLists = self.phedex.getNodeMap()

    return
def testC_ACDCTest(self):
    """
    _ACDCTest_

    Check that a goodRunList can be obtained from ACDC and is processed
    correctly: three jobs are registered as failed, the LumiBased splitter
    is run over a fileset holding every file, and the masks of the
    resulting jobs are compared against the lumis of those three jobs.
    """
    workload = self.createTestWorkload()
    dcs = DataCollectionService(url=self.testInit.couchUrl,
                                database=self.testInit.couchDbName)

    # (job label, run specs); every (run, lumi, ...) tuple becomes one
    # input file of that job.
    jobLayout = [
        ("A", [(1, 1, 2), (1, 3)]),
        ("B", [(1, 4, 6)]),
        ("C", [(1, 7)]),
        ("D", [(1, 11, 12)]),
        ("E", [(2, 5, 6, 7)]),
        ("F", [(2, 10, 11, 12)]),
        ("G", [(2, 15)]),
        ("H", [(3, 20)]),
        ("I", [(1, 9)]),
    ]

    allFiles = []
    jobsByLabel = {}
    for label, runSpecs in jobLayout:
        jobFiles = []
        for runSpec in runSpecs:
            inputFile = File(lfn=makeUUID(), size=1024, events=1024,
                             locations="T1_US_FNAL_Disk")
            inputFile.addRun(Run(*runSpec))
            inputFile.create()
            jobFiles.append(inputFile)

        newJob = getJob(workload)
        for inputFile in jobFiles:
            newJob.addFile(inputFile)

        jobsByLabel[label] = newJob
        allFiles.extend(jobFiles)

    # Only jobs A, D and H are registered as failed; the other jobs are
    # built but never handed to ACDC.
    dcs.failedJobs([jobsByLabel["A"], jobsByLabel["D"], jobsByLabel["H"]])

    # The fileset, on the other hand, holds every file
    testFileset = Fileset(name=makeUUID())
    testFileset.create()
    for inputFile in allFiles:
        testFileset.addFile(inputFile)
    testFileset.commit()

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="LumiBased",
                                    type="Processing")
    testSubscription.create()

    jobFactory = SplitterFactory()(package="WMCore.WMBS",
                                   subscription=testSubscription)

    splittingArgs = dict(lumis_per_job=100,
                         halt_job_on_file_boundaries=False,
                         splitOnRun=True,
                         collectionName=workload.name(),
                         filesetName=workload.getTask("reco").getPathName(),
                         owner="evansde77",
                         group="DMWM",
                         couchURL=self.testInit.couchUrl,
                         couchDB=self.testInit.couchDbName,
                         performance=self.performanceParams)
    jobGroups = jobFactory(**splittingArgs)

    # Run 1 lumis come from jobs A and D, run 3 from job H
    self.assertEqual(jobGroups[0].jobs[0]['mask'].getRunAndLumis(),
                     {1: [[1, 2], [3, 3], [11, 12]]})
    self.assertEqual(jobGroups[0].jobs[1]['mask'].getRunAndLumis(),
                     {3: [[20, 20]]})

    return
def testC_ACDCTest(self):
    """
    _ACDCTest_

    Check that a goodRunList can be obtained from ACDC and is processed
    correctly: three jobs are registered as failed, the LumiBased splitter
    is run over a fileset holding every file, and the masks of the
    resulting jobs are compared against the lumis of those three jobs.
    """
    workload = self.createTestWorkload()
    dcs = DataCollectionService(url=self.testInit.couchUrl,
                                database=self.testInit.couchDbName)

    # (job label, run specs); every (run, lumi, ...) tuple becomes one
    # input file of that job.
    jobLayout = [
        ("A", [(1, 1, 2), (1, 3)]),
        ("B", [(1, 4, 6)]),
        ("C", [(1, 7)]),
        ("D", [(1, 11, 12)]),
        ("E", [(2, 5, 6, 7)]),
        ("F", [(2, 10, 11, 12)]),
        ("G", [(2, 15)]),
        ("H", [(3, 20)]),
        ("I", [(1, 9)]),
    ]

    allFiles = []
    jobsByLabel = {}
    for label, runSpecs in jobLayout:
        jobFiles = []
        for runSpec in runSpecs:
            inputFile = File(lfn=makeUUID(), size=1024, events=1024,
                             locations="T1_US_FNAL_Disk")
            inputFile.addRun(Run(*runSpec))
            inputFile.create()
            jobFiles.append(inputFile)

        newJob = getJob(workload)
        for inputFile in jobFiles:
            newJob.addFile(inputFile)

        jobsByLabel[label] = newJob
        allFiles.extend(jobFiles)

    # Only jobs A, D and H are registered as failed; the other jobs are
    # built but never handed to ACDC.
    dcs.failedJobs([jobsByLabel["A"], jobsByLabel["D"], jobsByLabel["H"]])

    # The fileset, on the other hand, holds every file
    testFileset = Fileset(name=makeUUID())
    testFileset.create()
    for inputFile in allFiles:
        testFileset.addFile(inputFile)
    testFileset.commit()

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="LumiBased",
                                    type="Processing")
    testSubscription.create()

    jobFactory = SplitterFactory()(package="WMCore.WMBS",
                                   subscription=testSubscription)

    splittingArgs = dict(lumis_per_job=100,
                         halt_job_on_file_boundaries=False,
                         splitOnRun=True,
                         collectionName=workload.name(),
                         filesetName=workload.getTask("reco").getPathName(),
                         owner="evansde77",
                         group="DMWM",
                         couchURL=self.testInit.couchUrl,
                         couchDB=self.testInit.couchDbName,
                         performance=self.performanceParams)
    jobGroups = jobFactory(**splittingArgs)

    # Run 1 lumis come from jobs A and D, run 3 from job H
    self.assertEqual(jobGroups[0].jobs[0]['mask'].getRunAndLumis(),
                     {1: [[1, 2], [3, 3], [11, 12]]})
    self.assertEqual(jobGroups[0].jobs[1]['mask'].getRunAndLumis(),
                     {3: [[20, 20]]})

    return
class ErrorHandlerTest(EmulatedUnitTestCase):
    """
    TestCase for TestErrorHandler module

    Each test creates a group of WMBS jobs, moves them into one of the
    failed states through ChangeState and then runs one ErrorHandlerPoller
    cycle, checking in which states the jobs end up.
    """

    def setUp(self):
        """
        setup for test.

        Sets up logging, the database connection with the WMBS schema, the
        couch databases used by the test, the DAOs used for the checks and
        one WMBS location.
        """
        super(ErrorHandlerTest, self).setUp()

        myThread = threading.currentThread()

        self.testInit = TestInitCouchApp(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=["WMCore.WMBS"], useDefault=False)
        self.testInit.setupCouch("errorhandler_t", "GroupUser", "ACDC")
        self.testInit.setupCouch("errorhandler_t_jd/jobs", "JobDump")
        self.testInit.setupCouch("errorhandler_t_jd/fwjrs", "FWJRDump")

        self.daofactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.getJobs = self.daofactory(classname="Jobs.GetAllJobs")
        self.setJobTime = self.daofactory(classname="Jobs.SetStateTime")

        locationAction = self.daofactory(classname="Locations.New")
        locationAction.execute(siteName="malpaquet", pnn="T2_CH_CERN")

        self.testDir = self.testInit.generateWorkDir()
        self.configFile = EmulatorSetup.setupWMAgentConfig()
        self.nJobs = 10

        self.dataCS = DataCollectionService(url=self.testInit.couchUrl,
                                            database="errorhandler_t")

        return

    def tearDown(self):
        """
        Database deletion
        """
        self.testInit.clearDatabase()
        self.testInit.delWorkDir()
        self.testInit.tearDownCouch()
        EmulatorSetup.deleteConfig(self.configFile)
        return

    def getConfig(self):
        """
        _getConfig_

        Build and return the configuration object handed to ChangeState and
        ErrorHandlerPoller in the tests.
        """
        config = self.testInit.getConfiguration()
        self.testInit.generateWorkDir(config)

        # First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", self.testDir)
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        config.component_("ErrorHandler")
        # The log level of the component.
        config.ErrorHandler.logLevel = 'INFO'
        # The namespace of the component
        config.ErrorHandler.namespace = 'WMComponent.ErrorHandler.ErrorHandler'
        # maxThreads is deliberately left unset; only maxProcessSize is given
        config.ErrorHandler.maxProcessSize = 30
        config.ErrorHandler.readFWJR = True
        # maximum number of retries we want for job
        config.ErrorHandler.maxRetries = 5
        # The poll interval at which to look for failed jobs
        config.ErrorHandler.pollInterval = 60  # this will be overwritten in some unittests

        # JobStateMachine
        config.component_('JobStateMachine')
        config.JobStateMachine.couchurl = os.getenv('COUCHURL', None)
        config.JobStateMachine.couchDBName = "errorhandler_t_jd"

        config.section_('ACDC')
        config.ACDC.couchurl = self.testInit.couchUrl
        config.ACDC.database = "errorhandler_t"

        return config

    def createWorkload(self, workloadName='Test'):
        """
        _createTestWorkload_

        Creates a test workload for us to run on, hold
        the basic necessities.

        The workload is processed by a TaskMaker rooted at
        <testDir>/workloadTest and returned.
        """
        workload = testWorkload(workloadName)

        # Add RequestManager stuff
        workload.data.request.section_('schema')
        workload.data.request.schema.Requestor = 'nobody'
        workload.data.request.schema.Group = 'testers'

        taskMaker = TaskMaker(workload, os.path.join(self.testDir, 'workloadTest'))
        taskMaker.skipSubscription = True
        taskMaker.processWorkload()

        return workload

    # NOTE(review): the workloadName default is evaluated once, when the
    # method is defined, so every call relying on the default receives the
    # same UUID. Left unchanged because existing tests may depend on it.
    def createTestJobGroup(self, nJobs=10, retry_count=1,
                           workloadPath='test', fwjrPath=None,
                           workloadName=makeUUID(),
                           fileModifier=''):
        """
        Creates a group of several jobs

        A workflow, fileset, subscription and job group are created inside
        one transaction. Every job gets the same two input files (lfnA and
        lfnB, both children of one parent file), a mask on run 10 and its
        own cache directory under the test directory.

        :param nJobs: number of jobs to create
        :param retry_count: value stored as 'retry_count' on each job
        :param workloadPath: spec path given to the Workflow
        :param fwjrPath: value stored as 'fwjr_path' on each job
        :param workloadName: name given to the Workflow
        :param fileModifier: suffix appended to the three LFNs
        :return: the committed JobGroup
        """
        myThread = threading.currentThread()
        myThread.transaction.begin()

        testWorkflow = Workflow(spec=workloadPath, owner="cmsdataops", group="cmsdataops",
                                name=workloadName, task="/TestWorkload/ReReco")
        testWorkflow.create()

        testWMBSFileset = Fileset(name="TestFileset")
        testWMBSFileset.create()

        testSubscription = Subscription(fileset=testWMBSFileset,
                                        workflow=testWorkflow)
        testSubscription.create()

        testJobGroup = JobGroup(subscription=testSubscription)
        testJobGroup.create()

        testFile0 = File(lfn="/this/is/a/parent%s" % fileModifier, size=1024, events=10)
        testFile0.addRun(Run(10, *[12312]))
        testFile0.setLocation('T2_CH_CERN')

        testFileA = File(lfn="/this/is/a/lfnA%s" % fileModifier, size=1024, events=10,
                         first_event=88, merged=False)
        testFileA.addRun(Run(10, *[12312, 12313]))
        testFileA.setLocation('T2_CH_CERN')

        testFileB = File(lfn="/this/is/a/lfnB%s" % fileModifier, size=1024, events=10,
                         first_event=88, merged=False)
        testFileB.addRun(Run(10, *[12314, 12315, 12316]))
        testFileB.setLocation('T2_CH_CERN')

        testFile0.create()
        testFileA.create()
        testFileB.create()

        testFileA.addParent(lfn="/this/is/a/parent%s" % fileModifier)
        testFileB.addParent(lfn="/this/is/a/parent%s" % fileModifier)

        for i in range(0, nJobs):
            testJob = Job(name=makeUUID())
            testJob['retry_count'] = retry_count
            testJob['retry_max'] = 10
            testJob['mask'].addRunAndLumis(run=10, lumis=[12312])
            testJob['mask'].addRunAndLumis(run=10, lumis=[12314, 12316])
            testJob['cache_dir'] = os.path.join(self.testDir, testJob['name'])
            testJob['fwjr_path'] = fwjrPath
            os.mkdir(testJob['cache_dir'])
            testJobGroup.add(testJob)
            testJob.create(group=testJobGroup)
            testJob.addFile(testFileA)
            testJob.addFile(testFileB)
            testJob.save()

        testJobGroup.commit()

        testSubscription.acquireFiles(files=[testFileA, testFileB])
        testSubscription.save()
        myThread.transaction.commit()

        return testJobGroup

    def testA_Create(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testCreate()

        Mimics creation of component and test jobs failed in create stage.
        """
        njobs = 4
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=njobs,
                                               workloadPath=workloadPath,
                                               workloadName=workloadName)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'createfailed', 'created')

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), njobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), 0)

        # These should go directly to exhausted
        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), njobs)

        # Check that it showed up in ACDC
        collection = self.dataCS.getDataCollection(workloadName)

        # Now look at what's inside
        self.assertTrue(len(collection['filesets']) > 0)
        for fileset in collection["filesets"]:
            counter = 0
            for f in fileset.listFiles():
                counter += 1
                self.assertTrue(f['lfn'] in ["/this/is/a/lfnA", "/this/is/a/lfnB"])
                self.assertEqual(f['events'], 10)
                self.assertEqual(f['size'], 1024)
                self.assertEqual(f['parents'], [u'/this/is/a/parent'])
                self.assertTrue(f['runs'][0]['run_number'] == 10)
                if f['lfn'] == "/this/is/a/lfnA":
                    self.assertItemsEqual(f['runs'][0]['lumis'], [12312])
                elif f['lfn'] == "/this/is/a/lfnB":
                    self.assertItemsEqual(f['runs'][0]['lumis'], [12314, 12315, 12316])
                else:
                    # TestCase has no assertFail(); fail() is the call that
                    # reports an unconditional failure with a message.
                    self.fail("File name is not known: %s" % f['lfn'])
                self.assertEqual(f['merged'], 0)
                self.assertEqual(f['first_event'], 88)

            self.assertEqual(counter, njobs * 2)  # each job has 2 files (thus 4 times duplicate)
        return

    def testB_Submit(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testSubmit()

        Mimics creation of component and test jobs failed in submit stage.
        """
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')

        idList = self.getJobs.execute(state='SubmitFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='SubmitFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return

    def testC_Jobs(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t.testJobs()

        Mimics creation of component and test jobs failed in execute stage.
        """
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return

    def testD_Exhausted(self):
        """
        _testExhausted_

        Test that the system can exhaust jobs correctly
        """
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        # retry_count (5) is above the maxRetries (1) configured below
        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs, retry_count=5,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        config.ErrorHandler.maxRetries = 1
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testSubscription = Subscription(id=1)  # You should only have one
        testSubscription.load()
        testSubscription.loadData()

        # Do we have files to start with?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        # Did we fail the files?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
        self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)

    def testE_FailJobs(self):
        """
        _FailJobs_

        Test our ability to fail jobs based on the information in the FWJR
        """
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                                "WMComponent_t/JobAccountant_t",
                                "fwjrs/badBackfillJobReport.pkl")

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath,
                                               fwjrPath=fwjrPath)
        # Second group without any FWJR attached to its jobs
        badJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                              workloadPath=workloadPath,
                                              fwjrPath=None,
                                              fileModifier='bad')

        config = self.getConfig()
        config.ErrorHandler.readFWJR = True
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
        changer.propagate(badJobGroup.jobs, 'created', 'new')
        changer.propagate(badJobGroup.jobs, 'executing', 'created')
        changer.propagate(badJobGroup.jobs, 'complete', 'executing')
        changer.propagate(badJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.exitCodesNoRetry = [8020]
        testErrorHandler.algorithm(None)

        # This should exhaust all jobs due to exit code
        # Except those with no fwjr
        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), self.nJobs)
        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        config.ErrorHandler.maxFailTime = -10
        testErrorHandler2 = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler2.reqAuxDB = None

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler2.algorithm(None)

        # This should exhaust all jobs due to timeout
        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), self.nJobs)
        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        config.ErrorHandler.maxFailTime = 24 * 3600
        config.ErrorHandler.passExitCodes = [8020]
        testErrorHandler3 = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler3.reqAuxDB = None

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler3.algorithm(None)

        # This should pass all jobs due to exit code
        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        return

    @attr('integration')
    def testZ_Profile(self):
        """
        _testProfile_

        Do a full profile of the poller
        """
        nJobs = 100

        workloadName = 'TestWorkload'
        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=nJobs,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)

        # Profile a single polling cycle and dump the stats to a file
        startTime = time.time()
        cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(),
                        filename="profStats.stat")
        stopTime = time.time()

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), nJobs)

        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Split files into a number of lumis per job
    Allow a flag to determine if we split files between jobs

    Keyword arguments read here: lumis_per_job, halt_job_on_file_boundaries,
    ignore_acdc_except, collectionName, splitOnRun, include_parents,
    runWhitelist, runs, lumis, deterministicPileup and performance; plus
    couchURL, couchDB, filesetName, owner and group when collectionName
    is given.

    When collectionName is set, the lumi whitelist is loaded from ACDC.
    If that fails and ignore_acdc_except is false, the error is logged and
    the method returns without creating anything; if it is true, the
    error is logged and an empty whitelist is used instead.

    NOTE(review): several of the parsed options are not used in the part
    of the routine visible here - confirm against the complete module.
    """
    myThread = threading.currentThread()

    lumisPerJob = int(kwargs.get('lumis_per_job', 1))
    splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
    ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
    collectionName = kwargs.get('collectionName', None)
    splitOnRun = kwargs.get('splitOnRun', True)
    getParents = kwargs.get('include_parents', False)
    runWhitelist = kwargs.get('runWhitelist', [])
    runs = kwargs.get('runs', None)
    lumis = kwargs.get('lumis', None)
    deterministicPileup = kwargs.get('deterministicPileup', False)
    eventsPerLumiInDataset = 0

    if deterministicPileup and self.package == 'WMCore.WMBS':
        getJobNumber = self.daoFactory(classname="Jobs.GetNumberOfJobsPerWorkflow")
        jobNumber = getJobNumber.execute(workflow=self.subscription.getWorkflow().id)
        self.nJobs = jobNumber

    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))

    goodRunList = {}
    if runs and lumis:
        goodRunList = buildLumiMask(runs, lumis)

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            collectionName = kwargs.get('collectionName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')

            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
        except Exception as ex:
            # "as" form works on Python 2.6+ and 3; the comma form is
            # a syntax error on Python 3.
            msg = "Exception while trying to load goodRunList\n"
            if ignoreACDC:
                msg += "Ditching goodRunList\n"
            else:
                msg += "Refusing to create any jobs.\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            if not ignoreACDC:
                return
            goodRunList = {}
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them

    Failed jobs are sorted into cooloff (retry later), retrydone (no more
    retries) and pass (resubmit at once) and moved to the matching state
    through ChangeState. Exhausted jobs get their input files failed and
    are recorded in ACDC.
    """

    def __init__(self, config):
        """
        Initialise class members

        :param config: configuration object; reads the ErrorHandler
            section (maxRetries is mandatory, maxProcessSize,
            failureExitCodes, maxFailTime, readFWJR and passExitCodes are
            optional) and ACDC.couchurl / ACDC.database.
        :raises ErrorHandlerException: if maxRetries is a dict without a
            'default' entry.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        # A plain value is taken as the limit for every job type
        self.maxRetries = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        # Copy the configured codes: the list is extended below and must
        # not grow inside the shared config on every instantiation.
        self.exitCodes = list(getattr(self.config.ErrorHandler, 'failureExitCodes', []))
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                    database=config.ACDC.database)

        # initialize the alert framework (if available - config.Alert present)
        #    self.sendAlert will be then be available
        self.initAlerts(compName="ErrorHandler")

        # Some exit codes imply an immediate failure, non-configurable
        self.exitCodes.extend(WMJobPermanentSystemErrors)

        return

    def setup(self, parameters=None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion: move the jobs from 'retrydone' to
        'exhausted', fail their input files and hand them to ACDC.
        """
        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        self.handleACDC(jobList)

        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries

        :param jobList: jobs currently in the '<state>failed' state
        :param state: failure stage prefix used to build the state names
            ('%sfailed', '%scooloff'); jobs with state 'create' are never
            retried.
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed" % (len(jobList), state))
        retrydoneJobs = []
        cooloffJobs = []
        passJobs = []

        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default'])
            if job['retry_count'] < allowedRetries and state != 'create':
                # Retries < allowed max retry count
                cooloffJobs.append(job)
            else:
                # Retries >= allowed max retry count, or failed in creation
                retrydoneJobs.append(job)
                msg = "Stopping retries for job %d" % job['id']
                logging.debug(msg)
                self.sendAlert(4, msg=msg)
                logging.debug("JobInfo: %s" % job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        if len(retrydoneJobs) > 0:
            self.changeState.propagate(retrydoneJobs, 'retrydone',
                                       '%sfailed' % state, updatesummary=True)
        if len(cooloffJobs) > 0:
            self.changeState.propagate(cooloffJobs, '%scooloff' % state,
                                       '%sfailed' % state, updatesummary=True)
        if len(passJobs) > 0:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works

        The jobs are reloaded in full by id, their masks are loaded and
        the result is passed to the data collection service.
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs" % len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs
        and determine those that can be retried
        and which must be retried without going through cooloff.
        Returns a triplet with cooloff, passed and exhausted jobs.

        Jobs without a readable FWJR, or whose FWJR raises while being
        inspected, are sent to cooloff.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []

        for job in jobList:
            report = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error("No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff." % job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error("Failed to find FWJR for job %i in location %s.\n Passing it to cooloff." % (job['id'], reportPath))
                cooloffJobs.append(job)
                continue
            try:
                report.load(reportPath)

                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                if startTime is None or stopTime is None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i" % job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    msg = "Job %i exhausted after running on node for %i seconds" % (job['id'], stopTime - startTime)
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                # Permanent-failure exit codes take precedence over pass codes
                if any(x in self.exitCodes for x in report.getExitCodes()):
                    msg = "Job %i exhausted due to a bad exit code (%s)" % (job['id'], str(report.getExitCodes()))
                    logging.error(msg)
                    self.sendAlert(4, msg=msg)
                    exhaustJobs.append(job)
                    continue

                if any(x in self.passCodes for x in report.getExitCodes()):
                    msg = "Job %i restarted immediately due to an exit code (%s)" % (job['id'], str(report.getExitCodes()))
                    # The message used to be built and then dropped
                    logging.debug(msg)
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                # Best effort: an unreadable report must not stop the cycle
                logging.warning("Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    An event base splitting algorithm.  All available files are split into a
    set number of events per job.

    Keyword arguments read here: events_per_job (default 100),
    events_per_lumi (defaults to events_per_job), include_parents,
    lheInputFiles, collectionName, deterministicPileup and performance;
    plus couchURL, couchDB, filesetName, owner and group when
    collectionName is given.

    Files whose LFN starts with "MCFakeFile" are split on events and lumis,
    any other file on events only. When ACDC information was loaded, jobs
    are only created (through createACDCJobs) for files listed there.

    :raises RuntimeError: if events_per_job or events_per_lumi is not
        positive; such values would otherwise never advance the splitting
        loops or divide by zero.
    """
    eventsPerJob = int(kwargs.get("events_per_job", 100))
    eventsPerLumi = int(kwargs.get("events_per_lumi", eventsPerJob))

    # Fail fast, before any job or group gets created
    if eventsPerJob <= 0 or eventsPerLumi <= 0:
        msg = "events_per_job and events_per_lumi must be positive. Their values are: "
        msg += "events_per_job: %d, events_per_lumi: %d" % (eventsPerJob, eventsPerLumi)
        raise RuntimeError(msg)

    getParents = kwargs.get("include_parents", False)
    lheInput = kwargs.get("lheInputFiles", False)
    collectionName = kwargs.get('collectionName', None)
    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))
    acdcFileList = []
    deterministicPileup = kwargs.get('deterministicPileup', False)

    if deterministicPileup and self.package == 'WMCore.WMBS':
        getJobNumber = self.daoFactory(classname="Jobs.GetNumberOfJobsPerWorkflow")
        self.nJobs = getJobNumber.execute(workflow=self.subscription.getWorkflow().id)
        logging.info('Creating %d jobs in DeterministicPileup mode', self.nJobs)

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            collectionName = kwargs.get('collectionName')
            owner = kwargs.get('owner')
            group = kwargs.get('group')

            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            acdcFileList = dcs.getProductionACDCInfo(collectionName, filesetName, owner, group)
        except Exception as ex:
            msg = "Exception while trying to load goodRunList\n"
            msg += "Refusing to create any jobs.\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            return

    # Built once instead of once per input file
    acdcLFNs = set(x['lfn'] for x in acdcFileList)

    totalJobs = 0

    locationDict = self.sortByLocation()
    for location in locationDict:
        self.newGroup()
        fileList = locationDict[location]
        getRunLumiInformation = False
        for f in fileList:
            if f['lfn'].startswith("MCFakeFile"):
                # We have one MCFakeFile, then it needs run information
                getRunLumiInformation = True
                break
        if getRunLumiInformation:
            if self.package == 'WMCore.WMBS':
                loadRunLumi = self.daoFactory(classname="Files.GetBulkRunLumi")
                fileLumis = loadRunLumi.execute(files=fileList)
                for f in fileList:
                    lumiDict = fileLumis.get(f['id'], {})
                    for run in lumiDict:
                        f.addRun(run=Run(run, *lumiDict[run]))

        for f in fileList:
            currentEvent = f['first_event']
            eventsInFile = f['events']
            runs = list(f['runs'])
            # We got the runs, clean the file.
            f['runs'] = set()

            if getParents:
                parentLFNs = self.findParent(lfn=f['lfn'])
                for lfn in parentLFNs:
                    parent = File(lfn=lfn)
                    f['parents'].add(parent)

            if acdcFileList:
                if f['lfn'] in acdcLFNs:
                    totalJobs = self.createACDCJobs(f, acdcFileList, timePerEvent,
                                                    sizePerEvent, memoryRequirement,
                                                    lheInput, eventsPerJob, eventsPerLumi,
                                                    deterministicPileup, totalJobs)
                # In ACDC mode no regular job is made for any file
                continue

            if not f['lfn'].startswith("MCFakeFile"):
                # Very very uncommon, but it has real input dataset
                if eventsInFile >= eventsPerJob:
                    while currentEvent < eventsInFile:
                        self.newJob(name=self.getJobName(length=totalJobs))
                        self.currentJob.addFile(f)
                        if eventsPerJob + currentEvent < eventsInFile:
                            jobTime = eventsPerJob * timePerEvent
                            diskRequired = eventsPerJob * sizePerEvent
                            self.currentJob["mask"].setMaxAndSkipEvents(eventsPerJob, currentEvent)
                        else:
                            # Last job of the file: no upper event limit
                            jobTime = (eventsInFile - currentEvent) * timePerEvent
                            diskRequired = (eventsInFile - currentEvent) * sizePerEvent
                            self.currentJob["mask"].setMaxAndSkipEvents(None, currentEvent)
                        self.currentJob.addResourceEstimates(jobTime=jobTime,
                                                             memory=memoryRequirement,
                                                             disk=diskRequired)
                        if deterministicPileup:
                            self.currentJob.addBaggageParameter("skipPileupEvents",
                                                                (self.nJobs - 1) * eventsPerJob)

                        logging.debug("Job created for real input with %s", self.currentJob)
                        currentEvent += eventsPerJob
                        totalJobs += 1
                else:
                    self.newJob(name=self.getJobName(length=totalJobs))
                    self.currentJob.addFile(f)
                    jobTime = eventsInFile * timePerEvent
                    diskRequired = eventsInFile * sizePerEvent
                    self.currentJob.addResourceEstimates(jobTime=jobTime,
                                                         memory=memoryRequirement,
                                                         disk=diskRequired)
                    if deterministicPileup:
                        self.currentJob.addBaggageParameter("skipPileupEvents",
                                                            (self.nJobs - 1) * eventsPerJob)
                    logging.debug("Last job created for real input with %s", self.currentJob)
                    totalJobs += 1
            else:
                # This assumes there's only one run which is the case for MC
                lumis = runs[0].lumis
                (firstLumi, lastLumi) = (min(lumis), max(lumis))
                currentLumi = firstLumi
                totalEvents = 0
                if eventsInFile >= eventsPerJob:
                    while totalEvents < eventsInFile:
                        self.newJob(name=self.getJobName(length=totalJobs))
                        self.currentJob.addFile(f)
                        self.currentJob.addBaggageParameter("lheInputFiles", lheInput)
                        lumisPerJob = int(ceil(float(eventsPerJob) / eventsPerLumi))
                        # Limit the number of events to a unsigned 32bit int
                        eventsRemaining = eventsInFile - totalEvents
                        if (currentEvent + eventsPerJob - 1) > (2 ** 32 - 1) and (
                                currentEvent + eventsRemaining - 1) > (2 ** 32 - 1):
                            currentEvent = 1
                        if eventsRemaining > eventsPerJob:
                            self.currentJob["mask"].setMaxAndSkipEvents(eventsPerJob, currentEvent)
                            self.currentJob["mask"].setMaxAndSkipLumis(lumisPerJob, currentLumi)
                            jobTime = eventsPerJob * timePerEvent
                            diskRequired = eventsPerJob * sizePerEvent
                        else:
                            jobTime = eventsRemaining * timePerEvent
                            diskRequired = eventsRemaining * sizePerEvent
                            lumisPerJob = int(ceil(float(eventsRemaining) / eventsPerLumi))
                            self.currentJob["mask"].setMaxAndSkipEvents(eventsRemaining, currentEvent)
                            self.currentJob["mask"].setMaxAndSkipLumis(lumisPerJob, currentLumi)

                        if deterministicPileup:
                            self.currentJob.addBaggageParameter("skipPileupEvents",
                                                                (self.nJobs - 1) * eventsPerJob)
                        currentLumi += lumisPerJob
                        currentEvent += eventsPerJob
                        totalEvents += eventsPerJob
                        totalJobs += 1
                        self.currentJob.addResourceEstimates(jobTime=jobTime,
                                                             memory=memoryRequirement,
                                                             disk=diskRequired)
                else:
                    self.newJob(name=self.getJobName(length=totalJobs))
                    self.currentJob.addFile(f)
                    # For MC we use firstEvent instead of skipEvents so set it to 1
                    # We must check for events going over 2**32 - 1 here too
                    if (eventsInFile + currentEvent - 1) > (2 ** 32 - 1):
                        currentEvent = 1
                    self.currentJob["mask"].setMaxAndSkipEvents(eventsInFile, currentEvent)
                    self.currentJob["mask"].setMaxAndSkipLumis(lastLumi - currentLumi + 1, currentLumi)
                    jobTime = eventsInFile * timePerEvent
                    diskRequired = eventsInFile * sizePerEvent
                    self.currentJob.addResourceEstimates(jobTime=jobTime,
                                                         memory=memoryRequirement,
                                                         disk=diskRequired)
                    if deterministicPileup:
                        self.currentJob.addBaggageParameter("skipPileupEvents",
                                                            (self.nJobs - 1) * eventsPerJob)
                    totalJobs += 1
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    An event base splitting algorithm.  All available files are split into a
    set number of events per job.

    Keyword arguments read from kwargs (all optional):
      events_per_job      -- events per job, cast to int (default 100)
      events_per_lumi     -- events per lumi, cast to int
                             (default: the events_per_job value)
      include_parents     -- when true, parents found by findParent() are
                             attached to every file
      lheInputFiles       -- value stored as the "lheInputFiles" baggage
                             parameter of MCFakeFile jobs (default False)
      collectionName, filesetName, couchURL, couchDB
                          -- when collectionName is set, ACDC information is
                             loaded and job creation is delegated to
                             createACDCJobs()
      performance         -- dict handed to getPerformanceParameters()
      deterministicPileup -- when true, each job gets a "skipPileupEvents"
                             baggage parameter

    Raises RuntimeError when events_per_job or events_per_lumi is not
    positive.  Returns None; jobs are produced through newGroup()/newJob().
    If loading the ACDC information fails, the error is logged and the
    method returns without creating any job.
    """
    eventsPerJob = int(kwargs.get("events_per_job", 100))
    eventsPerLumi = int(kwargs.get("events_per_lumi", eventsPerJob))
    getParents = kwargs.get("include_parents", False)
    lheInput = kwargs.get("lheInputFiles", False)
    collectionName = kwargs.get('collectionName', None)
    timePerEvent, sizePerEvent, memoryRequirement = \
        self.getPerformanceParameters(kwargs.get('performance', {}))
    acdcFileList = []
    deterministicPileup = kwargs.get('deterministicPileup', False)

    # Fail fast: zero or negative values cannot describe a job
    if eventsPerJob <= 0 or eventsPerLumi <= 0:
        msg = "events_per_job and events_per_lumi must be positive. Their values are: "
        msg += "events_per_job: %d, events_per_lumi: %d" % (eventsPerJob, eventsPerLumi)
        raise RuntimeError(msg)

    # self.nJobs feeds the skipPileupEvents computation further down.
    # NOTE(review): it is only assigned here for the WMCore.WMBS package;
    # with any other package it must already exist - confirm.
    if deterministicPileup and self.package == 'WMCore.WMBS':
        getJobNumber = self.daoFactory(classname="Jobs.GetNumberOfJobsPerWorkflow")
        self.nJobs = getJobNumber.execute(workflow=self.subscription.getWorkflow().id)
        logging.info('Creating jobs in DeterministicPileup mode for %s',
                     self.subscription.workflowName())

    # If we have runLumi info, we need to load it from couch
    if collectionName:
        try:
            from WMCore.ACDC.DataCollectionService import DataCollectionService
            couchURL = kwargs.get('couchURL')
            couchDB = kwargs.get('couchDB')
            filesetName = kwargs.get('filesetName')
            collectionName = kwargs.get('collectionName')
            logging.info('Loading ACDC info for collectionName: %s, with filesetName: %s',
                         collectionName, filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            acdcFileList = dcs.getProductionACDCInfo(collectionName, filesetName)
        except Exception as ex:
            # Any failure while loading the ACDC data aborts the whole
            # splitting call: it is logged and no job is created.
            msg = "Exception while trying to load goodRunList\n"
            msg += "Refusing to create any jobs.\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            return

    # Running count of created jobs, passed to getJobName() for each new job
    totalJobs = 0

    locationDict = self.sortByLocation()
    for location in locationDict:
        # One job group per key of locationDict
        self.newGroup()
        fileList = locationDict[location]
        getRunLumiInformation = False
        for f in fileList:
            if f['lfn'].startswith("MCFakeFile"):
                # We have one MCFakeFile, then it needs run information
                getRunLumiInformation = True
                break
        if getRunLumiInformation:
            if self.package == 'WMCore.WMBS':
                # Bulk-load the run/lumi information and attach it to the files
                loadRunLumi = self.daoFactory(classname="Files.GetBulkRunLumi")
                fileLumis = loadRunLumi.execute(files=fileList)
                if not fileLumis:
                    logging.warning("Empty fileLumis dict for workflow %s, subs %s.",
                                    self.subscription.workflowName(), self.subscription['id'])
                for f in fileList:
                    lumiDict = fileLumis.get(f['id'], {})
                    for run in lumiDict:
                        f.addRun(run=Run(run, *lumiDict[run]))

        for f in fileList:
            currentEvent = f['first_event']
            eventsInFile = f['events']
            runs = list(f['runs'])
            # We got the runs, clean the file.
            f['runs'] = set()

            if getParents:
                parentLFNs = self.findParent(lfn=f['lfn'])
                for lfn in parentLFNs:
                    parent = File(lfn=lfn)
                    f['parents'].add(parent)

            if acdcFileList:
                # ACDC recovery: job creation for this file is delegated,
                # the helper hands back the updated job counter
                totalJobs = self.createACDCJobs(f, acdcFileList, timePerEvent, sizePerEvent,
                                                memoryRequirement, lheInput, eventsPerJob,
                                                eventsPerLumi, deterministicPileup, totalJobs)
                continue
            if not f['lfn'].startswith("MCFakeFile"):
                # there might be files with 0 event that still have to be processed
                if eventsInFile == 0:
                    self.newJob(name=self.getJobName(length=totalJobs))
                    self.currentJob.addFile(f)
                    # Do not set LastEvent
                    self.currentJob["mask"].setMaxAndSkipEvents(None, currentEvent)
                    self.currentJob.addResourceEstimates(jobTime=0, memory=memoryRequirement, disk=0)
                    if deterministicPileup:
                        self.currentJob.addBaggageParameter("skipPileupEvents",
                                                            (self.nJobs - 1) * eventsPerJob)
                    totalJobs += 1
                    logging.info("Job created for 0-event input file with %s", self.currentJob)
                # Very very uncommon in production, but it has real input dataset
                while eventsInFile:
                    self.newJob(name=self.getJobName(length=totalJobs))
                    self.currentJob.addFile(f)
                    if eventsInFile >= eventsPerJob:
                        jobTime = eventsPerJob * timePerEvent
                        diskRequired = eventsPerJob * sizePerEvent
                        self.currentJob["mask"].setMaxAndSkipEvents(eventsPerJob - 1, currentEvent)
                    else:
                        jobTime = eventsInFile * timePerEvent
                        diskRequired = eventsInFile * sizePerEvent
                        self.currentJob["mask"].setMaxAndSkipEvents(eventsInFile - 1, currentEvent)
                        # Last, short job: force the counter to exactly one
                        # job's worth so the subtraction below ends the loop
                        eventsInFile = eventsPerJob
                    self.currentJob.addResourceEstimates(jobTime=jobTime, memory=memoryRequirement,
                                                         disk=diskRequired)
                    if deterministicPileup:
                        self.currentJob.addBaggageParameter("skipPileupEvents",
                                                            (self.nJobs - 1) * eventsPerJob)
                    eventsInFile -= eventsPerJob
                    currentEvent += eventsPerJob
                    totalJobs += 1
                    logging.debug("Job created for real input with %s", self.currentJob)
            else:
                # This assumes there's only one run which is the case for MC
                lumis = runs[0].lumis
                (firstLumi, lastLumi) = (min(lumis), max(lumis))
                currentLumi = firstLumi
                lumisPerJob = int(ceil(float(eventsPerJob) / eventsPerLumi))
                while eventsInFile:
                    self.newJob(name=self.getJobName(length=totalJobs))
                    self.currentJob.addFile(f)
                    self.currentJob.addBaggageParameter("lheInputFiles", lheInput)
                    # Limit the number of events to a unsigned 32bit int
                    if (currentEvent + eventsPerJob - 1) > (2 ** 32 - 1) and \
                            (currentEvent + eventsInFile) > (2 ** 32 - 1):
                        currentEvent = 1
                    if eventsInFile >= eventsPerJob:
                        jobTime = eventsPerJob * timePerEvent
                        diskRequired = eventsPerJob * sizePerEvent
                        # Alan on 16/Apr/2019: inclusiveMask must be a real inclusiveMask, thus
                        # FirstEvent/FirstLumi and LastEvent/LastLumi are also processed by the job
                        self.currentJob["mask"].setMaxAndSkipEvents(eventsPerJob - 1, currentEvent)
                        self.currentJob["mask"].setMaxAndSkipLumis(lumisPerJob - 1, currentLumi)
                    else:
                        jobTime = eventsInFile * timePerEvent
                        diskRequired = eventsInFile * sizePerEvent
                        # The short last job only needs the lumis that cover
                        # the remaining events
                        lumisPerJob = int(ceil(float(eventsInFile) / eventsPerLumi))
                        self.currentJob["mask"].setMaxAndSkipEvents(eventsInFile - 1, currentEvent)
                        self.currentJob["mask"].setMaxAndSkipLumis(lumisPerJob - 1, currentLumi)
                        # Same sentinel as for real input: the subtraction
                        # below brings the counter to zero and ends the loop
                        eventsInFile = eventsPerJob
                    self.currentJob.addResourceEstimates(jobTime=jobTime, memory=memoryRequirement,
                                                         disk=diskRequired)
                    if deterministicPileup:
                        self.currentJob.addBaggageParameter("skipPileupEvents",
                                                            (self.nJobs - 1) * eventsPerJob)
                    eventsInFile -= eventsPerJob
                    currentEvent += eventsPerJob
                    currentLumi += lumisPerJob
                    totalJobs += 1
                    logging.info("Job created with mask: %s", self.currentJob['mask'])

    return
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them

    On every polling cycle the jobs found in the createfailed, submitfailed
    and jobfailed states are moved either to the matching cooloff state or
    to retrydone; jobs found in retrydone are then exhausted, their input
    files failed and (except for LogCollect/Cleanup jobs) recorded in ACDC.
    """

    def __init__(self, config):
        """
        Initialise class members

        Reads from config.ErrorHandler: maxRetries (mandatory; a number or a
        dict keyed by job type that must contain 'default'), and the optional
        maxProcessSize (250), maxFailTime (32 hours, in seconds), readFWJR
        (False) and passExitCodes ([]).

        Raises ErrorHandlerException when maxRetries is a dict without a
        'default' entry.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        # Normalise maxRetries to a dict keyed by job type
        self.maxRetries = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException(
                'Max retries for the default job type must be specified')

        # Refreshed from the auxiliary DB in readFWJRForErrors()
        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                    database=config.ACDC.database)

        # No auxiliary DB handle is created when a Tier0Feeder section exists
        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

        return

    def setup(self, parameters=None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion: move the jobs from retrydone to
        exhausted, fail their input files and build ACDC records for every
        job that is not of a utilitarian type.
        """
        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        # Do not build ACDC for utilitarian job types
        acdcJobs = [job for job in jobList
                    if job['type'] not in ['LogCollect', 'Cleanup']]
        self.handleACDC(acdcJobs)

        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries: jobs below their allowed retry count go to
        the '<state>cooloff' state, all others go to 'retrydone'.  Jobs that
        failed in the 'create' state are never retried.  When readFWJR is
        enabled, the cooloff candidates are re-classified from their
        framework job report.
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed",
                     len(jobList), state)
        retrydoneJobs = []
        cooloffJobs = []
        passJobs = []

        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default'])
            # Retries < allowed max retry count
            if job['retry_count'] < allowedRetries and state != 'create':
                cooloffJobs.append(job)
            # Check if Retries >= allowed max retry count
            elif job['retry_count'] >= allowedRetries or state == 'create':
                retrydoneJobs.append(job)
                logging.debug("Stopping retries for job %d", job['id'])
                logging.debug("JobInfo: %s", job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        failedState = '%sfailed' % state
        if retrydoneJobs:
            self.changeState.propagate(retrydoneJobs, 'retrydone', failedState,
                                       updatesummary=True)
        if cooloffJobs:
            self.changeState.propagate(cooloffJobs, '%scooloff' % state, failedState,
                                       updatesummary=True)
        if passJobs:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works: reload the jobs with their
        full metadata and masks and hand them to the data collection service.
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs", len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs and sort them into three groups:
          - cooloff: jobs to be retried after a cooloff period; also the
            fallback when the report is missing or cannot be processed
          - passed: jobs with one of the pass exit codes, which are retried
            immediately without going through cooloff
          - exhausted: jobs that ran for longer than maxFailTime or that
            have one of the no-retry exit codes

        Returns a triplet with cooloff, passed and exhausted jobs.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []

        # Refresh the list of exit codes that must not be retried
        if self.reqAuxDB:
            agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
            self.exitCodesNoRetry = agentConfig.get("NoRetryExitCodes", [])

        for job in jobList:
            report = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error(
                    "No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff.",
                    job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error(
                    "Failed to find FWJR for job %i in location %s.\n Passing it to cooloff.",
                    job['id'], reportPath)
                cooloffJobs.append(job)
                continue

            try:
                report.load(reportPath)

                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                # correct the location if the original location is different from recorded in wmbs
                # WARNING: we are not updating job location in wmbs only updating in couchdb by doing this.
                # If location in wmbs needs to be updated, it should happen in JobAccountant.
                locationFromFWJR = report.getSiteName()
                if locationFromFWJR:
                    job["location"] = locationFromFWJR
                    job["site_cms_name"] = locationFromFWJR

                if startTime is None or stopTime is None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i", job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    logging.debug("Job %i exhausted after running on node for %i seconds",
                                  job['id'], stopTime - startTime)
                    exhaustJobs.append(job)
                    continue

                exitCodes = report.getExitCodes()
                if any(code in self.exitCodesNoRetry for code in exitCodes):
                    logging.error("Job %i exhausted due to a bad exit code (%s)",
                                  job['id'], str(exitCodes))
                    exhaustJobs.append(job)
                    continue

                if any(code in self.passCodes for code in exitCodes):
                    logging.debug("Job %i restarted immediately due to an exit code (%s)",
                                  job['id'], str(exitCodes))
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                # Deliberate best effort: a report that cannot be processed
                # must not block the job, it simply goes through cooloff.
                logging.warning("Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs

    def handleRetryDoneJobs(self, jobList):
        """
        _handleRetryDoneJobs_

        Exhaust the given retrydone jobs inside one database transaction.
        """
        myThread = threading.currentThread()
        logging.info("About to process %d retry done jobs", len(jobList))
        myThread.transaction.begin()
        self.exhaustJobs(jobList)
        myThread.transaction.commit()

        return

    def handleFailedJobs(self, jobList, state):
        """
        _handleFailedJobs_

        Process the retries of the given failed jobs inside one database
        transaction.  state is the failure state prefix ('create', 'submit'
        or 'job').
        """
        myThread = threading.currentThread()
        logging.info("About to process %d failures", len(jobList))
        myThread.transaction.begin()
        self.processRetries(jobList, state)
        myThread.transaction.commit()

        return

    def handleErrors(self):
        """
        _handleErrors_

        Query the database for the jobs in the createfailed, submitfailed
        and jobfailed states and process their retries, then do the same for
        the jobs in the retrydone state and exhaust them.  Jobs are handled
        in slices of at most maxProcessSize.
        """
        # Run over created, submitted and executed job failures
        failure_states = ['create', 'submit', 'job']
        for state in failure_states:
            idList = self.getJobs.execute(state="%sfailed" % state)
            logging.info("Found %d failed jobs in state %sfailed", len(idList), state)
            while idList:
                tmpList = idList[:self.maxProcessSize]
                idList = idList[self.maxProcessSize:]
                jobList = self.loadJobsFromList(tmpList)
                self.handleFailedJobs(jobList, state)

        # Run over jobs done with retries
        idList = self.getJobs.execute(state='retrydone')
        logging.info("Found %d jobs done with all retries", len(idList))
        while idList:
            tmpList = idList[:self.maxProcessSize]
            idList = idList[self.maxProcessSize:]
            jobList = self.loadJobsFromList(tmpList)
            self.handleRetryDoneJobs(jobList)

        return

    @staticmethod
    def _loadJobs(loadDAO, idList):
        """
        __loadJobs_

        Run the given job loading DAO for every id in idList and wrap each
        returned entry in a Job object.
        """
        binds = [{"jobid": jobID} for jobID in idList]
        results = loadDAO.execute(jobID=binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id=entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    def loadJobsFromList(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk
        """
        return self._loadJobs(self.idLoad, idList)

    def loadJobsFromListFull(self, idList):
        """
        _loadJobsFromListFull_

        Load jobs in bulk.
        Include the full metadata.
        """
        return self._loadJobs(self.loadAction, idList)

    @timeFunction
    def algorithm(self, parameters=None):
        """
        Performs the handleErrors method, looking for each type of failure
        And deal with it as desired.

        Couch/HTTP connection problems are logged and the open transaction
        is rolled back, so the work is retried on the next polling cycle.
        Any other exception is rolled back, logged and re-raised as an
        ErrorHandlerException.
        """
        logging.debug("Running error handling algorithm")
        # Bound before the try block so that both handlers can always use it
        myThread = threading.currentThread()
        try:
            self.handleErrors()
        except (CouchConnectionError, HTTPException) as ex:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Caught CouchConnectionError/HTTPException exception in ErrorHandler. "
            msg += "Transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.error(msg)
        except Exception as ex:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Caught unexpected exception in ErrorHandler:\n"
            msg += str(ex)
            logging.exception(msg)
            raise ErrorHandlerException(msg)
class AccountantWorker(WMConnectionBase): """ Class that actually does the work of parsing FWJRs for the Accountant Run through ProcessPool """ def __init__(self, config): """ __init__ Create all DAO objects that are used by this class. """ WMConnectionBase.__init__(self, "WMCore.WMBS") myThread = threading.currentThread() self.dbsDaoFactory = DAOFactory(package = "WMComponent.DBS3Buffer", logger = myThread.logger, dbinterface = myThread.dbi) self.getOutputMapAction = self.daofactory(classname = "Jobs.GetOutputMap") self.bulkAddToFilesetAction = self.daofactory(classname = "Fileset.BulkAddByLFN") self.bulkParentageAction = self.daofactory(classname = "Files.AddBulkParentage") self.getJobTypeAction = self.daofactory(classname = "Jobs.GetType") self.getParentInfoAction = self.daofactory(classname = "Files.GetParentInfo") self.setParentageByJob = self.daofactory(classname = "Files.SetParentageByJob") self.setParentageByMergeJob = self.daofactory(classname = "Files.SetParentageByMergeJob") self.setFileRunLumi = self.daofactory(classname = "Files.AddRunLumi") self.setFileLocation = self.daofactory(classname = "Files.SetLocationByLFN") self.setFileAddChecksum = self.daofactory(classname = "Files.AddChecksumByLFN") self.addFileAction = self.daofactory(classname = "Files.Add") self.jobCompleteInput = self.daofactory(classname = "Jobs.CompleteInput") self.setBulkOutcome = self.daofactory(classname = "Jobs.SetOutcomeBulk") self.getWorkflowSpec = self.daofactory(classname = "Workflow.GetSpecAndNameFromTask") self.getJobInfoByID = self.daofactory(classname = "Jobs.LoadFromID") self.getFullJobInfo = self.daofactory(classname = "Jobs.LoadForErrorHandler") self.getJobTaskNameAction = self.daofactory(classname = "Jobs.GetFWJRTaskName") self.dbsStatusAction = self.dbsDaoFactory(classname = "DBSBufferFiles.SetStatus") self.dbsParentStatusAction = self.dbsDaoFactory(classname = "DBSBufferFiles.GetParentStatus") self.dbsChildrenAction = self.dbsDaoFactory(classname = 
"DBSBufferFiles.GetChildren") self.dbsCreateFiles = self.dbsDaoFactory(classname = "DBSBufferFiles.Add") self.dbsSetLocation = self.dbsDaoFactory(classname = "DBSBufferFiles.SetLocationByLFN") self.dbsInsertLocation = self.dbsDaoFactory(classname = "DBSBufferFiles.AddLocation") self.dbsSetChecksum = self.dbsDaoFactory(classname = "DBSBufferFiles.AddChecksumByLFN") self.dbsSetRunLumi = self.dbsDaoFactory(classname = "DBSBufferFiles.AddRunLumi") self.dbsGetWorkflow = self.dbsDaoFactory(classname = "ListWorkflow") self.dbsLFNHeritage = self.dbsDaoFactory(classname = "DBSBufferFiles.BulkHeritageParent") self.stateChanger = ChangeState(config) # Decide whether or not to attach jobReport to returned value self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False) # Store location for the specs for DBS self.specDir = getattr(config.JobAccountant, 'specDir', None) # ACDC service self.dataCollection = DataCollectionService(url = config.ACDC.couchurl, database = config.ACDC.database) jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url'] jobDBName = config.JobStateMachine.couchDBName jobCouchdb = CouchServer(jobDBurl) self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName) self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL, appName="WMStatsAgent") # Hold data for later commital self.dbsFilesToCreate = [] self.wmbsFilesToBuild = [] self.wmbsMergeFilesToBuild = [] self.fileLocation = None self.mergedOutputFiles = [] self.listOfJobsToSave = [] self.listOfJobsToFail = [] self.filesetAssoc = [] self.parentageBinds = [] self.parentageBindsForMerge = [] self.jobsWithSkippedFiles = {} self.count = 0 self.datasetAlgoID = collections.deque(maxlen = 1000) self.datasetAlgoPaths = collections.deque(maxlen = 1000) self.dbsLocations = set() self.workflowIDs = collections.deque(maxlen = 1000) self.workflowPaths = collections.deque(maxlen = 1000) self.phedex = PhEDEx() self.locLists = self.phedex.getNodeMap() return def 
reset(self): """ _reset_ Reset all global vars between runs. """ self.dbsFilesToCreate = [] self.wmbsFilesToBuild = [] self.wmbsMergeFilesToBuild = [] self.fileLocation = None self.mergedOutputFiles = [] self.listOfJobsToSave = [] self.listOfJobsToFail = [] self.filesetAssoc = [] self.parentageBinds = [] self.parentageBindsForMerge = [] self.jobsWithSkippedFiles = {} gc.collect() return def loadJobReport(self, parameters): """ _loadJobReport_ Given a framework job report on disk, load it and return a FwkJobReport instance. If there is any problem loading or parsing the framework job report return None. """ # The jobReportPath may be prefixed with "file://" which needs to be # removed so it doesn't confuse the FwkJobReport() parser. jobReportPath = parameters.get("fwjr_path", None) if not jobReportPath: logging.error("Bad FwkJobReport Path: %s" % jobReportPath) return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty") jobReportPath = jobReportPath.replace("file://","") if not os.path.exists(jobReportPath): logging.error("Bad FwkJobReport Path: %s" % jobReportPath) return self.createMissingFWKJR(parameters, 99999, 'Cannot find file in jobReport path: %s' % jobReportPath) if os.path.getsize(jobReportPath) == 0: logging.error("Empty FwkJobReport: %s" % jobReportPath) return self.createMissingFWKJR(parameters, 99998, 'jobReport of size 0: %s ' % jobReportPath) jobReport = Report() try: jobReport.load(jobReportPath) except Exception as ex: msg = "Error loading jobReport %s\n" % jobReportPath msg += str(ex) logging.error(msg) logging.debug("Failing job: %s\n" % parameters) return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport') if len(jobReport.listSteps()) == 0: logging.error("FwkJobReport with no steps: %s" % jobReportPath) return self.createMissingFWKJR(parameters, 99997, 'jobReport with no steps: %s ' % jobReportPath) return jobReport def isTaskExistInFWJR(self, jobReport, jobStatus): """ If taskName is not available in the FWJR, then 
tries to recover it getting data from the SQL database. """ if not jobReport.getTaskName(): logging.warning("Trying to recover a corrupted FWJR for a %s job with job id %s" % (jobStatus, jobReport.getJobID())) jobInfo = self.getJobTaskNameAction.execute(jobId = jobReport.getJobID(), conn = self.getDBConn(), transaction = self.existingTransaction()) jobReport.setTaskName(jobInfo['taskName']) jobReport.save(jobInfo['fwjr_path']) if not jobReport.getTaskName(): msg = "Report to developers. Failed to recover corrupted fwjr for %s job id %s" % (jobStatus, jobReport.getJobID()) raise AccountantWorkerException(msg) else: logging.info("TaskName '%s' successfully recovered and added to fwjr id %s." % (jobReport.getTaskName(), jobReport.getJobID())) return def __call__(self, parameters): """ __call__ Handle a completed job. The parameters dictionary will contain the job ID and the path to the framework job report. """ returnList = [] self.reset() for job in parameters: logging.info("Handling %s" % job["fwjr_path"]) # Load the job and set the ID fwkJobReport = self.loadJobReport(job) fwkJobReport.setJobID(job['id']) jobSuccess = self.handleJob(jobID = job["id"], fwkJobReport = fwkJobReport) if self.returnJobReport: returnList.append({'id': job["id"], 'jobSuccess': jobSuccess, 'jobReport': fwkJobReport}) else: returnList.append({'id': job["id"], 'jobSuccess': jobSuccess}) self.count += 1 self.beginTransaction() # Now things done at the end of the job # Do what we can with WMBS files self.handleWMBSFiles(self.wmbsFilesToBuild, self.parentageBinds) # handle merge files separately since parentage need to set # separately to support robust merge self.handleWMBSFiles(self.wmbsMergeFilesToBuild, self.parentageBindsForMerge) # Create DBSBufferFiles self.createFilesInDBSBuffer() # Handle filesetAssoc if len(self.filesetAssoc) > 0: self.bulkAddToFilesetAction.execute(binds = self.filesetAssoc, conn = self.getDBConn(), transaction = self.existingTransaction()) # Move successful jobs to 
successful if len(self.listOfJobsToSave) > 0: idList = [x['id'] for x in self.listOfJobsToSave] outcomeBinds = [{'jobid': x['id'], 'outcome': x['outcome']} for x in self.listOfJobsToSave] self.setBulkOutcome.execute(binds = outcomeBinds, conn = self.getDBConn(), transaction = self.existingTransaction()) self.jobCompleteInput.execute(id = idList, lfnsToSkip = self.jobsWithSkippedFiles, conn = self.getDBConn(), transaction = self.existingTransaction()) self.stateChanger.propagate(self.listOfJobsToSave, "success", "complete") # If we have failed jobs, fail them if len(self.listOfJobsToFail) > 0: outcomeBinds = [{'jobid': x['id'], 'outcome': x['outcome']} for x in self.listOfJobsToFail] self.setBulkOutcome.execute(binds = outcomeBinds, conn = self.getDBConn(), transaction = self.existingTransaction()) self.stateChanger.propagate(self.listOfJobsToFail, "jobfailed", "complete") # Arrange WMBS parentage if len(self.parentageBinds) > 0: self.setParentageByJob.execute(binds = self.parentageBinds, conn = self.getDBConn(), transaction = self.existingTransaction()) if len(self.parentageBindsForMerge) > 0: self.setParentageByMergeJob.execute(binds = self.parentageBindsForMerge, conn = self.getDBConn(), transaction = self.existingTransaction()) # Straighten out DBS Parentage if len(self.mergedOutputFiles) > 0: self.handleDBSBufferParentage() if len(self.jobsWithSkippedFiles) > 0: self.handleSkippedFiles() self.commitTransaction(existingTransaction = False) return returnList def outputFilesetsForJob(self, outputMap, merged, moduleLabel): """ _outputFilesetsForJob_ Determine if the file should be placed in any other fileset. Note that this will not return the JobGroup output fileset as all jobs will have their output placed there. 
""" if moduleLabel not in outputMap: logging.info("Output module label missing from output map.") return [] outputFilesets = [] for outputFileset in outputMap[moduleLabel]: if merged == False and outputFileset["output_fileset"] != None: outputFilesets.append(outputFileset["output_fileset"]) else: if outputFileset["merged_output_fileset"] != None: outputFilesets.append(outputFileset["merged_output_fileset"]) return outputFilesets def addFileToDBS(self, jobReportFile, task): """ _addFileToDBS_ Add a file that was output from a job to the DBS buffer. """ datasetInfo = jobReportFile["dataset"] dbsFile = DBSBufferFile(lfn = jobReportFile["lfn"], size = jobReportFile["size"], events = jobReportFile["events"], checksums = jobReportFile["checksums"], status = "NOTUPLOADED") dbsFile.setAlgorithm(appName = datasetInfo["applicationName"], appVer = datasetInfo["applicationVersion"], appFam = jobReportFile["module_label"], psetHash = "GIBBERISH", configContent = jobReportFile.get('configURL')) dbsFile.setDatasetPath("/%s/%s/%s" % (datasetInfo["primaryDataset"], datasetInfo["processedDataset"], datasetInfo["dataTier"])) dbsFile.setValidStatus(validStatus = jobReportFile.get("validStatus", None)) dbsFile.setProcessingVer(ver = jobReportFile.get('processingVer', None)) dbsFile.setAcquisitionEra(era = jobReportFile.get('acquisitionEra', None)) dbsFile.setGlobalTag(globalTag = jobReportFile.get('globalTag', None)) #TODO need to find where to get the prep id dbsFile.setPrepID(prep_id = jobReportFile.get('prep_id', None)) dbsFile['task'] = task for run in jobReportFile["runs"]: newRun = Run(runNumber = run.run) newRun.extend(run.lumis) dbsFile.addRun(newRun) dbsFile.setLocation(pnn = list(jobReportFile["locations"])[0], immediateSave = False) self.dbsFilesToCreate.append(dbsFile) return def findDBSParents(self, lfn): """ _findDBSParents_ Find the parent of the file in DBS This is meant to be called recursively """ parentsInfo = self.getParentInfoAction.execute([lfn], conn = 
self.getDBConn(), transaction = self.existingTransaction()) newParents = set() for parentInfo in parentsInfo: # This will catch straight to merge files that do not have redneck # parents. We will mark the straight to merge file from the job # as a child of the merged parent. if int(parentInfo["merged"]) == 1: newParents.add(parentInfo["lfn"]) elif parentInfo['gpmerged'] == None: continue # Handle the files that result from merge jobs that aren't redneck # children. We have to setup parentage and then check on whether or # not this file has any redneck children and update their parentage # information. elif int(parentInfo["gpmerged"]) == 1: newParents.add(parentInfo["gplfn"]) # If that didn't work, we've reached the great-grandparents # And we have to work via recursion else: parentSet = self.findDBSParents(lfn = parentInfo['gplfn']) for parent in parentSet: newParents.add(parent) return newParents def addFileToWMBS(self, jobType, fwjrFile, jobMask, task, jobID = None): """ _addFileToWMBS_ Add a file that was produced in a job to WMBS. 
""" fwjrFile["first_event"] = jobMask["FirstEvent"] if fwjrFile["first_event"] == None: fwjrFile["first_event"] = 0 if jobType == "Merge" and fwjrFile["module_label"] != "logArchive": setattr(fwjrFile["fileRef"], 'merged', True) fwjrFile["merged"] = True wmbsFile = self.createFileFromDataStructsFile(file = fwjrFile, jobID = jobID) if jobType == "Merge": self.wmbsMergeFilesToBuild.append(wmbsFile) else: self.wmbsFilesToBuild.append(wmbsFile) if fwjrFile["merged"]: self.addFileToDBS(fwjrFile, task) return wmbsFile def _mapLocation(self, fwkJobReport): for file in fwkJobReport.getAllFileRefs(): if file and hasattr(file, 'location'): file.location = self.phedex.getBestNodeName(file.location, self.locLists) def handleJob(self, jobID, fwkJobReport): """ _handleJob_ Figure out if a job was successful or not, handle it appropriately (parse FWJR, update WMBS) and return the success status as a boolean """ jobSuccess = fwkJobReport.taskSuccessful() outputMap = self.getOutputMapAction.execute(jobID = jobID, conn = self.getDBConn(), transaction = self.existingTransaction()) jobType = self.getJobTypeAction.execute(jobID = jobID, conn = self.getDBConn(), transaction = self.existingTransaction()) if jobSuccess: fileList = fwkJobReport.getAllFiles() # consistency check comparing outputMap to fileList # they should match except for some limited special cases outputModules = set([]) for fwjrFile in fileList: outputModules.add(fwjrFile['outputModule']) if set(outputMap.keys()) == outputModules: pass elif jobType == "LogCollect" and len(outputMap.keys()) == 0 and outputModules == set(['LogCollect']): pass elif jobType == "Merge" and set(outputMap.keys()) == set(['Merged', 'MergedError', 'logArchive']) and outputModules == set(['Merged', 'logArchive']): pass elif jobType == "Merge" and set(outputMap.keys()) == set(['Merged', 'MergedError', 'logArchive']) and outputModules == set(['MergedError', 'logArchive']): pass elif jobType == "Express" and 
set(outputMap.keys()).difference(outputModules) == set(['write_RAW']): pass else: failJob = True if jobType in [ "Processing", "Production" ]: cmsRunSteps = 0 for step in fwkJobReport.listSteps(): if step.startswith("cmsRun"): cmsRunSteps += 1 if cmsRunSteps > 1: failJob = False if failJob: jobSuccess = False logging.error("Job %d , list of expected outputModules does not match job report, failing job", jobID) logging.debug("Job %d , expected outputModules %s", jobID, sorted(outputMap.keys())) logging.debug("Job %d , fwjr outputModules %s", jobID, sorted(outputModules)) fileList = fwkJobReport.getAllFilesFromStep(step = 'logArch1') else: logging.debug("Job %d , list of expected outputModules does not match job report, accepted for multi-step CMSSW job", jobID) else: fileList = fwkJobReport.getAllFilesFromStep(step = 'logArch1') if jobSuccess: logging.info("Job %d , handle successful job", jobID) else: logging.error("Job %d , bad jobReport, failing job", jobID) # make sure the task name is present in FWJR (recover from WMBS if needed) if len(fileList) > 0: if jobSuccess: self.isTaskExistInFWJR(fwkJobReport, "success") else: self.isTaskExistInFWJR(fwkJobReport, "failed") # special check for LogCollect jobs skipLogCollect = False if jobSuccess and jobType == "LogCollect": for fwjrFile in fileList: try: # this assumes there is only one file for LogCollect jobs, not sure what happend if that changes self.associateLogCollectToParentJobsInWMStats(fwkJobReport, fwjrFile["lfn"], fwkJobReport.getTaskName()) except Exception as ex: skipLogCollect = True logging.error("Error occurred: associating log collect location, will try again\n %s" % str(ex)) break # now handle the job (unless the special LogCollect check failed) if not skipLogCollect: wmbsJob = Job(id = jobID) wmbsJob.load() outputID = wmbsJob.loadOutputID() wmbsJob.getMask() wmbsJob["fwjr"] = fwkJobReport if jobSuccess: wmbsJob["outcome"] = "success" else: wmbsJob["outcome"] = "failure" for fwjrFile in fileList: 
logging.debug("Job %d , register output %s", jobID, fwjrFile["lfn"]) wmbsFile = self.addFileToWMBS(jobType, fwjrFile, wmbsJob["mask"], jobID = jobID, task = fwkJobReport.getTaskName()) merged = fwjrFile['merged'] moduleLabel = fwjrFile["module_label"] if merged: self.mergedOutputFiles.append(wmbsFile) self.filesetAssoc.append({"lfn": wmbsFile["lfn"], "fileset": outputID}) # LogCollect jobs have no output fileset if jobType != "LogCollect": outputFilesets = self.outputFilesetsForJob(outputMap, merged, moduleLabel) for outputFileset in outputFilesets: self.filesetAssoc.append({"lfn": wmbsFile["lfn"], "fileset": outputFileset}) # Check if the job had any skipped files, put them in ACDC containers # We assume full file processing (no job masks) if jobSuccess: skippedFiles = fwkJobReport.getAllSkippedFiles() if skippedFiles: self.jobsWithSkippedFiles[jobID] = skippedFiles # Only save once job is done, and we're sure we made it through okay self._mapLocation(wmbsJob['fwjr']) if jobSuccess: self.listOfJobsToSave.append(wmbsJob) else: self.listOfJobsToFail.append(wmbsJob) return jobSuccess def associateLogCollectToParentJobsInWMStats(self, fwkJobReport, logAchiveLFN, task): """ _associateLogCollectToParentJobsInWMStats_ Associate a logArchive output to its parent job """ inputFileList = fwkJobReport.getAllInputFiles() requestName = task.split('/')[1] keys = [] for inputFile in inputFileList: keys.append([requestName, inputFile["lfn"]]) resultRows = self.fwjrCouchDB.loadView("FWJRDump", 'jobsByOutputLFN', options = {"stale": "update_after"}, keys = keys)['rows'] if len(resultRows) > 0: #get data from wmbs parentWMBSJobIDs = [] for row in resultRows: parentWMBSJobIDs.append({"jobid": row["value"]}) #update Job doc in wmstats results = self.getJobInfoByID.execute(parentWMBSJobIDs) parentJobNames = [] if isinstance(results, list): for jobInfo in results: parentJobNames.append(jobInfo['name']) else: parentJobNames.append(results['name']) 
self.localWMStats.updateLogArchiveLFN(parentJobNames, logAchiveLFN) else: #TODO: if the couch db is consistent with DB this should be removed (checking resultRow > 0) #It need to be failed and retried. logging.error("job report is missing for updating log archive mapping\n Input file list\n %s" % inputFileList) return def createMissingFWKJR(self, parameters, errorCode = 999, errorDescription = 'Failure of unknown type'): """ _createMissingFWJR_ Create a missing FWJR if the report can't be found by the code in the path location. """ report = Report() report.addError("cmsRun1", 84, errorCode, errorDescription) report.data.cmsRun1.status = "Failed" return report def createFilesInDBSBuffer(self): """ _createFilesInDBSBuffer_ It does the actual job of creating things in DBSBuffer WARNING: This assumes all files in a job have the same final location """ if len(self.dbsFilesToCreate) == 0: # Whoops, nothing to do! return dbsFileTuples = [] dbsFileLoc = [] dbsCksumBinds = [] runLumiBinds = [] selfChecksums = None jobLocations = set() for dbsFile in self.dbsFilesToCreate: # Append a tuple in the format specified by DBSBufferFiles.Add # Also run insertDatasetAlgo assocID = None datasetAlgoPath = '%s:%s:%s:%s:%s:%s:%s:%s' % (dbsFile['datasetPath'], dbsFile["appName"], dbsFile["appVer"], dbsFile["appFam"], dbsFile["psetHash"], dbsFile['processingVer'], dbsFile['acquisitionEra'], dbsFile['globalTag']) # First, check if this is in the cache if datasetAlgoPath in self.datasetAlgoPaths: for da in self.datasetAlgoID: if da['datasetAlgoPath'] == datasetAlgoPath: assocID = da['assocID'] break if not assocID: # Then we have to get it ourselves try: assocID = dbsFile.insertDatasetAlgo() self.datasetAlgoPaths.append(datasetAlgoPath) self.datasetAlgoID.append({'datasetAlgoPath': datasetAlgoPath, 'assocID': assocID}) except WMException: raise except Exception as ex: msg = "Unhandled exception while inserting datasetAlgo: %s\n" % datasetAlgoPath msg += str(ex) logging.error(msg) raise 
AccountantWorkerException(msg) # Associate the workflow to the file using the taskPath and the requestName # TODO: debug why it happens and then drop/recover these cases automatically taskPath = dbsFile.get('task') if not taskPath: msg = "Can't do workflow association, report this error to a developer.\n" msg += "DbsFile : %s" % str(dbsFile) raise AccountantWorkerException(msg) workflowName = taskPath.split('/')[1] workflowPath = '%s:%s' % (workflowName, taskPath) if workflowPath in self.workflowPaths: for wf in self.workflowIDs: if wf['workflowPath'] == workflowPath: workflowID = wf['workflowID'] break else: result = self.dbsGetWorkflow.execute(workflowName, taskPath, conn = self.getDBConn(), transaction = self.existingTransaction()) workflowID = result['id'] self.workflowPaths.append(workflowPath) self.workflowIDs.append({'workflowPath': workflowPath, 'workflowID': workflowID}) lfn = dbsFile['lfn'] selfChecksums = dbsFile['checksums'] jobLocation = dbsFile.getLocations()[0] jobLocations.add(jobLocation) dbsFileTuples.append((lfn, dbsFile['size'], dbsFile['events'], assocID, dbsFile['status'], workflowID)) dbsFileLoc.append({'lfn': lfn, 'sename' : jobLocation}) if dbsFile['runs']: runLumiBinds.append({'lfn': lfn, 'runs': dbsFile['runs']}) if selfChecksums: # If we have checksums we have to create a bind # For each different checksum for entry in selfChecksums.keys(): dbsCksumBinds.append({'lfn': lfn, 'cksum' : selfChecksums[entry], 'cktype' : entry}) try: diffLocation = jobLocations.difference(self.dbsLocations) for jobLocation in diffLocation: self.dbsInsertLocation.execute(siteName = jobLocation, conn = self.getDBConn(), transaction = self.existingTransaction()) self.dbsLocations.add(jobLocation) self.dbsCreateFiles.execute(files = dbsFileTuples, conn = self.getDBConn(), transaction = self.existingTransaction()) self.dbsSetLocation.execute(binds = dbsFileLoc, conn = self.getDBConn(), transaction = self.existingTransaction()) self.dbsSetChecksum.execute(bulkList 
= dbsCksumBinds, conn = self.getDBConn(), transaction = self.existingTransaction()) if len(runLumiBinds) > 0: self.dbsSetRunLumi.execute(file = runLumiBinds, conn = self.getDBConn(), transaction = self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Got exception while inserting files into DBSBuffer!\n" msg += str(ex) logging.error(msg) logging.debug("Listing binds:") logging.debug("jobLocation: %s\n" % jobLocation) logging.debug("dbsFiles: %s\n" % dbsFileTuples) logging.debug("dbsFileLoc: %s\n" %dbsFileLoc) logging.debug("Checksum binds: %s\n" % dbsCksumBinds) logging.debug("RunLumi binds: %s\n" % runLumiBinds) raise AccountantWorkerException(msg) # Now that we've created those files, clear the list self.dbsFilesToCreate = [] return def handleWMBSFiles(self, wmbsFilesToBuild, parentageBinds): """ _handleWMBSFiles_ Do what can be done in bulk in bulk """ if len(wmbsFilesToBuild) == 0: # Nothing to do return runLumiBinds = [] fileCksumBinds = [] fileLocations = [] fileCreate = [] for wmbsFile in wmbsFilesToBuild: lfn = wmbsFile['lfn'] if lfn == None: continue selfChecksums = wmbsFile['checksums'] # by jobType add to different parentage relation # if it is the merge job, don't include the parentage on failed input files. # otherwise parentage is set for all input files. 
parentageBinds.append({'child': lfn, 'jobid': wmbsFile['jid']}) if wmbsFile['runs']: runLumiBinds.append({'lfn': lfn, 'runs': wmbsFile['runs']}) if len(wmbsFile.getLocations()) > 0: fileLocations.append({'lfn': lfn, 'location': wmbsFile.getLocations()[0]}) if selfChecksums: # If we have checksums we have to create a bind # For each different checksum for entry in selfChecksums.keys(): fileCksumBinds.append({'lfn': lfn, 'cksum' : selfChecksums[entry], 'cktype' : entry}) fileCreate.append([lfn, wmbsFile['size'], wmbsFile['events'], None, wmbsFile["first_event"], wmbsFile['merged']]) if len(fileCreate) == 0: return try: self.addFileAction.execute(files = fileCreate, conn = self.getDBConn(), transaction = self.existingTransaction()) if runLumiBinds: self.setFileRunLumi.execute(file = runLumiBinds, conn = self.getDBConn(), transaction = self.existingTransaction()) self.setFileAddChecksum.execute(bulkList = fileCksumBinds, conn = self.getDBConn(), transaction = self.existingTransaction()) self.setFileLocation.execute(lfn = fileLocations, location = self.fileLocation, conn = self.getDBConn(), transaction = self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Error while adding files to WMBS!\n" msg += str(ex) logging.error(msg) logging.debug("Printing binds: \n") logging.debug("FileCreate binds: %s\n" % fileCreate) logging.debug("Runlumi binds: %s\n" % runLumiBinds) logging.debug("Checksum binds: %s\n" % fileCksumBinds) logging.debug("FileLocation binds: %s\n" % fileLocations) raise AccountantWorkerException(msg) # Clear out finished files wmbsFilesToBuild = [] return def createFileFromDataStructsFile(self, file, jobID): """ _createFileFromDataStructsFile_ This function will create a WMBS File given a DataStructs file """ wmbsFile = File() wmbsFile.update(file) if isinstance(file["locations"], set): pnn = list(file["locations"])[0] elif isinstance(file["locations"], list): if len(file['locations']) > 1: logging.error("Have more then one 
location for a file in job %i" % (jobID)) logging.error("Choosing location %s" % (file['locations'][0])) pnn = file["locations"][0] else: pnn = file["locations"] wmbsFile["locations"] = set() if pnn != None: wmbsFile.setLocation(pnn = pnn, immediateSave = False) wmbsFile['jid'] = jobID return wmbsFile def handleDBSBufferParentage(self): """ _handleDBSBufferParentage_ Handle all the DBSBuffer Parentage in bulk if you can """ outputLFNs = [f['lfn'] for f in self.mergedOutputFiles] bindList = [] for lfn in outputLFNs: newParents = self.findDBSParents(lfn = lfn) for parentLFN in newParents: bindList.append({'child': lfn, 'parent': parentLFN}) # Now all the parents should exist # Commit them to DBSBuffer logging.info("About to commit all DBSBuffer Heritage information") logging.info(len(bindList)) if len(bindList) > 0: try: self.dbsLFNHeritage.execute(binds = bindList, conn = self.getDBConn(), transaction = self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Error while trying to handle the DBS LFN heritage\n" msg += str(ex) msg += "BindList: %s" % bindList logging.error(msg) raise AccountantWorkerException(msg) return def handleSkippedFiles(self): """ _handleSkippedFiles_ Handle all the skipped files in bulk, the way it handles the skipped files imposes an important restriction: Skipped files should have been processed by a single job in the task and no job mask exists in it. This is suitable for jobs using ParentlessMergeBySize/FileBased/MinFileBased splitting algorithms. Here ACDC records and created and the file are moved to wmbs_sub_files_failed from completed. """ jobList = self.getFullJobInfo.execute([{'jobid' : x} for x in self.jobsWithSkippedFiles.keys()], fileSelection = self.jobsWithSkippedFiles, conn = self.getDBConn(), transaction = self.existingTransaction()) self.dataCollection.failedJobs(jobList, useMask = False) return