def validateCommon(self):
    """Common validation stuff"""
    try:
        Lexicon.requestName(self.wmspec.name())
    except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
        error = WorkQueueWMSpecError(self.wmspec, "Workflow name validation error: %s" % str(ex))
        raise error
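# A minimal, hypothetical sketch of calling the same validator directly,
# assuming WMCore is importable; the sample names below are made up. Lexicon
# checks typically raise AssertionError (among others) on bad input.
from WMCore import Lexicon

for name in ("pdmvserv_task_EXO-RunIISummer15-00123", "invalid name!"):
    try:
        Lexicon.requestName(name)
        print("%s: ok" % name)
    except Exception as ex:
        print("%s: rejected (%s)" % (name, type(ex).__name__))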
def queueWork(self, wmspecUrl, request=None, team=None):
    """
    Take and queue work from a WMSpec.

    If a request name is provided but doesn't match the WMSpec name,
    an error is raised.

    If a team is provided, work will only be available to queues
    belonging to that team.

    Duplicate specs will be ignored.
    """
    self.logger.info('queueWork() begin queueing "%s"' % wmspecUrl)
    wmspec = WMWorkloadHelper()
    wmspec.load(wmspecUrl)

    # check we haven't already got this work
    try:
        self.backend.getInboxElements(elementIDs=[wmspec.name()])
    except CouchNotFoundError:
        pass
    else:
        self.logger.warning('queueWork(): Ignoring duplicate spec "%s"' % wmspec.name())
        return 1

    if request:
        try:
            Lexicon.requestName(request)
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(wmspec, "Request name validation error: %s" % str(ex))
            raise error
        if request != wmspec.name():
            raise WorkQueueWMSpecError(wmspec, 'Request & workflow name mismatch %s vs %s' % (request, wmspec.name()))
def queueWork(self, wmspecUrl, request=None, team=None):
    """
    Take and queue work from a WMSpec.

    If a request name is provided but doesn't match the WMSpec name,
    an error is raised.

    If a team is provided, work will only be available to queues
    belonging to that team.

    Duplicate specs will be ignored.
    """
    self.logger.info('queueWork() begin queueing "%s"' % wmspecUrl)
    wmspec = WMWorkloadHelper()
    wmspec.load(wmspecUrl)

    if request:  # validate request name
        try:
            Lexicon.requestName(request)
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(wmspec, "Request name validation error: %s" % str(ex))
            raise error
        if request != wmspec.name():
            raise WorkQueueWMSpecError(wmspec, 'Request & workflow name mismatch %s vs %s' % (request, wmspec.name()))
def setDataset(self, datasetName, primaryType, datasetType,
               physicsGroup=None, overwrite=False, valid=1):
    """
    _setDataset_

    Set all the information concerning a single dataset, including
    the primary, processed and tier info
    """
    if self.hasDataset() and not overwrite:
        # Do nothing, we already have a dataset
        return

    Lexicon.primaryDatasetType(primaryType)

    if datasetType not in ['VALID', 'PRODUCTION', 'INVALID', 'DEPRECATED', 'DELETED']:
        msg = "Invalid processedDatasetType %s\n" % datasetType
        logging.error(msg)
        raise DBSBlockException(msg)

    try:
        if datasetName[0] == '/':
            junk, primary, processed, tier = datasetName.split('/')
        else:
            primary, processed, tier = datasetName.split('/')
    except Exception:
        msg = "Invalid dataset name %s" % datasetName
        logging.error(msg)
        raise DBSBlockException(msg)
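# Standalone illustration of the dataset-name parsing above: a full dataset
# path has the shape /Primary/Processed/Tier, so splitting on "/" yields an
# empty leading field that the code discards. The name below is made up.
datasetName = "/Cosmics/Run2023A-PromptReco-v1/RECO"
if datasetName[0] == '/':
    _, primary, processed, tier = datasetName.split('/')
else:
    primary, processed, tier = datasetName.split('/')
print(primary, processed, tier)  # Cosmics Run2023A-PromptReco-v1 RECO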
def queueNewRequests(self, queue):
    """Get requests from ReqMgr and queue to workqueue"""
    self.logger.info("Contacting Request manager for more work")
    work = 0
    workLoads = []

    if queue.params['DrainMode']:
        self.logger.info('Draining queue: Skip requesting work from ReqMgr')
        return 0

    try:
        workLoads = self.getAvailableRequests()
    except Exception as ex:
        traceMsg = traceback.format_exc()
        msg = "Error contacting RequestManager: %s" % traceMsg
        self.logger.warning(msg)
        return 0

    for team, reqName, workLoadUrl in workLoads:
        try:
            try:
                Lexicon.couchurl(workLoadUrl)
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                # check it's not a local file
                if not os.path.exists(workLoadUrl):
                    error = WorkQueueWMSpecError(None, "Workflow url validation error: %s" % str(ex))
                    raise error

            self.logger.info("Processing request %s at %s" % (reqName, workLoadUrl))
            units = queue.queueWork(workLoadUrl, request=reqName, team=team)
            self.logdb.delete(reqName, "error", this_thread=True)
        except TERMINAL_EXCEPTIONS as ex:
            # fatal error - report back to ReqMgr
            self.logger.error('Permanent failure processing request "%s": %s' % (reqName, str(ex)))
            self.logger.info("Marking request %s as failed in ReqMgr" % reqName)
            self.reportRequestStatus(reqName, 'Failed', message=str(ex))
            continue
        except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
            # temporary problem - try again later
            msg = 'Error processing request "%s": will try again later.' % reqName
            msg += '\nError: "%s"' % str(ex)
            self.logger.info(msg)
            self.logdb.post(reqName, msg, 'error')
            continue
        except Exception as ex:
            # Log exception as it isn't a communication problem
            msg = 'Error processing request "%s": will try again later.' % reqName
            msg += '\nSee log for details.\nError: "%s"' % str(ex)
            self.logger.exception('Unknown error processing %s' % reqName)
            self.logdb.post(reqName, msg, 'error')
            continue

        self.logger.info('%s unit(s) queued for "%s"' % (units, reqName))
        work += units

    self.logger.info("%s element(s) obtained from RequestManager" % work)
    return work
def validBlocks(self, task, dbs):
    """Return blocks that pass the input data restriction"""
    datasetPath = task.getInputDatasetPath()
    Lexicon.dataset(datasetPath)  # check dataset name
    validBlocks = []
    locations = None

    blockWhiteList = task.inputBlockWhitelist()
    blockBlackList = task.inputBlockBlacklist()
    runWhiteList = task.inputRunWhitelist()
    runBlackList = task.inputRunBlacklist()

    for blockName in dbs.listFileBlocks(datasetPath):
        block = dbs.getDBSSummaryInfo(datasetPath, block=blockName)

        # check block restrictions
        if blockWhiteList and block['block'] not in blockWhiteList:
            continue
        if block['block'] in blockBlackList:
            continue

        # check run restrictions
        if runWhiteList or runBlackList:
            # listRuns returns a run number per lumi section
            full_lumi_list = dbs.listRuns(block=block['block'])
            runs = set(full_lumi_list)
            # apply blacklist
            runs = runs.difference(runBlackList)
            # if whitelist only accept listed runs
            if runWhiteList:
                runs = runs.intersection(runWhiteList)
            # any runs left are ones we will run on, if none ignore block
            if not runs:
                continue
            # recalculate effective size of block
            # make a guess for new event/file numbers from ratio
            # of accepted lumi sections (otherwise have to pull file info)
            accepted_lumis = [x for x in full_lumi_list if x in runs]
            ratio_accepted = 1. * len(accepted_lumis) / len(full_lumi_list)
            block[self.lumiType] = len(accepted_lumis)
            block['NumberOfFiles'] = float(block['NumberOfFiles']) * ratio_accepted
            block['NumberOfEvents'] = float(block['NumberOfEvents']) * ratio_accepted

        validBlocks.append(block)
        if locations is None:
            locations = set(sitesFromStorageEelements(dbs.listFileBlockLocation(block['block'])))
        else:
            locations = locations.intersection(set(sitesFromStorageEelements(dbs.listFileBlockLocation(block['block']))))

    # all needed blocks present at these sites
    if locations:
        self.data[datasetPath] = list(locations)

    return validBlocks
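# A standalone walk-through of the run-filtering arithmetic above, with
# made-up numbers: listRuns gives one run number per lumi section, so the
# fraction of entries whose run survives the white/blacklists is used to
# scale the block's file and event counts.
full_lumi_list = [173241, 173241, 173243, 173244]  # run number of each lumi
runs = {173241, 173244}                            # runs left after filtering
accepted_lumis = [x for x in full_lumi_list if x in runs]
ratio_accepted = 1. * len(accepted_lumis) / len(full_lumi_list)  # 3/4 = 0.75
print(len(accepted_lumis), 1000 * ratio_accepted)  # 3 lumis, ~750 of 1000 events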
def setDataset(self, datasetName, primaryType, datasetType,
               physicsGroup=None, prep_id=None, overwrite=False):
    """
    _setDataset_

    Set all the information concerning a single dataset, including
    the primary, processed and tier info
    """
    if self.getDataset() is not None and not overwrite:
        # Do nothing, we already have a dataset
        return

    Lexicon.primaryDatasetType(primaryType)

    if datasetType not in ['VALID', 'PRODUCTION', 'INVALID', 'DEPRECATED', 'DELETED']:
        msg = "Invalid processedDatasetType %s\n" % datasetType
        logging.error(msg)
        raise DBSBufferBlockException(msg)

    try:
        if datasetName[0] == '/':
            _, primary, processed, tier = datasetName.split('/')
        else:
            primary, processed, tier = datasetName.split('/')
    except Exception:
        msg = "Invalid dataset name %s" % datasetName
        logging.error(msg)
        raise DBSBufferBlockException(msg)

    # Do the primary dataset
    self.data['primds']['primary_ds_name'] = primary
    self.data['primds']['primary_ds_type'] = primaryType
    self.data['primds']['create_by'] = "WMAgent"
    self.data['primds']['creation_date'] = int(time.time())

    # Do the processed
    self.data['dataset']['physics_group_name'] = physicsGroup
    self.data['dataset']['processed_ds_name'] = processed
    self.data['dataset']['data_tier_name'] = tier
    self.data['dataset']['dataset_access_type'] = datasetType
    self.data['dataset']['dataset'] = datasetName
    self.data['dataset']['prep_id'] = prep_id

    # Add misc meta data.
    self.data['dataset']['create_by'] = "WMAgent"
    self.data['dataset']['last_modified_by'] = "WMAgent"
    self.data['dataset']['creation_date'] = int(time.time())
    self.data['dataset']['last_modification_date'] = int(time.time())
    return
def validateCommon(self):
    """Common validation stuff"""
    try:
        Lexicon.requestName(self.wmspec.name())
    except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
        error = WorkQueueWMSpecError(self.wmspec, "Workflow name validation error: %s" % str(ex))
        raise error

    if self.initialTask.siteWhitelist():
        if isinstance(self.initialTask.siteWhitelist(), basestring):
            error = WorkQueueWMSpecError(self.wmspec,
                                         'Invalid site whitelist: Must be tuple/list but is %s' % type(self.initialTask.siteWhitelist()))
            raise error
        try:
            [Lexicon.cmsname(site) for site in self.initialTask.siteWhitelist()]
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(self.wmspec, "Site whitelist validation error: %s" % str(ex))
            raise error
    else:
        error = WorkQueueWMSpecError(self.wmspec, "Site whitelist validation error: Empty site whitelist")
        raise error

    if self.initialTask.siteBlacklist():
        if isinstance(self.initialTask.siteBlacklist(), basestring):
            error = WorkQueueWMSpecError(self.wmspec,
                                         'Invalid site blacklist: Must be tuple/list but is %s' % type(self.initialTask.siteBlacklist()))
            raise error
        try:
            [Lexicon.cmsname(site) for site in self.initialTask.siteBlacklist()]
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(self.wmspec, "Site blacklist validation error: %s" % str(ex))
            raise error

    # splitter settings
    if self.args.get('SliceSize', 1) <= 0:
        error = WorkQueueWMSpecError(self.wmspec, 'Zero or negative SliceSize parameter')
        raise error
    if self.args.get('SubSliceSize', 1) <= 0:
        error = WorkQueueWMSpecError(self.wmspec, 'Zero or negative SubSliceSize parameter')
        raise error

    # check input dataset is valid
    try:
        if self.initialTask.getInputDatasetPath():
            Lexicon.dataset(self.initialTask.getInputDatasetPath())
    except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
        error = WorkQueueWMSpecError(self.wmspec, "Dataset validation error: %s" % str(ex))
        raise error

    # if pileup is found, check that they are valid datasets
    try:
        pileupDatasets = self.wmspec.listPileupDatasets()
        for dbsUrl in pileupDatasets:
            for dataset in pileupDatasets[dbsUrl]:
                Lexicon.dataset(dataset)
    except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
        error = WorkQueueWMSpecError(self.wmspec, "Pileup dataset validation error: %s" % str(ex))
        raise error
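# Hedged sketch of the individual Lexicon checks composed above, assuming
# WMCore is importable; the site and dataset strings are illustrative only.
from WMCore import Lexicon

Lexicon.cmsname('T2_CH_CERN')                      # passes: well-formed CMS site name
Lexicon.dataset('/MyPrimary/MyProcessed-v1/RECO')  # passes: /Primary/Processed/Tier shape
try:
    Lexicon.cmsname('NOT A SITE')
except Exception as ex:  # Lexicon checks typically raise AssertionError
    print("site rejected: %s" % type(ex).__name__)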
def createPrimaryDataset(primaryName, primaryDatasetType='mc', apiRef=None):
    """
    _createPrimaryDataset_
    """
    logging.debug("Inserting PrimaryDataset %s with Type %s"
                  % (primaryName, primaryDatasetType))
    Lexicon.primaryDatasetType(primaryDatasetType)

    primary = DbsPrimaryDataset(Name=primaryName, Type=primaryDatasetType)

    if apiRef:
        try:
            apiRef.insertPrimaryDataset(primary)
        except DbsException as ex:
            msg = "Error in DBSInterface.createPrimaryDataset(%s)\n" % primaryName
            msg += formatEx(ex)
            logging.error(msg)
            raise DBSInterfaceError(msg)
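# Hedged sketch: exercising the primary-dataset-type check on its own. 'mc'
# is the default used above; an arbitrary label should fail the check (the
# exact accepted set lives in WMCore's Lexicon module).
from WMCore import Lexicon

Lexicon.primaryDatasetType('mc')  # passes silently
try:
    Lexicon.primaryDatasetType('bogus-type')
except Exception as ex:
    print("rejected: %s" % type(ex).__name__)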
def getMaskedBlocks(self, task, dbs, datasetPath):
    """
    Get the blocks which pass the lumi mask restrictions. For each block
    return the list of lumis which were ok (given the lumi mask). The data
    structure returned is the following:

    {"block1": {"file1": LumiList(), "file5": LumiList(), ...},
     "block2": {"file2": LumiList(), "file7": LumiList(), ...}}
    """
    # Get the mask and convert it to a LumiList to make operations easier
    maskedBlocks = {}
    lumiMask = task.getLumiMask()
    taskMask = LumiList(compactList=lumiMask)

    # Find all the files that have runs and lumis we are interested in,
    # fill the block/lfn part of maskedBlocks
    for run, lumis in lumiMask.items():
        files = []
        for slumis in Lexicon.slicedIterator(lumis, 50):
            slicedFiles = dbs.dbs.listFileArray(dataset=datasetPath, run_num=run,
                                                lumi_list=slumis, detail=True)
            files.extend(slicedFiles)
        for file in files:
            blockName = file['block_name']
            fileName = file['logical_file_name']
            if blockName not in maskedBlocks:
                maskedBlocks[blockName] = {}
            if fileName not in maskedBlocks[blockName]:
                maskedBlocks[blockName][fileName] = LumiList()

    # Fill the maskedLumis part of maskedBlocks
    for block in maskedBlocks:
        fileLumis = dbs.dbs.listFileLumis(block_name=block, validFileOnly=1)
        for fileLumi in fileLumis:
            lfn = fileLumi['logical_file_name']
            # For each run : [lumis], mask by the needed lumis and append to maskedBlocks
            if maskedBlocks[block].get(lfn, None) is not None:
                lumiList = LumiList(runsAndLumis={fileLumi['run_num']: fileLumi['lumi_section_num']})
                maskedBlocks[block][lfn] += (lumiList & taskMask)

    return maskedBlocks
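# A small sketch of the LumiList arithmetic relied on above (assuming
# WMCore.DataStructs.LumiList): "&" keeps only run/lumi pairs present in
# both lists, and "+" merges lists, which is how the per-file masks are
# accumulated. The run/lumi numbers are made up.
from WMCore.DataStructs.LumiList import LumiList

taskMask = LumiList(compactList={'1': [[1, 10]]})           # run 1, lumis 1-10
fileLumis = LumiList(runsAndLumis={1: [8, 9, 10, 11, 12]})  # lumis seen in one file
kept = fileLumis & taskMask
print(kept.getLumis())  # [(1, 8), (1, 9), (1, 10)]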
class StartPolicyInterface(PolicyInterface):
    """Interface for start policies"""

    def __init__(self, **args):
        PolicyInterface.__init__(self, **args)
        self.workQueueElements = []
        self.wmspec = None
        self.team = None
        self.initialTask = None
        self.splitParams = None
        self.dbs_pool = {}
        self.data = {}
        self.lumi = None
        self.couchdb = None
        self.rejectedWork = []  # List of inputs that were rejected
        self.pileupData = {}

    def split(self):
        """Apply policy to spec"""
        raise NotImplementedError

    def validate(self):
        """Check params and spec are appropriate for the policy"""
        raise NotImplementedError

    def validateCommon(self):
        """Common validation stuff"""
        try:
            Lexicon.requestName(self.wmspec.name())
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(self.wmspec, "Workflow name validation error: %s" % str(ex))
            raise error

        if self.initialTask.siteWhitelist():
            if type(self.initialTask.siteWhitelist()) in types.StringTypes:
                error = WorkQueueWMSpecError(self.wmspec,
                                             'Invalid site whitelist: Must be tuple/list but is %s' % type(self.initialTask.siteWhitelist()))
                raise error
            try:
                [Lexicon.cmsname(site) for site in self.initialTask.siteWhitelist()]
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(self.wmspec, "Site whitelist validation error: %s" % str(ex))
                raise error

        if self.initialTask.siteBlacklist():
            if type(self.initialTask.siteBlacklist()) in types.StringTypes:
                error = WorkQueueWMSpecError(self.wmspec,
                                             'Invalid site blacklist: Must be tuple/list but is %s' % type(self.initialTask.siteBlacklist()))
                raise error
            try:
                [Lexicon.cmsname(site) for site in self.initialTask.siteBlacklist()]
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(self.wmspec, "Site blacklist validation error: %s" % str(ex))
                raise error

        # splitter settings
        if self.args.get('SliceSize', 1) <= 0:
            error = WorkQueueWMSpecError(self.wmspec, 'Zero or negative SliceSize parameter')
            raise error
        if self.args.get('SubSliceSize', 1) <= 0:
            error = WorkQueueWMSpecError(self.wmspec, 'Zero or negative SubSliceSize parameter')
            raise error
def setDataset(self, datasetName, primaryType, datasetType,
               physicsGroup=None, prep_id=None, overwrite=False, valid=1):
    """
    _setDataset_

    Set all the information concerning a single dataset, including
    the primary, processed and tier info
    """
    if self.hasDataset() and not overwrite:
        # Do nothing, we already have a dataset
        return

    Lexicon.primaryDatasetType(primaryType)

    if datasetType not in ['VALID', 'PRODUCTION', 'INVALID', 'DEPRECATED', 'DELETED']:
        msg = "Invalid processedDatasetType %s\n" % datasetType
        logging.error(msg)
        raise DBSBlockException(msg)

    try:
        if datasetName[0] == '/':
            junk, primary, processed, tier = datasetName.split('/')
        else:
            primary, processed, tier = datasetName.split('/')
    except Exception:
        msg = "Invalid dataset name %s" % datasetName
        logging.error(msg)
        raise DBSBlockException(msg)
def test(num_lumis):
    from dbs.apis.dbsClient import DbsApi
    dbsApi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader/')
    # dbsApi = DbsApi(url='https://cmsweb-testbed.cern.ch/dbs/int/global/DBSReader/')
    datasetPath = "/SMS-T5qqqqVV_mGluino-1200To1275_mLSP-1to1150_TuneCUETP8M1_13TeV-madgraphMLM-pythia8/RunIIWinter15pLHE-MCRUN2_71_V1-v1/LHE"
    # datasetPath = "/QDTojWinc_NC_M-1200_TuneZ2star_8TeV-madgraph/Summer12pLHE-DMWM_Validation_DONOTDELETE_Alan_TEST-v1/GEN"
    run = 1
    lumis = range(1, num_lumis + 1)

    files = []
    print("Starting queries to listFileArray (in dataset mode) at %s" % datetime.utcnow())
    for slumis in Lexicon.slicedIterator(lumis, 10):
        start = datetime.utcnow()
        print(slumis)
        slicedFiles = dbsApi.listFileArray(dataset=datasetPath, run_num=run,
                                           lumi_list=slumis, detail=True)
        files.extend(slicedFiles)
        end = datetime.utcnow()
        print(" slice completed in %s" % (end - start))
    # pprint(files)

    maskedBlocks = {}
    for lfn in files:
        blockName = lfn['block_name']
        fileName = lfn['logical_file_name']
        if blockName not in maskedBlocks:
            maskedBlocks[blockName] = {}
        if fileName not in maskedBlocks[blockName]:
            maskedBlocks[blockName][fileName] = LumiList()
    # pprint(maskedBlocks)

    print("\nStarting queries to listFileLumis at %s" % datetime.utcnow())
    for block in maskedBlocks:
        print(block)
        start = datetime.utcnow()
        fileLumis = dbsApi.listFileLumis(block_name=block, validFileOnly=1)
        end = datetime.utcnow()
        print(" query completed in %s" % (end - start))
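# Lexicon.slicedIterator is used above to keep each DBS query argument list
# small; a minimal pure-Python equivalent of that chunking behaviour (an
# illustration, not the actual WMCore implementation) might look like:
def slicedIterator(items, sliceSize):
    """Yield successive sliceSize-long slices of items."""
    items = list(items)
    for start in range(0, len(items), sliceSize):
        yield items[start:start + sliceSize]

print(list(slicedIterator(range(1, 8), 3)))  # [[1, 2, 3], [4, 5, 6], [7]]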
def validBlocks(self, task, dbs):
    """Return blocks that pass the input data restriction"""
    datasetPath = task.getInputDatasetPath()
    Lexicon.dataset(datasetPath)  # check dataset name
    validBlocks = []
    locations = None

    blockWhiteList = task.inputBlockWhitelist()
    blockBlackList = task.inputBlockBlacklist()
    runWhiteList = task.inputRunWhitelist()
    runBlackList = task.inputRunBlacklist()
    lumiMask = task.getLumiMask()
    if lumiMask:
        maskedBlocks = self.getMaskedBlocks(task, dbs, datasetPath)

    for blockName in dbs.listFileBlocks(datasetPath):
        # check block restrictions
        if blockWhiteList and blockName not in blockWhiteList:
            continue
        if blockName in blockBlackList:
            continue

        blockSummary = dbs.getDBSSummaryInfo(block=blockName)
        if int(blockSummary.get('NumberOfFiles', 0)) == 0:
            logging.warning("Block %s being rejected for lack of valid files to process", blockName)
            self.badWork.append(blockName)
            continue

        if self.args['SliceType'] == 'NumberOfRuns':
            blockSummary['NumberOfRuns'] = dbs.listRuns(block=blockName)

        # check lumi restrictions
        if lumiMask:
            if blockName not in maskedBlocks:
                logging.warning("Block %s doesn't pass the lumi mask constraints", blockName)
                self.rejectedWork.append(blockName)
                continue

            acceptedLumiCount = sum([len(maskedBlocks[blockName][lfn].getLumis()) for lfn in maskedBlocks[blockName]])
            ratioAccepted = 1. * acceptedLumiCount / float(blockSummary['NumberOfLumis'])
            maskedRuns = [maskedBlocks[blockName][lfn].getRuns() for lfn in maskedBlocks[blockName]]
            acceptedRuns = set(lumiMask.getRuns()).intersection(set().union(*maskedRuns))

            blockSummary['NumberOfFiles'] = len(maskedBlocks[blockName])
            blockSummary['NumberOfEvents'] = float(blockSummary['NumberOfEvents']) * ratioAccepted
            blockSummary[self.lumiType] = acceptedLumiCount
            blockSummary['NumberOfRuns'] = acceptedRuns
        # check run restrictions
        elif runWhiteList or runBlackList:
            runs = set(dbs.listRuns(block=blockName))
            # multi-run blocks need special accounting, which requires more DBS calls
            recalculateLumiCounts = True if len(runs) > 1 else False

            # apply blacklist and whitelist
            runs = runs.difference(runBlackList)
            if runWhiteList:
                runs = runs.intersection(runWhiteList)
            # any runs left are ones we will run on, if none ignore block
            if not runs:
                logging.warning("Block %s doesn't pass the runs constraints", blockName)
                self.rejectedWork.append(blockName)
                continue

            if recalculateLumiCounts:
                # Recalculate the number of files, lumis and ~events accepted
                acceptedLumiCount = 0
                acceptedEventCount = 0
                acceptedFileCount = 0
                fileInfo = dbs.listFilesInBlock(fileBlockName=blockName)
                for fileEntry in fileInfo:
                    acceptedFile = False
                    for lumiInfo in fileEntry['LumiList']:
                        if lumiInfo['RunNumber'] in runs:
                            acceptedFile = True
                            acceptedLumiCount += len(lumiInfo['LumiSectionNumber'])
                    if acceptedFile:
                        acceptedFileCount += 1
                        acceptedEventCount += fileEntry['NumberOfEvents']
            else:
                acceptedLumiCount = blockSummary["NumberOfLumis"]
                acceptedFileCount = blockSummary['NumberOfFiles']
                acceptedEventCount = blockSummary['NumberOfEvents']

            blockSummary[self.lumiType] = acceptedLumiCount
            blockSummary['NumberOfFiles'] = acceptedFileCount
            blockSummary['NumberOfEvents'] = acceptedEventCount
            blockSummary['NumberOfRuns'] = runs

        validBlocks.append(blockSummary)

        if locations is None:
            locations = set(dbs.listFileBlockLocation(blockName))
        else:
            locations = locations.intersection(dbs.listFileBlockLocation(blockName))

    # all needed blocks present at these sites
    if task.getTrustSitelists().get('trustlists'):
        siteWhitelist = task.siteWhitelist()
        siteBlacklist = task.siteBlacklist()
        self.sites = makeLocationsList(siteWhitelist, siteBlacklist)
        self.data[datasetPath] = self.sites
    elif locations:
        self.data[datasetPath] = list(set(self.cric.PNNstoPSNs(locations)))

    return validBlocks
def validBlocks(self, task, dbs):
    """Return blocks that pass the input data restriction"""
    datasetPath = task.getInputDatasetPath()
    validBlocks = []

    blockWhiteList = task.inputBlockWhitelist()
    blockBlackList = task.inputBlockBlacklist()
    runWhiteList = task.inputRunWhitelist()
    runBlackList = task.inputRunBlacklist()

    if task.getLumiMask():  # if we have a lumi mask get only the relevant blocks
        maskedBlocks = self.getMaskedBlocks(task, dbs, datasetPath)
    if task.getTrustSitelists().get('trustlists'):
        siteWhitelist = task.siteWhitelist()
        siteBlacklist = task.siteBlacklist()
        self.sites = makeLocationsList(siteWhitelist, siteBlacklist)

    blocks = []
    # Take data inputs or from spec
    if not self.data:
        if blockWhiteList:
            self.data = dict((block, []) for block in blockWhiteList)
        else:
            self.data = {datasetPath: []}  # same structure as in WorkQueueElement

    for data in self.data:
        if data.find('#') > -1:
            Lexicon.block(data)  # check block name
            datasetPath = str(data.split('#')[0])
            blocks.append(str(data))
        else:
            Lexicon.dataset(data)  # check dataset name
            for block in dbs.listFileBlocks(data, onlyClosedBlocks=True):
                blocks.append(str(block))

    for blockName in blocks:
        # check block restrictions
        if blockWhiteList and blockName not in blockWhiteList:
            continue
        if blockName in blockBlackList:
            continue
        if blockName in self.blockBlackListModifier:
            # Don't duplicate blocks rejected before or blocks that were
            # included and therefore are now in the blacklist
            continue
        if task.getLumiMask() and blockName not in maskedBlocks:
            self.rejectedWork.append(blockName)
            continue

        block = dbs.getDBSSummaryInfo(datasetPath, block=blockName)
        # blocks with 0 valid files should be ignored
        # - ideally they would be deleted but dbs can't delete blocks
        if not block['NumberOfFiles'] or block['NumberOfFiles'] == '0':
            self.rejectedWork.append(blockName)
            continue

        # check lumi restrictions
        if task.getLumiMask():
            accepted_lumis = sum([len(maskedBlocks[blockName][lfn].getLumis()) for lfn in maskedBlocks[blockName]])
            # use the information given from getMaskedBlocks to compute the size of the block
            block['NumberOfFiles'] = len(maskedBlocks[blockName])
            # ratio = lumis which are ok in the block / total num lumis
            # (force float division so the ratio isn't truncated)
            ratioAccepted = accepted_lumis / float(block['NumberOfLumis'])
            block['NumberOfEvents'] = block['NumberOfEvents'] * ratioAccepted
            block[self.lumiType] = accepted_lumis
        # check run restrictions
        elif runWhiteList or runBlackList:
            # listRunLumis returns a dictionary with the lumi sections per run
            runLumis = dbs.listRunLumis(block=block['block'])
            runs = set(runLumis.keys())
            recalculateLumiCounts = False
            if len(runs) > 1:
                # If more than one run in the block,
                # then we must calculate the lumi counts after filtering the run list.
                # This has to be done rarely and requires calling DBS file information
                recalculateLumiCounts = True

            # apply blacklist
            runs = runs.difference(runBlackList)
            # if whitelist only accept listed runs
            if runWhiteList:
                runs = runs.intersection(runWhiteList)
            # any runs left are ones we will run on, if none ignore block
            if not runs:
                self.rejectedWork.append(blockName)
                continue

            if len(runs) == len(runLumis):
                # If there is no change in the runs, then we can skip recalculating lumi counts
                recalculateLumiCounts = False

            if recalculateLumiCounts:
                # Recalculate effective size of block
                # We pull out file info, since we don't do this often
                acceptedLumiCount = 0
                acceptedEventCount = 0
                acceptedFileCount = 0
                fileInfo = dbs.listFilesInBlock(fileBlockName=block['block'])
                for fileEntry in fileInfo:
                    acceptedFile = False
                    acceptedFileLumiCount = 0
                    for lumiInfo in fileEntry['LumiList']:
                        runNumber = lumiInfo['RunNumber']
                        if runNumber in runs:
                            acceptedFile = True
                            acceptedFileLumiCount += 1
                            acceptedLumiCount += len(lumiInfo['LumiSectionNumber'])
                    if acceptedFile:
                        acceptedFileCount += 1
                        if len(fileEntry['LumiList']) != acceptedFileLumiCount:
                            acceptedEventCount += float(acceptedFileLumiCount) * fileEntry['NumberOfEvents'] / len(fileEntry['LumiList'])
                        else:
                            acceptedEventCount += fileEntry['NumberOfEvents']

                block[self.lumiType] = acceptedLumiCount
                block['NumberOfFiles'] = acceptedFileCount
                block['NumberOfEvents'] = acceptedEventCount

        # save locations
        if task.getTrustSitelists().get('trustlists'):
            self.data[block['block']] = self.sites
        else:
            self.data[block['block']] = self.siteDB.PNNstoPSNs(dbs.listFileBlockLocation(block['block']))

        # TODO: need to decide what to do when no location is found.
        # There could be a network problem (no connection to dbs, phedex)
        # or the DBS SE is not recorded (this will be retried anyway by the location mapper)
        if not self.data[block['block']]:
            self.data[block['block']] = ["NoInitialSite"]
        # # No sites for this block, move it to rejected
        # self.rejectedWork.append(blockName)
        # continue

        validBlocks.append(block)
    return validBlocks
def validBlocks(self, task, dbs):
    """Return blocks that pass the input data restriction"""
    datasetPath = task.getInputDatasetPath()
    validBlocks = []

    blockWhiteList = task.inputBlockWhitelist()
    blockBlackList = task.inputBlockBlacklist()
    runWhiteList = task.inputRunWhitelist()
    runBlackList = task.inputRunBlacklist()

    blocks = []
    # Take data inputs or from spec
    if not self.data:
        self.data = {datasetPath: []}  # same structure as in WorkQueueElement
        # blocks = dbs.getFileBlocksInfo(datasetPath, locations = False)
    # else:
    #     dataItems = self.data.keys()

    for data in self.data:
        if data.find('#') > -1:
            Lexicon.block(data)  # check block name
            datasetPath = str(data.split('#')[0])
            blocks.extend(dbs.getFileBlocksInfo(datasetPath, blockName=str(data), locations=True))
        else:
            Lexicon.dataset(data)  # check dataset name
            blocks.extend(dbs.getFileBlocksInfo(datasetPath, locations=True))

    for block in blocks:
        # blocks with 0 valid files should be ignored
        # - ideally they would be deleted but dbs can't delete blocks
        if not block['NumberOfFiles']:
            continue

        # check block restrictions
        if blockWhiteList and block['Name'] not in blockWhiteList:
            continue
        if block['Name'] in blockBlackList:
            continue

        # check run restrictions
        if runWhiteList or runBlackList:
            # listRuns returns a run number per lumi section
            full_lumi_list = dbs.listRuns(block=block['Name'])
            runs = set(full_lumi_list)
            # apply blacklist
            runs = runs.difference(runBlackList)
            # if whitelist only accept listed runs
            if runWhiteList:
                runs = runs.intersection(runWhiteList)
            # any runs left are ones we will run on, if none ignore block
            if not runs:
                continue
            # recalculate effective size of block
            # make a guess for new event/file numbers from ratio
            # of accepted lumi sections (otherwise have to pull file info)
            accepted_lumis = [x for x in full_lumi_list if x in runs]
            ratio_accepted = 1. * len(accepted_lumis) / len(full_lumi_list)
            block[self.lumiType] = len(accepted_lumis)
            block['NumberOfFiles'] *= ratio_accepted
            block['NumberOfEvents'] *= ratio_accepted

        # get lumi info if needed and not already available
        if self.args['SliceType'] == self.lumiType and not block.get(self.lumiType):
            blockSummary = dbs.getDBSSummaryInfo(block=block["Name"])
            block[self.lumiType] = blockSummary[self.lumiType]

        # save locations
        self.data[block['Name']] = sitesFromStorageEelements([x['Name'] for x in block['StorageElementList']])

        validBlocks.append(block)
    return validBlocks
def queueNewRequests(self, queue):
    """Get requests from ReqMgr and queue to workqueue"""
    self.logger.info("Contacting Request manager for more work")
    work = 0
    workLoads = []

    try:
        workLoads = self.getAvailableRequests()
    except Exception as ex:
        traceMsg = traceback.format_exc()
        msg = "Error contacting RequestManager: %s" % traceMsg
        self.logger.warning(msg)
        return 0

    for team, reqName, workLoadUrl in workLoads:
        try:
            try:
                Lexicon.couchurl(workLoadUrl)
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                # check it's not a local file
                if not os.path.exists(workLoadUrl):
                    error = WorkQueueWMSpecError(None, "Workflow url validation error: %s" % str(ex))
                    raise error

            self.logger.info("Processing request %s at %s" % (reqName, workLoadUrl))
            units = queue.queueWork(workLoadUrl, request=reqName, team=team)
            self.logdb.delete(reqName, "error", this_thread=True, agent=False)
        except TERMINAL_EXCEPTIONS as ex:
            # fatal error - report back to ReqMgr
            self.logger.error('Permanent failure processing request "%s": %s' % (reqName, str(ex)))
            self.logger.info("Marking request %s as failed in ReqMgr" % reqName)
            self.reportRequestStatus(reqName, 'Failed', message=str(ex))
            continue
        except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
            # temporary problem - try again later
            msg = 'Error processing request "%s": will try again later.' % reqName
            msg += '\nError: "%s"' % str(ex)
            self.logger.info(msg)
            self.logdb.post(reqName, msg, 'error')
            continue
        except Exception as ex:
            # Log exception as it isn't a communication problem
            msg = 'Error processing request "%s": will try again later.' % reqName
            msg += '\nSee log for details.\nError: "%s"' % str(ex)
            self.logger.exception('Unknown error processing %s' % reqName)
            self.logdb.post(reqName, msg, 'error')
            continue

        self.logger.info('%s unit(s) queued for "%s"' % (units, reqName))
        work += units

    self.logger.info("%s element(s) obtained from RequestManager" % work)
    return work
class WorkQueueReqMgrInterface():
    """Helper class for ReqMgr interaction"""

    def __init__(self, **kwargs):
        if not kwargs.get('logger'):
            import logging
            kwargs['logger'] = logging
        self.logger = kwargs['logger']
        self.reqMgr = RequestManager(kwargs)
        self.previous_state = {}

    def __call__(self, queue):
        """Synchronize WorkQueue and RequestManager"""
        msg = ''
        try:  # pull in new work
            work = self.queueNewRequests(queue)
            msg += "New Work: %d\n" % work
        except Exception:
            self.logger.exception("Error caught during RequestManager pull")

        try:  # get additional open-running work
            extraWork = self.addNewElementsToOpenRequests(queue)
            msg += "Work added: %d\n" % extraWork
        except Exception:
            self.logger.exception("Error caught during RequestManager split")

        try:  # report back to ReqMgr
            uptodate_elements = self.report(queue)
            msg += "Updated ReqMgr status for: %s\n" % ", ".join([x['RequestName'] for x in uptodate_elements])
        except:
            self.logger.exception("Error caught during RequestManager update")
        else:
            try:  # Delete finished requests from WorkQueue
                self.deleteFinishedWork(queue, uptodate_elements)
            except:
                self.logger.exception("Error caught during work deletion")

        queue.backend.recordTaskActivity('reqmgr_sync', msg)

    def queueNewRequests(self, queue):
        """Get requests from ReqMgr and queue to workqueue"""
        self.logger.info("Contacting Request manager for more work")
        work = 0
        workLoads = []

        if queue.params['DrainMode']:
            self.logger.info('Draining queue: Skip requesting work from ReqMgr')
            return 0

        try:
            workLoads = self.getAvailableRequests(*queue.params['Teams'])
        except Exception as ex:
            msg = "Error contacting RequestManager: %s" % str(ex)
            self.logger.warning(msg)
            return 0

        for team, reqName, workLoadUrl in workLoads:
            # try:
            #     self.reportRequestStatus(reqName, "negotiating")
            # except Exception, ex:
            #     self.logger.error("""
            #         Unable to update ReqMgr state to negotiating: %s
            #         Ignoring this request: %s""" % (str(ex), reqName))
            #     continue

            try:
                try:
                    Lexicon.couchurl(workLoadUrl)
                except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                    # check it's not a local file
                    if not os.path.exists(workLoadUrl):
                        error = WorkQueueWMSpecError(None, "Workflow url validation error: %s" % str(ex))
                        raise error

                self.logger.info("Processing request %s at %s" % (reqName, workLoadUrl))
                units = queue.queueWork(workLoadUrl, request=reqName, team=team)
            except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
                # fatal error - report back to ReqMgr
                self.logger.info('Permanent failure processing request "%s": %s' % (reqName, str(ex)))
                self.logger.info("Marking request %s as failed in ReqMgr" % reqName)
                self.reportRequestStatus(reqName, 'Failed', message=str(ex))
                continue
def queueNewRequests(self, queue):
    """Get requests from ReqMgr and queue to workqueue"""
    self.logger.info("Contacting Request manager for more work")
    work = 0
    workLoads = []

    if queue.params['DrainMode']:
        self.logger.info('Draining queue: Skip requesting work from ReqMgr')
        return 0

    try:
        workLoads = self.getAvailableRequests(queue.params['Teams'])
    except Exception as ex:
        traceMsg = traceback.format_exc()
        msg = "Error contacting RequestManager: %s" % traceMsg
        self.logger.warning(msg)
        return 0

    for team, reqName, workLoadUrl in workLoads:
        # try:
        #     self.reportRequestStatus(reqName, "negotiating")
        # except Exception, ex:
        #     self.logger.error("""
        #         Unable to update ReqMgr state to negotiating: %s
        #         Ignoring this request: %s""" % (str(ex), reqName))
        #     continue

        try:
            try:
                Lexicon.couchurl(workLoadUrl)
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                # check it's not a local file
                if not os.path.exists(workLoadUrl):
                    error = WorkQueueWMSpecError(None, "Workflow url validation error: %s" % str(ex))
                    raise error

            self.logger.info("Processing request %s at %s" % (reqName, workLoadUrl))
            units = queue.queueWork(workLoadUrl, request=reqName, team=team)
        except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
            # fatal error - report back to ReqMgr
            self.logger.info('Permanent failure processing request "%s": %s' % (reqName, str(ex)))
            self.logger.info("Marking request %s as failed in ReqMgr" % reqName)
            self.reportRequestStatus(reqName, 'Failed', message=str(ex))
            continue
        except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
            # temporary problem - try again later
            msg = 'Error processing request "%s": will try again later.' \
                  '\nError: "%s"' % (reqName, str(ex))
            self.logger.info(msg)
            self.sendMessage(reqName, msg, 'error')
            continue
        except Exception as ex:
            # Log exception as it isn't a communication problem
            msg = 'Error processing request "%s": will try again later.' \
                  '\nSee log for details.\nError: "%s"' % (reqName, str(ex))
            self.logger.exception('Unknown error processing %s' % reqName)
            self.sendMessage(reqName, msg, 'error')
            continue

        try:
            if self.reqmgr2Only:
                self.reqMgr2.updateRequestStatus(reqName, "acquired")
            else:
                self.markAcquired(reqName, queue.params.get('QueueURL', 'No Queue'))
        except Exception as ex:
            self.logger.warning("Unable to update ReqMgr state: %s" % str(ex))
            self.logger.warning('Will try again later')

        self.logger.info('%s unit(s) queued for "%s"' % (units, reqName))
        work += units

    self.logger.info("%s element(s) obtained from RequestManager" % work)
    return work
def validBlocks(self, task, dbs):
    """Return blocks that pass the input data restriction"""
    datasetPath = task.getInputDatasetPath()
    Lexicon.dataset(datasetPath)  # check dataset name
    validBlocks = []
    locations = None

    blockWhiteList = task.inputBlockWhitelist()
    blockBlackList = task.inputBlockBlacklist()
    runWhiteList = task.inputRunWhitelist()
    runBlackList = task.inputRunBlacklist()

    if task.getTrustSitelists():
        siteWhitelist = task.siteWhitelist()
        siteBlacklist = task.siteBlacklist()
        self.sites = makeLocationsList(siteWhitelist, siteBlacklist)

    for blockName in dbs.listFileBlocks(datasetPath):
        block = dbs.getDBSSummaryInfo(datasetPath, block=blockName)

        # check block restrictions
        if blockWhiteList and block['block'] not in blockWhiteList:
            continue
        if block['block'] in blockBlackList:
            continue

        # check run restrictions
        if runWhiteList or runBlackList:
            # listRunLumis returns a dictionary with the lumi sections per run
            runLumis = dbs.listRunLumis(block=block['block'])
            runs = set(runLumis.keys())
            recalculateLumiCounts = False
            if len(runs) > 1:
                # If more than one run in the block,
                # then we must calculate the lumi counts after filtering the run list.
                # This has to be done rarely and requires calling DBS file information
                recalculateLumiCounts = True

            # apply blacklist
            runs = runs.difference(runBlackList)
            # if whitelist only accept listed runs
            if runWhiteList:
                runs = runs.intersection(runWhiteList)
            # any runs left are ones we will run on, if none ignore block
            if not runs:
                continue

            if recalculateLumiCounts:
                # get correct lumi count
                # Recalculate effective size of block
                # We pull out file info, since we don't do this often
                acceptedLumiCount = 0
                acceptedEventCount = 0
                acceptedFileCount = 0
                fileInfo = dbs.listFilesInBlock(fileBlockName=block['block'])
                for fileEntry in fileInfo:
                    acceptedFile = False
                    acceptedFileLumiCount = 0
                    for lumiInfo in fileEntry['LumiList']:
                        runNumber = lumiInfo['RunNumber']
                        if runNumber in runs:
                            acceptedFile = True
                            acceptedFileLumiCount += 1
                    if acceptedFile:
                        acceptedFileCount += 1
                        acceptedLumiCount += acceptedFileLumiCount
                        if len(fileEntry['LumiList']) != acceptedFileLumiCount:
                            acceptedEventCount += float(acceptedFileLumiCount) * fileEntry['NumberOfEvents'] / len(fileEntry['LumiList'])
                        else:
                            acceptedEventCount += fileEntry['NumberOfEvents']
            else:
                acceptedLumiCount = block["NumberOfLumis"]
                acceptedFileCount = block['NumberOfFiles']
                acceptedEventCount = block['NumberOfEvents']

            # recalculate effective size of block
            # make a guess for new event/file numbers from ratio
            # of accepted lumi sections (otherwise have to pull file info)
            block[self.lumiType] = acceptedLumiCount
            block['NumberOfFiles'] = acceptedFileCount
            block['NumberOfEvents'] = acceptedEventCount

        validBlocks.append(block)
        if locations is None:
            locations = set(dbs.listFileBlockLocation(block['block']))
        else:
            locations = locations.intersection(dbs.listFileBlockLocation(block['block']))

    # all needed blocks present at these sites
    if self.wmspec.getTrustLocationFlag():
        self.data[datasetPath] = self.sites
    elif locations:
        self.data[datasetPath] = list(set(self.siteDB.PNNstoPSNs(locations)))

    return validBlocks
def validBlocks(self, task, dbs):
    """Return blocks that pass the input data restriction"""
    datasetPath = task.getInputDatasetPath()
    validBlocks = []

    blockWhiteList = task.inputBlockWhitelist()
    blockBlackList = task.inputBlockBlacklist()
    runWhiteList = task.inputRunWhitelist()
    runBlackList = task.inputRunBlacklist()

    if task.getLumiMask():  # if we have a lumi mask get only the relevant blocks
        maskedBlocks = self.getMaskedBlocks(task, dbs, datasetPath)

    blocks = []
    # Take data inputs or from spec
    if not self.data:
        if blockWhiteList:
            self.data = dict((block, []) for block in blockWhiteList)
        else:
            self.data = {datasetPath: []}  # same structure as in WorkQueueElement

    for data in self.data:
        if data.find('#') > -1:
            Lexicon.block(data)  # check block name
            datasetPath = str(data.split('#')[0])
            blocks.append(str(data))
        else:
            Lexicon.dataset(data)  # check dataset name
            for block in dbs.listFileBlocks(data):
                blocks.append(str(block))

    for blockName in blocks:
        # check block restrictions
        if blockWhiteList and blockName not in blockWhiteList:
            continue
        if blockName in blockBlackList:
            continue
        if task.getLumiMask() and blockName not in maskedBlocks:
            continue

        block = dbs.getDBSSummaryInfo(datasetPath, block=blockName)
        # blocks with 0 valid files should be ignored
        # - ideally they would be deleted but dbs can't delete blocks
        if not block['NumberOfFiles'] or block['NumberOfFiles'] == '0':
            continue

        # check lumi restrictions
        if task.getLumiMask():
            accepted_lumis = sum([len(maskedBlocks[blockName][file]) for file in maskedBlocks[blockName]])
            # use the information given from getMaskedBlocks to compute the size of the block
            block['NumberOfFiles'] = len(maskedBlocks[blockName])
            # ratio = lumis which are ok in the block / total num lumis
            ratio_accepted = 1. * accepted_lumis / float(block['NumberOfLumis'])
            block['NumberOfEvents'] = float(block['NumberOfEvents']) * ratio_accepted
            block[self.lumiType] = accepted_lumis
        # check run restrictions
        elif runWhiteList or runBlackList:
            # listRuns returns a run number per lumi section
            full_lumi_list = dbs.listRuns(block=block['block'])
            runs = set(full_lumi_list)
            # apply blacklist
            runs = runs.difference(runBlackList)
            # if whitelist only accept listed runs
            if runWhiteList:
                runs = runs.intersection(runWhiteList)
            # any runs left are ones we will run on, if none ignore block
            if not runs:
                continue
            # recalculate effective size of block
            # make a guess for new event/file numbers from ratio
            # of accepted lumi sections (otherwise have to pull file info)
            accepted_lumis = [x for x in full_lumi_list if x in runs]
            ratio_accepted = 1. * len(accepted_lumis) / len(full_lumi_list)
            block[self.lumiType] = len(accepted_lumis)
            block['NumberOfFiles'] = float(block['NumberOfFiles']) * ratio_accepted
            block['NumberOfEvents'] = float(block['NumberOfEvents']) * ratio_accepted

        # save locations
        self.data[block['block']] = sitesFromStorageEelements(dbs.listFileBlockLocation(block['block']))

        validBlocks.append(block)
    return validBlocks
def validBlocks(self, task, dbs):
    """Return blocks that pass the input data restriction"""
    datasetPath = task.getInputDatasetPath()
    validBlocks = []

    blockWhiteList = task.inputBlockWhitelist()
    blockBlackList = task.inputBlockBlacklist()
    runWhiteList = task.inputRunWhitelist()
    runBlackList = task.inputRunBlacklist()

    if task.getLumiMask():  # if we have a lumi mask get only the relevant blocks
        maskedBlocks = self.getMaskedBlocks(task, dbs, datasetPath)

    if task.inputLocationFlag():
        # Then get the locations from the site whitelist/blacklist + SiteDB
        siteWhitelist = task.siteWhitelist()
        siteBlacklist = task.siteBlacklist()
        if siteWhitelist:
            # Just get the sites matching the whitelists
            self.sites = siteWhitelist
        elif siteBlacklist:
            # Get all CMS sites less the blacklist
            allSites = cmsSiteNames()
            self.sites = list(set(allSites) - set(siteBlacklist))
        else:
            # Run at any CMS site
            self.sites = cmsSiteNames()

    blocks = []
    # Take data inputs or from spec
    if not self.data:
        if blockWhiteList:
            self.data = dict((block, []) for block in blockWhiteList)
        else:
            self.data = {datasetPath: []}  # same structure as in WorkQueueElement

    for data in self.data:
        if data.find('#') > -1:
            Lexicon.block(data)  # check block name
            datasetPath = str(data.split('#')[0])
            blocks.append(str(data))
        else:
            Lexicon.dataset(data)  # check dataset name
            for block in dbs.listFileBlocks(data):
                blocks.append(str(block))

    for blockName in blocks:
        # check block restrictions
        if blockWhiteList and blockName not in blockWhiteList:
            continue
        if blockName in blockBlackList:
            continue
        if task.getLumiMask() and blockName not in maskedBlocks:
            continue

        block = dbs.getDBSSummaryInfo(datasetPath, block=blockName)
        # blocks with 0 valid files should be ignored
        # - ideally they would be deleted but dbs can't delete blocks
        if not block['NumberOfFiles'] or block['NumberOfFiles'] == '0':
            continue

        # check lumi restrictions
        if task.getLumiMask():
            accepted_lumis = sum([len(maskedBlocks[blockName][file]) for file in maskedBlocks[blockName]])
            # use the information given from getMaskedBlocks to compute the size of the block
            block['NumberOfFiles'] = len(maskedBlocks[blockName])
            # ratio = lumis which are ok in the block / total num lumis
            ratioAccepted = 1. * accepted_lumis / float(block['NumberOfLumis'])
            block['NumberOfEvents'] = float(block['NumberOfEvents']) * ratioAccepted
            block[self.lumiType] = accepted_lumis
        # check run restrictions
        elif runWhiteList or runBlackList:
            # listRunLumis returns a dictionary with the lumi sections per run
            runLumis = dbs.listRunLumis(block=block['block'])
            runs = set(runLumis.keys())
            recalculateLumiCounts = False
            if len(runs) > 1:
                # If more than one run in the block,
                # then we must calculate the lumi counts after filtering the run list.
                # This has to be done rarely and requires calling DBS file information
                recalculateLumiCounts = True

            # apply blacklist
            runs = runs.difference(runBlackList)
            # if whitelist only accept listed runs
            if runWhiteList:
                runs = runs.intersection(runWhiteList)
            # any runs left are ones we will run on, if none ignore block
            if not runs:
                continue

            if len(runs) == len(runLumis):
                # If there is no change in the runs, then we can skip recalculating lumi counts
                recalculateLumiCounts = False

            if recalculateLumiCounts:
                # Recalculate effective size of block
                # We pull out file info, since we don't do this often
                acceptedLumiCount = 0
                acceptedEventCount = 0
                acceptedFileCount = 0
                fileInfo = dbs.listFilesInBlock(fileBlockName=block['block'])
                for fileEntry in fileInfo:
                    acceptedFile = False
                    acceptedFileLumiCount = 0
                    for lumiInfo in fileEntry['LumiList']:
                        runNumber = lumiInfo['RunNumber']
                        if runNumber in runs:
                            acceptedFile = True
                            acceptedFileLumiCount += 1
                    if acceptedFile:
                        acceptedFileCount += 1
                        acceptedLumiCount += acceptedFileLumiCount
                        if len(fileEntry['LumiList']) != acceptedFileLumiCount:
                            acceptedEventCount += float(acceptedFileLumiCount) * fileEntry['NumberOfEvents'] / len(fileEntry['LumiList'])
                        else:
                            acceptedEventCount += fileEntry['NumberOfEvents']

                block[self.lumiType] = acceptedLumiCount
                block['NumberOfFiles'] = acceptedFileCount
                block['NumberOfEvents'] = acceptedEventCount

        # save locations
        if task.inputLocationFlag():
            self.data[block['block']] = self.sites
        else:
            self.data[block['block']] = sitesFromStorageEelements(dbs.listFileBlockLocation(block['block']))

        validBlocks.append(block)
    return validBlocks
def queueNewRequests(self, queue):
    """Get requests from ReqMgr and queue to workqueue"""
    self.logger.info("Contacting Request manager for more work")
    work = 0
    workLoads = []

    if queue.params['DrainMode']:
        self.logger.info('Draining queue: Skip requesting work from ReqMgr')
        return 0

    try:
        workLoads = self.getAvailableRequests(queue.params['Teams'])
    except Exception as ex:
        traceMsg = traceback.format_exc()
        msg = "Error contacting RequestManager: %s" % traceMsg
        self.logger.warning(msg)
        return 0

    for team, reqName, workLoadUrl in workLoads:
        # try:
        #     self.reportRequestStatus(reqName, "negotiating")
        # except Exception, ex:
        #     self.logger.error("""
        #         Unable to update ReqMgr state to negotiating: %s
        #         Ignoring this request: %s""" % (str(ex), reqName))
        #     continue

        try:
            try:
                Lexicon.couchurl(workLoadUrl)
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                # check it's not a local file
                if not os.path.exists(workLoadUrl):
                    error = WorkQueueWMSpecError(None, "Workflow url validation error: %s" % str(ex))
                    raise error
            self.logger.info("Processing request %s at %s" % (reqName, workLoadUrl))
            units = queue.queueWork(workLoadUrl, request=reqName, team=team)
            self.logdb.delete(reqName, "error", this_thread=True)
        except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
            # fatal error - report back to ReqMgr
            self.logger.error('Permanent failure processing request "%s": %s' % (reqName, str(ex)))
            self.logger.info("Marking request %s as failed in ReqMgr" % reqName)
            self.reportRequestStatus(reqName, 'Failed', message=str(ex))
            continue
        except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
            # temporary problem - try again later
            msg = 'Error processing request "%s": will try again later.' \
                  '\nError: "%s"' % (reqName, str(ex))
            self.logger.info(msg)
            self.logdb.post(reqName, msg, 'error')
            continue
        except Exception as ex:
            # Log exception as it isn't a communication problem
            msg = 'Error processing request "%s": will try again later.' \
                  '\nSee log for details.\nError: "%s"' % (reqName, str(ex))
            self.logger.exception('Unknown error processing %s' % reqName)
            self.logdb.post(reqName, msg, 'error')
            continue

        try:
            self.reportRequestStatus(reqName, "acquired")
        except Exception as ex:
            self.logger.warning("Unable to update ReqMgr state: %s" % str(ex))
            self.logger.warning('Will try again later')

        self.logger.info('%s unit(s) queued for "%s"' % (units, reqName))
        work += units

    self.logger.info("%s element(s) obtained from RequestManager" % work)
    return work
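# The three except clauses above implement a retry policy: spec errors are
# terminal, communication errors are retried, and anything else is logged
# with a traceback and retried. A hypothetical standalone sketch of the same
# classification, assuming the exception classes imported by this module:
def classifyFailure(exc):
    """Map a queueWork() exception to an action for the queueing loop."""
    if isinstance(exc, (WorkQueueWMSpecError, WorkQueueNoWorkError)):
        return 'fail'       # report 'Failed' to ReqMgr, do not retry
    if isinstance(exc, (IOError, socket.error, CouchError, CouchConnectionError)):
        return 'retry'      # transient communication problem, try again later
    return 'retry-unknown'  # unexpected error: log traceback, then retry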
    except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
        error = WorkQueueWMSpecError(self.wmspec,
                                     "Site whitelist validation error: %s" % str(ex))
        raise error

if self.initialTask.siteBlacklist():
    if type(self.initialTask.siteBlacklist()) in types.StringTypes:
        error = WorkQueueWMSpecError(self.wmspec,
                                     'Invalid site blacklist: Must be tuple/list but is %s' % type(self.initialTask.siteBlacklist()))
        raise error
    try:
        [Lexicon.cmsname(site) for site in self.initialTask.siteBlacklist()]
    except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
        error = WorkQueueWMSpecError(self.wmspec,
                                     "Site blacklist validation error: %s" % str(ex))
        raise error

# splitter settings
if self.args.get('SliceSize', 1) <= 0:
    error = WorkQueueWMSpecError(self.wmspec, 'Zero or negative SliceSize parameter')
    raise error
if self.args.get('SubSliceSize', 1) <= 0:
    error = WorkQueueWMSpecError(self.wmspec, 'Zero or negative SubSliceSize parameter')
    raise error
class WorkQueue(WorkQueueBase):
    """
    _WorkQueue_

    WorkQueue object - interface to WorkQueue functionality.
    """
    def __init__(self, logger=None, dbi=None, **params):
        WorkQueueBase.__init__(self, logger, dbi)
        self.parent_queue = None
        self.params = params

        # config argument (within params) shall be reference to
        # Configuration instance (will later be checked for presence of "Alert")
        self.config = params.get("Config", None)
        self.params.setdefault('CouchUrl', os.environ.get('COUCHURL'))
        if not self.params.get('CouchUrl'):
            raise RuntimeError('CouchUrl config value mandatory')
        self.params.setdefault('DbName', 'workqueue')
        self.params.setdefault('InboxDbName', self.params['DbName'] + '_inbox')
        self.params.setdefault('ParentQueueCouchUrl', None)  # We get work from here

        self.backend = WorkQueueBackend(self.params['CouchUrl'], self.params['DbName'],
                                        self.params['InboxDbName'],
                                        self.params['ParentQueueCouchUrl'],
                                        self.params.get('QueueURL'),
                                        logger=self.logger)
        if self.params.get('ParentQueueCouchUrl'):
            try:
                self.parent_queue = WorkQueueBackend(self.params['ParentQueueCouchUrl'].rsplit('/', 1)[0],
                                                     self.params['ParentQueueCouchUrl'].rsplit('/', 1)[1])
            except IndexError as ex:
                # Probable cause: Someone didn't put the global WorkQueue name in
                # the ParentCouchUrl
                msg = "Parsing failure for ParentQueueCouchUrl - probably missing dbname in input\n"
                msg += "Exception: %s\n" % str(ex)
                msg += "ParentQueueCouchUrl: %s\n" % self.params['ParentQueueCouchUrl']
                self.logger.error(msg)
                raise WorkQueueError(msg)
            self.params['ParentQueueCouchUrl'] = self.parent_queue.queueUrl

        self.params.setdefault("GlobalDBS",
                               "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet")
        self.params.setdefault('QueueDepth', 0.5)  # when less than this locally
        self.params.setdefault('LocationRefreshInterval', 600)
        self.params.setdefault('FullLocationRefreshInterval', 7200)
        self.params.setdefault('TrackLocationOrSubscription', 'subscription')
        self.params.setdefault('ReleaseIncompleteBlocks', False)
        self.params.setdefault('ReleaseRequireSubscribed', True)
        self.params.setdefault('PhEDExEndpoint', None)
        self.params.setdefault('PopulateFilesets', True)
        self.params.setdefault('LocalQueueFlag', True)
        self.params.setdefault('QueueRetryTime', 86400)
        self.params.setdefault('stuckElementAlertTime', 86400)
        self.params.setdefault('reqmgrCompleteGraceTime', 604800)
        self.params.setdefault('cancelGraceTime', 604800)

        self.params.setdefault('JobDumpConfig', None)
        self.params.setdefault('BossAirConfig', None)

        self.params['QueueURL'] = self.backend.queueUrl  # url this queue is visible on
        # backend took previous QueueURL and sanitized it
        self.params.setdefault('WMBSUrl', None)  # this will only be set on local Queue
        if self.params.get('WMBSUrl'):
            self.params['WMBSUrl'] = Lexicon.sanitizeURL(self.params['WMBSUrl'])['url']
        self.params.setdefault('Teams', [])
        self.params.setdefault('DrainMode', False)
        if self.params.get('CacheDir'):
            try:
                os.makedirs(self.params['CacheDir'])
            except OSError:
                pass
        elif self.params.get('PopulateFilesets'):
            raise RuntimeError('CacheDir mandatory for local queue')

        self.params.setdefault('SplittingMapping', {})
        self.params['SplittingMapping'].setdefault('DatasetBlock', {'name': 'Block', 'args': {}})
        self.params['SplittingMapping'].setdefault('MonteCarlo', {'name': 'MonteCarlo', 'args': {}})
        self.params['SplittingMapping'].setdefault('Dataset', {'name': 'Dataset', 'args': {}})
        self.params['SplittingMapping'].setdefault('Block', {'name': 'Block', 'args': {}})
        self.params['SplittingMapping'].setdefault('ResubmitBlock', {'name': 'ResubmitBlock', 'args': {}})

        self.params.setdefault('EndPolicySettings', {})

        assert(self.params['TrackLocationOrSubscription'] in ('subscription', 'location'))
        # Can only release blocks on location
        if self.params['TrackLocationOrSubscription'] == 'location':
            if self.params['SplittingMapping']['DatasetBlock']['name'] != 'Block':
                raise RuntimeError('Only blocks can be released on location')

        if self.params.get('PhEDEx'):
            self.phedexService = self.params['PhEDEx']
        else:
            phedexArgs = {}
            if self.params.get('PhEDExEndpoint'):
                phedexArgs['endpoint'] = self.params['PhEDExEndpoint']
            self.phedexService = PhEDEx(phedexArgs)

        if self.params.get('SiteDB'):
            self.SiteDB = self.params['SiteDB']
        else:
            self.SiteDB = SiteDB()

        if type(self.params['Teams']) in types.StringTypes:
            self.params['Teams'] = [x.strip() for x in self.params['Teams'].split(',')]

        self.dataLocationMapper = WorkQueueDataLocationMapper(self.logger, self.backend,
                                                              phedex=self.phedexService,
                                                              sitedb=self.SiteDB,
                                                              locationFrom=self.params['TrackLocationOrSubscription'],
                                                              incompleteBlocks=self.params['ReleaseIncompleteBlocks'],
                                                              requireBlocksSubscribed=not self.params['ReleaseIncompleteBlocks'],
                                                              fullRefreshInterval=self.params['FullLocationRefreshInterval'],
                                                              updateIntervalCoarseness=self.params['LocationRefreshInterval'])

        # initialize alerts sending client (self.sendAlert() method)
        # usage: self.sendAlert(levelNum, msg=msg) ; level - integer 1 .. 10
        # 1 - 4 - lower levels ; 5 - 10 higher levels
        preAlert, self.alertSender = alertAPI.setUpAlertsMessaging(self, compName="WorkQueueManager")
        self.sendAlert = alertAPI.getSendAlert(sender=self.alertSender, preAlert=preAlert)

        self.logger.debug("WorkQueue created successfully")
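# A hedged usage sketch for the constructor above: everything is driven by
# **params with setdefault fallbacks, so a caller only needs the values that
# have no default (CouchUrl, plus CacheDir when PopulateFilesets is on).
# The URLs and paths below are placeholders, not real endpoints:
#
#     queue = WorkQueue(CouchUrl='https://couch.example.com:5984',
#                       DbName='workqueue',
#                       CacheDir='/tmp/workqueue-cache',
#                       ParentQueueCouchUrl='https://global.example.com:5984/workqueue',
#                       Teams=['team_a', 'team_b'])
#
# Teams may also be passed as a comma-separated string; the constructor
# splits it into a list before use.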
def validBlocks(self, task, dbs):
    """Return blocks that pass the input data restriction"""
    datasetPath = task.getInputDatasetPath()
    Lexicon.dataset(datasetPath)  # check dataset name
    validBlocks = []
    locations = None

    blockWhiteList = task.inputBlockWhitelist()
    blockBlackList = task.inputBlockBlacklist()
    runWhiteList = task.inputRunWhitelist()
    runBlackList = task.inputRunBlacklist()
    siteWhiteList = task.siteWhitelist()

    for blockName in dbs.listFileBlocks(datasetPath):
        block = dbs.getDBSSummaryInfo(datasetPath, block=blockName)

        # check block restrictions
        if blockWhiteList and block['block'] not in blockWhiteList:
            continue
        if block['block'] in blockBlackList:
            continue

        # check run restrictions
        if runWhiteList or runBlackList:
            # listRunLumis returns a dictionary with the lumi sections per run
            runLumis = dbs.listRunLumis(block=block['block'])
            runs = set(runLumis.keys())
            recalculateLumiCounts = False
            if len(runs) > 1:
                # If more than one run in the block
                # Then we must calculate the lumi counts after filtering the run list
                # This has to be done rarely and requires calling DBS file information
                recalculateLumiCounts = True

            # apply blacklist
            runs = runs.difference(runBlackList)
            # if whitelist only accept listed runs
            if runWhiteList:
                runs = runs.intersection(runWhiteList)
            # any runs left are ones we will run on, if none ignore block
            if not runs:
                continue

            if recalculateLumiCounts:
                # Recalculate effective size of block
                # We pull out file info, since we don't do this often
                acceptedLumiCount = 0
                acceptedEventCount = 0
                acceptedFileCount = 0
                fileInfo = dbs.listFilesInBlock(fileBlockName=block['block'])
                for fileEntry in fileInfo:
                    acceptedFile = False
                    acceptedFileLumiCount = 0
                    for lumiInfo in fileEntry['LumiList']:
                        runNumber = lumiInfo['RunNumber']
                        if runNumber in runs:
                            acceptedFile = True
                            acceptedFileLumiCount += 1
                    if acceptedFile:
                        acceptedFileCount += 1
                        acceptedLumiCount += acceptedFileLumiCount
                        if len(fileEntry['LumiList']) != acceptedFileLumiCount:
                            acceptedEventCount += float(acceptedFileLumiCount) * fileEntry['NumberOfEvents'] / len(fileEntry['LumiList'])
                        else:
                            acceptedEventCount += fileEntry['NumberOfEvents']
            else:
                acceptedLumiCount = block["NumberOfLumis"]
                acceptedFileCount = block['NumberOfFiles']
                acceptedEventCount = block['NumberOfEvents']

            block[self.lumiType] = acceptedLumiCount
            block['NumberOfFiles'] = acceptedFileCount
            block['NumberOfEvents'] = acceptedEventCount

        validBlocks.append(block)
        if locations is None:
            locations = set(sitesFromStorageEelements(dbs.listFileBlockLocation(block['block'])))
        else:
            locations = locations.intersection(set(sitesFromStorageEelements(dbs.listFileBlockLocation(block['block']))))

    if locations is not None and self.wmspec.locationDataSourceFlag():
        locations = locations.union(siteWhiteList)

    # all needed blocks present at these sites
    if locations:
        self.data[datasetPath] = list(locations)
    return validBlocks
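# The location handling above intersects the sites hosting each valid block,
# so self.data[datasetPath] ends up listing only sites that host *every*
# block of the dataset. A minimal standalone sketch of that reduction, with
# hypothetical inputs:
def commonLocations(blockLocations):
    """Intersect per-block site lists; return sites hosting all blocks."""
    locations = None
    for sites in blockLocations:
        locations = set(sites) if locations is None else locations & set(sites)
    return sorted(locations or [])

# commonLocations([['T1_US_FNAL', 'T2_DE_DESY'], ['T1_US_FNAL']]) -> ['T1_US_FNAL']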
if self.initialTask.siteWhitelist():
    if type(self.initialTask.siteWhitelist()) in types.StringTypes:
        error = WorkQueueWMSpecError(self.wmspec,
                                     'Invalid site whitelist: Must be tuple/list but is %s' % type(self.initialTask.siteWhitelist()))
        raise error
    try:
        [Lexicon.cmsname(site) for site in self.initialTask.siteWhitelist()]
    except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
        error = WorkQueueWMSpecError(self.wmspec,
                                     "Site whitelist validation error: %s" % str(ex))
        raise error

if self.initialTask.siteBlacklist():
    if type(self.initialTask.siteBlacklist()) in types.StringTypes:
        error = WorkQueueWMSpecError(self.wmspec,
                                     'Invalid site blacklist: Must be tuple/list but is %s' % type(self.initialTask.siteBlacklist()))
        raise error
    try:
        [Lexicon.cmsname(site) for site in self.initialTask.siteBlacklist()]
    except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
        error = WorkQueueWMSpecError(self.wmspec,
                                     "Site blacklist validation error: %s" % str(ex))
        raise error

# splitter settings
if self.args.get('SliceSize', 1) <= 0:
    error = WorkQueueWMSpecError(self.wmspec, 'Zero or negative SliceSize parameter')
    raise error
if self.args.get('SubSliceSize', 1) <= 0:
    error = WorkQueueWMSpecError(self.wmspec, 'Zero or negative SubSliceSize parameter')
    raise error

# check input dataset is valid
try:
    if self.initialTask.getInputDatasetPath():
        Lexicon.dataset(self.initialTask.getInputDatasetPath())
except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
    error = WorkQueueWMSpecError(self.wmspec, "Input dataset validation error: %s" % str(ex))
    raise error
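# Every check above follows one pattern: call a WMCore.Lexicon checker (which
# raises on bad input) and re-raise as WorkQueueWMSpecError, so the request is
# failed rather than retried. A hedged sketch of that pattern; validateField
# and its arguments are hypothetical helpers, not WMCore API:
def validateField(wmspec, checker, value, label):
    """Run a Lexicon-style checker and wrap any failure in a spec error."""
    try:
        checker(value)
    except Exception as ex:  # Lexicon raises AssertionError, AttributeError, ...
        raise WorkQueueWMSpecError(wmspec, "%s validation error: %s" % (label, str(ex)))

# e.g. validateField(self.wmspec, Lexicon.cmsname, 'T2_CH_CERN', 'Site whitelist')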
def validBlocks(self, task, dbs):
    """Return blocks that pass the input data restriction"""
    datasetPath = task.getInputDatasetPath()
    Lexicon.dataset(datasetPath)  # check dataset name
    validBlocks = []
    locations = None

    blockWhiteList = task.inputBlockWhitelist()
    blockBlackList = task.inputBlockBlacklist()
    runWhiteList = task.inputRunWhitelist()
    runBlackList = task.inputRunBlacklist()

    for blockName in dbs.listFileBlocks(datasetPath):
        block = dbs.getDBSSummaryInfo(datasetPath, block=blockName)

        # check block restrictions
        if blockWhiteList and block['block'] not in blockWhiteList:
            continue
        if block['block'] in blockBlackList:
            continue

        # check run restrictions
        if runWhiteList or runBlackList:
            # listRunLumis returns a dictionary with the lumi sections per run
            runLumis = dbs.listRunLumis(block=block['block'])
            runs = set(runLumis.keys())

            # apply blacklist
            runs = runs.difference(runBlackList)
            # if whitelist only accept listed runs
            if runWhiteList:
                runs = runs.intersection(runWhiteList)
            # any runs left are ones we will run on, if none ignore block
            if not runs:
                continue

            # recalculate effective size of block
            # make a guess for new event/file numbers from ratio
            # of accepted lumi sections (otherwise have to pull file info)
            acceptedLumiCount = 0
            fullLumiCount = 0
            for run in runLumis:
                fullLumiCount += runLumis[run]
                if run in runs:
                    acceptedLumiCount += runLumis[run]
            ratioAccepted = float(acceptedLumiCount) / fullLumiCount
            block[self.lumiType] = acceptedLumiCount
            block['NumberOfFiles'] = int(float(block['NumberOfFiles']) * ratioAccepted)
            block['NumberOfEvents'] = int(float(block['NumberOfEvents']) * ratioAccepted)

        validBlocks.append(block)
        if locations is None:
            locations = set(sitesFromStorageEelements(dbs.listFileBlockLocation(block['block'])))
        else:
            locations = locations.intersection(set(sitesFromStorageEelements(dbs.listFileBlockLocation(block['block']))))

    # all needed blocks present at these sites
    if locations:
        self.data[datasetPath] = list(locations)
    return validBlocks
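# Worked example of the ratio-based guess above, with illustrative numbers
# only (not taken from any real dataset):
runLumis = {1: 10, 2: 30}   # lumis per run, hypothetical
runs = {2}                  # runs surviving the white/blacklists
accepted = sum(n for run, n in runLumis.items() if run in runs)   # 30
full = sum(runLumis.values())                                     # 40
ratio = float(accepted) / full                                    # 0.75
# a 1000-event, 8-file block rescales to int(1000 * 0.75) = 750 events and
# int(8 * 0.75) = 6 files. This is only a guess from lumi proportions; the
# Block policy variant earlier pulls per-file info from DBS when it needs
# exact counts.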