def testGrouper(self):
    """
    Test the grouper function (returns chunks of an iterable)
    """
    listChunks = [i for i in grouper(list(range(0, 7)), 3)]  # Want list(range) for python 3
    iterChunks = [i for i in grouper(xrange(0, 7), 3)]  # xrange becomes range in python 3
    for a, b in itertools.izip_longest(listChunks, iterChunks):
        self.assertEqual(a, b)
    self.assertEqual(listChunks[-1], [6])
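# For reference, a minimal grouper consistent with what this test asserts
# (successive lists of at most n items, with a shorter final chunk) could be
# sketched as below. This is a sketch only, not necessarily the production
# implementation exercised by the test.
from itertools import islice

def grouper(iterable, n):
    """Yield successive lists of at most n items from any iterable."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk

# grouper(range(0, 7), 3) yields [0, 1, 2], [3, 4, 5], [6]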
def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
    """
    Currently takes only one LFN at a time; the DBS API needs to be updated.
    """
    try:
        if blockName:
            lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
        elif lfns:
            lumiLists = []
            for slfn in grouper(lfns, 50):
                lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name=slfn))
        else:
            # shouldn't call this with both blockName and lfns empty,
            # but still return an empty dict for that case
            return {}
    except dbsClientException as ex:
        msg = "Error in "
        msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)

    lumiDict = {}
    for lumisItem in lumiLists:
        lumiDict.setdefault(lumisItem['logical_file_name'], [])
        item = {}
        item["RunNumber"] = lumisItem['run_num']
        item['LumiSectionNumber'] = lumisItem['lumi_section_num']
        lumiDict[lumisItem['logical_file_name']].append(item)

    return lumiDict
def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
    """
    Currently takes only one LFN at a time; the DBS API needs to be updated.
    """
    try:
        if blockName:
            lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
        elif lfns:
            lumiLists = []
            for slfn in grouper(lfns, 50):
                lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name=slfn))
        else:
            # guard added: without it lumiLists is unbound when neither
            # blockName nor lfns is provided
            return {}
    except dbsClientException as ex:
        msg = "Error in "
        msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)

    lumiDict = {}
    for lumisItem in lumiLists:
        lumiDict.setdefault(lumisItem['logical_file_name'], [])
        item = {}
        item["RunNumber"] = lumisItem['run_num']
        item['LumiSectionNumber'] = lumisItem['lumi_section_num']
        lumiDict[lumisItem['logical_file_name']].append(item)

    return lumiDict
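# Illustration only: the LFN and numbers below are invented, and the exact
# type of 'lumi_section_num' (single number vs. list) depends on the DBS API;
# _getLumiList simply copies that field into 'LumiSectionNumber'. The point
# is the shape of the returned mapping: one list of run/lumi dicts per LFN.
exampleLumiDict = {
    "/store/data/hypothetical/file.root": [
        {"RunNumber": 1, "LumiSectionNumber": [1, 2, 3]},
        {"RunNumber": 2, "LumiSectionNumber": [7]},
    ],
}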
def algorithm(self, parameters):
    """
    Get information from wmbs, workqueue and local couch.
    """
    try:
        data = self.fwjrAPI.getFWJRByArchiveStatus('ready', limit=self.numDocsRetrievePerPolling)['rows']
        logging.info("Found %i not archived documents from FWJR db to upload to WMArchive.", len(data))

        for slicedData in grouper(data, self.numDocsUploadPerCall):
            jobIDs = []
            archiveDocs = []
            for job in slicedData:
                doc = createArchiverDoc(job)
                archiveDocs.append(doc)
                jobIDs.append(job["id"])

            response = self.wmarchiver.archiveData(archiveDocs)

            # Partial success is not allowed: either all documents are inserted or none are
            if response[0]['status'] == "ok" and len(response[0]['ids']) == len(jobIDs):
                archiveIDs = response[0]['ids']
                for docID in jobIDs:
                    self.fwjrAPI.updateArchiveUploadedStatus(docID)
                logging.info("...successfully uploaded %d docs", len(jobIDs))
                logging.debug("JobIDs uploaded: %s", jobIDs)
                logging.debug("Archived IDs returned: %s", archiveIDs)
            else:
                logging.warning("Upload failed and it will be retried in the next cycle: %s: %s.",
                                response[0]['status'], response[0]['reason'])
                logging.debug("failed JobIds %s", jobIDs)
    except Exception as ex:
        logging.error("Error occurred, will retry later:")
        logging.error(str(ex))
        logging.error("Trace back: \n%s", traceback.format_exc())
def listFilesInBlockWithParents(self, fileBlockName, lumis=True, validFileOnly=1):
    """
    _listFilesInBlockWithParents_

    Get a list of files in the named fileblock, including the parents of each file.
    TODO: lumis can be False when lumi splitting is not required.
    However, WMBSHelper expects file['LumiList'] to get the run number,
    so for now it is always True.
    """
    if not self.blockExists(fileBlockName):
        msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
        raise DBSReaderError(msg % fileBlockName)

    try:
        # TODO: should we get only valid blocks for this?
        files = self.dbs.listFileParents(block_name=fileBlockName)
        fileDetails = self.listFilesInBlock(fileBlockName, lumis, validFileOnly)
    except dbsClientException as ex:
        msg = "Error in "
        msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (fileBlockName,)
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)

    childByParents = defaultdict(list)
    for f in files:
        # Probably a child can have more than 1 parent file
        for fp in f['parent_logical_file_name']:
            childByParents[fp].append(f['logical_file_name'])

    parentsLFNs = childByParents.keys()

    parentFilesDetail = []
    # TODO: slicing parentLFNs until the DBS API handles that.
    # Remove the slicing if the DBS API handles it.
    for pLFNs in grouper(parentsLFNs, 50):
        parentFilesDetail.extend(self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

    if lumis:
        parentLumis = self._getLumiList(lfns=parentsLFNs)

    parentsByLFN = defaultdict(list)
    for pf in parentFilesDetail:
        parentLFN = pf['logical_file_name']
        dbsFile = remapDBS3Keys(pf, stringify=True)
        if lumis:
            dbsFile["LumiList"] = parentLumis[parentLFN]
        for childLFN in childByParents[parentLFN]:
            parentsByLFN[childLFN].append(dbsFile)

    for fileInfo in fileDetails:
        fileInfo["ParentList"] = parentsByLFN[fileInfo['logical_file_name']]

    return fileDetails
def getClassAds(self):
    """
    _getClassAds_

    Grab CONDOR classAds using CONDOR-PYTHON

    This looks at the schedd running on the
    Submit-Host and edits/removes jobs
    """
    jobInfo = {}
    schedd = condor.Schedd()

    try:
        logging.debug("Start: Retrieving classAds using Condor Python XQuery")
        itobj = schedd.xquery(
            'WMAgent_JobID =!= "UNDEFINED" && WMAgent_AgentName == %s' % classad.quote(str(self.agent)),
            ["JobStatus", "EnteredCurrentStatus", "JobStartDate", "QDate", "DESIRED_Sites",
             "ExtDESIRED_Sites", "MATCH_EXP_JOBGLIDEIN_CMSSite", "WMAgent_JobID"]
        )
        logging.debug("Finish: Retrieving classAds using Condor Python XQuery")
    except:
        msg = "Query to condor schedd failed in PyCondorPlugin"
        logging.debug(msg)
        return None, None
    else:
        for slicedAds in grouper(itobj, 1000):
            for jobAd in slicedAds:
                ### This condition ignores jobs that are Removed, but stay in the X state
                ### For manual condor_rm removal, the job won't be in the queue
                ### and the status of the job will be read from the condor log
                if jobAd["JobStatus"] == 3:
                    continue
                else:
                    ## Due to some strange race condition, schedd sometimes does not publish StartDate for a Running Job
                    ## Get the entire classad for such a job
                    ## Do not crash WMA, wait for next polling cycle to get all the info.
                    if jobAd["JobStatus"] == 2 and jobAd.get("JobStartDate") is None:
                        logging.debug("THIS SHOULD NOT HAPPEN. JobStartDate is MISSING from the CLASSAD.")
                        logging.debug("Could be caused by some race condition. Wait for the next Polling Cycle")
                        logging.debug("%s", str(jobAd))
                        continue

                    tmpDict = {}
                    tmpDict["JobStatus"] = int(jobAd.get("JobStatus", 100))
                    tmpDict["stateTime"] = int(jobAd["EnteredCurrentStatus"])
                    tmpDict["runningTime"] = int(jobAd.get("JobStartDate", 0))
                    tmpDict["submitTime"] = int(jobAd["QDate"])
                    tmpDict["DESIRED_Sites"] = jobAd["DESIRED_Sites"]
                    tmpDict["ExtDESIRED_Sites"] = jobAd["ExtDESIRED_Sites"]
                    tmpDict["runningCMSSite"] = jobAd.get("MATCH_EXP_JOBGLIDEIN_CMSSite", None)
                    tmpDict["WMAgentID"] = int(jobAd["WMAgent_JobID"])
                    jobInfo[tmpDict["WMAgentID"]] = tmpDict

        logging.info("Retrieved %i classAds", len(jobInfo))

    return jobInfo, schedd
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription
    """
    successfulJobs = []
    failedJobs = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    schedd = htcondor.Schedd()

    # Submit the jobs
    for jobsReady in grouper(jobs, self.jobsPerSubmit):

        cluster_ad = self.getClusterAd()
        proc_ads = self.getProcAds(jobsReady)

        logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany" % len(proc_ads))
        try:
            clusterId = schedd.submitMany(cluster_ad, proc_ads)
        except Exception as ex:
            logging.error("SimpleCondorPlugin job submission failed.")
            logging.error("Moving on to the next batch of jobs and/or cycle....")
            logging.exception(ex)

            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
            for job in jobsReady:
                job['fwjr'] = condorErrorReport
                failedJobs.append(job)
        else:
            logging.debug("Finish: Submitting jobs using Condor Python SubmitMany")
            for index, job in enumerate(jobsReady):
                job['gridid'] = "%s.%s" % (clusterId, index)
                job['status'] = '1'
                successfulJobs.append(job)

    # We must return a list of jobs successfully submitted and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
    return successfulJobs, failedJobs
def createDirectories(dirList):
    """
    Create the directories if everything is sane
    """
    for sdirList in grouper(dirList, 500):
        cmdArgs = ['mkdir']
        cmdArgs.extend(sdirList)

        pipe = Popen(cmdArgs, stdout=PIPE, stderr=PIPE, shell=False)
        stdout, stderr = pipe.communicate()

        if not stderr == "":
            msg = "Error in making directories: %s\n" % stderr
            logging.error(msg)
            logging.debug("Executing command %s\n" % cmdArgs)
            raise CreateWorkAreaException(msg)

    return
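# Usage sketch (hypothetical paths): grouper(dirList, 500) bounds the number
# of arguments passed to each mkdir invocation, so even a very large list of
# work directories is created in a handful of subprocess calls.
exampleDirs = ["/tmp/workarea/job_%04d" % i for i in range(1200)]
createDirectories(exampleDirs)  # issues three mkdir calls: 500 + 500 + 200 paths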
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription
    """
    successfulJobs = []
    failedJobs = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    schedd = htcondor.Schedd()

    # Submit the jobs
    for jobsReady in grouper(jobs, self.jobsPerSubmit):

        cluster_ad = self.getClusterAd()
        proc_ads = self.getProcAds(jobsReady)

        logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany" % len(proc_ads))
        try:
            clusterId = schedd.submitMany(cluster_ad, proc_ads)
        except Exception as ex:
            logging.error("SimpleCondorPlugin job submission failed.")
            logging.error("Moving on to the next batch of jobs and/or cycle....")
            logging.exception(ex)

            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
            for job in jobsReady:
                job['fwjr'] = condorErrorReport
                failedJobs.append(job)
        else:
            logging.debug("Finish: Submitting jobs using Condor Python SubmitMany")
            for index, job in enumerate(jobsReady):
                job['gridid'] = "%s.%s" % (clusterId, index)
                job['status'] = 'Idle'
                successfulJobs.append(job)

    # We must return a list of jobs successfully submitted and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
    return successfulJobs, failedJobs
def archiveJobs(self):
    """
    _archiveJobs_

    archiveJobs will handle the master task of looking for finished jobs,
    and running the code that cleans them out.
    """
    doneList = self.findFinishedJobs()
    logging.info("Found %i finished jobs to archive", len(doneList))

    jobCounter = 0
    for slicedList in grouper(doneList, 10000):
        self.cleanWorkArea(slicedList)

        successList = []
        failList = []
        killList = []
        for job in slicedList:
            if job["outcome"] == "success":
                successList.append(job)
            elif job["outcome"] == "killed":
                killList.append(job)
            else:
                failList.append(job)

        if self.uploadPublishInfo:
            self.createAndUploadPublish(successList)

        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.changeState.propagate(successList, "cleanout", "success")
        self.changeState.propagate(failList, "cleanout", "exhausted")
        self.changeState.propagate(killList, "cleanout", "killed")
        myThread.transaction.commit()

        jobCounter += len(slicedList)
        logging.info("Successfully archived %d jobs out of %d.", jobCounter, len(doneList))
def algorithm(self, parameters):
    """
    Get information from wmbs, workqueue and local couch.
    """
    try:
        logging.info("Getting not archived data info from FWJR db...")
        data = self.fwjrAPI.getFWJRByArchiveStatus('ready', limit=1000)['rows']

        for slicedData in grouper(data, self.numDocsUploadPerCall):
            jobIDs = []
            archiverDocs = []
            for job in slicedData:
                doc = createArchiverDoc(job)
                archiverDocs.append(doc)
                jobIDs.append(job["id"])

            response = self.wmarchiver.archiveData(archiverDocs)

            # Partial success is not allowed: either all documents are inserted or none are.
            if response[0]['status'] == "ok" and len(response[0]['ids']) == len(jobIDs):
                archiveIDs = response[0]['ids']
                for docID in jobIDs:
                    self.fwjrAPI.updateArchiveUploadedStatus(docID)
                logging.info("...successfully uploaded %d docs", len(jobIDs))
                logging.debug("JobIDs uploaded: %s", jobIDs)
                logging.debug("Archive IDs returned: %s", response[0]['ids'])

                # duplicates exist when the set is smaller than the list
                # (the original condition was inverted)
                if len(set(archiveIDs)) != len(archiveIDs):
                    duplicateIDs = set([x for x in archiveIDs if archiveIDs.count(x) > 1])
                    logging.info("There are duplicate entries: %s", duplicateIDs)
            else:
                logging.warning("Upload failed: %s: %s", response[0]['status'], response[0]['reason'])
                logging.debug("failed JobIds %s", jobIDs)
    except Exception as ex:
        logging.error("Error occurred, will retry later:")
        logging.error(str(ex))
        logging.error("Trace back: \n%s", traceback.format_exc())
def execute(self, emulator=None):
    """
    _execute_
    """
    scramCommand = self.step.application.setup.scramCommand
    scramArch = self.step.application.setup.scramArch
    cmsswVersion = self.step.application.setup.cmsswVersion

    # Are we using emulators again?
    if (emulator != None):
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to over an hour
    waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

    # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
    castorStageOutParams = {}
    castorStageOutParams['command'] = overrides.get('command', "xrdcp")
    castorStageOutParams['option'] = overrides.get('option', "--cerncastor")
    castorStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    castorStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms")

    # hardcode CERN EOS T2_CH_CERN stageout parameters
    eosStageOutParams = {}
    eosStageOutParams['command'] = overrides.get('command', "xrdcp")
    eosStageOutParams['option'] = overrides.get('option', "")
    eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms")

    # are we using the new stageout method ?
    useNewStageOutCode = False
    if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
        useNewStageOutCode = True

    try:
        castorStageOutMgr = StageOutMgr(**castorStageOutParams)
        eosStageOutMgr = StageOutMgr(**eosStageOutParams)
        stageInMgr = StageInMgr()
        deleteMgr = DeleteMgr()
    except Exception as ex:
        msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60312, "MgrImplementationError", msg)

    # prepare output tar file
    taskName = self.report.getTaskName().split('/')[-1]
    host = socket.gethostname().split('.')[0]
    tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
    tarLocation = os.path.join(self.stepSpace.location, tarName)

    # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X)
    result = re.match("CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion)
    useEdmCopyUtil = False
    if result:
        try:
            if int(result.group(1)) >= 8:
                useEdmCopyUtil = True
        except ValueError:
            pass

    # setup Scram needed to run edmCopyUtil
    if useEdmCopyUtil:
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )
        logging.info("Running scram")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    # iterate through input files
    localLogs = []
    deleteLogArchives = []
    if useEdmCopyUtil:
        numberOfFilesPerCopy = 10
    else:
        numberOfFilesPerCopy = 1
    for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

        copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)
        for log in logs:
            copyCommand += " %s" % log['lfn']
        copyCommand += " %s" % self.step.builder.workingDir

        # give up after timeout of 1 minute per input file
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60 * numberOfFilesPerCopy)

        filesCopied = False
        try:
            if useEdmCopyUtil:
                logging.info("Running edmCopyUtil")
                retval = scram(copyCommand)
                if retval == 0:
                    filesCopied = True
            else:
                logging.info("Running stageIn")
                for log in logs:
                    fileInfo = {"LFN": log['lfn']}
                    logArchive = stageInMgr(**fileInfo)
                    if logArchive:
                        filesCopied = True
        except Alarm:
            logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
        except StageOutFailure:
            logging.error("Unable to stageIn logArchives")
        except Exception:
            raise

        signal.alarm(0)

        if filesCopied:
            for log in logs:
                localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                deleteLogArchives.append(log)
                self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
        else:
            logging.error("Unable to copy logArchives to local disk")
            if useEdmCopyUtil:
                with open('scramOutput.log', 'r') as f:
                    logging.error("Scram output: %s", f.read())
            for log in logs:
                self.report.addSkippedFile(log['lfn'], None)

    # create tarfile if any logArchive copied in
    if localLogs:
        tarFile = tarfile.open(tarLocation, 'w:')
        for log in localLogs:
            path = log.split('/')
            tarFile.add(name=log, arcname=os.path.join(path[-3], path[-2], path[-1]))
            os.remove(log)
        tarFile.close()
    else:
        msg = "Unable to copy any logArchives to local disk"
        logging.error(msg)
        raise WMExecutionFailure(60312, "LogCollectError", msg)

    # now staging out the LogCollect tarfile
    logging.info("Staging out LogCollect tarfile to Castor and EOS")
    now = datetime.datetime.now()
    lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                 self.report.data.workload,
                                                 os.path.basename(tarLocation))

    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'PNN': None,
               'GUID': None}

    # perform mandatory stage out to CERN Castor
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        castorStageOutMgr(tarInfo)
    except Alarm:
        msg = "Indefinite hang during stageOut of LogCollect to Castor"
        logging.error(msg)
        raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
    except Exception as ex:
        msg = "Unable to stageOut LogCollect to Castor:\n"
        msg += str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
    signal.alarm(0)

    # add to job report
    self.report.addOutputFile(outputModule="LogCollect", file=tarInfo)
    outputRef = getattr(self.report.data, self.stepName)
    outputRef.output.pfn = tarInfo['PFN']
    outputRef.output.location = tarInfo['PNN']
    outputRef.output.lfn = tarInfo['LFN']

    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'PNN': None,
               'GUID': None}

    # then, perform best effort stage out to CERN EOS
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        eosStageOutMgr(tarInfo)
    except Alarm:
        logging.error("Indefinite hang during stageOut of LogCollect to EOS")
    except Exception as ex:
        logging.error("Unable to stageOut LogCollect to EOS: %s", ex)
    signal.alarm(0)

    # we got this far, delete input
    for log in deleteLogArchives:

        # give up after timeout of 1 minute
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60)

        try:
            fileToDelete = {'LFN': log['lfn'],
                            'PFN': None,
                            'PNN': None,
                            'StageOutCommand': None}
            deleteMgr(fileToDelete=fileToDelete)
        except Alarm:
            logging.error("Indefinite hang during delete of logArchive")
        except Exception as ex:
            logging.error("Unable to delete logArchive: %s", ex)

        signal.alarm(0)

    return
def execute(self, emulator=None):
    """
    _execute_
    """
    scramCommand = self.step.application.setup.scramCommand
    scramArch = self.step.application.setup.scramArch
    cmsswVersion = self.step.application.setup.cmsswVersion

    # Are we using emulators again?
    if (emulator != None):
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to over an hour
    waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

    # Pull out StageOutMgr Overrides
    # switch between old stageOut behavior and new, fancy stage out behavior
    useNewStageOutCode = False
    if 'newStageOut' in overrides and overrides.get('newStageOut'):
        useNewStageOutCode = True

    # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
    castorStageOutParams = {}
    castorStageOutParams['command'] = overrides.get('command', "srmv2-lcg")
    castorStageOutParams['option'] = overrides.get('option', "")
    castorStageOutParams['se-name'] = overrides.get('se-name', "srm-cms.cern.ch")
    castorStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    castorStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "srm://srm-cms.cern.ch:8443/srm/managerv2?SFN=/castor/cern.ch/cms")

    # hardcode CERN EOS T2_CH_CERN stageout parameters
    eosStageOutParams = {}
    eosStageOutParams['command'] = overrides.get('command', "srmv2-lcg")
    eosStageOutParams['option'] = overrides.get('option', "")
    eosStageOutParams['se-name'] = overrides.get('se-name', "srm-eoscms.cern.ch")
    eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "srm://srm-eoscms.cern.ch:8443/srm/v2/server?SFN=/eos/cms")

    # are we using the new stageout method ?
    useNewStageOutCode = False
    if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
        useNewStageOutCode = True

    try:
        if useNewStageOutCode:
            # is this even working ???
            # logging.info("LOGCOLLECT IS USING NEW STAGEOUT CODE")
            # stageOutMgr = StageOutMgr(retryPauseTime = self.step.retryDelay,
            #                           numberOfRetries = self.step.retryCount,
            #                           **overrides)
            # stageInMgr = StageInMgr(retryPauseTime = 0,
            #                         numberOfRetries = 0)
            # deleteMgr = DeleteMgr(retryPauseTime = 0,
            #                       numberOfRetries = 0)
            castorStageOutMgr = StageOutMgr(**castorStageOutParams)
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
        else:
            castorStageOutMgr = StageOutMgr(**castorStageOutParams)
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
    except Exception as ex:
        msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60312, "MgrImplementationError", msg)

    # prepare output tar file
    taskName = self.report.getTaskName().split('/')[-1]
    host = socket.gethostname().split('.')[0]
    tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
    tarLocation = os.path.join(self.stepSpace.location, tarName)

    # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X)
    result = re.match("CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion)
    useEdmCopyUtil = False
    if result:
        try:
            if int(result.group(1)) >= 8:
                useEdmCopyUtil = True
        except ValueError:
            pass

    # setup Scram needed to run edmCopyUtil
    if useEdmCopyUtil:
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )
        logging.info("Running scram")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    # iterate through input files
    localLogs = []
    deleteLogArchives = []
    if useEdmCopyUtil:
        numberOfFilesPerCopy = 10
    else:
        numberOfFilesPerCopy = 1
    for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

        copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)
        for log in logs:
            copyCommand += " %s" % log['lfn']
        copyCommand += " %s" % self.step.builder.workingDir

        # give up after timeout of 1 minute per input file
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60 * numberOfFilesPerCopy)

        filesCopied = False
        try:
            if useEdmCopyUtil:
                logging.info("Running edmCopyUtil")
                retval = scram(copyCommand)
                if retval == 0:
                    filesCopied = True
            else:
                logging.info("Running stageIn")
                for log in logs:
                    fileInfo = {"LFN": log['lfn']}
                    logArchive = stageInMgr(**fileInfo)
                    if logArchive:
                        filesCopied = True
        except Alarm:
            logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
        except StageOutFailure:
            logging.error("Unable to stageIn logArchives")
        except Exception:
            raise

        signal.alarm(0)

        if filesCopied:
            for log in logs:
                localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                deleteLogArchives.append(log)
                self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
        else:
            logging.error("Unable to copy logArchives to local disk")
            if useEdmCopyUtil:
                with open('scramOutput.log', 'r') as f:
                    logging.error("Scram output: %s", f.read())
            for log in logs:
                self.report.addSkippedFile(log['lfn'], None)

    # create tarfile if any logArchive copied in
    if localLogs:
        tarFile = tarfile.open(tarLocation, 'w:')
        for log in localLogs:
            path = log.split('/')
            tarFile.add(name=log, arcname=os.path.join(path[-3], path[-2], path[-1]))
            os.remove(log)
        tarFile.close()
    else:
        msg = "Unable to copy any logArchives to local disk"
        logging.error(msg)
        raise WMExecutionFailure(60312, "LogCollectError", msg)

    # now staging out the LogCollect tarfile
    logging.info("Staging out LogCollect tarfile to Castor and EOS")
    now = datetime.datetime.now()
    lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                 self.report.data.workload,
                                                 os.path.basename(tarLocation))

    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'SEName': None,
               'PNN': None,
               'GUID': None}

    # perform mandatory stage out to CERN Castor
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        castorStageOutMgr(tarInfo)
    except Alarm:
        msg = "Indefinite hang during stageOut of LogCollect to Castor"
        logging.error(msg)
        raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
    except Exception as ex:
        msg = "Unable to stageOut LogCollect to Castor:\n"
        msg += str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
    signal.alarm(0)

    # add to job report
    self.report.addOutputFile(outputModule="LogCollect", file=tarInfo)
    outputRef = getattr(self.report.data, self.stepName)
    outputRef.output.pfn = tarInfo['PFN']
    outputRef.output.location = tarInfo['PNN']
    outputRef.output.lfn = tarInfo['LFN']

    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'SEName': None,
               'PNN': None,
               'GUID': None}

    # then, perform best effort stage out to CERN EOS
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        eosStageOutMgr(tarInfo)
    except Alarm:
        logging.error("Indefinite hang during stageOut of LogCollect to EOS")
    except Exception as ex:
        logging.error("Unable to stageOut LogCollect to EOS: %s", ex)
    signal.alarm(0)

    # we got this far, delete input
    for log in deleteLogArchives:

        # give up after timeout of 1 minute
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60)

        try:
            fileToDelete = {'LFN': log['lfn'],
                            'PFN': None,
                            'SEName': None,
                            'PNN': None,
                            'StageOutCommand': None}
            deleteMgr(fileToDelete=fileToDelete)
        except Alarm:
            logging.error("Indefinite hang during delete of logArchive")
        except Exception as ex:
            logging.error("Unable to delete logArchive: %s", ex)

        signal.alarm(0)

    return
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription
    """
    # If we're here, then we have submitter components
    self.scriptFile = self.config.JobSubmitter.submitScript
    self.submitDir = self.config.JobSubmitter.submitDir
    timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)

    successfulJobs = []
    failedJobs = []
    jdlFiles = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    if len(self.pool) == 0:
        # Starting things up
        # This is obviously a submit API
        logging.info("Starting up PyCondorPlugin worker pool")
        self.inputQueue = multiprocessing.Queue()
        self.result = multiprocessing.Queue()
        for x in range(self.nProcess):
            p = multiprocessing.Process(target=submitWorker,
                                        args=(self.inputQueue, self.result, timeout))
            p.start()
            self.pool.append(p)

    if not os.path.exists(self.submitDir):
        os.makedirs(self.submitDir)

    # Submit the jobs
    nSubmits = 0
    queueError = False
    for jobsReady in grouper(jobs, self.jobsPerWorker):

        if queueError:
            # If the queue has failed, then we must not process any more jobs this cycle.
            break

        idList = [x['id'] for x in jobsReady]
        jdlList = self.makeSubmit(jobList=jobsReady)
        if not jdlList:
            # Then we got nothing
            logging.error("No JDL file made!")
            return {'NoResult': [0]}
        jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])

        with open(jdlFile, 'w') as handle:
            handle.writelines(jdlList)
        jdlFiles.append(jdlFile)

        # Now submit them
        logging.info("About to submit %i jobs", len(jobsReady))
        if self.glexecPath:
            command = 'CS=`which condor_submit`; '
            if self.glexecWrapScript:
                command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
            command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
            command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
            command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
            command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
            if self.glexecUnwrapScript:
                command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile)
            else:
                command += '%s $CS %s' % (self.glexecPath, jdlFile)
        else:
            command = "condor_submit %s" % jdlFile

        try:
            self.inputQueue.put({'command': command, 'idList': idList})
        except AssertionError as ex:
            msg = "Critical error: input pipeline probably closed.\n"
            msg += str(ex)
            msg += "Error Procedure: Something critical has happened in the worker process\n"
            msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
            msg += "Then refresh the worker pool\n"
            logging.error(msg)
            queueError = True
            break
        nSubmits += 1

    # Now we should have sent all jobs to be submitted
    # Going to do the rest of it now
    for dummy in range(nSubmits):
        try:
            res = self.result.get(block=True, timeout=timeout)
        except Queue.Empty:
            # If the queue was empty go to the next submit
            # Those jobs have vanished
            logging.error("Queue.Empty error received!")
            logging.error("This could indicate a critical condor error!")
            logging.error("However, no information of any use was obtained due to process failure.")
            logging.error("Either process failed, or process timed out after %s seconds.", timeout)
            continue
        except AssertionError as ex:
            msg = "Found Assertion error while retrieving output from worker process.\n"
            msg += str(ex)
            msg += "This indicates something critical happened to a worker process"
            msg += "We will recover what jobs we know were submitted, and resubmit the rest"
            msg += "Refreshing worker pool at end of loop"
            logging.error(msg)
            continue

        try:
            dummyOut = res['stdout']
            error = res['stderr']
            idList = res['idList']
            exitCode = res['exitCode']
        except KeyError as ex:
            msg = "Error in finding key from result pipe\n"
            msg += "Something has gone critically wrong in the worker\n"
            try:
                msg += "Result: %s\n" % str(res)
            except:
                pass
            msg += str(ex)
            logging.error(msg)
            continue

        if not exitCode == 0:
            logging.error("Condor returned non-zero. Printing out command stderr")
            logging.error(error)
            errorCheck, errorMsg = parseError(error=error)
            logging.error("Processing failed jobs and proceeding to the next jobs.")
            logging.error("Do not restart component.")
        else:
            errorCheck = None

        if errorCheck:
            self.errorCount += 1
            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
            for jobID in idList:
                for job in jobs:
                    if job.get('id', None) == jobID:
                        job['fwjr'] = condorErrorReport
                        failedJobs.append(job)
                        break
        else:
            if self.errorCount > 0:
                self.errorCount -= 1
            for jobID in idList:
                for job in jobs:
                    if job.get('id', None) == jobID:
                        successfulJobs.append(job)
                        break

        # If we get a lot of errors in a row it's probably time to
        # report this to the operators.
        if self.errorCount > self.errorThreshold:
            try:
                msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                logging.error(msg)
                logging.error("Reporting to Alert system and continuing to process jobs")
                from WMCore.Alerts import API as alertAPI
                preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName="BossAirPyCondorPlugin")
                sendAlert = alertAPI.getSendAlert(sender=sender, preAlert=preAlert)
                sendAlert(6, msg=msg)
                sender.unregister()
                self.errorCount = 0
            except:
                # There's nothing we can really do here
                pass

    # Remove JDL files unless commanded otherwise
    if self.deleteJDLFiles:
        for f in jdlFiles:
            os.remove(f)

    # When we're finished, clean up the queue workers in order
    # to free up memory (in the midst of the process, the forked
    # memory space shouldn't be touched, so it should still be
    # shared, but after this point any action by the Submitter will
    # result in memory duplication).
    logging.info("Purging worker pool to clean up memory")
    self.close()

    # We must return a list of jobs successfully submitted,
    # and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in PyCondorPlugin")
    return successfulJobs, failedJobs
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription
    """
    # If we're here, then we have submitter components
    self.scriptFile = self.config.JobSubmitter.submitScript
    self.submitDir = self.config.JobSubmitter.submitDir
    timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)

    successfulJobs = []
    failedJobs = []
    jdlFiles = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    if len(self.pool) == 0:
        # Starting things up
        # This is obviously a submit API
        logging.info("Starting up PyCondorPlugin worker pool")
        self.inputQueue = multiprocessing.Queue()
        self.result = multiprocessing.Queue()
        for x in range(self.nProcess):
            p = multiprocessing.Process(target=submitWorker,
                                        args=(self.inputQueue, self.result, timeout))
            p.start()
            self.pool.append(p)

    if not os.path.exists(self.submitDir):
        os.makedirs(self.submitDir)

    # Submit the jobs
    nSubmits = 0
    queueError = False
    for jobsReady in grouper(jobs, self.jobsPerWorker):

        if queueError:
            # If the queue has failed, then we must not process any more jobs this cycle.
            break

        idList = [x['id'] for x in jobsReady]
        jdlList = self.makeSubmit(jobList=jobsReady)
        if not jdlList:
            # Then we got nothing
            logging.error("No JDL file made!")
            return {'NoResult': [0]}
        jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])

        with open(jdlFile, 'w') as handle:
            handle.writelines(jdlList)
        jdlFiles.append(jdlFile)

        # Now submit them
        logging.info("About to submit %i jobs", len(jobsReady))
        if self.glexecPath:
            command = 'CS=`which condor_submit`; '
            if self.glexecWrapScript:
                command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
            command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
            command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
            command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
            command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
            if self.glexecUnwrapScript:
                command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile)
            else:
                command += '%s $CS %s' % (self.glexecPath, jdlFile)
        else:
            command = "condor_submit %s" % jdlFile

        try:
            self.inputQueue.put({'command': command, 'idList': idList})
        except AssertionError as ex:
            msg = "Critical error: input pipeline probably closed.\n"
            msg += str(ex)
            msg += "Error Procedure: Something critical has happened in the worker process\n"
            msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
            msg += "Then refresh the worker pool\n"
            logging.error(msg)
            queueError = True
            break
        nSubmits += 1

    # Now we should have sent all jobs to be submitted
    # Going to do the rest of it now
    for dummy in range(nSubmits):
        try:
            res = self.result.get(block=True, timeout=timeout)
        except Queue.Empty:
            # If the queue was empty go to the next submit
            # Those jobs have vanished
            logging.error("Queue.Empty error received!")
            logging.error("This could indicate a critical condor error!")
            logging.error("However, no information of any use was obtained due to process failure.")
            logging.error("Either process failed, or process timed out after %s seconds.", timeout)
            continue
        except AssertionError as ex:
            msg = "Found Assertion error while retrieving output from worker process.\n"
            msg += str(ex)
            msg += "This indicates something critical happened to a worker process"
            msg += "We will recover what jobs we know were submitted, and resubmit the rest"
            msg += "Refreshing worker pool at end of loop"
            logging.error(msg)
            continue

        try:
            dummyOut = res['stdout']
            error = res['stderr']
            idList = res['idList']
            exitCode = res['exitCode']
        except KeyError as ex:
            msg = "Error in finding key from result pipe\n"
            msg += "Something has gone critically wrong in the worker\n"
            try:
                msg += "Result: %s\n" % str(res)
            except:
                pass
            msg += str(ex)
            logging.error(msg)
            continue

        if not exitCode == 0:
            logging.error("Condor returned non-zero. Printing out command stderr")
            logging.error(error)
            errorCheck, errorMsg = parseError(error=error)
            logging.error("Processing failed jobs and proceeding to the next jobs.")
            logging.error("Do not restart component.")
        else:
            errorCheck = None

        if errorCheck:
            self.errorCount += 1
            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
            for jobID in idList:
                for job in jobs:
                    if job.get('id', None) == jobID:
                        job['fwjr'] = condorErrorReport
                        failedJobs.append(job)
                        break
        else:
            if self.errorCount > 0:
                self.errorCount -= 1
            for jobID in idList:
                for job in jobs:
                    if job.get('id', None) == jobID:
                        successfulJobs.append(job)
                        break

        # If we get a lot of errors in a row it's probably time to
        # stop the component
        if self.errorCount > self.errorThreshold:
            msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
            logging.error(msg)
            raise BossAirPluginException(msg)

    # Remove JDL files unless commanded otherwise
    if self.deleteJDLFiles:
        for f in jdlFiles:
            os.remove(f)

    # When we're finished, clean up the queue workers in order
    # to free up memory (in the midst of the process, the forked
    # memory space shouldn't be touched, so it should still be
    # shared, but after this point any action by the Submitter will
    # result in memory duplication).
    logging.info("Purging worker pool to clean up memory")
    self.close()

    # We must return a list of jobs successfully submitted,
    # and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in PyCondorPlugin")
    return successfulJobs, failedJobs