def testCheckLumiInformation(self):
    """
    _testCheckLumiInformation_

    Verify that checkForRunLumiInformation() accepts a report whose files
    carry run/lumi data, and flags exit code 70452 once that data is gone.
    """
    # A freshly parsed report still has run/lumi info: no error expected.
    goodReport = Report("cmsRun1")
    goodReport.parse(self.xmlPath)
    goodReport.checkForRunLumiInformation(stepName="cmsRun1")
    self.assertNotEqual(goodReport.getExitCode(), 70452)

    # Strip the lumi information on purpose and re-run the check: the step
    # must now be marked unsuccessful with the dedicated exit code.
    strippedReport = Report("cmsRun1")
    strippedReport.parse(self.xmlPath)
    for fileRef in strippedReport.getAllFileRefsFromStep(step="cmsRun1"):
        fileRef.runs = ConfigSection()
    strippedReport.checkForRunLumiInformation(stepName="cmsRun1")
    self.assertFalse(strippedReport.stepSuccessful(stepName="cmsRun1"))
    self.assertEqual(strippedReport.getExitCode(), 70452)
    return
def testCheckLumiInformation(self):
    """
    _testCheckLumiInformation_

    checkForRunLumiInformation() must pass on a report whose files carry
    run/lumi data and fail with exit code 60452 after that data is removed.
    """
    # Untouched report: the check must not produce the run/lumi error code.
    reportWithLumis = Report("cmsRun1")
    reportWithLumis.parse(self.xmlPath)
    reportWithLumis.checkForRunLumiInformation(stepName="cmsRun1")
    self.assertNotEqual(reportWithLumis.getExitCode(), 60452)

    # Remove the lumi information on purpose
    reportNoLumis = Report("cmsRun1")
    reportNoLumis.parse(self.xmlPath)
    for ref in reportNoLumis.getAllFileRefsFromStep(step="cmsRun1"):
        ref.runs = ConfigSection()
    reportNoLumis.checkForRunLumiInformation(stepName="cmsRun1")
    self.assertFalse(reportNoLumis.stepSuccessful(stepName="cmsRun1"))
    self.assertEqual(reportNoLumis.getExitCode(), 60452)
    return
def testASONoNameChange(self):
    """
    _testASONoNameChange_

    With transfer failures disabled and preserveLFN enabled, every LFN in
    the round-tripped job report must keep its 'store/temp' prefix.
    """
    AsyncStageOut_t.FakeTransferWorker.setFailProbability(0)
    testJob = self.roundtripHelper(preserveLFN=True)
    stepReport = Report('cmsRun1')
    stepReport.unpersist(testJob['fwjr_path'])
    # Renamed loop variable: 'file' shadowed the builtin in the original.
    for fileRef in stepReport.getAllFileRefsFromStep(step='cmsRun1'):
        self.assertNotEqual(fileRef.lfn.find('store/temp'), -1,
                            "The lfn should still have store/temp: %s" % fileRef.lfn)
def testGetAdlerChecksum(self):
    """
    _testGetAdlerChecksum_

    Exercise checkForAdlerChecksum(): the default XML report carries no
    checksums, so the check must fail with 60451; an explicit None adler32
    must also fail; a concrete adler32 value must not trip 60451.
    """
    # Default report: no checksums at all -> failure with 60451.
    noChecksumReport = Report("cmsRun1")
    noChecksumReport.parse(self.xmlPath)
    noChecksumReport.checkForAdlerChecksum(stepName="cmsRun1")
    self.assertFalse(noChecksumReport.stepSuccessful(stepName="cmsRun1"))
    self.assertEqual(noChecksumReport.getExitCode(), 60451)

    # adler32 explicitly set to None -> still a failure.
    noneChecksumReport = Report("cmsRun1")
    noneChecksumReport.parse(self.xmlPath)
    for fileRef in noneChecksumReport.getAllFileRefsFromStep(step="cmsRun1"):
        fileRef.checksums = {'adler32': None}
    noneChecksumReport.checkForAdlerChecksum(stepName="cmsRun1")
    self.assertFalse(noneChecksumReport.stepSuccessful(stepName="cmsRun1"))
    self.assertEqual(noneChecksumReport.getExitCode(), 60451)

    # A real adler32 value -> the 60451 exit code must not appear.
    goodChecksumReport = Report("cmsRun1")
    goodChecksumReport.parse(self.xmlPath)
    for fileRef in goodChecksumReport.getAllFileRefsFromStep(step="cmsRun1"):
        fileRef.checksums = {'adler32': 100}
    goodChecksumReport.checkForAdlerChecksum(stepName="cmsRun1")
    self.assertTrue(goodChecksumReport.getExitCode() != 60451)
    return
def testGetAdlerChecksum(self):
    """
    _testGetAdlerChecksum_

    Three scenarios for checkForAdlerChecksum(): missing checksums fail,
    a None adler32 fails, and a concrete adler32 passes (no 60451).
    """
    # Scenario 1: the stock XML report has no checksums -> must fail.
    reportA = Report("cmsRun1")
    reportA.parse(self.xmlPath)
    reportA.checkForAdlerChecksum(stepName="cmsRun1")
    self.assertFalse(reportA.stepSuccessful(stepName="cmsRun1"))
    self.assertEqual(reportA.getExitCode(), 60451)

    # Scenario 2: adler32 present but None -> must fail as well.
    reportB = Report("cmsRun1")
    reportB.parse(self.xmlPath)
    for ref in reportB.getAllFileRefsFromStep(step="cmsRun1"):
        ref.checksums = {'adler32': None}
    reportB.checkForAdlerChecksum(stepName="cmsRun1")
    self.assertFalse(reportB.stepSuccessful(stepName="cmsRun1"))
    self.assertEqual(reportB.getExitCode(), 60451)

    # Scenario 3: a genuine adler32 value -> no checksum error code.
    reportC = Report("cmsRun1")
    reportC.parse(self.xmlPath)
    for ref in reportC.getAllFileRefsFromStep(step="cmsRun1"):
        ref.checksums = {'adler32': 100}
    reportC.checkForAdlerChecksum(stepName="cmsRun1")
    self.assertTrue(reportC.getExitCode() != 60451)
    return
def post(self, emulator=None):
    """
    _post_

    Post execution checkpointing: decorate the file references found in
    every sibling step's pickled report with user / async-stage-out
    metadata taken from this step, then write each report back to disk.
    """
    # Hand off to the emulator when one is provided.
    if emulator is not None:
        return emulator.emulatePost(self.step)

    logging.info("Steps.Executors.%s.post called", self.__class__.__name__)
    for step in self.stepSpace.taskSpace.stepSpaces():
        # Skip our own step: its report does not exist yet.
        if step == self.stepName:
            continue

        stepDir = os.path.join(self.stepSpace.taskSpace.location, step)
        logging.info("Beginning report processing for step %s", step)
        pklPath = os.path.join(stepDir, 'Report.pkl')
        if not os.path.isfile(pklPath):
            logging.error("Cannot find report for step %s in space %s", step, stepDir)
            continue

        # Load the pickled report from disk.
        report = Report(step)
        report.unpersist(pklPath)

        # Files from failed steps are not decorated / staged out.
        if not report.stepSuccessful(step):
            continue

        for ref in report.getAllFileRefsFromStep(step=step):
            # Only decorate fully-formed file references.
            if all(hasattr(ref, attr) for attr in ('lfn', 'location', 'guid')):
                ref.user_dn = getattr(self.step, "userDN", None)
                ref.async_dest = getattr(self.step, "asyncDest", None)
                ref.user_vogroup = getattr(self.step, "owner_vogroup", '')
                ref.user_vorole = getattr(self.step, "owner_vorole", '')

        report.persist(pklPath)

    return None
def post(self, emulator=None):
    """
    _post_

    Post execution checkpointing: attach user / async-stage-out metadata
    to every complete file reference in the reports of the other steps,
    then persist each report back to its pickle file.

    Fixes over the original: Python 2 `print` statement converted to a
    py3 `print()` call, `!= None` replaced by `is not None`, lazy
    %-arguments passed to logging, and the `file` builtin is no longer
    shadowed.
    """
    # Another emulator check
    if emulator is not None:
        return emulator.emulatePost(self.step)

    for step in self.stepSpace.taskSpace.stepSpaces():
        if step == self.stepName:
            # Don't try to parse your own report; it's not there yet
            continue
        stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
        logging.info("Beginning report processing for step %s", step)
        reportLocation = os.path.join(stepLocation, 'Report.pkl')
        if not os.path.isfile(reportLocation):
            logging.error("Cannot find report for step %s in space %s", step, stepLocation)
            continue

        # First, get everything from a file and 'unpersist' it
        stepReport = Report(step)
        stepReport.unpersist(reportLocation)

        # Don't stage out files from bad steps.
        if not stepReport.stepSuccessful(step):
            continue

        files = stepReport.getAllFileRefsFromStep(step=step)
        for fileRef in files:
            # Only decorate refs that carry the full file identity.
            if not hasattr(fileRef, 'lfn') or not hasattr(fileRef, 'location') or \
               not hasattr(fileRef, 'guid'):
                continue
            fileRef.user_dn = getattr(self.step, "userDN", None)
            fileRef.async_dest = getattr(self.step, "asyncDest", None)
            fileRef.user_vogroup = getattr(self.step, "owner_vogroup", '')
            fileRef.user_vorole = getattr(self.step, "owner_vorole", '')
        stepReport.persist(reportLocation)

    # Kept on stdout (not logging) to preserve the original's behavior.
    print("Steps.Executors.StageOut.post called")
    return None
def setJobWantsASO(self, filename, preserveLFN=True):
    """
    Load the pickled job report at *filename*, tag every complete file
    reference in the cmsRun1 step with dummy ASO (async stage-out)
    metadata, and write the report back in place.

    :param filename: path of the pickled Report to modify
    :param preserveLFN: stored on each file ref as 'preserve_lfn'
    """
    stepReport = Report('cmsRun1')
    stepReport.unpersist(filename)
    # Renamed loop variable: 'file' shadowed the builtin in the original.
    for fileRef in stepReport.getAllFileRefsFromStep(step='cmsRun1'):
        # Only decorate refs that look like real output files.
        if not hasattr(fileRef, 'lfn') or not hasattr(fileRef, 'location') or \
           not hasattr(fileRef, 'guid'):
            continue
        fileRef.user_dn = "/CN=dummy-name/O=melopartydotcom"
        fileRef.async_dest = "T2_US_Vanderbilt"
        fileRef.user_vogroup = ''
        fileRef.user_vorole = ''
        fileRef.preserve_lfn = preserveLFN
    stepReport.persist(filename)
def execute(self, emulator=None):
    """
    _execute_

    Stage out every file referenced in the reports of the steps that ran
    before this one, optionally redirecting large unmerged files straight
    to merge. Each transfer is guarded by a SIGALRM timeout.

    Fixes over the original: Python 2 `except Exception, ex` and `print`
    statements converted to py3 syntax, `dict.has_key()` (removed in py3)
    replaced by the `in` operator, `!= None` replaced by `is not None`,
    lazy %-arguments passed to logging, and the `file` builtin is no
    longer shadowed.
    """
    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to over an hour
    waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))
    logging.info("StageOut override is: %s ", self.step)

    # Pull out StageOutMgr Overrides
    # switch between old stageOut behavior and new, fancy stage out behavior
    useNewStageOutCode = False
    if 'newStageOut' in overrides and overrides.get('newStageOut'):
        useNewStageOutCode = True

    # An override is only honoured when all four settings are present.
    stageOutCall = {}
    if "command" in overrides and "option" in overrides \
            and "se-name" in overrides and "lfn-prefix" in overrides:
        logging.critical('using override in StageOut')
        stageOutCall['command'] = overrides.get('command')
        stageOutCall['option'] = overrides.get('option')
        stageOutCall['se-name'] = overrides.get('se-name')
        stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

    # iterate over all the incoming files
    if not useNewStageOutCode:
        # old style
        manager = StageOutMgr.StageOutMgr(**stageOutCall)
        manager.numberOfRetries = self.step.retryCount
        manager.retryPauseTime = self.step.retryDelay
    else:
        # new style
        logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
        print("STAGEOUT IS USING NEW STAGEOUT CODE")
        manager = WMCore.Storage.FileManager.StageOutMgr(
            retryPauseTime=self.step.retryDelay,
            numberOfRetries=self.step.retryCount,
            **stageOutCall)

    # We need to find a list of steps in our task
    # And eventually a list of jobReports for out steps
    # Search through steps for report files
    filesTransferred = []
    for step in self.stepSpace.taskSpace.stepSpaces():
        if step == self.stepName:
            # Don't try to parse your own report; it's not there yet
            continue
        stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
        logging.info("Beginning report processing for step %s", step)
        reportLocation = os.path.join(stepLocation, 'Report.pkl')
        if not os.path.isfile(reportLocation):
            logging.error("Cannot find report for step %s in space %s", step, stepLocation)
            continue

        # First, get everything from a file and 'unpersist' it
        stepReport = Report()
        stepReport.unpersist(reportLocation, step)
        taskID = getattr(stepReport.data, 'id', None)

        # Don't stage out files from bad steps.
        if not stepReport.stepSuccessful(step):
            continue

        # Each report should only contain the files of its own step.
        files = stepReport.getAllFileRefsFromStep(step=step)
        for fileRef in files:
            if not hasattr(fileRef, 'lfn') and hasattr(fileRef, 'pfn'):
                # Then we're truly hosed on this file; ignore it
                msg = "Not a file: %s" % fileRef
                logging.error(msg)
                continue

            # Support direct-to-merge: redirect the LFN when the output
            # is unmerged and already big enough (by size or by events).
            if hasattr(self.step.output, 'minMergeSize') \
                    and hasattr(fileRef, 'size') \
                    and not getattr(fileRef, 'merged', False):
                if getattr(self.step.output, 'doNotDirectMerge', False):
                    # Explicitly told not to do direct-to-merge.
                    # NOTE(review): this 'continue' skips the stage-out of
                    # the file entirely, not just the merge redirect —
                    # looks suspicious but is preserved from the original.
                    continue
                if fileRef.size >= self.step.output.minMergeSize:
                    # Then this goes direct to merge
                    try:
                        fileRef = self.handleLFNForMerge(mergefile=fileRef, step=step)
                    except Exception as ex:
                        logging.error("Encountered error while handling LFN for merge due to size.\n")
                        logging.error(str(ex))
                        logging.debug(fileRef)
                        logging.debug("minMergeSize: %s", self.step.output.minMergeSize)
                        manager.cleanSuccessfulStageOuts()
                        stepReport.addError(self.stepName, 60401,
                                            "DirectToMergeFailure", str(ex))
            elif getattr(self.step.output, 'maxMergeEvents', None) is not None \
                    and getattr(fileRef, 'events', None) is not None \
                    and not getattr(fileRef, 'merged', False):
                # Then direct-to-merge due to events if the file is large enough
                if fileRef.events >= self.step.output.maxMergeEvents:
                    # straight to merge
                    try:
                        fileRef = self.handleLFNForMerge(mergefile=fileRef, step=step)
                    except Exception as ex:
                        logging.error("Encountered error while handling LFN for merge due to events.\n")
                        logging.error(str(ex))
                        logging.debug(fileRef)
                        logging.debug("maxMergeEvents: %s", self.step.output.maxMergeEvents)
                        manager.cleanSuccessfulStageOuts()
                        stepReport.addError(self.stepName, 60402,
                                            "DirectToMergeFailure", str(ex))

            # Save the input PFN in case we need it
            # Undecided whether to move fileRef.pfn to the output PFN
            fileRef.InputPFN = fileRef.pfn
            lfn = getattr(fileRef, 'lfn')
            fileSource = getattr(fileRef, 'Source', None)
            if fileSource in ['TFileService', 'UserDefined']:
                userLfnRegEx(lfn)
            else:
                lfnRegEx(lfn)

            fileForTransfer = {'LFN': lfn,
                               'PFN': getattr(fileRef, 'pfn'),
                               'SEName': None,
                               'StageOutCommand': None}
            # Guard the transfer with an alarm so a hung stage-out cannot
            # block the job forever.
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(waitTime)
            try:
                manager(fileForTransfer)
                # Afterwards, the file should have updated info.
                filesTransferred.append(fileForTransfer)
                fileRef.StageOutCommand = fileForTransfer['StageOutCommand']
                fileRef.location = fileForTransfer['SEName']
                fileRef.OutputPFN = fileForTransfer['PFN']
            except Alarm:
                msg = "Indefinite hang during stageOut of logArchive"
                logging.error(msg)
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg)
                stepReport.persist("Report.pkl")
            except Exception as ex:
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex))
                stepReport.setStepStatus(self.stepName, 1)
                stepReport.persist("Report.pkl")
                raise
    # NOTE(review): the captured source ends here, inside the step loop —
    # the success path never clears the alarm or persists the report.
    # Preserved as-is; confirm against the full original file.
def testReportHandling(self):
    """
    _testReportHandling_

    Verify that we're able to parse a CMSSW report, convert it to a
    Report() style report, pickle it and then have the accountant
    process it.
    """
    self.procPath = os.path.join(WMCore.WMBase.getTestBase(),
                                 "WMCore_t/FwkJobReport_t/CMSSWProcessingReport.xml")

    myReport = Report("cmsRun1")
    myReport.parse(self.procPath)

    # Fake some metadata that should be added by the stageout scripts.
    for fileRef in myReport.getAllFileRefsFromStep("cmsRun1"):
        fileRef.size = 1024
        fileRef.location = "cmssrm.fnal.gov"

    # Mark the cmsRun1 step successful and pickle the report where the
    # accountant expects to find it.
    fwjrPath = os.path.join(self.tempDir, "ProcReport.pkl")
    cmsRunStep = myReport.retrieveStep("cmsRun1")
    cmsRunStep.status = 0
    myReport.setTaskName('/TestWF/None')
    myReport.persist(fwjrPath)

    # Register the pickled report path against the processing test job.
    self.setFWJRAction.execute(jobID = self.testJob["id"], fwjrPath = fwjrPath)

    # Pre-create a DBSBuffer file so the accountant has a parent entry.
    pFile = DBSBufferFile(lfn = "/path/to/some/lfn", size = 600000, events = 60000)
    pFile.setAlgorithm(appName = "cmsRun", appVer = "UNKNOWN",
                       appFam = "RECO", psetHash = "GIBBERISH",
                       configContent = "MOREGIBBERISH")
    pFile.setDatasetPath("/bogus/dataset/path")
    #pFile.addRun(Run(1, *[45]))
    pFile.create()

    # Run one pass of the accountant and check the processing job result.
    config = self.createConfig(workerThreads = 1)
    accountant = JobAccountantPoller(config)
    accountant.setup()
    accountant.algorithm()

    self.verifyJobSuccess(self.testJob["id"])
    self.verifyFileMetaData(self.testJob["id"], myReport.getAllFilesFromStep("cmsRun1"))

    # Second half: feed a merge job through the same machinery.
    inputFile = File(lfn = "/store/backfill/2/unmerged/WMAgentCommissioining10/MinimumBias/RECO/rereco_GR09_R_34X_V5_All_v1/0000/outputRECORECO.root")
    inputFile.load()
    self.testMergeJob = Job(name = "testMergeJob", files = [inputFile])
    self.testMergeJob.create(group = self.mergeJobGroup)
    self.testMergeJob["state"] = "complete"
    self.stateChangeAction.execute(jobs = [self.testMergeJob])

    self.mergePath = os.path.join(WMCore.WMBase.getTestBase(),
                                  "WMCore_t/FwkJobReport_t/CMSSWMergeReport.xml")

    myReport = Report("mergeReco")
    myReport.parse(self.mergePath)

    # Fake some metadata that should be added by the stageout scripts.
    for fileRef in myReport.getAllFileRefsFromStep("mergeReco"):
        fileRef.size = 1024
        fileRef.location = "cmssrm.fnal.gov"
        fileRef.dataset = {"applicationName": "cmsRun",
                           "applicationVersion": "CMSSW_3_4_2_patch1",
                           "primaryDataset": "MinimumBias",
                           "processedDataset": "Rereco-v1",
                           "dataTier": "RECO"}

    # Mark the merge step successful and pickle its report as well.
    fwjrPath = os.path.join(self.tempDir, "MergeReport.pkl")
    myReport.setTaskName('/MergeWF/None')
    cmsRunStep = myReport.retrieveStep("mergeReco")
    cmsRunStep.status = 0
    myReport.persist(fwjrPath)

    self.setFWJRAction.execute(jobID = self.testMergeJob["id"], fwjrPath = fwjrPath)
    # Second accountant pass picks up the merge job's report.
    accountant.algorithm()

    self.verifyJobSuccess(self.testMergeJob["id"])
    self.verifyFileMetaData(self.testMergeJob["id"],
                            myReport.getAllFilesFromStep("mergeReco"))
    return
def execute(self, emulator = None):
    """
    _execute_

    Stage out every file referenced in the reports of the steps that ran
    before this one, optionally redirecting unmerged files straight to
    merge when they cross the size or event thresholds. Each transfer is
    guarded by a SIGALRM timeout.
    """
    #Are we using emulators again?
    if (emulator != None):
        return emulator.emulate( self.step, self.job )

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to two hours per retry
    # this alarm leaves a subprocess behing that may cause trouble, see #6273
    waitTime = overrides.get('waitTime', 7200 * self.step.retryCount)

    logging.info("StageOut override is: %s ", self.step)

    # Pull out StageOutMgr Overrides
    # switch between old stageOut behavior and new, fancy stage out behavior
    useNewStageOutCode = False
    if getattr(self.step, 'newStageout', False) or \
        ('newStageOut' in overrides and overrides.get('newStageOut')):
        useNewStageOutCode = True

    # An override is only honoured when all four settings are present.
    stageOutCall = {}
    if "command" in overrides and "option" in overrides \
        and "phedex-node" in overrides \
        and "lfn-prefix" in overrides:
        logging.critical('using override in StageOut')
        stageOutCall['command'] = overrides.get('command')
        stageOutCall['option'] = overrides.get('option')
        stageOutCall['phedex-node']= overrides.get('phedex-node')
        stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

    # naw man, this is real
    # iterate over all the incoming files
    if not useNewStageOutCode:
        # old style
        manager = StageOutMgr(**stageOutCall)
        manager.numberOfRetries = self.step.retryCount
        manager.retryPauseTime = self.step.retryDelay
    else:
        # new style
        logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
        print("STAGEOUT IS USING NEW STAGEOUT CODE")
        manager = FMStageOutMgr(retryPauseTime = self.step.retryDelay,
                                numberOfRetries = self.step.retryCount,
                                **stageOutCall)

    # We need to find a list of steps in our task
    # And eventually a list of jobReports for out steps
    # Search through steps for report files
    filesTransferred = []

    for step in self.stepSpace.taskSpace.stepSpaces():
        if step == self.stepName:
            #Don't try to parse your own report; it's not there yet
            continue
        stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
        logging.info("Beginning report processing for step %s", step)
        reportLocation = os.path.join(stepLocation, 'Report.pkl')
        if not os.path.isfile(reportLocation):
            logging.error("Cannot find report for step %s in space %s", step, stepLocation)
            continue

        # First, get everything from a file and 'unpersist' it
        stepReport = Report()
        stepReport.unpersist(reportLocation, step)

        # Don't stage out files from bad steps.
        if not stepReport.stepSuccessful(step):
            continue

        # Okay, time to start using stuff
        # Now I'm a bit confused about this; each report should ONLY
        # Have the results of that particular step in it,
        # So getting all the files should get ONLY the files
        # for that step; or so I hope
        files = stepReport.getAllFileRefsFromStep(step = step)
        for fileName in files:
            # make sure the file information is consistent
            # (a pfn without an lfn/module_label is not stage-out-able)
            if hasattr(fileName, 'pfn') and ( not hasattr(fileName, 'lfn') or not hasattr(fileName, 'module_label') ):
                msg = "Not a valid file: %s" % fileName
                logging.error(msg)
                continue

            # Figuring out if we should do straight to merge
            # - should we do straight to merge at all ?
            # - is straight to merge disabled for this output ?
            # - are we over the size threshold ?
            # - are we over the event threshold ?
            straightToMerge = False
            if not getattr(fileName, 'merged', False) and hasattr(self.step.output, 'minMergeSize'):
                if fileName.module_label not in getattr(self.step.output, 'forceUnmergedOutputs', []):
                    if getattr(fileName, 'size', 0) >= self.step.output.minMergeSize:
                        straightToMerge = True
                    if getattr(fileName, 'events', 0) >= getattr(self.step.output, 'maxMergeEvents', sys.maxsize):
                        straightToMerge = True

            if straightToMerge:
                try:
                    fileName = self.handleLFNForMerge(mergefile = fileName, step = step)
                except Exception as ex:
                    # A failed redirect is recorded but the file is still
                    # staged out below under its original LFN.
                    logging.info("minMergeSize: %s", getattr(self.step.output, 'minMergeSize', None))
                    logging.info("maxMergeEvents: %s", getattr(self.step.output, 'maxMergeEvents', None))
                    logging.error("Encountered error while handling LFN for merge %s", fileName)
                    logging.error(str(ex))
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60401, "DirectToMergeFailure", str(ex))

            # Save the input PFN in case we need it
            # Undecided whether to move fileName.pfn to the output PFN
            fileName.InputPFN = fileName.pfn
            lfn = getattr(fileName, 'lfn')
            fileSource = getattr(fileName, 'Source', None)
            # Validate the LFN; user outputs follow a different convention.
            if fileSource in ['TFileService', 'UserDefined']:
                userLfnRegEx(lfn)
            else:
                lfnRegEx(lfn)

            fileForTransfer = {'LFN': lfn,
                               'PFN': getattr(fileName, 'pfn'),
                               'PNN' : None,
                               'StageOutCommand': None,
                               'Checksums' : getattr(fileName, 'checksums', None)}

            # Guard the transfer with an alarm so a hung stage-out cannot
            # block the job forever.
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(waitTime)
            try:
                manager(fileForTransfer)
                #Afterwards, the file should have updated info.
                filesTransferred.append(fileForTransfer)
                fileName.StageOutCommand = fileForTransfer['StageOutCommand']
                fileName.location = fileForTransfer['PNN']
                fileName.OutputPFN = fileForTransfer['PFN']
            except Alarm:
                msg = "Indefinite hang during stageOut of logArchive"
                logging.error(msg)
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg)
                stepReport.setStepStatus(self.stepName, 1)
                # well, if it fails for one file, it fails for the whole job...
                break
            except Exception as ex:
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex))
                stepReport.setStepStatus(self.stepName, 1)
                stepReport.persist(reportLocation)
                raise
            signal.alarm(0)

        # Am DONE with report. Persist it
        stepReport.persist(reportLocation)

    #Done with all steps, and should have a list of
    #stagedOut files in fileForTransfer
    logging.info("Transferred %i files", len(filesTransferred))

    return
def execute(self, emulator=None):
    """
    _execute_

    Stage out every file referenced in the reports of the steps that ran
    before this one, optionally redirecting unmerged files straight to
    merge when they cross the size or event thresholds. Each transfer is
    guarded by a SIGALRM timeout.
    """
    #Are we using emulators again?
    if (emulator != None):
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to two hours per retry
    # this alarm leaves a subprocess behing that may cause trouble, see #6273
    waitTime = overrides.get('waitTime', 7200 * self.step.retryCount)

    logging.info("StageOut override is: %s ", self.step)

    # Pull out StageOutMgr Overrides
    # switch between old stageOut behavior and new, fancy stage out behavior
    useNewStageOutCode = False
    if getattr(self.step, 'newStageout', False) or \
        ('newStageOut' in overrides and overrides.get('newStageOut')):
        useNewStageOutCode = True

    # An override is only honoured when all four settings are present.
    stageOutCall = {}
    if "command" in overrides and "option" in overrides \
        and "phedex-node" in overrides \
        and "lfn-prefix" in overrides:
        logging.critical('using override in StageOut')
        stageOutCall['command'] = overrides.get('command')
        stageOutCall['option'] = overrides.get('option')
        stageOutCall['phedex-node'] = overrides.get('phedex-node')
        stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

    # naw man, this is real
    # iterate over all the incoming files
    if not useNewStageOutCode:
        # old style
        manager = StageOutMgr(**stageOutCall)
        manager.numberOfRetries = self.step.retryCount
        manager.retryPauseTime = self.step.retryDelay
    else:
        # new style
        logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
        print("STAGEOUT IS USING NEW STAGEOUT CODE")
        manager = FMStageOutMgr(retryPauseTime=self.step.retryDelay,
                                numberOfRetries=self.step.retryCount,
                                **stageOutCall)

    # We need to find a list of steps in our task
    # And eventually a list of jobReports for out steps
    # Search through steps for report files
    filesTransferred = []

    for step in self.stepSpace.taskSpace.stepSpaces():
        if step == self.stepName:
            #Don't try to parse your own report; it's not there yet
            continue
        stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
        logging.info("Beginning report processing for step %s", step)
        reportLocation = os.path.join(stepLocation, 'Report.pkl')
        if not os.path.isfile(reportLocation):
            logging.error("Cannot find report for step %s in space %s", step, stepLocation)
            continue

        # First, get everything from a file and 'unpersist' it
        stepReport = Report()
        stepReport.unpersist(reportLocation, step)

        # Don't stage out files from bad steps.
        if not stepReport.stepSuccessful(step):
            continue

        # Okay, time to start using stuff
        # Now I'm a bit confused about this; each report should ONLY
        # Have the results of that particular step in it,
        # So getting all the files should get ONLY the files
        # for that step; or so I hope
        files = stepReport.getAllFileRefsFromStep(step=step)
        for fileName in files:
            # make sure the file information is consistent
            # (a pfn without an lfn/module_label is not stage-out-able)
            if hasattr(fileName, 'pfn') and (not hasattr(fileName, 'lfn') or not hasattr(fileName, 'module_label')):
                msg = "Not a valid file: %s" % fileName
                logging.error(msg)
                continue

            # Figuring out if we should do straight to merge
            # - should we do straight to merge at all ?
            # - is straight to merge disabled for this output ?
            # - are we over the size threshold ?
            # - are we over the event threshold ?
            straightToMerge = False
            if not getattr(fileName, 'merged', False) and hasattr(self.step.output, 'minMergeSize'):
                if fileName.module_label not in getattr(self.step.output, 'forceUnmergedOutputs', []):
                    if getattr(fileName, 'size', 0) >= self.step.output.minMergeSize:
                        straightToMerge = True
                    if getattr(fileName, 'events', 0) >= getattr(self.step.output, 'maxMergeEvents', sys.maxsize):
                        straightToMerge = True

            if straightToMerge:
                try:
                    fileName = self.handleLFNForMerge(mergefile=fileName, step=step)
                except Exception as ex:
                    # A failed redirect is recorded but the file is still
                    # staged out below under its original LFN.
                    logging.info("minMergeSize: %s", getattr(self.step.output, 'minMergeSize', None))
                    logging.info("maxMergeEvents: %s", getattr(self.step.output, 'maxMergeEvents', None))
                    logging.error("Encountered error while handling LFN for merge %s", fileName)
                    logging.error(str(ex))
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60401, "DirectToMergeFailure", str(ex))

            # Save the input PFN in case we need it
            # Undecided whether to move fileName.pfn to the output PFN
            fileName.InputPFN = fileName.pfn
            lfn = getattr(fileName, 'lfn')
            fileSource = getattr(fileName, 'Source', None)
            # Validate the LFN; user outputs follow a different convention.
            if fileSource in ['TFileService', 'UserDefined']:
                userLfnRegEx(lfn)
            else:
                lfnRegEx(lfn)

            fileForTransfer = {
                'LFN': lfn,
                'PFN': getattr(fileName, 'pfn'),
                'PNN': None,
                'StageOutCommand': None,
                'Checksums': getattr(fileName, 'checksums', None)
            }

            # Guard the transfer with an alarm so a hung stage-out cannot
            # block the job forever.
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(waitTime)
            try:
                manager(fileForTransfer)
                #Afterwards, the file should have updated info.
                filesTransferred.append(fileForTransfer)
                fileName.StageOutCommand = fileForTransfer['StageOutCommand']
                fileName.location = fileForTransfer['PNN']
                fileName.OutputPFN = fileForTransfer['PFN']
            except Alarm:
                msg = "Indefinite hang during stageOut of logArchive"
                logging.error(msg)
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg)
                stepReport.setStepStatus(self.stepName, 1)
                # well, if it fails for one file, it fails for the whole job...
                break
            except Exception as ex:
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex))
                stepReport.setStepStatus(self.stepName, 1)
                stepReport.persist(reportLocation)
                raise
            signal.alarm(0)

        # Am DONE with report. Persist it
        stepReport.persist(reportLocation)

    #Done with all steps, and should have a list of
    #stagedOut files in fileForTransfer
    logging.info("Transferred %i files", len(filesTransferred))

    return
def execute(self, emulator=None):
    """
    _execute_

    Stage out every file referenced in the reports of the steps that ran
    before this one, optionally redirecting large unmerged files straight
    to merge. Each transfer is guarded by a SIGALRM timeout.

    Fixes over the original: Python 2 `print` statement converted to a
    py3 `print()` call, `!= None` replaced by `is not None`, lazy
    %-arguments passed to logging, and the `file` builtin is no longer
    shadowed.
    """
    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to over an hour
    waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))
    logging.info("StageOut override is: %s ", self.step)

    # Pull out StageOutMgr Overrides
    # switch between old stageOut behavior and new, fancy stage out behavior
    useNewStageOutCode = False
    if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
        useNewStageOutCode = True

    # An override is only honoured when all five settings are present.
    stageOutCall = {}
    if "command" in overrides and "option" in overrides \
            and "se-name" in overrides and "phedex-node" in overrides \
            and "lfn-prefix" in overrides:
        logging.critical('using override in StageOut')
        stageOutCall['command'] = overrides.get('command')
        stageOutCall['option'] = overrides.get('option')
        stageOutCall['se-name'] = overrides.get('se-name')
        stageOutCall['phedex-node'] = overrides.get('phedex-node')
        stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

    # iterate over all the incoming files
    if not useNewStageOutCode:
        # old style
        manager = StageOutMgr.StageOutMgr(**stageOutCall)
        manager.numberOfRetries = self.step.retryCount
        manager.retryPauseTime = self.step.retryDelay
    else:
        # new style
        logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
        print("STAGEOUT IS USING NEW STAGEOUT CODE")
        manager = WMCore.Storage.FileManager.StageOutMgr(
            retryPauseTime=self.step.retryDelay,
            numberOfRetries=self.step.retryCount,
            **stageOutCall)

    # We need to find a list of steps in our task
    # And eventually a list of jobReports for out steps
    # Search through steps for report files
    filesTransferred = []
    for step in self.stepSpace.taskSpace.stepSpaces():
        if step == self.stepName:
            # Don't try to parse your own report; it's not there yet
            continue
        stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
        logging.info("Beginning report processing for step %s", step)
        reportLocation = os.path.join(stepLocation, 'Report.pkl')
        if not os.path.isfile(reportLocation):
            logging.error("Cannot find report for step %s in space %s", step, stepLocation)
            continue

        # First, get everything from a file and 'unpersist' it
        stepReport = Report()
        stepReport.unpersist(reportLocation, step)
        taskID = getattr(stepReport.data, 'id', None)

        # Don't stage out files from bad steps.
        if not stepReport.stepSuccessful(step):
            continue

        # Each report should only contain the files of its own step.
        files = stepReport.getAllFileRefsFromStep(step=step)
        for fileRef in files:
            if not hasattr(fileRef, 'lfn') and hasattr(fileRef, 'pfn'):
                # Then we're truly hosed on this file; ignore it
                msg = "Not a file: %s" % fileRef
                logging.error(msg)
                continue

            # Support direct-to-merge: redirect the LFN when the output
            # is unmerged and already big enough (by size or by events).
            if hasattr(self.step.output, 'minMergeSize') \
                    and hasattr(fileRef, 'size') \
                    and not getattr(fileRef, 'merged', False):
                if getattr(self.step.output, 'doNotDirectMerge', False):
                    # Explicitly told not to do direct-to-merge.
                    # NOTE(review): this 'continue' skips the stage-out of
                    # the file entirely, not just the merge redirect —
                    # looks suspicious but is preserved from the original.
                    continue
                if fileRef.size >= self.step.output.minMergeSize:
                    # Then this goes direct to merge
                    try:
                        fileRef = self.handleLFNForMerge(mergefile=fileRef, step=step)
                    except Exception as ex:
                        logging.error("Encountered error while handling LFN for merge due to size.\n")
                        logging.error(str(ex))
                        logging.debug(fileRef)
                        logging.debug("minMergeSize: %s", self.step.output.minMergeSize)
                        manager.cleanSuccessfulStageOuts()
                        stepReport.addError(self.stepName, 60401,
                                            "DirectToMergeFailure", str(ex))
            elif getattr(self.step.output, 'maxMergeEvents', None) is not None \
                    and getattr(fileRef, 'events', None) is not None \
                    and not getattr(fileRef, 'merged', False):
                # Then direct-to-merge due to events if the file is large enough
                if fileRef.events >= self.step.output.maxMergeEvents:
                    # straight to merge
                    try:
                        fileRef = self.handleLFNForMerge(mergefile=fileRef, step=step)
                    except Exception as ex:
                        logging.error("Encountered error while handling LFN for merge due to events.\n")
                        logging.error(str(ex))
                        logging.debug(fileRef)
                        logging.debug("maxMergeEvents: %s", self.step.output.maxMergeEvents)
                        manager.cleanSuccessfulStageOuts()
                        stepReport.addError(self.stepName, 60402,
                                            "DirectToMergeFailure", str(ex))

            # Save the input PFN in case we need it
            # Undecided whether to move fileRef.pfn to the output PFN
            fileRef.InputPFN = fileRef.pfn
            lfn = getattr(fileRef, 'lfn')
            fileSource = getattr(fileRef, 'Source', None)
            # Validate the LFN; user outputs follow a different convention.
            if fileSource in ['TFileService', 'UserDefined']:
                userLfnRegEx(lfn)
            else:
                lfnRegEx(lfn)

            fileForTransfer = {'LFN': lfn,
                               'PFN': getattr(fileRef, 'pfn'),
                               'SEName': None,
                               'PNN': None,
                               'StageOutCommand': None,
                               'Checksums': getattr(fileRef, 'checksums', None)}
            # Guard the transfer with an alarm so a hung stage-out cannot
            # block the job forever.
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(waitTime)
            try:
                manager(fileForTransfer)
                # Afterwards, the file should have updated info.
                filesTransferred.append(fileForTransfer)
                fileRef.StageOutCommand = fileForTransfer['StageOutCommand']
                # file.location = fileForTransfer['SEName']
                fileRef.location = fileForTransfer['PNN']
                fileRef.OutputPFN = fileForTransfer['PFN']
            except Alarm:
                msg = "Indefinite hang during stageOut of logArchive"
                logging.error(msg)
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg)
                # NOTE(review): the handlers persist to 'Report.pkl' in the
                # cwd while the success path persists to reportLocation —
                # preserved from the original; confirm which is intended.
                stepReport.persist("Report.pkl")
            except Exception as ex:
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex))
                stepReport.setStepStatus(self.stepName, 1)
                stepReport.persist("Report.pkl")
                raise
            signal.alarm(0)

        # Am DONE with report
        # Persist it
        stepReport.persist(reportLocation)

    # Done with all steps, and should have a list of
    # stagedOut files in fileForTransfer
    logging.info("Transferred %i files", len(filesTransferred))

    return