def cleanSuccessfulStageOuts(self): """ _cleanSucessfulStageOuts_ In the event of a failed stage out, this method can be called to cleanup the files that may have previously been staged out so that the job ends in a clear state of failure, rather than a partial success """ for lfn, fileInfo in self.completedFiles.items(): pfn = fileInfo['PFN'] command = fileInfo['StageOutCommand'] msg = "Cleaning out file: %s\n" % lfn msg += "Removing PFN: %s" % pfn msg += "Using command implementation: %s\n" % command print msg delManager = DeleteMgr(**self.overrideConf) try: delManager.deletePFN(pfn, lfn, command) except StageOutFailure, ex: msg = "Failed to cleanup staged out file after error:" msg += " %s\n%s" % (lfn, str(ex)) print msg
def cleanSuccessfulStageOuts(self): """ _cleanSucessfulStageOuts_ In the event of a failed stage out, this method can be called to cleanup the files that may have previously been staged out so that the job ends in a clear state of failure, rather than a partial success """ for lfn, fileInfo in self.completedFiles.items(): pfn = fileInfo['PFN'] command = fileInfo['StageOutCommand'] msg = "Cleaning out file: %s\n" % lfn msg += "Removing PFN: %s" % pfn msg += "Using command implementation: %s\n" % command print msg delManager = DeleteMgr(**self.overrideConf) try: delManager.deletePFN(pfn, lfn, command) except StageOutFailure as ex: msg = "Failed to cleanup staged out file after error:" msg += " %s\n%s" % (lfn, str(ex)) print msg
def execute(self, emulator=None): """ _execute_ """ scramCommand = self.step.application.setup.scramCommand scramArch = self.step.application.setup.scramArch cmsswVersion = self.step.application.setup.cmsswVersion # Are we using emulators again? if (emulator != None): return emulator.emulate(self.step, self.job) overrides = {} if hasattr(self.step, 'override'): overrides = self.step.override.dictionary_() # Set wait to over an hour waitTime = overrides.get( 'waitTime', 3600 + (self.step.retryDelay * self.step.retryCount)) # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters castorStageOutParams = {} castorStageOutParams['command'] = overrides.get('command', "xrdcp") castorStageOutParams['option'] = overrides.get('option', "--cerncastor") castorStageOutParams['phedex-node'] = overrides.get( 'phedex-node', "T2_CH_CERN") castorStageOutParams['lfn-prefix'] = overrides.get( 'lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms") # hardcode CERN EOS T2_CH_CERN stageout parameters eosStageOutParams = {} eosStageOutParams['command'] = overrides.get('command', "xrdcp") eosStageOutParams['option'] = overrides.get('option', "") eosStageOutParams['phedex-node'] = overrides.get( 'phedex-node', "T2_CH_CERN") eosStageOutParams['lfn-prefix'] = overrides.get( 'lfn-prefix', "root://eoscms.cern.ch//eos/cms") # are we using the new stageout method ? useNewStageOutCode = False if getattr(self.step, 'newStageout', False) or \ ('newStageOut' in overrides and overrides.get('newStageOut')): useNewStageOutCode = True try: castorStageOutMgr = StageOutMgr(**castorStageOutParams) eosStageOutMgr = StageOutMgr(**eosStageOutParams) stageInMgr = StageInMgr() deleteMgr = DeleteMgr() except Exception as ex: msg = "Unable to load StageOut/Delete Impl: %s" % str(ex) logging.error(msg) raise WMExecutionFailure(60312, "MgrImplementationError", msg) # prepare output tar file taskName = self.report.getTaskName().split('/')[-1] host = socket.gethostname().split('.')[0] tarName = '%s-%s-%s-%i-logs.tar' % ( self.report.data.workload, taskName, host, self.job["counter"]) tarLocation = os.path.join(self.stepSpace.location, tarName) # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X) result = re.match("CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion) useEdmCopyUtil = False if result: try: if int(result.group(1)) >= 8: useEdmCopyUtil = True except ValueError: pass # setup Scram needed to run edmCopyUtil if useEdmCopyUtil: scram = Scram( command=scramCommand, version=cmsswVersion, initialise=self.step.application.setup.softwareEnvironment, directory=self.step.builder.workingDir, architecture=scramArch, ) logging.info("Running scram") try: projectOutcome = scram.project() except Exception as ex: msg = "Exception raised while running scram.\n" msg += str(ex) logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) if projectOutcome > 0: msg = scram.diagnostic() logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) runtimeOutcome = scram.runtime() if runtimeOutcome > 0: msg = scram.diagnostic() logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) # iterate through input files localLogs = [] deleteLogArchives = [] if useEdmCopyUtil: numberOfFilesPerCopy = 10 else: numberOfFilesPerCopy = 1 for logs in grouper(self.job["input_files"], numberOfFilesPerCopy): copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get( 'X509_USER_PROXY', None) for log in logs: copyCommand += " %s" % log['lfn'] copyCommand += " %s" % self.step.builder.workingDir # give up after timeout of 1 minute per input file signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(60 * numberOfFilesPerCopy) filesCopied = False try: if useEdmCopyUtil: logging.info("Running edmCopyUtil") retval = scram(copyCommand) if retval == 0: filesCopied = True else: logging.info("Running stageIn") for log in logs: fileInfo = {"LFN": log['lfn']} logArchive = stageInMgr(**fileInfo) if logArchive: filesCopied = True except Alarm: logging.error( "Indefinite hang during edmCopyUtil/stageIn of logArchives" ) except StageOutFailure: logging.error("Unable to stageIn logArchives") except Exception: raise signal.alarm(0) if filesCopied: for log in logs: localLogs.append( os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn']))) deleteLogArchives.append(log) self.report.addInputFile(sourceName="logArchives", lfn=log['lfn']) else: logging.error("Unable to copy logArchives to local disk") if useEdmCopyUtil: with open('scramOutput.log', 'r') as f: logging.error("Scram output: %s", f.read()) for log in logs: self.report.addSkippedFile(log['lfn'], None) # create tarfile if any logArchive copied in if localLogs: tarFile = tarfile.open(tarLocation, 'w:') for log in localLogs: path = log.split('/') tarFile.add(name=log, arcname=os.path.join(path[-3], path[-2], path[-1])) os.remove(log) tarFile.close() else: msg = "Unable to copy any logArchives to local disk" logging.error(msg) raise WMExecutionFailure(60312, "LogCollectError", msg) # now staging out the LogCollect tarfile logging.info("Staging out LogCollect tarfile to Castor and EOS") now = datetime.datetime.now() lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % ( now.year, now.month, "WMAgent", self.report.data.workload, os.path.basename(tarLocation)) tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None} # perform mandatory stage out to CERN Castor signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(waitTime) try: castorStageOutMgr(tarInfo) except Alarm: msg = "Indefinite hang during stageOut of LogCollect to Castor" logging.error(msg) raise WMExecutionFailure(60409, "LogCollectTimeout", msg) except Exception as ex: msg = "Unable to stageOut LogCollect to Castor:\n" msg += str(ex) logging.error(msg) raise WMExecutionFailure(60408, "LogCollectStageOutError", msg) signal.alarm(0) # add to job report self.report.addOutputFile(outputModule="LogCollect", file=tarInfo) outputRef = getattr(self.report.data, self.stepName) outputRef.output.pfn = tarInfo['PFN'] outputRef.output.location = tarInfo['PNN'] outputRef.output.lfn = tarInfo['LFN'] tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None} # then, perform best effort stage out to CERN EOS signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(waitTime) try: eosStageOutMgr(tarInfo) except Alarm: logging.error( "Indefinite hang during stageOut of LogCollect to EOS") except Exception as ex: logging.error("Unable to stageOut LogCollect to EOS:\n", ex) signal.alarm(0) # we got this far, delete input for log in deleteLogArchives: # give up after timeout of 1 minutes signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(60) try: fileToDelete = { 'LFN': log['lfn'], 'PFN': None, 'PNN': None, 'StageOutCommand': None } deleteMgr(fileToDelete=fileToDelete) except Alarm: logging.error("Indefinite hang during delete of logArchive") except Exception as ex: logging.error("Unable to delete logArchive: %s", ex) signal.alarm(0) return
def execute(self, emulator=None): """ _execute_ """ # Are we using emulators again? if emulator is not None: return emulator.emulate(self.step, self.job) logging.info("Steps.Executors.%s.execute called", self.__class__.__name__) scramCommand = self.step.application.setup.scramCommand cmsswVersion = self.step.application.setup.cmsswVersion scramArch = getSingleScramArch(self.step.application.setup.scramArch) overrideCatalog = getattr(self.step.application, 'overrideCatalog', None) overrides = {} if hasattr(self.step, 'override'): overrides = self.step.override.dictionary_() # Set wait to over an hour waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount)) # hardcode CERN EOS T2_CH_CERN stageout parameters eosStageOutParams = {} eosStageOutParams['command'] = overrides.get('command', "xrdcp") eosStageOutParams['option'] = overrides.get('option', "--wma-disablewriterecovery") eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN") eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms") try: eosStageOutMgr = StageOutMgr(**eosStageOutParams) stageInMgr = StageInMgr() deleteMgr = DeleteMgr() except Exception as ex: msg = "Unable to load StageOut/Delete Impl: %s" % str(ex) logging.error(msg) raise WMExecutionFailure(60312, "MgrImplementationError", msg) # prepare output tar file taskName = self.report.getTaskName().split('/')[-1] host = socket.gethostname().split('.')[0] tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"]) tarLocation = os.path.join(self.stepSpace.location, tarName) # Supported by any release beyond CMSSW_8_X, however DaviX is broken until CMSSW_10_4_X # see: https://github.com/cms-sw/cmssw/issues/25292 useEdmCopyUtil = True if isCMSSWSupported(cmsswVersion, "CMSSW_10_4_0"): pass elif scramArch.startswith('slc7_amd64_'): msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead" logging.warning(msg) cmsswVersion = "CMSSW_10_4_0" scramArch = "slc7_amd64_gcc820" elif scramArch.startswith('slc6_amd64_'): msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead" logging.warning(msg) cmsswVersion = "CMSSW_10_4_0" scramArch = "slc6_amd64_gcc700" else: useEdmCopyUtil = False logging.info("Using edmCopyUtil: %s", useEdmCopyUtil) # setup Scram needed to run edmCopyUtil if useEdmCopyUtil: scram = Scram( command=scramCommand, version=cmsswVersion, initialise=self.step.application.setup.softwareEnvironment, directory=self.step.builder.workingDir, architecture=scramArch, ) logging.info("Running scram") try: projectOutcome = scram.project() except Exception as ex: msg = "Exception raised while running scram.\n" msg += str(ex) logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) if projectOutcome > 0: msg = scram.diagnostic() logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) runtimeOutcome = scram.runtime() if runtimeOutcome > 0: msg = scram.diagnostic() logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) # iterate through input files localLogs = [] deleteLogArchives = [] if useEdmCopyUtil: numberOfFilesPerCopy = 10 else: numberOfFilesPerCopy = 1 for logs in grouper(self.job["input_files"], numberOfFilesPerCopy): copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None) # specify TFC if necessary if overrideCatalog: copyCommand += " -c %s" % overrideCatalog for log in logs: copyCommand += " %s" % log['lfn'] copyCommand += " %s" % self.step.builder.workingDir # give up after timeout of 1 minute per input file signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(60 * numberOfFilesPerCopy) filesCopied = False try: if useEdmCopyUtil: logging.info("Running edmCopyUtil") retval = scram(copyCommand) if retval == 0: filesCopied = True else: logging.info("Running stageIn") for log in logs: fileInfo = {"LFN": log['lfn']} logArchive = stageInMgr(**fileInfo) if logArchive: filesCopied = True except Alarm: logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives") except StageOutFailure: logging.error("Unable to stageIn logArchives") except Exception: raise signal.alarm(0) if filesCopied: for log in logs: localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn']))) deleteLogArchives.append(log) self.report.addInputFile(sourceName="logArchives", lfn=log['lfn']) else: logging.error("Unable to copy logArchives to local disk") for log in logs: self.report.addSkippedFile(log['lfn'], None) # create tarfile if any logArchive copied in if localLogs: with tarfile.open(tarLocation, 'w:') as tarFile: for log in localLogs: path = log.split('/') tarFile.add(name=log, arcname=os.path.join(path[-3], path[-2], path[-1])) os.remove(log) else: msg = "Unable to copy any logArchives to local disk" logging.error(msg) raise WMExecutionFailure(60312, "LogCollectError", msg) # now staging out the LogCollect tarfile logging.info("Staging out LogCollect tarfile to EOS (skipping CASTOR)") now = datetime.datetime.now() lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent", self.report.data.workload, os.path.basename(tarLocation)) tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None} # perform mandatory stage out to CERN EOS signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(waitTime) try: eosStageOutMgr(tarInfo) except Alarm: msg = "Indefinite hang during stageOut of LogCollect to EOS" logging.error(msg) raise WMExecutionFailure(60409, "LogCollectTimeout", msg) except Exception as ex: msg = "Unable to stageOut LogCollect to Castor:\n" msg += str(ex) logging.error(msg) raise WMExecutionFailure(60408, "LogCollectStageOutError", msg) signal.alarm(0) # add to job report self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo) outputRef = getattr(self.report.data, self.stepName) outputRef.output.pfn = tarInfo['PFN'] outputRef.output.location = tarInfo['PNN'] outputRef.output.lfn = tarInfo['LFN'] # we got this far, delete ALL input files assigned to this job for log in self.job["input_files"]: # give up after timeout of 1 minutes signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(60) try: fileToDelete = {'LFN': log['lfn'], 'PFN': None, 'PNN': None, 'StageOutCommand': None} deleteMgr(fileToDelete=fileToDelete) except Alarm: logging.error("Indefinite hang during delete of logArchive") except Exception as ex: logging.error("Unable to delete logArchive: %s", ex) signal.alarm(0) return
def execute(self, emulator=None): """ _execute_ """ # Are we using emulators again? if emulator is not None: return emulator.emulate(self.step, self.job) self.logger.info("Steps.Executors.%s.execute called", self.__class__.__name__) self.logger.info("Step set to numberOfRetries: %s, retryDelay: %s", self.step.retryCount, self.step.retryDelay) # Look! I can steal from StageOut # DeleteMgr uses the same manager structure as StageOutMgr overrides = {} if hasattr(self.step, 'override'): overrides = self.step.override.dictionary_() # Wait up to 5min for a single file deletion overrides.setdefault('waitTime', 300) self.logger.info("Step with the following overrides:") for keyName, value in overrides.items(): self.logger.info(" %s : %s", keyName, value) # Pull out StageOutMgr Overrides # switch between old stageOut behavior and new, fancy stage out behavior useNewStageOutCode = False if 'newStageOut' in overrides and overrides.get('newStageOut'): useNewStageOutCode = True stageOutCall = {} stageOutCall['logger'] = self.logger if "command" in overrides and "option" in overrides \ and "phedex-node" in overrides \ and "lfn-prefix" in overrides: stageOutCall['command'] = overrides.get('command') stageOutCall['option'] = overrides.get('option') stageOutCall['phedex-node'] = overrides.get('phedex-node') stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix') # naw man, this is real # iterate over all the incoming files if not useNewStageOutCode: # old style manager = DeleteMgr(**stageOutCall) manager.numberOfRetries = self.step.retryCount manager.retryPauseTime = self.step.retryDelay else: # new style self.logger.critical("DeleteFiles IS USING NEW STAGEOUT CODE") manager = NewDeleteMgr(retryPauseTime=self.step.retryDelay, numberOfRetries=self.step.retryCount, **stageOutCall) # This is where the deleted files go filesDeleted = [] for fileDict in self.job['input_files']: self.logger.debug("Deleting LFN: %s", fileDict.get('lfn')) fileForTransfer = { 'LFN': fileDict.get('lfn'), 'PFN': None, # PFNs are assigned in the Delete Manager 'PNN': None, # PNN is assigned in the delete manager 'StageOutCommand': None } if self.deleteOneFile(fileForTransfer, manager, overrides['waitTime']): filesDeleted.append(fileForTransfer) # Alan: I do not get why we would have two sets of files to be deleted! if hasattr(self.step, 'filesToDelete'): # files from the configTree to be deleted for k, v in self.step.filesToDelete.dictionary_().iteritems(): if k.startswith('file'): self.logger.info("Deleting LFN: %s", v) fileForTransfer = { 'LFN': v, 'PFN': None, 'PNN': None, 'StageOutCommand': None } if self.deleteOneFile(fileForTransfer, manager, overrides['waitTime']): filesDeleted.append(fileForTransfer) if not filesDeleted: raise DeleteMgrError(WM_JOB_ERROR_CODES[60313]) # Now we've got to put things in the report for fileDict in filesDeleted: self.report.addRemovedCleanupFile(**fileDict)