def __rescheduleFailedJob(self, jobID, message, stop=True):
    """Flag the job as "Rescheduled" and ask the Job Manager to reschedule it.

    :param jobID: identifier of the job; converted with ``int()`` for the
                  JobReport and passed as given to ``rescheduleJob``
    :param message: failure cause; logged and stored as the application
                    status that accompanies the "Rescheduled" job status
    :param stop: forwarded unchanged as second argument of ``self.__finish``
    :return: the value returned by ``self.__finish``
    """
    self.log.warn('Failure during %s' % (message))

    jobManager = RPCClient('WorkloadManagement/JobManager')
    reporter = JobReport(int(jobID), 'JobAgent@%s' % self.siteName)

    # Setting a job parameter does not help since the job will be
    # rescheduled; the cause is therefore carried by the status update
    # itself, which is sent immediately (sendFlag=True).
    reporter.setJobStatus(status='Rescheduled', application=message, sendFlag=True)

    self.log.info('Job will be rescheduled')
    outcome = jobManager.rescheduleJob(jobID)

    if outcome['OK']:
        self.log.info('Job Rescheduled %s' % (jobID))
        finishMessage = 'Job Rescheduled'
    else:
        self.log.error(outcome['Message'])
        finishMessage = 'Problem Rescheduling Job'

    return self.__finish(finishMessage, stop)
def rescheduleFailedJob(jobID, message):
    """Report a failed job as "Rescheduled", reschedule it and mail an alarm.

    The function never raises: any exception is logged through
    ``gLogger.exception`` and swallowed, because it is called from failure
    paths where nothing more can be done.

    :param jobID: identifier of the job (converted with ``int()`` where an
                  integer is needed)
    :param message: description of the failure; used in the log, in the job
                    statuses and in the body of the alarm mail
    :return: always ``None``
    """
    # Module-level JobReport instance, created lazily below when it is unset.
    global jobReport

    try:
        import DIRAC

        gLogger.warn('Failure during %s' % (message))

        # Setting a job parameter does not help since the job will be
        # rescheduled, instead set the status with the cause and then another
        # status showing the reschedule operation.
        if not jobReport:
            gLogger.info('Creating a new JobReport Object')
            jobReport = JobReport(int(jobID), 'JobWrapperTemplate')

        jobReport.setApplicationStatus('Failed %s ' % message, sendFlag=False)
        jobReport.setJobStatus('Rescheduled', message, sendFlag=False)

        # We must send Job States and Parameters before it gets rescheduled
        jobReport.sendStoredStatusInfo()
        jobReport.sendStoredJobParameters()

        gLogger.info('Job will be rescheduled after exception during execution of the JobWrapper')

        jobManager = RPCClient('WorkloadManagement/JobManager')
        result = jobManager.rescheduleJob(int(jobID))
        if not result['OK']:
            # Best effort: a failed reschedule is only logged, the alarm mail
            # below is still sent.
            gLogger.warn(result)

        # Send mail to debug errors
        mailAddress = DIRAC.alarmMail
        site = DIRAC.siteName()
        subject = 'Job rescheduled at %s' % site

        # The worker node name is only informative: a failing "hostname" call
        # must not prevent the alarm mail from being sent.
        ret = systemCall(0, 'hostname')
        if ret['OK']:
            wn = ret['Value'][1]
        else:
            gLogger.warn('Could not determine the worker node hostname')
            wn = 'Unknown'

        msg = 'Job %s rescheduled at %s, wn=%s\n' % (jobID, site, wn)
        msg += message

        NotificationClient().sendMail(mailAddress, subject, msg,
                                      fromAddress="*****@*****.**",
                                      localAttempt=False)
    except Exception:  # deliberately broad: this is a last-resort handler
        gLogger.exception('JobWrapperTemplate failed to reschedule Job')
    return
def execute(self):
    """The JobAgent execution method: run one agent cycle.

    One cycle performs, in order:

    1. If a job was already picked up (``self.jobCount``), check the drain
       flag file and the filling-mode CPU budget, and publish the remaining
       CPU time to the computing element and the local configuration file.
    2. Check that the computing element has an available slot.
    3. Build the resource description and request a job from the Matcher.
    4. Validate the Matcher answer and the JDL parameters of the job.
    5. Set up the proxy, install the software and submit the job.
    6. Update ``self.timeLeft`` (or ``self.timeLeftError``) for the next cycle.

    :return: ``S_OK(message)`` when the cycle ends normally (including the
             "no job matched" cases), ``S_ERROR`` when the pilot version is
             rejected, the failed ``getDescription`` result as is, or the
             value returned by ``self.__finish`` / ``self.__rescheduleFailedJob``
             when the agent has to stop or the job is rescheduled.
    """
    if self.jobCount:
        # Temporary mechanism to pass a shutdown message to the agent
        if os.path.exists('/var/lib/dirac_drain'):
            return self.__finish('Node is being drained by an operator')

        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if not self.fillingMode:
            return self.__finish('Filling Mode is Disabled')

        if self.timeLeftError:
            self.log.warn(self.timeLeftError)
            return self.__finish(self.timeLeftError)

        self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
        if self.timeLeft <= self.minimumTimeLeft:
            return self.__finish('No more time left')

        # Need to update the Configuration so that the new value is published
        # in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
            return self.__finish(result['Message'])

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
            localConfigFile = os.path.join('.', self.extraOptions)
        else:
            localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
            localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)

    self.log.verbose('Job Agent execution loop')
    result = self.computingElement.available()
    if not result['OK']:
        self.log.info('Resource is not available')
        self.log.info(result['Message'])
        return self.__finish('CE Not Available')

    self.log.info(result['Message'])

    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']

    if not availableSlots:
        if runningJobs:
            # Jobs are still running: end this cycle without stopping the agent
            self.log.info('No available slots with %d running jobs' % runningJobs)
            return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
        self.log.info('CE is not available')
        return self.__finish('CE Not Available')

    result = self.computingElement.getDescription()
    if not result['OK']:
        return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
        requirementsDict = result['Value']
        ceDict.update(requirementsDict)
        self.log.info('Requirements:', requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = MatcherClient().requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        requestMessage = jobRequest['Message']
        # The order of the checks matters: it decides which log entry is
        # written when a message matches more than one pattern.
        if re.search('No match found', requestMessage):
            self.log.notice('Job request OK: %s' % (requestMessage))
        elif requestMessage.find("seconds timeout") != -1:
            self.log.error('Timeout while requesting job', requestMessage)
        elif requestMessage.find("Pilot version does not match") != -1:
            # The only failure that is not counted: it is returned as an error
            errorMsg = 'Pilot version does not match the production version'
            self.log.error(errorMsg, requestMessage.replace(errorMsg, ''))
            return S_ERROR(requestMessage)
        else:
            self.log.notice('Failed to get jobs: %s' % (requestMessage))

        # Every other failure counts as a failed match; stop the agent once
        # the configured number of consecutive failures is exceeded.
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
            return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(requestMessage)

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        if not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the Matcher returned besides the mandatory parameters
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn(parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    # From here on the JobID found in the JDL is used
    jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    if 'CPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')

    # Job requirement for a number of processors
    processors = int(params.get('NumberOfProcessors', 1))
    wholeNode = 'WholeNode' in params

    if self.extraOptions:
        # A JDL without 'Arguments' must not raise a KeyError here: the job
        # is already matched and this code runs outside the try block below,
        # so the job would be neither reported nor rescheduled.
        if 'Arguments' in params:
            params['Arguments'] += ' ' + self.extraOptions
        else:
            params['Arguments'] = self.extraOptions
        params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
                jobReport.setJobParameter(thisp, gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False)

        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
        proxyChain = result.get('Value')

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job', '%s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.debug('Before %sCE submitJob()' % (self.ceName))
        result = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain, processors, wholeNode)
        if not result['OK']:
            self.__report(jobID, 'Failed', result['Message'])
            return self.__finish(result['Message'])
        elif 'PayloadFailed' in result:
            # Do not keep running and do not overwrite the Payload error
            message = 'Payload execution failed with error code %s' % result['PayloadFailed']
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            self.log.info(message)

        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
        self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
        return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception', self.stopOnApplicationFailure)

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
    if result['OK']:
        self.timeLeft = result['Value']
    elif result['Message'] != 'Current batch system is not supported':
        # Remembered here and acted upon at the start of the next cycle
        self.timeLeftError = result['Message']
    else:
        # if the batch system is not defined, use the process time and the
        # CPU normalization defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    return S_OK('Job Agent cycle complete')
def main():
    """Entry point of the DoCtaIrf job script.

    Registers the command line switches, makes sure the required software
    packages are present (shared area, then local area, then a local
    installation), and runs the ``DoCtaIrf`` executable through
    ``HapApplication``. The process is ended with ``DIRAC.exit``: status -1
    when a package is unavailable or the application fails, default status
    otherwise.
    """
    from DIRAC.Core.Base import Script

    # (short option, long option, description, callback) in registration order
    switchTable = (
        # DoCtaIrf options
        ("A:", "analysis=", "Analysis Type", setAnalysisType),
        ("C:", "cuts=", "Cuts Config", setCutsConfig),
        ("R:", "runlist=", "Runlist", setRunlist),
        ("Z:", "zenith=", "Zenith", setZenith),
        ("O:", "offset=", "Offset", setOffset),
        ("M:", "energy=", "Energy Method", setEnergyMethod),
        ("T:", "arrayconfig=", "Array Configuration", setArrayConfig),
        ("P:", "particle=", "Particle Type", setParticleType),
        # other options
        ("V:", "version=", "HAP version", setVersion),
    )
    for shortOpt, longOpt, description, callback in switchTable:
        Script.registerSwitch(shortOpt, longOpt, description, callback)

    Script.parseCommandLine(ignoreErrors=True)
    positionalArgs = Script.getPositionalArgs()
    if not positionalArgs:
        Script.showHelp()

    from CTADIRAC.Core.Workflow.Modules.HapApplication import HapApplication
    from CTADIRAC.Core.Utilities.SoftwareInstallation import (
        checkSoftwarePackage,
        installSoftwarePackage,
        getSoftwareEnviron,
        localArea,
        sharedArea,
    )
    from DIRAC.Core.Utilities.Subprocess import systemCall
    from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

    jobReport = JobReport(int(os.environ['JOBID']))

    hapApp = HapApplication()

    hapPackage = 'HAP/' + version + '/HAP'

    for package in ['HESS/v0.2/lib', 'HESS/v0.3/root', hapPackage]:
        DIRAC.gLogger.notice('Checking:', package)

        if sharedArea and checkSoftwarePackage(package, sharedArea())['OK']:
            DIRAC.gLogger.notice('Package found in Shared Area:', package)
            continue

        if localArea:
            if checkSoftwarePackage(package, localArea())['OK']:
                DIRAC.gLogger.notice('Package found in Local Area:', package)
                continue
            # Not found anywhere: last resort is installing it locally
            if installSoftwarePackage(package, localArea())['OK']:
                continue

        DIRAC.gLogger.error('Check Failed for software package:', package)
        DIRAC.gLogger.error('Software package not available')
        DIRAC.exit(-1)

    hapApp.setSoftwarePackage(hapPackage)
    hapApp.hapExecutable = 'DoCtaIrf'

    runlistdir = os.environ['PWD']
    build_infile(runlist)

    hapApp.hapArguments = [
        analysistype,
        cutsconfig,
        runlistdir,
        runlist,
        zenith,
        offset,
        arrayconfig,
        energymethod,
        particle,
    ]

    DIRAC.gLogger.notice('Executing Hap Application')
    outcome = hapApp.execute()

    if not outcome['OK']:
        DIRAC.gLogger.error('Failed to execute Hap Application')
        jobReport.setApplicationStatus('Hap Application: Failed')
        DIRAC.exit(-1)

    DIRAC.exit()
def main(): from DIRAC.Core.Base import Script Script.registerSwitch("p:", "run_number=", "Run Number", setRunNumber) Script.registerSwitch("T:", "template=", "Template", setCorsikaTemplate) Script.registerSwitch("E:", "executable=", "Executable", setExecutable) Script.registerSwitch("S:", "simtelConfig=", "SimtelConfig", setConfig) Script.registerSwitch("V:", "version=", "Version", setVersion) Script.registerSwitch("M:", "mode=", "Mode", setMode) Script.registerSwitch("C:", "savecorsika=", "Save Corsika", setSaveCorsika) from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient from DIRAC.Resources.Catalog.FileCatalog import FileCatalog Script.parseCommandLine() global fcc, fcL, storage_element from CTADIRAC.Core.Utilities.SoftwareInstallation import getSoftwareEnviron from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwareEnviron from CTADIRAC.Core.Utilities.SoftwareInstallation import workingArea from CTADIRAC.Core.Workflow.Modules.CorsikaApp import CorsikaApp from CTADIRAC.Core.Workflow.Modules.Read_CtaApp import Read_CtaApp from DIRAC.Core.Utilities.Subprocess import systemCall jobID = os.environ['JOBID'] jobID = int(jobID) global jobReport jobReport = JobReport(jobID) ########### ## Checking MD coherence fc = FileCatalog('LcgFileCatalog') res = fc._getCatalogConfigDetails('DIRACFileCatalog') print 'DFC CatalogConfigDetails:', res res = fc._getCatalogConfigDetails('LcgFileCatalog') print 'LCG CatalogConfigDetails:', res fcc = FileCatalogClient() fcL = FileCatalog('LcgFileCatalog') from DIRAC.Interfaces.API.Dirac import Dirac dirac = Dirac() ############# simtelConfigFilesPath = 'sim_telarray/multi' simtelConfigFile = simtelConfigFilesPath + '/multi_cta-ultra5.cfg' #simtelConfigFile = simtelConfigFilesPath + '/multi_cta-prod1s.cfg' createGlobalsFromConfigFiles('prodConfigFile', corsikaTemplate, version) ######################Building prod Directory Metadata ####################### resultCreateProdDirMD = createProdFileSystAndMD() 
if not resultCreateProdDirMD['OK']: DIRAC.gLogger.error('Failed to create prod Directory MD') jobReport.setApplicationStatus('Failed to create prod Directory MD') DIRAC.gLogger.error('Metadata coherence problem, no file produced') DIRAC.exit(-1) else: print 'prod Directory MD successfully created' ######################Building corsika Directory Metadata ####################### resultCreateCorsikaDirMD = createCorsikaFileSystAndMD() if not resultCreateCorsikaDirMD['OK']: DIRAC.gLogger.error('Failed to create corsika Directory MD') jobReport.setApplicationStatus('Failed to create corsika Directory MD') DIRAC.gLogger.error( 'Metadata coherence problem, no corsikaFile produced') DIRAC.exit(-1) else: print 'corsika Directory MD successfully created' ############ Producing Corsika File global CorsikaSimtelPack CorsikaSimtelPack = os.path.join('corsika_simhessarray', version, 'corsika_simhessarray') install_CorsikaSimtelPack(version, 'sim') cs = CorsikaApp() cs.setSoftwarePackage(CorsikaSimtelPack) cs.csExe = executable cs.csArguments = [ '--run-number', run_number, '--run', 'corsika', corsikaTemplate ] corsikaReturnCode = cs.execute() if corsikaReturnCode != 0: DIRAC.gLogger.error('Corsika Application: Failed') jobReport.setApplicationStatus('Corsika Application: Failed') DIRAC.exit(-1) ###################### rename of corsika output file ####################### rundir = 'run' + run_number filein = rundir + '/' + corsikaOutputFileName corsikaFileName = particle + '_' + thetaP + '_' + phiP + '_alt' + obslev + '_' + 'run' + run_number + '.corsika.gz' mv_cmd = 'mv ' + filein + ' ' + corsikaFileName if (os.system(mv_cmd)): DIRAC.exit(-1) ######################## ######################## ## files spread in 1000-runs subDirectories runNum = int(run_number) subRunNumber = '%03d' % runNum runNumModMille = runNum % 1000 runNumTrunc = (runNum - runNumModMille) / 1000 runNumSeriesDir = '%03dxxx' % runNumTrunc print 'runNumSeriesDir=', runNumSeriesDir ### create corsika tar luisa 
#################### corsikaTarName = particle + '_' + thetaP + '_' + phiP + '_alt' + obslev + '_' + 'run' + run_number + '.corsika.tar.gz' filetar1 = rundir + '/' + 'input' filetar2 = rundir + '/' + 'DAT' + run_number + '.dbase' filetar3 = rundir + '/run' + str(int(run_number)) + '.log' cmdTuple = [ '/bin/tar', 'zcf', corsikaTarName, filetar1, filetar2, filetar3 ] DIRAC.gLogger.notice('Executing command tuple:', cmdTuple) ret = systemCall(0, cmdTuple, sendOutput) if not ret['OK']: DIRAC.gLogger.error('Failed to execute tar') DIRAC.exit(-1) ###################################################### corsikaOutFileDir = os.path.join(corsikaDirPath, particle, 'Data', runNumSeriesDir) corsikaOutFileLFN = os.path.join(corsikaOutFileDir, corsikaFileName) corsikaRunNumberSeriesDirExist = fcc.isDirectory( corsikaOutFileDir)['Value']['Successful'][corsikaOutFileDir] newCorsikaRunNumberSeriesDir = ( corsikaRunNumberSeriesDirExist != True ) # if new runFileSeries, will need to add new MD #### create a file to DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK ################ f = open('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK', 'w') f.close() if savecorsika == 'True': DIRAC.gLogger.notice('Put and register corsika File in LFC and DFC:', corsikaOutFileLFN) ret = dirac.addFile(corsikaOutFileLFN, corsikaFileName, storage_element) res = CheckCatalogCoherence(corsikaOutFileLFN) if res != DIRAC.S_OK: DIRAC.gLogger.error('Job failed: Catalog Coherence problem found') jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) if not ret['OK']: DIRAC.gLogger.error('Error during addFile call:', ret['Message']) jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) # put and register corsikaTarFile: corsikaTarFileDir = os.path.join(corsikaDirPath, particle, 'Log', runNumSeriesDir) corsikaTarFileLFN = os.path.join(corsikaTarFileDir, corsikaTarName) ##### If storage element is IN2P3-tape save simtel file on disk ############### if storage_element == 'CC-IN2P3-Tape': 
storage_element = 'CC-IN2P3-Disk' DIRAC.gLogger.notice( 'Put and register corsikaTar File in LFC and DFC:', corsikaTarFileLFN) ret = dirac.addFile(corsikaTarFileLFN, corsikaTarName, storage_element) ####Checking and restablishing catalog coherence ##################### res = CheckCatalogCoherence(corsikaTarFileLFN) if res != DIRAC.S_OK: DIRAC.gLogger.error('Job failed: Catalog Coherence problem found') jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) if not ret['OK']: DIRAC.gLogger.error('Error during addFile call:', ret['Message']) jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) ###################################################################### if newCorsikaRunNumberSeriesDir: insertRunFileSeriesMD(corsikaOutFileDir, runNumTrunc) insertRunFileSeriesMD(corsikaTarFileDir, runNumTrunc) ###### insert corsika File Level metadata ############################################ corsikaFileMD = {} corsikaFileMD['runNumber'] = int(run_number) corsikaFileMD['jobID'] = jobID corsikaFileMD['corsikaReturnCode'] = corsikaReturnCode corsikaFileMD['nbShowers'] = nbShowers result = fcc.setMetadata(corsikaOutFileLFN, corsikaFileMD) print "result setMetadata=", result if not result['OK']: print 'ResultSetMetadata:', result['Message'] result = fcc.setMetadata(corsikaTarFileLFN, corsikaFileMD) print "result setMetadata=", result if not result['OK']: print 'ResultSetMetadata:', result['Message'] ##### Exit now if only corsika simulation required if (mode == 'corsika_standalone'): DIRAC.exit() ############ Producing SimTel File ######################Building simtel Directory Metadata ####################### cfg_dict = { "4MSST": 'cta-prod2-4m-dc', "SCSST": 'cta-prod2-sc-sst', "STD": 'cta-prod2', "NSBX3": 'cta-prod2', "ASTRI": 'cta-prod2-astri', "SCMST": 'cta-prod2-sc3', "NORTH": 'cta-prod2n' } if simtelConfig == "6INROW": all_configs = ["4MSST", "SCSST", "ASTRI", "NSBX3", "STD", "SCMST"] elif simtelConfig == "5INROW": all_configs = 
["4MSST", "SCSST", "ASTRI", "NSBX3", "STD"] elif simtelConfig == "3INROW": all_configs = ["SCSST", "STD", "SCMST"] else: all_configs = [simtelConfig] ############################################ #for current_conf in all_configs: #DIRAC.gLogger.notice('current conf is',current_conf) #if current_conf == "SCMST": #current_version = version + '_sc3' #DIRAC.gLogger.notice('current version is', current_version) #if os.path.isdir('sim_telarray'): #DIRAC.gLogger.notice('Package found in the local area. Removing package...') #cmd = 'rm -R sim_telarray corsika-6990 hessioxxx corsika-run' #if(os.system(cmd)): #DIRAC.exit( -1 ) #install_CorsikaSimtelPack(current_version) #else: #current_version = version #DIRAC.gLogger.notice('current version is', current_version) ############################################################# for current_conf in all_configs: DIRAC.gLogger.notice('current conf is', current_conf) if current_conf == "SCMST": current_version = version + '_sc3' DIRAC.gLogger.notice('current version is', current_version) installSoftwareEnviron(CorsikaSimtelPack, workingArea(), 'sim-sc3') else: current_version = version DIRAC.gLogger.notice('current version is', current_version) ######################################################## global simtelDirPath global simtelProdVersion simtelProdVersion = current_version + '_simtel' simtelDirPath = os.path.join(corsikaParticleDirPath, simtelProdVersion) resultCreateSimtelDirMD = createSimtelFileSystAndMD(current_conf) if not resultCreateSimtelDirMD['OK']: DIRAC.gLogger.error('Failed to create simtelArray Directory MD') jobReport.setApplicationStatus( 'Failed to create simtelArray Directory MD') DIRAC.gLogger.error( 'Metadata coherence problem, no simtelArray File produced') DIRAC.exit(-1) else: DIRAC.gLogger.notice('simtel Directory MD successfully created') ############## check simtel data file LFN exists ######################## simtelFileName = particle + '_' + str(thetaP) + '_' + str( phiP) + '_alt' + str( obslev) + '_' 
+ 'run' + run_number + '.simtel.gz' simtelDirPath_conf = simtelDirPath + '_' + current_conf simtelOutFileDir = os.path.join(simtelDirPath_conf, 'Data', runNumSeriesDir) simtelOutFileLFN = os.path.join(simtelOutFileDir, simtelFileName) res = CheckCatalogCoherence(simtelOutFileLFN) if res == DIRAC.S_OK: DIRAC.gLogger.notice('Current conf already done', current_conf) continue #### execute simtelarray ################ fd = open('run_sim.sh', 'w') fd.write("""#! /bin/sh source ./Corsika_simhessarrayEnv.sh export SVNPROD2=$PWD export SVNTAG=SVN-PROD2_rev10503 export CORSIKA_IO_BUFFER=800MB cp ../grid_prod2-repro.sh . ln -s ../%s ln -s ../$SVNTAG ./grid_prod2-repro.sh %s %s""" % (corsikaFileName, corsikaFileName, current_conf)) fd.close() #################################### os.system('chmod u+x run_sim.sh') cmdTuple = ['./run_sim.sh'] ret = systemCall(0, cmdTuple, sendOutputSimTel) simtelReturnCode, stdout, stderr = ret['Value'] if (os.system('grep Broken simtel.log') == 0): DIRAC.gLogger.error('Broken string found in simtel.log') jobReport.setApplicationStatus('Broken pipe') DIRAC.exit(-1) if not ret['OK']: DIRAC.gLogger.error('Failed to execute run_sim.sh') DIRAC.gLogger.error('run_sim.sh status is:', simtelReturnCode) DIRAC.exit(-1) ## check simtel data/log/histo Output File exist cfg = cfg_dict[current_conf] #cmd = 'mv Data/sim_telarray/' + cfg + '/0.0deg/Data/*.simtel.gz ' + simtelFileName if current_conf == "SCMST": cmdprefix = 'mv sim-sc3/Data/sim_telarray/' + cfg + '/0.0deg/' else: cmdprefix = 'mv sim/Data/sim_telarray/' + cfg + '/0.0deg/' cmd = cmdprefix + 'Data/*' + cfg + '_*.simtel.gz ' + simtelFileName if (os.system(cmd)): DIRAC.exit(-1) ############################################ simtelRunNumberSeriesDirExist = fcc.isDirectory( simtelOutFileDir)['Value']['Successful'][simtelOutFileDir] newSimtelRunFileSeriesDir = ( simtelRunNumberSeriesDirExist != True ) # if new runFileSeries, will need to add new MD simtelLogFileName = particle + '_' + str(thetaP) + '_' + 
str( phiP) + '_alt' + str(obslev) + '_' + 'run' + run_number + '.log.gz' #cmd = 'mv Data/sim_telarray/' + cfg + '/0.0deg/Log/*.log.gz ' + simtelLogFileName cmd = cmdprefix + 'Log/*' + cfg + '_*.log.gz ' + simtelLogFileName if (os.system(cmd)): DIRAC.exit(-1) simtelOutLogFileDir = os.path.join(simtelDirPath_conf, 'Log', runNumSeriesDir) simtelOutLogFileLFN = os.path.join(simtelOutLogFileDir, simtelLogFileName) simtelHistFileName = particle + '_' + str(thetaP) + '_' + str( phiP) + '_alt' + str( obslev) + '_' + 'run' + run_number + '.hdata.gz' #cmd = 'mv Data/sim_telarray/' + cfg + '/0.0deg/Histograms/*.hdata.gz ' + simtelHistFileName cmd = cmdprefix + 'Histograms/*' + cfg + '_*.hdata.gz ' + simtelHistFileName if (os.system(cmd)): DIRAC.exit(-1) simtelOutHistFileDir = os.path.join(simtelDirPath_conf, 'Histograms', runNumSeriesDir) simtelOutHistFileLFN = os.path.join(simtelOutHistFileDir, simtelHistFileName) ########### quality check on Histo ############################################# fd = open('check_histo.sh', 'w') fd.write("""#! 
/bin/sh nsim=$(list_histograms %s|fgrep 'Histogram 6 '|sed 's/^.*contents: //'| sed 's:/.*$::') nevents=%d if [ $nsim -lt $(( $nevents - 20 )) ]; then echo 'nsim found:' $nsim echo 'nsim expected:' $nevents exit 1 else echo 'nsim found:' $nsim echo 'nsim expected:' $nevents fi """ % (simtelHistFileName, int(nbShowers) * int(cscat))) fd.close() ret = getSoftwareEnviron(CorsikaSimtelPack) if not ret['OK']: error = ret['Message'] DIRAC.gLogger.error(error, CorsikaSimtelPack) DIRAC.exit(-1) corsikaEnviron = ret['Value'] os.system('chmod u+x check_histo.sh') cmdTuple = ['./check_histo.sh'] DIRAC.gLogger.notice('Executing command tuple:', cmdTuple) ret = systemCall(0, cmdTuple, sendOutput, env=corsikaEnviron) checkHistoReturnCode, stdout, stderr = ret['Value'] if not ret['OK']: DIRAC.gLogger.error('Failed to execute check_histo.sh') DIRAC.gLogger.error('check_histo.sh status is:', checkHistoReturnCode) DIRAC.exit(-1) if (checkHistoReturnCode != 0): DIRAC.gLogger.error('Failure during check_histo.sh') DIRAC.gLogger.error('check_histo.sh status is:', checkHistoReturnCode) jobReport.setApplicationStatus('Histo check Failed') DIRAC.exit(-1) ########## quality check on Log ############################# cmd = 'zcat %s | grep Finished.' 
% simtelLogFileName DIRAC.gLogger.notice('Executing system call:', cmd) if (os.system(cmd)): jobReport.setApplicationStatus('Log check Failed') DIRAC.exit(-1) ################################################ from DIRAC.Core.Utilities import List from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations opsHelper = Operations() global seList seList = opsHelper.getValue('ProductionOutputs/SimtelProd', []) seList = List.randomize(seList) DIRAC.gLogger.notice('SeList is:', seList) ######### Upload simtel data/log/histo ############################################## res = upload_to_seList(simtelOutFileLFN, simtelFileName) if res != DIRAC.S_OK: DIRAC.gLogger.error('OutputData Upload Error', simtelOutFileLFN) jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) res = CheckCatalogCoherence(simtelOutLogFileLFN) if res == DIRAC.S_OK: DIRAC.gLogger.notice('Log file already exists. Removing:', simtelOutLogFileLFN) ret = dirac.removeFile(simtelOutLogFileLFN) res = upload_to_seList(simtelOutLogFileLFN, simtelLogFileName) if res != DIRAC.S_OK: DIRAC.gLogger.error('Upload simtel Log Error', simtelOutLogFileLFN) DIRAC.gLogger.notice('Removing simtel data file:', simtelOutFileLFN) ret = dirac.removeFile(simtelOutFileLFN) jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) res = CheckCatalogCoherence(simtelOutHistFileLFN) if res == DIRAC.S_OK: DIRAC.gLogger.notice('Histo file already exists. 
Removing:', simtelOutHistFileLFN) ret = dirac.removeFile(simtelOutHistFileLFN) res = upload_to_seList(simtelOutHistFileLFN, simtelHistFileName) if res != DIRAC.S_OK: DIRAC.gLogger.error('Upload simtel Histo Error', simtelOutHistFileLFN) DIRAC.gLogger.notice('Removing simtel data file:', simtelOutFileLFN) ret = dirac.removeFile(simtelOutFileLFN) DIRAC.gLogger.notice('Removing simtel log file:', simtelOutLogFileLFN) ret = dirac.removeFile(simtelOutLogFileLFN) jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) # simtelRunNumberSeriesDirExist = fcc.isDirectory(simtelOutFileDir)['Value']['Successful'][simtelOutFileDir] # newSimtelRunFileSeriesDir = (simtelRunNumberSeriesDirExist != True) # if new runFileSeries, will need to add new MD if newSimtelRunFileSeriesDir: print 'insertRunFileSeriesMD' insertRunFileSeriesMD(simtelOutFileDir, runNumTrunc) insertRunFileSeriesMD(simtelOutLogFileDir, runNumTrunc) insertRunFileSeriesMD(simtelOutHistFileDir, runNumTrunc) else: print 'NotinsertRunFileSeriesMD' ###### simtel File level metadata ############################################ simtelFileMD = {} simtelFileMD['runNumber'] = int(run_number) simtelFileMD['jobID'] = jobID simtelFileMD['simtelReturnCode'] = simtelReturnCode result = fcc.setMetadata(simtelOutFileLFN, simtelFileMD) print "result setMetadata=", result if not result['OK']: print 'ResultSetMetadata:', result['Message'] result = fcc.setMetadata(simtelOutLogFileLFN, simtelFileMD) print "result setMetadata=", result if not result['OK']: print 'ResultSetMetadata:', result['Message'] result = fcc.setMetadata(simtelOutHistFileLFN, simtelFileMD) print "result setMetadata=", result if not result['OK']: print 'ResultSetMetadata:', result['Message'] if savecorsika == 'True': result = fcc.addFileAncestors( {simtelOutFileLFN: { 'Ancestors': [corsikaOutFileLFN] }}) print 'result addFileAncestor:', result result = fcc.addFileAncestors( {simtelOutLogFileLFN: { 'Ancestors': [corsikaOutFileLFN] }}) print 'result 
addFileAncestor:', result result = fcc.addFileAncestors( {simtelOutHistFileLFN: { 'Ancestors': [corsikaOutFileLFN] }}) print 'result addFileAncestor:', result ##### Exit now if only corsika simulation required if (mode == 'corsika_simtel'): continue ######### run read_cta ####################################### rcta = Read_CtaApp() rcta.setSoftwarePackage(CorsikaSimtelPack) rcta.rctaExe = 'read_cta' powerlaw_dict = { 'gamma': '-2.57', 'gamma_ptsrc': '-2.57', 'proton': '-2.70', 'electron': '-3.21' } dstFileName = particle + '_' + str(thetaP) + '_' + str( phiP) + '_alt' + str( obslev) + '_' + 'run' + run_number + '.simtel-dst0.gz' dstHistoFileName = particle + '_' + str(thetaP) + '_' + str( phiP) + '_alt' + str( obslev) + '_' + 'run' + run_number + '.hdata-dst0.gz' ## added some options starting from Armazones_2K prod. rcta.rctaArguments = [ '-r', '4', '-u', '--integration-scheme', '4', '--integration-window', '7,3', '--tail-cuts', '6,8', '--min-pix', '2', '--min-amp', '20', '--type', '1,0,0,400', '--tail-cuts', '9,12', '--min-amp', '20', '--type', '2,0,0,100', '--tail-cuts', '8,11', '--min-amp', '19', '--type', '3,0,0,40', '--tail-cuts', '6,9', '--min-amp', '15', '--type', '4,0,0,15', '--tail-cuts', '3.7,5.5', '--min-amp', '8', '--type', '5,0,0,70,5.6', '--tail-cuts', '2.4,3.2', '--min-amp', '5.6', '--dst-level', '0', '--dst-file', dstFileName, '--histogram-file', dstHistoFileName, '--powerlaw', powerlaw_dict[particle], simtelFileName ] rctaReturnCode = rcta.execute() if rctaReturnCode != 0: DIRAC.gLogger.error('read_cta Application: Failed') jobReport.setApplicationStatus('read_cta Application: Failed') DIRAC.exit(-1) ######## run dst quality checks ###################################### fd = open('check_dst_histo.sh', 'w') fd.write("""#! 
/bin/sh dsthistfilename=%s dstfile=%s n6="$(list_histograms -h 6 ${dsthistfilename} | grep 'Histogram of type' | sed 's/.*bins, //' | sed 's/ entries.//')" n12001="$(list_histograms -h 12001 ${dsthistfilename} | grep 'Histogram of type' | sed 's/.*bins, //' | sed 's/ entries.//')" if [ $n6 -ne $n12001 ]; then echo 'n6 found:' $n6 echo 'n12001 found:' $n12001 exit 1 else echo 'n6 found:' $n6 echo 'n12001 found:' $n12001 fi n12002="$(list_histograms -h 12002 ${dsthistfilename} | grep 'Histogram of type' | sed 's/.*bins, //' | sed 's/ entries.//')" nev="$(statio ${dstfile} | egrep '^2010' | cut -f2)" if [ -z "$nev" ]; then nev="0"; fi if [ $nev -ne $n12002 ]; then echo 'nev found:' $nev echo 'n12002 found:' $n12002 exit 1 else echo 'nev found:' $nev echo 'n12002 found:' $n12002 fi """ % (dstHistoFileName, dstFileName)) fd.close() os.system('chmod u+x check_dst_histo.sh') cmdTuple = ['./check_dst_histo.sh'] DIRAC.gLogger.notice('Executing command tuple:', cmdTuple) ret = systemCall(0, cmdTuple, sendOutput, env=corsikaEnviron) checkHistoReturnCode, stdout, stderr = ret['Value'] if not ret['OK']: DIRAC.gLogger.error('Failed to execute check_dst_histo.sh') DIRAC.gLogger.error('check_dst_histo.sh status is:', checkHistoReturnCode) DIRAC.exit(-1) if (checkHistoReturnCode != 0): DIRAC.gLogger.error('Failure during check_dst_histo.sh') DIRAC.gLogger.error('check_dst_histo.sh status is:', checkHistoReturnCode) jobReport.setApplicationStatus('Histo check Failed') DIRAC.exit(-1) ############create MD and upload dst data/histo ########################################################## global dstDirPath global dstProdVersion dstProdVersion = current_version + '_dst' dstDirPath = os.path.join(simtelDirPath_conf, dstProdVersion) dstOutFileDir = os.path.join(dstDirPath, 'Data', runNumSeriesDir) dstOutFileLFN = os.path.join(dstOutFileDir, dstFileName) resultCreateDstDirMD = createDstFileSystAndMD() if not resultCreateDstDirMD['OK']: DIRAC.gLogger.error('Failed to create Dst Directory 
MD') jobReport.setApplicationStatus('Failed to create Dst Directory MD') DIRAC.gLogger.error( 'Metadata coherence problem, no Dst File produced') DIRAC.exit(-1) else: DIRAC.gLogger.notice('Dst Directory MD successfully created') ############################################################ res = CheckCatalogCoherence(dstOutFileLFN) if res == DIRAC.S_OK: DIRAC.gLogger.notice('dst file already exists. Removing:', dstOutFileLFN) ret = dirac.removeFile(dstOutFileLFN) res = upload_to_seList(dstOutFileLFN, dstFileName) if res != DIRAC.S_OK: DIRAC.gLogger.error('Upload dst Error', dstOutFileLFN) jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) ############################################################## dstHistoFileDir = os.path.join(dstDirPath, 'Histograms', runNumSeriesDir) dstHistoFileLFN = os.path.join(dstHistoFileDir, dstHistoFileName) res = CheckCatalogCoherence(dstHistoFileLFN) if res == DIRAC.S_OK: DIRAC.gLogger.notice('dst histo file already exists. Removing:', dstHistoFileLFN) ret = dirac.removeFile(dstHistoFileLFN) res = upload_to_seList(dstHistoFileLFN, dstHistoFileName) if res != DIRAC.S_OK: DIRAC.gLogger.error('Upload dst Error', dstHistoFileName) jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) ########### Insert RunNumSeries MD ########################## dstRunNumberSeriesDirExist = fcc.isDirectory( dstOutFileDir)['Value']['Successful'][dstOutFileDir] newDstRunFileSeriesDir = ( dstRunNumberSeriesDirExist != True ) # if new runFileSeries, will need to add new MD if newDstRunFileSeriesDir: insertRunFileSeriesMD(dstOutFileDir, runNumTrunc) insertRunFileSeriesMD(dstHistoFileDir, runNumTrunc) ####### dst File level metadata ############################################### dstFileMD = {} dstFileMD['runNumber'] = int(run_number) dstFileMD['jobID'] = jobID dstFileMD['rctaReturnCode'] = rctaReturnCode result = fcc.setMetadata(dstOutFileLFN, dstFileMD) print "result setMetadata=", result if not result['OK']: print 
'ResultSetMetadata:', result['Message'] result = fcc.setMetadata(dstHistoFileLFN, dstFileMD) print "result setMetadata=", result if not result['OK']: print 'ResultSetMetadata:', result['Message'] ########## set the ancestors for dst ##################################### result = fcc.addFileAncestors( {dstOutFileLFN: { 'Ancestors': [simtelOutFileLFN] }}) print 'result addFileAncestor:', result result = fcc.addFileAncestors( {dstHistoFileLFN: { 'Ancestors': [simtelOutFileLFN] }}) print 'result addFileAncestor:', result ###################################################### DIRAC.exit()
def execute(self):
    """Run one JobAgent cycle: check the slot, match a job and submit it.

    The cycle goes through these stages, stopping at the first that fails:

    1. When a job was already run (``self.jobCount``), check the drain
       file and the filling-mode settings, publish the CPU time left to
       the computing element and store it in the local configuration file.
    2. Ask the computing element whether it is available and for its
       description (a single dict or a list of dicts).
    3. Decorate each CE dict with pilot information and request a job from
       the Matcher until one request succeeds.
    4. Validate the Matcher answer and the JDL parameters.
    5. Set up the proxy, install software, submit the job.
    6. Recompute the CPU time left for the next cycle.

    :return: the value of ``self.__finish(...)`` when the agent has to stop,
             ``S_ERROR`` on a pilot version mismatch, the failed result of
             ``getDescription()``, otherwise ``S_OK`` with a short message.
    """
    if self.jobCount:
        # Temporary mechanism to pass a shutdown message to the agent
        if os.path.exists('/var/lib/dirac_drain'):
            return self.__finish('Node is being drained by an operator')

        # The timeLeft utility is only meaningful once a job has been run
        self.log.info('Attempting to check CPU time left for filling mode')
        if not self.fillingMode:
            return self.__finish('Filling Mode is Disabled')

        if self.timeLeftError:
            self.log.warn(
                "Disabling filling mode as errors calculating time left",
                self.timeLeftError)
            return self.__finish(self.timeLeftError)

        self.log.info('normalized CPU units remaining in slot', self.timeLeft)
        if self.timeLeft <= self.minimumTimeLeft:
            return self.__finish('No more time left')

        # Publish the new value so that the next matching request uses it
        cpuLeftUpdate = self.computingElement.setCPUTimeLeft(
            cpuTimeLeft=self.timeLeft)
        if not cpuLeftUpdate['OK']:
            return self.__finish(cpuLeftUpdate['Message'])

        # Store the value in the configuration read by the job wrappers
        siteCfg = CFG()
        if self.extraOptions:
            cfgPath = os.path.join('.', self.extraOptions)
        else:
            cfgPath = os.path.join(rootPath, "etc", "dirac.cfg")
        siteCfg.loadFromFile(cfgPath)
        if not siteCfg.isSection('/LocalSite'):
            siteCfg.createNewSection('/LocalSite')
        siteCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        siteCfg.writeToFile(cfgPath)

    self.log.verbose('Job Agent execution loop')
    availability = self.computingElement.available()
    if not availability['OK']:
        self.log.info('Resource is not available', availability['Message'])
        return self.__finish('CE Not Available')

    runningJobs = availability['CEInfoDict'].get("RunningJobs")
    if not availability['Value']:
        if not runningJobs:
            self.log.info('CE is not available')
            return self.__finish('CE Not Available')
        self.log.info('No available slots', '%d running jobs' % runningJobs)
        return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)

    description = self.computingElement.getDescription()
    if not description['OK']:
        return description

    # Several prioritized job retrieval strategies may be offered
    ceDescription = description['Value']
    if isinstance(ceDescription, dict):
        ceDictList = [ceDescription]
    elif isinstance(ceDescription, list):
        # Pool ComputingElement with the 'MultiProcessorStrategy' parameter
        ceDictList = ceDescription

    for ceDict in ceDictList:
        # Pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if 'PilotReference' not in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Optional job requirements from the configuration
        requirements = gConfig.getOptionsDict('/AgentJobRequirements')
        if requirements['OK']:
            requirementsDict = requirements['Value']
            ceDict.update(requirementsDict)
            self.log.info('Requirements:', requirementsDict)

        self.log.verbose('CE dict', ceDict)

        # The actual call to the matcher, timed
        matchStart = time.time()
        jobRequest = MatcherClient().requestJob(ceDict)
        matchTime = time.time() - matchStart
        self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
        if jobRequest['OK']:
            break

    self.stopAfterFailedMatches = self.am_getOption(
        'StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        failure = jobRequest['Message']

        def countFailedMatch():
            """Count one more failed match; stop the agent past the limit."""
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish(
                    'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(failure)

        if re.search('No match found', failure):
            self.log.notice('Job request OK, but no match found', ': %s' % (failure))
            return countFailedMatch()
        if "seconds timeout" in failure:
            self.log.error('Timeout while requesting job', failure)
            return countFailedMatch()
        if "Pilot version does not match" in failure:
            errorMsg = 'Pilot version does not match the production version'
            self.log.error(errorMsg, failure.replace(errorMsg, ''))
            return S_ERROR(failure)
        self.log.notice('Failed to get jobs', ': %s' % (failure))
        return countFailedMatch()

    # A job was matched: reset the counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']

    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        if not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        self.log.verbose('Matcher returned', '%s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the Matcher returned beyond the mandatory parameters
    optimizerParams = {}
    for key in matcherInfo:
        if key in matcherParams:
            continue
        optimizerParams[key] = matcherInfo[key]

    parameters = self._getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn('Could Not Extract JDL Parameters', parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    jobID = params['JobID']

    if 'JobType' in params:
        jobType = params['JobType']
    else:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'

    if 'CPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')

    # Processor requirements of the payload
    minProcessors = int(params.get('MinNumberOfProcessors', 1))
    processors = int(params.get('NumberOfProcessors', minProcessors))
    # the maximum number of processors allowed to the payload
    maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0))
    # need or not the whole node for the job
    wholeNode = 'WholeNode' in params
    mpTag = 'MultiProcessor' in params.get('Tags', [])

    if self.extraOptions:
        extendedArguments = params.get('Arguments', '') + ' ' + self.extraOptions
        params['Arguments'] = extendedArguments.strip()
        params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info(
        'Received',
        'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' % (jobID, jobType, ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            boincOptions = ('BoincUserID', 'BoincHostID',
                            'BoincHostPlatform', 'BoincHostName')
            for thisp in boincOptions:
                boincValue = gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown')
                jobReport.setJobParameter(thisp, boincValue, sendFlag=False)

        jobReport.setJobStatus('Matched', 'Job Received by Agent')

        proxySetup = self._setupProxy(ownerDN, jobGroup)
        if not proxySetup['OK']:
            return self._rescheduleFailedJob(jobID, proxySetup['Message'],
                                             self.stopOnApplicationFailure)
        proxyChain = proxySetup.get('Value')

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self._checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job', '%s' % (jobID))
            errorMsg = software['Message'] or 'Failed software installation'
            return self._rescheduleFailedJob(jobID, errorMsg,
                                             self.stopOnApplicationFailure)

        self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName))
        submission = self._submitJob(jobID, params, ceDict, optimizerParams,
                                     proxyChain, processors, wholeNode,
                                     maxNumberOfProcessors, mpTag)
        if not submission['OK']:
            return self.__finish(submission['Message'])
        if 'PayloadFailed' in submission:
            # Do not keep running and do not overwrite the Payload error
            message = 'Payload execution failed with error code %s' % submission['PayloadFailed']
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            self.log.info(message)

        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
        self.log.exception("Exception in submission", "",
                           lException=subExcept, lExcInfo=True)
        return self._rescheduleFailedJob(jobID,
                                         'Job processing failed with exception',
                                         self.stopOnApplicationFailure)

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    timeLeftResult = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
    if timeLeftResult['OK']:
        self.timeLeft = timeLeftResult['Value']
    elif timeLeftResult['Message'] != 'Current batch system is not supported':
        self.timeLeftError = timeLeftResult['Message']
    else:
        # if the batch system is not defined, use the process time and the CPU normalization defined locally
        self.timeLeft = self._getCPUTimeLeft()

    return S_OK('Job Agent cycle complete')
def execute(arguments):
    """Run the full life cycle of one job through the JobWrapper.

    The phases are, in order: move to the working directory, initialize the
    JobWrapper, download the input sandbox, resolve the input data, execute
    the payload, process the outputs and finalize.

    :param dict arguments: job description; ``arguments['Job']['JobID']`` is
        mandatory, ``arguments['WorkingDirectory']`` and the ``InputSandbox``,
        ``InputData``, ``OutputSandbox`` and ``OutputData`` entries of
        ``arguments['Job']`` are optional.
    :return: ``1`` when a phase up to and including the payload execution
        failed, ``2`` when output processing or finalization failed,
        otherwise whatever ``job.finalize(arguments)`` returns.
    """
    global gJobReport

    jobID = arguments['Job']['JobID']
    # os.environ only accepts strings, while the JobID may arrive as a number
    os.environ['JOBID'] = str(jobID)
    jobID = int(jobID)

    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception('JobWrapperTemplate could not create working directory')
                rescheduleFailedJob(jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception as initError:
        gLogger.exception('JobWrapper failed the initialization phase', lException=initError)
        rescheduleResult = rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport)
        try:
            # ``job`` is unbound when the JobWrapper constructor itself raised;
            # the resulting NameError is caught and logged just below.
            job.sendJobAccounting(rescheduleResult, 'Job Wrapper Initialization')
        except Exception as accountingError:
            gLogger.exception('JobWrapper failed sending job accounting',
                              lException=accountingError)
        return 1

    jobDescription = arguments['Job']

    if 'InputSandbox' in jobDescription:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(jobDescription['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID, 'Input Sandbox Download', gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if 'InputData' in jobDescription:
        if jobDescription['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except Exception:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
    else:
        gLogger.verbose('Job has no InputData requirement')

    gJobReport.commit()

    try:
        result = job.execute(arguments)
        if not result['OK']:
            gLogger.error('Failed to execute job', result['Message'])
            raise JobWrapperError(result['Message'])
    except Exception as executionError:
        if str(executionError) == '0':
            # An exception whose text is '0' is treated as a clean exit
            gLogger.verbose('JobWrapper exited with status=0 after execution')
        else:
            gLogger.exception('Job failed in execution phase')
            gJobReport.setJobParameter('Error Message', str(executionError), sendFlag=False)
            gJobReport.setJobStatus('Failed', 'Exception During Execution', sendFlag=False)
            job.sendFailoverRequest('Failed', 'Exception During Execution')
            return 1

    if 'OutputSandbox' in jobDescription or 'OutputData' in jobDescription:
        try:
            result = job.processJobOutputs(arguments)
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception as outputError:
            gLogger.exception('JobWrapper failed to process output files')
            gJobReport.setJobParameter('Error Message', str(outputError), sendFlag=False)
            gJobReport.setJobStatus('Failed', 'Uploading Job Outputs', sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
    else:
        gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

    try:
        # Failed jobs will return 1 / successful jobs will return 0
        return job.finalize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the finalization phase')
        return 2
def execute(self):
    """The JobAgent execution method.

    One cycle visits every queue of ``self.queueDict`` in random order and,
    for each queue, keeps matching and submitting jobs until the matcher
    returns a failure or a step of the submission fails.

    The cycle is aborted (the current result is returned as is) when the
    local computing element check returns a non-OK result or a non-empty
    value, or when the pilot proxy cannot be obtained. Problems specific to
    one queue only increment ``self.failedQueues[queueName]`` and move on to
    the next queue.

    :return: the result that aborted the cycle, otherwise
             ``S_OK("Push Job Agent cycle complete")``
    """
    self.log.verbose("Job Agent execution loop")

    # Shuffle the queues so that they are not always visited in the same order
    queueDictItems = list(self.queueDict.items())
    random.shuffle(queueDictItems)

    # Check that there is enough slots locally
    # NOTE(review): a truthy "Value" is treated as "stop here" - presumably it
    # carries a message saying no slot is free; confirm in _checkCEAvailability
    result = self._checkCEAvailability(self.computingElement)
    if not result["OK"] or result["Value"]:
        return result

    for queueName, queueDictionary in queueDictItems:
        # Make sure there is no problem with the queue before trying to submit
        if not self._allowedToSubmit(queueName):
            continue

        # Get a working proxy
        ce = queueDictionary["CE"]
        # Requested proxy validity: 3 days, in seconds
        cpuTime = 86400 * 3
        self.log.verbose("Getting pilot proxy", "for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result["OK"]:
            return result
        proxy = result["Value"]
        result = proxy.getRemainingSecs()  # pylint: disable=no-member
        if not result["OK"]:
            return result
        lifetime_secs = result["Value"]
        ce.setProxy(proxy, lifetime_secs)

        # Check that there is enough slots in the remote CE to match a job
        result = self._checkCEAvailability(ce)
        if not result["OK"] or result["Value"]:
            self.failedQueues[queueName] += 1
            continue

        # Get environment details and enhance them
        result = self._getCEDict(ce)
        if not result["OK"]:
            self.failedQueues[queueName] += 1
            continue
        ceDictList = result["Value"]

        for ceDict in ceDictList:
            # Information about number of processors might not be returned in CE.getCEStatus()
            ceDict["NumberOfProcessors"] = ce.ceParameters.get("NumberOfProcessors")
            self._setCEDict(ceDict)

        # Update the configuration with the names of the Site, CE and queue to target
        # This is used in the next stages
        self._updateConfiguration("Site", queueDictionary["Site"])
        self._updateConfiguration("GridCE", queueDictionary["CEName"])
        self._updateConfiguration("CEQueue", queueDictionary["QueueName"])
        self._updateConfiguration("RemoteExecution", True)

        # Try to match a job
        jobRequest = self._matchAJob(ceDictList)
        # Each iteration handles one matched job; the loop ends on "break"
        # (failure for this queue) or when the next match request is not OK
        while jobRequest["OK"]:
            # Check matcher information returned
            matcherParams = ["JDL", "DN", "Group"]
            matcherInfo = jobRequest["Value"]
            jobID = matcherInfo["JobID"]
            jobReport = JobReport(jobID, "PushJobAgent@%s" % self.siteName)
            result = self._checkMatcherInfo(matcherInfo, matcherParams, jobReport)
            if not result["OK"]:
                self.failedQueues[queueName] += 1
                break

            jobJDL = matcherInfo["JDL"]
            jobGroup = matcherInfo["Group"]
            ownerDN = matcherInfo["DN"]
            # From here on ceDict is the one returned by the matcher, not the
            # loop variable used above
            ceDict = matcherInfo["CEDict"]
            matchTime = matcherInfo["matchTime"]

            # Everything returned by the matcher besides the mandatory parameters
            optimizerParams = {}
            for key in matcherInfo:
                if key not in matcherParams:
                    optimizerParams[key] = matcherInfo[key]

            # Get JDL parameters
            parameters = self._getJDLParameters(jobJDL)
            if not parameters["OK"]:
                jobReport.setJobStatus(status=JobStatus.FAILED, minorStatus="Could Not Extract JDL Parameters")
                self.log.warn("Could Not Extract JDL Parameters", parameters["Message"])
                self.failedQueues[queueName] += 1
                break

            params = parameters["Value"]
            result = self._extractValuesFromJobParams(params, jobReport)
            if not result["OK"]:
                self.failedQueues[queueName] += 1
                break
            submissionParams = result["Value"]
            # The job ID extracted from the job parameters replaces the matcher one
            jobID = submissionParams["jobID"]
            jobType = submissionParams["jobType"]

            self.log.verbose("Job request successful: \n", jobRequest["Value"])
            self.log.info(
                "Received", "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" % (jobID, jobType, ownerDN, jobGroup)
            )
            try:
                # Both reports are only stored (sendFlag=False); they are sent
                # by the jobReport.commit() call further down
                jobReport.setJobParameter(par_name="MatcherServiceTime", par_value=str(matchTime), sendFlag=False)
                jobReport.setJobStatus(
                    status=JobStatus.MATCHED, minorStatus="Job Received by Agent", sendFlag=False
                )

                # Setup proxy
                result_setupProxy = self._setupProxy(ownerDN, jobGroup)
                if not result_setupProxy["OK"]:
                    result = self._rescheduleFailedJob(jobID, result_setupProxy["Message"])
                    self.failedQueues[queueName] += 1
                    break
                proxyChain = result_setupProxy.get("Value")

                # Check software and install them if required
                software = self._checkInstallSoftware(jobID, params, ceDict, jobReport)
                if not software["OK"]:
                    self.log.error("Failed to install software for job", "%s" % (jobID))
                    errorMsg = software["Message"]
                    if not errorMsg:
                        errorMsg = "Failed software installation"
                    result = self._rescheduleFailedJob(jobID, errorMsg)
                    self.failedQueues[queueName] += 1
                    break

                # Submit the job to the CE
                self.log.debug("Before self._submitJob() (%sCE)" % (self.ceName))
                result_submitJob = self._submitJob(
                    jobID=jobID,
                    jobParams=params,
                    resourceParams=ceDict,
                    optimizerParams=optimizerParams,
                    proxyChain=proxyChain,
                    jobReport=jobReport,
                    processors=submissionParams["processors"],
                    wholeNode=submissionParams["wholeNode"],
                    maxNumberOfProcessors=submissionParams["maxNumberOfProcessors"],
                    mpTag=submissionParams["mpTag"],
                )

                # Committing the JobReport before evaluating the result of job submission
                res = jobReport.commit()
                if not res["OK"]:
                    # The commit failed: wrap the pending report in a failover
                    # Request so that it can be sent later
                    resFD = jobReport.generateForwardDISET()
                    if not resFD["OK"]:
                        self.log.error("Error generating ForwardDISET operation", resFD["Message"])
                    elif resFD["Value"]:
                        # Here we create the Request.
                        op = resFD["Value"]
                        request = Request()
                        requestName = "jobAgent_%s" % jobID
                        request.RequestName = requestName.replace('"', "")
                        request.JobID = jobID
                        request.SourceComponent = "JobAgent_%s" % jobID
                        request.addOperation(op)
                        # This might fail, but only a message would be printed.
                        self._sendFailoverRequest(request)

                if not result_submitJob["OK"]:
                    self.log.error("Error during submission", result_submitJob["Message"])
                    self.failedQueues[queueName] += 1
                    break
                elif "PayloadFailed" in result_submitJob:
                    # Do not keep running and do not overwrite the Payload error
                    message = "Payload execution failed with error code %s" % result_submitJob["PayloadFailed"]
                    self.log.info(message)

                self.log.debug("After %sCE submitJob()" % (self.ceName))

                # Check that there is enough slots locally
                result = self._checkCEAvailability(self.computingElement)
                if not result["OK"] or result["Value"]:
                    return result

                # Check that there is enough slots in the remote CE to match a new job
                result = self._checkCEAvailability(ce)
                if not result["OK"] or result["Value"]:
                    self.failedQueues[queueName] += 1
                    break

                # Try to match a new job
                jobRequest = self._matchAJob(ceDictList)
            except Exception as subExcept:  # pylint: disable=broad-except
                self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
                result = self._rescheduleFailedJob(jobID, "Job processing failed with exception")
                self.failedQueues[queueName] += 1
                break

        # Reached both when the loop ended on a failed match request and when
        # the very first request for this queue failed
        if not jobRequest["OK"]:
            self._checkMatchingIssues(jobRequest)
            self.failedQueues[queueName] += 1
            continue

    return S_OK("Push Job Agent cycle complete")
def execute(self):
    """Run one JobAgent cycle: check the slot, match a job and submit it.

    Stages, stopping at the first that fails:

    1. When a job was already run (``self.jobCount``), check the filling-mode
       settings and publish the CPU time left to the computing element.
    2. Check that the computing element is available and get its description.
    3. Request a job from the Matcher; failed requests are counted and stop
       the agent once ``self.stopAfterFailedMatches`` is exceeded.
    4. Validate the Matcher answer and the JDL parameters, resolving the
       system configuration from ``/LocalSite/Architecture`` when the job
       does not define one or sets it to "ANY".
    5. Set up the proxy, install software and submit the job.
    6. Recompute the CPU time left and report the scaled CPU time consumed.

    :return: the value of ``self.__finish(...)`` when the agent has to stop,
             ``S_ERROR`` on a pilot version mismatch, the failed result of
             ``getDescription()``, otherwise ``S_OK`` with a short message.
    """
    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info("Attempting to check CPU time left for filling mode")
        if self.fillingMode:
            if self.timeLeftError:
                self.log.warn(self.timeLeftError)
                return self.__finish(self.timeLeftError)
            self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
            # Need to update the Configuration so that the new value is published in the next matching request
            result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
            if not result["OK"]:
                return self.__finish(result["Message"])
        else:
            return self.__finish("Filling Mode is Disabled")

    self.log.verbose("Job Agent execution loop")
    available = self.computingElement.available()
    if not available["OK"] or not available["Value"]:
        self.log.info("Resource is not available")
        self.log.info(available["Message"])
        return self.__finish("CE Not Available")

    self.log.info(available["Message"])

    result = self.computingElement.getDescription()
    if not result["OK"]:
        return result
    ceDict = result["Value"]

    # Add pilot information
    gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
    if gridCE != "Unknown":
        ceDict["GridCE"] = gridCE
    if "PilotReference" not in ceDict:
        ceDict["PilotReference"] = str(self.pilotReference)
    ceDict["PilotBenchmark"] = self.cpuFactor
    ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict("/AgentJobRequirements")
    if result["OK"]:
        requirementsDict = result["Value"]
        ceDict.update(requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info("MatcherTime = %.2f (s)" % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

    if not jobRequest["OK"]:
        if re.search("No match found", jobRequest["Message"]):
            self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
            return S_OK(jobRequest["Message"])
        elif jobRequest["Message"].find("seconds timeout") != -1:
            self.log.error(jobRequest["Message"])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
            return S_OK(jobRequest["Message"])
        elif jobRequest["Message"].find("Pilot version does not match") != -1:
            # A version mismatch is not counted as a failed match: report it
            self.log.error(jobRequest["Message"])
            return S_ERROR(jobRequest["Message"])
        else:
            self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
            return S_OK(jobRequest["Message"])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest["Value"]
    jobID = matcherInfo["JobID"]
    self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
    matcherParams = ["JDL", "DN", "Group"]
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
            return self.__finish("Matcher Failed")
        elif not matcherInfo[param]:
            self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
            return self.__finish("Matcher Failed")
        else:
            self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

    jobJDL = matcherInfo["JDL"]
    jobGroup = matcherInfo["Group"]
    ownerDN = matcherInfo["DN"]

    # Everything the Matcher returned beyond the mandatory parameters
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters["OK"]:
        self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
        self.log.warn(parameters["Message"])
        return self.__finish("JDL Problem")

    params = parameters["Value"]
    if "JobID" not in params:
        msg = "Job has not JobID defined in JDL parameters"
        self.__report(jobID, "Failed", msg)
        self.log.warn(msg)
        return self.__finish("JDL Problem")
    else:
        jobID = params["JobID"]

    if "JobType" not in params:
        self.log.warn("Job has no JobType defined in JDL parameters")
        jobType = "Unknown"
    else:
        jobType = params["JobType"]

    if "SystemConfig" not in params:
        self.log.warn("Job has no system configuration defined in JDL parameters")
        systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
        self.log.info(
            "Setting system config to /LocalSite/Architecture = %s since it was not specified" % systemConfig
        )
        if not systemConfig:
            self.log.warn("/LocalSite/Architecture is not defined")
        params["SystemConfig"] = systemConfig
    else:
        systemConfig = params["SystemConfig"]
        if systemConfig.lower() == "any":
            systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
            self.log.info(
                "Setting SystemConfig = /LocalSite/Architecture =",
                '"%s" since it was set to "ANY" in the job description' % systemConfig,
            )
            if not systemConfig:
                self.log.warn("/LocalSite/Architecture is not defined")
            params["SystemConfig"] = systemConfig

    if "CPUTime" not in params:
        self.log.warn("Job has no CPU requirement defined in JDL parameters")

    self.log.verbose("Job request successful: \n %s" % (jobRequest["Value"]))
    self.log.info("Received JobID=%s, JobType=%s, SystemConfig=%s" % (jobID, jobType, systemConfig))
    self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
        jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)
        if self.gridCEQueue:
            jobReport.setJobParameter("GridCEQueue", self.gridCEQueue, sendFlag=False)

        if "BOINC_JOB_ID" in os.environ:
            # Report BOINC environment
            for p in ["BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"]:
                jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

        jobReport.setJobStatus("Matched", "Job Received by Agent")
        if not self.pilotInfoReportedFlag:
            self.__reportPilotInfo(jobID)
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result["OK"]:
            return self.__rescheduleFailedJob(jobID, result["Message"], params, self.stopOnApplicationFailure)
        # Always bind proxyChain: an OK result without a value used to leave
        # it undefined, and the NameError raised at submission time was
        # swallowed by the broad except below.
        proxyChain = result.get("Value") or None

        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software["OK"]:
            self.log.error("Failed to install software for job %s" % (jobID))
            errorMsg = software["Message"]
            if not errorMsg:
                errorMsg = "Failed software installation"
            return self.__rescheduleFailedJob(jobID, errorMsg, params, self.stopOnApplicationFailure)

        self.log.verbose("Before %sCE submitJob()" % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, jobJDL, proxyChain)
        if not submission["OK"]:
            self.__report(jobID, "Failed", submission["Message"])
            return self.__finish(submission["Message"])
        elif "PayloadFailed" in submission:
            # Do not keep running and do not overwrite the Payload error
            return self.__finish(
                "Payload execution failed with error code %s" % submission["PayloadFailed"],
                self.stopOnApplicationFailure,
            )

        self.log.verbose("After %sCE submitJob()" % (self.ceName))
    except Exception:
        self.log.exception()
        return self.__rescheduleFailedJob(
            jobID, "Job processing failed with exception", params, self.stopOnApplicationFailure
        )

    # CPU consumed since the agent started: user + system time of the process
    # and of its children (the elapsed wall-clock entry is left out)
    consumedTimes = [now - initial for now, initial in zip(os.times(), self.initTimes)]
    utime, stime, cutime, cstime = consumedTimes[:4]
    cpuTime = utime + stime + cutime + cstime

    result = self.timeLeftUtil.getTimeLeft(cpuTime)
    if result["OK"]:
        self.timeLeft = result["Value"]
    else:
        if result["Message"] != "Current batch system is not supported":
            self.timeLeftError = result["Message"]
        else:
            if self.cpuFactor:
                # if the batch system is not defined used the CPUNormalizationFactor
                # defined locally
                self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()["Value"]
    # Report only what was consumed since the previous cycle
    self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime

    return S_OK("Job Agent cycle complete")
def execute(self):
    """ Main execution function.

    Resolves the module inputs, updates the status of the input files in the
    file report according to the workflow and step status, then gathers the
    job report, file report and accounting information into ``self.request``.
    A non-empty request is written to
    ``<productionID>_<prodJobID>_request.xml`` in the current directory.

    :return: the failed result of ``resolveInputVariables()``; an S_OK
             structure when the module is disabled by the control flag;
             otherwise whatever ``self.finalize()`` returns.
    """
    self.log.info('Initializing %s' % self.version)
    result = self.resolveInputVariables()
    if not result['OK']:
        self.log.error(result['Message'])
        return result

    if not self.fileReport:
        self.fileReport = FileReport('Transformation/TransformationManager')

    # Any declared input file without a status yet starts out as "Unused".
    if self.InputData:
        inputFiles = self.fileReport.getFiles()
        for lfn in self.InputData:
            if lfn not in inputFiles:
                self.log.verbose('No status populated for input data %s, setting to "Unused"' % lfn)
                self.fileReport.setFileStatus(int(self.productionID), lfn, 'Unused')

    if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
        self.log.info('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'],
                                                                  self.stepStatus['OK']))
        inputFiles = self.fileReport.getFiles()
        for lfn in inputFiles:
            # Files already flagged as "ApplicationCrash" keep that status.
            if inputFiles[lfn] != 'ApplicationCrash':
                self.log.info('Forcing status to "Unused" due to workflow failure for: %s' % (lfn))
                self.fileReport.setFileStatus(int(self.productionID), lfn, 'Unused')
    else:
        inputFiles = self.fileReport.getFiles()
        if inputFiles:
            self.log.info('Workflow status OK, setting input file status to Processed')
        for lfn in inputFiles:
            self.log.info('Setting status to "Processed" for: %s' % (lfn))
            self.fileReport.setFileStatus(int(self.productionID), lfn, 'Processed')

    result = self.fileReport.commit()
    if not result['OK']:
        self.log.error('Failed to report file status to ProductionDB, request will be generated',
                       result['Message'])
    else:
        self.log.info('Status of files have been properly updated in the ProcessingDB')

    # Must ensure that the local job report instance is used to report the final status
    # in case of failure and a subsequent failover operation
    if self.workflowStatus['OK'] and self.stepStatus['OK']:
        if not self.jobReport:
            self.jobReport = JobReport(int(self.jobID))
        jobStatus = self.jobReport.setApplicationStatus('Job Finished Successfully')
        if not jobStatus['OK']:
            self.log.warn(jobStatus['Message'])

    # Retrieve the accumulated reporting request
    reportRequest = None
    if self.jobReport:
        result = self.jobReport.generateRequest()
        if not result['OK']:
            self.log.warn('Could not generate request for job report with result:\n%s' % (result))
        else:
            reportRequest = result['Value']
    if reportRequest:
        self.log.info('Populating request with job report information')
        self.request.update(reportRequest)

    fileReportRequest = None
    if self.fileReport:
        result = self.fileReport.generateRequest()
        if not result['OK']:
            self.log.warn('Could not generate request for file report with result:\n%s' % (result))
        else:
            fileReportRequest = result['Value']
    if fileReportRequest:
        self.log.info('Populating request with file report information')
        self.request.update(fileReportRequest)

    # Only a failed accounting commit is turned into a deferred request.
    accountingReport = self.workflow_commons.get('AccountingReport')
    if accountingReport:
        result = accountingReport.commit()
        if not result['OK']:
            self.log.info('Populating request with accounting report information')
            self.request.setDISETRequest(result['rpcStub'])

    if self.request.isEmpty()['Value']:
        self.log.info('Request is empty, nothing to do.')
        return self.finalize()

    request_string = self.request.toXML()['Value']
    self.log.debug(request_string)
    # Write out the request string; the context manager closes the file even
    # if the write itself fails.
    fname = '%s_%s_request.xml' % (self.productionID, self.prodJobID)
    with open(fname, 'w') as xmlfile:
        xmlfile.write(request_string)
    self.log.info('Creating failover request for deferred operations for job %s:' % self.jobID)
    result = self.request.getDigest()
    if result['OK']:
        self.log.info(result['Value'])

    if not self.enable:
        self.log.info('Module is disabled by control flag')
        return S_OK('Module is disabled by control flag')

    return self.finalize()
class FailoverRequest(ModuleBase):
    """ Handle the failover requests issued by previous steps. Used in production.

    The module collects the job report, file report and accounting
    information accumulated by earlier workflow modules into one request
    object and writes it out as an XML file.
    """

    #############################################################################
    def __init__(self):
        """Module initialization.
        """
        super(FailoverRequest, self).__init__()
        self.version = __RCSID__
        self.log = gLogger.getSubLogger("FailoverRequest")
        # Internal parameters
        self.enable = True
        self.jobID = ''
        self.productionID = None
        self.prodJobID = None
        # Workflow parameters
        self.jobReport = None
        self.fileReport = None
        self.request = None

    #############################################################################
    def applicationSpecificInputs(self):
        """ By convention the module input parameters are resolved here.

        Reads the WMS job ID from the ``JOBID`` environment variable, the
        ``Enable`` flag from the step parameters and the report / request
        objects and production identifiers from the workflow parameters.

        :return: S_OK structure
        """
        self.log.debug(self.workflow_commons)
        self.log.debug(self.step_commons)

        if 'JOBID' in os.environ:
            self.jobID = os.environ['JOBID']
            self.log.verbose('Found WMS JobID = %s' % self.jobID)
        else:
            self.log.info('No WMS JobID found, disabling module via control flag')
            self.enable = False

        if 'Enable' in self.step_commons:
            self.enable = self.step_commons['Enable']
            # Anything that is not a real boolean disables the module.
            if not isinstance(self.enable, bool):
                self.log.warn('Enable flag set to non-boolean value %s, setting to False' % self.enable)
                self.enable = False

        # Earlier modules will have populated the report objects
        if 'JobReport' in self.workflow_commons:
            self.jobReport = self.workflow_commons['JobReport']

        if 'FileReport' in self.workflow_commons:
            self.fileReport = self.workflow_commons['FileReport']

        if self.InputData:
            # A non-list value is treated as a ';'-separated string of LFNs.
            if not isinstance(self.InputData, list):
                self.InputData = self.InputData.split(';')
            self.InputData = [x.replace('LFN:', '') for x in self.InputData]

        if 'Request' in self.workflow_commons:
            self.request = self.workflow_commons['Request']
        if not self.request:
            self.request = RequestContainer()
            self.request.setRequestName('job_%s_request.xml' % self.jobID)
            self.request.setJobID(self.jobID)
            self.request.setSourceComponent("Job_%s" % self.jobID)

        if 'PRODUCTION_ID' in self.workflow_commons:
            self.productionID = self.workflow_commons['PRODUCTION_ID']

        if 'JOB_ID' in self.workflow_commons:
            self.prodJobID = self.workflow_commons['JOB_ID']

        return S_OK('Parameters resolved')

    #############################################################################
    def execute(self):
        """ Main execution function.

        Resolves the module inputs, updates the status of the input files in
        the file report according to the workflow and step status, then
        gathers the job report, file report and accounting information into
        ``self.request``. A non-empty request is written to
        ``<productionID>_<prodJobID>_request.xml`` in the current directory.

        :return: the failed result of ``resolveInputVariables()``; an S_OK
                 structure when the module is disabled by the control flag;
                 otherwise the result of ``self.finalize()``.
        """
        self.log.info('Initializing %s' % self.version)
        result = self.resolveInputVariables()
        if not result['OK']:
            self.log.error(result['Message'])
            return result

        if not self.fileReport:
            self.fileReport = FileReport('Transformation/TransformationManager')

        # Any declared input file without a status yet starts out as "Unused".
        if self.InputData:
            inputFiles = self.fileReport.getFiles()
            for lfn in self.InputData:
                if lfn not in inputFiles:
                    self.log.verbose('No status populated for input data %s, setting to "Unused"' % lfn)
                    self.fileReport.setFileStatus(int(self.productionID), lfn, 'Unused')

        if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
            self.log.info('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'],
                                                                      self.stepStatus['OK']))
            inputFiles = self.fileReport.getFiles()
            for lfn in inputFiles:
                # Files already flagged as "ApplicationCrash" keep that status.
                if inputFiles[lfn] != 'ApplicationCrash':
                    self.log.info('Forcing status to "Unused" due to workflow failure for: %s' % (lfn))
                    self.fileReport.setFileStatus(int(self.productionID), lfn, 'Unused')
        else:
            inputFiles = self.fileReport.getFiles()
            if inputFiles:
                self.log.info('Workflow status OK, setting input file status to Processed')
            for lfn in inputFiles:
                self.log.info('Setting status to "Processed" for: %s' % (lfn))
                self.fileReport.setFileStatus(int(self.productionID), lfn, 'Processed')

        result = self.fileReport.commit()
        if not result['OK']:
            self.log.error('Failed to report file status to ProductionDB, request will be generated',
                           result['Message'])
        else:
            self.log.info('Status of files have been properly updated in the ProcessingDB')

        # Must ensure that the local job report instance is used to report the final status
        # in case of failure and a subsequent failover operation
        if self.workflowStatus['OK'] and self.stepStatus['OK']:
            if not self.jobReport:
                # NOTE(review): self.jobID stays '' when JOBID is not in the
                # environment, in which case int() raises here - confirm a
                # JobReport is always supplied by the workflow in that case.
                self.jobReport = JobReport(int(self.jobID))
            jobStatus = self.jobReport.setApplicationStatus('Job Finished Successfully')
            if not jobStatus['OK']:
                self.log.warn(jobStatus['Message'])

        # Retrieve the accumulated reporting request
        reportRequest = None
        if self.jobReport:
            result = self.jobReport.generateRequest()
            if not result['OK']:
                self.log.warn('Could not generate request for job report with result:\n%s' % (result))
            else:
                reportRequest = result['Value']
        if reportRequest:
            self.log.info('Populating request with job report information')
            self.request.update(reportRequest)

        fileReportRequest = None
        if self.fileReport:
            result = self.fileReport.generateRequest()
            if not result['OK']:
                self.log.warn('Could not generate request for file report with result:\n%s' % (result))
            else:
                fileReportRequest = result['Value']
        if fileReportRequest:
            self.log.info('Populating request with file report information')
            self.request.update(fileReportRequest)

        # Only a failed accounting commit is turned into a deferred request.
        accountingReport = self.workflow_commons.get('AccountingReport')
        if accountingReport:
            result = accountingReport.commit()
            if not result['OK']:
                self.log.info('Populating request with accounting report information')
                self.request.setDISETRequest(result['rpcStub'])

        if self.request.isEmpty()['Value']:
            self.log.info('Request is empty, nothing to do.')
            return self.finalize()

        request_string = self.request.toXML()['Value']
        self.log.debug(request_string)
        # Write out the request string; the context manager closes the file
        # even if the write itself fails.
        fname = '%s_%s_request.xml' % (self.productionID, self.prodJobID)
        with open(fname, 'w') as xmlfile:
            xmlfile.write(request_string)
        self.log.info('Creating failover request for deferred operations for job %s:' % self.jobID)
        result = self.request.getDigest()
        if result['OK']:
            self.log.info(result['Value'])

        if not self.enable:
            self.log.info('Module is disabled by control flag')
            return S_OK('Module is disabled by control flag')

        return self.finalize()

    #############################################################################
    def finalize(self):
        """ Finalize and report correct status for the workflow based on the workflow
            or step status.

        :return: S_ERROR structure when the workflow or step status is not
                 OK, S_OK structure otherwise.
        """
        self.log.verbose('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'],
                                                                     self.stepStatus['OK']))
        if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
            self.log.warn('Workflow status is not ok, will not overwrite status')
            self.log.info('Workflow failed, end of FailoverRequest module execution.')
            return S_ERROR('Workflow failed, FailoverRequest module completed')

        self.log.info('Workflow successful, end of FailoverRequest module execution.')
        return S_OK('FailoverRequest module completed')

#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#
def main():
    """Run the EvnDisp executable over every DST file of a parametric job.

    Command-line layout as read by this function:

    * ``sys.argv[3]``  - evndisplay software version
    * ``sys.argv[5]``  - name of the executable to run
    * ``sys.argv[-1]`` - string containing ``ParametricParameters={...}``
      with the comma-separated list of DST file LFNs

    For each DST file the executable is run with the extra arguments found
    in ``evndisp.par``, the ROOT outputs in ``outdir`` are moved to the
    current directory under a job-specific name, the log file is checked by
    a generated ``check_log.sh`` script, and the DST file is removed. Any
    failing step terminates the process through ``DIRAC.exit(-1)``.
    """
    from DIRAC.Core.Base import Script
    Script.initialize()

    DIRAC.gLogger.notice('Platform is:')
    os.system('dirac-platform')

    from DIRAC.DataManagementSystem.Client.DataManager import DataManager
    from CTADIRAC.Core.Workflow.Modules.EvnDispApp import EvnDispApp
    from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwareEnviron
    from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import workingArea
    from DIRAC.Core.Utilities.Subprocess import systemCall
    from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

    jobID = int(os.environ['JOBID'])
    jobReport = JobReport(jobID)

    version = sys.argv[3]
    DIRAC.gLogger.notice('Version:', version)

    EvnDispPack = os.path.join('evndisplay', version, 'evndisplay')

    for package in [EvnDispPack]:
        DIRAC.gLogger.notice('Checking:', package)
        if checkSoftwarePackage(package, sharedArea())['OK']:
            DIRAC.gLogger.notice('Package found in Shared Area:', package)
            installSoftwareEnviron(package, sharedArea())
            continue
        # Not in the shared area: install into the working area, and only
        # carry on when that installation actually succeeded.
        if installSoftwarePackage(package, workingArea())['OK']:
            DIRAC.gLogger.notice('Package found in workingArea:', package)
            continue
        DIRAC.gLogger.error('Check Failed for software package:', package)
        DIRAC.gLogger.error('Software package not available')
        DIRAC.exit(-1)

    ed = EvnDispApp()
    ed.setSoftwarePackage(EvnDispPack)

    # Extract the text between "ParametricParameters={" and "}" and turn the
    # comma-separated LFNs into a whitespace-separated list.
    dstFileLFNList = sys.argv[-1].split('ParametricParameters={')[1].split('}')[0].replace(',', ' ')

    executable = sys.argv[5]

    for i, lfn in enumerate(dstFileLFNList.split(), 1):
        dstfile = os.path.basename(lfn)

        ###### execute evndisplay stage1 ###############
        logfileName = executable + '_' + str(i) + '.log'
        args = ['-sourcefile', dstfile, '-outputdirectory', 'outdir']

        # add other arguments for evndisp specified by user ######
        with open('evndisp.par', 'r') as parfile:
            for line in parfile:
                args.extend(line.split())

        execute_module(ed, executable, args)

        for name in glob.glob('outdir/*.root'):
            evndispOutFile = name.split('.root')[0] + '_' + str(jobID) + '_evndisp.root'
            cmd = 'mv ' + name + ' ' + os.path.basename(evndispOutFile)
            if os.system(cmd):
                DIRAC.exit(-1)

        ########### quality check on Log #############################################
        cmd = 'mv ' + executable + '.log' + ' ' + logfileName
        if os.system(cmd):
            DIRAC.exit(-1)

        # The generated script fails when the log mentions "error" (any case)
        # or lacks the final-check confirmation line.
        with open('check_log.sh', 'w') as fd:
            fd.write("""#! /bin/sh
if grep -i "error" %s; then
exit 1
fi
if grep "Final checks on result file (seems to be OK):" %s; then
exit 0
else
exit 1
fi
""" % (logfileName, logfileName))

        os.system('chmod u+x check_log.sh')
        cmd = './check_log.sh'
        DIRAC.gLogger.notice('Executing system call:', cmd)
        if os.system(cmd):
            jobReport.setApplicationStatus('EvnDisp Log Check Failed')
            DIRAC.exit(-1)
        ##################################################################
        ########### remove the dst file #############################################
        cmd = 'rm ' + dstfile
        if os.system(cmd):
            DIRAC.exit(-1)

    DIRAC.exit()
def execute(self):
    """The JobAgent execution method.

    One agent cycle: when a job has already been run, check the CPU time
    left for filling mode and publish it; check that the computing element
    is available; request a job from the matcher; validate the matcher
    answer and the JDL parameters; set up the proxy, install the software
    and submit the job; finally refresh the time-left and scaled CPU
    bookkeeping.

    :return: S_OK / S_ERROR structure, or the result of ``self.__finish`` /
             ``self.__rescheduleFailedJob`` when the cycle ends early.
    """
    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info("Attempting to check CPU time left for filling mode")
        if self.fillingMode:
            if self.timeLeftError:
                self.log.warn(self.timeLeftError)
                return self.__finish(self.timeLeftError)
            self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
            if self.timeLeft <= self.minimumTimeLeft:
                return self.__finish("No more time left")
            # Need to update the Configuration so that the new value is published in the next matching request
            result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
            if not result["OK"]:
                return self.__finish(result["Message"])

            # Update local configuration to be used by submitted job wrappers
            localCfg = CFG()
            if self.extraOptions:
                localConfigFile = os.path.join(".", self.extraOptions)
            else:
                localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
            localCfg.loadFromFile(localConfigFile)
            if not localCfg.isSection("/LocalSite"):
                localCfg.createNewSection("/LocalSite")
            localCfg.setOption("/LocalSite/CPUTimeLeft", self.timeLeft)
            localCfg.writeToFile(localConfigFile)
        else:
            return self.__finish("Filling Mode is Disabled")

    self.log.verbose("Job Agent execution loop")
    available = self.computingElement.available()
    if not available["OK"] or not available["Value"]:
        self.log.info("Resource is not available")
        self.log.info(available["Message"])
        return self.__finish("CE Not Available")

    self.log.info(available["Message"])

    result = self.computingElement.getDescription()
    if not result["OK"]:
        return result
    ceDict = result["Value"]

    # Add pilot information
    gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
    if gridCE != "Unknown":
        ceDict["GridCE"] = gridCE
    if "PilotReference" not in ceDict:
        ceDict["PilotReference"] = str(self.pilotReference)
    ceDict["PilotBenchmark"] = self.cpuFactor
    ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict("/AgentJobRequirements")
    if result["OK"]:
        requirementsDict = result["Value"]
        ceDict.update(requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info("MatcherTime = %.2f (s)" % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

    if not jobRequest["OK"]:
        if re.search("No match found", jobRequest["Message"]):
            self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
            return S_OK(jobRequest["Message"])
        elif jobRequest["Message"].find("seconds timeout") != -1:
            self.log.error("Timeout while requesting job", jobRequest["Message"])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
            return S_OK(jobRequest["Message"])
        elif jobRequest["Message"].find("Pilot version does not match") != -1:
            errorMsg = "Pilot version does not match the production version"
            self.log.error(errorMsg, jobRequest["Message"].replace(errorMsg, ""))
            return S_ERROR(jobRequest["Message"])
        else:
            self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
            return S_OK(jobRequest["Message"])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest["Value"]
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
    jobID = matcherInfo["JobID"]
    matcherParams = ["JDL", "DN", "Group"]
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
            return self.__finish("Matcher Failed")
        elif not matcherInfo[param]:
            self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
            return self.__finish("Matcher Failed")
        else:
            self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

    jobJDL = matcherInfo["JDL"]
    jobGroup = matcherInfo["Group"]
    ownerDN = matcherInfo["DN"]

    # Everything the matcher returned besides JDL / DN / Group is handed
    # over to the job submission as optimizer parameters.
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters["OK"]:
        self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
        self.log.warn(parameters["Message"])
        return self.__finish("JDL Problem")

    params = parameters["Value"]
    if "JobID" not in params:
        msg = "Job has not JobID defined in JDL parameters"
        self.__report(jobID, "Failed", msg)
        self.log.warn(msg)
        return self.__finish("JDL Problem")
    else:
        jobID = params["JobID"]

    if "JobType" not in params:
        self.log.warn("Job has no JobType defined in JDL parameters")
        jobType = "Unknown"
    else:
        jobType = params["JobType"]

    if "CPUTime" not in params:
        self.log.warn("Job has no CPU requirement defined in JDL parameters")

    if self.extraOptions:
        # A JDL without "Arguments" must not raise KeyError here: this code
        # runs outside the try block below, so the matched job would never
        # be rescheduled.
        params["Arguments"] = params.get("Arguments", "") + " " + self.extraOptions
        params["ExtraOptions"] = self.extraOptions

    self.log.verbose("Job request successful: \n", jobRequest["Value"])
    self.log.info("Received JobID=%s, JobType=%s" % (jobID, jobType))
    self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
        jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)

        if "BOINC_JOB_ID" in os.environ:
            # Report BOINC environment
            for p in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"):
                jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

        jobReport.setJobStatus("Matched", "Job Received by Agent")
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result["OK"]:
            return self.__rescheduleFailedJob(jobID, result["Message"], self.stopOnApplicationFailure)
        proxyChain = result.get("Value")

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software["OK"]:
            self.log.error("Failed to install software for job", "%s" % (jobID))
            errorMsg = software["Message"]
            if not errorMsg:
                errorMsg = "Failed software installation"
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.debug("Before %sCE submitJob()" % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
        if not submission["OK"]:
            self.__report(jobID, "Failed", submission["Message"])
            return self.__finish(submission["Message"])
        elif "PayloadFailed" in submission:
            # Do not keep running and do not overwrite the Payload error
            message = "Payload execution failed with error code %s" % submission["PayloadFailed"]
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            else:
                self.log.info(message)

        self.log.debug("After %sCE submitJob()" % (self.ceName))
    except Exception:
        self.log.exception()
        return self.__rescheduleFailedJob(
            jobID, "Job processing failed with exception", self.stopOnApplicationFailure
        )

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    result = self.timeLeftUtil.getTimeLeft(cpuTime)
    if result["OK"]:
        self.timeLeft = result["Value"]
    else:
        if result["Message"] != "Current batch system is not supported":
            self.timeLeftError = result["Message"]
        else:
            # if the batch system is not defined, use the process time and the CPU normalization defined locally
            self.timeLeft = self.__getCPUTimeLeft()

    scaledCPUTime = self.timeLeftUtil.getScaledCPU()
    self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime

    return S_OK("Job Agent cycle complete")
#!/usr/bin/env python
# Report an application status for the current DIRAC job. The status text is
# the command-line arguments joined with ", ".

import os
import sys

from DIRAC.Core.Base import Script
Script.initialize(ignoreErrors=True)

from DIRAC.Interfaces.API.Dirac import Dirac
from DIRAC.Interfaces.API.Job import Job
from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

# No default here: a default of '0' is a non-empty string, which made the
# "job ID not found" branch below unreachable when the variable was unset.
jobID = os.environ.get('DIRACJOBID', '')

if not jobID:
    print('DIRAC job ID not found')
    sys.exit(1)

jobReport = JobReport(jobID, 'JUNO_JobScript')
result = jobReport.setApplicationStatus(', '.join(sys.argv[1:]))
if not result['OK']:
    # Single-argument print() behaves the same under Python 2 and Python 3.
    print('Set application status error: %s' % result)
def execute(self):
    """The JobAgent execution method.

    One agent cycle: when a job has already been run, check the CPU time
    left for filling mode; check that the computing element is available;
    request a job from the matcher; validate the matcher answer and the JDL
    parameters (filling in the system configuration when missing or "ANY");
    set up the proxy, install the software and submit the job; finally
    refresh the time-left and scaled CPU bookkeeping.

    :return: S_OK / S_ERROR structure, or the result of ``self.__finish`` /
             ``self.__rescheduleFailedJob`` when the cycle ends early.
    """
    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if self.fillingMode:
            if self.timeLeftError:
                self.log.warn(self.timeLeftError)
                return self.__finish(self.timeLeftError)
            self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
            # Need to update the Configuration so that the new value is published in the next matching request
            result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
            if not result['OK']:
                return self.__finish(result['Message'])
        else:
            return self.__finish('Filling Mode is Disabled')

    self.log.verbose('Job Agent execution loop')
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
        self.log.info('Resource is not available')
        self.log.info(available['Message'])
        return self.__finish('CE Not Available')

    self.log.info(available['Message'])

    result = self.computingElement.getDescription()
    if not result['OK']:
        return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        if re.search('No work available', jobRequest['Message']):
            self.log.info('Job request OK: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return S_ERROR('Nothing to do')
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("seconds timeout") != -1:
            self.log.error(jobRequest['Message'])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return S_ERROR('Nothing to do')
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("Pilot version does not match") != -1:
            self.log.error(jobRequest['Message'])
            return S_ERROR(jobRequest['Message'])
        else:
            self.log.info('Failed to get jobs: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return S_ERROR('Nothing to do')
            return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the matcher returned besides JDL / DN / Group is handed
    # over to the job submission as optimizer parameters.
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn(parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    else:
        jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    # A missing or "ANY" system configuration falls back to the local
    # architecture from the configuration.
    if 'SystemConfig' not in params:
        self.log.warn('Job has no system configuration defined in JDL parameters')
        systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
        self.log.info('Setting system config to /LocalSite/Architecture = %s since it was not specified' % systemConfig)
        if not systemConfig:
            self.log.warn('/LocalSite/Architecture is not defined')
        params['SystemConfig'] = systemConfig
    else:
        systemConfig = params['SystemConfig']
        if systemConfig.lower() == 'any':
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            self.log.info('Setting SystemConfig = /LocalSite/Architecture =',
                          '"%s" since it was set to "ANY" in the job description' % systemConfig)
            if not systemConfig:
                self.log.warn('/LocalSite/Architecture is not defined')
            params['SystemConfig'] = systemConfig

    if 'MaxCPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')

    self.log.verbose('Job request successful: \n %s' % (jobRequest['Value']))
    self.log.info('Received JobID=%s, JobType=%s, SystemConfig=%s' % (jobID, jobType, systemConfig))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)
        if self.gridCEQueue:
            jobReport.setJobParameter('GridCEQueue', self.gridCEQueue, sendFlag=False)
        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        if not self.pilotInfoReportedFlag:
            self.__reportPilotInfo(jobID)
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
        # Always bind proxyChain: binding it only for a truthy 'Value' left
        # the name undefined at the __submitJob call below.
        proxyChain = result.get('Value')

        # Is this necessary at all?
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job %s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.verbose('Before %sCE submitJob()' % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, jobJDL, proxyChain)
        if not submission['OK']:
            self.__report(jobID, 'Failed', submission['Message'])
            return self.__finish(submission['Message'])
        elif 'PayloadFailed' in submission:
            # Do not keep running and do not overwrite the Payload error
            return self.__finish('Payload execution failed with error code %s' % submission['PayloadFailed'],
                                 self.stopOnApplicationFailure)

        self.log.verbose('After %sCE submitJob()' % (self.ceName))
    except Exception:
        self.log.exception()
        return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception',
                                          self.stopOnApplicationFailure)

    result = self.timeLeftUtil.getTimeLeft(0.0)
    if result['OK']:
        self.timeLeft = result['Value']
    else:
        if result['Message'] != 'Current batch system is not supported':
            self.timeLeftError = result['Message']
        else:
            if self.cpuFactor:
                # if the batch system is not defined used the CPUNormalizationFactor
                # defined locally
                self.timeLeft = self.__getCPUTimeLeft()

    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']
    self.__setJobParam(jobID, 'ScaledCPUTime', str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime

    return S_OK('Job Agent cycle complete')
def execute(arguments):
    """Prepare a job for execution by the JobWrapper.

    Exports the job ID as the ``JOBID`` environment variable, enters (and
    creates if needed) the requested working directory, initializes the
    JobWrapper, downloads the input sandbox and resolves the input data.
    Each failing step reschedules the job.

    :param dict arguments: job description; ``arguments['Job']['JobID']`` is
                           required, ``arguments['WorkingDirectory']`` and
                           the ``InputSandbox`` / ``InputData`` entries of
                           ``arguments['Job']`` are optional.
    :return: 1 when a step failed and the job was rescheduled; None when all
             steps shown here completed.
    """
    global gJobReport

    jobID = arguments['Job']['JobID']
    # Environment values must be strings, whatever type the job ID came in as.
    os.environ['JOBID'] = str(jobID)
    jobID = int(jobID)

    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception('JobWrapperTemplate could not create working directory')
                rescheduleFailedJob(jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    # Bound up front so the handler below can tell whether the JobWrapper
    # constructor itself failed.
    job = None
    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the initialization phase')
        rescheduleResult = rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport)
        if job is not None:
            job.sendJobAccounting(rescheduleResult, 'Job Wrapper Initialization')
        return 1

    if 'InputSandbox' in arguments['Job']:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID, 'Input Sandbox Download', gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if 'InputData' in arguments['Job']:
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except Exception:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
def main():
    """Run the HAP make_CTA_DST macro for one run and check the produced DST.

    Sequence: parse the command line, make sure the HESS/HAP software packages
    are available (installing them in the local area if needed), run
    make_CTA_DST.C, verify the dst_CTA_<run>.root file exists, then run the
    check_dst0.csh script, the CheckDST.C macro and the check_dst2.csh script.

    The option values (infile, tellist, version, RunNum and, optionally,
    nevent) and sendOutput are read from module scope; presumably they are
    set by the switch callbacks registered below.

    The function always terminates the process through DIRAC.exit(): -1 on
    failure, the default exit code on success or when a check script reports
    "no triggered events" (status 2).
    """
    from DIRAC.Core.Base import Script

    # make_CTA_DST options
    Script.registerSwitch("R:", "run_number=", "Run Number", setRunNumber)
    Script.registerSwitch("I:", "infile=", "Input file", setInfile)
    Script.registerSwitch("T:", "tellist=", "Tellist", setTellist)
    Script.registerSwitch("N:", "nevent=", "Nevent", setNevent)
    # other options
    Script.registerSwitch("V:", "version=", "HAP version", setVersion)

    Script.parseCommandLine(ignoreErrors=True)

    args = Script.getPositionalArgs()
    if len(args) < 1:
        Script.showHelp()

    if infile is None or tellist is None or version is None:
        Script.showHelp()
        # The job-wide jobReport is only created further down; referencing it
        # here raised UnboundLocalError, so a dedicated report is built instead.
        from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport
        JobReport(int(os.environ['JOBID'])).setApplicationStatus('Options badly specified')
        DIRAC.exit(-1)

    from CTADIRAC.Core.Workflow.Modules.HapRootMacro import HapRootMacro
    from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import getSoftwareEnviron
    from CTADIRAC.Core.Utilities.SoftwareInstallation import localArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
    from DIRAC.Core.Utilities.Subprocess import systemCall
    from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

    jobID = int(os.environ['JOBID'])
    jobReport = JobReport(jobID)

    HapPack = 'HAP/' + version + '/HAP'
    packs = ['HESS/v0.2/lib', 'HESS/v0.3/root', HapPack]

    for package in packs:
        DIRAC.gLogger.notice('Checking:', package)
        # NOTE(review): 'sharedArea' / 'localArea' are tested as objects, not
        # called, so these conditions look always true - confirm the intent.
        if sharedArea:
            if checkSoftwarePackage(package, sharedArea())['OK']:
                DIRAC.gLogger.notice('Package found in Shared Area:', package)
                continue
        if localArea:
            if checkSoftwarePackage(package, localArea())['OK']:
                DIRAC.gLogger.notice('Package found in Local Area:', package)
                continue
            if installSoftwarePackage(package, localArea())['OK']:
                continue
        DIRAC.gLogger.error('Check Failed for software package:', package)
        DIRAC.gLogger.error('Software package not available')
        DIRAC.exit(-1)

    # ---- run the make_CTA_DST macro ----
    hr = HapRootMacro()
    hr.setSoftwarePackage(HapPack)

    telconf = os.path.join(localArea(), 'HAP/%s/config/%s' % (version, tellist))
    # The macro arguments are passed as quoted string literals.
    infilestr = '"' + infile + '"'
    telconfstr = '"' + telconf + '"'
    args = [str(int(RunNum)), infilestr, telconfstr]

    try:
        # 'nevent' only exists at module level when the option was given.
        args.extend([nevent])
    except NameError:
        DIRAC.gLogger.info('nevent arg not used')

    DIRAC.gLogger.notice('make_CTA_DST macro Arguments:', args)
    hr.rootMacro = '/hapscripts/dst/make_CTA_DST.C+'
    hr.rootArguments = args
    DIRAC.gLogger.notice('Executing Hap make_CTA_DST macro')
    res = hr.execute()

    if not res['OK']:
        DIRAC.gLogger.error('Failed to execute make_CTA_DST macro')
        jobReport.setApplicationStatus('Failure during make_CTA_DST')
        DIRAC.exit(-1)

    # ---- check existence of the output file ----
    filedst = 'dst_CTA_%08d' % int(RunNum) + '.root'
    if not os.path.isfile(filedst):
        DIRAC.gLogger.error('dst file not found:', filedst)
        jobReport.setApplicationStatus('make_CTA_DST.C: DST file not created')
        DIRAC.exit(-1)

    # ---- DST check step0: check_dst0.csh ----
    DIRAC.gLogger.notice('Executing DST Check step0')

    ret = getSoftwareEnviron(HapPack)
    if not ret['OK']:
        error = ret['Message']
        DIRAC.gLogger.error(error, HapPack)
        DIRAC.exit(-1)

    hapEnviron = ret['Value']
    hessroot = hapEnviron['HESSROOT']
    check_script = hessroot + '/hapscripts/dst/check_dst0.csh'
    cmdTuple = [check_script]
    ret = systemCall(0, cmdTuple, sendOutput)

    if not ret['OK']:
        DIRAC.gLogger.error('Failed to execute DST Check step0')
        jobReport.setApplicationStatus('Check_dst0: Failed')
        DIRAC.exit(-1)

    status, stdout, stderr = ret['Value']
    if status == 1:
        jobReport.setApplicationStatus('Check_dst0: Big problem during the DST production')
        DIRAC.gLogger.error('DST Check step0 reports: Big problem during the DST production')
        DIRAC.exit(-1)
    if status == 2:
        # Not treated as an error: exit with the default code.
        jobReport.setApplicationStatus('Check_dst0: No triggered events')
        DIRAC.gLogger.notice('DST Check step0 reports: No triggered events')
        DIRAC.exit()

    # ---- DST check step1: CheckDST macro ----
    DIRAC.gLogger.notice('Executing DST check step1')
    hr.rootMacro = '/hapscripts/dst/CheckDST.C+'
    fileoutstr = '"' + filedst + '"'
    args = [fileoutstr]
    DIRAC.gLogger.notice('CheckDST macro Arguments:', args)
    hr.rootArguments = args
    DIRAC.gLogger.notice('Executing Hap CheckDST macro')
    res = hr.execute()

    if not res['OK']:
        DIRAC.gLogger.error('Failure during DST Check step1')
        jobReport.setApplicationStatus('Check_dst1: Failed')
        DIRAC.exit(-1)

    # ---- DST check step2: check_dst2.csh ----
    DIRAC.gLogger.notice('Executing DST Check step2')
    check_script = hessroot + '/hapscripts/dst/check_dst2.csh'
    cmdTuple = [check_script]
    ret = systemCall(0, cmdTuple, sendOutput)

    if not ret['OK']:
        DIRAC.gLogger.error('Failed to execute DST Check step2')
        jobReport.setApplicationStatus('Check_dst2: Failed')
        DIRAC.exit(-1)

    status, stdout, stderr = ret['Value']
    if status == 1:
        jobReport.setApplicationStatus('DST Check step2: Big problem during the DST production')
        DIRAC.gLogger.error('DST Check step2 reports: Big problem during the DST production')
        DIRAC.exit(-1)
    if status == 2:
        jobReport.setApplicationStatus('DST Check step2: No triggered events')
        DIRAC.gLogger.notice('DST Check step2 reports: No triggered events')
        DIRAC.exit()

    DIRAC.exit()
def main():
    """Run read_cta on a simtel file and cross-check the DST histograms.

    Sequence: install the corsika_simhessarray package for the version given
    in sys.argv[3], run read_cta with the arguments listed in 'read_cta.par'
    plus the DST/histogram output names derived from the input LFN (taken
    from the last command-line argument), then generate and run
    check_dst_histo.sh, which compares histogram entry counts.

    sendOutput and install_CorsikaSimtelPack are read from module scope.

    The function always terminates the process through DIRAC.exit(): -1 on
    failure, the default exit code on success.
    """
    from DIRAC.Core.Base import Script
    Script.initialize()

    DIRAC.gLogger.notice('Platform is:')
    os.system('dirac-platform')

    from CTADIRAC.Core.Workflow.Modules.Read_CtaApp import Read_CtaApp
    from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwareEnviron
    from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import workingArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import getSoftwareEnviron
    from DIRAC.Core.Utilities.Subprocess import systemCall
    from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

    jobID = int(os.environ['JOBID'])
    jobReport = JobReport(jobID)

    version = sys.argv[3]
    DIRAC.gLogger.notice('Version:', version)
    install_CorsikaSimtelPack(version)

    # ---- run read_cta ----
    rcta = Read_CtaApp()
    CorsikaSimtelPack = os.path.join('corsika_simhessarray', version, 'corsika_simhessarray')
    rcta.setSoftwarePackage(CorsikaSimtelPack)
    rcta.rctaExe = 'read_cta'

    # User-specified read_cta arguments: every whitespace-separated word of
    # the parameter file becomes one argument.
    with open('read_cta.par', 'r') as parFile:
        args = [word for line in parFile for word in line.split()]

    simtelFileLFN = sys.argv[-1].split('ParametricInputData=LFN:')[1]
    simtelFileName = os.path.basename(simtelFileLFN)
    dstFileName = simtelFileName.replace('simtel.gz', 'simtel-dst.gz')
    dstHistoFileName = simtelFileName.replace('simtel.gz', 'hdata-dst.gz')

    args.extend(['--dst-file', dstFileName,
                 '--histogram-file', dstHistoFileName,
                 simtelFileName])
    rcta.rctaArguments = args

    rctaReturnCode = rcta.execute()
    if rctaReturnCode != 0:
        DIRAC.gLogger.error('read_cta Application: Failed')
        jobReport.setApplicationStatus('read_cta Application: Failed')
        DIRAC.exit(-1)

    ret = getSoftwareEnviron(CorsikaSimtelPack)
    if not ret['OK']:
        error = ret['Message']
        DIRAC.gLogger.error(error, CorsikaSimtelPack)
        DIRAC.exit(-1)

    read_ctaEnviron = ret['Value']

    # ---- run dst quality checks ----
    # The generated script exits 1 when histogram 6 and 12001 disagree, or
    # when the event count from statio disagrees with histogram 12002.
    with open('check_dst_histo.sh', 'w') as fd:
        fd.write("""#! /bin/sh
dsthistfilename=%s
dstfile=%s
n6="$(list_histograms -h 6 ${dsthistfilename} | grep 'Histogram of type' | sed 's/.*bins, //' | sed 's/ entries.//')"
n12001="$(list_histograms -h 12001 ${dsthistfilename} | grep 'Histogram of type' | sed 's/.*bins, //' | sed 's/ entries.//')"
if [ $n6 -ne $n12001 ]; then
echo 'n6 found:' $n6
echo 'n12001 found:' $n12001
exit 1
else
echo 'n6 found:' $n6
echo 'n12001 found:' $n12001
fi
n12002="$(list_histograms -h 12002 ${dsthistfilename} | grep 'Histogram of type' | sed 's/.*bins, //' | sed 's/ entries.//')"
nev="$(statio ${dstfile} | egrep '^2010' | cut -f2)"
if [ -z "$nev" ]; then nev="0"; fi
if [ $nev -ne $n12002 ]; then
echo 'nev found:' $nev
echo 'n12002 found:' $n12002
exit 1
else
echo 'nev found:' $nev
echo 'n12002 found:' $n12002
fi
""" % (dstHistoFileName, dstFileName))

    os.system('chmod u+x check_dst_histo.sh')
    cmdTuple = ['./check_dst_histo.sh']
    DIRAC.gLogger.notice('Executing command tuple:', cmdTuple)
    ret = systemCall(0, cmdTuple, sendOutput, env=read_ctaEnviron)

    # 'Value' must only be read after 'OK' has been checked: unpacking it
    # first, as the code used to do, fails on a failed call.
    if not ret['OK']:
        DIRAC.gLogger.error('Failed to execute check_dst_histo.sh')
        DIRAC.gLogger.error('check_dst_histo.sh error is:', ret['Message'])
        DIRAC.exit(-1)

    checkHistoReturnCode, stdout, stderr = ret['Value']

    if checkHistoReturnCode != 0:
        DIRAC.gLogger.error('Failure during check_dst_histo.sh')
        DIRAC.gLogger.error('check_dst_histo.sh status is:', checkHistoReturnCode)
        jobReport.setApplicationStatus('Histo check Failed')
        DIRAC.exit(-1)

    DIRAC.exit()
def main():
    """Convert simulation output to HAP raw data, then produce and check a DST.

    Sequence: parse the command line, make sure the HESS/HAP packages are
    available, run eventio_cta to create the raw file, check it (Open_Raw.C
    macro + check_raw.csh), run make_CTA_DST.C, rename the DST file, then run
    check_dst0.csh, the CheckDST.C macro and check_dst2.csh.

    The option values (tellist, version, run_number, part_type and, when
    given, Nfirst_mcevt, Nlast_mcevt, pixelslices, nevent), build_infile and
    sendOutput are read from module scope; the options are presumably set by
    the switch callbacks registered below.

    The function always terminates the process through DIRAC.exit(): -1 on
    failure, the default exit code on success or when a check script reports
    "no triggered events" (status 2).
    """
    from DIRAC.Core.Base import Script

    # eventio_cta options
    Script.registerSwitch("T:", "tellist=", "Tellist", setTellist)
    Script.registerSwitch("F:", "Nfirst_mcevt=", "Nfirst_mcevt", setNfirst_mcevt)
    Script.registerSwitch("L:", "Nlast_mcevt=", "Nlast_mcevt", setNlast_mcevt)
    # Further eventio_cta options (num, limitmc, telidoffset) are not exposed.
    Script.registerSwitch("P:", "pixelslices=", "setPixelslices (true/false)", setPixelslices)
    Script.registerSwitch("p:", "run_number=", "Run Number (set automatically)", setRunNumber)
    # other options
    Script.registerSwitch("V:", "version=", "HAP version", setVersion)

    Script.parseCommandLine(ignoreErrors=True)

    args = Script.getPositionalArgs()
    if len(args) < 1:
        Script.showHelp()

    if tellist is None or version is None:
        Script.showHelp()
        # The job-wide jobReport is only created further down; referencing it
        # here raised UnboundLocalError, so a dedicated report is built instead.
        from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport
        JobReport(int(os.environ['JOBID'])).setApplicationStatus('Options badly specified')
        DIRAC.exit(-1)

    from CTADIRAC.Core.Workflow.Modules.HapApplication import HapApplication
    from CTADIRAC.Core.Workflow.Modules.HapRootMacro import HapRootMacro
    from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import getSoftwareEnviron
    from CTADIRAC.Core.Utilities.SoftwareInstallation import localArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
    from DIRAC.Core.Utilities.Subprocess import systemCall
    from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

    jobID = int(os.environ['JOBID'])
    jobReport = JobReport(jobID)

    HapPack = 'HAP/' + version + '/HAP'
    packs = ['HESS/v0.2/lib', 'HESS/v0.3/root', HapPack]

    for package in packs:
        DIRAC.gLogger.notice('Checking:', package)
        # NOTE(review): 'sharedArea' / 'localArea' are tested as objects, not
        # called, so these conditions look always true - confirm the intent.
        if sharedArea:
            if checkSoftwarePackage(package, sharedArea())['OK']:
                DIRAC.gLogger.notice('Package found in Shared Area:', package)
                continue
        if localArea:
            if checkSoftwarePackage(package, localArea())['OK']:
                DIRAC.gLogger.notice('Package found in Local Area:', package)
                continue
            if installSoftwarePackage(package, localArea())['OK']:
                continue
        DIRAC.gLogger.error('Check Failed for software package:', package)
        DIRAC.gLogger.error('Software package not available')
        DIRAC.exit(-1)

    # ---- raw data production: eventio_cta ----
    telconf = os.path.join(localArea(), 'HAP/%s/config/%s' % (version, tellist))

    ha = HapApplication()
    ha.setSoftwarePackage(HapPack)
    ha.hapExecutable = 'eventio_cta'

    fileout = 'raw_' + part_type + '_run' + run_number + '.root'
    infile = build_infile()
    ha.hapArguments = ['-file', infile, '-o', fileout, '-tellist', telconf]

    # The optional values only exist at module level when the corresponding
    # switch was given; NameError means "option not used".
    try:
        ha.hapArguments.extend(['-Nfirst_mcevt', Nfirst_mcevt, '-Nlast_mcevt', Nlast_mcevt])
    except NameError:
        DIRAC.gLogger.info('Nfirst_mcevt/Nlast_mcevt options are not used')

    try:
        if pixelslices == 'true':
            ha.hapArguments.extend(['-pixelslices'])
    except NameError:
        DIRAC.gLogger.info('pixelslices option is not used')

    DIRAC.gLogger.notice('Executing Hap Converter Application')
    res = ha.execute()

    if not res['OK']:
        DIRAC.gLogger.error('Failed to execute eventio_cta Application')
        jobReport.setApplicationStatus('eventio_cta: Failed')
        DIRAC.exit(-1)

    if not os.path.isfile(fileout):
        error = 'raw file was not created:'
        DIRAC.gLogger.error(error, fileout)
        jobReport.setApplicationStatus('eventio_cta: RawData not created')
        DIRAC.exit(-1)

    # ---- RAW check step0: Open_Raw macro ----
    hr = HapRootMacro()
    hr.setSoftwarePackage(HapPack)
    DIRAC.gLogger.notice('Executing RAW check step0')
    hr.rootMacro = '/hapscripts/dst/Open_Raw.C+'
    outfilestr = '"' + fileout + '"'
    args = [outfilestr]
    DIRAC.gLogger.notice('Open_Raw macro Arguments:', args)
    hr.rootArguments = args
    DIRAC.gLogger.notice('Executing Hap Open_Raw macro')
    res = hr.execute()

    if not res['OK']:
        DIRAC.gLogger.error('Open_Raw: Failed')
        DIRAC.exit(-1)

    # ---- RAW check step1: check_raw.csh ----
    DIRAC.gLogger.notice('Executing Raw Check step1')

    ret = getSoftwareEnviron(HapPack)
    if not ret['OK']:
        error = ret['Message']
        DIRAC.gLogger.error(error, HapPack)
        DIRAC.exit(-1)

    hapEnviron = ret['Value']
    hessroot = hapEnviron['HESSROOT']
    check_script = hessroot + '/hapscripts/dst/check_raw.csh'
    cmdTuple = [check_script]
    ret = systemCall(0, cmdTuple, sendOutput)

    if not ret['OK']:
        DIRAC.gLogger.error('Failed to execute RAW Check step1')
        jobReport.setApplicationStatus('Check_raw: Failed')
        DIRAC.exit(-1)

    status, stdout, stderr = ret['Value']
    if status == 1:
        jobReport.setApplicationStatus('RAW Check step1: Big problem during RAW production')
        DIRAC.gLogger.error('Check_raw: Big problem during RAW production')
        DIRAC.exit(-1)

    # ---- DST production: make_CTA_DST macro ----
    hr = HapRootMacro()
    hr.setSoftwarePackage(HapPack)

    # NOTE(review): the result is not used below (the macro reads the raw
    # file); the call is kept in case build_infile has side effects - confirm.
    infile = build_infile()
    infilestr = '"' + fileout + '"'
    telconfstr = '"' + telconf + '"'
    args = [str(int(run_number)), infilestr, telconfstr]

    try:
        args.extend([nevent])
    except NameError:
        DIRAC.gLogger.info('nevent arg not used')

    DIRAC.gLogger.notice('make_CTA_DST macro Arguments:', args)
    hr.rootMacro = '/hapscripts/dst/make_CTA_DST.C+'
    hr.rootArguments = args
    DIRAC.gLogger.notice('Executing Hap make_CTA_DST macro')
    res = hr.execute()

    if not res['OK']:
        DIRAC.gLogger.error('Failed to execute make_CTA_DST macro')
        jobReport.setApplicationStatus('Failure during make_CTA_DST')
        DIRAC.exit(-1)

    # ---- check existence of the output file, then rename it ----
    filedst = 'dst_CTA_%08d' % int(run_number) + '.root'
    if not os.path.isfile(filedst):
        DIRAC.gLogger.error('dst file not found:', filedst)
        jobReport.setApplicationStatus('make_CTA_DST.C: DST file not created')
        DIRAC.exit(-1)

    fileout = 'dst_' + part_type + '_run' + run_number + '.root'
    cmd = 'mv ' + filedst + ' ' + fileout
    os.system(cmd)

    # ---- DST check step0: check_dst0.csh ----
    DIRAC.gLogger.notice('Executing DST Check step0')
    check_script = hessroot + '/hapscripts/dst/check_dst0.csh'
    cmdTuple = [check_script]
    ret = systemCall(0, cmdTuple, sendOutput)

    if not ret['OK']:
        DIRAC.gLogger.error('Failed to execute DST Check step0')
        jobReport.setApplicationStatus('Check_dst0: Failed')
        DIRAC.exit(-1)

    status, stdout, stderr = ret['Value']
    if status == 1:
        jobReport.setApplicationStatus('Check_dst0: Big problem during the DST production')
        DIRAC.gLogger.error('DST Check step0 reports: Big problem during the DST production')
        DIRAC.exit(-1)
    if status == 2:
        # Not treated as an error: exit with the default code.
        jobReport.setApplicationStatus('Check_dst0: No triggered events')
        DIRAC.gLogger.notice('DST Check step0 reports: No triggered events')
        DIRAC.exit()

    # ---- DST check step1: CheckDST macro ----
    DIRAC.gLogger.notice('Executing DST check step1')
    hr.rootMacro = '/hapscripts/dst/CheckDST.C+'
    fileoutstr = '"' + fileout + '"'
    args = [fileoutstr]
    DIRAC.gLogger.notice('CheckDST macro Arguments:', args)
    hr.rootArguments = args
    DIRAC.gLogger.notice('Executing Hap CheckDST macro')
    res = hr.execute()

    if not res['OK']:
        DIRAC.gLogger.error('Failure during DST Check step1')
        jobReport.setApplicationStatus('Check_dst1: Failed')
        DIRAC.exit(-1)

    # ---- DST check step2: check_dst2.csh ----
    DIRAC.gLogger.notice('Executing DST Check step2')
    check_script = hessroot + '/hapscripts/dst/check_dst2.csh'
    cmdTuple = [check_script]
    ret = systemCall(0, cmdTuple, sendOutput)

    if not ret['OK']:
        DIRAC.gLogger.error('Failed to execute DST Check step2')
        jobReport.setApplicationStatus('Check_dst2: Failed')
        DIRAC.exit(-1)

    status, stdout, stderr = ret['Value']
    if status == 1:
        jobReport.setApplicationStatus('DST Check step2: Big problem during the DST production')
        DIRAC.gLogger.error('DST Check step2 reports: Big problem during the DST production')
        DIRAC.exit(-1)
    if status == 2:
        jobReport.setApplicationStatus('DST Check step2: No triggered events')
        DIRAC.gLogger.notice('DST Check step2 reports: No triggered events')
        DIRAC.exit()

    DIRAC.exit()
if arguments.has_key('WorkingDirectory'): wdir = os.path.expandvars(arguments['WorkingDirectory']) if os.path.isdir(wdir): os.chdir(wdir) else: try: os.makedirs(wdir) if os.path.isdir(wdir): os.chdir(wdir) except Exception, x: gLogger.exception('JobWrapperTemplate could not create working directory') rescheduleFailedJob(jobID,'Could Not Create Working Directory') return 1 #root = arguments['CE']['Root'] jobReport = JobReport(jobID,'JobWrapper') try: job = JobWrapper( jobID, jobReport ) job.initialize(arguments) except Exception, x: gLogger.exception('JobWrapper failed the initialization phase') rescheduleFailedJob(jobID,'Job Wrapper Initialization') job.sendWMSAccounting('Failed','Job Wrapper Initialization') return 1 if arguments['Job'].has_key('InputSandbox'): jobReport.commit() try: result = job.transferInputSandbox(arguments['Job']['InputSandbox']) if not result['OK']:
def main():
    """Install (or locate) corsika_simhessarray, run corsika and pack its output.

    Sequence: parse the command line, make the corsika_simhessarray package
    available (copied from the shared area, found in the working area, or
    freshly installed and built there), run the corsika executable, rename
    the produced telescope file to corsika_run<run_number>.corsika.gz and
    tar the run's input, dbase and log files.

    The option values (version, executable, run_number, run, template),
    fileToKWDict and sendOutput are read from module scope; the options are
    presumably set by the switch callbacks registered below.

    The function always terminates the process through DIRAC.exit(): -1 on
    failure, the default exit code on success.
    """
    from DIRAC.Core.Base import Script

    Script.registerSwitch("p:", "run_number=", "Run Number", setRunNumber)
    Script.registerSwitch("R:", "run=", "Run", setRun)
    Script.registerSwitch("P:", "config_path=", "Config Path", setConfigPath)
    Script.registerSwitch("T:", "template=", "Template", setTemplate)
    Script.registerSwitch("E:", "executable=", "Executable", setExecutable)
    Script.registerSwitch("V:", "version=", "Version", setVersion)
    Script.registerSwitch("M:", "mode=", "Mode", setMode)

    Script.parseCommandLine(ignoreErrors=True)

    args = Script.getPositionalArgs()
    if len(args) < 1:
        Script.showHelp()

    if (version is None or executable is None or run_number is None
            or run is None or template is None):
        Script.showHelp()
        # The job-wide jobReport is only created further down; referencing it
        # here raised UnboundLocalError, so a dedicated report is built instead.
        from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport
        JobReport(int(os.environ['JOBID'])).setApplicationStatus('Options badly specified')
        DIRAC.exit(-1)

    from CTADIRAC.Core.Workflow.Modules.CorsikaApp import CorsikaApp
    from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwareEnviron
    from CTADIRAC.Core.Utilities.SoftwareInstallation import localArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import workingArea
    from DIRAC.Core.Utilities.Subprocess import systemCall
    from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

    jobID = int(os.environ['JOBID'])
    jobReport = JobReport(jobID)

    CorsikaSimtelPack = 'corsika_simhessarray/' + version + '/corsika_simhessarray'
    packs = [CorsikaSimtelPack]

    for package in packs:
        DIRAC.gLogger.notice('Checking:', package)
        # NOTE(review): 'sharedArea' / 'workingArea' are tested as objects, not
        # called, so these conditions look always true - confirm the intent.
        if sharedArea:
            if checkSoftwarePackage(package, sharedArea())['OK']:
                DIRAC.gLogger.notice('Package found in Shared Area:', package)
                installSoftwareEnviron(package, workingArea())
                # Copy the shared installation into the current directory.
                packageTuple = package.split('/')
                corsika_subdir = sharedArea() + '/' + packageTuple[0] + '/' + version
                cmd = 'cp -r ' + corsika_subdir + '/* .'
                os.system(cmd)
                continue
        if workingArea:
            if checkSoftwarePackage(package, workingArea())['OK']:
                DIRAC.gLogger.notice('Package found in Local Area:', package)
                continue
            if installSoftwarePackage(package, workingArea())['OK']:
                # A fresh installation has to be compiled; the build flavour
                # depends on the package version.
                if version == 'clean_23012012':
                    cmdTuple = ['./build_all', 'ultra', 'qgs2']
                elif version in ['prod-2_21122012', 'prod-2_08032013', 'prod-2_06052013']:
                    cmdTuple = ['./build_all', 'prod2', 'qgs2']
                else:
                    # Without this branch cmdTuple stayed unassigned and the
                    # systemCall below crashed with UnboundLocalError.
                    DIRAC.gLogger.error('No build options known for version:', version)
                    DIRAC.exit(-1)
                ret = systemCall(0, cmdTuple, sendOutput)
                if not ret['OK']:
                    DIRAC.gLogger.error('Failed to execute build')
                    DIRAC.exit(-1)
                continue
        DIRAC.gLogger.error('Check Failed for software package:', package)
        DIRAC.gLogger.error('Software package not available')
        DIRAC.exit(-1)

    # ---- run corsika ----
    cs = CorsikaApp()
    cs.setSoftwarePackage(CorsikaSimtelPack)
    cs.csExe = executable
    cs.csArguments = ['--run-number', run_number, '--run', run, template]

    corsikaReturnCode = cs.execute()
    if corsikaReturnCode != 0:
        DIRAC.gLogger.error('Failed to execute corsika Application')
        jobReport.setApplicationStatus('Corsika Application: Failed')
        DIRAC.exit(-1)

    # ---- rename corsika file ----
    rundir = 'run' + run_number
    corsikaKEYWORDS = ['TELFIL']
    # The output file name is taken from the TELFIL keyword of the template.
    dictCorsikaKW = fileToKWDict(template, corsikaKEYWORDS)
    corsikafilename = rundir + '/' + dictCorsikaKW['TELFIL'][0]
    destcorsikafilename = 'corsika_run' + run_number + '.corsika.gz'
    cmd = 'mv ' + corsikafilename + ' ' + destcorsikafilename
    os.system(cmd)

    # ---- create corsika tar ----
    corsika_tar = 'corsika_run' + run_number + '.tar.gz'
    filetar1 = rundir + '/' + 'input'
    filetar2 = rundir + '/' + 'DAT' + run_number + '.dbase'
    filetar3 = rundir + '/run' + str(int(run_number)) + '.log'
    cmdTuple = ['/bin/tar', 'zcf', corsika_tar, filetar1, filetar2, filetar3]
    DIRAC.gLogger.notice('Executing command tuple:', cmdTuple)
    ret = systemCall(0, cmdTuple, sendOutput)
    if not ret['OK']:
        DIRAC.gLogger.error('Failed to execute tar')
        DIRAC.exit(-1)

    DIRAC.exit()
def _rescheduleAndAccount(job, jobReport, jobID, minorStatus):
    """Reschedule jobID with minorStatus, then send job accounting for it."""
    outcome = rescheduleFailedJob(jobID, minorStatus, jobReport)
    job.sendJobAccounting(outcome, minorStatus)


def _reportFailure(job, jobReport, exc, minorStatus):
    """Store the error and 'Failed' status (unsent) and send a failover request."""
    jobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
    jobReport.setJobStatus('Failed', minorStatus, sendFlag=False)
    job.sendFailoverRequest('Failed', minorStatus)


def execute(arguments):
    """ The only real function executed here

    Drives one job through its phases: working directory set-up, JobWrapper
    initialization, input sandbox download, input data resolution, payload
    execution, output processing and finalization.

    :param dict arguments: job description; arguments['Job'] holds the job
                           parameters, 'WorkingDirectory' is optional
    :return: 1 when a phase up to and including execution failed, 2 when
             output processing or finalization failed, otherwise whatever
             job.finalize() returns
    """
    global gJobReport

    jobID = arguments['Job'].get('JobID', 0)
    os.environ['JOBID'] = str(jobID)
    jobID = int(jobID)

    # --- working directory -------------------------------------------------
    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if not os.path.isdir(wdir):
            try:
                # makedirs raises if wdir already exists
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except OSError as osError:
                alreadyThere = osError.errno == errno.EEXIST and os.path.isdir(wdir)
                if alreadyThere:
                    logLine = 'JobWrapperTemplate found that the working directory already exists'
                    minorStatus = 'Working Directory already exists'
                else:
                    logLine = 'JobWrapperTemplate could not create working directory'
                    minorStatus = 'Could Not Create Working Directory'
                gLogger.exception(logLine)
                rescheduleFailedJob(jobID, minorStatus)
                return 1
        else:
            os.chdir(wdir)

    gJobReport = JobReport(jobID, 'JobWrapper')

    # --- initialization ----------------------------------------------------
    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('JobWrapper failed the initialization phase', lException=exc)
        outcome = rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport)
        try:
            job.sendJobAccounting(outcome, 'Job Wrapper Initialization')
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception('JobWrapper failed sending job accounting', lException=exc)
        return 1

    # --- input sandbox -----------------------------------------------------
    if 'InputSandbox' not in arguments['Job']:
        gLogger.verbose('Job has no InputSandbox requirement')
    else:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError:
            gLogger.exception('JobWrapper failed to download input sandbox')
            _rescheduleAndAccount(job, gJobReport, jobID, 'Input Sandbox Download')
            return 1
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while downloading input sandbox',
                lException=exc)
            _rescheduleAndAccount(job, gJobReport, jobID, 'Input Sandbox Download')
            return 1

    gJobReport.commit()

    # --- input data --------------------------------------------------------
    if 'InputData' not in arguments['Job']:
        gLogger.verbose('Job has no InputData requirement')
    elif not arguments['Job']['InputData']:
        gLogger.verbose('Job has a null InputData requirement:')
        gLogger.verbose(arguments)
    else:
        try:
            result = job.resolveInputData()
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError:
            gLogger.exception('JobWrapper failed to resolve input data')
            _rescheduleAndAccount(job, gJobReport, jobID, 'Input Data Resolution')
            return 1
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while resolving input data',
                lException=exc)
            _rescheduleAndAccount(job, gJobReport, jobID, 'Input Data Resolution')
            return 1

    gJobReport.commit()

    # --- payload execution -------------------------------------------------
    try:
        result = job.execute()
        if not result['OK']:
            gLogger.error('Failed to execute job', result['Message'])
            raise JobWrapperError((result['Message'], result['Errno']))
    except JobWrapperError as exc:
        errorCode = exc.value[1]
        if errorCode == 0 or str(exc.value[0]) == '0':
            gLogger.verbose('JobWrapper exited with status=0 after execution')
        if errorCode == DErrno.EWMSRESC:
            gLogger.warn("Asked to reschedule job")
            _rescheduleAndAccount(job, gJobReport, jobID, 'JobWrapper execution')
            return 1
        gLogger.exception('Job failed in execution phase')
        _reportFailure(job, gJobReport, exc, 'Exception During Execution')
        return 1
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('Job raised exception during execution phase', lException=exc)
        _reportFailure(job, gJobReport, exc, 'Exception During Execution')
        return 1

    # --- outputs -----------------------------------------------------------
    wantsOutputs = 'OutputSandbox' in arguments['Job'] or 'OutputData' in arguments['Job']
    if not wantsOutputs:
        gLogger.verbose('Job has no OutputData or OutputSandbox requirement')
    else:
        try:
            result = job.processJobOutputs()
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError as exc:
            gLogger.exception('JobWrapper failed to process output files')
            _reportFailure(job, gJobReport, exc, 'Uploading Job Outputs')
            return 2
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while processing output files',
                lException=exc)
            _reportFailure(job, gJobReport, exc, 'Uploading Job Outputs')
            return 2

    # --- finalization ------------------------------------------------------
    try:
        # Failed jobs will return 1 / successful jobs will return 0
        return job.finalize()
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception(
            'JobWrapper raised exception during the finalization phase',
            lException=exc)
        return 2
if os.path.isdir(wdir): os.chdir(wdir) else: try: os.makedirs(wdir) if os.path.isdir(wdir): os.chdir(wdir) except Exception, x: gLogger.exception( 'JobWrapperTemplate could not create working directory') rescheduleFailedJob(jobID, 'Could Not Create Working Directory') return 1 #root = arguments['CE']['Root'] jobReport = JobReport(jobID, 'JobWrapper') try: job = JobWrapper(jobID, jobReport) job.initialize(arguments) except Exception, x: gLogger.exception('JobWrapper failed the initialization phase') rescheduleFailedJob(jobID, 'Job Wrapper Initialization') job.sendWMSAccounting('Failed', 'Job Wrapper Initialization') return 1 if arguments['Job'].has_key('InputSandbox'): jobReport.commit() try: result = job.transferInputSandbox(arguments['Job']['InputSandbox']) if not result['OK']:
def _rescheduleWithAccounting(job, jobID, minorStatus):
    """Reschedule the job and, when a JobWrapper exists, send its accounting record."""
    rescheduleResult = rescheduleFailedJob(
        jobID=jobID, minorStatus=minorStatus, jobReport=gJobReport
    )
    # `job` is None when the JobWrapper constructor itself failed; there is
    # then no wrapper object that could send the accounting record.
    if job is not None:
        job.sendJobAccounting(status=rescheduleResult, minorStatus=minorStatus)


def _reportFailure(job, exc, minorStatus):
    """Flag the job as Failed with `minorStatus`, then send failover request and accounting."""
    gJobReport.setJobParameter("Error Message", repr(exc), sendFlag=False)
    gJobReport.setJobStatus(
        status=JobStatus.FAILED, minorStatus=minorStatus, sendFlag=False
    )
    job.sendFailoverRequest()
    job.sendJobAccounting(status=JobStatus.FAILED, minorStatus=minorStatus)


def execute(arguments):
    """The only real function executed here.

    Drives a job through its stages: working-directory setup, JobWrapper
    initialization, input sandbox download, input data resolution, payload
    execution, output processing and finalization.

    :param dict arguments: job description; reads ``arguments["Job"]`` (keys
        ``JobID``, ``InputSandbox``, ``InputData``, ``OutputSandbox``,
        ``OutputData``) and the optional ``arguments["WorkingDirectory"]``.
    :return: 1 when a stage up to and including execution fails (the job is
        rescheduled or marked Failed), 2 when output processing or
        finalization fails, otherwise whatever ``job.finalize()`` returns.
    """
    global gJobReport

    jobID = arguments["Job"].get("JobID", 0)
    os.environ["JOBID"] = str(jobID)
    jobID = int(jobID)

    if "WorkingDirectory" in arguments:
        wdir = os.path.expandvars(arguments["WorkingDirectory"])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                # this will raise an exception if wdir already exists (which is ~OK)
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except OSError as osError:
                if osError.errno == errno.EEXIST and os.path.isdir(wdir):
                    gLogger.exception(
                        "JobWrapperTemplate found that the working directory already exists"
                    )
                    rescheduleFailedJob(jobID, "Working Directory already exists")
                else:
                    gLogger.exception(
                        "JobWrapperTemplate could not create working directory"
                    )
                    rescheduleFailedJob(jobID, "Could Not Create Working Directory")
                return 1

    gJobReport = JobReport(jobID, "JobWrapper")

    # Pre-bind `job`: if the JobWrapper constructor raises, the handler below
    # must not fail with UnboundLocalError while reporting the real problem.
    job = None
    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception("JobWrapper failed the initialization phase", lException=exc)
        _rescheduleWithAccounting(job, jobID, JobMinorStatus.JOB_WRAPPER_INITIALIZATION)
        return 1

    if "InputSandbox" in arguments["Job"]:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments["Job"]["InputSandbox"])
            if not result["OK"]:
                gLogger.warn(result["Message"])
                raise JobWrapperError(result["Message"])
        except JobWrapperError:
            gLogger.exception("JobWrapper failed to download input sandbox")
            _rescheduleWithAccounting(job, jobID, JobMinorStatus.DOWNLOADING_INPUT_SANDBOX)
            return 1
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                "JobWrapper raised exception while downloading input sandbox",
                lException=exc,
            )
            _rescheduleWithAccounting(job, jobID, JobMinorStatus.DOWNLOADING_INPUT_SANDBOX)
            return 1
    else:
        gLogger.verbose("Job has no InputSandbox requirement")

    gJobReport.commit()

    if "InputData" in arguments["Job"]:
        if arguments["Job"]["InputData"]:
            try:
                result = job.resolveInputData()
                if not result["OK"]:
                    gLogger.warn(result["Message"])
                    raise JobWrapperError(result["Message"])
            except JobWrapperError:
                gLogger.exception("JobWrapper failed to resolve input data")
                _rescheduleWithAccounting(job, jobID, JobMinorStatus.INPUT_DATA_RESOLUTION)
                return 1
            except Exception as exc:  # pylint: disable=broad-except
                gLogger.exception(
                    "JobWrapper raised exception while resolving input data",
                    lException=exc,
                )
                _rescheduleWithAccounting(job, jobID, JobMinorStatus.INPUT_DATA_RESOLUTION)
                return 1
        else:
            gLogger.verbose("Job has a null InputData requirement:")
            gLogger.verbose(arguments)
    else:
        gLogger.verbose("Job has no InputData requirement")

    gJobReport.commit()

    try:
        result = job.execute()
        if not result["OK"]:
            gLogger.error("Failed to execute job", result["Message"])
            raise JobWrapperError((result["Message"], result["Errno"]))
    except JobWrapperError as exc:
        if exc.value[1] == 0 or str(exc.value[0]) == "0":
            gLogger.verbose("JobWrapper exited with status=0 after execution")
        if exc.value[1] == DErrno.EWMSRESC:
            # The payload explicitly asked for a reschedule rather than a failure.
            gLogger.warn("Asked to reschedule job")
            _rescheduleWithAccounting(job, jobID, JobMinorStatus.JOB_WRAPPER_EXECUTION)
            return 1
        gLogger.exception("Job failed in execution phase")
        _reportFailure(job, exc, JobMinorStatus.EXCEPTION_DURING_EXEC)
        return 1
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception("Job raised exception during execution phase", lException=exc)
        _reportFailure(job, exc, JobMinorStatus.EXCEPTION_DURING_EXEC)
        return 1

    if "OutputSandbox" in arguments["Job"] or "OutputData" in arguments["Job"]:
        try:
            result = job.processJobOutputs()
            if not result["OK"]:
                gLogger.warn(result["Message"])
                raise JobWrapperError(result["Message"])
        except JobWrapperError as exc:
            gLogger.exception("JobWrapper failed to process output files")
            _reportFailure(job, exc, JobMinorStatus.UPLOADING_JOB_OUTPUTS)
            return 2
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                "JobWrapper raised exception while processing output files",
                lException=exc,
            )
            _reportFailure(job, exc, JobMinorStatus.UPLOADING_JOB_OUTPUTS)
            return 2
    else:
        gLogger.verbose("Job has no OutputData or OutputSandbox requirement")

    try:
        # Failed jobs will return !=0 / successful jobs will return 0
        return job.finalize()
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception(
            "JobWrapper raised exception during the finalization phase", lException=exc
        )
        return 2
def main(): from DIRAC.Core.Base import Script Script.registerSwitch("p:", "inputfile=", "Input File", setInputFile) Script.registerSwitch("E:", "simtelExecName=", "SimtelExecName", setExecutable) Script.registerSwitch("S:", "simtelConfig=", "SimtelConfig", setConfig) Script.registerSwitch("V:", "version=", "Version", setVersion) Script.registerSwitch("D:", "storage_element=", "Storage Element", setStorageElement) from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient from DIRAC.Resources.Catalog.FileCatalog import FileCatalog Script.parseCommandLine() DIRAC.gLogger.setLevel('INFO') global fcc, fcL, storage_element from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwareEnviron from CTADIRAC.Core.Utilities.SoftwareInstallation import localArea from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea from CTADIRAC.Core.Utilities.SoftwareInstallation import workingArea from DIRAC.Core.Utilities.Subprocess import systemCall from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport jobID = os.environ['JOBID'] jobID = int(jobID) jobReport = JobReport(jobID) ########### ## Checking MD coherence fc = FileCatalog('LcgFileCatalog') res = fc._getCatalogConfigDetails('DIRACFileCatalog') print 'DFC CatalogConfigDetails:', res res = fc._getCatalogConfigDetails('LcgFileCatalog') print 'LCG CatalogConfigDetails:', res fcc = FileCatalogClient() fcL = FileCatalog('LcgFileCatalog') from DIRAC.Interfaces.API.Dirac import Dirac dirac = Dirac() ############################ ############# # simtelConfigFile should be built from ??? 
#simtelConfigFilesPath = 'sim_telarray/multi' #simtelConfigFile = simtelConfigFilesPath + '/multi_cta-ultra5.cfg' #createGlobalsFromConfigFiles(simtelConfigFile) #createGlobalsFromConfigFiles(current_version) ####################### ## files spread in 1000-runs subDirectories corsikaFileName = os.path.basename(corsikaFileLFN) run_number = corsikaFileName.split('run')[1].split('.corsika.gz')[ 0] # run001412.corsika.gz runNum = int(run_number) subRunNumber = '%03d' % runNum runNumModMille = runNum % 1000 runNumTrunc = (runNum - runNumModMille) / 1000 runNumSeriesDir = '%03dxxx' % runNumTrunc print 'runNumSeriesDir=', runNumSeriesDir f = open('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK', 'w') f.close() ##### If storage element is IN2P3-tape save simtel file on disk ############### if storage_element == 'CC-IN2P3-Tape': storage_element = 'CC-IN2P3-Disk' ############ Producing SimTel File ######################Building simtel Directory Metadata ####################### cfg_dict = { "4MSST": 'cta-prod2-4m-dc', "SCSST": 'cta-prod2-sc-sst', "STD": 'cta-prod2', "NSBX3": 'cta-prod2', "ASTRI": 'cta-prod2-astri', "SCMST": 'cta-prod2-sc3' } if simtelConfig == "6INROW": all_configs = ["SCMST", "4MSST", "SCSST", "ASTRI", "NSBX3", "STD"] elif simtelConfig == "5INROW": all_configs = ["4MSST", "SCSST", "ASTRI", "NSBX3", "STD"] else: all_configs = [simtelConfig] for current_conf in all_configs: DIRAC.gLogger.notice('current conf is', current_conf) if current_conf == "SCMST": current_version = version + '_sc3' else: current_version = version if os.path.isdir('sim_telarray'): DIRAC.gLogger.notice( 'Package found in the local area. 
Removing package...') cmd = 'rm -R sim_telarray corsika-6990 hessioxxx corsika-run' if (os.system(cmd)): DIRAC.exit(-1) DIRAC.gLogger.notice('current version is', current_version) CorsikaSimtelPack = 'corsika_simhessarray/' + current_version + '/corsika_simhessarray' packs = [CorsikaSimtelPack] for package in packs: DIRAC.gLogger.notice('Checking:', package) if sharedArea: if checkSoftwarePackage(package, sharedArea())['OK']: DIRAC.gLogger.notice('Package found in Shared Area:', package) installSoftwareEnviron(package, workingArea()) packageTuple = package.split('/') corsika_subdir = sharedArea( ) + '/' + packageTuple[0] + '/' + current_version cmd = 'cp -u -r ' + corsika_subdir + '/* .' os.system(cmd) continue DIRAC.gLogger.error('Check Failed for software package:', package) DIRAC.gLogger.error('Software package not available') DIRAC.exit(-1) createGlobalsFromConfigFiles(current_version) resultCreateSimtelDirMD = createSimtelFileSystAndMD( current_conf, current_version) if not resultCreateSimtelDirMD['OK']: DIRAC.gLogger.error('Failed to create simtelArray Directory MD') jobReport.setApplicationStatus( 'Failed to create simtelArray Directory MD') DIRAC.gLogger.error( 'Metadata coherence problem, no simtelArray File produced') DIRAC.exit(-1) else: print 'simtel Directory MD successfully created' ############## introduce file existence check here ######################## simtelFileName = particle + '_' + str(thetaP) + '_' + str( phiP) + '_alt' + str( obslev) + '_' + 'run' + run_number + '.simtel.gz' simtelDirPath_conf = simtelDirPath + '_' + current_conf simtelOutFileDir = os.path.join(simtelDirPath_conf, 'Data', runNumSeriesDir) simtelOutFileLFN = os.path.join(simtelOutFileDir, simtelFileName) res = CheckCatalogCoherence(simtelOutFileLFN) if res == DIRAC.S_OK: DIRAC.gLogger.notice('Current conf already done', current_conf) continue #### execute simtelarray ################ fd = open('run_sim.sh', 'w') fd.write("""#! 
/bin/sh export SVNPROD2=$PWD export SVNTAG=SVN-PROD2_rev1869 export CORSIKA_IO_BUFFER=800MB ./grid_prod2-repro.sh %s %s""" % (corsikaFileName, current_conf)) fd.close() os.system('chmod u+x run_sim.sh') cmdTuple = ['./run_sim.sh'] ret = systemCall(0, cmdTuple, sendOutputSimTel) simtelReturnCode, stdout, stderr = ret['Value'] if (os.system('grep Broken simtel.log')): DIRAC.gLogger.notice('not broken') else: DIRAC.gLogger.notice('broken') # Tag corsika File if Broken Pipe corsikaTagMD = {} corsikaTagMD['CorsikaToReprocess'] = 'CorsikaToReprocess' result = fcc.setMetadata(corsikaFileLFN, corsikaTagMD) print "result setMetadata=", result if not result['OK']: print 'ResultSetMetadata:', result['Message'] jobReport.setApplicationStatus('Broken pipe') DIRAC.exit(-1) if not ret['OK']: DIRAC.gLogger.error('Failed to execute run_sim.sh') DIRAC.gLogger.error('run_sim.sh status is:', simtelReturnCode) DIRAC.exit(-1) ## putAndRegister simtel data/log/histo Output File: cfg = cfg_dict[current_conf] cmd = 'mv Data/sim_telarray/' + cfg + '/0.0deg/Data/*.simtel.gz ' + simtelFileName if (os.system(cmd)): DIRAC.exit(-1) ############################################ simtelRunNumberSeriesDirExist = fcc.isDirectory( simtelOutFileDir)['Value']['Successful'][simtelOutFileDir] newSimtelRunFileSeriesDir = ( simtelRunNumberSeriesDirExist != True ) # if new runFileSeries, will need to add new MD simtelLogFileName = particle + '_' + str(thetaP) + '_' + str( phiP) + '_alt' + str(obslev) + '_' + 'run' + run_number + '.log.gz' cmd = 'mv Data/sim_telarray/' + cfg + '/0.0deg/Log/*.log.gz ' + simtelLogFileName if (os.system(cmd)): DIRAC.exit(-1) simtelOutLogFileDir = os.path.join(simtelDirPath_conf, 'Log', runNumSeriesDir) simtelOutLogFileLFN = os.path.join(simtelOutLogFileDir, simtelLogFileName) simtelHistFileName = particle + '_' + str(thetaP) + '_' + str( phiP) + '_alt' + str( obslev) + '_' + 'run' + run_number + '.hdata.gz' cmd = 'mv Data/sim_telarray/' + cfg + '/0.0deg/Histograms/*.hdata.gz ' + 
simtelHistFileName if (os.system(cmd)): DIRAC.exit(-1) simtelOutHistFileDir = os.path.join(simtelDirPath_conf, 'Histograms', runNumSeriesDir) simtelOutHistFileLFN = os.path.join(simtelOutHistFileDir, simtelHistFileName) ################################################ DIRAC.gLogger.notice('Put and register simtel File in LFC and DFC:', simtelOutFileLFN) ret = dirac.addFile(simtelOutFileLFN, simtelFileName, storage_element) res = CheckCatalogCoherence(simtelOutFileLFN) if res != DIRAC.S_OK: DIRAC.gLogger.error('Job failed: Catalog Coherence problem found') jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) if not ret['OK']: DIRAC.gLogger.error('Error during addFile call:', ret['Message']) jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) ###################################################################### res = CheckCatalogCoherence(simtelOutLogFileLFN) if res == DIRAC.S_OK: DIRAC.gLogger.notice('Log file already exists. Removing:', simtelOutLogFileLFN) ret = dirac.removeFile(simtelOutLogFileLFN) DIRAC.gLogger.notice( 'Put and register simtel Log File in LFC and DFC:', simtelOutLogFileLFN) ret = dirac.addFile(simtelOutLogFileLFN, simtelLogFileName, storage_element) res = CheckCatalogCoherence(simtelOutLogFileLFN) if res != DIRAC.S_OK: DIRAC.gLogger.error('Job failed: Catalog Coherence problem found') jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) if not ret['OK']: DIRAC.gLogger.error('Error during addFile call:', ret['Message']) jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) ###################################################################### res = CheckCatalogCoherence(simtelOutHistFileLFN) if res == DIRAC.S_OK: DIRAC.gLogger.notice('Histo file already exists. 
Removing:', simtelOutHistFileLFN) ret = dirac.removeFile(simtelOutHistFileLFN) DIRAC.gLogger.notice( 'Put and register simtel Histo File in LFC and DFC:', simtelOutHistFileLFN) ret = dirac.addFile(simtelOutHistFileLFN, simtelHistFileName, storage_element) res = CheckCatalogCoherence(simtelOutHistFileLFN) if res != DIRAC.S_OK: DIRAC.gLogger.error('Job failed: Catalog Coherence problem found') jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) if not ret['OK']: DIRAC.gLogger.error('Error during addFile call:', ret['Message']) jobReport.setApplicationStatus('OutputData Upload Error') DIRAC.exit(-1) ###################################################################### if newSimtelRunFileSeriesDir: insertRunFileSeriesMD(simtelOutFileDir, runNumTrunc) insertRunFileSeriesMD(simtelOutLogFileDir, runNumTrunc) insertRunFileSeriesMD(simtelOutHistFileDir, runNumTrunc) ###### simtel File level metadata ############################################ simtelFileMD = {} simtelFileMD['runNumber'] = int(run_number) simtelFileMD['jobID'] = jobID simtelFileMD['simtelReturnCode'] = simtelReturnCode result = fcc.setMetadata(simtelOutFileLFN, simtelFileMD) print "result setMetadata=", result if not result['OK']: print 'ResultSetMetadata:', result['Message'] result = fcc.setMetadata(simtelOutLogFileLFN, simtelFileMD) print "result setMetadata=", result if not result['OK']: print 'ResultSetMetadata:', result['Message'] result = fcc.setMetadata(simtelOutHistFileLFN, simtelFileMD) print "result setMetadata=", result if not result['OK']: print 'ResultSetMetadata:', result['Message'] result = fcc.addFileAncestors( {simtelOutFileLFN: { 'Ancestors': [corsikaFileLFN] }}) print 'result addFileAncestor:', result result = fcc.addFileAncestors( {simtelOutLogFileLFN: { 'Ancestors': [corsikaFileLFN] }}) print 'result addFileAncestor:', result result = fcc.addFileAncestors( {simtelOutHistFileLFN: { 'Ancestors': [corsikaFileLFN] }}) print 'result addFileAncestor:', result result = 
fcc.setMetadata(simtelOutFileLFN, simtelFileMD) if not result['OK']: print 'ResultSetMetadata:', result['Message'] DIRAC.exit()
''' Created on 2015-05-19 21:45:37 @author: suo ''' import sys from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport from DIRAC.Core.Base import Script Script.parseCommandLine( ignoreErrors = False ) jobID = sys.argv[1] experiment = sys.argv[2] message = sys.argv[3] jobReport = JobReport(jobID,experiment) result = jobReport.setApplicationStatus(message) if not result['OK']: try: with open('job.err','a') as errFile: print >> errFile, 'setJobStatus error: %s' % result except IOError: print 'IOError:',str(e)
def execute(arguments):
    """Prepare a job for execution: working directory, wrapper, sandbox, input data.

    :param dict arguments: job description; reads ``arguments['Job']`` (keys
        ``JobID``, ``InputSandbox``, ``InputData``) and the optional
        ``arguments['WorkingDirectory']``.
    :return: 1 when a stage fails (the job is then rescheduled); None when
        every stage shown here succeeds.
    """
    global gJobReport

    jobID = arguments['Job']['JobID']
    # Environment values must be strings; the ID may arrive as an integer.
    os.environ['JOBID'] = str(jobID)
    jobID = int(jobID)

    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception('JobWrapperTemplate could not create working directory')
                rescheduleFailedJob(jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    # Pre-bind `job`: if the JobWrapper constructor raises, the handler below
    # must not fail with UnboundLocalError while reporting the real problem.
    job = None
    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the initialization phase')
        rescheduleResult = rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport)
        if job is not None:
            job.sendWMSAccounting(rescheduleResult, 'Job Wrapper Initialization')
        return 1

    if 'InputSandbox' in arguments['Job']:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID, 'Input Sandbox Download', gJobReport)
            job.sendWMSAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if 'InputData' in arguments['Job']:
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except Exception:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(jobID, 'Input Data Resolution', gJobReport)
                job.sendWMSAccounting(rescheduleResult, 'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
def execute ( arguments ):
    """Prepare a job for execution: working directory, wrapper, sandbox, input data.

    :param dict arguments: job description; reads ``arguments['Job']`` (keys
        ``JobID``, ``InputSandbox``, ``InputData``) and the optional
        ``arguments['WorkingDirectory']``.
    :return: 1 when a stage fails (the job is rescheduled and accounted as
        'Failed'); None when every stage shown here succeeds.
    """
    global gJobReport

    jobID = arguments['Job']['JobID']
    # Environment values must be strings; the ID may arrive as an integer.
    os.environ['JOBID'] = str( jobID )
    jobID = int( jobID )

    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars( arguments['WorkingDirectory'] )
        if os.path.isdir( wdir ):
            os.chdir( wdir )
        else:
            try:
                os.makedirs( wdir )
                if os.path.isdir( wdir ):
                    os.chdir( wdir )
            except Exception:
                gLogger.exception( 'JobWrapperTemplate could not create working directory' )
                rescheduleFailedJob( jobID, 'Could Not Create Working Directory' )
                return 1

    gJobReport = JobReport( jobID, 'JobWrapper' )

    # Pre-bind `job`: if the JobWrapper constructor raises, the handler below
    # must not fail with UnboundLocalError while reporting the real problem.
    job = None
    try:
        job = JobWrapper( jobID, gJobReport )
        job.initialize( arguments )
    except Exception:
        gLogger.exception( 'JobWrapper failed the initialization phase' )
        rescheduleFailedJob( jobID, 'Job Wrapper Initialization', gJobReport )
        if job is not None:
            job.sendWMSAccounting( 'Failed', 'Job Wrapper Initialization' )
        return 1

    if 'InputSandbox' in arguments['Job']:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox( arguments['Job']['InputSandbox'] )
            if not result['OK']:
                gLogger.warn( result['Message'] )
                raise JobWrapperError( result['Message'] )
        except Exception:
            gLogger.exception( 'JobWrapper failed to download input sandbox' )
            rescheduleFailedJob( jobID, 'Input Sandbox Download' )
            job.sendWMSAccounting( 'Failed', 'Input Sandbox Download' )
            return 1
    else:
        gLogger.verbose( 'Job has no InputSandbox requirement' )

    gJobReport.commit()

    if 'InputData' in arguments['Job']:
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn( result['Message'] )
                    raise JobWrapperError( result['Message'] )
            except Exception:
                gLogger.exception( 'JobWrapper failed to resolve input data' )
                rescheduleFailedJob( jobID, 'Input Data Resolution' )
                job.sendWMSAccounting( 'Failed', 'Input Data Resolution' )
                return 1
        else:
            gLogger.verbose( 'Job has a null InputData requirement:' )
            gLogger.verbose( arguments )
def main():
    """Convert a simtel file to HAP raw data with eventio_cta and check the result.

    Registers and parses the command-line switches, makes sure the HESS and
    HAP software packages are available (installing into the local area if
    needed), runs the ``eventio_cta`` converter, then validates the raw
    output with the ``Open_Raw.C`` ROOT macro and the ``check_raw.csh``
    script.

    Reads the module-level settings filled in by the switch callbacks
    (``infile``, ``outfile``, ``tellist``, ``version`` and the optional
    ``Nfirst_mcevt``, ``Nlast_mcevt``, ``pixelslices``) and the ``JOBID``
    environment variable.  Any fatal problem ends the process through
    ``DIRAC.exit(-1)``.
    """
    from DIRAC.Core.Base import Script

    #### eventio_cta options
    Script.registerSwitch("I:", "infile=", "Input file", setInfile)
    Script.registerSwitch("O:", "outfile=", "Output file", setOutfile)
    Script.registerSwitch("T:", "tellist=", "Tellist", setTellist)
    Script.registerSwitch("F:", "Nfirst_mcevt=", "Nfirst_mcevt", setNfirst_mcevt)
    Script.registerSwitch("L:", "Nlast_mcevt=", "Nlast_mcevt", setNlast_mcevt)
    Script.registerSwitch("P:", "pixelslices=", "setPixelslices (true/false)", setPixelslices)
    ### other options
    Script.registerSwitch("V:", "version=", "HAP version", setVersion)

    Script.parseCommandLine(ignoreErrors=True)

    args = Script.getPositionalArgs()
    if len(args) < 1:
        Script.showHelp()

    if outfile is None or infile is None or tellist is None or version is None:
        Script.showHelp()
        # `jobReport` is only created further down, so referencing it here
        # raised UnboundLocalError; build a reporter on the spot in case
        # showHelp() returns.
        from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport
        JobReport(int(os.environ['JOBID'])).setApplicationStatus('Options badly specified')
        DIRAC.exit(-1)

    from CTADIRAC.Core.Workflow.Modules.HapApplication import HapApplication
    from CTADIRAC.Core.Workflow.Modules.HapRootMacro import HapRootMacro
    from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import getSoftwareEnviron
    from CTADIRAC.Core.Utilities.SoftwareInstallation import localArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
    from DIRAC.Core.Utilities.Subprocess import systemCall
    from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

    jobID = os.environ['JOBID']
    jobID = int(jobID)
    jobReport = JobReport(jobID)

    HapPack = 'HAP/' + version + '/HAP'

    packs = ['HESS/v0.2/lib', 'HESS/v0.3/root', HapPack]

    for package in packs:
        DIRAC.gLogger.notice('Checking:', package)
        # NOTE(review): these test the function objects, not their return
        # values, so both conditions are always true -- confirm the intent.
        if sharedArea:
            if checkSoftwarePackage(package, sharedArea())['OK']:
                DIRAC.gLogger.notice('Package found in Shared Area:', package)
                continue
        if localArea:
            if checkSoftwarePackage(package, localArea())['OK']:
                DIRAC.gLogger.notice('Package found in Local Area:', package)
                continue
            if installSoftwarePackage(package, localArea())['OK']:
                continue
        DIRAC.gLogger.error('Check Failed for software package:', package)
        DIRAC.gLogger.error('Software package not available')
        DIRAC.exit(-1)

    telconf = os.path.join(localArea(), 'HAP/%s/config/%s' % (version, tellist))

    ha = HapApplication()
    ha.setSoftwarePackage(HapPack)
    ha.hapExecutable = 'eventio_cta'
    ha.hapArguments = ['-file', infile, '-o', outfile, '-tellist', telconf]

    # The optional switches are looked up as names here; a NameError means
    # the name is not defined, which is treated as "option not used".
    try:
        ha.hapArguments.extend(
            ['-Nfirst_mcevt', Nfirst_mcevt, '-Nlast_mcevt', Nlast_mcevt])
    except NameError:
        DIRAC.gLogger.info('Nfirst_mcevt/Nlast_mcevt options are not used')

    try:
        if (pixelslices == 'true'):
            ha.hapArguments.extend(['-pixelslices'])
    except NameError:
        DIRAC.gLogger.info('pixelslices option is not used')

    DIRAC.gLogger.notice('Executing Hap Converter Application')
    res = ha.execute()

    if not res['OK']:
        DIRAC.gLogger.error('Failed to execute eventio_cta Application')
        jobReport.setApplicationStatus('eventio_cta: Failed')
        DIRAC.exit(-1)

    if not os.path.isfile(outfile):
        error = 'raw file was not created:'
        DIRAC.gLogger.error(error, outfile)
        jobReport.setApplicationStatus('eventio_cta: RawData not created')
        DIRAC.exit(-1)

    ###################### Check RAW DATA: step0
    hr = HapRootMacro()
    hr.setSoftwarePackage(HapPack)

    DIRAC.gLogger.notice('Executing RAW check step0')
    hr.rootMacro = '/hapscripts/dst/Open_Raw.C+'
    # The file name is passed to the macro wrapped in double quotes.
    outfilestr = '"' + outfile + '"'
    args = [outfilestr]
    DIRAC.gLogger.notice('Open_Raw macro Arguments:', args)
    hr.rootArguments = args
    DIRAC.gLogger.notice('Executing Hap Open_Raw macro')
    res = hr.execute()

    if not res['OK']:
        DIRAC.gLogger.error('Open_Raw: Failed')
        DIRAC.exit(-1)

    ###################### Quality Check for raw Output File: step1
    DIRAC.gLogger.notice('Executing Raw Check step1')

    ret = getSoftwareEnviron(HapPack)
    if not ret['OK']:
        error = ret['Message']
        DIRAC.gLogger.error(error, HapPack)
        DIRAC.exit(-1)

    hapEnviron = ret['Value']
    hessroot = hapEnviron['HESSROOT']
    check_script = hessroot + '/hapscripts/dst/check_raw.csh'
    cmdTuple = [check_script]
    ret = systemCall(0, cmdTuple, sendOutput)

    if not ret['OK']:
        DIRAC.gLogger.error('Failed to execute RAW Check step1')
        jobReport.setApplicationStatus('Check_raw: Failed')
        DIRAC.exit(-1)

    status, stdout, stderr = ret['Value']
    if status == 1:
        jobReport.setApplicationStatus(
            'RAW Check step1: Big problem during RAW production')
        DIRAC.gLogger.error('Check_raw: Big problem during RAW production')
        DIRAC.exit(-1)

    DIRAC.exit()
def execute ( arguments ):
    """Prepare a job for execution: working directory, wrapper, sandbox, input data.

    :param dict arguments: job description; reads ``arguments['Job']`` (keys
        ``JobID``, ``InputSandbox``, ``InputData``) and the optional
        ``arguments['WorkingDirectory']``.
    :return: 1 when a stage fails (the job is rescheduled and the reschedule
        result is sent as the accounting status); None when every stage
        shown here succeeds.
    """
    global gJobReport

    jobID = arguments['Job']['JobID']
    # Environment values must be strings; the ID may arrive as an integer.
    os.environ['JOBID'] = str( jobID )
    jobID = int( jobID )

    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars( arguments['WorkingDirectory'] )
        if os.path.isdir( wdir ):
            os.chdir( wdir )
        else:
            try:
                os.makedirs( wdir )
                if os.path.isdir( wdir ):
                    os.chdir( wdir )
            except Exception:
                gLogger.exception( 'JobWrapperTemplate could not create working directory' )
                rescheduleFailedJob( jobID, 'Could Not Create Working Directory' )
                return 1

    gJobReport = JobReport( jobID, 'JobWrapper' )

    # Pre-bind `job`: if the JobWrapper constructor raises, the handler below
    # must not fail with UnboundLocalError while reporting the real problem.
    job = None
    try:
        job = JobWrapper( jobID, gJobReport )
        job.initialize( arguments )
    except Exception:
        gLogger.exception( 'JobWrapper failed the initialization phase' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Job Wrapper Initialization', gJobReport )
        if job is not None:
            job.sendJobAccounting( rescheduleResult, 'Job Wrapper Initialization' )
        return 1

    if 'InputSandbox' in arguments['Job']:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox( arguments['Job']['InputSandbox'] )
            if not result['OK']:
                gLogger.warn( result['Message'] )
                raise JobWrapperError( result['Message'] )
        except Exception:
            gLogger.exception( 'JobWrapper failed to download input sandbox' )
            rescheduleResult = rescheduleFailedJob( jobID, 'Input Sandbox Download', gJobReport )
            job.sendJobAccounting( rescheduleResult, 'Input Sandbox Download' )
            return 1
    else:
        gLogger.verbose( 'Job has no InputSandbox requirement' )

    gJobReport.commit()

    if 'InputData' in arguments['Job']:
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn( result['Message'] )
                    raise JobWrapperError( result['Message'] )
            except Exception:
                gLogger.exception( 'JobWrapper failed to resolve input data' )
                rescheduleResult = rescheduleFailedJob( jobID, 'Input Data Resolution', gJobReport )
                job.sendJobAccounting( rescheduleResult, 'Input Data Resolution' )
                return 1
        else:
            gLogger.verbose( 'Job has a null InputData requirement:' )
            gLogger.verbose( arguments )
def execute(self):
    """Run one JobAgent cycle: request a job, prepare it and submit it to the CE.

    After the first job has been picked up the cycle only continues when
    filling mode is enabled and no time-left error was recorded. The CE
    description (plus pilot information) is sent to the matcher; the matched
    job is validated, its JDL parameters extracted, the owner proxy set up,
    the software checked and the job finally submitted. At the end the CPU
    time left and the scaled CPU time are updated.

    :return: whatever ``self.__finish`` / ``self.__rescheduleFailedJob``
        return on the early-exit paths, the failed ``getDescription`` result,
        ``S_ERROR`` when matching failed too many times or the pilot version
        does not match, ``S_OK`` with the matcher message when no job was
        obtained, and ``S_OK('Job Agent cycle complete')`` after a job was
        processed.
    """
    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if not self.fillingMode:
            return self.__finish('Filling Mode is Disabled')
        if self.timeLeftError:
            self.log.warn(self.timeLeftError)
            return self.__finish(self.timeLeftError)
        self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
        # Need to update the Configuration so that the new value is published
        # in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
            return self.__finish(result['Message'])

    self.log.verbose('Job Agent execution loop')
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
        self.log.info('Resource is not available')
        self.log.info(available['Message'])
        return self.__finish('CE Not Available')

    self.log.info(available['Message'])

    result = self.computingElement.getDescription()
    if not result['OK']:
        return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches',
                                                    self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        if re.search('No work available', jobRequest['Message']):
            self.log.info('Job request OK: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return S_ERROR('Nothing to do')
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("seconds timeout") != -1:
            self.log.error(jobRequest['Message'])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return S_ERROR('Nothing to do')
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("Pilot version does not match") != -1:
            # A version mismatch is not counted as a failed match: stop right away.
            self.log.error(jobRequest['Message'])
            return S_ERROR(jobRequest['Message'])
        else:
            self.log.info('Failed to get jobs: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return S_ERROR('Nothing to do')
            return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the matcher returned besides the mandatory parameters is
    # handed over to the job submission untouched.
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn(parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    else:
        jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    if 'SystemConfig' not in params:
        self.log.warn('Job has no system configuration defined in JDL parameters')
        systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
        self.log.info('Setting system config to /LocalSite/Architecture = %s since it was not specified'
                      % systemConfig)
        if not systemConfig:
            self.log.warn('/LocalSite/Architecture is not defined')
        params['SystemConfig'] = systemConfig
    else:
        systemConfig = params['SystemConfig']
        if systemConfig.lower() == 'any':
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            self.log.info('Setting SystemConfig = /LocalSite/Architecture =',
                          '"%s" since it was set to "ANY" in the job description' % systemConfig)
            if not systemConfig:
                self.log.warn('/LocalSite/Architecture is not defined')
            params['SystemConfig'] = systemConfig

    if 'MaxCPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')

    self.log.verbose('Job request successful: \n %s' % (jobRequest['Value']))
    self.log.info('Received JobID=%s, JobType=%s, SystemConfig=%s' % (jobID, jobType, systemConfig))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)
        if self.gridCEQueue:
            jobReport.setJobParameter('GridCEQueue', self.gridCEQueue, sendFlag=False)
        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        if not self.pilotInfoReportedFlag:
            self.__reportPilotInfo(jobID)

        result = self.__setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self.__rescheduleFailedJob(jobID, result['Message'],
                                              self.stopOnApplicationFailure)
        # Always bind proxyChain: it used to stay undefined when the proxy
        # result carried no value, which made the submit call below fail
        # with a NameError hidden by the broad except.
        proxyChain = result.get('Value')

        # Is this necessary at all?
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job %s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.verbose('Before %sCE submitJob()' % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, jobJDL, proxyChain)
        if not submission['OK']:
            self.__report(jobID, 'Failed', submission['Message'])
            return self.__finish(submission['Message'])
        elif 'PayloadFailed' in submission:
            # Do not keep running and do not overwrite the Payload error
            return self.__finish('Payload execution failed with error code %s'
                                 % submission['PayloadFailed'],
                                 self.stopOnApplicationFailure)

        self.log.verbose('After %sCE submitJob()' % (self.ceName))
    except Exception:
        self.log.exception()
        return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception',
                                          self.stopOnApplicationFailure)

    result = self.timeLeftUtil.getTimeLeft(0.0)
    if result['OK']:
        self.timeLeft = result['Value']
    elif result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
    elif self.cpuFactor:
        # if the batch system is not defined used the CPUNormalizationFactor
        # defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    # Report only the scaled CPU consumed since the previous cycle.
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']
    self.__setJobParam(jobID, 'ScaledCPUTime', str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime

    return S_OK('Job Agent cycle complete')
def execute(arguments):
    """Drive a job through the JobWrapper from set-up to finalization.

    Steps, in order: export ``JOBID``, enter the working directory, initialize
    the JobWrapper, download the input sandbox, resolve the input data, run
    the payload, process the outputs and finalize.

    :param dict arguments: job description; ``arguments['Job']['JobID']`` is
        mandatory, the other entries read here are optional.
    :return: 1 when a step up to and including the execution failed, 2 when
        the output processing or the finalization failed, otherwise the value
        returned by ``job.finalize()``.
    """
    global gJobReport

    jobDescription = arguments['Job']
    jobID = jobDescription['JobID']
    os.environ['JOBID'] = jobID
    jobID = int(jobID)

    def _rescheduleWithAccounting(reason):
        """Request a reschedule for ``reason``, send the accounting and give exit code 1."""
        outcome = rescheduleFailedJob(jobID, reason, gJobReport)
        job.sendJobAccounting(outcome, reason)
        return 1

    def _markFailed(minorStatus, error, exitCode):
        """Store the error, flag the job as Failed and send the failover request."""
        gJobReport.setJobParameter('Error Message', str(error), sendFlag=False)
        gJobReport.setJobStatus('Failed', minorStatus, sendFlag=False)
        job.sendFailoverRequest('Failed', minorStatus)
        return exitCode

    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if not os.path.isdir(wdir):
            try:
                # this will raise an exception if wdir already exists (which is ~OK)
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except OSError as osError:
                alreadyThere = osError.errno == errno.EEXIST and os.path.isdir(wdir)
                if alreadyThere:
                    gLogger.exception('JobWrapperTemplate found that the working directory already exists')
                    rescheduleFailedJob(jobID, 'Working Directory already exists')
                else:
                    gLogger.exception('JobWrapperTemplate could not create working directory')
                    rescheduleFailedJob(jobID, 'Could Not Create Working Directory')
                return 1
        else:
            os.chdir(wdir)

    gJobReport = JobReport(jobID, 'JobWrapper')

    # --- Initialization -----------------------------------------------------
    try:
        job = JobWrapper(jobID, gJobReport)
        # initialize doesn't return S_OK/S_ERROR
        job.initialize(arguments)
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('JobWrapper failed the initialization phase', lException=exc)
        initOutcome = rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport)
        try:
            job.sendJobAccounting(initOutcome, 'Job Wrapper Initialization')
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception('JobWrapper failed sending job accounting', lException=exc)
        return 1

    # --- Input sandbox ------------------------------------------------------
    if 'InputSandbox' not in jobDescription:
        gLogger.verbose('Job has no InputSandbox requirement')
    else:
        gJobReport.commit()
        try:
            transfer = job.transferInputSandbox(jobDescription['InputSandbox'])
            if not transfer['OK']:
                gLogger.warn(transfer['Message'])
                raise JobWrapperError(transfer['Message'])
        except JobWrapperError:
            gLogger.exception('JobWrapper failed to download input sandbox')
            return _rescheduleWithAccounting('Input Sandbox Download')
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception('JobWrapper raised exception while downloading input sandbox', lException=exc)
            return _rescheduleWithAccounting('Input Sandbox Download')

    gJobReport.commit()

    # --- Input data ---------------------------------------------------------
    if 'InputData' not in jobDescription:
        gLogger.verbose('Job has no InputData requirement')
    elif not jobDescription['InputData']:
        gLogger.verbose('Job has a null InputData requirement:')
        gLogger.verbose(arguments)
    else:
        try:
            resolution = job.resolveInputData()
            if not resolution['OK']:
                gLogger.warn(resolution['Message'])
                raise JobWrapperError(resolution['Message'])
        except JobWrapperError:
            gLogger.exception('JobWrapper failed to resolve input data')
            return _rescheduleWithAccounting('Input Data Resolution')
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception('JobWrapper raised exception while resolving input data', lException=exc)
            return _rescheduleWithAccounting('Input Data Resolution')

    gJobReport.commit()

    # --- Payload execution --------------------------------------------------
    try:
        execution = job.execute(arguments)
        if not execution['OK']:
            gLogger.error('Failed to execute job', execution['Message'])
            raise JobWrapperError((execution['Message'], execution['Errno']))
    except JobWrapperError as exc:
        if exc.value[1] == 0 or str(exc.value[0]) == '0':
            gLogger.verbose('JobWrapper exited with status=0 after execution')
        if exc.value[1] == DErrno.EWMSRESC:
            gLogger.warn("Asked to reschedule job")
            return _rescheduleWithAccounting('JobWrapper execution')
        gLogger.exception('Job failed in execution phase')
        return _markFailed('Exception During Execution', exc, 1)
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('Job raised exception during execution phase', lException=exc)
        return _markFailed('Exception During Execution', exc, 1)

    # --- Outputs ------------------------------------------------------------
    if 'OutputSandbox' in jobDescription or 'OutputData' in jobDescription:
        try:
            outputs = job.processJobOutputs()
            if not outputs['OK']:
                gLogger.warn(outputs['Message'])
                raise JobWrapperError(outputs['Message'])
        except JobWrapperError as exc:
            gLogger.exception('JobWrapper failed to process output files')
            return _markFailed('Uploading Job Outputs', exc, 2)
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception('JobWrapper raised exception while processing output files', lException=exc)
            return _markFailed('Uploading Job Outputs', exc, 2)
    else:
        gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

    # --- Finalization -------------------------------------------------------
    try:
        # Failed jobs will return 1 / successful jobs will return 0
        return job.finalize()
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('JobWrapper raised exception during the finalization phase', lException=exc)
        return 2
def execute(self):
    """Run one JobAgent cycle: request a job, prepare it and submit it to the CE.

    After the first job has been picked up the cycle only continues when
    filling mode is enabled, no time-left error was recorded and more than
    ``self.minimumTimeLeft`` is left; the remaining CPU time is then published
    to the CE and written to the local configuration file. The CE description
    (plus pilot information and ``/AgentJobRequirements``) is sent to the
    matcher; the matched job is validated, its JDL parameters extracted, the
    owner proxy set up, the software checked and the job submitted. Finally
    the CPU time left and the scaled CPU time are updated.

    :return: whatever ``self.__finish`` / ``self.__rescheduleFailedJob``
        return on the early-exit paths, the failed ``getDescription`` result,
        ``S_ERROR`` when the pilot version does not match, ``S_OK`` with the
        matcher message when no job was obtained, and
        ``S_OK('Job Agent cycle complete')`` after a job was processed.
    """
    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if not self.fillingMode:
            return self.__finish('Filling Mode is Disabled')
        if self.timeLeftError:
            self.log.warn(self.timeLeftError)
            return self.__finish(self.timeLeftError)
        self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
        if self.timeLeft <= self.minimumTimeLeft:
            return self.__finish('No more time left')
        # Need to update the Configuration so that the new value is published
        # in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
            return self.__finish(result['Message'])

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
            localConfigFile = os.path.join('.', self.extraOptions)
        else:
            localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
            localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)

    self.log.verbose('Job Agent execution loop')
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
        self.log.info('Resource is not available')
        self.log.info(available['Message'])
        return self.__finish('CE Not Available')

    self.log.info(available['Message'])

    result = self.computingElement.getDescription()
    if not result['OK']:
        return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
        ceDict.update(result['Value'])

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches',
                                                    self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        if re.search('No match found', jobRequest['Message']):
            self.log.notice('Job request OK: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles'
                                     % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("seconds timeout") != -1:
            self.log.error('Timeout while requesting job', jobRequest['Message'])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles'
                                     % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("Pilot version does not match") != -1:
            # A version mismatch is not counted as a failed match: stop right away.
            errorMsg = 'Pilot version does not match the production version'
            self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, ''))
            return S_ERROR(jobRequest['Message'])
        else:
            self.log.notice('Failed to get jobs: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles'
                                     % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the matcher returned besides the mandatory parameters is
    # handed over to the job submission untouched.
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn(parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    else:
        jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    if 'CPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')

    if self.extraOptions:
        # A JDL without Arguments used to raise KeyError here, outside of any
        # error handling; treat the missing entry as an empty argument string.
        params['Arguments'] = params.get('Arguments', '') + ' ' + self.extraOptions
        params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n %s' % (jobRequest['Value']))
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for boincParam in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']:
                jobReport.setJobParameter(boincParam,
                                          gConfig.getValue('/LocalSite/%s' % boincParam, 'Unknown'),
                                          sendFlag=False)

        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self.__rescheduleFailedJob(jobID, result['Message'],
                                              self.stopOnApplicationFailure)
        # Always bind proxyChain: it used to stay undefined when the proxy
        # result carried no value, which made the submit call below fail
        # with a NameError hidden by the broad except.
        proxyChain = result.get('Value')

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job', '%s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.debug('Before %sCE submitJob()' % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
        if not submission['OK']:
            self.__report(jobID, 'Failed', submission['Message'])
            return self.__finish(submission['Message'])
        elif 'PayloadFailed' in submission:
            # Do not keep running and do not overwrite the Payload error
            message = 'Payload execution failed with error code %s' % submission['PayloadFailed']
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            else:
                self.log.info(message)

        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception:
        self.log.exception()
        return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception',
                                          self.stopOnApplicationFailure)

    # CPU consumed by this process and its children since self.initTimes was taken.
    currentTimes = [now - initial for now, initial in zip(os.times(), self.initTimes)]
    utime, stime, cutime, cstime, _elapsed = currentTimes
    cpuTime = utime + stime + cutime + cstime

    result = self.timeLeftUtil.getTimeLeft(cpuTime)
    if result['OK']:
        self.timeLeft = result['Value']
    elif result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
    elif self.cpuFactor:
        # if the batch system is not defined used the CPUNormalizationFactor
        # defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    # Report only the scaled CPU consumed since the previous cycle.
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']
    self.__setJobParam(jobID, 'ScaledCPUTime', str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime

    return S_OK('Job Agent cycle complete')
def execute(self):
    """Run one JobAgent cycle: request a job, prepare it and submit it to the CE.

    After the first job has been picked up the cycle only continues when
    filling mode is enabled and no time-left error was recorded; the remaining
    CPU time is then published to the CE and written to the local
    configuration file. The CE description (plus pilot information and
    ``/AgentJobRequirements``) is sent to the matcher; the matched job is
    validated, its JDL parameters extracted, the owner proxy set up, the
    software checked and the job submitted. Finally the CPU time left and the
    scaled CPU time are updated.

    :return: whatever ``self.__finish`` / ``self.__rescheduleFailedJob``
        return on the early-exit paths, the failed ``getDescription`` result,
        ``S_ERROR`` when the pilot version does not match, ``S_OK`` with the
        matcher message when no job was obtained, and
        ``S_OK('Job Agent cycle complete')`` after a job was processed.
    """
    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if not self.fillingMode:
            return self.__finish('Filling Mode is Disabled')
        if self.timeLeftError:
            self.log.warn(self.timeLeftError)
            return self.__finish(self.timeLeftError)
        self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
        # Need to update the Configuration so that the new value is published
        # in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
            return self.__finish(result['Message'])

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
            localConfigFile = os.path.join('.', self.extraOptions)
        else:
            localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
            localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)

    self.log.verbose('Job Agent execution loop')
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
        self.log.info('Resource is not available')
        self.log.info(available['Message'])
        return self.__finish('CE Not Available')

    self.log.info(available['Message'])

    result = self.computingElement.getDescription()
    if not result['OK']:
        return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
        ceDict.update(result['Value'])

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches',
                                                    self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        if re.search('No match found', jobRequest['Message']):
            self.log.notice('Job request OK: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles'
                                     % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("seconds timeout") != -1:
            self.log.error(jobRequest['Message'])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles'
                                     % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("Pilot version does not match") != -1:
            # A version mismatch is not counted as a failed match: stop right away.
            self.log.error(jobRequest['Message'])
            return S_ERROR(jobRequest['Message'])
        else:
            self.log.notice('Failed to get jobs: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles'
                                     % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the matcher returned besides the mandatory parameters is
    # handed over to the job submission untouched.
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn(parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    else:
        jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    if 'CPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')

    if self.extraOptions:
        # A JDL without Arguments used to raise KeyError here, outside of any
        # error handling; treat the missing entry as an empty argument string.
        params['Arguments'] = params.get('Arguments', '') + ' ' + self.extraOptions
        params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n %s' % (jobRequest['Value']))
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for boincParam in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']:
                jobReport.setJobParameter(boincParam,
                                          gConfig.getValue('/LocalSite/%s' % boincParam, 'Unknown'),
                                          sendFlag=False)

        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self.__rescheduleFailedJob(jobID, result['Message'],
                                              self.stopOnApplicationFailure)
        # Always bind proxyChain: it used to stay undefined when the proxy
        # result carried no value, which made the submit call below fail
        # with a NameError hidden by the broad except.
        proxyChain = result.get('Value')

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job %s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.debug('Before %sCE submitJob()' % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
        if not submission['OK']:
            self.__report(jobID, 'Failed', submission['Message'])
            return self.__finish(submission['Message'])
        elif 'PayloadFailed' in submission:
            # Do not keep running and do not overwrite the Payload error
            return self.__finish('Payload execution failed with error code %s'
                                 % submission['PayloadFailed'],
                                 self.stopOnApplicationFailure)

        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception:
        self.log.exception()
        return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception',
                                          self.stopOnApplicationFailure)

    # CPU consumed by this process and its children since self.initTimes was taken.
    currentTimes = [now - initial for now, initial in zip(os.times(), self.initTimes)]
    utime, stime, cutime, cstime, _elapsed = currentTimes
    cpuTime = utime + stime + cutime + cstime

    result = self.timeLeftUtil.getTimeLeft(cpuTime)
    if result['OK']:
        self.timeLeft = result['Value']
    elif result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
    elif self.cpuFactor:
        # if the batch system is not defined used the CPUNormalizationFactor
        # defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    # Report only the scaled CPU consumed since the previous cycle.
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']
    self.__setJobParam(jobID, 'ScaledCPUTime', str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime

    return S_OK('Job Agent cycle complete')
# Shell check run on the converter log: succeeds only when the number printed
# on the 'writing ... MC events' line is greater than zero.
_CONVERTER_LOG_CHECK = """#! /bin/sh
MCevts=$(grep writing %s | grep "MC events" | awk '{print $2}')
if [ $MCevts -gt 0 ]; then
    exit 0
else
    echo "MCevts is zero"
    exit -1
fi
"""

# Shell check run on the evndisp log: fails on any line matching 'error'
# (case-insensitive) and otherwise requires the final-check marker line.
_EVNDISP_LOG_CHECK = """#! /bin/sh
if grep -i "error" %s; then
    exit 1
fi
if grep "Final checks on result file (seems to be OK):" %s; then
    exit 0
else
    exit 1
fi
"""


def _readParFileArgs( parFileName ):
  """ Return the whitespace-separated words of parFileName as a list, closing the file afterwards.
  """
  with open( parFileName, 'r' ) as parFile:
    return parFile.read().split()


def _runLogCheck( scriptBody ):
  """ Write scriptBody to ./check_log.sh, make it executable, run it and return the os.system() status.
  """
  with open( 'check_log.sh', 'w' ) as scriptFile:
    scriptFile.write( scriptBody )
  os.system( 'chmod u+x check_log.sh' )
  cmd = './check_log.sh'
  DIRAC.gLogger.notice( 'Executing system call:', cmd )
  return os.system( cmd )


def _shellOrExit( cmd ):
  """ Run cmd with os.system(); on a non-zero status log the command and call DIRAC.exit( -1 ).
  """
  if os.system( cmd ):
    DIRAC.gLogger.error( 'System call failed:', cmd )
    DIRAC.exit( -1 )


def main():
  """ Run the EvnDisplay converter and, unless disabled, evndisp stage 1 for every requested layout.

  Inputs, as read by this function:
    os.environ['JOBID']  job identifier (converted to int)
    sys.argv[3]          software version, used to build the package path
    sys.argv[5]          name of the converter executable
    sys.argv[7]          'True' to download the trigger mask file and pass it with -t
    sys.argv[9]          layout list specification, handed to parseLayoutList
    sys.argv[11]         mode; 'convert_standalone' skips the evndisp stage
    sys.argv[-1]         string containing 'ParametricParameters={lfn1,lfn2,...}'
  The files 'converter.par' and 'evndisp.par' are read from the current
  directory and supply extra command line words for the two executables.

  The function ends by calling DIRAC.exit(); every detected failure calls
  DIRAC.exit( -1 ) instead, after setting an application status where one is
  defined for that failure.
  """
  from DIRAC.Core.Base import Script
  Script.initialize()

  DIRAC.gLogger.notice( 'Platform is:' )
  os.system( 'dirac-platform' )

  from DIRAC.DataManagementSystem.Client.DataManager import DataManager
  from CTADIRAC.Core.Workflow.Modules.EvnDispApp import EvnDispApp
  from CTADIRAC.Core.Utilities.SoftwareInstallation import ( checkSoftwarePackage,
                                                            installSoftwarePackage,
                                                            installSoftwareEnviron,
                                                            sharedArea,
                                                            workingArea )
  from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

  jobID = int( os.environ['JOBID'] )
  jobReport = JobReport( jobID )

  version = sys.argv[3]
  DIRAC.gLogger.notice( 'Version:', version )

  EvnDispPack = os.path.join( 'evndisplay', version, 'evndisplay' )
  packs = [EvnDispPack]

  for package in packs:
    DIRAC.gLogger.notice( 'Checking:', package )
    if checkSoftwarePackage( package, sharedArea() )['OK']:
      DIRAC.gLogger.notice( 'Package found in Shared Area:', package )
      installSoftwareEnviron( package, sharedArea() )
      continue
    # Not available in the shared area: install into the working area and
    # honour the outcome, so that a failed installation stops the job here
    # instead of surfacing later as a missing executable.
    # NOTE(review): assumes installSoftwarePackage returns the usual DIRAC
    # {'OK': ...} result dictionary, like checkSoftwarePackage - confirm.
    if installSoftwarePackage( package, workingArea() )['OK']:
      DIRAC.gLogger.notice( 'Package found in workingArea:', package )
      continue
    DIRAC.gLogger.error( 'Check Failed for software package:', package )
    DIRAC.gLogger.error( 'Software package not available')
    DIRAC.exit( -1 )

  ed = EvnDispApp()
  ed.setSoftwarePackage( EvnDispPack )

  # Use of the trigger mask file
  usetrgfile = sys.argv[7]
  DIRAC.gLogger.notice( 'Usetrgfile:', usetrgfile )

  # Multiple inputs per job: the LFNs are the comma separated content of the
  # braces following 'ParametricParameters=' in the last argument.
  simtelFileLFNList = sys.argv[-1].split( 'ParametricParameters={' )[1].split( '}' )[0].replace( ',', ' ' )
  # The first LFN is the one the trigger mask file name is derived from
  simtelFileLFN = simtelFileLFNList.split( ' ' )[0]
  # Local file names (basenames) of all the inputs
  simtelFileList = [os.path.basename( lfn ) for lfn in simtelFileLFNList.split()]

  layoutList = parseLayoutList( sys.argv[9] )

  for layout in layoutList:
    # The bare layout name is used in every output and log file name, so that
    # a layout given with a directory part cannot produce an invalid target.
    layoutName = os.path.basename( layout )
    args = []

    # Download the trigger mask file
    # NOTE(review): this is repeated for every layout; presumably the copy
    # fetched on the first pass is still in the working directory - confirm
    # before hoisting it out of the loop.
    if usetrgfile == 'True':
      trgmaskFileLFN = simtelFileLFN.replace( 'simtel.gz', 'trgmask.gz' )
      DIRAC.gLogger.notice( 'Trying to download the trgmask File', trgmaskFileLFN )
      result = DataManager().getFile( trgmaskFileLFN )
      if not result['OK']:
        DIRAC.gLogger.error( 'Failed to download trgmakfile:', result )
        jobReport.setApplicationStatus( 'Trgmakfile download Error' )
        DIRAC.exit( -1 )
      args.extend( ['-t', os.path.basename( trgmaskFileLFN )] )

    # Execute the evndisplay converter
    executable = sys.argv[5]

    # DST file name: particle and zero padded run number come from the last input file
    lastSimtelFile = simtelFileList[-1]
    run_number = lastSimtelFile.split( 'run' )[1].split( '.simtel.gz' )[0]
    subRunNumber = '%06d' % int( run_number )
    particle = lastSimtelFile.split( '_' )[0]
    if 'ptsrc' in lastSimtelFile:
      particle = particle + '_' + 'ptsrc'
    dstfile = particle + '_run' + subRunNumber + '_' + str( jobID ) + '_' + layoutName + '_dst.root'

    logfileName = executable + '_' + layoutName + '.log'
    layoutPath = os.path.join( 'EVNDISP.CTA.runparameter/DetectorGeometry', layout )
    DIRAC.gLogger.notice( 'Layout is:', layoutPath )

    # Other arguments for the converter specified by the user
    args.extend( _readParFileArgs( 'converter.par' ) )
    args.extend( ['-a', layoutPath] )
    args.extend( ['-o', dstfile] )
    args.extend( simtelFileList )

    execute_module( ed, executable, args )

    # Check the existence of the DST file
    if not os.path.isfile( dstfile ):
      DIRAC.gLogger.error( 'DST file Missing:', dstfile )
      jobReport.setApplicationStatus( 'DST file Missing' )
      DIRAC.exit( -1 )

    # Quality check on the converter log
    _shellOrExit( 'mv ' + executable + '.log' + ' ' + logfileName )
    if _runLogCheck( _CONVERTER_LOG_CHECK % ( logfileName ) ):
      jobReport.setApplicationStatus( 'Converter Log Check Failed' )
      DIRAC.exit( -1 )

    # In standalone conversion mode the DST file is kept and stage 1 is skipped
    mode = sys.argv[11]
    if mode == 'convert_standalone':
      continue

    # Execute evndisplay stage 1
    executable = 'evndisp'
    logfileName = executable + '_' + layoutName + '.log'
    args = ['-sourcefile', dstfile, '-outputdirectory', 'outdir']
    # Other arguments for evndisp specified by the user
    args.extend( _readParFileArgs( 'evndisp.par' ) )

    execute_module( ed, executable, args )

    # Move the stage 1 outputs to the working directory, tagged with job id and layout
    for name in glob.glob( 'outdir/*.root' ):
      evndispOutFile = name.split( '.root' )[0] + '_' + str( jobID ) + '_' + layoutName + '_evndisp.root'
      _shellOrExit( 'mv ' + name + ' ' + os.path.basename( evndispOutFile ) )

    # Quality check on the evndisp log
    _shellOrExit( 'mv ' + executable + '.log' + ' ' + logfileName )
    if _runLogCheck( _EVNDISP_LOG_CHECK % ( logfileName, logfileName ) ):
      jobReport.setApplicationStatus( 'EvnDisp Log Check Failed' )
      DIRAC.exit( -1 )

    # Remove the converted DST file, which is only an intermediate product here
    _shellOrExit( 'rm ' + dstfile )

  DIRAC.exit()