def execute(self):
    """The JobAgent execution method.

    Runs one matching/submission cycle:

    1. bail out if the operator drain file is present;
    2. check slot availability on the local ComputingElement;
    3. (after the first job) recompute and publish the remaining CPU work;
    4. build the CE description dict(s) and ask the Matcher for a job;
    5. on a match: set up the owner proxy, install software if needed,
       and submit the payload via ``self._submitJob``.

    Returns an S_OK/S_ERROR-style result dict; ``self._finish`` is used to
    terminate the agent cycle with a message.
    """
    # Temporary mechanism to pass a shutdown message to the agent
    if os.path.exists("/var/lib/dirac_drain"):
        return self._finish("Node is being drained by an operator")

    self.log.verbose("Job Agent execution loop")

    # Check that there is enough slots to match a job
    result = self._checkCEAvailability(self.computingElement)
    if not result["OK"]:
        return self._finish(result["Message"])
    if result["OK"] and result["Value"]:
        # A non-empty Value means there is nothing to do this cycle
        # (e.g. jobs already running) — presumably; confirm in _checkCEAvailability
        return result

    # Check that we are allowed to continue and that time left is sufficient
    if self.jobCount:
        cpuWorkLeft = self._computeCPUWorkLeft()
        result = self._checkCPUWorkLeft(cpuWorkLeft)
        if not result["OK"]:
            return result
        result = self._setCPUWorkLeft(cpuWorkLeft)
        if not result["OK"]:
            return result

    # Get environment details and enhance them
    result = self._getCEDict(self.computingElement)
    if not result["OK"]:
        return result
    ceDictList = result["Value"]
    for ceDict in ceDictList:
        self._setCEDict(ceDict)

    # Try to match a job
    jobRequest = self._matchAJob(ceDictList)

    self.stopAfterFailedMatches = self.am_getOption(
        "StopAfterFailedMatches", self.stopAfterFailedMatches)

    if not jobRequest["OK"]:
        res = self._checkMatchingIssues(jobRequest)
        if not res["OK"]:
            self._finish(res["Message"])
            return res

        # if we don't match a job, independently from the reason,
        # we wait a bit longer before trying again
        time.sleep(
            int(self.am_getOption("PollingTime")) * (self.matchFailedCount + 1) * 2)

        return res

    # If we are, we matched a job
    # Reset the Counter
    self.matchFailedCount = 0

    # Check matcher information returned
    matcherParams = ["JDL", "DN", "Group"]
    matcherInfo = jobRequest["Value"]
    jobID = matcherInfo["JobID"]
    jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
    result = self._checkMatcherInfo(matcherInfo, matcherParams, jobReport)
    if not result["OK"]:
        return self._finish(result["Message"])

    # Get matcher information
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get(
            "PilotInfoReportedFlag", False)

    jobJDL = matcherInfo["JDL"]
    jobGroup = matcherInfo["Group"]
    ownerDN = matcherInfo["DN"]
    ceDict = matcherInfo["CEDict"]
    matchTime = matcherInfo["matchTime"]

    # Everything returned by the Matcher beyond the mandatory keys is
    # forwarded to the submission step as optimizer parameters
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    # Get JDL paramters
    parameters = self._getJDLParameters(jobJDL)
    if not parameters["OK"]:
        jobReport.setJobStatus(
            status=JobStatus.FAILED,
            minorStatus="Could Not Extract JDL Parameters")
        self.log.warn("Could Not Extract JDL Parameters", parameters["Message"])
        return self._finish("JDL Problem")
    params = parameters["Value"]

    result = self._extractValuesFromJobParams(params, jobReport)
    if not result["OK"]:
        return self._finish(result["Value"])
    submissionParams = result["Value"]
    jobID = submissionParams["jobID"]
    jobType = submissionParams["jobType"]

    self.log.verbose("Job request successful: \n", jobRequest["Value"])
    self.log.info(
        "Received",
        "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" %
        (jobID, jobType, ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport.setJobParameter(par_name="MatcherServiceTime",
                                  par_value=str(matchTime),
                                  sendFlag=False)
        if "BOINC_JOB_ID" in os.environ:
            # Report BOINC environment
            for thisp in ("BoincUserID", "BoincHostID",
                          "BoincHostPlatform", "BoincHostName"):
                jobReport.setJobParameter(par_name=thisp,
                                          par_value=gConfig.getValue(
                                              "/LocalSite/%s" % thisp, "Unknown"),
                                          sendFlag=False)

        jobReport.setJobStatus(minorStatus="Job Received by Agent",
                               sendFlag=False)
        result_setupProxy = self._setupProxy(ownerDN, jobGroup)
        if not result_setupProxy["OK"]:
            result = self._rescheduleFailedJob(
                jobID, result_setupProxy["Message"])
            return self._finish(result["Message"],
                                self.stopOnApplicationFailure)
        proxyChain = result_setupProxy.get("Value")

        # Save the job jdl for external monitoring
        self._saveJobJDLRequest(jobID, jobJDL)

        # Check software and install them if required
        software = self._checkInstallSoftware(jobID, params, ceDict, jobReport)
        if not software["OK"]:
            self.log.error("Failed to install software for job",
                           "%s" % (jobID))
            errorMsg = software["Message"]
            if not errorMsg:
                errorMsg = "Failed software installation"
            result = self._rescheduleFailedJob(jobID, errorMsg)
            return self._finish(result["Message"],
                                self.stopOnApplicationFailure)

        gridCE = gConfig.getValue("/LocalSite/GridCE", "")
        if gridCE:
            jobReport.setJobParameter(par_name="GridCE",
                                      par_value=gridCE,
                                      sendFlag=False)

        queue = gConfig.getValue("/LocalSite/CEQueue", "")
        if queue:
            jobReport.setJobParameter(par_name="CEQueue",
                                      par_value=queue,
                                      sendFlag=False)

        self.log.debug("Before self._submitJob() (%sCE)" % (self.ceName))
        result_submitJob = self._submitJob(
            jobID=jobID,
            jobParams=params,
            resourceParams=ceDict,
            optimizerParams=optimizerParams,
            proxyChain=proxyChain,
            jobReport=jobReport,
            processors=submissionParams["processors"],
            wholeNode=submissionParams["wholeNode"],
            maxNumberOfProcessors=submissionParams["maxNumberOfProcessors"],
            mpTag=submissionParams["mpTag"],
        )

        # Committing the JobReport before evaluating the result of job submission
        res = jobReport.commit()
        if not res["OK"]:
            resFD = jobReport.generateForwardDISET()
            if not resFD["OK"]:
                self.log.error("Error generating ForwardDISET operation",
                               resFD["Message"])
            elif resFD["Value"]:
                # Here we create the Request.
                op = resFD["Value"]
                request = Request()
                requestName = "jobAgent_%s" % jobID
                request.RequestName = requestName.replace('"', "")
                request.JobID = jobID
                request.SourceComponent = "JobAgent_%s" % jobID
                request.addOperation(op)
                # This might fail, but only a message would be printed.
                self._sendFailoverRequest(request)

        if not result_submitJob["OK"]:
            return self._finish(result_submitJob["Message"])
        elif "PayloadFailed" in result_submitJob:
            # Do not keep running and do not overwrite the Payload error
            message = "Payload execution failed with error code %s" % result_submitJob[
                "PayloadFailed"]
            if self.stopOnApplicationFailure:
                return self._finish(message, self.stopOnApplicationFailure)
            else:
                self.log.info(message)

        self.log.debug("After %sCE submitJob()" % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
        self.log.exception("Exception in submission", "",
                           lException=subExcept, lExcInfo=True)
        result = self._rescheduleFailedJob(
            jobID, "Job processing failed with exception", direct=True)
        return self._finish(result["Message"], self.stopOnApplicationFailure)

    return S_OK("Job Agent cycle complete")
def execute(arguments):
    """The only real function executed here.

    Drives a single job through the JobWrapper phases, in order:
    working-directory setup, wrapper initialization, input sandbox download,
    input data resolution, payload execution, output processing, finalization.

    :param arguments: dict; must contain ``arguments['Job']['JobID']`` and may
        contain ``'WorkingDirectory'`` plus ``'InputSandbox'``, ``'InputData'``,
        ``'OutputSandbox'``/``'OutputData'`` under ``arguments['Job']``
    :return: 1 on failures up to and including execution, 2 on output-processing
        or finalization failures, otherwise whatever ``job.finalize()`` returns
    """
    global gJobReport

    jobID = arguments['Job']['JobID']
    # JOBID must be exported before conversion: os.environ only takes strings
    os.environ['JOBID'] = jobID
    jobID = int(jobID)

    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                # this will raise an exception if wdir already exists (which is ~OK)
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except OSError as osError:
                if osError.errno == errno.EEXIST and os.path.isdir(wdir):
                    gLogger.exception(
                        'JobWrapperTemplate found that the working directory already exists')
                    rescheduleResult = rescheduleFailedJob(
                        jobID, 'Working Directory already exists')
                else:
                    gLogger.exception(
                        'JobWrapperTemplate could not create working directory')
                    rescheduleResult = rescheduleFailedJob(
                        jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('JobWrapper failed the initialization phase',
                          lException=exc)
        rescheduleResult = rescheduleFailedJob(
            jobID, 'Job Wrapper Initialization', gJobReport)
        try:
            job.sendJobAccounting(rescheduleResult, 'Job Wrapper Initialization')
        except Exception as exc:  # pylint: disable=broad-except
            # Accounting failure must not mask the initialization failure
            gLogger.exception('JobWrapper failed sending job accounting',
                              lException=exc)
        return 1

    if 'InputSandbox' in arguments['Job']:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(
                jobID, 'Input Sandbox Download', gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while downloading input sandbox',
                lException=exc)
            rescheduleResult = rescheduleFailedJob(
                jobID, 'Input Sandbox Download', gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if 'InputData' in arguments['Job']:
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except JobWrapperError:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
                return 1
            except Exception as exc:  # pylint: disable=broad-except
                gLogger.exception(
                    'JobWrapper raised exception while resolving input data',
                    lException=exc)
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
    else:
        gLogger.verbose('Job has no InputData requirement')

    gJobReport.commit()

    try:
        result = job.execute(arguments)
        if not result['OK']:
            gLogger.error('Failed to execute job', result['Message'])
            raise JobWrapperError((result['Message'], result['Errno']))
    except JobWrapperError as exc:
        # exc.value is the (message, errno) tuple raised above
        if exc.value[1] == 0 or str(exc.value[0]) == '0':
            gLogger.verbose('JobWrapper exited with status=0 after execution')
        if exc.value[1] == DErrno.EWMSRESC:
            # Special error code: the WMS asked for a reschedule, not a failure
            gLogger.warn("Asked to reschedule job")
            rescheduleResult = rescheduleFailedJob(
                jobID, 'JobWrapper execution', gJobReport)
            job.sendJobAccounting(rescheduleResult, 'JobWrapper execution')
            return 1
        gLogger.exception('Job failed in execution phase')
        gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
        gJobReport.setJobStatus('Failed', 'Exception During Execution',
                                sendFlag=False)
        job.sendFailoverRequest('Failed', 'Exception During Execution')
        return 1
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('Job raised exception during execution phase',
                          lException=exc)
        gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
        gJobReport.setJobStatus('Failed', 'Exception During Execution',
                                sendFlag=False)
        job.sendFailoverRequest('Failed', 'Exception During Execution')
        return 1

    if 'OutputSandbox' in arguments['Job'] or 'OutputData' in arguments['Job']:
        try:
            result = job.processJobOutputs()
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError as exc:
            gLogger.exception('JobWrapper failed to process output files')
            gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
            gJobReport.setJobStatus('Failed', 'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while processing output files',
                lException=exc)
            gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
            gJobReport.setJobStatus('Failed', 'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
    else:
        gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

    try:
        # Failed jobs will return 1 / successful jobs will return 0
        return job.finalize()
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception(
            'JobWrapper raised exception during the finalization phase',
            lException=exc)
        return 2
def execute(arguments):
    """The only real function executed here.

    Variant of the JobWrapperTemplate driver: runs working-directory setup,
    wrapper initialization, input sandbox download, input data resolution,
    payload execution, output processing (note: ``processJobOutputs`` is
    called WITH ``arguments`` in this variant) and finalization.

    :param arguments: dict; must contain ``arguments['Job']['JobID']`` and may
        contain ``'WorkingDirectory'`` plus ``'InputSandbox'``, ``'InputData'``,
        ``'OutputSandbox'``/``'OutputData'`` under ``arguments['Job']``
    :return: 1 on failures up to and including execution, 2 on output-processing
        or finalization failures, otherwise whatever ``job.finalize()`` returns
    """
    global gJobReport

    jobID = arguments['Job']['JobID']
    # JOBID must be exported before conversion: os.environ only takes strings
    os.environ['JOBID'] = jobID
    jobID = int(jobID)

    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                # this will raise an exception if wdir already exists (which is ~OK)
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except OSError as osError:
                if osError.errno == errno.EEXIST and os.path.isdir(wdir):
                    gLogger.exception(
                        'JobWrapperTemplate found that the working directory already exists')
                    rescheduleResult = rescheduleFailedJob(
                        jobID, 'Working Directory already exists')
                else:
                    gLogger.exception(
                        'JobWrapperTemplate could not create working directory')
                    rescheduleResult = rescheduleFailedJob(
                        jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('JobWrapper failed the initialization phase',
                          lException=exc)
        rescheduleResult = rescheduleFailedJob(
            jobID, 'Job Wrapper Initialization', gJobReport)
        try:
            job.sendJobAccounting(rescheduleResult, 'Job Wrapper Initialization')
        except Exception as exc:  # pylint: disable=broad-except
            # Accounting failure must not mask the initialization failure
            gLogger.exception('JobWrapper failed sending job accounting',
                              lException=exc)
        return 1

    if 'InputSandbox' in arguments['Job']:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(
                jobID, 'Input Sandbox Download', gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while downloading input sandbox',
                lException=exc)
            rescheduleResult = rescheduleFailedJob(
                jobID, 'Input Sandbox Download', gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if 'InputData' in arguments['Job']:
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except JobWrapperError:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
                return 1
            except Exception as exc:  # pylint: disable=broad-except
                gLogger.exception(
                    'JobWrapper raised exception while resolving input data',
                    lException=exc)
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
    else:
        gLogger.verbose('Job has no InputData requirement')

    gJobReport.commit()

    try:
        result = job.execute(arguments)
        if not result['OK']:
            gLogger.error('Failed to execute job', result['Message'])
            raise JobWrapperError((result['Message'], result['Errno']))
    except JobWrapperError as exc:
        # exc.value is the (message, errno) tuple raised above
        if exc.value[1] == 0 or str(exc.value[0]) == '0':
            gLogger.verbose('JobWrapper exited with status=0 after execution')
        if exc.value[1] == DErrno.EWMSRESC:
            # Special error code: the WMS asked for a reschedule, not a failure
            gLogger.warn("Asked to reschedule job")
            rescheduleResult = rescheduleFailedJob(
                jobID, 'JobWrapper execution', gJobReport)
            job.sendJobAccounting(rescheduleResult, 'JobWrapper execution')
            return 1
        gLogger.exception('Job failed in execution phase')
        gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
        gJobReport.setJobStatus('Failed', 'Exception During Execution',
                                sendFlag=False)
        job.sendFailoverRequest('Failed', 'Exception During Execution')
        return 1
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('Job raised exception during execution phase',
                          lException=exc)
        gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
        gJobReport.setJobStatus('Failed', 'Exception During Execution',
                                sendFlag=False)
        job.sendFailoverRequest('Failed', 'Exception During Execution')
        return 1

    if 'OutputSandbox' in arguments['Job'] or 'OutputData' in arguments['Job']:
        try:
            result = job.processJobOutputs(arguments)
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError as exc:
            gLogger.exception('JobWrapper failed to process output files')
            gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
            gJobReport.setJobStatus('Failed', 'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while processing output files',
                lException=exc)
            gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
            gJobReport.setJobStatus('Failed', 'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
    else:
        gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

    try:
        # Failed jobs will return 1 / successful jobs will return 0
        return job.finalize()
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception(
            'JobWrapper raised exception during the finalization phase',
            lException=exc)
        return 2
def execute(arguments):
    """Run the early JobWrapper phases for one job.

    Sets up the working directory, initializes the JobWrapper, downloads the
    input sandbox and resolves input data. (This variant only covers the
    phases visible here; later phases are handled elsewhere.)

    :param arguments: dict; must contain ``arguments['Job']['JobID']`` and may
        contain ``'WorkingDirectory'`` plus ``'InputSandbox'``/``'InputData'``
        under ``arguments['Job']``
    :return: 1 on failure of any phase
    """
    global gJobReport

    jobID = arguments['Job']['JobID']
    # JOBID must be exported before conversion: os.environ only takes strings
    os.environ['JOBID'] = jobID
    jobID = int(jobID)

    # Fix in the environment to get a reasonable performance from dCache,
    # until we move to a new version of root
    # os.environ['DCACHE_RAHEAD'] = str(1)
    # os.environ['DCACHE_RA_BUFFER'] = str(50*1024)

    # NOTE: was arguments.has_key(...) — Python 2 only, removed in Python 3
    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception('JobWrapperTemplate could not create working directory')
                rescheduleFailedJob(jobID, 'Could Not Create Working Directory')
                return 1

    # root = arguments['CE']['Root']
    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the initialization phase')
        rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport)
        job.sendWMSAccounting('Failed', 'Job Wrapper Initialization')
        return 1

    if 'InputSandbox' in arguments['Job']:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleFailedJob(jobID, 'Input Sandbox Download')
            job.sendWMSAccounting('Failed', 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if 'InputData' in arguments['Job']:
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            # was "except Exception, x" — Python 2 syntax (SyntaxError on
            # Python 3); the bound name was unused, so it is dropped
            except Exception:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleFailedJob(jobID, 'Input Data Resolution')
                job.sendWMSAccounting('Failed', 'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
def execute(arguments):
    """Run the early JobWrapper phases for one job (sendJobAccounting variant).

    Sets up the working directory, initializes the JobWrapper, downloads the
    input sandbox and resolves input data, reporting failures through
    ``rescheduleFailedJob`` + ``job.sendJobAccounting``. (This variant only
    covers the phases visible here; later phases are handled elsewhere.)

    :param arguments: dict; must contain ``arguments['Job']['JobID']`` and may
        contain ``'WorkingDirectory'`` plus ``'InputSandbox'``/``'InputData'``
        under ``arguments['Job']``
    :return: 1 on failure of any phase
    """
    global gJobReport

    jobID = arguments['Job']['JobID']
    # JOBID must be exported before conversion: os.environ only takes strings
    os.environ['JOBID'] = jobID
    jobID = int(jobID)

    # NOTE: was arguments.has_key(...) — Python 2 only, removed in Python 3
    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception('JobWrapperTemplate could not create working directory')
                rescheduleResult = rescheduleFailedJob(jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the initialization phase')
        rescheduleResult = rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport)
        job.sendJobAccounting(rescheduleResult, 'Job Wrapper Initialization')
        return 1

    if 'InputSandbox' in arguments['Job']:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID, 'Input Sandbox Download', gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if 'InputData' in arguments['Job']:
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            # was "except Exception, x" — Python 2 syntax (SyntaxError on
            # Python 3); the bound name was unused, so it is dropped
            except Exception:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
def execute(self):
    """The JobAgent execution method.

    Push variant: iterates over the configured queues in random order and,
    for each usable queue, obtains a pilot proxy, checks slot availability
    on the remote CE, then keeps matching and submitting jobs to that CE
    (inner ``while`` loop) until matching fails or slots run out. Failures
    on a queue increment ``self.failedQueues[queueName]`` and move on.

    Returns an S_OK/S_ERROR-style result dict.
    """
    self.log.verbose("Job Agent execution loop")

    # Shuffle so that over many cycles no queue is systematically favoured
    queueDictItems = list(self.queueDict.items())
    random.shuffle(queueDictItems)

    # Check that there is enough slots locally
    result = self._checkCEAvailability(self.computingElement)
    if not result["OK"] or result["Value"]:
        return result

    for queueName, queueDictionary in queueDictItems:
        # Make sure there is no problem with the queue before trying to submit
        if not self._allowedToSubmit(queueName):
            continue

        # Get a working proxy
        ce = queueDictionary["CE"]
        cpuTime = 86400 * 3  # request a proxy valid for 3 days
        self.log.verbose(
            "Getting pilot proxy",
            "for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(
            self.pilotDN, self.pilotGroup, cpuTime)
        if not result["OK"]:
            return result
        proxy = result["Value"]
        result = proxy.getRemainingSecs()  # pylint: disable=no-member
        if not result["OK"]:
            return result
        lifetime_secs = result["Value"]
        ce.setProxy(proxy, lifetime_secs)

        # Check that there is enough slots in the remote CE to match a job
        result = self._checkCEAvailability(ce)
        if not result["OK"] or result["Value"]:
            self.failedQueues[queueName] += 1
            continue

        # Get environment details and enhance them
        result = self._getCEDict(ce)
        if not result["OK"]:
            self.failedQueues[queueName] += 1
            continue
        ceDictList = result["Value"]

        for ceDict in ceDictList:
            # Information about number of processors might not be returned in CE.getCEStatus()
            ceDict["NumberOfProcessors"] = ce.ceParameters.get(
                "NumberOfProcessors")
            self._setCEDict(ceDict)

        # Update the configuration with the names of the Site, CE and queue to target
        # This is used in the next stages
        self._updateConfiguration("Site", queueDictionary["Site"])
        self._updateConfiguration("GridCE", queueDictionary["CEName"])
        self._updateConfiguration("CEQueue", queueDictionary["QueueName"])
        self._updateConfiguration("RemoteExecution", True)

        # Try to match a job
        jobRequest = self._matchAJob(ceDictList)
        # Keep submitting to this queue while the Matcher keeps returning jobs
        while jobRequest["OK"]:
            # Check matcher information returned
            matcherParams = ["JDL", "DN", "Group"]
            matcherInfo = jobRequest["Value"]
            jobID = matcherInfo["JobID"]
            jobReport = JobReport(jobID, "PushJobAgent@%s" % self.siteName)
            result = self._checkMatcherInfo(matcherInfo, matcherParams, jobReport)
            if not result["OK"]:
                self.failedQueues[queueName] += 1
                break

            jobJDL = matcherInfo["JDL"]
            jobGroup = matcherInfo["Group"]
            ownerDN = matcherInfo["DN"]
            ceDict = matcherInfo["CEDict"]
            matchTime = matcherInfo["matchTime"]

            # Everything returned by the Matcher beyond the mandatory keys is
            # forwarded to the submission step as optimizer parameters
            optimizerParams = {}
            for key in matcherInfo:
                if key not in matcherParams:
                    optimizerParams[key] = matcherInfo[key]

            # Get JDL paramters
            parameters = self._getJDLParameters(jobJDL)
            if not parameters["OK"]:
                jobReport.setJobStatus(
                    status=JobStatus.FAILED,
                    minorStatus="Could Not Extract JDL Parameters")
                self.log.warn("Could Not Extract JDL Parameters",
                              parameters["Message"])
                self.failedQueues[queueName] += 1
                break

            params = parameters["Value"]
            result = self._extractValuesFromJobParams(params, jobReport)
            if not result["OK"]:
                self.failedQueues[queueName] += 1
                break
            submissionParams = result["Value"]
            jobID = submissionParams["jobID"]
            jobType = submissionParams["jobType"]

            self.log.verbose("Job request successful: \n", jobRequest["Value"])
            self.log.info(
                "Received",
                "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" %
                (jobID, jobType, ownerDN, jobGroup))
            try:
                jobReport.setJobParameter(par_name="MatcherServiceTime",
                                          par_value=str(matchTime),
                                          sendFlag=False)
                jobReport.setJobStatus(status=JobStatus.MATCHED,
                                       minorStatus="Job Received by Agent",
                                       sendFlag=False)

                # Setup proxy
                result_setupProxy = self._setupProxy(ownerDN, jobGroup)
                if not result_setupProxy["OK"]:
                    result = self._rescheduleFailedJob(
                        jobID, result_setupProxy["Message"])
                    self.failedQueues[queueName] += 1
                    break
                proxyChain = result_setupProxy.get("Value")

                # Check software and install them if required
                software = self._checkInstallSoftware(
                    jobID, params, ceDict, jobReport)
                if not software["OK"]:
                    self.log.error("Failed to install software for job",
                                   "%s" % (jobID))
                    errorMsg = software["Message"]
                    if not errorMsg:
                        errorMsg = "Failed software installation"
                    result = self._rescheduleFailedJob(jobID, errorMsg)
                    self.failedQueues[queueName] += 1
                    break

                # Submit the job to the CE
                self.log.debug("Before self._submitJob() (%sCE)" % (self.ceName))
                result_submitJob = self._submitJob(
                    jobID=jobID,
                    jobParams=params,
                    resourceParams=ceDict,
                    optimizerParams=optimizerParams,
                    proxyChain=proxyChain,
                    jobReport=jobReport,
                    processors=submissionParams["processors"],
                    wholeNode=submissionParams["wholeNode"],
                    maxNumberOfProcessors=submissionParams["maxNumberOfProcessors"],
                    mpTag=submissionParams["mpTag"],
                )

                # Committing the JobReport before evaluating the result of job submission
                res = jobReport.commit()
                if not res["OK"]:
                    resFD = jobReport.generateForwardDISET()
                    if not resFD["OK"]:
                        self.log.error(
                            "Error generating ForwardDISET operation",
                            resFD["Message"])
                    elif resFD["Value"]:
                        # Here we create the Request.
                        op = resFD["Value"]
                        request = Request()
                        requestName = "jobAgent_%s" % jobID
                        request.RequestName = requestName.replace('"', "")
                        request.JobID = jobID
                        request.SourceComponent = "JobAgent_%s" % jobID
                        request.addOperation(op)
                        # This might fail, but only a message would be printed.
                        self._sendFailoverRequest(request)

                if not result_submitJob["OK"]:
                    self.log.error("Error during submission",
                                   result_submitJob["Message"])
                    self.failedQueues[queueName] += 1
                    break
                elif "PayloadFailed" in result_submitJob:
                    # Do not keep running and do not overwrite the Payload error
                    message = "Payload execution failed with error code %s" % result_submitJob[
                        "PayloadFailed"]
                    self.log.info(message)

                self.log.debug("After %sCE submitJob()" % (self.ceName))

                # Check that there is enough slots locally
                result = self._checkCEAvailability(self.computingElement)
                if not result["OK"] or result["Value"]:
                    return result

                # Check that there is enough slots in the remote CE to match a new job
                result = self._checkCEAvailability(ce)
                if not result["OK"] or result["Value"]:
                    self.failedQueues[queueName] += 1
                    break

                # Try to match a new job
                jobRequest = self._matchAJob(ceDictList)
            except Exception as subExcept:  # pylint: disable=broad-except
                self.log.exception("Exception in submission", "",
                                   lException=subExcept, lExcInfo=True)
                result = self._rescheduleFailedJob(
                    jobID, "Job processing failed with exception")
                self.failedQueues[queueName] += 1
                break

        if not jobRequest["OK"]:
            self._checkMatchingIssues(jobRequest)
            self.failedQueues[queueName] += 1
            continue

    return S_OK("Push Job Agent cycle complete")
def execute(arguments):
    """Run the early JobWrapper phases for one job (sendWMSAccounting variant).

    Sets up the working directory, initializes the JobWrapper, downloads the
    input sandbox and resolves input data. (This variant only covers the
    phases visible here; later phases are handled elsewhere.)

    :param arguments: dict; must contain ``arguments['Job']['JobID']`` and may
        contain ``'WorkingDirectory'`` plus ``'InputSandbox'``/``'InputData'``
        under ``arguments['Job']``
    :return: 1 on failure of any phase
    """
    global gJobReport

    jobID = arguments['Job']['JobID']
    # JOBID must be exported before conversion: os.environ only takes strings
    os.environ['JOBID'] = jobID
    jobID = int(jobID)

    # Fix in the environment to get a reasonable performance from dCache,
    # until we move to a new version of root
    # os.environ['DCACHE_RAHEAD'] = str(1)
    # os.environ['DCACHE_RA_BUFFER'] = str(50*1024)

    # NOTE: was arguments.has_key(...) — Python 2 only, removed in Python 3
    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception(
                    'JobWrapperTemplate could not create working directory')
                rescheduleFailedJob(jobID, 'Could Not Create Working Directory')
                return 1

    # root = arguments['CE']['Root']
    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the initialization phase')
        rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport)
        job.sendWMSAccounting('Failed', 'Job Wrapper Initialization')
        return 1

    if 'InputSandbox' in arguments['Job']:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleFailedJob(jobID, 'Input Sandbox Download')
            job.sendWMSAccounting('Failed', 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if 'InputData' in arguments['Job']:
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            # was "except Exception, x" — Python 2 syntax (SyntaxError on
            # Python 3); the bound name was unused, so it is dropped
            except Exception:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleFailedJob(jobID, 'Input Data Resolution')
                job.sendWMSAccounting('Failed', 'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
def execute(self):
    """The JobAgent execution method.

    One full agent cycle:
      1. honour the operator drain flag,
      2. check that the ComputingElement has free slots,
      3. (after the first job) check remaining CPU time in filling mode,
      4. ask the Matcher for a job using one or more CE descriptions,
      5. validate the matcher reply and the job JDL,
      6. set up the payload proxy, install software, submit the job.

    :return: S_OK / S_ERROR; several failure paths also stop the agent
        via ``self.__finish``.
    """
    # Temporary mechanism to pass a shutdown message to the agent
    if os.path.exists('/var/lib/dirac_drain'):
        return self.__finish('Node is being drained by an operator')

    # Check if we can match jobs at all
    self.log.verbose('Job Agent execution loop')
    result = self.computingElement.available()
    if not result['OK']:
        self.log.info('Resource is not available', result['Message'])
        return self.__finish('CE Not Available')
    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']

    if not availableSlots:
        if runningJobs:
            # Slots are full but jobs are progressing: normal cycle end
            self.log.info('No available slots',
                          ': %d running jobs' % runningJobs)
            return S_OK('Job Agent cycle complete with %d running jobs' %
                        runningJobs)
        self.log.info(
            'CE is not available (and there are no running jobs)')
        return self.__finish('CE Not Available')

    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if self.fillingMode:
            self.timeLeft = self.computeCPUWorkLeft()
            self.log.info('normalized CPU units remaining in slot',
                          self.timeLeft)
            if self.timeLeft <= self.minimumTimeLeft:
                return self.__finish('No more time left')
            # Need to update the Configuration so that the new value is published in the next matching request
            result = self.computingElement.setCPUTimeLeft(
                cpuTimeLeft=self.timeLeft)
            if not result['OK']:
                return self.__finish(result['Message'])

            # Update local configuration to be used by submitted job wrappers
            localCfg = CFG()
            if self.extraOptions:
                localConfigFile = os.path.join('.', self.extraOptions)
            else:
                localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
            localCfg.loadFromFile(localConfigFile)
            if not localCfg.isSection('/LocalSite'):
                localCfg.createNewSection('/LocalSite')
            localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
            localCfg.writeToFile(localConfigFile)
        else:
            return self.__finish('Filling Mode is Disabled')

    # if we are here we assume that a job can be matched
    result = self.computingElement.getDescription()
    if not result['OK']:
        return result

    # We can have several prioritized job retrieval strategies
    # NOTE(review): if getDescription() returns neither a dict nor a list,
    # ceDictList is unbound below -- presumably upstream guarantees one of
    # the two; confirm.
    if isinstance(result['Value'], dict):
        ceDictList = [result['Value']]
    elif isinstance(result['Value'], list):
        # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy'
        ceDictList = result['Value']

    for ceDict in ceDictList:
        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if 'PilotReference' not in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)
            self.log.info('Requirements:', requirementsDict)

        self.log.verbose('CE dict', ceDict)

        # here finally calling the matcher
        start = time.time()
        jobRequest = MatcherClient().requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
        # First successful match wins; remaining CE descriptions are skipped
        if jobRequest['OK']:
            break

    self.stopAfterFailedMatches = self.am_getOption(
        'StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        # if we don't match a job, independently from the reason,
        # we wait a bit longer before trying again
        # NOTE(review): PollingTime grows by 1.5x on every failed match and
        # is never reset here -- confirm this back-off is intended.
        self.am_setOption("PollingTime",
                          int(self.am_getOption("PollingTime") * 1.5))

        if re.search('No match found', jobRequest['Message']):
            self.log.notice('Job request OK, but no match found',
                            ': %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish(
                    'Nothing to do for more than %d cycles' %
                    self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("seconds timeout") != -1:
            self.log.error('Timeout while requesting job',
                           jobRequest['Message'])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish(
                    'Nothing to do for more than %d cycles' %
                    self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find(
                "Pilot version does not match") != -1:
            # Fatal: an incompatible pilot must not keep cycling
            errorMsg = 'Pilot version does not match the production version'
            self.log.error(errorMsg,
                           jobRequest['Message'].replace(errorMsg, ''))
            return S_ERROR(jobRequest['Message'])
        else:
            self.log.notice('Failed to get jobs',
                            ': %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish(
                    'Nothing to do for more than %d cycles' %
                    self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    # If we are here it is because we matched a job
    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get(
            'PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)

    # The matcher must return a non-empty value for each of these keys
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            jobReport.setJobStatus(status='Failed',
                                   minor='Matcher did not return %s' %
                                   (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            jobReport.setJobStatus(status='Failed',
                                   minor='Matcher returned null %s' %
                                   (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned',
                             '%s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything else from the matcher is passed through to the optimizers
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self._getJDLParameters(jobJDL)
    if not parameters['OK']:
        jobReport.setJobStatus(status='Failed',
                               minor='Could Not Extract JDL Parameters')
        self.log.warn('Could Not Extract JDL Parameters',
                      parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        jobReport.setJobStatus(status='Failed', minor=msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    else:
        jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    if 'CPUTime' not in params:
        self.log.warn(
            'Job has no CPU requirement defined in JDL parameters')

    # Job requirements for determining the number of processors
    # the minimum number of processors requested
    processors = int(
        params.get('NumberOfProcessors',
                   int(params.get('MinNumberOfProcessors', 1))))
    # the maximum number of processors allowed to the payload
    maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0))
    # need or not the whole node for the job
    wholeNode = 'WholeNode' in params
    mpTag = 'MultiProcessor' in params.get('Tags', [])

    # Propagate the agent's extra CFG options to dirac-jobexec payloads
    if self.extraOptions and 'dirac-jobexec' in params.get(
            'Executable', '').strip():
        params['Arguments'] = (params.get('Arguments', '') + ' ' +
                               self.extraOptions).strip()
        params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info(
        'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' %
        (jobID, jobType, ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport.setJobParameter(par_name='MatcherServiceTime',
                                  par_value=str(matchTime),
                                  sendFlag=False)
        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for thisp in ('BoincUserID', 'BoincHostID',
                          'BoincHostPlatform', 'BoincHostName'):
                jobReport.setJobParameter(par_name=thisp,
                                          par_value=gConfig.getValue(
                                              '/LocalSite/%s' % thisp,
                                              'Unknown'),
                                          sendFlag=False)

        jobReport.setJobStatus(status='Matched',
                               minor='Job Received by Agent',
                               sendFlag=False)

        result_setupProxy = self._setupProxy(ownerDN, jobGroup)
        if not result_setupProxy['OK']:
            return self._rescheduleFailedJob(jobID,
                                             result_setupProxy['Message'],
                                             self.stopOnApplicationFailure)
        proxyChain = result_setupProxy.get('Value')

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self._checkInstallSoftware(jobID, params, ceDict,
                                              jobReport)
        if not software['OK']:
            self.log.error('Failed to install software for job',
                           '%s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self._rescheduleFailedJob(jobID, errorMsg,
                                             self.stopOnApplicationFailure)

        self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName))
        result_submitJob = self._submitJob(
            jobID=jobID,
            jobParams=params,
            resourceParams=ceDict,
            optimizerParams=optimizerParams,
            proxyChain=proxyChain,
            jobReport=jobReport,
            processors=processors,
            wholeNode=wholeNode,
            maxNumberOfProcessors=maxNumberOfProcessors,
            mpTag=mpTag)

        # Committing the JobReport before evaluating the result of job submission
        res = jobReport.commit()
        if not res['OK']:
            # Commit failed: fall back to a failover Request carrying the
            # job report as a ForwardDISET operation
            resFD = jobReport.generateForwardDISET()
            if not resFD['OK']:
                self.log.error("Error generating ForwardDISET operation",
                               resFD['Message'])
            else:
                # Here we create the Request.
                op = resFD['Value']
                request = Request()
                requestName = 'jobAgent_%s' % jobID
                request.RequestName = requestName.replace('"', '')
                request.JobID = jobID
                request.SourceComponent = "JobAgent_%s" % jobID
                request.addOperation(op)
                # This might fail, but only a message would be printed.
                self._sendFailoverRequest(request)

        if not result_submitJob['OK']:
            return self.__finish(result_submitJob['Message'])
        elif 'PayloadFailed' in result_submitJob:
            # Do not keep running and do not overwrite the Payload error
            message = 'Payload execution failed with error code %s' % result_submitJob[
                'PayloadFailed']
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            else:
                self.log.info(message)

        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
        self.log.exception("Exception in submission",
                           "",
                           lException=subExcept,
                           lExcInfo=True)
        return self._rescheduleFailedJob(
            jobID, 'Job processing failed with exception',
            self.stopOnApplicationFailure)

    return S_OK('Job Agent cycle complete')
def execute(arguments):
    """The only real function executed here.

    Drives a job through all wrapper phases: working directory setup,
    JobWrapper initialization, input sandbox download, input data
    resolution, payload execution, output processing and finalization.
    On failure in a staging phase the job is rescheduled and accounting
    is sent; on failure in execution/output phases the job is marked
    Failed and a failover request is sent.

    :param dict arguments: job description; ``arguments['Job']`` holds the
        JDL-derived fields (JobID, InputSandbox, InputData, OutputSandbox,
        OutputData), and 'WorkingDirectory' may be set at top level.
    :return: 0 on success (via job.finalize()), 1 for staging/execution
        failures, 2 for output-processing/finalization failures.
    """
    global gJobReport

    jobID = arguments["Job"].get("JobID", 0)
    # Environment values must be strings
    os.environ["JOBID"] = str(jobID)
    jobID = int(jobID)

    if "WorkingDirectory" in arguments:
        wdir = os.path.expandvars(arguments["WorkingDirectory"])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(
                    wdir
                )  # this will raise an exception if wdir already exists (which is ~OK)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except OSError as osError:
                # Distinguish "already exists" (racy but benign) from a
                # genuine creation failure; both reschedule the job
                if osError.errno == errno.EEXIST and os.path.isdir(wdir):
                    gLogger.exception(
                        "JobWrapperTemplate found that the working directory already exists"
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, "Working Directory already exists")
                else:
                    gLogger.exception(
                        "JobWrapperTemplate could not create working directory"
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, "Could Not Create Working Directory")
                return 1

    gJobReport = JobReport(jobID, "JobWrapper")

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
    except Exception as exc:  # pylint: disable=broad-except
        # NOTE(review): if the JobWrapper constructor itself raised, 'job'
        # is unbound and job.sendJobAccounting below raises NameError --
        # confirm the constructor cannot fail here.
        gLogger.exception("JobWrapper failed the initialization phase",
                          lException=exc)
        rescheduleResult = rescheduleFailedJob(
            jobID=jobID,
            minorStatus=JobMinorStatus.JOB_WRAPPER_INITIALIZATION,
            jobReport=gJobReport)
        job.sendJobAccounting(
            status=rescheduleResult,
            minorStatus=JobMinorStatus.JOB_WRAPPER_INITIALIZATION)
        return 1

    if "InputSandbox" in arguments["Job"]:
        # Flush pending status updates before the potentially slow download
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments["Job"]["InputSandbox"])
            if not result["OK"]:
                gLogger.warn(result["Message"])
                raise JobWrapperError(result["Message"])
        except JobWrapperError:
            gLogger.exception("JobWrapper failed to download input sandbox")
            rescheduleResult = rescheduleFailedJob(
                jobID=jobID,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX,
                jobReport=gJobReport)
            job.sendJobAccounting(
                status=rescheduleResult,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX)
            return 1
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                "JobWrapper raised exception while downloading input sandbox",
                lException=exc)
            rescheduleResult = rescheduleFailedJob(
                jobID=jobID,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX,
                jobReport=gJobReport)
            job.sendJobAccounting(
                status=rescheduleResult,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX)
            return 1
    else:
        gLogger.verbose("Job has no InputSandbox requirement")

    gJobReport.commit()

    if "InputData" in arguments["Job"]:
        if arguments["Job"]["InputData"]:
            try:
                result = job.resolveInputData()
                if not result["OK"]:
                    gLogger.warn(result["Message"])
                    raise JobWrapperError(result["Message"])
            except JobWrapperError:
                gLogger.exception("JobWrapper failed to resolve input data")
                rescheduleResult = rescheduleFailedJob(
                    jobID=jobID,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION,
                    jobReport=gJobReport)
                job.sendJobAccounting(
                    status=rescheduleResult,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION)
                return 1
            except Exception as exc:  # pylint: disable=broad-except
                gLogger.exception(
                    "JobWrapper raised exception while resolving input data",
                    lException=exc)
                rescheduleResult = rescheduleFailedJob(
                    jobID=jobID,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION,
                    jobReport=gJobReport)
                job.sendJobAccounting(
                    status=rescheduleResult,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION)
                return 1
        else:
            gLogger.verbose("Job has a null InputData requirement:")
            gLogger.verbose(arguments)
    else:
        gLogger.verbose("Job has no InputData requirement")

    gJobReport.commit()

    try:
        result = job.execute()
        if not result["OK"]:
            gLogger.error("Failed to execute job", result["Message"])
            # Raised with a (message, errno) tuple so the handler below can
            # inspect the payload exit code
            raise JobWrapperError((result["Message"], result["Errno"]))
    except JobWrapperError as exc:
        # NOTE(review): exc.value is assumed to be the (message, errno)
        # tuple raised above; a status-0 error only logs and still falls
        # through to the failure handling below -- confirm intended.
        if exc.value[1] == 0 or str(exc.value[0]) == "0":
            gLogger.verbose("JobWrapper exited with status=0 after execution")
        if exc.value[1] == DErrno.EWMSRESC:
            gLogger.warn("Asked to reschedule job")
            rescheduleResult = rescheduleFailedJob(
                jobID=jobID,
                minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION,
                jobReport=gJobReport)
            job.sendJobAccounting(
                status=rescheduleResult,
                minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION)
            return 1
        gLogger.exception("Job failed in execution phase")
        gJobReport.setJobParameter("Error Message", repr(exc), sendFlag=False)
        gJobReport.setJobStatus(
            status=JobStatus.FAILED,
            minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC,
            sendFlag=False)
        job.sendFailoverRequest()
        job.sendJobAccounting(status=JobStatus.FAILED,
                              minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC)
        return 1
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception("Job raised exception during execution phase",
                          lException=exc)
        gJobReport.setJobParameter("Error Message", repr(exc), sendFlag=False)
        gJobReport.setJobStatus(
            status=JobStatus.FAILED,
            minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC,
            sendFlag=False)
        job.sendFailoverRequest()
        job.sendJobAccounting(status=JobStatus.FAILED,
                              minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC)
        return 1

    if "OutputSandbox" in arguments["Job"] or "OutputData" in arguments["Job"]:
        try:
            result = job.processJobOutputs()
            if not result["OK"]:
                gLogger.warn(result["Message"])
                raise JobWrapperError(result["Message"])
        except JobWrapperError as exc:
            gLogger.exception("JobWrapper failed to process output files")
            gJobReport.setJobParameter("Error Message", repr(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS,
                sendFlag=False)
            job.sendFailoverRequest()
            job.sendJobAccounting(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS)
            return 2
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                "JobWrapper raised exception while processing output files",
                lException=exc)
            gJobReport.setJobParameter("Error Message", repr(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS,
                sendFlag=False)
            job.sendFailoverRequest()
            job.sendJobAccounting(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS)
            return 2
    else:
        gLogger.verbose("Job has no OutputData or OutputSandbox requirement")

    try:
        # Failed jobs will return !=0 / successful jobs will return 0
        return job.finalize()
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception(
            "JobWrapper raised exception during the finalization phase",
            lException=exc)
        return 2
def execute(arguments):
    """Run a job through all its wrapper phases (Python 2 variant).

    Phases: working directory setup, JobWrapper initialization, input
    sandbox download, input data resolution, payload execution, output
    processing and finalization. Staging failures reschedule the job and
    send accounting; execution/output failures mark it Failed and send a
    failover request.

    :param dict arguments: job description; ``arguments['Job']`` holds
        JobID and the optional InputSandbox/InputData/OutputSandbox/
        OutputData fields; 'WorkingDirectory' may be set at top level.
    :return: 0 on success (via job.finalize()), 1 for staging/execution
        failures, 2 for output-processing/finalization failures.
    """
    global gJobReport

    # NOTE(review): os.environ values must be strings -- this assumes
    # JobID arrives as a str from the JDL; confirm against the caller.
    jobID = arguments['Job']['JobID']
    os.environ['JOBID'] = jobID
    jobID = int(jobID)

    if arguments.has_key('WorkingDirectory'):
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception(
                    'JobWrapperTemplate could not create working directory')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception as e:
        gLogger.exception('JobWrapper failed the initialization phase',
                          lException=e)
        rescheduleResult = rescheduleFailedJob(jobID,
                                               'Job Wrapper Initialization',
                                               gJobReport)
        # Accounting is wrapped in its own try so a NameError on 'job'
        # (constructor failed) cannot mask the rescheduling above
        try:
            job.sendJobAccounting(rescheduleResult,
                                  'Job Wrapper Initialization')
        except Exception as e:
            gLogger.exception('JobWrapper failed sending job accounting',
                              lException=e)
        return 1

    if arguments['Job'].has_key('InputSandbox'):
        # Flush pending status updates before the potentially slow download
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'Input Sandbox Download',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if arguments['Job'].has_key('InputData'):
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except Exception as x:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult,
                                      'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
    else:
        gLogger.verbose('Job has no InputData requirement')

    gJobReport.commit()

    try:
        result = job.execute(arguments)
        if not result['OK']:
            gLogger.error('Failed to execute job', result['Message'])
            raise JobWrapperError(result['Message'])
    except Exception as x:
        # A payload exit status of 0 is signalled via the exception message
        if str(x) == '0':
            gLogger.verbose('JobWrapper exited with status=0 after execution')
        else:
            gLogger.exception('Job failed in execution phase')
            gJobReport.setJobParameter('Error Message', str(x),
                                       sendFlag=False)
            gJobReport.setJobStatus('Failed', 'Exception During Execution',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Exception During Execution')
            return 1

    if arguments['Job'].has_key('OutputSandbox') or arguments['Job'].has_key(
            'OutputData'):
        try:
            result = job.processJobOutputs(arguments)
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception as x:
            gLogger.exception('JobWrapper failed to process output files')
            gJobReport.setJobParameter('Error Message', str(x),
                                       sendFlag=False)
            gJobReport.setJobStatus('Failed', 'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
    else:
        gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

    try:
        # Failed jobs will return 1 / successful jobs will return 0
        return job.finalize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the finalization phase')
        return 2
return 1 #root = arguments['CE']['Root'] jobReport = JobReport(jobID,'JobWrapper') try: job = JobWrapper( jobID, jobReport ) job.initialize(arguments) except Exception, x: gLogger.exception('JobWrapper failed the initialization phase') rescheduleFailedJob(jobID,'Job Wrapper Initialization') job.sendWMSAccounting('Failed','Job Wrapper Initialization') return 1 if arguments['Job'].has_key('InputSandbox'): jobReport.commit() try: result = job.transferInputSandbox(arguments['Job']['InputSandbox']) if not result['OK']: gLogger.warn(result['Message']) raise JobWrapperError(result['Message']) except Exception, x: gLogger.exception('JobWrapper failed to download input sandbox') rescheduleFailedJob(jobID,'Input Sandbox Download') job.sendWMSAccounting('Failed','Input Sandbox Download') return 1 else: gLogger.verbose('Job has no InputSandbox requirement') jobReport.commit()
def execute( arguments ):
  """Run a job through all its wrapper phases (spaced-style variant).

  Phases: working directory setup, JobWrapper initialization, input
  sandbox download, input data resolution, payload execution, output
  processing and finalization. Staging failures reschedule the job and
  send accounting; execution/output failures mark it Failed and send a
  failover request.

  :param dict arguments: job description; ``arguments['Job']`` holds JobID
      and the optional InputSandbox/InputData/OutputSandbox/OutputData
      fields; 'WorkingDirectory' may be set at top level.
  :return: 0 on success (via job.finalize()), 1 for staging/execution
      failures, 2 for output-processing/finalization failures.
  """
  global gJobReport

  # NOTE(review): os.environ values must be strings -- this assumes JobID
  # arrives as a str from the JDL; confirm against the caller.
  jobID = arguments['Job']['JobID']
  os.environ['JOBID'] = jobID
  jobID = int( jobID )

  if arguments.has_key( 'WorkingDirectory' ):
    wdir = os.path.expandvars( arguments['WorkingDirectory'] )
    if os.path.isdir( wdir ):
      os.chdir( wdir )
    else:
      try:
        os.makedirs( wdir )
        if os.path.isdir( wdir ):
          os.chdir( wdir )
      except Exception:
        gLogger.exception( 'JobWrapperTemplate could not create working directory' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Could Not Create Working Directory' )
        return 1

  gJobReport = JobReport( jobID, 'JobWrapper' )

  try:
    job = JobWrapper( jobID, gJobReport )
    job.initialize( arguments )
  except Exception as e:
    gLogger.exception( 'JobWrapper failed the initialization phase', lException = e )
    rescheduleResult = rescheduleFailedJob( jobID, 'Job Wrapper Initialization', gJobReport )
    # Accounting is wrapped in its own try so a NameError on 'job'
    # (constructor failed) cannot mask the rescheduling above
    try:
      job.sendJobAccounting( rescheduleResult, 'Job Wrapper Initialization' )
    except Exception as e:
      gLogger.exception( 'JobWrapper failed sending job accounting', lException = e )
    return 1

  if arguments['Job'].has_key( 'InputSandbox' ):
    # Flush pending status updates before the potentially slow download
    gJobReport.commit()
    try:
      result = job.transferInputSandbox( arguments['Job']['InputSandbox'] )
      if not result['OK']:
        gLogger.warn( result['Message'] )
        raise JobWrapperError( result['Message'] )
    except Exception:
      gLogger.exception( 'JobWrapper failed to download input sandbox' )
      rescheduleResult = rescheduleFailedJob( jobID, 'Input Sandbox Download', gJobReport )
      job.sendJobAccounting( rescheduleResult, 'Input Sandbox Download' )
      return 1
  else:
    gLogger.verbose( 'Job has no InputSandbox requirement' )

  gJobReport.commit()

  if arguments['Job'].has_key( 'InputData' ):
    if arguments['Job']['InputData']:
      try:
        result = job.resolveInputData()
        if not result['OK']:
          gLogger.warn( result['Message'] )
          raise JobWrapperError( result['Message'] )
      except Exception as x:
        gLogger.exception( 'JobWrapper failed to resolve input data' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Input Data Resolution', gJobReport )
        job.sendJobAccounting( rescheduleResult, 'Input Data Resolution' )
        return 1
    else:
      gLogger.verbose( 'Job has a null InputData requirement:' )
      gLogger.verbose( arguments )
  else:
    gLogger.verbose( 'Job has no InputData requirement' )

  gJobReport.commit()

  try:
    result = job.execute( arguments )
    if not result['OK']:
      gLogger.error( 'Failed to execute job', result['Message'] )
      raise JobWrapperError( result['Message'] )
  except Exception as x:
    # A payload exit status of 0 is signalled via the exception message
    if str( x ) == '0':
      gLogger.verbose( 'JobWrapper exited with status=0 after execution' )
    else:
      gLogger.exception( 'Job failed in execution phase' )
      gJobReport.setJobParameter( 'Error Message', str( x ), sendFlag = False )
      gJobReport.setJobStatus( 'Failed', 'Exception During Execution', sendFlag = False )
      job.sendFailoverRequest( 'Failed', 'Exception During Execution' )
      return 1

  if arguments['Job'].has_key( 'OutputSandbox' ) or arguments['Job'].has_key( 'OutputData' ):
    try:
      result = job.processJobOutputs( arguments )
      if not result['OK']:
        gLogger.warn( result['Message'] )
        raise JobWrapperError( result['Message'] )
    except Exception as x:
      gLogger.exception( 'JobWrapper failed to process output files' )
      gJobReport.setJobParameter( 'Error Message', str( x ), sendFlag = False )
      gJobReport.setJobStatus( 'Failed', 'Uploading Job Outputs', sendFlag = False )
      job.sendFailoverRequest( 'Failed', 'Uploading Job Outputs' )
      return 2
  else:
    gLogger.verbose( 'Job has no OutputData or OutputSandbox requirement' )

  try:
    # Failed jobs will return 1 / successful jobs will return 0
    return job.finalize( arguments )
  except Exception:
    gLogger.exception( 'JobWrapper failed the finalization phase' )
    return 2
return 1 #root = arguments['CE']['Root'] jobReport = JobReport(jobID, 'JobWrapper') try: job = JobWrapper(jobID, jobReport) job.initialize(arguments) except Exception, x: gLogger.exception('JobWrapper failed the initialization phase') rescheduleFailedJob(jobID, 'Job Wrapper Initialization') job.sendWMSAccounting('Failed', 'Job Wrapper Initialization') return 1 if arguments['Job'].has_key('InputSandbox'): jobReport.commit() try: result = job.transferInputSandbox(arguments['Job']['InputSandbox']) if not result['OK']: gLogger.warn(result['Message']) raise JobWrapperError(result['Message']) except Exception, x: gLogger.exception('JobWrapper failed to download input sandbox') rescheduleFailedJob(jobID, 'Input Sandbox Download') job.sendWMSAccounting('Failed', 'Input Sandbox Download') return 1 else: gLogger.verbose('Job has no InputSandbox requirement') jobReport.commit()
def execute(arguments): global gJobReport jobID = arguments['Job']['JobID'] os.environ['JOBID'] = jobID jobID = int(jobID) if arguments.has_key('WorkingDirectory'): wdir = os.path.expandvars(arguments['WorkingDirectory']) if os.path.isdir(wdir): os.chdir(wdir) else: try: os.makedirs(wdir) if os.path.isdir(wdir): os.chdir(wdir) except Exception: gLogger.exception( 'JobWrapperTemplate could not create working directory') rescheduleResult = rescheduleFailedJob( jobID, 'Could Not Create Working Directory') return 1 gJobReport = JobReport(jobID, 'JobWrapper') try: job = JobWrapper(jobID, gJobReport) job.initialize(arguments) except Exception: gLogger.exception('JobWrapper failed the initialization phase') rescheduleResult = rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport) job.sendJobAccounting(rescheduleResult, 'Job Wrapper Initialization') return 1 if arguments['Job'].has_key('InputSandbox'): gJobReport.commit() try: result = job.transferInputSandbox(arguments['Job']['InputSandbox']) if not result['OK']: gLogger.warn(result['Message']) raise JobWrapperError(result['Message']) except Exception: gLogger.exception('JobWrapper failed to download input sandbox') rescheduleResult = rescheduleFailedJob(jobID, 'Input Sandbox Download', gJobReport) job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download') return 1 else: gLogger.verbose('Job has no InputSandbox requirement') gJobReport.commit() if arguments['Job'].has_key('InputData'): if arguments['Job']['InputData']: try: result = job.resolveInputData() if not result['OK']: gLogger.warn(result['Message']) raise JobWrapperError(result['Message']) except Exception, x: gLogger.exception('JobWrapper failed to resolve input data') rescheduleResult = rescheduleFailedJob( jobID, 'Input Data Resolution', gJobReport) job.sendJobAccounting(rescheduleResult, 'Input Data Resolution') return 1 else: gLogger.verbose('Job has a null InputData requirement:') gLogger.verbose(arguments)