# NOTE(review): fragment of a larger stage-out sequence; the "else:" below
# belongs to an "if" outside this chunk, and the names used here (job, runJob,
# ed, res, getstatusoutput_was_interrupted, current_job_number, runCommandList,
# tolog, updateFileStates, dumpFileStates, RunJobUtilities) come from the
# enclosing scope — confirm against the full file.
else:
    # Let the error-diagnosis object inspect the payload result tuple and
    # update job.result / job.pilotErrorDiag accordingly.
    job = ed.interpretPayload(job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, runJob.getFailureCode())
    # Fail the job immediately on a non-zero payload result
    # (presumably result = [state, transExitCode, pilotErrorCode] — verify).
    if job.result[1] != 0 or job.result[2] != 0:
        runJob.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)

# stage-out ........................................................................................

# update the job state file
tolog(runJob.getOutputDir())
job.jobState = "stageout"
#_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

# verify and prepare and the output files for transfer
ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(job.outFiles, job.logFile, job.workdir)
if ec:
    # missing output file (only error code from prepareOutFiles)
    runJob.failJob(job.result[1], ec, job, pilotErrorDiag=pilotErrorDiag)
tolog("outsDict: %s" % str(outsDict))

# update the current file states
updateFileStates(outs, runJob.getParentWorkDir(), job.jobId, mode="file_state", state="created")
dumpFileStates(runJob.getParentWorkDir(), job.jobId)

# create xml string to pass to dispatcher for atlas jobs
outputFileInfo = {}
if outs or (job.logFile and job.logFile != ''):
    # get the datasets for the output files
    # NOTE(review): the chunk is truncated here; the body of this "if"
    # continues beyond the visible source.
    dsname, datasetDict = runJob.getDatasets(job)
if job.result[1] != 0 or job.result[2] != 0: runJob.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag) # stage-out ........................................................................................ # update the job state file tolog(runJob.getOutputDir()) job.jobState = "stageout" #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test") # verify and prepare and the output files for transfer ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles( job.outFiles, job.logFile, job.workdir) if ec: # missing output file (only error code from prepareOutFiles) runJob.failJob(job.result[1], ec, job, pilotErrorDiag=pilotErrorDiag) tolog("outsDict: %s" % str(outsDict)) # update the current file states updateFileStates(outs, runJob.getParentWorkDir(), job.jobId, mode="file_state", state="created") dumpFileStates(runJob.getParentWorkDir(), job.jobId)
#else: # job = ed.interpretPayload(job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, runJob.getFailureCode()) if job.result[1] != 0 or job.result[2] != 0: runJob.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag) # stage-out ........................................................................................ # update the job state file tolog(runJob.getOutputDir()) job.jobState = "stageout" #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test") # verify and prepare and the output files for transfer ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(job.outFiles, job.logFile, runJob.job_path) if ec: # missing output file (only error code from prepareOutFiles) runJob.failJob(job.result[1], ec, job, pilotErrorDiag=pilotErrorDiag) tolog("outsDict: %s" % str(outsDict)) # update the current file states updateFileStates(outs, runJob.getParentWorkDir(), job.jobId, mode="file_state", state="created") dumpFileStates(runJob.getParentWorkDir(), job.jobId) # create xml string to pass to dispatcher for atlas jobs outputFileInfo = {} if outs or (job.logFile and job.logFile != ''): # get the datasets for the output files dsname, datasetDict = runJob.getDatasets(job)
def finishJob(self):
    """
    Finalize an HPC event-service job.

    Shuts down the HPC manager, removes any input files the payload left
    behind, verifies that all event ranges were handled, builds the output
    file metadata, runs payload error diagnosis, reports the final job state
    to the pilot server and exits via sysExit().

    Reads self.__hpcManager, self.__job, self.__eventRanges, self.__jobSite
    and self.__runCommandList, which are set up elsewhere in this class.
    Does not return on failure paths: failJob() is expected to terminate.
    """
    # Best-effort shutdown of the HPC manager; a failure here must not stop
    # the rest of the finalization, so only log it.
    # FIX: was a bare "except:", which would also swallow SystemExit and
    # KeyboardInterrupt; narrowed to Exception.
    try:
        self.__hpcManager.finishJob()
    except Exception:
        tolog(sys.exc_info()[1])
        tolog(sys.exc_info()[2])

    # If payload leaves the input files, delete them explicitly
    if self.__job.inFiles:
        ec = pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)
    #if self.__output_es_files:
    #    ec = pUtil.removeFiles("/", self.__output_es_files)

    # Up to attempt number 3 the failure is treated as a recoverable
    # event-service error, otherwise as unknown.
    errorCode = PilotErrors.ERR_UNKNOWN
    if self.__job.attemptNr < 4:
        errorCode = PilotErrors.ERR_ESRECOVERABLE

    #check HPC job status
    #if self.__hpcStatus:
    #    self.failJob(0, 1220, self.__job, pilotErrorDiag="HPC job failed")

    if len(self.__eventRanges) == 0:
        tolog("Cannot get event ranges")
        self.failJob(0, errorCode, self.__job, pilotErrorDiag="Cannot get event ranges")

    # check whether all event ranges are handled
    tolog("Total event ranges: %s" % len(self.__eventRanges))
    not_handled_events = self.__eventRanges.values().count('new')
    tolog("Not handled events: %s" % not_handled_events)
    done_events = self.__eventRanges.values().count('Done')
    tolog("Finished events: %s" % done_events)
    stagedOut_events = self.__eventRanges.values().count('stagedOut')
    tolog("stagedOut but not updated to panda server events: %s" % stagedOut_events)
    if done_events + stagedOut_events:
        errorCode = PilotErrors.ERR_ESRECOVERABLE
    if not_handled_events + stagedOut_events:
        # FIX: this log string was split across two physical lines in the
        # extracted source (a syntax error); rejoined into one literal.
        tolog("Not all event ranges are handled. failed job")
        self.failJob(0, errorCode, self.__job, pilotErrorDiag="Not All events are handled(total:%s, left:%s)" % (len(self.__eventRanges), not_handled_events + stagedOut_events))

    dsname, datasetDict = self.getDatasets()
    tolog("dsname = %s" % (dsname))
    tolog("datasetDict = %s" % (datasetDict))

    # Create the output file dictionary needed for generating the metadata
    ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(self.__job.outFiles, self.__job.logFile, self.__job.workdir, fullpath=True)
    if ec:
        # missing output file (only error code from prepareOutFiles)
        self.failJob(self.__job.result[1], ec, self.__job, pilotErrorDiag=pilotErrorDiag)
    tolog("outsDict: %s" % str(outsDict))

    # Create metadata for all successfully staged-out output files (include the log file as well, even if it has not been created yet)
    ec, job, outputFileInfo = self.createFileMetadata([], self.__job, outsDict, dsname, datasetDict, self.__jobSite.sitename)
    if ec:
        self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

    # Rename the metadata produced by the payload
    #if not pUtil.isBuildJob(outs):
    self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

    # Check the job report for any exit code that should replace the res_tuple[0]
    res0, exitAcronym, exitMsg = self.getTrfExitInfo(0, self.__job.workdir)
    res = (res0, exitMsg, exitMsg)

    # Payload error handling
    ed = ErrorDiagnosis()
    job = ed.interpretPayload(self.__job, res, False, 0, self.__runCommandList, self.getFailureCode())
    if job.result[1] != 0 or job.result[2] != 0:
        self.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)
    self.__job = job

    job.jobState = "finished"
    job.setState([job.jobState, 0, 0])
    # NOTE(review): jobState is overwritten with the full result list just
    # before the final server update — looks intentional but worth confirming.
    job.jobState = job.result
    rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort(), final=True)

    tolog("Done")
    self.sysExit(self.__job)
if job.result[1] != 0 or job.result[2] != 0: runJob.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag) # stage-out ........................................................................................ # update the job state file tolog(runJob.getOutputDir()) job.jobState = "stageout" #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test") # verify and prepare and the output files for transfer ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles( job.outFiles, job.logFile, runJob.job_path) if ec: # missing output file (only error code from prepareOutFiles) runJob.failJob(job.result[1], ec, job, pilotErrorDiag=pilotErrorDiag) tolog("outsDict: %s" % str(outsDict)) # update the current file states updateFileStates(outs, runJob.getParentWorkDir(), job.jobId, mode="file_state", state="created") dumpFileStates(runJob.getParentWorkDir(), job.jobId)
def finishJob(self):
    """
    Finalize an HPC event-service job.

    Shuts down the HPC manager, removes any input files the payload left
    behind, verifies that all event ranges were handled, builds the output
    file metadata, runs payload error diagnosis, reports the final job state
    to the pilot server and exits via sysExit().

    Reads self.__hpcManager, self.__job, self.__eventRanges, self.__jobSite
    and self.__runCommandList, which are set up elsewhere in this class.
    Does not return on failure paths: failJob() is expected to terminate.
    """
    # Best-effort shutdown of the HPC manager; a failure here must not stop
    # the rest of the finalization, so only log it.
    # FIX: was a bare "except:", which would also swallow SystemExit and
    # KeyboardInterrupt; narrowed to Exception.
    try:
        self.__hpcManager.finishJob()
    except Exception:
        tolog(sys.exc_info()[1])
        tolog(sys.exc_info()[2])

    # If payload leaves the input files, delete them explicitly
    if self.__job.inFiles:
        ec = pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)
    #if self.__output_es_files:
    #    ec = pUtil.removeFiles("/", self.__output_es_files)

    # Up to attempt number 3 the failure is treated as a recoverable
    # event-service error, otherwise as unknown.
    errorCode = PilotErrors.ERR_UNKNOWN
    if self.__job.attemptNr < 4:
        errorCode = PilotErrors.ERR_ESRECOVERABLE

    #check HPC job status
    #if self.__hpcStatus:
    #    self.failJob(0, 1220, self.__job, pilotErrorDiag="HPC job failed")

    if len(self.__eventRanges) == 0:
        tolog("Cannot get event ranges")
        self.failJob(0, errorCode, self.__job, pilotErrorDiag="Cannot get event ranges")

    # check whether all event ranges are handled
    tolog("Total event ranges: %s" % len(self.__eventRanges))
    not_handled_events = self.__eventRanges.values().count('new')
    tolog("Not handled events: %s" % not_handled_events)
    done_events = self.__eventRanges.values().count('Done')
    tolog("Finished events: %s" % done_events)
    stagedOut_events = self.__eventRanges.values().count('stagedOut')
    tolog("stagedOut but not updated to panda server events: %s" % stagedOut_events)
    if done_events + stagedOut_events:
        errorCode = PilotErrors.ERR_ESRECOVERABLE
    if not_handled_events + stagedOut_events:
        # FIX: this log string was split across two physical lines in the
        # extracted source (a syntax error); rejoined into one literal.
        tolog("Not all event ranges are handled. failed job")
        self.failJob(0, errorCode, self.__job, pilotErrorDiag="Not All events are handled(total:%s, left:%s)" % (len(self.__eventRanges), not_handled_events + stagedOut_events))

    dsname, datasetDict = self.getDatasets()
    tolog("dsname = %s" % (dsname))
    tolog("datasetDict = %s" % (datasetDict))

    # Create the output file dictionary needed for generating the metadata
    ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(self.__job.outFiles, self.__job.logFile, self.__job.workdir, fullpath=True)
    if ec:
        # missing output file (only error code from prepareOutFiles)
        self.failJob(self.__job.result[1], ec, self.__job, pilotErrorDiag=pilotErrorDiag)
    tolog("outsDict: %s" % str(outsDict))

    # Create metadata for all successfully staged-out output files (include the log file as well, even if it has not been created yet)
    ec, job, outputFileInfo = self.createFileMetadata([], self.__job, outsDict, dsname, datasetDict, self.__jobSite.sitename)
    if ec:
        self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

    # Rename the metadata produced by the payload
    #if not pUtil.isBuildJob(outs):
    self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

    # Check the job report for any exit code that should replace the res_tuple[0]
    res0, exitAcronym, exitMsg = self.getTrfExitInfo(0, self.__job.workdir)
    res = (res0, exitMsg, exitMsg)

    # Payload error handling
    ed = ErrorDiagnosis()
    job = ed.interpretPayload(self.__job, res, False, 0, self.__runCommandList, self.getFailureCode())
    if job.result[1] != 0 or job.result[2] != 0:
        self.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)
    self.__job = job

    job.jobState = "finished"
    job.setState([job.jobState, 0, 0])
    # NOTE(review): jobState is overwritten with the full result list just
    # before the final server update — looks intentional but worth confirming.
    job.jobState = job.result
    rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort(), final=True)

    tolog("Done")
    self.sysExit(self.__job)