Example #1
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # run the job(s) ...................................................................................

        # Set ATLAS_CONDDB if necessary, and other env vars
        RunJobUtilities.setEnvVars(jobSite.sitename)

        # execute the payload
        res, job, getstatusoutput_was_interrupted, current_job_number = runJob.executePayload(thisExperiment, runCommandList, job)

        # if payload leaves the input files, delete them explicitly
        if ins:
            ec = pUtil.removeFiles(job.workdir, ins)

        # payload error handling
        ed = ErrorDiagnosis()
        if res[0] is None:
            job.jobState = "cancelled"
            job.setState(["cancelled", 0, 0])
            rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort())
        else:
            job = ed.interpretPayload(job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, runJob.getFailureCode())
        
        if job.result[1] != 0 or job.result[2] != 0:
            runJob.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)

        # stage-out ........................................................................................

        # update the job state file
        tolog(runJob.getOutputDir()) 
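
The flow above — run the payload, treat a None status as an interrupt or kill, otherwise hand the result tuple to ErrorDiagnosis — reduces to a small pattern. A minimal sketch of it, where JobStub and execute_payload are hypothetical stand-ins rather than pilot API:

    # Minimal sketch of the execute-then-diagnose pattern; JobStub and
    # execute_payload are hypothetical stand-ins, not pilot API.
    import subprocess

    class JobStub(object):
        def __init__(self):
            self.jobState = "running"
            self.result = ["running", 0, 0]  # [state, transExitCode, pilotErrorCode]

        def setState(self, state):
            self.result = state
            self.jobState = state[0]

    def execute_payload(cmd):
        # Stand-in for runJob.executePayload(); returns (exit_status, output).
        try:
            out = subprocess.check_output(cmd, shell=True)
            return 0, out
        except subprocess.CalledProcessError as exc:
            return exc.returncode, exc.output
        except KeyboardInterrupt:
            return None, ""  # interrupted: mirrors the res[0] is None branch

    job = JobStub()
    res = execute_payload("echo payload")
    if res[0] is None:
        # payload was interrupted or killed: mark the job cancelled
        job.setState(["cancelled", 0, 0])
    else:
        # a non-zero exit status becomes the transformation exit code
        job.setState(["finished" if res[0] == 0 else "failed", res[0], 0])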
        
Example #2
        # run the job(s) ...................................................................................

        # Set ATLAS_CONDDB if necessary, and other env vars
        RunJobUtilities.setEnvVars(jobSite.sitename)

        # execute the payload
        res, job, getstatusoutput_was_interrupted, current_job_number = runJob.executePayload(
            thisExperiment, runCommandList, job)

        # if payload leaves the input files, delete them explicitly
        if ins:
            ec = pUtil.removeFiles(job.workdir, ins)

        # payload error handling
        ed = ErrorDiagnosis()
        if res[0] is None:
            job.jobState = "cancelled"
            job.setState(["cancelled", 0, 0])
            rt = RunJobUtilities.updatePilotServer(job,
                                                   runJob.getPilotServer(),
                                                   runJob.getPilotPort())
        else:
            job = ed.interpretPayload(job, res,
                                      getstatusoutput_was_interrupted,
                                      current_job_number, runCommandList,
                                      runJob.getFailureCode())

        if job.result[1] != 0 or job.result[2] != 0:
            runJob.failJob(job.result[1],
                           job.result[2],
                           job,
                           pilotErrorDiag=job.pilotErrorDiag)
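
Every example bails out through failJob when either result code is non-zero. The sketch below is an assumption about what such a helper does (record the failed-state triple, report it, exit), not the pilot implementation:

    # Hypothetical sketch of a failJob-style helper; not the pilot code.
    import sys

    class _Job(object):
        # Tiny stand-in carrying just what fail_job touches.
        pilotErrorDiag = ""
        def setState(self, state):
            self.result = state

    def fail_job(trans_exit_code, pilot_exit_code, job, pilotErrorDiag=""):
        # Record the failed-state triple, report, and stop the process.
        job.pilotErrorDiag = pilotErrorDiag
        job.setState(["failed", trans_exit_code, pilot_exit_code])
        print("job failed: %s" % pilotErrorDiag)
        sys.exit(pilot_exit_code)

    fail_job(0, 1220, _Job(), pilotErrorDiag="example failure")
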
Example #3
        # run the job(s) ...................................................................................

        # Set ATLAS_CONDDB if necessary, and other env vars
        RunJobUtilities.setEnvVars(jobSite.sitename)

        # execute the payload
        res, job, getstatusoutput_was_interrupted = runJob.executePayload(
            thisExperiment, job)

        tolog("Check ARGO output: %s" % runJob.job_path)
        # if payload leaves the input files, delete them explicitly
        if ins:
            ec = pUtil.removeFiles(job.workdir, ins)

        # payload error handling
        ed = ErrorDiagnosis()
        if res[0] is None:
            job.jobState = "cancelled"
            job.setState(["cancelled", 0, 0])
            rt = RunJobUtilities.updatePilotServer(job,
                                                   runJob.getPilotServer(),
                                                   runJob.getPilotPort())
        #else:
        #    job = ed.interpretPayload(job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, runJob.getFailureCode())

        if job.result[1] != 0 or job.result[2] != 0:
            runJob.failJob(job.result[1],
                           job.result[2],
                           job,
                           pilotErrorDiag=job.pilotErrorDiag)
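
Every example clears leftover input files the same way before stage-out. A minimal sketch of that cleanup; remove_files is a hypothetical stand-in for pUtil.removeFiles, returning 0 on success to mirror the ec convention above:

    # Hypothetical stand-in for pUtil.removeFiles(): best-effort removal
    # of the listed files from workdir, returning 0 only if all succeed.
    import os

    def remove_files(workdir, files):
        ec = 0
        for name in files:
            path = os.path.join(workdir, name)
            try:
                os.remove(path)
            except OSError as exc:
                print("Could not remove %s: %s" % (path, exc))
                ec = 1
        return ec

    print(remove_files("/tmp", ["does-not-exist.txt"]))  # prints the failure, then 1
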
Example #4
    def finishJob(self):
        try:
            self.__hpcManager.finishJob()
        except Exception:
            # log the full traceback; a bare except would also trap SystemExit
            import traceback
            tolog(traceback.format_exc())

        # If payload leaves the input files, delete them explicitly
        if self.__job.inFiles:
            ec = pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)
        #if self.__output_es_files:
        #    ec = pUtil.removeFiles("/", self.__output_es_files)


        errorCode = PilotErrors.ERR_UNKNOWN
        if self.__job.attemptNr < 4:
            errorCode = PilotErrors.ERR_ESRECOVERABLE

        #check HPC job status
        #if self.__hpcStatus:
        #    self.failJob(0, 1220, self.__job, pilotErrorDiag="HPC job failed")

        if len(self.__eventRanges) == 0:
            tolog("Cannot get event ranges")
            self.failJob(0, errorCode, self.__job, pilotErrorDiag="Cannot get event ranges")

        # check whether all event ranges are handled
        tolog("Total event ranges: %s" % len(self.__eventRanges))
        not_handled_events = self.__eventRanges.values().count('new')
        tolog("Not handled events: %s" % not_handled_events)
        done_events = self.__eventRanges.values().count('Done')
        tolog("Finished events: %s" % done_events)
        stagedOut_events = self.__eventRanges.values().count('stagedOut')
        tolog("stagedOut but not updated to panda server events: %s" % stagedOut_events)
        if done_events + stagedOut_events:
            errorCode = PilotErrors.ERR_ESRECOVERABLE
        if not_handled_events + stagedOut_events:
            tolog("Not all event ranges are handled. failed job")
            self.failJob(0, errorCode, self.__job, pilotErrorDiag="Not All events are handled(total:%s, left:%s)" % (len(self.__eventRanges), not_handled_events + stagedOut_events))

        dsname, datasetDict = self.getDatasets()
        tolog("dsname = %s" % (dsname))
        tolog("datasetDict = %s" % (datasetDict))

        # Create the output file dictionary needed for generating the metadata
        ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(self.__job.outFiles, self.__job.logFile, self.__job.workdir, fullpath=True)
        if ec:
            # missing output file (only error code from prepareOutFiles)
            self.failJob(self.__job.result[1], ec, self.__job, pilotErrorDiag=pilotErrorDiag)
        tolog("outsDict: %s" % str(outsDict))

        # Create metadata for all successfully staged-out output files (include the log file as well, even if it has not been created yet)
        ec, job, outputFileInfo = self.createFileMetadata([], self.__job, outsDict, dsname, datasetDict, self.__jobSite.sitename)
        if ec:
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

        # Rename the metadata produced by the payload
        # if not pUtil.isBuildJob(outs):
        self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

        # Check the job report for any exit code that should replace the res_tuple[0]
        res0, exitAcronym, exitMsg = self.getTrfExitInfo(0, self.__job.workdir)
        res = (res0, exitMsg, exitMsg)

        # Payload error handling
        ed = ErrorDiagnosis()
        job = ed.interpretPayload(self.__job, res, False, 0, self.__runCommandList, self.getFailureCode())
        if job.result[1] != 0 or job.result[2] != 0:
            self.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)
        self.__job = job

        job.jobState = "finished"
        job.setState([job.jobState, 0, 0])
        rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort(), final=True)

        tolog("Done")
        self.sysExit(self.__job)
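
The bookkeeping above walks the status values once per state. A minimal sketch of the same accounting with collections.Counter, assuming event ranges are tracked as a dict of range id to status string ('new', 'Done', 'stagedOut') as in the code:

    # Minimal sketch of the event-range accounting in finishJob();
    # event_ranges is a hypothetical example dict, the status strings
    # follow the code above.
    from collections import Counter

    event_ranges = {"range-1": "Done", "range-2": "stagedOut", "range-3": "new"}

    tallies = Counter(event_ranges.values())
    not_handled = tallies["new"]
    done = tallies["Done"]
    staged_out = tallies["stagedOut"]

    print("Total event ranges: %s" % len(event_ranges))
    print("Not handled events: %s" % not_handled)
    print("Finished events: %s" % done)
    print("stagedOut but not updated events: %s" % staged_out)

    # the job only ends cleanly when no range is still 'new' and nothing
    # staged out is waiting to be reported to the server
    if not_handled + staged_out:
        print("Not all events are handled (total: %s, left: %s)"
              % (len(event_ranges), not_handled + staged_out))
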
Example #5
    def finishJob(self):
        try:
            self.__hpcManager.finishJob()
        except Exception:
            # log the full traceback; a bare except would also trap SystemExit
            import traceback
            tolog(traceback.format_exc())

        # If payload leaves the input files, delete them explicitly
        if self.__job.inFiles:
            ec = pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)
        #if self.__output_es_files:
        #    ec = pUtil.removeFiles("/", self.__output_es_files)

        errorCode = PilotErrors.ERR_UNKNOWN
        if self.__job.attemptNr < 4:
            errorCode = PilotErrors.ERR_ESRECOVERABLE

        #check HPC job status
        #if self.__hpcStatus:
        #    self.failJob(0, 1220, self.__job, pilotErrorDiag="HPC job failed")

        if len(self.__eventRanges) == 0:
            tolog("Cannot get event ranges")
            self.failJob(0,
                         errorCode,
                         self.__job,
                         pilotErrorDiag="Cannot get event ranges")

        # check whether all event ranges are handled
        tolog("Total event ranges: %s" % len(self.__eventRanges))
        not_handled_events = self.__eventRanges.values().count('new')
        tolog("Not handled events: %s" % not_handled_events)
        done_events = self.__eventRanges.values().count('Done')
        tolog("Finished events: %s" % done_events)
        stagedOut_events = self.__eventRanges.values().count('stagedOut')
        tolog("stagedOut but not updated to panda server events: %s" %
              stagedOut_events)
        if done_events + stagedOut_events:
            errorCode = PilotErrors.ERR_ESRECOVERABLE
        if not_handled_events + stagedOut_events:
            tolog("Not all event ranges are handled. failed job")
            self.failJob(
                0,
                errorCode,
                self.__job,
                pilotErrorDiag="Not All events are handled(total:%s, left:%s)"
                % (len(self.__eventRanges),
                   not_handled_events + stagedOut_events))

        dsname, datasetDict = self.getDatasets()
        tolog("dsname = %s" % (dsname))
        tolog("datasetDict = %s" % (datasetDict))

        # Create the output file dictionary needed for generating the metadata
        ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(
            self.__job.outFiles,
            self.__job.logFile,
            self.__job.workdir,
            fullpath=True)
        if ec:
            # missing output file (only error code from prepareOutFiles)
            self.failJob(self.__job.result[1],
                         ec,
                         self.__job,
                         pilotErrorDiag=pilotErrorDiag)
        tolog("outsDict: %s" % str(outsDict))

        # Create metadata for all successfully staged-out output files (include the log file as well, even if it has not been created yet)
        ec, job, outputFileInfo = self.createFileMetadata(
            [], self.__job, outsDict, dsname, datasetDict,
            self.__jobSite.sitename)
        if ec:
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

        # Rename the metadata produced by the payload
        # if not pUtil.isBuildJob(outs):
        self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

        # Check the job report for any exit code that should replace the res_tuple[0]
        res0, exitAcronym, exitMsg = self.getTrfExitInfo(0, self.__job.workdir)
        res = (res0, exitMsg, exitMsg)

        # Payload error handling
        ed = ErrorDiagnosis()
        job = ed.interpretPayload(self.__job, res, False, 0,
                                  self.__runCommandList, self.getFailureCode())
        if job.result[1] != 0 or job.result[2] != 0:
            self.failJob(job.result[1],
                         job.result[2],
                         job,
                         pilotErrorDiag=job.pilotErrorDiag)
        self.__job = job

        job.jobState = "finished"
        job.setState([job.jobState, 0, 0])
        rt = RunJobUtilities.updatePilotServer(job,
                                               self.getPilotServer(),
                                               self.getPilotPort(),
                                               final=True)

        tolog("Done")
        self.sysExit(self.__job)
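
All five examples finish the same way: set the terminal state triple via setState and push one final update to the pilot server. A minimal sketch of that handshake; update_pilot_server and its JSON payload are assumptions standing in for RunJobUtilities.updatePilotServer, not the pilot wire format:

    # Hypothetical sketch of the final-state handshake; the JSON payload
    # and update_pilot_server are assumptions, not the pilot wire format.
    import json
    import socket

    class Job(object):
        jobId = "1234"
        def __init__(self):
            self.jobState = "running"
            self.result = ["running", 0, 0]
        def setState(self, state):
            self.result = state
            self.jobState = state[0]

    def update_pilot_server(job, host, port, final=False):
        # Serialize the state triple and send it to the pilot TCP server.
        msg = json.dumps({"jobId": job.jobId, "state": job.result, "final": final})
        s = socket.create_connection((host, port), timeout=10)
        try:
            s.sendall(msg.encode("utf-8"))
        finally:
            s.close()
        return True

    job = Job()
    job.setState(["finished", 0, 0])  # terminal state triple
    try:
        update_pilot_server(job, "localhost", 8888, final=True)
    except socket.error as exc:
        print("pilot server not reachable: %s" % exc)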