Beispiel #1
0
        else:
            job = ed.interpretPayload(job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, runJob.getFailureCode())
        
        if job.result[1] != 0 or job.result[2] != 0:
            runJob.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)

        # stage-out ........................................................................................

        # update the job state file
        tolog(runJob.getOutputDir()) 
        
        job.jobState = "stageout"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # verify and prepare the output files for transfer
        ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(job.outFiles, job.logFile, job.workdir)
        if ec:
            # missing output file (only error code from prepareOutFiles)
            runJob.failJob(job.result[1], ec, job, pilotErrorDiag=pilotErrorDiag)
        tolog("outsDict: %s" % str(outsDict))

        # update the current file states
        updateFileStates(outs, runJob.getParentWorkDir(), job.jobId, mode="file_state", state="created")
        dumpFileStates(runJob.getParentWorkDir(), job.jobId)

        # create xml string to pass to dispatcher for atlas jobs
        outputFileInfo = {}
        if outs or (job.logFile and job.logFile != ''):
            # get the datasets for the output files
            dsname, datasetDict = runJob.getDatasets(job)
Beispiel #2
0
        if job.result[1] != 0 or job.result[2] != 0:
            runJob.failJob(job.result[1],
                           job.result[2],
                           job,
                           pilotErrorDiag=job.pilotErrorDiag)

        # stage-out ........................................................................................

        # update the job state file
        tolog(runJob.getOutputDir())

        job.jobState = "stageout"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # verify and prepare the output files for transfer
        ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(
            job.outFiles, job.logFile, job.workdir)
        if ec:
            # missing output file (only error code from prepareOutFiles)
            runJob.failJob(job.result[1],
                           ec,
                           job,
                           pilotErrorDiag=pilotErrorDiag)
        tolog("outsDict: %s" % str(outsDict))

        # update the current file states
        updateFileStates(outs,
                         runJob.getParentWorkDir(),
                         job.jobId,
                         mode="file_state",
                         state="created")
        dumpFileStates(runJob.getParentWorkDir(), job.jobId)
Beispiel #3
0
        #else:
        #    job = ed.interpretPayload(job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, runJob.getFailureCode())
        
        if job.result[1] != 0 or job.result[2] != 0:
            runJob.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)

        # stage-out ........................................................................................

        # update the job state file
        tolog(runJob.getOutputDir()) 
        
        job.jobState = "stageout"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # verify and prepare the output files for transfer
        ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(job.outFiles, job.logFile, runJob.job_path)
        if ec:
            # missing output file (only error code from prepareOutFiles)
            runJob.failJob(job.result[1], ec, job, pilotErrorDiag=pilotErrorDiag)
        tolog("outsDict: %s" % str(outsDict))

        # update the current file states
        updateFileStates(outs, runJob.getParentWorkDir(), job.jobId, mode="file_state", state="created")
        dumpFileStates(runJob.getParentWorkDir(), job.jobId)

        # create xml string to pass to dispatcher for atlas jobs
        outputFileInfo = {}
        if outs or (job.logFile and job.logFile != ''):
            # get the datasets for the output files
            dsname, datasetDict = runJob.getDatasets(job)
Beispiel #4
0
    def finishJob(self):
        """Finish the HPC job, validate its results and exit.

        Steps, in order:
          1. ask the HPC manager to finish the job (best effort; errors are
             logged and otherwise ignored),
          2. remove the job's input files from the work directory,
          3. fail the job if there are no event ranges, or if any event range
             is still in state 'new' or 'stagedOut',
          4. prepare the output file dictionary and create the file metadata,
          5. move the payload metadata and interpret the payload exit info,
          6. report the final state to the pilot server and call sysExit().

        Every error path goes through self.failJob().
        NOTE(review): the code reads as if failJob() does not return (it is
        called and execution simply continues) -- confirm against failJob().
        """
        # Local import: only the error path below needs it, and the file's
        # top-level import block is not visible from here.
        import traceback

        try:
            self.__hpcManager.finishJob()
        except Exception:
            # Best effort: a failure here must not prevent the bookkeeping
            # below. Only Exception is caught so that SystemExit and
            # KeyboardInterrupt are no longer swallowed.
            tolog(sys.exc_info()[1])
            # Log the formatted traceback rather than the traceback object
            tolog(traceback.format_exc())

        # If payload leaves the input files, delete them explicitly
        if self.__job.inFiles:
            pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)

        # Early attempts (attemptNr < 4) are reported with the recoverable
        # event service error code, later ones with the unknown error code
        errorCode = PilotErrors.ERR_UNKNOWN
        if self.__job.attemptNr < 4:
            errorCode = PilotErrors.ERR_ESRECOVERABLE

        if len(self.__eventRanges) == 0:
            tolog("Cannot get event ranges")
            self.failJob(0, errorCode, self.__job, pilotErrorDiag="Cannot get event ranges")

        # check whether all event ranges are handled
        tolog("Total event ranges: %s" % len(self.__eventRanges))
        # Materialize the states once: dict.values() has no count() on
        # Python 3, and a single list avoids rebuilding it for every state
        states = list(self.__eventRanges.values())
        not_handled_events = states.count('new')
        tolog("Not handled events: %s" % not_handled_events)
        done_events = states.count('Done')
        tolog("Finished events: %s" % done_events)
        stagedOut_events = states.count('stagedOut')
        tolog("stagedOut but not updated to panda server events: %s" % stagedOut_events)
        if done_events + stagedOut_events:
            # at least some events were processed, so use the recoverable code
            errorCode = PilotErrors.ERR_ESRECOVERABLE
        left_events = not_handled_events + stagedOut_events
        if left_events:
            tolog("Not all event ranges are handled. failed job")
            self.failJob(0, errorCode, self.__job, pilotErrorDiag="Not All events are handled(total:%s, left:%s)" % (len(self.__eventRanges), left_events))

        dsname, datasetDict = self.getDatasets()
        tolog("dsname = %s" % (dsname))
        tolog("datasetDict = %s" % (datasetDict))

        # Create the output file dictionary needed for generating the metadata
        ec, pilotErrorDiag, _outs, outsDict = RunJobUtilities.prepareOutFiles(self.__job.outFiles, self.__job.logFile, self.__job.workdir, fullpath=True)
        if ec:
            # missing output file (only error code from prepareOutFiles)
            self.failJob(self.__job.result[1], ec, self.__job, pilotErrorDiag=pilotErrorDiag)
        tolog("outsDict: %s" % str(outsDict))

        # Create metadata for all successfully staged-out output files (include the log file as well, even if it has not been created yet)
        ec, job, _outputFileInfo = self.createFileMetadata([], self.__job, outsDict, dsname, datasetDict, self.__jobSite.sitename)
        if ec:
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

        # Rename the metadata produced by the payload
        self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

        # Check the job report for any exit code that should replace the res_tuple[0]
        res0, _exitAcronym, exitMsg = self.getTrfExitInfo(0, self.__job.workdir)
        res = (res0, exitMsg, exitMsg)

        # Payload error handling
        ed = ErrorDiagnosis()
        job = ed.interpretPayload(self.__job, res, False, 0, self.__runCommandList, self.getFailureCode())
        if job.result[1] != 0 or job.result[2] != 0:
            self.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)
        self.__job = job

        job.jobState = "finished"
        job.setState([job.jobState, 0, 0])
        # NOTE(review): jobState is overwritten with job.result right after
        # setState() -- kept as in the original, confirm this is intended
        job.jobState = job.result
        RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort(), final=True)

        tolog("Done")
        self.sysExit(self.__job)
Beispiel #5
0
        if job.result[1] != 0 or job.result[2] != 0:
            runJob.failJob(job.result[1],
                           job.result[2],
                           job,
                           pilotErrorDiag=job.pilotErrorDiag)

        # stage-out ........................................................................................

        # update the job state file
        tolog(runJob.getOutputDir())

        job.jobState = "stageout"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # verify and prepare the output files for transfer
        ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(
            job.outFiles, job.logFile, runJob.job_path)
        if ec:
            # missing output file (only error code from prepareOutFiles)
            runJob.failJob(job.result[1],
                           ec,
                           job,
                           pilotErrorDiag=pilotErrorDiag)
        tolog("outsDict: %s" % str(outsDict))

        # update the current file states
        updateFileStates(outs,
                         runJob.getParentWorkDir(),
                         job.jobId,
                         mode="file_state",
                         state="created")
        dumpFileStates(runJob.getParentWorkDir(), job.jobId)
Beispiel #6
0
    def finishJob(self):
        """Finish the HPC job, validate its results and exit.

        The method asks the HPC manager to finish the job (errors are logged
        and otherwise ignored), removes the job's input files, and fails the
        job when there are no event ranges or when some are still in state
        'new' or 'stagedOut'. It then prepares the output file dictionary,
        creates the file metadata, moves the payload metadata, interprets the
        payload exit info, sends the final update to the pilot server and
        calls sysExit().

        Every error path goes through self.failJob().
        NOTE(review): the code reads as if failJob() does not return (it is
        called and execution simply continues) -- confirm against failJob().
        """
        # Local import: only the error path below needs it, and the file's
        # top-level import block is not visible from here.
        import traceback

        try:
            self.__hpcManager.finishJob()
        except Exception:
            # Best effort: a failure here must not prevent the bookkeeping
            # below. Only Exception is caught so that SystemExit and
            # KeyboardInterrupt are no longer swallowed.
            tolog(sys.exc_info()[1])
            # Log the formatted traceback rather than the traceback object
            tolog(traceback.format_exc())

        # If payload leaves the input files, delete them explicitly
        if self.__job.inFiles:
            pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)

        # Early attempts (attemptNr < 4) are reported with the recoverable
        # event service error code, later ones with the unknown error code
        errorCode = PilotErrors.ERR_UNKNOWN
        if self.__job.attemptNr < 4:
            errorCode = PilotErrors.ERR_ESRECOVERABLE

        if len(self.__eventRanges) == 0:
            tolog("Cannot get event ranges")
            self.failJob(0,
                         errorCode,
                         self.__job,
                         pilotErrorDiag="Cannot get event ranges")

        # check whether all event ranges are handled
        tolog("Total event ranges: %s" % len(self.__eventRanges))
        # Materialize the states once: dict.values() has no count() on
        # Python 3, and a single list avoids rebuilding it for every state
        states = list(self.__eventRanges.values())
        not_handled_events = states.count('new')
        tolog("Not handled events: %s" % not_handled_events)
        done_events = states.count('Done')
        tolog("Finished events: %s" % done_events)
        stagedOut_events = states.count('stagedOut')
        tolog("stagedOut but not updated to panda server events: %s" %
              stagedOut_events)
        if done_events + stagedOut_events:
            # at least some events were processed, so use the recoverable code
            errorCode = PilotErrors.ERR_ESRECOVERABLE
        left_events = not_handled_events + stagedOut_events
        if left_events:
            tolog("Not all event ranges are handled. failed job")
            self.failJob(
                0,
                errorCode,
                self.__job,
                pilotErrorDiag="Not All events are handled(total:%s, left:%s)"
                % (len(self.__eventRanges), left_events))

        dsname, datasetDict = self.getDatasets()
        tolog("dsname = %s" % (dsname))
        tolog("datasetDict = %s" % (datasetDict))

        # Create the output file dictionary needed for generating the metadata
        ec, pilotErrorDiag, _outs, outsDict = RunJobUtilities.prepareOutFiles(
            self.__job.outFiles,
            self.__job.logFile,
            self.__job.workdir,
            fullpath=True)
        if ec:
            # missing output file (only error code from prepareOutFiles)
            self.failJob(self.__job.result[1],
                         ec,
                         self.__job,
                         pilotErrorDiag=pilotErrorDiag)
        tolog("outsDict: %s" % str(outsDict))

        # Create metadata for all successfully staged-out output files (include the log file as well, even if it has not been created yet)
        ec, job, _outputFileInfo = self.createFileMetadata(
            [], self.__job, outsDict, dsname, datasetDict,
            self.__jobSite.sitename)
        if ec:
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

        # Rename the metadata produced by the payload
        self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

        # Check the job report for any exit code that should replace the res_tuple[0]
        res0, _exitAcronym, exitMsg = self.getTrfExitInfo(
            0, self.__job.workdir)
        res = (res0, exitMsg, exitMsg)

        # Payload error handling
        ed = ErrorDiagnosis()
        job = ed.interpretPayload(self.__job, res, False, 0,
                                  self.__runCommandList, self.getFailureCode())
        if job.result[1] != 0 or job.result[2] != 0:
            self.failJob(job.result[1],
                         job.result[2],
                         job,
                         pilotErrorDiag=job.pilotErrorDiag)
        self.__job = job

        job.jobState = "finished"
        job.setState([job.jobState, 0, 0])
        # NOTE(review): jobState is overwritten with job.result right after
        # setState() -- kept as in the original, confirm this is intended
        job.jobState = job.result
        RunJobUtilities.updatePilotServer(job,
                                          self.getPilotServer(),
                                          self.getPilotPort(),
                                          final=True)

        tolog("Done")
        self.sysExit(self.__job)