def PrepareJobForDeferredStageout(job_state, **kwargs):
    """
    Function prepares the job for staging out.

    :param job_state:   (JobState) decoded job state file
                        mandatory

    Other params can be passed into the functions used here:
        1. To overwrite environment variables:
            1. uflag -- pUtil.isSameType
            2. pshttpurl, psport, pilot_initdir -- pUtil.getJobStatus
            3. thisSite -- compare the site names
            4. maxNumberOfRecoveryAttempts -- check recoveryAttempt
            5. psport, jobSchedulerId, pilotId -- server updates
        Unused parameters are omitted.

    :return: (integer)
        0. job is prepared
        1. job is not prepared, skip
        2. job is to be removed, proceed to cleanup
    """
    # The next test compares the trf type (PROD/ANAL) of the job against the current pilot
    if not pUtil.isSameType(job_state.job.trf.split(",")[0], DorE(kwargs, "uflag")):
        return lognret(ReturnCode.SkipJob, "Job is not the same type as current pilot")

    # The next test ensures that the pilot is started on the same site.
    # Maybe we should add an env var to switch this test on and off: there can be a number of equal nodes sharing
    # one FS, which can equally perform recovery of the lost jobs from each other.
    if job_state.site.sitename != DorE(kwargs, "thisSite").sitename:
        return lognret(ReturnCode.SkipJob, "Job is not running on the same site")

    # This test ensures that the number of recovery attempts has not been exceeded; if it has, the server and
    # the state file are updated.
    if job_state.recoveryAttempt >= DorE(kwargs, "maxNumberOfRecoveryAttempts"):
        log("!!WARNING!!1100!! Max number of recovery attempts exceeded: %d" % (env["maxNumberOfRecoveryAttempts"]))
        job_state.job.setState(["failed", job_state.job.result[1], PilotErrors().ERR_LOSTJOBMAXEDOUT])

        rt, retNode = updatePandaServer(job_state, **kwargs)
        if rt == 0:
            log("Job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]))
        else:
            log("Panda server returned a %d" % (rt))
            return lognret(ReturnCode.SkipJob, "(Failed to update panda server - leave for next pilot)")

    jobStatus = job_state.job.result[0]
    jobStatusCode = 0
    jobAttemptNr = job_state.job.attemptNr

    # The state from the file may be unclear, consult the server
    if jobStatus not in acceptedJobStatesFromFile:
        log("Job state may be unclear (found state %s), checking with the server" % jobStatus)
        jobStatus, jobAttemptNr, jobStatusCode = pUtil.getJobStatus(
            job_state.job.jobId, DorE(kwargs, "pshttpurl"), DorE(kwargs, "psport"), DorE(kwargs, "pilot_initdir"))

        if jobStatusCode != 0:
            return lognret(ReturnCode.SkipJob, "Received general error code from dispatcher call (leave job for later"
                                               " pilot)")
        else:
            log("Job state is %s" % jobStatus)

    # If any inconsistency is found or the job is finalised, clean up
    if job_state.job.attemptNr != jobAttemptNr or jobStatus in finalJobStates or "tobekilled" in job_state.job.action:
        if job_state.job.attemptNr != jobAttemptNr:
            return lognret(ReturnCode.Cleanup, "Further recovery attempts will be prevented for this job. It has a"
                                               " mismatch in the attempt number record.")
        if "tobekilled" in job_state.job.action:
            return lognret(ReturnCode.Cleanup, "Further recovery attempts will be prevented for this job. It was"
                                               " marked to be killed.")
        return lognret(ReturnCode.Cleanup, "Further recovery attempts will be prevented for this job, it is in final"
                                           " state: %s." % jobStatus)

    if jobStatus != "holding":
        # is the attemptNr defined?
        try:
            attemptNr = job_state.job.attemptNr
        except Exception as e:
            log("!!WARNING!!1100!! Attempt number not defined [ignore]: %s" % str(e))
        else:
            # check if the attemptNr (set during the initial getJob command) is the same as the current jobAttemptNr
            # from the server (protection against failed lost heartbeat jobs due to reassigned panda job id numbers)
            if attemptNr != jobAttemptNr:
                log("!!WARNING!!1100!! Attempt number mismatch for job %s (according to server - will not be"
                    " recovered)" % job_state.job.jobId)
                log("....Initial attempt number: %d" % attemptNr)
                log("....Current attempt number: %d" % jobAttemptNr)
                log("....Job status (server)   : %s" % jobStatus)
                log("....Job status (state)    : %s" % job_state.job.result[0])
                return lognret(ReturnCode.Cleanup, "Further recovery attempts will be prevented for this job")
            else:
                log("Attempt numbers from server and job state file agree: %d" % attemptNr)
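
# A minimal usage sketch (not part of the original module): it assumes the caller has already decoded a job
# state file into "job_state" and only shows how one might dispatch on the documented return codes.
# The helper name "_example_prepare" is hypothetical.
def _example_prepare(job_state, **kwargs):
    rc = PrepareJobForDeferredStageout(job_state, **kwargs)
    if rc == ReturnCode.SkipJob:      # 1: leave the job for a later pilot
        return False
    if rc == ReturnCode.Cleanup:      # 2: job is final or inconsistent, its directory can be removed
        return False
    return True                       # 0: job is prepared for deferred stageout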
def DeferredStageoutHPCJob(job_dir, deferred_stageout_logfile=False, **kwargs):
    """
    Performs staging out preparation for the HPC job in the specified directory.

    :param job_dir:     (string) directory with a job.
                        mandatory parameter
    :param deferred_stageout_logfile: (string|False) template name for the deferred stageout log.
                        "{job_id}" is replaced with the current job id, e.g.
                        "log-{job_id}.txt" -> "log-124124.txt"
                        Default False

    Other parameters are passed into the other functions used here.

    :return: (bool) whether stageout was performed
    """
    log('Deferred stageout from HPC job directory "%s"' % job_dir)

    file_path = job_dir + "/" + hpc_jobState_file_wildcart
    current_dir = os.getcwd()
    log("Working on %s" % file_path)
    log("Chdir from current dir %s to %s" % (current_dir, job_dir))
    pUtil.chdir(job_dir)

    try:
        with LockFileWrapper(file_path) as is_locked:
            if not is_locked:
                return False

            from json import load
            with open(file_path) as data_file:
                HPC_state = load(data_file)

            job_state_file = HPC_state["JobStateFile"]
            job_command = HPC_state["JobCommand"]
            # global_work_dir = HPC_state['GlobalWorkingDir']
            JS = JobState()
            JS.get(job_state_file)
            _job, _site, _node, _recoveryAttempt = JS.decode()

            with LogWrapper(deferred_stageout_logfile, _job.jobId) as logger:
                jobStatus, jobAttemptNr, jobStatusCode = pUtil.getJobStatus(
                    _job.jobId, DorE(kwargs, "pshttpurl"), DorE(kwargs, "psport"), DorE(kwargs, "pilot_initdir"))

                # recover this job?
                if jobStatusCode == 20:
                    log("Received general error code from dispatcher call (leave job for later pilot)")
                    # release the atomic lockfile and go to the next directory
                    # releaseAtomicLockFile(fd, lockfile_name)
                    return False
                elif jobStatus in finalJobStates or "tobekilled" in _job.action:
                    log("Job %s is currently in state '%s' with attemptNr = %d (according to server - will not"
                        " perform staging out)" % (_job.jobId, jobStatus, jobAttemptNr))
                    # releaseAtomicLockFile(fd, lockfile_name)
                    return False

                # update the job state file at this point to prevent a parallel pilot from doing a simultaneous recovery
                _retjs = pUtil.updateJobState(_job, _site, _node, _recoveryAttempt)
                # releaseAtomicLockFile(fd, lockfile_name)

                monitor = Monitor(env)
                monitor.monitor_recovery_job(_job, _site, _node, job_command, job_state_file, recover_dir=job_dir)

                log("Chdir back to %s" % current_dir)
                pUtil.chdir(current_dir)

                panda_jobs = glob(job_dir + "/PandaJob_*_*")
                panda_logs = glob(job_dir + "/*.log.tgz.*")
                if panda_jobs or panda_logs:
                    log("Number of found panda jobs: %d, number of panda log tar files: %d, will not remove job dir"
                        % (len(panda_jobs), len(panda_logs)))
                else:
                    log("Number of found panda jobs: %d, number of panda log tar files: %d, will remove job dir"
                        % (len(panda_jobs), len(panda_logs)))
                    log("Remove job dir %s" % job_dir)
                    os.system("rm -rf %s" % job_dir)

                return True
    except:
        log("Failed to start deferred stage out for HPC job: %s" % traceback.format_exc())
        return False
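
# A minimal usage sketch (assumption, not part of the original module): scanning a recovery area for HPC job
# directories and attempting deferred stageout on each. The directory layout "<recovery_dir>/Job_*" and the helper
# name "_example_hpc_stageout" are illustrative only; the real pilot discovers job directories elsewhere.
def _example_hpc_stageout(recovery_dir, **kwargs):
    performed = 0
    for job_dir in glob(recovery_dir + "/Job_*"):
        if DeferredStageoutHPCJob(job_dir, deferred_stageout_logfile="log-{job_id}.txt", **kwargs):
            performed += 1
    log("Deferred stageout performed for %d HPC job(s)" % performed)
    return performed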