def updateJobStateTest(self, job, thisSite, workerNode, recoveryAttempt=0, mode=""):
    """ update the job state file """
    # NOTE: this function will eventually replace pilot::updateJobState() when new job rec is in place

    status = False

    # IGNORE TEST MODE
    mode = ""

    # create a job state object and give it the current job state information
    JS = JobState()
    if JS.put(job, thisSite, workerNode, recoveryAttempt=recoveryAttempt, mode=mode):
        if recoveryAttempt > 0:
            tolog("Successfully updated job state file (recovery attempt number: %d) with state: %s" % (recoveryAttempt, job.jobState))
        else:
            tolog("Successfully updated job state file with state: %s" % (job.jobState))
        status = True
    else:
        self.__pilotErrorDiag = "Failed to update job state file"
        tolog(self.__errorString % self.__pilotErrorDiag)

    return status
def getNodeStructureFromFile(self, workDir, jobId):
    """ get the node structure from the Job State file """

    JS = JobState()
    _node = None

    # open the job state file
    tolog("workDir: %s" % (workDir))
    tolog("jobId: %s" % (jobId))
    filename = JS.getFilename(workDir, jobId)
    tolog("filename: %s" % (filename))
    if os.path.exists(filename):
        # load the objects
        if JS.get(filename):
            # decode the job state info
            _job, _site, _node, _recoveryAttempt = JS.decode()
        else:
            tolog("JS.get() failed to load objects")
    else:
        tolog("%s does not exist" % (filename))

    return _node
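# Hedged, illustrative helper (not part of the pilot API): the same node structure can be read
# back at module level with JobState directly, mirroring getNodeStructureFromFile() without the
# class context. The function name and its use are assumptions made for illustration only.
def _read_node_structure(workDir, jobId):
    JS = JobState()
    filename = JS.getFilename(workDir, jobId)
    if os.path.exists(filename) and JS.get(filename):
        # decode() returns (job, site, node, recoveryAttempt); only the node part is kept here
        _job, _site, _node, _recoveryAttempt = JS.decode()
        return _node
    return None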
def performRecovery(self):
    """ Run job recovery in all directories """

    status = False

    if not self.__PASSED_INIT:
        self.__pilotErrorDiag = "Aborting job recovery due to previous failure"
        tolog(self.__errorString % self.__pilotErrorDiag)
        return status

    # keep track of the starting dir
    startingDir = os.getcwd()

    # scan all recovery dirs
    JS = JobState()
    for _dir in self.__recoveryDirs:
        status = self.run(_dir, JS)

    # return to the starting dir
    chdir(startingDir)

    return status
def DeferredStageoutJob(job_dir, job_state_file="", deferred_stageout_logfile=False, **kwargs):
    """
    Performs staging-out preparation and stages out the job in the specified directory.

    :param job_dir: (string) directory with a job. Mandatory parameter.
    :param job_state_file: (string) path to the job state file or other file containing the job state.
        If empty, the job state file is located as job_dir+'/jobState-*.*'. Defaults to "".
    :param deferred_stageout_logfile: (string|False) template name for the deferred log stageout.
        Replaces "{job_id}" with the current job id, e.g. "log-{job_id}.txt" -> "log-124124.txt".
        Defaults to False.

    Other parameters are passed on to the other functions.

    :return: (bool) whether stageout was performed
    """
    log('Deferred stageout from job directory "%s"' % job_dir)

    job_state = JobState()

    if job_state_file == "":
        try:
            job_state_file = glob(job_dir + "/" + jobState_file_wildcart)[0]
        except:
            log("There is no job state file in the provided directory, exiting")
            return False

    log("Job state file is %s" % job_state_file)

    # lockfd, lockfn = createAtomicLockFile(job_dir)

    with LockFileWrapper(job_dir):
        if not TestJobDirForDeferredStageoutNecessity(job_dir, job_state_file, **kwargs):
            log('Job "%s" does not need deferred stageout procedure (yet)' % job_dir)
            # releaseAtomicLockFile(lockfd, lockfn)
            return False

        if not job_state.get(job_state_file):
            log("Job state file reading failed, exiting")
            # releaseAtomicLockFile(lockfd, lockfn)
            return False

        log('Working with job in "%s"' % job_dir)
        _job, _site, _node, _recoveryAttempt = job_state.decode()

        if not (_job and _site and _node):
            log("Cannot decode jobState file, exiting")
            # releaseAtomicLockFile(lockfd, lockfn)
            return False

        with LogWrapper(deferred_stageout_logfile, _job.jobId) as logger:

            rc = PrepareJobForDeferredStageout(job_state, **kwargs)

            if rc == ReturnCode.PostJobOnly:
                pUtil.postJobTask(job_state.job, job_state.site, DorE(kwargs, "workerNode"), DorE(kwargs, "experiment"),
                                  jr=True, ra=job_state.recoveryAttempt)
                # releaseAtomicLockFile(lockfd, lockfn)
                return True

            if rc > 0:
                log("Job is not prepared for stageout, exiting")
                if rc == ReturnCode.Cleanup:
                    cleanup(job_state)
                # releaseAtomicLockFile(lockfd, lockfn)
                return False

            rc, logfile, datadir, filelist = CreateTransferFileList(job_state, **kwargs)

            XMLStr = ""
            if datadir == "":
                try:
                    XMLStr = job_state.node["xml"]
                except:
                    pass

            if XMLStr == "":
                XMLStr = pUtil.getMetadata(job_state.site.workdir, job_state.job.jobId)

            currentdir = os.getcwd()
            pUtil.chdir(job_state.site.workdir)

            if len(filelist):
                log("Stageout will now transfer the files")
                rc = TransferFiles(job_state, datadir, filelist, **kwargs)

                if rc == ReturnCode.Holding:
                    job_state.job.result[0] = "holding"
                if rc == ReturnCode.FailedJob:
                    job_state.job.result[0] = "failed"

                job_state.job.setState(job_state.job.result)

            pUtil.chdir(job_state.site.workdir)
            ret = True
            if logfile != "" and not pUtil.isLogfileCopied(job_state.site.workdir):
                log("Stageout will now transfer the log")
                _log = JobLog()
                ret, _ = _log.transferLogFile(job_state.job, job_state.site, DorE(kwargs, "experiment"), dest=None, jr=True)

            if not ret:
                rc = ReturnCode.Holding  # the log file needs to be transferred regardless of the data files

            if rc == ReturnCode.OK:
                if pUtil.verifyTransfer(job_state.site.workdir):
                    job_state.job.result[0] = "finished"
                else:
                    job_state.job.result[0] = "failed"
                job_state.job.setState(job_state.job.result)

            if job_state.job.result[0] in finalJobStates:
                job_state.job.final_state = job_state.job.result[0]

            log("Stageout will now update the server with new status")

            rt, retNode = updatePandaServer(job_state, xmlstr=XMLStr, **kwargs)

            if rt == 0:
                log("Job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]))

                # did the server send back a command?
                if "tobekilled" in job_state.job.action:
                    log("!!WARNING!!1120!! Panda server returned a 'tobekilled' command")
                    job_state.job.result[0] = "failed"

                # further recovery attempts are unnecessary, but keep the work dir for debugging
                if job_state.job.result[0] == "failed":
                    log("Further recovery attempts will be prevented for failed job (will leave work dir)")
                    if not job_state.rename(job_state.site, job_state.job):
                        log("(Fate of job state file left for next pilot)")
            else:
                log("!!WARNING!!1120!! Panda server returned a %d" % (rt))

                # store the final state so that the next pilot will know
                # store the metadata xml
                retNode["xml"] = XMLStr

                # update the job state file with the new state information
                _retjs = pUtil.updateJobState(job_state.job, job_state.site, retNode, job_state.recoveryAttempt)

            log("Stageout will now proceed to post-job actions")

            if job_state.job.result[0] in finalJobStates:
                pUtil.postJobTask(job_state.job, job_state.site, DorE(kwargs, "workerNode"), DorE(kwargs, "experiment"),
                                  jr=True, ra=job_state.recoveryAttempt)

            pUtil.chdir(currentdir)

            # releaseAtomicLockFile(lockfd, lockfn)

            if job_state.job.result[0] == "finished":
                log("Stageout will now remove the job, it is in finished state and can be removed")
                cleanup(job_state)

            return True
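# Hedged usage sketch (an assumption, not pilot code): sweep leftover pilot work directories
# and attempt deferred stageout for each one. The Panda_Pilot_* directory pattern is the one
# used by the clean-up code further down; the log file template is a placeholder.
def _deferred_stageout_sweep(base_path, **kwargs):
    performed = 0
    for _dir in glob(base_path + "/Panda_Pilot_*"):
        if DeferredStageoutJob(_dir, deferred_stageout_logfile="log-{job_id}.txt", **kwargs):
            performed += 1
    return performed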
def DeferredStageoutHPCJob(job_dir, deferred_stageout_logfile=False, **kwargs):
    """
    Performs staging-out preparation for the HPC job in the specified directory.

    :param job_dir: (string) directory with a job. Mandatory parameter.
    :param deferred_stageout_logfile: (string|False) template name for the deferred log stageout.
        Replaces "{job_id}" with the current job id, e.g. "log-{job_id}.txt" -> "log-124124.txt".
        Defaults to False.

    Other parameters are passed on to the other functions.

    :return: (bool) whether stageout was performed
    """
    log('Deferred stageout from HPC job directory "%s"' % job_dir)

    file_path = job_dir + "/" + hpc_jobState_file_wildcart
    current_dir = os.getcwd()
    log("Working on %s" % file_path)
    log("Chdir from current dir %s to %s" % (current_dir, job_dir))
    pUtil.chdir(job_dir)

    try:
        with LockFileWrapper(file_path) as is_locked:
            if not is_locked:
                return False

            from json import load
            with open(file_path) as data_file:
                HPC_state = load(data_file)
            job_state_file = HPC_state["JobStateFile"]
            job_command = HPC_state["JobCommand"]
            # global_work_dir = HPC_state['GlobalWorkingDir']
            JS = JobState()
            JS.get(job_state_file)
            _job, _site, _node, _recoveryAttempt = JS.decode()

            with LogWrapper(deferred_stageout_logfile, _job.jobId) as logger:

                jobStatus, jobAttemptNr, jobStatusCode = pUtil.getJobStatus(_job.jobId, DorE(kwargs, "pshttpurl"),
                                                                            DorE(kwargs, "psport"), DorE(kwargs, "pilot_initdir"))

                # recover this job?
                if jobStatusCode == 20:
                    log("Received general error code from dispatcher call (leave job for later pilot)")
                    # release the atomic lockfile and go to the next directory
                    # releaseAtomicLockFile(fd, lockfile_name)
                    return False
                elif jobStatus in finalJobStates or "tobekilled" in _job.action:
                    log("Job %s is currently in state '%s' with attemptNr = %d (according to server - will not"
                        " perform staging out)" % (_job.jobId, jobStatus, jobAttemptNr))
                    # releaseAtomicLockFile(fd, lockfile_name)
                    return False

                # update the job state file at this point to prevent a parallel pilot from doing a simultaneous recovery
                _retjs = pUtil.updateJobState(_job, _site, _node, _recoveryAttempt)
                # releaseAtomicLockFile(fd, lockfile_name)

                monitor = Monitor(env)
                monitor.monitor_recovery_job(_job, _site, _node, job_command, job_state_file, recover_dir=job_dir)

                log("Chdir back to %s" % current_dir)
                pUtil.chdir(current_dir)

                panda_jobs = glob(job_dir + "/PandaJob_*_*")
                panda_logs = glob(job_dir + "/*.log.tgz.*")
                if panda_jobs or panda_logs:
                    log("Number of found panda jobs: %d, number of panda log tar files: %d, will not remove job dir"
                        % (len(panda_jobs), len(panda_logs)))
                else:
                    log("Number of found panda jobs: %d, number of panda log tar files: %d, will remove job dir"
                        % (len(panda_jobs), len(panda_logs)))
                    log("Remove job dir %s" % job_dir)
                    os.system("rm -rf %s" % job_dir)
                return True
    except:
        log("Failed to start deferred stage out for HPC job: %s" % traceback.format_exc())
        return False
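# Hedged usage sketch (placeholder values only, not real endpoints): the HPC variant reads the
# dispatcher parameters via DorE(kwargs, ...), so they can be supplied as keyword arguments;
# whether DorE also falls back to the pilot environment when a key is missing is an assumption.
#
#   DeferredStageoutHPCJob("/scratch/Panda_Pilot_12345_1500000000",
#                          deferred_stageout_logfile="log-{job_id}.txt",
#                          pshttpurl="https://pandaserver.example.org",
#                          psport=25443,
#                          pilot_initdir="/scratch/pilot_init")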
def cleanup(self):
    """ execute the clean-up """

    status = True
    number_of_cleanups = 0

    if self.clean:
        tolog("Executing empty dirs clean-up, stage 1/5")
        Cleaner.purgeEmptyDirs(self.path)

        tolog("Executing work dir clean-up, stage 2/5")
        Cleaner.purgeWorkDirs(self.path)

        tolog("Executing maxed-out dirs clean-up, stage 3/5")
        Cleaner.purgeMaxedoutDirs(self.path)

        tolog("Executing AthenaMP clean-up, stage 4/5 <SKIPPED>")
        #files = ['AthenaMP_*', 'fifo_*', 'TokenExtractorChannel*', 'zmq_EventService*', 'asetup*', 'tmp*.pkl']
        #for f in files:
        #    Cleaner.purgeFiles(self.path, f, limit=48*3600)

        tolog("Executing PanDA Pilot dir clean-up, stage 5/5")
        JS = JobState()

        # grab all job state files in all work directories
        job_state_files = glob(self.path + "/Panda_Pilot_*/jobState-*.pickle")
        number_of_files = len(job_state_files)
        file_number = 0
        max_cleanups = 30
        tolog("Number of found job state files: %d" % (number_of_files))

        if job_state_files:
            # loop over all found job state files
            for file_path in job_state_files:
                file_number += 1
                if file_number > max_cleanups:
                    tolog("Maximum number of job recoveries exceeded for this pilot: %d" % (max_cleanups))
                    break
                tolog("Processing job state file %d/%d: %s" % (file_number, number_of_files, file_path))
                current_time = int(time.time())

                # when was the file last modified?
                try:
                    file_modification_time = os.path.getmtime(file_path)
                except:
                    # skip this file since it was not possible to read the modification time
                    pass
                else:
                    # was the job state file updated longer ago than the time limit? (convert to seconds)
                    mod_time = current_time - file_modification_time
                    if mod_time > self.limit*3600:
                        tolog("File was last modified %d seconds ago (proceed)" % (mod_time))
                        cmd = "whoami; ls -lF %s; ls -lF %s" % (file_path, os.path.dirname(file_path))
                        tolog("Executing command: %s" % (cmd))
                        ec, rs = commands.getstatusoutput(cmd)
                        if ec == 0:
                            tolog("%s" % (rs))
                        else:
                            tolog("!!WARNING!!2999!! %d, %s" % (ec, rs))

                        # open the job state file
                        if JS.get(file_path):
                            # decode the job state info
                            _job, _site, _node, _recoveryAttempt = JS.decode()

                            # add member if it doesn't exist (new Job version)
                            try:
                                _tmp = _job.prodSourceLabel
                            except:
                                _job.prodSourceLabel = ''

                            if _job and _site and _node:
                                # query the job state file for job information
                                if _job.result[0] == 'running' or _job.result[0] == 'starting' or (_job.result[0] == 'holding' and mod_time > 7*24*3600):
                                    if _job.result[0] == 'holding':
                                        tolog("Job %s was found in %s state but has not been modified for a long time - will be cleaned up" % (_job.jobId, _job.result[0]))
                                    else:
                                        tolog("Job %s was found in %s state - will be cleaned up" % (_job.jobId, _job.result[0]))
                                    tolog("Erasing directory: %s" % (_site.workdir))
                                    cmd = "rm -rf %s" % (_site.workdir)
                                    try:
                                        ec, rs = commands.getstatusoutput(cmd)
                                    except:
                                        tolog("!!WARNING!!5500!! Could not erase lost job workdir: %d, %s" % (ec, rs))
                                        status = False
                                        break
                                    else:
                                        tolog("Lost job workdir removed")
                                else:
                                    tolog("Job found in state: %s" % (_job.result[0]))
                    else:
                        tolog("File was last modified %d seconds ago (skip)" % (mod_time))
        else:
            tolog("No job state files were found, aborting clean-up")
    else:
        tolog("Clean-up turned off")
        status = False

    return status
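# Minimal standalone sketch of the age check used in cleanup() above: a job state file is only
# processed when it was last modified more than `limit` hours ago (the limit is given in hours
# and converted to seconds). The helper name and signature are illustrative, not pilot API.
def _is_stale(job_state_file, limit_hours):
    try:
        file_modification_time = os.path.getmtime(job_state_file)
    except OSError:
        # cannot read the modification time -> skip the file, as the loop above does
        return False
    return int(time.time()) - file_modification_time > limit_hours * 3600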