def getStatusForAllJobs(self):
    """Query ``qstat`` once and return the status of every job it lists.

    Returns a dict mapping job ID (first column of a ``qstat`` row) to the
    status looked up in ``self.allStatuses``. A status letter with no entry
    there is logged and stored as the raw letter instead.

    Jobs whose status letter equals ``self.errorStatus`` are not included in
    the result: their error reason is recorded in ``self.errorReasons`` and
    the job is killed.

    Returns None when ``qstat`` did not exit cleanly, so that callers can
    leave their current view of the job statuses untouched.
    """
    # Open the null device in a context manager so the handle is closed
    # after every poll rather than being leaked.
    with open(os.devnull) as devnull:
        proc = subprocess.Popen(["qstat"], stdin=devnull,
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        outMsg = proc.communicate()[0]

    # Any non-zero exit counts as failure. A negative code means qstat was
    # killed by a signal, and its output may be truncated.
    if proc.returncode != 0:
        # SGE unavailable for the moment, don't update the job status
        return None

    statusDict = {}
    for line in outMsg.splitlines():
        words = line.split()
        # Only rows with at least five fields and a numeric first field are
        # treated as jobs; anything else (headers, separators) is skipped.
        if len(words) < 5 or not words[0].isdigit():
            continue

        jobId = words[0]
        statusLetter = self.getStatusLetter(words, 4)
        if statusLetter == self.errorStatus:
            # Remember why the job failed before removing it from the queue.
            self.errorReasons[jobId] = self.getErrorReason(jobId)
            self.killJob(jobId)
            continue

        status = self.allStatuses.get(statusLetter)
        if status:
            statusDict[jobId] = status
        else:
            log.info("WARNING: unexpected job status " + repr(statusLetter) + " received from SGE!")
            statusDict[jobId] = statusLetter
    return statusDict
def findJobId(self, stdout):
    """Extract the job ID from the text printed by ``qsub``.

    Each line containing "has been submitted" is passed to
    ``self.getJobId``; if several lines match, the last one wins. Every
    other line is logged as unexpected output.

    Returns the extracted ID, or an empty string if no line matched.
    """
    submittedId = ""
    for outputLine in stdout.splitlines():
        if "has been submitted" in outputLine:
            submittedId = self.getJobId(outputLine)
            continue
        # Anything that is not the confirmation line is worth a log entry.
        log.info("Unexpected output from qsub : " + outputLine.strip())
    return submittedId
def findJobId(self, stdout):
    """Extract the job ID from the text printed by ``condor_submit``.

    Each line containing "submitted to cluster" is passed to
    ``self.getJobId``; if several lines match, the last one wins. Lines
    containing "Submitting job" or "Logging submit event" are skipped
    silently, and every remaining line is logged as unexpected output.

    Returns the extracted ID, or an empty string if no line matched.
    """
    # Progress chatter that is expected and carries no job ID.
    ignorableMarkers = ("Submitting job", "Logging submit event")
    submittedId = ""
    for outputLine in stdout.splitlines():
        if "submitted to cluster" in outputLine:
            submittedId = self.getJobId(outputLine)
        elif not any(marker in outputLine for marker in ignorableMarkers):
            log.info("Unexpected output from condor_submit : " + outputLine.strip())
    return submittedId
def retryAccountInfo(self, jobId):
    """Poll ``self.getAccountInfo`` for a job until it yields output.

    Makes up to nine attempts, sleeping before each one. The pause starts
    at 0.5 seconds and doubles after every sleep while it is below 5, so
    it settles at 8 seconds.

    Returns the ``(output, error)`` pair from the first attempt whose
    output is not None. If all attempts fail, returns ``(None, error)``
    with the error from the last attempt.
    """
    delay = 0.5
    lastError = ""
    attemptsLeft = 9  # would be 10 but we had one already
    while attemptsLeft > 0:
        attemptsLeft -= 1
        # assume failure is because the job hasn't propagated yet, wait a bit
        sleep(delay)
        if delay < 5:
            delay *= 2
        output, lastError = self.getAccountInfo(jobId)
        if output is not None:
            return output, lastError
        log.info("Waiting " + str(delay) + " seconds before retrying account info for job " + jobId)
    return None, lastError