Beispiel #1
0
    def getStatusForAllJobs(self):
        """Run ``qstat`` and collect the status of every job it lists.

        Only output lines with at least five whitespace-separated words whose
        first word is numeric are treated as job lines; the first word is the
        job id and the status letter is obtained via
        ``self.getStatusLetter(words, 4)``.

        Jobs whose status letter is in ``self.errorStatuses`` get their error
        reason stored in ``self.errorReasons``, are killed via
        ``self.killJob`` and are left out of the result.

        Returns:
            A dict mapping job id (str) to the entry found in
            ``self.allStatuses`` for its status letter, or to the tuple
            ``(statusLetter, statusLetter)`` when the letter is unknown.
            Returns ``None`` when ``qstat`` exits with a positive return
            code, so that callers can leave the previous statuses untouched.
        """
        statusDict = {}
        # subprocess.DEVNULL rather than open(os.devnull): the opened file
        # object was never closed, leaking a handle on every call.
        proc = subprocess.Popen(["qstat"],
                                stdin=subprocess.DEVNULL,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                encoding=getpreferredencoding())
        outMsg = proc.communicate()[0]
        if proc.returncode > 0:
            # SGE unavailable for the moment, don't update the job status
            return

        for line in outMsg.splitlines():
            words = line.split()
            # Skip headers, separators and anything else that is not a job line
            if len(words) < 5 or not words[0].isdigit():
                continue

            jobId = words[0]
            statusLetter = self.getStatusLetter(words, 4)
            if statusLetter in self.errorStatuses:
                # Record why the job failed before removing it from the queue
                self.errorReasons[jobId] = self.getErrorReason(jobId)
                self.killJob(jobId)
                continue

            status = self.allStatuses.get(statusLetter)
            if status:
                statusDict[jobId] = status
            else:
                log.info("WARNING: unexpected job status " +
                         repr(statusLetter) + " received from SGE!")
                # Fall back to the raw letter for both elements of the pair
                statusDict[jobId] = statusLetter, statusLetter
        return statusDict
Beispiel #2
0
 def findJobId(self, stdout):
     """Extract the job id from the text printed by qsub.

     Each line containing "has been submitted" is passed to
     ``self.getJobId``; if several lines match, the last one wins. Every
     other line is logged as unexpected output.

     Returns the job id, or an empty string if no line matched.
     """
     submittedId = ""
     for outputLine in stdout.splitlines():
         if "has been submitted" in outputLine:
             submittedId = self.getJobId(outputLine)
             continue
         log.info("Unexpected output from qsub : " + outputLine.strip())
     return submittedId
Beispiel #3
0
 def findJobId(self, stdout):
     """Extract the job id from the text printed by condor_submit.

     Each line containing "submitted to cluster" is passed to
     ``self.getJobId``; if several lines match, the last one wins. The
     routine "Submitting job" and "Logging submit event" lines are skipped
     silently, and anything else is logged as unexpected output.

     Returns the job id, or an empty string if no line matched.
     """
     ignorableMarkers = ("Submitting job", "Logging submit event")
     clusterJobId = ""
     for outputLine in stdout.splitlines():
         if "submitted to cluster" in outputLine:
             clusterJobId = self.getJobId(outputLine)
         elif not any(marker in outputLine for marker in ignorableMarkers):
             log.info("Unexpected output from condor_submit : " + outputLine.strip())
     return clusterJobId
Beispiel #4
0
 def retryAccountInfo(self, jobId, retries=9, initialSleepTime=0.5):
     """Poll ``self.getAccountInfo`` until it yields output or retries run out.

     Intended to be called after a first attempt has already failed, on the
     assumption that the job has not yet propagated to the accounting
     system. Sleeps before every attempt; the sleep time doubles after each
     sleep for as long as it is still below 5 seconds.

     Args:
         jobId: id of the job, passed through to ``self.getAccountInfo``.
         retries: maximum number of further attempts. The default of 9
             makes 10 in total, counting the attempt the caller already made.
         initialSleepTime: seconds to sleep before the first retry.

     Returns:
         The ``(acctOutput, acctError)`` pair from the first attempt whose
         output is not ``None``, otherwise ``(None, acctError)`` with the
         error from the last attempt (an empty string if no attempt was made).
     """
     sleepTime = initialSleepTime
     acctError = ""
     for attempt in range(retries):
         # assume failure is because the job hasn't propagated yet, wait a bit
         sleep(sleepTime)
         if sleepTime < 5:
             sleepTime *= 2
         acctOutput, acctError = self.getAccountInfo(jobId)
         if acctOutput is not None:
             return acctOutput, acctError
         # Only announce a wait when another attempt will actually follow;
         # after the final attempt there is nothing left to wait for.
         if attempt < retries - 1:
             log.info("Waiting " + str(sleepTime) + " seconds before retrying account info for job " + jobId)
     return None, acctError