reissueMissingJobs_missingHash = {} #Module-level hash recording how many times each job id has gone missing

def reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem, killAfterNTimesMissing=3):
    """Check all the current job ids are in the list of currently running batch system jobs.
    If a job is missing, we mark it as such. If it is still missing after killAfterNTimesMissing
    runs of this function, we kill the job (though it is probably already lost), wait, and then
    pass it to processFinishedJob.
    """
    runningJobs = set(batchSystem.getIssuedJobIDs())
    jobIDsSet = set(jobIDsToJobsHash.keys())
    assert runningJobs.issubset(jobIDsSet)
    for jobID in set(jobIDsSet.difference(runningJobs)):
        jobFile = jobIDsToJobsHash[jobID]
        if reissueMissingJobs_missingHash.has_key(jobID):
            reissueMissingJobs_missingHash[jobID] = reissueMissingJobs_missingHash[jobID]+1
        else:
            reissueMissingJobs_missingHash[jobID] = 1
        timesMissing = reissueMissingJobs_missingHash[jobID]
        logger.critical("Job %s with id %i is missing for the %ith time" % (jobFile, jobID, timesMissing))
        if timesMissing == killAfterNTimesMissing:
            reissueMissingJobs_missingHash.pop(jobID)
            batchSystem.killJobs([ jobID ])
            processFinishedJob(jobID, 1, updatedJobFiles, jobIDsToJobsHash)
    if len(reissueMissingJobs_missingHash) > 0:
        logger.critical("Going to sleep before trying again to find the missing jobs")
        time.sleep(60)
        reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem, killAfterNTimesMissing)

def fixJobsList(config, jobFiles):
    """Traverses the job files and uses any '.updating' and '.new' files left behind by a
    crash to recover a valid state of the job tree.
    """
    updatingFiles = []
    for fileName in jobFiles[:]:
        if "updating" in fileName:
            updatingFiles.append(fileName)
    
    for updatingFileName in updatingFiles: #Things crashed while the state was updating, so we should remove the 'updating' and '.new' files
        fileHandle = open(updatingFileName, 'r')
        for fileName in fileHandle.readline().split():
            if os.path.isfile(fileName):
                config.attrib["job_file_dir"].destroyTempFile(fileName)
                jobFiles.remove(fileName)
        fileHandle.close()
        config.attrib["job_file_dir"].destroyTempFile(updatingFileName)
        jobFiles.remove(updatingFileName)
    
    for fileName in jobFiles[:]: #Any remaining '.new' files are switched in place of the original file.
        if fileName[-3:] == 'new':
            originalFileName = fileName[:-4]
            os.rename(fileName, originalFileName)
            jobFiles.remove(fileName)
            if originalFileName not in jobFiles:
                jobFiles.append(originalFileName)
            logger.critical("Fixing the file: %s from %s" % (originalFileName, fileName))
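
#The recovery above relies on the crash-safe update protocol used when job files are written
#(the real implementation, writeJobs, lives elsewhere in jobTree). The helper below is only an
#illustrative sketch of that protocol for a single file; the name _illustrateAtomicJobFileUpdate
#and its arguments are ours, not part of the original module.
def _illustrateAtomicJobFileUpdate(jobFile, newContents):
    """Sketch: record the pending '.new' file in a '.updating' marker, write the new state,
    remove the marker (the commit point), then switch the new file into place.
    """
    updatingFileName = jobFile + ".updating"
    fileHandle = open(updatingFileName, 'w')
    fileHandle.write(jobFile + ".new") #fixJobsList destroys any files listed here after a crash
    fileHandle.close()
    fileHandle = open(jobFile + ".new", 'w')
    fileHandle.write(newContents)
    fileHandle.close()
    os.remove(updatingFileName) #Once the marker is gone the '.new' file is treated as valid
    os.rename(jobFile + ".new", jobFile) #Finally replace the original job file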

def reissueOverLongJobs(updatedJobFiles, jobIDsToJobsHash, config, batchSystem):
    """Check each issued job; if it has been running for longer than desirable, issue a kill
    instruction, wait for the job to die and then pass it to processFinishedJob.
    """
    maxJobDuration = float(config.attrib["max_job_duration"])
    if maxJobDuration < 10000000: #We won't bother doing anything if the rescue time is more than 10 weeks.
        runningJobs = batchSystem.getRunningJobIDs()
        for jobID in runningJobs.keys():
            if runningJobs[jobID] > maxJobDuration:
                logger.critical("The job: %s has been running for: %s seconds, more than the max job duration: %s, so we'll kill it" % \
                                (jobIDsToJobsHash[jobID], str(runningJobs[jobID]), str(maxJobDuration)))
                batchSystem.killJobs([ jobID ])
                processFinishedJob(jobID, 1, updatedJobFiles, jobIDsToJobsHash)
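
#The rescue functions above (and mainLoop below) only touch a small part of the batch-system
#API. The class below is an editorial summary of that surface, inferred from the call sites in
#this file; it is not the original jobTree batch-system base class and the name is ours.
class _AssumedBatchSystemInterface:
    def getIssuedJobIDs(self):
        """Return the ids of all jobs issued and not yet retired."""
        raise NotImplementedError()
    def getRunningJobIDs(self):
        """Return a dict mapping running job ids to seconds spent running (see reissueOverLongJobs)."""
        raise NotImplementedError()
    def getUpdatedJobs(self):
        """Return a dict mapping finished job ids to their exit values (see mainLoop)."""
        raise NotImplementedError()
    def killJobs(self, jobIDs):
        """Kill the given job ids and only return once they no longer appear as issued."""
        raise NotImplementedError()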

def killJobs(self, jobIDs):
    """Kills the given jobs, represented as Job ids, then checks they are dead by checking
    they are not in the list of issued jobs.
    """
    while True:
        for jobID in jobIDs:
            i = popenParasolCommand("parasol remove job %i" % jobID, tmpFileForStdOut=self.scratchFile, runUntilSuccessful=None)
            logger.info("Tried to remove jobID: %i, with exit value: %i" % (jobID, i))
        runningJobs = self.getIssuedJobIDs()
        if set(jobIDs).difference(set(runningJobs)) == set(jobIDs):
            return
        time.sleep(5)
        logger.critical("Tried to kill some jobs, but something happened and they are still going, so I'll try again")

def popenParasolCommand(command, tmpFileForStdOut, runUntilSuccessful=True):
    """Issues a parasol command using popen to capture the output.
    If the command fails and runUntilSuccessful is set then the command is retried, after a
    short wait, until it succeeds. The exit value of the final attempt is returned.
    """
    while True:
        fileHandle = open(tmpFileForStdOut, 'w')
        process = subprocess.Popen(command, shell=True, stdout=fileHandle)
        sts = os.waitpid(process.pid, 0)
        fileHandle.close()
        i = sts[1]
        if i != 0 and runUntilSuccessful:
            logger.critical("The following parasol command failed: %s" % command)
            time.sleep(10)
            logger.critical("Waited for a few seconds, will try again")
        else:
            return i
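
#A minimal usage sketch for popenParasolCommand. The command string, scratch-file path and
#helper name below are illustrative placeholders; with runUntilSuccessful=None the call
#returns the exit value of a single attempt instead of retrying.
def _exampleParasolStatusCheck(scratchFile="/tmp/parasolStatus.txt"):
    exitValue = popenParasolCommand("parasol status", tmpFileForStdOut=scratchFile,
                                    runUntilSuccessful=None)
    if exitValue != 0:
        logger.critical("Parasol does not appear to be responding")
    return exitValue == 0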

def processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats):
    from workflow.jobTree.lib.bioio import getTempFile
    from workflow.jobTree.lib.bioio import getTempDirectory
    from workflow.jobTree.lib.bioio import logger
    from workflow.jobTree.lib.bioio import system
    from workflow.jobTree.lib.bioio import getTotalCpuTime
    
    assert len(job.find("children").findall("child")) == 0
    assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
    command = jobToRun.attrib["command"]
    
    #Copy the job file to be edited
    tempJob = ET.Element("job")
    ET.SubElement(tempJob, "children")
    
    #Log for job
    tempJob.attrib["log_level"] = job.attrib["log_level"]
    
    #Time length of 'ideal' job before further parallelism is required
    tempJob.attrib["job_time"] = job.attrib["job_time"]
    
    #Dir to put all the temp files in.
    localSlaveTempDir = getTempDirectory()
    
    #Temp file dirs for job.
    localTempDir = getTempDirectory(rootDir=localSlaveTempDir)
    tempJob.attrib["local_temp_dir"] = localTempDir
    depth = len(job.find("followOns").findall("followOn"))
    tempJob.attrib["global_temp_dir"] = os.path.join(job.attrib["global_temp_dir"], str(depth))
    if not os.path.isdir(tempJob.attrib["global_temp_dir"]): #Ensures that the global temp dirs of each level are kept separate.
        os.mkdir(tempJob.attrib["global_temp_dir"])
        os.chmod(tempJob.attrib["global_temp_dir"], 0777)
    if os.path.isdir(os.path.join(job.attrib["global_temp_dir"], str(depth+1))):
        system("rm -rf %s" % os.path.join(job.attrib["global_temp_dir"], str(depth+1)))
    assert not os.path.isdir(os.path.join(job.attrib["global_temp_dir"], str(depth+2)))
    
    #Deal with memory and cpu requirements (this pass tells the running job how much cpu
    #and memory it has, according to the batch system)
    tempJob.attrib["available_memory"] = str(memoryAvailable)
    tempJob.attrib["available_cpu"] = str(cpuAvailable)
    if stats != None:
        tempJob.attrib["stats"] = getTempFile(rootDir=localSlaveTempDir)
        os.remove(tempJob.attrib["stats"])
    
    #Now write the temp job file
    tempFile = getTempFile(rootDir=localSlaveTempDir)
    fileHandle = open(tempFile, 'w')
    tree = ET.ElementTree(tempJob)
    tree.write(fileHandle)
    fileHandle.close()
    logger.info("Copied the job files ready for the job")
    
    if "JOB_FILE" not in command:
        logger.critical("There is no 'JOB_FILE' string in the command to be run to take the job-file argument: %s" % command)
        job.attrib["colour"] = "red" #Update the colour
    else:
        #First load the environment for the job.
        fileHandle = open(job.attrib["environment_file"], 'r')
        environment = cPickle.load(fileHandle)
        fileHandle.close()
        logger.info("Loaded the environment for the process")
        
        #Run the actual command
        tempLogFile = getTempFile(suffix=".log", rootDir=localSlaveTempDir)
        fileHandle = open(tempLogFile, 'w')
        finalCommand = command.replace("JOB_FILE", tempFile)
        if stats != None:
            startTime = time.time()
            startClock = getTotalCpuTime()
        process = subprocess.Popen(finalCommand, shell=True, stdout=fileHandle, stderr=subprocess.STDOUT, env=environment)
        sts = os.waitpid(process.pid, 0)
        fileHandle.close()
        truncateFile(tempLogFile, int(job.attrib["max_log_file_size"]))
        
        #Copy across the log file
        system("mv %s %s" % (tempLogFile, job.attrib["log_file"]))
        i = sts[1]
        
        logger.info("Ran the job command=%s with exit status %i" % (finalCommand, i))
        
        if i == 0:
            logger.info("Passed the job, okay")
            
            if stats != None:
                jobTag = ET.SubElement(stats, "job", { "time":str(time.time() - startTime), "clock":str(getTotalCpuTime() - startClock) })
                if os.path.exists(tempJob.attrib["stats"]):
                    jobTag.append(ET.parse(tempJob.attrib["stats"]).getroot())
            
            tempJob = ET.parse(tempFile).getroot()
            job.attrib["colour"] = "black" #Update the colour
            
            #Update the runtime of the stack..
            totalRuntime = float(job.attrib["total_time"]) #This is the estimated runtime of the jobs on the follow-on stack
            runtime = float(jobToRun.attrib["time"])
            totalRuntime -= runtime
            if totalRuntime < 0.0:
                totalRuntime = 0.0
            
            #The children
            children = job.find("children")
            assert len(children.findall("child")) == 0 #The children
            assert tempJob.find("children") != None
            for child in tempJob.find("children").findall("child"):
                memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, child)
                ET.SubElement(children, "child", { "command":child.attrib["command"],
                        "time":str(compTime), "memory":str(memory), "cpu":str(cpu) })
                logger.info("Making a child with command: %s" % (child.attrib["command"]))
            
            #The follow on command
            followOns = job.find("followOns")
            followOns.remove(followOns.findall("followOn")[-1]) #Remove the old job
            if tempJob.attrib.has_key("command"):
                memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, tempJob)
                ET.SubElement(followOns, "followOn", { "command":tempJob.attrib["command"],
                        "time":str(compTime), "memory":str(memory), "cpu":str(cpu) })
                ##Add the runtime to the total runtime..
                totalRuntime += compTime
                logger.info("Making a follow on job with command: %s" % tempJob.attrib["command"])
            elif len(tempJob.find("children").findall("child")) != 0: #This is to keep the stack of follow on jobs consistent.
                ET.SubElement(followOns, "followOn", { "command":"echo JOB_FILE", "time":"0", "memory":"1000000", "cpu":"1" })
                logger.info("Making a stub follow on job")
            
            #Write back the runtime, after adding the follow on time and subtracting the time of the run job.
            job.attrib["total_time"] = str(totalRuntime)
        else:
            logger.info("Failed the job")
            job.attrib["colour"] = "red" #Update the colour
    
    #Clean up
    system("rm -rf %s" % (localSlaveTempDir))
    logger.info("Cleaned up by removing temp jobfile (the copy), and the temporary file directory for the job")
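
#For orientation, a job file handled by processJob and mainLoop looks roughly like the XML
#below. The attribute names are the ones read and written in this file; the concrete values,
#paths and the command string are invented for illustration. A "stats" attribute (when stats
#are enabled) and, for child jobs, a "parent" attribute may also be present.
#
#    <job file="/jobTree/jobs/t0/job.xml" colour="white" remaining_retry_count="3"
#         child_count="0" black_child_count="0" total_time="10.0"
#         log_level="INFO" job_time="30.0" max_log_file_size="1000"
#         global_temp_dir="/jobTree/temp" environment_file="/jobTree/environ.pickle"
#         log_file="/jobTree/jobs/t0/log.txt" slave_log_file="/jobTree/jobs/t0/slaveLog.txt">
#        <children/>
#        <followOns>
#            <followOn command="someScript.py JOB_FILE" time="10.0" memory="1000000" cpu="1"/>
#        </followOns>
#    </job>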

def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.
    """
    waitDuration = float(config.attrib["wait_duration"])
    assert waitDuration >= 0
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters, wait duration: %s, rescue jobs frequency: %s, max job duration: %s" % \
                (waitDuration, rescueJobsFrequency, maxJobDuration))
    
    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0 #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")
    
    jobFiles = config.attrib["job_file_dir"].listFiles()
    logger.info("Got a list of job files")
    
    #Repair the job tree using any '.updating' and '.new' files left by a crash
    fixJobsList(config, jobFiles)
    logger.info("Fixed the job files using any '.updating' and '.new' files")
    
    #Get jobs that were running, or that had failed, reset to 'white' status
    restartFailedJobs(config, jobFiles)
    logger.info("Reworked failed jobs")
    
    updatedJobFiles = set() #Jobs whose status needs updating, either because they have finished, or because they need to be started.
    for jobFile in jobFiles:
        job = ET.parse(jobFile).getroot()
        if job.attrib["colour"] not in ("grey", "blue"):
            updatedJobFiles.add(jobFile)
    logger.info("Got the active (non grey/blue) job files")
    
    totalJobFiles = len(jobFiles) #Total number of job files we have.
    jobIDsToJobsHash = {} #A hash of the currently running job ids, made by the batch system.
    
    idealJobTime = float(config.attrib["job_time"])
    assert idealJobTime > 0.0
    
    maxIssuedJobs = int(config.attrib["max_jobs"]) #The maximum number of jobs to issue to the batch system
    assert maxIssuedJobs >= 1
    
    stats = config.attrib.has_key("stats")
    if stats:
        startTime = time.time()
        startClock = getTotalCpuTime()
    
    logger.info("Starting the main loop")
    timeSinceJobsLastRescued = time.time() - rescueJobsFrequency + 100 #We hack it so that we rescue jobs after the first 100 seconds to get around an apparent parasol bug
    while True:
        if len(updatedJobFiles) > 0:
            logger.debug("Built the jobs list, currently have %i job files, %i jobs to update and %i jobs currently issued" % (totalJobFiles, len(updatedJobFiles), len(jobIDsToJobsHash)))
        
        jobsToIssue = []
        for jobFile in list(updatedJobFiles):
            job = ET.parse(jobFile).getroot()
            assert job.attrib["colour"] not in ("grey", "blue")
            
            if job.attrib["colour"] == "white": #Get ready to start the job
                if len(jobIDsToJobsHash) < maxIssuedJobs:
                    logger.debug("Job: %s is being started" % job.attrib["file"])
                    updatedJobFiles.remove(job.attrib["file"])
                    
                    #Reset the log files for the job.
                    open(job.attrib["slave_log_file"], 'w').close()
                    open(job.attrib["log_file"], 'w').close()
                    
                    job.attrib["colour"] = "grey"
                    #writeJobs([ job ]) #Check point, do this before issuing job, so state is not read until issued
                    #issueJobs([ job ], jobIDsToJobsHash, batchSystem)
                    jobsToIssue.append(job)
                else:
                    logger.debug("Job: %s is not being issued yet because we have %i jobs issued" % (job.attrib["file"], len(jobIDsToJobsHash)))
            elif job.attrib["colour"] == "black": #Job has finished okay
                logger.debug("Job: %s has finished okay" % job.attrib["file"])
                #Deal with stats
                if stats:
                    system("cat %s >> %s" % (job.attrib["stats"], config.attrib["stats"]))
                    open(job.attrib["stats"], 'w').close() #Reset the stats file
                childCount = int(job.attrib["child_count"])
                blackChildCount = int(job.attrib["black_child_count"])
                assert childCount == blackChildCount #Has no currently running child jobs
                #Launch any unborn children
                unbornChildren = job.find("children")
                unbornChild = unbornChildren.find("child")
                if unbornChild != None: #We must give birth to the unborn children
                    logger.debug("Job: %s has children to schedule" % job.attrib["file"])
                    newChildren = []
                    while unbornChild != None:
                        cummulativeChildTime = float(unbornChild.attrib["time"])
                        
                        newJob = createJob(unbornChild.attrib.copy(), job.attrib["file"], config)
                        
                        totalJobFiles += 1
                        updatedJobFiles.add(newJob.attrib["file"])
                        
                        newChildren.append(newJob)
                        unbornChildren.remove(unbornChild)
                        unbornChild = unbornChildren.find("child")
                        
                        #This was code to aggregate groups of children into one job, but we don't do this now
                        #while cummulativeChildTime < idealJobTime and unbornChild != None: #Accumulate a series of children into each job as a stack of jobs (to balance the cost of parallelism against the cost of running things serially).
                        #    cummulativeChildTime += float(unbornChild.attrib["time"])
                        #    ET.SubElement(newJob.find("followOns"), "followOn", unbornChild.attrib.copy())
                        #    unbornChildren.remove(unbornChild)
                        #    unbornChild = unbornChildren.find("child")
                        newJob.attrib["total_time"] = str(cummulativeChildTime)
                    
                    updatedJobFiles.remove(job.attrib["file"])
                    job.attrib["child_count"] = str(childCount + len(newChildren))
                    job.attrib["colour"] = "blue" #Blue - has children running.
                    writeJobs([ job ] + newChildren ) #Check point
                    
                elif len(job.find("followOns").findall("followOn")) != 0: #Has another job
                    logger.debug("Job: %s has a new command that we can now issue" % job.attrib["file"])
                    ##Reset the job run info
                    job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
                    job.attrib["colour"] = "white"
                    ##End resetting the job
                    writeJobs([ job ])
                    
                else: #Job has finished, so we can defer to any parent
                    logger.debug("Job: %s is now dead" % job.attrib["file"])
                    job.attrib["colour"] = "dead"
                    if job.attrib.has_key("parent"):
                        parent = ET.parse(job.attrib["parent"]).getroot()
                        assert parent.attrib["colour"] == "blue"
                        assert int(parent.attrib["black_child_count"]) < int(parent.attrib["child_count"])
                        parent.attrib["black_child_count"] = str(int(parent.attrib["black_child_count"]) + 1)
                        if int(parent.attrib["child_count"]) == int(parent.attrib["black_child_count"]):
                            parent.attrib["colour"] = "black"
                            assert parent.attrib["file"] not in updatedJobFiles
                            updatedJobFiles.add(parent.attrib["file"])
                        writeJobs([ job, parent ]) #Check point
                    else:
                        writeJobs([ job ])
                    
            elif job.attrib["colour"] == "red": #Job failed
                logger.critical("Job: %s failed" % job.attrib["file"])
                logger.critical("The log file of the failed job")
                logFile(job.attrib["log_file"], logger.critical)
                logger.critical("The log file of the slave for the failed job")
                logFile(job.attrib["slave_log_file"], logger.critical) #We log the job log file in the main loop
                #Checks
                assert len(job.find("children").findall("child")) == 0
                assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
                
                remainingRetryCount = int(job.attrib["remaining_retry_count"])
                if remainingRetryCount > 0: #Give it another try, maybe there is a bad node somewhere
                    job.attrib["remaining_retry_count"] = str(remainingRetryCount-1)
                    job.attrib["colour"] = "white"
                    logger.critical("Job: %s will be restarted, it has %s goes left" % (job.attrib["file"], job.attrib["remaining_retry_count"]))
                    writeJobs([ job ]) #Check point
                else:
                    assert remainingRetryCount == 0
                    updatedJobFiles.remove(job.attrib["file"])
                    logger.critical("Job: %s is completely failed" % job.attrib["file"])
                    
            else:
                logger.debug("Job: %s is already dead, we'll get rid of it" % job.attrib["file"])
                assert job.attrib["colour"] == "dead"
                updatedJobFiles.remove(job.attrib["file"])
                totalJobFiles -= 1
                deleteJob(job, config) #This could be done earlier, but I like it this way.
        ###End of for loop
        writeJobs(jobsToIssue) #Check point, do this before issuing jobs, so state is not read until issued
        issueJobs(jobsToIssue, jobIDsToJobsHash, batchSystem)
        
        if len(jobIDsToJobsHash) == 0 and len(updatedJobFiles) == 0:
            logger.info("Only failed jobs and their dependents (%i total) are remaining, so exiting." % totalJobFiles)
            break
        
        if len(updatedJobFiles) > 0:
            updatedJobs = batchSystem.getUpdatedJobs() #Asks the batch system what jobs have been completed.
        else:
            updatedJobs = pauseForUpdatedJobs(batchSystem.getUpdatedJobs) #Asks the batch system what jobs have been completed.
        
        for jobID in updatedJobs.keys(): #Runs through a map of updated jobs and their statuses.
            result = updatedJobs[jobID]
            if jobIDsToJobsHash.has_key(jobID):
                if result == 0:
                    logger.debug("Batch system is reporting that the job %s ended successfully" % jobIDsToJobsHash[jobID])
                else:
                    logger.critical("Batch system is reporting that the job %s failed with exit value %i" % (jobIDsToJobsHash[jobID], result))
                processFinishedJob(jobID, result, updatedJobFiles, jobIDsToJobsHash)
            else:
                logger.info("A result seems to already have been processed: %i" % jobID)
        
        if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency: #We only rescue jobs every N seconds
            reissueOverLongJobs(updatedJobFiles, jobIDsToJobsHash, config, batchSystem)
            logger.info("Reissued any over long jobs")
            
            reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem)
            logger.info("Rescued any (long) missing jobs")
            timeSinceJobsLastRescued = time.time()
        
        #Going to sleep to let the job system catch up.
        time.sleep(waitDuration)
    
    if stats:
        fileHandle = open(config.attrib["stats"], 'a')
        fileHandle.write("<total_time time='%s' clock='%s'/></stats>" % (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
        fileHandle.close()
    
    logger.info("Finished the main loop")
    
    return totalJobFiles #Returns the number of job files remaining, i.e. failed jobs and their dependents
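
#Editorial summary of the job "colour" values driven by mainLoop and processJob above; the
#transitions are read off the code, the wording is ours.
#    white : ready to be issued to the batch system (issued jobs are turned grey)
#    grey  : issued and, as far as the master knows, running
#    black : ran successfully; its children and follow-ons still need scheduling
#    blue  : waiting for running children to finish
#    red   : failed; retried while remaining_retry_count > 0, otherwise left as failed
#    dead  : finished with no follow-ons; the job file is deleted by the main loop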

def processFinishedJob(jobID, resultStatus, updatedJobFiles, jobIDsToJobsHash):
    """Function reads a processed job file and updates its state.
    """
    assert jobID in jobIDsToJobsHash
    jobFile = jobIDsToJobsHash.pop(jobID)
    
    updatingFileIsPresent = os.path.isfile(jobFile + ".updating")
    newFileIsPresent = os.path.isfile(jobFile + ".new")
    
    if resultStatus == 0 and updatingFileIsPresent:
        logger.critical("Despite the batch system claiming success there is a .updating file present: %s", jobFile + ".updating")
    
    if resultStatus == 0 and newFileIsPresent:
        logger.critical("Despite the batch system claiming success there is a .new file present: %s", jobFile + ".new")
    
    if resultStatus != 0 or newFileIsPresent or updatingFileIsPresent: #Job not successful according to the batch system, or according to the existence of a .new or .updating file
        if updatingFileIsPresent: #The job failed while attempting to write the job file.
            logger.critical("There was an .updating file for the crashed job: %s" % jobFile)
            if os.path.isfile(jobFile + ".new"): #The job failed while writing the updated job file.
                logger.critical("There was a .new file for the crashed job: %s" % jobFile)
                os.remove(jobFile + ".new") #The existence of the .updating file means it wasn't complete
            os.remove(jobFile + ".updating") #Delete the .updating file second to preserve a correct state
            assert os.path.isfile(jobFile)
            job = ET.parse(jobFile).getroot() #The original must still be there.
            assert job.find("children").find("child") == None #The original can not reflect the end state of the job.
            assert int(job.attrib["black_child_count"]) == int(job.attrib["child_count"])
            job.attrib["colour"] = "red" #It failed, so we mark it so and continue.
            writeJobs([ job ])
            logger.critical("We've reverted to the original job file and marked it as failed: %s" % jobFile)
        else:
            if newFileIsPresent: #The job was not properly updated before crashing
                logger.critical("There is a valid .new file %s" % jobFile)
                if os.path.isfile(jobFile):
                    os.remove(jobFile)
                os.rename(jobFile + ".new", jobFile)
                job = ET.parse(jobFile).getroot()
                if job.attrib["colour"] == "grey": #The job failed while preparing to run another job on the slave
                    assert job.find("children").find("child") == None #File
                    job.attrib["colour"] = "red"
                    writeJobs([ job ])
                assert job.attrib["colour"] in ("black", "red")
            else:
                logger.critical("There was no valid .new file %s" % jobFile)
                assert os.path.isfile(jobFile)
                job = ET.parse(jobFile).getroot() #The job may have failed before or after creating this file, we check the state.
                if job.attrib["colour"] == "black": #The job completed okay, so we'll keep it
                    logger.critical("Despite the batch system job failing, the job appears to have completed okay")
                else:
                    assert job.attrib["colour"] in ("grey", "red")
                    assert job.find("children").find("child") == None #File
                    assert int(job.attrib["black_child_count"]) == int(job.attrib["child_count"])
                    if job.attrib["colour"] == "grey":
                        job.attrib["colour"] = "red"
                        writeJobs([ job ])
                    logger.critical("We've reverted to the original job file and marked it as failed: %s" % jobFile)
    else:
        job = ET.parse(jobFile).getroot()
    
    ##Now check the log files exist, because they must ultimately be cleaned up by their respective file trees.
    if not os.path.isfile(job.attrib["log_file"]): #We need to keep these files in existence.
        open(job.attrib["log_file"], 'w').close()
        logger.critical("The log file %s for job %s had disappeared" % (job.attrib["log_file"], jobFile))
    if not os.path.isfile(job.attrib["slave_log_file"]):
        open(job.attrib["slave_log_file"], 'w').close()
        logger.critical("The slave log file %s for job %s had disappeared" % (job.attrib["slave_log_file"], jobFile))
    
    assert jobFile not in updatedJobFiles
    updatedJobFiles.add(jobFile) #Now we know the job is done we can add it to the list of updated job files
    logger.debug("Added job: %s to active jobs" % jobFile)
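
#Editorial summary of the crash-recovery cases handled above, inferred from the code rather
#than taken from original documentation:
#    '.updating' present              : the state update never completed; any '.new' file is
#                                       discarded, the original job file is kept and marked red.
#    '.new' present, no '.updating'   : the update completed but was never renamed; the '.new'
#                                       file is promoted over the original (grey jobs become red).
#    neither file present             : the job file's own colour is trusted; black means the job
#                                       finished despite the batch system reporting a failure.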