def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.

    config is the parsed job-tree config XML element; batchSystem is the
    batch-system interface used to issue and poll jobs.  Jobs are tracked as
    XML files whose "colour" attribute encodes their state:
    white (ready to start), grey (issued), blue (waiting on children),
    black (finished, to be processed), red (failed), dead (awaiting deletion).
    Returns the number of job files remaining when the loop exits (per the
    exit log message, these are failed jobs and their dependents).
    """
    #Timing parameters for the loop, read from the config.
    waitDuration = float(config.attrib["wait_duration"])
    assert waitDuration >= 0
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters, wait duration %s, rescue jobs frequency: %s max job duration: %s" % (waitDuration, rescueJobsFrequency, maxJobDuration))

    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0 #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")

    #NOTE(review): config.attrib["job_file_dir"] appears to hold an object
    #with a listFiles() method (a TempFileTree?), not a plain path string -- confirm.
    jobFiles = config.attrib["job_file_dir"].listFiles()
    logger.info("Got a list of job files")

    #Repair the job tree using any .old files (left by an interrupted previous run).
    fixJobsList(config, jobFiles)
    logger.info("Fixed the job files using any .old files")

    #Get jobs that were running, or that had failed reset to 'white' status
    restartFailedJobs(config, jobFiles)
    logger.info("Reworked failed jobs")

    #Jobs whose status needs updating, either because they have finished,
    #or because they need to be started.  Grey (issued) and blue (waiting on
    #children) jobs need no action from the master right now.
    updatedJobFiles = set()
    for jobFile in jobFiles:
        job = ET.parse(jobFile).getroot()
        if job.attrib["colour"] not in ("grey", "blue"):
            updatedJobFiles.add(jobFile)
    logger.info("Got the active (non grey/blue) job files")

    totalJobFiles = len(jobFiles) #Total number of job files we have.
    jobIDsToJobsHash = {} #A hash of the currently running jobs ids, made by the batch system.

    idealJobTime = float(config.attrib["job_time"])
    assert idealJobTime > 0.0

    maxIssuedJobs = int(config.attrib["max_jobs"]) #The maximum number of jobs to issue to the batch system
    assert maxIssuedJobs >= 1

    #Whether to accumulate run-time statistics into config.attrib["stats"].
    stats = config.attrib.has_key("stats")
    if stats:
        startTime = time.time()
        startClock = getTotalCpuTime()

    logger.info("Starting the main loop")
    #We hack it so that we rescue jobs after the first 100 seconds to get
    #around an apparent parasol bug.
    timeSinceJobsLastRescued = time.time() - rescueJobsFrequency + 100

    while True:
        if len(updatedJobFiles) > 0:
            logger.debug("Built the jobs list, currently have %i job files, %i jobs to update and %i jobs currently issued" % (totalJobFiles, len(updatedJobFiles), len(jobIDsToJobsHash)))

        #Jobs to be handed to the batch system after this pass's checkpoint.
        jobsToIssue = []
        #Iterate over a snapshot (list(...)) because the set is mutated below.
        for jobFile in list(updatedJobFiles):
            job = ET.parse(jobFile).getroot()
            assert job.attrib["colour"] not in ("grey", "blue")

            if job.attrib["colour"] == "white": #Get ready to start the job
                if len(jobIDsToJobsHash) < maxIssuedJobs:
                    logger.debug("Job: %s is being started" % job.attrib["file"])
                    #NOTE(review): assumes job.attrib["file"] == jobFile -- confirm.
                    updatedJobFiles.remove(job.attrib["file"])

                    #Reset the log files for the job.
                    open(job.attrib["slave_log_file"], 'w').close()
                    open(job.attrib["log_file"], 'w').close()

                    job.attrib["colour"] = "grey"
                    #writeJobs([ job ]) #Check point, do this before issuing job, so state is not read until issued
                    #issueJobs([ job ], jobIDsToJobsHash, batchSystem)
                    jobsToIssue.append(job)
                else:
                    #At the issued-job cap; leave it white in updatedJobFiles for a later pass.
                    logger.debug("Job: %s is not being issued yet because we have %i jobs issued" % (job.attrib["file"], len(jobIDsToJobsHash)))

            elif job.attrib["colour"] == "black": #Job has finished okay
                logger.debug("Job: %s has finished okay" % job.attrib["file"])
                #Deal with stats: append the job's stats to the global file, then reset.
                if stats:
                    system("cat %s >> %s" % (job.attrib["stats"], config.attrib["stats"]))
                    open(job.attrib["stats"], 'w').close() #Reset the stats file
                #Relay and discard any messages the job left for the master.
                if job.find("messages") != None:
                    for message in job.find("messages").findall("message"):
                        logger.info("Received the following message from job: %s" % message.attrib["message"])
                    job.remove(job.find("messages"))
                childCount = int(job.attrib["child_count"])
                blackChildCount = int(job.attrib["black_child_count"])
                assert childCount == blackChildCount #Has no currently running child jobs
                #Launch any unborn children
                unbornChildren = job.find("children")
                unbornChild = unbornChildren.find("child")
                if unbornChild != None: #We must give birth to the unborn children
                    logger.debug("Job: %s has children to schedule" % job.attrib["file"])
                    newChildren = []
                    while unbornChild != None:
                        cummulativeChildTime = float(unbornChild.attrib["time"])
                        newJob = createJob(unbornChild.attrib.copy(), job.attrib["file"], config)
                        totalJobFiles += 1
                        updatedJobFiles.add(newJob.attrib["file"])
                        newChildren.append(newJob)
                        unbornChildren.remove(unbornChild)
                        unbornChild = unbornChildren.find("child")
                        #This was code to aggregate groups of children into one job, but we don't do this now
                        #while cummulativeChildTime < idealJobTime and unbornChild != None: #Cummulate a series of children into each job as a stack of jobs (to balance cost of parellelism with cost of running stuff in serially).
                        #    cummulativeChildTime += float(unbornChild.attrib["time"])
                        #    ET.SubElement(newJob.find("followOns"), "followOn", unbornChild.attrib.copy())
                        #    unbornChildren.remove(unbornChild)
                        #    unbornChild = unbornChildren.find("child")
                        newJob.attrib["total_time"] = str(cummulativeChildTime)
                    updatedJobFiles.remove(job.attrib["file"])
                    job.attrib["child_count"] = str(childCount + len(newChildren))
                    job.attrib["colour"] = "blue" #Blue - has children running.
                    writeJobs([ job ] + newChildren ) #Check point
                elif len(job.find("followOns").findall("followOn")) != 0: #Has another job
                    logger.debug("Job: %s has a new command that we can now issue" % job.attrib["file"])
                    ##Reset the job run info
                    job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
                    job.attrib["colour"] = "white"
                    ##End resetting the job
                    writeJobs([ job ])
                else: #Job has finished, so we can defer to any parent
                    logger.debug("Job: %s is now dead" % job.attrib["file"])
                    job.attrib["colour"] = "dead"
                    if job.attrib.has_key("parent"):
                        parent = ET.parse(job.attrib["parent"]).getroot()
                        assert parent.attrib["colour"] == "blue"
                        assert int(parent.attrib["black_child_count"]) < int(parent.attrib["child_count"])
                        parent.attrib["black_child_count"] = str(int(parent.attrib["black_child_count"]) + 1)
                        #When the last child goes black the parent becomes black and is re-queued.
                        if int(parent.attrib["child_count"]) == int(parent.attrib["black_child_count"]):
                            parent.attrib["colour"] = "black"
                            assert parent.attrib["file"] not in updatedJobFiles
                            updatedJobFiles.add(parent.attrib["file"])
                        writeJobs([ job, parent ]) #Check point
                    else:
                        writeJobs([ job ])

            elif job.attrib["colour"] == "red": #Job failed
                logger.critical("Job: %s failed" % job.attrib["file"])
                logger.critical("The log file of the failed job")
                logFile(job.attrib["log_file"], logger.critical)
                logger.critical("The log file of the slave for the failed job")
                logFile(job.attrib["slave_log_file"], logger.critical) #We log the job log file in the main loop
                #Checks
                assert len(job.find("children").findall("child")) == 0
                assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
                remainingRetyCount = int(job.attrib["remaining_retry_count"])
                if remainingRetyCount > 0: #Give it another try, maybe there is a bad node somewhere
                    job.attrib["remaining_retry_count"] = str(remainingRetyCount-1)
                    job.attrib["colour"] = "white"
                    logger.critical("Job: %s will be restarted, it has %s goes left" % (job.attrib["file"], job.attrib["remaining_retry_count"]))
                    writeJobs([ job ]) #Check point
                else:
                    #Out of retries: drop it from the update set; it stays on disk as failed.
                    assert remainingRetyCount == 0
                    updatedJobFiles.remove(job.attrib["file"])
                    logger.critical("Job: %s is completely failed" % job.attrib["file"])

            else:
                logger.debug("Job: %s is already dead, we'll get rid of it" % job.attrib["file"])
                assert job.attrib["colour"] == "dead"
                updatedJobFiles.remove(job.attrib["file"])
                totalJobFiles -= 1
                deleteJob(job, config) #This could be done earlier, but I like it this way.
        ###End of for loop
        writeJobs(jobsToIssue) #Check point, do this before issuing job, so state is not read until issued
        issueJobs(jobsToIssue, jobIDsToJobsHash, batchSystem)

        if len(jobIDsToJobsHash) == 0 and len(updatedJobFiles) == 0:
            logger.info("Only failed jobs and their dependents (%i total) are remaining, so exiting." % totalJobFiles)
            break

        if len(updatedJobFiles) > 0:
            updatedJobs = batchSystem.getUpdatedJobs() #Asks the batch system what jobs have been completed.
        else:
            #Nothing else to do, so block until the batch system reports something.
            updatedJobs = pauseForUpdatedJobs(batchSystem.getUpdatedJobs) #Asks the batch system what jobs have been completed.

        for jobID in updatedJobs.keys(): #Runs through a map of updated jobs and there status,
            result = updatedJobs[jobID]
            if jobIDsToJobsHash.has_key(jobID):
                if result == 0:
                    logger.debug("Batch system is reporting that the job %s ended sucessfully" % jobIDsToJobsHash[jobID])
                else:
                    logger.critical("Batch system is reporting that the job %s failed with exit value %i" % (jobIDsToJobsHash[jobID], result))
                processFinishedJob(jobID, result, updatedJobFiles, jobIDsToJobsHash)
            else:
                #The jobID is not one we issued (e.g. already handled by a rescue pass).
                logger.info("A result seems to already have been processed: %i" % jobID)

        if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency: #We only rescue jobs every N seconds
            reissueOverLongJobs(updatedJobFiles, jobIDsToJobsHash, config, batchSystem)
            logger.info("Reissued any over long jobs")
            reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem)
            logger.info("Rescued any (long) missing jobs")
            timeSinceJobsLastRescued = time.time()

        #Going to sleep to let the job system catch up.
        time.sleep(waitDuration)

    #Close off the stats file with totals for the whole run.
    if stats:
        fileHandle = open(config.attrib["stats"], 'a')
        fileHandle.write("<total_time time='%s' clock='%s'/></stats>" % (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
        fileHandle.close()

    logger.info("Finished the main loop")
    return totalJobFiles #Returns number of failed jobs
def main(): """Reports the state of the job tree. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser("usage: %prog [options] \nThe colours returned indicate the state of the job.\n\ \twhite: job has not been started yet\n\ \tgrey: job is issued to batch system\n\ \tred: job failed\n\ \tblue: job has children currently being processed\n\ \tblack: job has finished and will be processed (transient state)\n\ \tdead: job is totally finished and is awaiting deletion (transient state)", "%prog 0.1") parser.add_option("--jobTree", dest="jobTree", help="Directory containing the job tree") parser.add_option("--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of errors", default=False) parser.add_option("--failIfNotComplete", dest="failIfNotComplete", action="store_true", help="Return exit value of 1 if job tree jobs not all completed", default=False) options, args = parseBasicOptions(parser) logger.info("Parsed arguments") assert len(args) == 0 if len(sys.argv) == 1: parser.print_help() sys.exit(0) ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for job tree") assert options.jobTree != None assert os.path.isdir(options.jobTree) #The given job dir tree must exist. assert os.path.isfile(os.path.join(options.jobTree, "config.xml")) #A valid job tree must contain the config gile assert os.path.isdir(os.path.join(options.jobTree, "jobs")) #A job tree must have a directory of jobs. assert os.path.isdir(os.path.join(options.jobTree, "tempDirDir")) #A job tree must have a directory of temporary directories (for jobs to make temp files in). assert os.path.isdir(os.path.join(options.jobTree, "logFileDir")) #A job tree must have a directory of log files. 
assert os.path.isdir(os.path.join(options.jobTree, "slaveLogFileDir")) #A job tree must have a directory of slave log files. ########################################## #Read the total job number ########################################## config = ET.parse(os.path.join(options.jobTree, "config.xml")).getroot() ########################################## #Survey the status of the job and report. ########################################## colours = {} jobFiles = TempFileTree(config.attrib["job_file_dir"]).listFiles() if len(jobFiles) > 0: logger.info("Collating the colours of the job tree") for absFileName in jobFiles: job = parseJobFile(absFileName) if job != None: if not colours.has_key(job.attrib["colour"]): colours[job.attrib["colour"]] = 0 colours[job.attrib["colour"]] += 1 else: logger.info("There are no jobs to collate") print "There are %i jobs currently in job tree: %s" % \ (len(jobFiles), options.jobTree) for colour in colours.keys(): print "\tColour: %s, number of jobs: %s" % (colour, colours[colour]) if options.verbose: #Verbose currently means outputting the files that have failed. for absFileName in jobFiles: job = parseJobFile(absFileName) if job != None: if job.attrib["colour"] == "red": if os.path.isfile(job.attrib["log_file"]): def fn(string): print string logFile(job.attrib["log_file"], fn) else: logger.info("Log file for job %s is not present" % job.attrib["file"]) if len(jobFiles) != 0 and options.failIfNotComplete: sys.exit(1)