Exemple #1
0
def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.
    """    
    waitDuration = float(config.attrib["wait_duration"])
    assert waitDuration >= 0
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters, wait duration %s, rescue jobs frequency: %s max job duration: %s" % \
                (waitDuration, rescueJobsFrequency, maxJobDuration))
    
    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0 #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")
    
    jobFiles = config.attrib["job_file_dir"].listFiles()
    logger.info("Got a list of job files")
    
    #Repair the job tree using any .old files
    fixJobsList(config, jobFiles)
    logger.info("Fixed the job files using any .old files")
    
    #Get jobs that were running, or that had failed reset to 'white' status
    restartFailedJobs(config, jobFiles)
    logger.info("Reworked failed jobs")
    
    updatedJobFiles = set() #Jobs whose status needs updating, either because they have finished, or because they need to be started.
    for jobFile in jobFiles:
        job = ET.parse(jobFile).getroot()
        if job.attrib["colour"] not in ("grey", "blue"):
            updatedJobFiles.add(jobFile)
    logger.info("Got the active (non grey/blue) job files")
    
    totalJobFiles = len(jobFiles) #Total number of job files we have.
    jobIDsToJobsHash = {} #A hash of the currently running jobs ids, made by the batch system.
    
    idealJobTime = float(config.attrib["job_time"]) 
    assert idealJobTime > 0.0
    
    maxIssuedJobs = int(config.attrib["max_jobs"]) #The maximum number of jobs to issue to the batch system
    assert maxIssuedJobs >= 1
    
    stats = config.attrib.has_key("stats")
    if stats:
        startTime = time.time()
        startClock = getTotalCpuTime()
    
    logger.info("Starting the main loop")
    timeSinceJobsLastRescued = time.time() - rescueJobsFrequency + 100 #We hack it so that we rescue jobs after the first 100 seconds to get around an apparent parasol bug
    while True:
        if len(updatedJobFiles) > 0:
            logger.debug("Built the jobs list, currently have %i job files, %i jobs to update and %i jobs currently issued" % (totalJobFiles, len(updatedJobFiles), len(jobIDsToJobsHash)))
        
	jobsToIssue = []
        for jobFile in list(updatedJobFiles):
            job = ET.parse(jobFile).getroot()
            assert job.attrib["colour"] not in ("grey", "blue")
            
            if job.attrib["colour"] == "white": #Get ready to start the job
                if len(jobIDsToJobsHash) < maxIssuedJobs:
                    logger.debug("Job: %s is being started" % job.attrib["file"])
                    updatedJobFiles.remove(job.attrib["file"])
                    
                    #Reset the log files for the job.
                    open(job.attrib["slave_log_file"], 'w').close()
                    open(job.attrib["log_file"], 'w').close()
                    
                    job.attrib["colour"] = "grey"
                    #writeJobs([ job ]) #Check point, do this before issuing job, so state is not read until issued
                    
                    #issueJobs([ job ], jobIDsToJobsHash, batchSystem)
		    jobsToIssue.append(job)
                else:
                    logger.debug("Job: %s is not being issued yet because we have %i jobs issued" % (job.attrib["file"], len(jobIDsToJobsHash)))
            elif job.attrib["colour"] == "black": #Job has finished okay
                logger.debug("Job: %s has finished okay" % job.attrib["file"])
                #Deal with stats
                if stats:
                    system("cat %s >> %s" % (job.attrib["stats"], config.attrib["stats"]))
                    open(job.attrib["stats"], 'w').close() #Reset the stats file
                if job.find("messages") != None:
                    for message in job.find("messages").findall("message"):
                        logger.info("Received the following message from job: %s" % message.attrib["message"])
                    job.remove(job.find("messages"))
                childCount = int(job.attrib["child_count"])
                blackChildCount = int(job.attrib["black_child_count"])
                assert childCount == blackChildCount #Has no currently running child jobs
                #Launch any unborn children
                unbornChildren = job.find("children")
                unbornChild = unbornChildren.find("child")
                if unbornChild != None: #We must give birth to the unborn children
                    logger.debug("Job: %s has children to schedule" % job.attrib["file"])
                    newChildren = []
                    while unbornChild != None:
                        cummulativeChildTime = float(unbornChild.attrib["time"])
                        
                        newJob = createJob(unbornChild.attrib.copy(), job.attrib["file"], config)
                        
                        totalJobFiles += 1
                        updatedJobFiles.add(newJob.attrib["file"])
                        
                        newChildren.append(newJob)
                        unbornChildren.remove(unbornChild)
                        unbornChild = unbornChildren.find("child")
                        
                        #This was code to aggregate groups of children into one job, but we don't do this now
                        #while cummulativeChildTime < idealJobTime and unbornChild != None: #Cummulate a series of children into each job as a stack of jobs (to balance cost of parellelism with cost of running stuff in serially).
                        #    cummulativeChildTime += float(unbornChild.attrib["time"])
                        #    ET.SubElement(newJob.find("followOns"), "followOn", unbornChild.attrib.copy())    
                        #    unbornChildren.remove(unbornChild)
                        #    unbornChild = unbornChildren.find("child")
                        newJob.attrib["total_time"] = str(cummulativeChildTime)
                    
                    updatedJobFiles.remove(job.attrib["file"])
                    job.attrib["child_count"] = str(childCount + len(newChildren))
                    job.attrib["colour"] = "blue" #Blue - has children running.
                    writeJobs([ job ] + newChildren ) #Check point
                    
                elif len(job.find("followOns").findall("followOn")) != 0: #Has another job
                    logger.debug("Job: %s has a new command that we can now issue" % job.attrib["file"])
                    ##Reset the job run info
                    job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
                    job.attrib["colour"] = "white"
                    ##End resetting the job
                    writeJobs([ job ])
                    
                else: #Job has finished, so we can defer to any parent
                    logger.debug("Job: %s is now dead" % job.attrib["file"])
                    job.attrib["colour"] = "dead"
                    if job.attrib.has_key("parent"):
                        parent = ET.parse(job.attrib["parent"]).getroot()
                        assert parent.attrib["colour"] == "blue"
                        assert int(parent.attrib["black_child_count"]) < int(parent.attrib["child_count"])
                        parent.attrib["black_child_count"] = str(int(parent.attrib["black_child_count"]) + 1)
                        if int(parent.attrib["child_count"]) == int(parent.attrib["black_child_count"]):
                            parent.attrib["colour"] = "black"
                            assert parent.attrib["file"] not in updatedJobFiles
                            updatedJobFiles.add(parent.attrib["file"])
                        writeJobs([ job, parent ]) #Check point
                    else:
                        writeJobs([ job ])
                         
            elif job.attrib["colour"] == "red": #Job failed
                logger.critical("Job: %s failed" % job.attrib["file"])
                logger.critical("The log file of the failed job")
                logFile(job.attrib["log_file"], logger.critical)
                logger.critical("The log file of the slave for the failed job")
                logFile(job.attrib["slave_log_file"], logger.critical) #We log the job log file in the main loop
                #Checks
                assert len(job.find("children").findall("child")) == 0
                assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
                
                remainingRetyCount = int(job.attrib["remaining_retry_count"])
                if remainingRetyCount > 0: #Give it another try, maybe there is a bad node somewhere
                    job.attrib["remaining_retry_count"] = str(remainingRetyCount-1)
                    job.attrib["colour"] = "white"
                    logger.critical("Job: %s will be restarted, it has %s goes left" % (job.attrib["file"], job.attrib["remaining_retry_count"]))
                    writeJobs([ job ]) #Check point
                else:
                    assert remainingRetyCount == 0
                    updatedJobFiles.remove(job.attrib["file"])
                    logger.critical("Job: %s is completely failed" % job.attrib["file"])
                    
            else:
                logger.debug("Job: %s is already dead, we'll get rid of it" % job.attrib["file"])
                assert job.attrib["colour"] == "dead"
                updatedJobFiles.remove(job.attrib["file"])
                totalJobFiles -= 1
                deleteJob(job, config) #This could be done earlier, but I like it this way.

        ###End of for loop
        writeJobs(jobsToIssue) #Check point, do this before issuing job, so state is not read until issued
        issueJobs(jobsToIssue, jobIDsToJobsHash, batchSystem)

                
        if len(jobIDsToJobsHash) == 0 and len(updatedJobFiles) == 0:
            logger.info("Only failed jobs and their dependents (%i total) are remaining, so exiting." % totalJobFiles)
            break
        
        if len(updatedJobFiles) > 0:
            updatedJobs = batchSystem.getUpdatedJobs() #Asks the batch system what jobs have been completed.
        else:
            updatedJobs = pauseForUpdatedJobs(batchSystem.getUpdatedJobs) #Asks the batch system what jobs have been completed.
        for jobID in updatedJobs.keys(): #Runs through a map of updated jobs and there status, 
            result = updatedJobs[jobID]
            if jobIDsToJobsHash.has_key(jobID): 
                if result == 0:
                    logger.debug("Batch system is reporting that the job %s ended sucessfully" % jobIDsToJobsHash[jobID])   
                else:
                    logger.critical("Batch system is reporting that the job %s failed with exit value %i" % (jobIDsToJobsHash[jobID], result))  
                processFinishedJob(jobID, result, updatedJobFiles, jobIDsToJobsHash)
            else:
                logger.info("A result seems to already have been processed: %i" % jobID) #T
        
        if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency: #We only rescue jobs every N seconds
            reissueOverLongJobs(updatedJobFiles, jobIDsToJobsHash, config, batchSystem)
            logger.info("Reissued any over long jobs")
            
            reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem)
            logger.info("Rescued any (long) missing jobs")
            timeSinceJobsLastRescued = time.time()
        #Going to sleep to let the job system catch up.
        time.sleep(waitDuration)
    
    if stats:
        fileHandle = open(config.attrib["stats"], 'a')
        fileHandle.write("<total_time time='%s' clock='%s'/></stats>" % (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
        fileHandle.close()
    
    logger.info("Finished the main loop")     
    
    return totalJobFiles #Returns number of failed jobs
Exemple #2
0
def main():
    """Reports the state of the job tree.
    """
    
    ##########################################
    #Construct the arguments.
    ##########################################  
    
    parser = getBasicOptionParser("usage: %prog [options] \nThe colours returned indicate the state of the job.\n\
\twhite: job has not been started yet\n\
\tgrey: job is issued to batch system\n\
\tred: job failed\n\
\tblue: job has children currently being processed\n\
\tblack: job has finished and will be processed (transient state)\n\
\tdead: job is totally finished and is awaiting deletion (transient state)", "%prog 0.1")
    
    parser.add_option("--jobTree", dest="jobTree", 
                      help="Directory containing the job tree")
    
    parser.add_option("--verbose", dest="verbose", action="store_true",
                      help="Print loads of information, particularly all the log files of errors",
                      default=False)
    
    parser.add_option("--failIfNotComplete", dest="failIfNotComplete", action="store_true",
                      help="Return exit value of 1 if job tree jobs not all completed",
                      default=False)
    
    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")
    assert len(args) == 0
    
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    
    ##########################################
    #Do some checks.
    ##########################################
    
    logger.info("Checking if we have files for job tree")
    assert options.jobTree != None
    assert os.path.isdir(options.jobTree) #The given job dir tree must exist.
    assert os.path.isfile(os.path.join(options.jobTree, "config.xml")) #A valid job tree must contain the config gile
    assert os.path.isdir(os.path.join(options.jobTree, "jobs")) #A job tree must have a directory of jobs.
    assert os.path.isdir(os.path.join(options.jobTree, "tempDirDir")) #A job tree must have a directory of temporary directories (for jobs to make temp files in).
    assert os.path.isdir(os.path.join(options.jobTree, "logFileDir")) #A job tree must have a directory of log files.
    assert os.path.isdir(os.path.join(options.jobTree, "slaveLogFileDir")) #A job tree must have a directory of slave log files.
    
    ##########################################
    #Read the total job number
    ##########################################  
    
    config = ET.parse(os.path.join(options.jobTree, "config.xml")).getroot()
    
    ##########################################
    #Survey the status of the job and report.
    ##########################################  
    
    colours = {}
    jobFiles = TempFileTree(config.attrib["job_file_dir"]).listFiles()
    if len(jobFiles) > 0:
        logger.info("Collating the colours of the job tree")
        for absFileName in jobFiles:
            job = parseJobFile(absFileName)
            if job != None:
                if not colours.has_key(job.attrib["colour"]):
                    colours[job.attrib["colour"]] = 0
                colours[job.attrib["colour"]] += 1
    else:
        logger.info("There are no jobs to collate")
    
    print "There are %i jobs currently in job tree: %s" % \
    (len(jobFiles), options.jobTree)
    
    for colour in colours.keys():
        print "\tColour: %s, number of jobs: %s" % (colour, colours[colour])
    
    if options.verbose: #Verbose currently means outputting the files that have failed.
        for absFileName in jobFiles:
            job = parseJobFile(absFileName)
            if job != None:
                if job.attrib["colour"] == "red":
                    if os.path.isfile(job.attrib["log_file"]):
                        def fn(string):
                            print string
                        logFile(job.attrib["log_file"], fn)
                    else:
                        logger.info("Log file for job %s is not present" % job.attrib["file"])
    
    if len(jobFiles) != 0 and options.failIfNotComplete:
        sys.exit(1)