Example #1
def statsAggregatorProcess(jobTreePath, tempDirs, stop):
    #Overall timing 
    startTime = time.time()
    startClock = getTotalCpuTime()
    
    #Start off the stats file
    fileHandle = open(getStatsFileName(jobTreePath), 'w')
    fileHandle.write('<?xml version="1.0" ?><stats>')
    statsFile = getStatsFileName(jobTreePath)
    
    #The main loop
    while True:
        def fn():
            i = 0
            for dir in tempDirs:
                for tempFile in os.listdir(dir):
                    if tempFile[-3:] != "new":
                        absTempFile = os.path.join(dir, tempFile)
                        fH = open(absTempFile, 'r')
                        for line in fH.readlines():
                            fileHandle.write(line)
                        fH.close()
                        os.remove(absTempFile)
                        i += 1
            return i
        if not stop.empty():
            fn()
            break
        if not fn():
            time.sleep(0.5) #Avoid cycling too fast
    
    #Finish the stats file
    fileHandle.write("<total_time time='%s' clock='%s'/></stats>" % (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
    fileHandle.close()
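
The aggregator above polls stop with stop.empty(), so any queue-like object can serve as the shutdown signal. A minimal usage sketch, assuming a multiprocessing.Queue and hypothetical paths (not part of the original source):

from multiprocessing import Process, Queue

jobTreePath = "/tmp/jobTree"          #Hypothetical job tree location
tempDirs = ["/tmp/jobTree/stats0"]    #Hypothetical per-slave stats directories
stop = Queue()
aggregator = Process(target=statsAggregatorProcess,
                     args=(jobTreePath, tempDirs, stop))
aggregator.start()
#...slaves write finished stats files (anything not ending in ".new") into tempDirs...
stop.put(True)   #Any item makes stop.empty() return False, ending the main loop
aggregator.join()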
Example #2
def statsAggregatorProcess(jobTreePath, tempDirs, stop):
    #Overall timing
    startTime = time.time()
    startClock = getTotalCpuTime()

    #Start off the stats file
    fileHandle = open(getStatsFileName(jobTreePath), 'w')
    fileHandle.write('<?xml version="1.0" ?><stats>')
    statsFile = getStatsFileName(jobTreePath)

    #The main loop
    timeSinceOutFileLastFlushed = time.time()
    while True:

        def scanDirectoriesAndScrapeStats():
            numberOfFilesProcessed = 0
            for dir in tempDirs:
                for tempFile in os.listdir(dir):
                    if tempFile[-3:] != "new":
                        absTempFile = os.path.join(dir, tempFile)
                        fH = open(absTempFile, 'r')
                        for line in fH.readlines():
                            fileHandle.write(line)
                        fH.close()
                        os.remove(absTempFile)
                        numberOfFilesProcessed += 1
            return numberOfFilesProcessed

        if not stop.empty():  #This is an indirect way of getting a message to the process to exit
            scanDirectoriesAndScrapeStats()
            break
        if scanDirectoriesAndScrapeStats() == 0:
            time.sleep(0.5)  #Avoid cycling too fast
        if time.time() - timeSinceOutFileLastFlushed > 60:  #Flush the results file every minute
            fileHandle.flush()
            timeSinceOutFileLastFlushed = time.time()

    #Finish the stats file
    fileHandle.write(
        "<total_time time='%s' clock='%s'/></stats>" %
        (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
    fileHandle.close()
Example #3
def statsAggregatorProcess(jobTreePath, tempDirs, stop):
    #Overall timing
    startTime = time.time()
    startClock = getTotalCpuTime()

    #Start off the stats file
    fileHandle = open(getStatsFileName(jobTreePath), 'w')
    fileHandle.write('<?xml version="1.0" ?><stats>')
    statsFile = getStatsFileName(jobTreePath)

    #The main loop
    timeSinceOutFileLastFlushed = time.time()
    while True:
        def scanDirectoriesAndScrapeStats():
            numberOfFilesProcessed = 0
            for dir in tempDirs:
                for tempFile in os.listdir(dir):
                    if tempFile[-3:] != "new":
                        absTempFile = os.path.join(dir, tempFile)
                        fH = open(absTempFile, 'r')
                        for line in fH.readlines():
                            fileHandle.write(line)
                        fH.close()
                        os.remove(absTempFile)
                        numberOfFilesProcessed += 1
            return numberOfFilesProcessed 
        if not stop.empty(): #This is an indirect way of getting a message to the process to exit
            scanDirectoriesAndScrapeStats()
            break
        if scanDirectoriesAndScrapeStats() == 0:
            time.sleep(0.5) #Avoid cycling too fast
        if time.time() - timeSinceOutFileLastFlushed > 60: #Flush the results file every minute
            fileHandle.flush() 
            timeSinceOutFileLastFlushed = time.time()

    #Finish the stats file
    fileHandle.write("<total_time time='%s' clock='%s'/></stats>" % (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
    fileHandle.close()
Example #4
def processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats, environment, 
               localSlaveTempDir, localTempDir):
    """Runs a job.
    """
    from sonLib.bioio import logger
    from sonLib.bioio import system
    from sonLib.bioio import getTotalCpuTime
    from sonLib.bioio import redirectLoggerStreamHandlers
    
    assert len(job.find("children").findall("child")) == 0
    assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
    command = jobToRun.attrib["command"]
    #Copy the job file to be edited
    
    tempJob = ET.Element("job")
    ET.SubElement(tempJob, "children")
    
    #Log for job
    tempJob.attrib["log_level"] = job.attrib["log_level"]
    
    #Time length of 'ideal' job before further parallelism is required
    tempJob.attrib["job_time"] = job.attrib["job_time"]

    #Temp file dirs for job.
    tempJob.attrib["local_temp_dir"] = localTempDir
    depth = len(job.find("followOns").findall("followOn"))
    tempJob.attrib["global_temp_dir"] = os.path.join(job.attrib["global_temp_dir"], str(depth))
    if not os.path.isdir(tempJob.attrib["global_temp_dir"]): #Ensures that the global temp dirs of each level are kept separate.
        os.mkdir(tempJob.attrib["global_temp_dir"])
        os.chmod(tempJob.attrib["global_temp_dir"], 0777)
    if os.path.isdir(os.path.join(job.attrib["global_temp_dir"], str(depth+1))):
        system("rm -rf %s" % os.path.join(job.attrib["global_temp_dir"], str(depth+1)))
    assert not os.path.isdir(os.path.join(job.attrib["global_temp_dir"], str(depth+2)))
    
    #Deal with memory and cpu requirements (this tells the running job how much cpu and memory it has
    #available, according to the batch system)
    tempJob.attrib["available_memory"] = str(memoryAvailable) 
    tempJob.attrib["available_cpu"] = str(cpuAvailable)
    
    #Run the actual command
    tempLogFile = os.path.join(localSlaveTempDir, "temp.log")
    fileHandle = open(tempLogFile, 'w')
    
    if stats != None:
        tempJob.attrib["stats"] = ""
        startTime = time.time()
        startClock = getTotalCpuTime()
    
    #If this is a scriptTree python job, run it in this python process rather than launching a new one
    if command[:10] == "scriptTree":
        import jobTree.scriptTree.scriptTree
        savedStdErr = sys.stderr
        savedStdOut = sys.stdout
        exitValue = 0
        try: 
            sys.stderr = fileHandle 
            sys.stdout = fileHandle
            redirectLoggerStreamHandlers(savedStdErr, fileHandle)
            l = command.split()
            jobTree.scriptTree.scriptTree.run(tempJob, l[1], l[2:])
        except:
            traceback.print_exc(file = fileHandle)
            exitValue = 1
        sys.stderr = savedStdErr
        sys.stdout = savedStdOut
        redirectLoggerStreamHandlers(fileHandle, sys.stderr)
        if exitValue == 1:
            logger.critical("Caught an exception in the target being run")
    else:
        if "JOB_FILE" not in command:
            logger.critical("There is no 'JOB_FILE' string in the command to be run to take the job-file argument: %s" % command)
            job.attrib["colour"] = "red" #Update the colour
        
        #Now write the temp job file
        tempFile = os.path.join(localSlaveTempDir, "tempJob.xml")
        fileHandle2 = open(tempFile, 'w')  
        tree = ET.ElementTree(tempJob)
        tree.write(fileHandle2)
        fileHandle2.close()
        logger.info("Copied the jobs files ready for the job")
        
        process = subprocess.Popen(command.replace("JOB_FILE", tempFile), shell=True, stdout=fileHandle, stderr=subprocess.STDOUT, env=environment)
        sts = os.waitpid(process.pid, 0)
        exitValue = sts[1]
        if exitValue == 0:
            tempJob = ET.parse(tempFile).getroot()
        
    fileHandle.close()
    truncateFile(tempLogFile, int(job.attrib["max_log_file_size"]))
    
    logger.info("Ran the job command=%s with exit status %i" % (command, exitValue))
    
    if exitValue == 0:
        logger.info("Passed the job, okay")
        
        if stats != None:
            jobTag = ET.SubElement(stats, "job", { "time":str(time.time() - startTime), "clock":str(getTotalCpuTime() - startClock) })
            if tempJob.find("stack") != None:
                jobTag.append(tempJob.find("stack"))
        
        job.attrib["colour"] = "black" #Update the colour
        
        #Deal with any logging messages directed at the master
        if tempJob.find("messages") != None:
            messages = job.find("messages")
            if messages == None:
                messages = ET.SubElement(job, "messages")
            for messageTag in tempJob.find("messages").findall("message"):
                messages.append(messageTag)
        
        #Update the runtime of the stack..
        totalRuntime = float(job.attrib["total_time"])  #This is the estimated runtime of the jobs on the follow-on stack
        runtime = float(jobToRun.attrib["time"])
        totalRuntime -= runtime
        if totalRuntime < 0.0:
            totalRuntime = 0.0
        
        #The children
        children = job.find("children")
        assert len(children.findall("child")) == 0 #The children
        assert tempJob.find("children") != None
        for child in tempJob.find("children").findall("child"):
            memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, child)
            ET.SubElement(children, "child", { "command":child.attrib["command"], 
                    "time":str(compTime), "memory":str(memory), "cpu":str(cpu) })
            logger.info("Making a child with command: %s" % (child.attrib["command"]))
        
        #The follow on command
        followOns = job.find("followOns")
        followOns.remove(followOns.findall("followOn")[-1]) #Remove the old job
        if tempJob.attrib.has_key("command"):
            memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, tempJob)
            ET.SubElement(followOns, "followOn", { "command":tempJob.attrib["command"], 
                    "time":str(compTime), "memory":str(memory), "cpu":str(cpu) })
            ##Add the runtime to the total runtime..
            totalRuntime += compTime
            logger.info("Making a follow on job with command: %s" % tempJob.attrib["command"])
            
        elif len(tempJob.find("children").findall("child")) != 0: #This is to keep the stack of follow on jobs consistent.
            ET.SubElement(followOns, "followOn", { "command":"echo JOB_FILE", "time":"0", "memory":"1000000", "cpu":"1" })
            logger.info("Making a stub follow on job")
        #Write back the runtime, after adding the follow on time and subtracting the time of the run job.
        job.attrib["total_time"] = str(totalRuntime)
    else:
        logger.critical("Failed the job")
        job.attrib["colour"] = "red" #Update the colour
    
    #Clean up
    system("rm -rf %s/*" % (localTempDir))
    logger.info("Cleaned up by removing the contents of the local temporary file directory for the job")
    
    return tempLogFile
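
A note on the JOB_FILE convention used in the non-scriptTree branch above: the literal token JOB_FILE in the command is replaced with the path of the temporary job XML before the command is run. A hypothetical command string (not from the original source) would therefore look like:

command = "python myJobScript.py --jobFile JOB_FILE"   #Hypothetical user command
#processJob effectively runs:
#  subprocess.Popen(command.replace("JOB_FILE", tempFile), shell=True, ...)
#so myJobScript.py receives the path of the tempJob.xml written above.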
Example #5
def main():
    sys.path.append(sys.argv[1])
    sys.argv.remove(sys.argv[1])
    
    #Now we can import all the stuff..
    from sonLib.bioio import getBasicOptionParser
    from sonLib.bioio import parseBasicOptions
    from sonLib.bioio import logger
    from sonLib.bioio import addLoggingFileHandler
    from sonLib.bioio import setLogLevel
    from sonLib.bioio import getTotalCpuTime
    from sonLib.bioio import getTempDirectory
    from jobTree.src.master import writeJobs
    from jobTree.src.master import readJob
    from sonLib.bioio import system
    
    ##########################################
    #Construct the arguments.
    ##########################################
    
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")
    
    parser.add_option("--job", dest="jobFile", 
                      help="Job file containing command to run",
                      default="None")
    
    options, args = parseBasicOptions(parser)
    assert len(args) == 0

    ##########################################
    #Parse the job.
    ##########################################
    
    job = readJob(options.jobFile)
    
    ##########################################
    #Setup the logging
    ##########################################
    
    #Setup the logging
    setLogLevel(job.attrib["log_level"])
    addLoggingFileHandler(job.attrib["slave_log_file"], rotatingLogging=False)
    logger.info("Parsed arguments and set up logging")
    
    ##########################################
    #Setup the stats, if requested
    ##########################################
    
    if job.attrib.has_key("stats"):
        startTime = time.time()
        startClock = getTotalCpuTime()
        stats = ET.Element("slave")
    else:
        stats = None
    
    ##########################################
    #Load the environment for the job
    ##########################################
    
    #First load the environment for the job.
    fileHandle = open(job.attrib["environment_file"], 'r')
    environment = cPickle.load(fileHandle)
    fileHandle.close()
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)
    #os.environ = environment
    #os.putenv(key, value)
    logger.info("Loaded the environment for the process")
        
    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in.
    localSlaveTempDir = getTempDirectory()
    localTempDir = os.path.join(localSlaveTempDir, "localTempDir") 
    os.mkdir(localTempDir)
    os.chmod(localTempDir, 0777)
    
    ##########################################
    #Run the script.
    ##########################################
    
    maxTime = float(job.attrib["job_time"])
    assert maxTime > 0.0
    assert maxTime < sys.maxint
    jobToRun = job.find("followOns").findall("followOn")[-1]
    memoryAvailable = int(jobToRun.attrib["memory"])
    cpuAvailable = int(jobToRun.attrib["cpu"])
    startTime = time.time()
    while True:
        tempLogFile = processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats, environment, localSlaveTempDir, localTempDir)
        
        if job.attrib["colour"] != "black": 
            logger.critical("Exiting the slave because of a failed job")
            system("mv %s %s" % (tempLogFile, job.attrib["log_file"])) #Copy back the job log file, because we saw failure
            break
        elif job.attrib.has_key("reportAllJobLogFiles"):
            logger.info("Exiting because we've been asked to report all logs, and this involves returning to the master")
            #Copy across the log file
            system("mv %s %s" % (tempLogFile, job.attrib["log_file"]))
            break
   
        totalRuntime = float(job.attrib["total_time"])  #This is the estimated runtime of the jobs on the follow-on stack
        
        childrenNode = job.find("children")
        childrenList = childrenNode.findall("child")
        #childRuntime = sum([ float(child.attrib["time"]) for child in childrenList ])
            
        if len(childrenList) >= 2: # or totalRuntime + childRuntime > maxTime: #We are going to have to return to the parent
            logger.info("No more jobs can run in series by this slave, its got %i children" % len(childrenList))
            break
        
        if time.time() - startTime > maxTime:
            logger.info("We are breaking because the maximum time the job should run for has been exceeded")
            break
        
        followOns = job.find("followOns")
        while len(childrenList) > 0:
            child = childrenList.pop()
            childrenNode.remove(child)
            totalRuntime += float(child.attrib["time"])
            ET.SubElement(followOns, "followOn", child.attrib.copy())
        #assert totalRuntime <= maxTime + 1 #The plus one second to avoid unimportant rounding errors
        job.attrib["total_time"] = str(totalRuntime)
        assert len(childrenNode.findall("child")) == 0
        
        if len(followOns.findall("followOn")) == 0:
            logger.info("No more jobs can run by this slave as we have exhausted the follow ons")
            break
        
        #Get the next job and see if we have enough cpu and memory to run it..
        jobToRun = job.find("followOns").findall("followOn")[-1]
        if int(jobToRun.attrib["memory"]) > memoryAvailable:
            logger.info("We need more memory for the next job, so finishing")
            break
        if int(jobToRun.attrib["cpu"]) > cpuAvailable:
            logger.info("We need more cpus for the next job, so finishing")
            break
        
        ##Update the job so we can start the next loop cycle
        job.attrib["colour"] = "grey"
        writeJobs([ job ])
        logger.info("Updated the status of the job to grey and starting the next job")
    
    #Write back the job file with the updated jobs, using the checkpoint method.
    writeJobs([ job ])
    logger.info("Written out an updated job file")
    
    logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds" % (time.time() - startTime))
    
    ##########################################
    #Cleanup the temporary directory
    ##########################################
    
    system("rm -rf %s" % localSlaveTempDir)
    
    ##########################################
    #Finish up the stats
    ##########################################
    
    if stats != None:
        stats.attrib["time"] = str(time.time() - startTime)
        stats.attrib["clock"] = str(getTotalCpuTime() - startClock)
        fileHandle = open(job.attrib["stats"], 'w')
        ET.ElementTree(stats).write(fileHandle)
        fileHandle.close()
Example #6
def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.
    """
    waitDuration = float(config.attrib["wait_duration"])
    assert waitDuration >= 0
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters, wait duration %s, rescue jobs frequency: %s max job duration: %s" % \
                (waitDuration, rescueJobsFrequency, maxJobDuration))
    
    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0 #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")
    
    jobFiles = config.attrib["job_file_dir"].listFiles()
    logger.info("Got a list of job files")
    
    #Repair the job tree using any .old files
    fixJobsList(config, jobFiles)
    logger.info("Fixed the job files using any .old files")
    
    #Get jobs that were running, or that had failed reset to 'grey' status
    restartFailedJobs(config, jobFiles)
    logger.info("Reworked failed jobs")
    
    updatedJobFiles = set() #Jobs whose status needs updating, either because they have finished, or because they need to be started.
    for jobFile in jobFiles:
        job = readJob(jobFile)
        if job.attrib["colour"] not in ("blue"):
            updatedJobFiles.add(jobFile)
    logger.info("Got the active (non blue) job files")
    
    totalJobFiles = len(jobFiles) #Total number of job files we have.
    jobIDsToJobsHash = {} #A hash of the currently running jobs ids, made by the batch system.
    
    idealJobTime = float(config.attrib["job_time"]) 
    assert idealJobTime > 0.0
    
    reportAllJobLogFiles = bool(int(config.attrib["reportAllJobLogFiles"]))
    
    stats = config.attrib.has_key("stats")
    if stats:
        startTime = time.time()
        startClock = getTotalCpuTime()
        
    #Stuff to handle the maximum number of issued jobs
    queueingJobs = []
    maxJobs = int(config.attrib["max_jobs"])
    cpusUsed = 0
    
    logger.info("Starting the main loop")
    timeSinceJobsLastRescued = time.time() - rescueJobsFrequency + 100 #We hack it so that we rescue jobs after the first 100 seconds to get around an apparent parasol bug
    while True: 
        if len(updatedJobFiles) > 0:
            logger.debug("Built the jobs list, currently have %i job files, %i jobs to update and %i jobs currently issued" % (totalJobFiles, len(updatedJobFiles), len(jobIDsToJobsHash)))
        
        for jobFile in list(updatedJobFiles):
            job = readJob(jobFile)
            assert job.attrib["colour"] is not "blue"
            
            ##Check the log files exist, because they must ultimately be cleaned up by their respective file trees.
            def checkFileExists(fileName, type):
                if not os.path.isfile(fileName): #We need to keep these files in existence.
                    open(fileName, 'w').close()
                    logger.critical("The file %s of type %s for job %s had disappeared" % (fileName, type, jobFile))
            checkFileExists(job.attrib["log_file"], "log_file")
            checkFileExists(job.attrib["slave_log_file"], "slave_log_file")
            if stats:
                checkFileExists(job.attrib["stats"], "stats")
            
            def reissueJob(job):
                #Reset the log files for the job.
                updatedJobFiles.remove(jobFile)
                open(job.attrib["slave_log_file"], 'w').close()
                open(job.attrib["log_file"], 'w').close()
                assert job.attrib["colour"] == "grey"
                return issueJobs([ job ], jobIDsToJobsHash, batchSystem, queueingJobs, maxJobs, cpusUsed)
                
            def makeGreyAndReissueJob(job):
                job.attrib["colour"] = "grey"
                writeJobs([ job ])
                return reissueJob(job)
            
            if job.attrib["colour"] == "grey": #Get ready to start the job
                cpusUsed = reissueJob(job)
            elif job.attrib["colour"] == "black": #Job has finished okay
                logger.debug("Job: %s has finished okay" % job.attrib["file"])
                if reportAllJobLogFiles:
                    reportJobLogFiles(job)
                #Deal with stats
                if stats:
                    system("cat %s >> %s" % (job.attrib["stats"], config.attrib["stats"]))
                    open(job.attrib["stats"], 'w').close() #Reset the stats file
                if job.find("messages") != None:
                    for message in job.find("messages").findall("message"):
                        logger.critical("Received the following message from job: %s" % message.attrib["message"])
                    job.remove(job.find("messages"))
                childCount = int(job.attrib["child_count"])
                blackChildCount = int(job.attrib["black_child_count"])
                assert childCount == blackChildCount #Has no currently running child jobs
                #Launch any unborn children
                unbornChildren = job.find("children")
                unbornChild = unbornChildren.find("child")
                if unbornChild != None: #We must give birth to the unborn children
                    logger.debug("Job: %s has %i children to schedule" % (job.attrib["file"], len(unbornChildren.findall("child"))))
                    newChildren = []
                    while unbornChild != None:
                        newJob = createJob(unbornChild.attrib, job.attrib["file"], config)
                        totalJobFiles += 1
                        newChildren.append(newJob)
                        unbornChildren.remove(unbornChild)
                        unbornChild = unbornChildren.find("child")
                    
                    updatedJobFiles.remove(job.attrib["file"])
                    job.attrib["child_count"] = str(childCount + len(newChildren))
                    job.attrib["colour"] = "blue" #Blue - has children running.
                    writeJobs([ job ] + newChildren ) #Check point
                    cpusUsed = issueJobs(newChildren, jobIDsToJobsHash, batchSystem, queueingJobs, maxJobs, cpusUsed) #Issue the new children directly
                    
                elif len(job.find("followOns").findall("followOn")) != 0: #Has another job
                    logger.debug("Job: %s has a new command that we can now issue" % job.attrib["file"])
                    ##Reset the job run info
                    job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
                    cpusUsed = makeGreyAndReissueJob(job)
                    
                else: #Job has finished, so we can defer to any parent
                    logger.debug("Job: %s is now dead" % job.attrib["file"])
                    job.attrib["colour"] = "dead"
                    if job.attrib.has_key("parent"):
                        parent = readJob(job.attrib["parent"])
                        assert job.attrib["parent"] != jobFile
                        assert parent.attrib["colour"] == "blue"
                        assert int(parent.attrib["black_child_count"]) < int(parent.attrib["child_count"])
                        parent.attrib["black_child_count"] = str(int(parent.attrib["black_child_count"]) + 1)
                        if int(parent.attrib["child_count"]) == int(parent.attrib["black_child_count"]):
                            parent.attrib["colour"] = "black"
                            assert parent.attrib["file"] not in updatedJobFiles
                            updatedJobFiles.add(parent.attrib["file"])
                        writeJobs([ job, parent ]) #Check point
                    updatedJobFiles.remove(job.attrib["file"])
                    totalJobFiles -= 1
                    deleteJob(job, config)
                         
            elif job.attrib["colour"] == "red": #Job failed
                logger.critical("Job: %s failed" % job.attrib["file"])
                reportJobLogFiles(job)
                #Checks
                assert len(job.find("children").findall("child")) == 0
                assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
                
                remainingRetryCount = int(job.attrib["remaining_retry_count"])
                if remainingRetryCount > 0: #Give it another try, maybe there is a bad node somewhere
                    job.attrib["remaining_retry_count"] = str(remainingRetryCount-1)
                    logger.critical("Job: %s will be restarted, it has %s goes left" % (job.attrib["file"], job.attrib["remaining_retry_count"]))
                    cpusUsed = makeGreyAndReissueJob(job)
                else:
                    assert remainingRetryCount == 0
                    updatedJobFiles.remove(job.attrib["file"]) #We remove the job and neither delete it or reissue it
                    logger.critical("Job: %s is completely failed" % job.attrib["file"])
                    
            else: #This case should only occur after failure
                logger.debug("Job: %s is already dead, we'll get rid of it" % job.attrib["file"])
                assert job.attrib["colour"] == "dead"
                updatedJobFiles.remove(job.attrib["file"])
                totalJobFiles -= 1
                deleteJob(job, config)
                
        #This command is issued to ensure any queueing jobs are issued at the end of the loop
        cpusUsed = issueJobs([], jobIDsToJobsHash, batchSystem, queueingJobs, maxJobs, cpusUsed)
      
        if len(jobIDsToJobsHash) == 0 and len(updatedJobFiles) == 0:
            logger.info("Only failed jobs and their dependents (%i total) are remaining, so exiting." % totalJobFiles)
            assert cpusUsed == 0
            break
        
        if len(updatedJobFiles) > 0:
            updatedJobs = batchSystem.getUpdatedJobs() #Asks the batch system what jobs have been completed.
        else:
            updatedJobs = pauseForUpdatedJobs(batchSystem.getUpdatedJobs) #Asks the batch system what jobs have been completed.
        
        for jobID in updatedJobs.keys(): #Runs through a map of updated jobs and their status
            result = updatedJobs[jobID]
            if jobIDsToJobsHash.has_key(jobID): 
                if result == 0:
                    logger.debug("Batch system is reporting that the job %s ended successfully" % jobIDsToJobsHash[jobID][0])   
                else:
                    logger.critical("Batch system is reporting that the job %s failed with exit value %i" % (jobIDsToJobsHash[jobID][0], result))  
                cpusUsed = processFinishedJob(jobID, result, updatedJobFiles, jobIDsToJobsHash, cpusUsed)
            else:
                logger.info("A result seems to already have been processed: %i" % jobID) #T
        
        if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency: #We only rescue jobs every N seconds
            cpusUsed = reissueOverLongJobs(updatedJobFiles, jobIDsToJobsHash, config, batchSystem, cpusUsed)
            logger.info("Reissued any over long jobs")
            
            hasNoMissingJobs, cpusUsed = reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem, cpusUsed)
            if hasNoMissingJobs:
                timeSinceJobsLastRescued = time.time()
            else:
                timeSinceJobsLastRescued += 60 #This means we'll try again in 60 seconds
            logger.info("Rescued any (long) missing jobs")
        #Going to sleep to let the job system catch up.
        time.sleep(waitDuration)
        ##Check that the total number of cpus in use matches our accounting
        assert sum([ cpus for jobID, cpus in jobIDsToJobsHash.values() ]) == cpusUsed
        assert cpusUsed <= maxJobs
    
    if stats:
        fileHandle = open(config.attrib["stats"], 'a')
        fileHandle.write("<total_time time='%s' clock='%s'/></stats>" % (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
        fileHandle.close()
    
    logger.info("Finished the main loop")     
    
    return totalJobFiles #Returns number of failed jobs
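
The main loop drives each job through a small colour-coded state machine. A summary of the colour values, inferred from the branches above rather than taken from any official documentation:

#Colour semantics as used by mainLoop (inferred from the code above):
JOB_COLOURS = {
    "grey":  "ready to be issued (or reissued) to the batch system",
    "blue":  "waiting for running children to finish",
    "black": "finished okay; children or follow-ons may now be scheduled",
    "red":   "failed; retried while remaining_retry_count > 0",
    "dead":  "completely finished and awaiting deletion",
}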
Example #7
def main():
    sys.path.append(sys.argv[1])
    sys.argv.remove(sys.argv[1])
    
    #Now we can import all the stuff..
    from sonLib.bioio import getBasicOptionParser
    from sonLib.bioio import parseBasicOptions
    from sonLib.bioio import logger
    from sonLib.bioio import addLoggingFileHandler, redirectLoggerStreamHandlers
    from sonLib.bioio import setLogLevel
    from sonLib.bioio import getTotalCpuTime, getTotalCpuTimeAndMemoryUsage
    from sonLib.bioio import getTempDirectory
    from sonLib.bioio import makeSubDir
    from jobTree.src.job import Job
    from jobTree.src.master import getEnvironmentFileName, getConfigFileName, listChildDirs, getTempStatsFile, setupJobAfterFailure
    from sonLib.bioio import system
    
    ########################################## 
    #Input args
    ##########################################
    
    jobTreePath = sys.argv[1]
    jobFile = sys.argv[2]
    
    ##########################################
    #Load the environment for the job
    ##########################################
    
    #First load the environment for the job.
    fileHandle = open(getEnvironmentFileName(jobTreePath), 'r')
    environment = cPickle.load(fileHandle)
    fileHandle.close()
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)
    #os.environ = environment
    #os.putenv(key, value)
        
    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in.
    localSlaveTempDir = getTempDirectory()
    localTempDir = makeSubDir(os.path.join(localSlaveTempDir, "localTempDir"))
    
    ##########################################
    #Setup the logging
    ##########################################
    
    #Setup the logging
    tempSlaveLogFile = os.path.join(localSlaveTempDir, "slave_log.txt")
    slaveHandle = open(tempSlaveLogFile, 'w')
    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    logger.addHandler(logging.StreamHandler(slaveHandle))
    origStdErr = sys.stderr
    origStdOut = sys.stdout
    sys.stderr = slaveHandle 
    sys.stdout = slaveHandle
    
    ##########################################
    #Parse input files
    ##########################################
    
    config = ET.parse(getConfigFileName(jobTreePath)).getroot()
    setLogLevel(config.attrib["log_level"])
    job = Job.read(jobFile)
    job.messages = [] #This is the only way to stop messages logging twice, as they are read only in the master
    job.children = []
    if os.path.exists(job.getLogFileName()): #This cleans the old log file
        os.remove(job.getLogFileName())
    logger.info("Parsed arguments and set up logging")

     #Try loop for slave logging
    ##########################################
    #Setup the stats, if requested
    ##########################################
    
    if config.attrib.has_key("stats"):
        startTime = time.time()
        startClock = getTotalCpuTime()
        stats = ET.Element("slave")
    else:
        stats = None
    
    ##########################################
    #The max time 
    ##########################################
    
    maxTime = float(config.attrib["job_time"])
    assert maxTime > 0.0
    assert maxTime < sys.maxint

    ##########################################
    #Slave log file trapped from here on in
    ##########################################

    slaveFailed = False
    try:
        
        ##########################################
        #The next job
        ##########################################
        
        def globalTempDirName(job, depth):
            return job.getGlobalTempDirName() + str(depth)
        
        command, memoryAvailable, cpuAvailable, depth = job.followOnCommands[-1]
        defaultMemory = int(config.attrib["default_memory"])
        defaultCpu = int(config.attrib["default_cpu"])
        assert len(job.children) == 0
        
        startTime = time.time() 
        while True:
            job.followOnCommands.pop()
            
            ##########################################
            #Global temp dir
            ##########################################
            
            globalTempDir = makeSubDir(globalTempDirName(job, depth))
            i = 1
            while os.path.isdir(globalTempDirName(job, depth+i)):
                system("rm -rf %s" % globalTempDirName(job, depth+i))
                i += 1
                
            ##########################################
            #Old children, not yet deleted
            #
            #These may exist because of the lazy cleanup
            #we do
            ##########################################
        
            for childDir in listChildDirs(job.jobDir):
                logger.debug("Cleaning up old child %s" % childDir)
                system("rm -rf %s" % childDir)
        
            ##########################################
            #Run the job
            ##########################################
        
            if command != "": #Not a stub
                if command[:11] == "scriptTree ":
                    ##########################################
                    #Run the target
                    ##########################################
                    
                    loadStack(command).execute(job=job, stats=stats,
                                    localTempDir=localTempDir, globalTempDir=globalTempDir, 
                                    memoryAvailable=memoryAvailable, cpuAvailable=cpuAvailable, 
                                    defaultMemory=defaultMemory, defaultCpu=defaultCpu, depth=depth)
            
                else: #Is another command
                    system(command) 
            
            ##########################################
            #Cleanup/reset a successful job/checkpoint
            ##########################################
            
            job.remainingRetryCount = int(config.attrib["try_count"])
            system("rm -rf %s/*" % (localTempDir))
            job.update(depth=depth, tryCount=job.remainingRetryCount)
            
            ##########################################
            #Establish if we can run another job
            ##########################################
            
            if time.time() - startTime > maxTime:
                logger.info("We are breaking because the maximum time the job should run for has been exceeded")
                break
            
            #Deal with children
            if len(job.children) >= 1:  #We are going to have to return to the parent
                logger.info("No more jobs can run in series by this slave, its got %i children" % len(job.children))
                break
            
            if len(job.followOnCommands) == 0:
                logger.info("No more jobs can run by this slave as we have exhausted the follow ons")
                break
            
            #Get the next job and see if we have enough cpu and memory to run it..
            command, memory, cpu, depth = job.followOnCommands[-1]
            
            if memory > memoryAvailable:
                logger.info("We need more memory for the next job, so finishing")
                break
            if cpu > cpuAvailable:
                logger.info("We need more cpus for the next job, so finishing")
                break
            
            logger.info("Starting the next job")
        
        ##########################################
        #Finish up the stats
        ##########################################
        
        if stats != None:
            totalCpuTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            stats.attrib["time"] = str(time.time() - startTime)
            stats.attrib["clock"] = str(totalCpuTime - startClock)
            stats.attrib["memory"] = str(totalMemoryUsage)
            tempStatsFile = getTempStatsFile(jobTreePath)
            fileHandle = open(tempStatsFile + ".new", "w")
            ET.ElementTree(stats).write(fileHandle)
            fileHandle.close()
            os.rename(tempStatsFile + ".new", tempStatsFile) #This operation is atomic
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds" % (time.time() - startTime))
    
    ##########################################
    #Where slave goes wrong
    ##########################################
    except: #Case that something goes wrong in slave
        traceback.print_exc(file = slaveHandle)
        logger.critical("Exiting the slave because of a failed job on host %s", socket.gethostname())
        job = Job.read(jobFile)
        setupJobAfterFailure(job, config)
        job.write()
        slaveFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the slave logging
    slaveHandle.flush()
    sys.stderr = origStdErr
    sys.stdout = origStdOut
    redirectLoggerStreamHandlers(slaveHandle, sys.stderr)
    slaveHandle.close()
    
    #Copy back the log file to the global dir, if needed
    if slaveFailed:
        truncateFile(tempSlaveLogFile)
        system("mv %s %s" % (tempSlaveLogFile, job.getLogFileName()))
    #Remove the temp dir
    system("rm -rf %s" % localSlaveTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not slaveFailed) and len(job.followOnCommands) == 0 and len(job.children) == 0 and len(job.messages) == 0:
        ##########################################
        #Cleanup global files at the end of the chain
        ##########################################
        job.delete()            
Example #8
 def execute(self, job):
     setLogLevel(job.attrib["log_level"])
     logger.info("Setup logging with level: %s" % job.attrib["log_level"])
     self.tempDirAccessed = False
     self.localTempDir = job.attrib["local_temp_dir"]
     self.globalTempDir = job.attrib["global_temp_dir"]
     maxTime = float(job.attrib["job_time"])
     memory = int(job.attrib["available_memory"])
     cpu = int(job.attrib["available_cpu"])
     
     if job.attrib.has_key("stats"):
         stats = ET.SubElement(job, "stack")
         startTime = time.time()
         startClock = getTotalCpuTime()
     else:
         stats = None
     
     newChildren = [] #List to add all the children to before we package them
     #off into stacks
     newChildCommands = [] #Ditto for the child commands
     newFollowOns = [] #Ditto for the follow-ons 
     baseDir = os.getcwd()
     while self.hasRemaining():
         if stats is not None: #Record the start time and clock of this target for the stats
             targetStartTime = time.time()
             targetStartClock = getTotalCpuTime()
             
         target = self.popTarget()
         target.setStack(self)
         #Debug check that we have the right amount of CPU and memory for the job in hand
         targetMemory = target.getMemory()
         if targetMemory != sys.maxint:
             assert targetMemory <= memory
         targetCpu = target.getCpu()
         if targetCpu != sys.maxint:
             assert targetCpu <= cpu
         #Run the target, first cleanup then run.
         target.run()
         #Change back to the base directory, in case the target changed it (this is a safety measure)
         if os.getcwd() != baseDir:
             os.chdir(baseDir)
         #Cleanup after the target
         if self.tempDirAccessed:
             system("rm -rf %s/*" % self.localTempDir)
             self.tempDirAccessed = False
         #Handle the follow on
         followOn = target.getFollowOn()
         #if target.__class__ != CleanupGlobalTempDirTarget and followOn == None:
         #    followOn = CleanupGlobalTempDirTarget()
         if followOn is not None: #Target to get rid of follow on when done.
             if target.isGlobalTempDirSet():
                 followOn.setGlobalTempDir(target.getGlobalTempDir())
             newFollowOns.append(followOn)
         
         #Now add the children to the newChildren stack
         newChildren += target.getChildren()
         
         #Now add the child commands to the newChildCommands stack
         newChildCommands += target.getChildCommands()
         
         if stats is not None:
             ET.SubElement(stats, "target", { "time":str(time.time() - targetStartTime), 
                                             "clock":str(getTotalCpuTime() - targetStartClock),
                                             "class":".".join((target.__class__.__name__,)),
                                             "e_time":str(target.getRunTime())})
             
         for message in target.getMasterLoggingMessages():
             if job.find("messages") is None:
                 ET.SubElement(job, "messages")
             ET.SubElement(job.find("messages"), "message", { "message": message} )
     
     #######
     #Now build the new stacks and corresponding jobs
     #######
     
     #First add all the follow ons to the existing stack and make it a follow on job for job-tree
     assert not self.hasRemaining()
     
     #First sort out the follow on job
     if len(newFollowOns) > 0: #If we have follow ons
         followOnRuntime = sum([ followOn.getRunTime() for followOn in newFollowOns ])
         
         if followOnRuntime > maxTime: #We create a parallel list of follow ons
             followOnStack = Stack(ParallelFollowOnTarget(newFollowOns))
         else:
             followOnStack = Stack(newFollowOns.pop())
             while len(newFollowOns) > 0:
                 followOnStack.addTarget(newFollowOns.pop())
     
         job.attrib["command"] = followOnStack.makeRunnable(self.globalTempDir)
         job.attrib["time"] = str(followOnStack.getRunTime())
         followOnMemory = followOnStack.getMemory()
         assert not job.attrib.has_key("memory")
         if followOnMemory != sys.maxint:
             job.attrib["memory"] = str(followOnMemory)
         assert not job.attrib.has_key("cpu")
         followOnCpu = followOnStack.getCpu()
         if followOnCpu != sys.maxint:
             job.attrib["cpu"] = str(followOnCpu)
           
     #Now build stacks of children..
     childrenTag = job.find("children")
     while len(newChildren) > 0:
         childStack = Stack(newChildren.pop())
         while len(newChildren) > 0 and childStack.getRunTime() <= maxTime:
             childStack.addTarget(newChildren.pop())
         childJob = ET.SubElement(childrenTag, "child", { "command":childStack.makeRunnable(self.globalTempDir),
                                           "time":str(childStack.getRunTime()) })
         childMemory = childStack.getMemory()
         assert not childJob.attrib.has_key("memory")
         if childMemory != sys.maxint:
             childJob.attrib["memory"] = str(childMemory)
         assert not childJob.attrib.has_key("cpu")
         childCpu = childStack.getCpu()
         if childCpu != sys.maxint:
             childJob.attrib["cpu"] = str(childCpu)
     
     #Now build jobs for each child command
     for childCommand, runTime in newChildCommands:
         ET.SubElement(childrenTag, "child", { "command":str(childCommand),
                                           "time":str(runTime) })
     
     #Finish up the stats
     if stats is not None:
         stats.attrib["time"] = str(time.time() - startTime)
         stats.attrib["clock"] = str(getTotalCpuTime() - startClock)
Example #9
def main():
    sys.path.append(sys.argv[1])
    sys.argv.remove(sys.argv[1])
    
    #Now we can import all the stuff..
    from sonLib.bioio import getBasicOptionParser
    from sonLib.bioio import parseBasicOptions
    from sonLib.bioio import logger
    from sonLib.bioio import addLoggingFileHandler, redirectLoggerStreamHandlers
    from sonLib.bioio import setLogLevel
    from sonLib.bioio import getTotalCpuTime, getTotalCpuTimeAndMemoryUsage
    from sonLib.bioio import getTempDirectory
    from sonLib.bioio import makeSubDir
    from jobTree.src.job import Job
    from jobTree.src.master import getEnvironmentFileName, getConfigFileName, listChildDirs, getTempStatsFile, setupJobAfterFailure
    from sonLib.bioio import system
    
    ########################################## 
    #Input args
    ##########################################
    
    jobTreePath = sys.argv[1]
    jobFile = sys.argv[2]
    
    ##########################################
    #Load the environment for the job
    ##########################################
    
    #First load the environment for the job.
    fileHandle = open(getEnvironmentFileName(jobTreePath), 'r')
    environment = cPickle.load(fileHandle)
    fileHandle.close()
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)
    #os.environ = environment
    #os.putenv(key, value)
        
    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in.
    localSlaveTempDir = getTempDirectory()
    localTempDir = makeSubDir(os.path.join(localSlaveTempDir, "localTempDir"))
    
    ##########################################
    #Setup the logging
    ##########################################
    
    #Setup the logging. This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?    
    tempSlaveLogFile = os.path.join(localSlaveTempDir, "slave_log.txt")
    
    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)
    
    #Open the file to send stdout/stderr to.
    logDescriptor = os.open(tempSlaveLogFile, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logDescriptor, 1)
    
    #Replace standard error with a descriptor for the log file
    os.dup2(logDescriptor, 2)
    
    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...
    
    #Close the descriptor we used to open the file
    os.close(logDescriptor)
    
    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    
    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    #Put a message at the top of the log, just to make sure it's working.
    print "---JOBTREE SLAVE OUTPUT LOG---"
    sys.stdout.flush()
    
    #Log the number of open file descriptors so we can tell if we're leaking
    #them.
    logger.debug("Next available file descriptor: {}".format(
        nextOpenDescriptor()))
    
    ##########################################
    #Parse input files
    ##########################################
    
    config = ET.parse(getConfigFileName(jobTreePath)).getroot()
    setLogLevel(config.attrib["log_level"])
    job = Job.read(jobFile)
    job.messages = [] #This is the only way to stop messages logging twice, as they are read only in the master
    job.children = [] #Similarly, this is where old children are flushed out.
    job.write() #Update status, to avoid reissuing children after running a follow on below.
    if os.path.exists(job.getLogFileName()): #This cleans the old log file
        os.remove(job.getLogFileName())
    logger.info("Parsed arguments and set up logging")

     #Try loop for slave logging
    ##########################################
    #Setup the stats, if requested
    ##########################################
    
    if config.attrib.has_key("stats"):
        startTime = time.time()
        startClock = getTotalCpuTime()
        stats = ET.Element("slave")
    else:
        stats = None
    
    ##########################################
    #The max time 
    ##########################################
    
    maxTime = float(config.attrib["job_time"])
    assert maxTime > 0.0
    assert maxTime < sys.maxint

    ##########################################
    #Slave log file trapped from here on in
    ##########################################

    slaveFailed = False
    try:
        
        ##########################################
        #The next job
        ##########################################
        
        def globalTempDirName(job, depth):
            return job.getGlobalTempDirName() + str(depth)
        
        command, memoryAvailable, cpuAvailable, depth = job.followOnCommands[-1]
        defaultMemory = int(config.attrib["default_memory"])
        defaultCpu = int(config.attrib["default_cpu"])
        assert len(job.children) == 0
        
        startTime = time.time() 
        while True:
            job.followOnCommands.pop()
            
            ##########################################
            #Global temp dir
            ##########################################
            
            globalTempDir = makeSubDir(globalTempDirName(job, depth))
            i = 1
            while os.path.isdir(globalTempDirName(job, depth+i)):
                system("rm -rf %s" % globalTempDirName(job, depth+i))
                i += 1
                
            ##########################################
            #Old children, not yet deleted
            #
            #These may exist because of the lazy cleanup
            #we do
            ##########################################
        
            for childDir in listChildDirs(job.jobDir):
                logger.debug("Cleaning up old child %s" % childDir)
                system("rm -rf %s" % childDir)
        
            ##########################################
            #Run the job
            ##########################################
        
            if command != "": #Not a stub
                if command[:11] == "scriptTree ":
                    ##########################################
                    #Run the target
                    ##########################################
                    
                    loadStack(command).execute(job=job, stats=stats,
                                    localTempDir=localTempDir, globalTempDir=globalTempDir, 
                                    memoryAvailable=memoryAvailable, cpuAvailable=cpuAvailable, 
                                    defaultMemory=defaultMemory, defaultCpu=defaultCpu, depth=depth)
            
                else: #Is another command
                    system(command) 
            
            ##########################################
            #Cleanup/reset a successful job/checkpoint
            ##########################################
            
            job.remainingRetryCount = int(config.attrib["try_count"])
            system("rm -rf %s/*" % (localTempDir))
            job.update(depth=depth, tryCount=job.remainingRetryCount)
            
            ##########################################
            #Establish if we can run another job
            ##########################################
            
            if time.time() - startTime > maxTime:
                logger.info("We are breaking because the maximum time the job should run for has been exceeded")
                break
            
            #Deal with children
            if len(job.children) >= 1:  #We are going to have to return to the parent
                logger.info("No more jobs can run in series by this slave, its got %i children" % len(job.children))
                break
            
            if len(job.followOnCommands) == 0:
                logger.info("No more jobs can run by this slave as we have exhausted the follow ons")
                break
            
            #Get the next job and see if we have enough cpu and memory to run it..
            command, memory, cpu, depth = job.followOnCommands[-1]
            
            if memory > memoryAvailable:
                logger.info("We need more memory for the next job, so finishing")
                break
            if cpu > cpuAvailable:
                logger.info("We need more cpus for the next job, so finishing")
                break
            
            logger.info("Starting the next job")
        
        ##########################################
        #Finish up the stats
        ##########################################
        
        if stats != None:
            totalCpuTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            stats.attrib["time"] = str(time.time() - startTime)
            stats.attrib["clock"] = str(totalCpuTime - startClock)
            stats.attrib["memory"] = str(totalMemoryUsage)
            tempStatsFile = getTempStatsFile(jobTreePath)
            fileHandle = open(tempStatsFile + ".new", "w")
            ET.ElementTree(stats).write(fileHandle)
            fileHandle.close()
            os.rename(tempStatsFile + ".new", tempStatsFile) #This operation is atomic
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds" % (time.time() - startTime))
    
    ##########################################
    #Where slave goes wrong
    ##########################################
    except: #Case that something goes wrong in slave
        traceback.print_exc()
        logger.critical("Exiting the slave because of a failed job on host %s", socket.gethostname())
        job = Job.read(jobFile)
        setupJobAfterFailure(job, config)
        job.write()
        slaveFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the slave logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)
    
    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)
    
    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdErr, 2)
    
    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr
    
    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)
    
    #Now our file handles are in exactly the state they were in before.
    
    #Copy back the log file to the global dir, if needed
    if slaveFailed:
        truncateFile(tempSlaveLogFile)
        system("mv %s %s" % (tempSlaveLogFile, job.getLogFileName()))
    #Remove the temp dir
    system("rm -rf %s" % localSlaveTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not slaveFailed) and len(job.followOnCommands) == 0 and len(job.children) == 0 and len(job.messages) == 0:
        ##########################################
        #Cleanup global files at the end of the chain
        ##########################################
        job.delete()            
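
The end of that example shows the handoff convention between the slave and the stats aggregator: the per-job stats document is first written to a name ending in ".new" and then renamed into place with os.rename, so the aggregator, which skips any file whose name ends in "new", never reads a partially written file. Below is a minimal, self-contained sketch of the same write-then-rename pattern; the helper name writeStatsAtomically and the /tmp path are made up for illustration and are not part of jobTree.

import os
import xml.etree.ElementTree as ET

def writeStatsAtomically(tempStatsFile, stats):
    #Write the complete document to a sibling name ending in "new";
    #readers that skip "*new" files never observe a half-written file.
    with open(tempStatsFile + ".new", "wb") as fileHandle:
        ET.ElementTree(stats).write(fileHandle)
    #A rename within one filesystem is atomic, so the final name only
    #ever refers to a fully written file.
    os.rename(tempStatsFile + ".new", tempStatsFile)

#Hypothetical usage: record one timing attribute and publish it.
stats = ET.Element("slave")
stats.attrib["time"] = "1.23"
#writeStatsAtomically("/tmp/jobTreeStatsDir/tmp_stats_1", stats)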
Example #10
0
    def execute(self, job, stats, localTempDir, globalTempDir, memoryAvailable,
                cpuAvailable, defaultMemory, defaultCpu, depth):
        self.tempDirAccessed = False
        self.localTempDir = localTempDir
        self.globalTempDir = globalTempDir

        if stats != None:
            startTime = time.time()
            startClock = getTotalCpuTime()

        baseDir = os.getcwd()

        self.target.setStack(self)
        #Debug check that we have the right amount of CPU and memory for the job in hand
        targetMemory = self.target.getMemory()
        if targetMemory != sys.maxint:
            assert targetMemory <= memoryAvailable
        targetCpu = self.target.getCpu()
        if targetCpu != sys.maxint:
            assert targetCpu <= cpuAvailable
        #Run the target, first cleanup then run.
        self.target.run()
        #Change dir back to cwd dir, if changed by target (this is a safety issue)
        if os.getcwd() != baseDir:
            os.chdir(baseDir)
        #Cleanup after the target
        if self.tempDirAccessed:
            system("rm -rf %s/*" % self.localTempDir)
            self.tempDirAccessed = False
        #Handle the follow on
        followOn = self.target.getFollowOn()
        if followOn is not None:  #Target to get rid of follow on when done.
            if self.target.isGlobalTempDirSet():
                followOn.setGlobalTempDir(self.target.getGlobalTempDir())
            followOnStack = Stack(followOn)
            job.followOnCommands.append(
                (followOnStack.makeRunnable(self.globalTempDir),
                 followOnStack.getMemory(defaultMemory),
                 followOnStack.getCpu(defaultCpu), depth))

        #Now add the children to the newChildren stack
        newChildren = self.target.getChildren()
        newChildren.reverse()
        assert len(job.children) == 0
        while len(newChildren) > 0:
            childStack = Stack(newChildren.pop())
            job.children.append((childStack.makeRunnable(self.globalTempDir),
                                 childStack.getMemory(defaultMemory),
                                 childStack.getCpu(defaultCpu)))

        #Now build jobs for each child command
        for childCommand, runTime in self.target.getChildCommands():
            job.children.append((childCommand, defaultMemory, defaultCpu))

        for message in self.target.getMasterLoggingMessages():
            job.messages.append(message)

        #Finish up the stats
        if stats != None:
            stats = ET.SubElement(stats, "target")
            stats.attrib["time"] = str(time.time() - startTime)
            totalCpuTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            stats.attrib["clock"] = str(totalCpuTime - startClock)
            stats.attrib["class"] = ".".join(
                (self.target.__class__.__name__, ))
            stats.attrib["memory"] = str(totalMemoryUsage)
Example #11
0
 def execute(self, job, stats, localTempDir, globalTempDir, 
             memoryAvailable, cpuAvailable,
             defaultMemory, defaultCpu, depth):
     self.tempDirAccessed = False
     self.localTempDir = localTempDir
     self.globalTempDir = globalTempDir
     
     if stats != None:
         startTime = time.time()
         startClock = getTotalCpuTime()
     
     baseDir = os.getcwd()
     
     self.target.setStack(self)
     #Debug check that we have the right amount of CPU and memory for the job in hand
     targetMemory = self.target.getMemory()
     if targetMemory != sys.maxint:
         assert targetMemory <= memoryAvailable
     targetCpu = self.target.getCpu()
     if targetCpu != sys.maxint:
         assert targetCpu <= cpuAvailable
     #Run the target, first cleanup then run.
     self.target.run()
     #Change dir back to cwd dir, if changed by target (this is a safety issue)
     if os.getcwd() != baseDir:
         os.chdir(baseDir)
     #Cleanup after the target
     if self.tempDirAccessed:
         system("rm -rf %s/*" % self.localTempDir)
         self.tempDirAccessed = False
     #Handle the follow on
     followOn = self.target.getFollowOn()
     if followOn is not None: #Target to get rid of follow on when done.
         if self.target.isGlobalTempDirSet():
             followOn.setGlobalTempDir(self.target.getGlobalTempDir())
         followOnStack = Stack(followOn)
         job.followOnCommands.append((followOnStack.makeRunnable(self.globalTempDir),
                                      followOnStack.getMemory(defaultMemory),
                                      followOnStack.getCpu(defaultCpu),
                                      depth))
     
     #Now add the children to the newChildren stack
     newChildren = self.target.getChildren()
     newChildren.reverse()
     assert len(job.children) == 0
     while len(newChildren) > 0:
         childStack = Stack(newChildren.pop())
         job.children.append((childStack.makeRunnable(self.globalTempDir),
                  childStack.getMemory(defaultMemory),
                  childStack.getCpu(defaultCpu)))
     
     #Now build jobs for each child command
     for childCommand, runTime in self.target.getChildCommands():
         job.children.append((childCommand, defaultMemory, defaultCpu))
         
     for message in self.target.getMasterLoggingMessages():
         job.messages.append(message)
     
     #Finish up the stats
     if stats != None:
         stats = ET.SubElement(stats, "target")
         stats.attrib["time"] = str(time.time() - startTime)
         totalCpuTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
         stats.attrib["clock"] = str(totalCpuTime - startClock)
         stats.attrib["class"] = ".".join((self.target.__class__.__name__,))
         stats.attrib["memory"] = str(totalMemoryUsage)