Esempio n. 1
0
def parasolRestart():
    """Function starts the parasol hub and node.
    """
    parasolStop()
    while True:
        machineList = os.path.join(workflowRootPath(), "workflow", "jobTree", "machineList")
        #pathEnvVar = os.environ["PATH"]
        os.system("paraNode start -hub=localhost") 
        #-umask=002 -userPath=%s -sysPath=%s" % (pathEnvVar, pathEnvVar))
        os.system("paraHub %s subnet=127.0.0 &" % (machineList,))
        tempFile = getTempFile()
        dead = True
        try:
            popen("parasol status", tempFile)
            fileHandle = open(tempFile, 'r')
            line = fileHandle.readline()
            while line != '':
                if "Nodes dead" in line:
                    print line
                    if int(line.split()[-1]) == 0:
                        dead = False
                line = fileHandle.readline()
            fileHandle.close()
        except RuntimeError:
            pass
        os.remove(tempFile)
        if not dead:
            break
        else:
            logger.info("Tried to restart the parasol process, but failed, will try again")
            parasolStop()
            time.sleep(5)
    logger.info("Restarted the parasol process")
Esempio n. 2
0
 def issueJobs(self, jobCommands):
     """Issues parasol with job commands.
     """
     issuedJobs = {}
     for jobCommand, memory, cpu, logFile in jobCommands:
         assert memory != None
         assert cpu != None
         assert logFile != None
         pattern = re.compile("your job ([0-9]+).*")
         command = "parasol -verbose -ram=%i -cpu=%i -results=%s add job '%s'" % (memory, cpu, self.parasolResultsFile, jobCommand)
         while True:
             #time.sleep(0.1) #Sleep to let parasol catch up #Apparently unnecessary
             popenParasolCommand(command, self.scratchFile)
             fileHandle = open(self.scratchFile, 'r')
             line = fileHandle.readline()
             fileHandle.close()
             match = pattern.match(line)
             if match != None: #This is because parasol add job will return success, even if the job was not properly issued!
                 break
             else:
                 logger.info("We failed to properly add the job, we will try again after a sleep")
                 time.sleep(5)
         jobID = int(match.group(1))
         logger.debug("Got the job id: %s from line: %s" % (jobID, line))
         assert jobID not in issuedJobs.keys()
         issuedJobs[jobID] = jobCommand
         logger.debug("Issued the job command: %s with job id: %i " % (command, jobID))
     return issuedJobs
Esempio n. 3
0
def parseJobFile(absFileName):
    try:
        job = ET.parse(absFileName).getroot()
        return job
    except IOError:
        logger.info("Encountered error while parsing job file %s, so we will ignore it" % absFileName)
    return None
Esempio n. 4
0
 def run(self):
     ##########################################
     #Setup a file tree.
     ##########################################
         
     tempFileTree = TempFileTree(os.path.join(self.getGlobalTempDir(), getRandomAlphaNumericString()))   
     
     fileTreeRootFile = tempFileTree.getTempFile()
 
     makeFileTree(fileTreeRootFile, \
                  self.depth, tempFileTree)
     
     treePointer = tempFileTree.getTempFile()
     
     makeTreePointer(fileTreeRootFile, treePointer)
     
     logger.info("We've set up the file tree")
     
     ##########################################
     #Issue the child and follow on jobs
     ##########################################
     
     self.addChildTarget(ChildTarget(treePointer))
     
     self.setFollowOnTarget(DestructFileTree(tempFileTree))
     
     logger.info("We've added the child target and finished SetupFileTree.run()")
Esempio n. 5
0
def setupTempFileTrees(config):
    """Load the temp file trees
    """
    config.attrib["job_file_dir"] = TempFileTree(config.attrib["job_file_dir"])
    config.attrib["temp_dir_dir"] = TempFileTree(config.attrib["temp_dir_dir"])
    config.attrib["log_file_dir"] = TempFileTree(config.attrib["log_file_dir"])
    config.attrib["slave_log_file_dir"] = TempFileTree(config.attrib["slave_log_file_dir"])
    logger.info("Setup the temp file trees")
Esempio n. 6
0
 def run (self):
     parasolRestart()
     while True:
         time.sleep(random.choice(xrange(240)))
         if self.kill == True:
             return
         logger.info("Going to kill a parasol/master process")
         killMasterAndParasol()
Esempio n. 7
0
def loadEnvironment(config):
    """Puts the environment in the pickle file.
    """
    #Dump out the environment of this process in the environment pickle file.
    fileHandle = open(config.attrib["environment_file"], 'w')
    cPickle.dump(os.environ, fileHandle)
    fileHandle.close()
    logger.info("Written the environment for the jobs to the environment file")
Esempio n. 8
0
def createFirstJob(command, config, memory=None, cpu=None, time=sys.maxint):
    """Adds the first job to to the jobtree.
    """
    logger.info("Adding the first job")
    if memory == None:
        memory = config.attrib["default_memory"]
    if cpu == None:
        cpu = config.attrib["default_cpu"]
    job = createJob({ "command":command, "memory":str(int(memory)), "cpu":str(int(cpu)), "time":str(float(time)) }, None, config)
    writeJobs([job])
    logger.info("Added the first job")
Esempio n. 9
0
 def killJobs(self, jobIDs):
     """Kills the given jobs, represented as Job ids, then checks they are dead by checking
     they are not in the list of issued jobs.
     """
     while True:
         for jobID in jobIDs:
             i = popenParasolCommand("parasol remove job %i" % jobID, tmpFileForStdOut=self.scratchFile, runUntilSuccessful=None)
             logger.info("Tried to remove jobID: %i, with exit value: %i" % (jobID, i))
         runningJobs = self.getIssuedJobIDs()
         if set(jobIDs).difference(set(runningJobs)) == set(jobIDs):
             return
         time.sleep(5)
         logger.critical("Tried to kill some jobs, but something happened and they are still going, so I'll try again")
Esempio n. 10
0
def checkFileTreeCounts(rootFile):
    """Check the file tree produced by the test.
    """
    tree = ET.parse(rootFile).getroot()
    i = 0
    children = tree.find("children").findall("child")
    if len(children) == 0:
        i = 1
    else:
        for child in children:
            i += checkFileTreeCounts(child.attrib["file"])
    logger.info("File tree counts: %i %i" % (i, int(tree.attrib["count"])))
    assert i == int(tree.attrib["count"])
    return i
Esempio n. 11
0
def runJobTree(command, jobTreeDir, logLevel="DEBUG", retryCount=0, batchSystem="single_machine", 
               rescueJobFrequency=None):
    """A convenience function for running job tree from within a python script.
    """
    if rescueJobFrequency != None:
        rescueJobFrequencyString = "--rescueJobsFrequency %s" % float(rescueJobFrequency)
    else:
        rescueJobFrequencyString = ""
    command = "jobTree --command \"%s\" --jobTree %s --logLevel %s \
--retryCount %i --batchSystem %s %s" % \
            (command, jobTreeDir,  logLevel, retryCount, batchSystem, rescueJobFrequencyString)
    logger.info("Running command : %s" % command)
    system(command)
    logger.info("Ran the jobtree apparently okay")
Esempio n. 12
0
def restartFailedJobs(config, jobFiles):
    """Traverses through the file tree and resets the restart count of all jobs.
    """
    for absFileName in jobFiles:
        if os.path.isfile(absFileName):
            job = ET.parse(absFileName).getroot()
            logger.info("Restarting job: %s" % job.attrib["file"])
            job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
            if job.attrib["colour"] == "red":
                job.attrib["colour"] = "white"
            #Is leaf and job failed when the system went downbut the status did not get updated.
            if job.attrib["colour"] == "grey": 
                job.attrib["colour"] = "white"
            writeJobs([ job ])
Esempio n. 13
0
def reloadJobTree(jobTree):
    """Load the job tree from a dir.
    """
    logger.info("The job tree appears to already exist, so we'll reload it")
    assert os.path.isfile(os.path.join(jobTree, "config.xml")) #A valid job tree must contain the config file
    assert os.path.isfile(os.path.join(jobTree, "environ.pickle")) #A valid job tree must contain a pickle file which encodes the path environment of the job
    assert os.path.isfile(os.path.join(jobTree, "jobNumber.xml")) #A valid job tree must contain a file which is updated with the number of jobs that have been run.
    assert os.path.isdir(os.path.join(jobTree, "jobs")) #A job tree must have a directory of jobs.
    assert os.path.isdir(os.path.join(jobTree, "tempDirDir")) #A job tree must have a directory of temporary directories (for jobs to make temp files in).
    assert os.path.isdir(os.path.join(jobTree, "logFileDir")) #A job tree must have a directory of log files.
    assert os.path.isdir(os.path.join(jobTree, "slaveLogFileDir")) #A job tree must have a directory of slave log files.
    config = ET.parse(os.path.join(jobTree, "config.xml")).getroot()
    setupTempFileTrees(config)
    batchSystem = loadTheBatchSystem(config)
    logger.info("Reloaded the jobtree")
    return config, batchSystem
Esempio n. 14
0
def killMasterAndParasol():
    """Method to destroy master process
    """
    tempFile = getTempFile()
    popen("ps -a", tempFile)
    fileHandle = open(tempFile, 'r')
    line = fileHandle.readline()
    #Example parasol state lines:
    #67401 ttys002    0:00.06 /Users/benedictpaten/kent/src/parasol/bin/paraNode start -hub=localhost -log=/tmp/node.2009-07-08.log -umask=002 -userPath=bin:bin/x86_64:bin/i
    #67403 ttys002    0:00.65 /Users/benedictpaten/kent/src/parasol/bin/paraHub -log=/tmp/hub.2009-07-08.log machineList subnet=127.0.0
    #68573 ttys002    0:00.00 /Users/benedictpaten/kent/src/parasol/bin/paraNode start -hub=localhost -log=/tmp/node.2009-07-08.log -umask=002 -userPath=bin:bin/x86_64:bin/i
    while line != '':
        tokens = line.split()
        if 'paraNode' in line or 'paraHub' in line:
            if random.random() > 0.5:
                i = os.system("kill %i" % int(tokens[0]))
                logger.info("Tried to kill parasol process: %i, line: %s, exit value: %i" % (int(tokens[0]), line, i))
                break
        elif 'jobTreeMaster.py' in line:
            logger.info("Have job tree master line")
            if random.random() > 0.5:
                i = os.system("kill %i" % int(tokens[0]))
                logger.info("Tried to kill master process: %i, line: %s, exit value: %i" % (int(tokens[0]), line, i))
                break
        line = fileHandle.readline()
    fileHandle.close()
    os.remove(tempFile)
    parasolRestart()
Esempio n. 15
0
 def testJobTree_Parasol(self):
     """Runs a test program using the job tree, whilst constantly restarting parasol
     by killing the nodes.
     """
     for test in xrange(self.testNo): #Does not run this test when doing short testing
         jobTreeCommand, fileTreeRootFile = setupJobTree(self.tempFileTree, self.jobTreeDir, 
                                                         "parasol", depth=self.depth)
         jobTreeCommand += " --rescueJobsFrequency 20"
         #Run the job
         parasolAndMasterKiller = ParasolAndMasterKiller()
         parasolAndMasterKiller.start()
         while True:
             while True:
                 process = subprocess.Popen(jobTreeCommand, shell=True)
                 sts = os.waitpid(process.pid, 0)
                 if sts[1] == 0:
                     logger.info("The job tree master ended, with an okay exit value (using parasol)")
                     break
                 else:
                     logger.info("The job tree master ended with an error exit value, restarting: %i" % sts[1])
             if checkEndStateOfJobTree(self.jobTreeDir): #Check the state of the job files
                 break
             
             jobTreeCommand = "jobTree --jobTree %s --logDebug" % self.jobTreeDir
         checkFileTreeCounts(fileTreeRootFile)
         os.system("rm -rf %s" % self.jobTreeDir)
         parasolAndMasterKiller.stopKilling()
         logger.info("Test done okay")
Esempio n. 16
0
def setupJobTree(tempFileTree, jobTreeDir, batchSystem, depth=2):
    """Sets up a job tree using the jobTreeSetup.py command.
    """
    #Setup a job
    retryCount = random.choice(xrange(1,10))
    
    logger.info("Setup the basic files for the test")
    
    fileTreeRootFile = tempFileTree.getTempFile()
    makeFileTree(fileTreeRootFile, depth, tempFileTree)
    treePointerFile = makeTreePointer(fileTreeRootFile, tempFileTree.getTempFile())
    
    #Setup the job
    command = "jobTreeTest_CommandFirst.py --treePointer %s --job JOB_FILE" % \
    (treePointerFile)
    
    jobTreeCommand = "jobTree --jobTree %s --retryCount %i\
     --command '%s' --logLevel=INFO --maxJobDuration 100 --batchSystem %s" % \
    (jobTreeDir, retryCount, command, batchSystem)
        
    logger.info("Setup the job okay")
    return (jobTreeCommand, fileTreeRootFile)
Esempio n. 17
0
def testJobTree(testNo, depth, tempFileTree, jobTreeDir, batchSystem):
    """Runs a test program using the job tree using the single machine batch system.
    """
    for test in xrange(testNo):
        jobTreeCommand, fileTreeRootFile = setupJobTree(tempFileTree, jobTreeDir, 
                                                        batchSystem, depth=depth)
        #Run the job
        while True:
            print "job tree command", jobTreeCommand
            
            process = subprocess.Popen(jobTreeCommand, shell=True)
            sts = os.waitpid(process.pid, 0)
            assert sts[1] == 0
            logger.info("The job tree master ended, with an okay exit value")
        
            if checkEndStateOfJobTree(jobTreeDir): #Check the state of the job files, exit if none
                break
            
            jobTreeCommand = "jobTree --jobTree %s --logInfo" % jobTreeDir
            
        checkFileTreeCounts(fileTreeRootFile)
        os.system("rm -rf %s" % jobTreeDir)
        logger.info("Test done okay")
Esempio n. 18
0
def createJobTree(options):
    logger.info("Starting to create the job tree setup for the first time")
    options.jobTree = os.path.abspath(options.jobTree)
    os.mkdir(options.jobTree)
    config = ET.Element("config")
    config.attrib["environment_file"] = os.path.join(options.jobTree, "environ.pickle")
    config.attrib["job_number_file"] = os.path.join(options.jobTree, "jobNumber.xml")
    config.attrib["job_file_dir"] = os.path.join(options.jobTree, "jobs")
    config.attrib["temp_dir_dir"] = os.path.join(options.jobTree, "tempDirDir")
    config.attrib["log_file_dir"] = os.path.join(options.jobTree, "logFileDir")
    config.attrib["slave_log_file_dir"] = os.path.join(options.jobTree, "slaveLogFileDir")
    config.attrib["results_file"] = os.path.join(options.jobTree, "results.txt")
    config.attrib["scratch_file"] = os.path.join(options.jobTree, "scratch.txt")
    config.attrib["retry_count"] = str(int(options.retryCount))
    config.attrib["max_job_duration"] = str(float(options.maxJobDuration))
    config.attrib["batch_system"] = options.batchSystem
    config.attrib["job_time"] = str(float(options.jobTime))
    config.attrib["max_log_file_size"] = str(int(options.maxLogFileSize))
    config.attrib["default_memory"] = str(int(options.defaultMemory))
    config.attrib["default_cpu"] = str(int(options.defaultCpu))
    config.attrib["max_jobs"] = str(int(options.maxJobs))
    config.attrib["max_threads"] = str(int(options.maxThreads))
    if options.stats:
        config.attrib["stats"] = os.path.join(options.jobTree, "stats.xml")
        fileHandle = open(config.attrib["stats"], 'w')
        fileHandle.write("<stats>")
        fileHandle.close()
    #Load the batch system.
    batchSystem = loadTheBatchSystem(config)
    
    #Set the two parameters determining the polling frequency of the system.
    config.attrib["wait_duration"] = str(float(batchSystem.getWaitDuration()))
    if options.waitDuration != None:
        config.attrib["wait_duration"] = str(float(options.waitDuration))
        
    config.attrib["rescue_jobs_frequency"] = str(float(batchSystem.getRescueJobFrequency()))
    if options.rescueJobsFrequency != None:
        config.attrib["rescue_jobs_frequency"] = str(float(options.rescueJobsFrequency))
    
    #Write the config file to disk
    fileHandle = open(os.path.join(options.jobTree, "config.xml"), 'w')
    
    tree = ET.ElementTree(config)
    tree.write(fileHandle)
    fileHandle.close()
    logger.info("Written the config file")
    
    #Set up the jobNumber file
    fileHandle = open(config.attrib["job_number_file"], 'w')
    ET.ElementTree(ET.Element("job_number", { "job_number":'0' })).write(fileHandle)
    fileHandle.close()
    
    #Setup the temp file trees.
    setupTempFileTrees(config)
    
    logger.info("Finished the job tree setup")
    return config, batchSystem
Esempio n. 19
0
def main():
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")
    
    parser.add_option("--jobTree", dest="jobTree", 
                      help="Directory containing the job tree to kill")
    
    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")
    assert len(args) == 0 #This program takes no arguments
    assert options.jobTree != None #The jobtree should not be null
    assert os.path.isdir(options.jobTree) #The job tree must exist if we are going to kill it.
    logger.info("Starting routine to kill running jobs in the jobTree: %s" % options.jobTree)
    config = ET.parse(os.path.join(options.jobTree, "config.xml")).getroot()
    batchSystem = loadTheBatchSystem(config) #This should automatically kill the existing jobs.. so we're good.
    for job in batchSystem.getIssuedJobIDs(): #Just in case we do it again.
        batchSystem.killJobs(job)
    logger.info("All jobs SHOULD have been killed")
Esempio n. 20
0
def loadTheBatchSystem(config):
    """Load the batch system.
    """
    batchSystemString = config.attrib["batch_system"]
    if batchSystemString == "parasol":
        batchSystem = ParasolBatchSystem(config)
        logger.info("Using the parasol batch system")
    elif batchSystemString == "single_machine" or batchSystemString == "singleMachine":
        batchSystem = SingleMachineBatchSystem(config)
        logger.info("Using the single machine batch system")
    elif batchSystemString == "gridengine" or batchSystemString == "gridEngine":
        batchSystem = GridengineBatchSystem(config)
        logger.info("Using the grid engine machine batch system")
    elif batchSystemString == "acid_test" or batchSystemString == "acidTest":
        batchSystem = SingleMachineBatchSystem(config, workerClass=BadWorker)
        config.attrib["retry_count"] = str(32) #The chance that a job does not complete after 32 goes in one in 4 billion, so you need a lot of jobs before this becomes probable
    else:
        raise RuntimeError("Unrecognised batch system: %s" % batchSystemString)
    return batchSystem
Esempio n. 21
0
 def __init__(self, config):
     AbstractBatchSystem.__init__(self, config) #Call the parent constructor
     #Keep the name of the results file for the pstat2 command..
     self.parasolResultsFile = config.attrib["results_file"]
     #Reset the job queue and results (initially, we do this again once we've killed the jobs)
     self.parasolResultsFileHandle = open(self.parasolResultsFile, 'w')
     self.parasolResultsFileHandle.close() #We lose any previous state in this file, and ensure the files existence    
     self.queuePattern = re.compile("q\s+([0-9]+)")
     self.runningPattern = re.compile("r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+")
     #The scratch file
     self.scratchFile = self.config.attrib["scratch_file"]
     self.killJobs(self.getIssuedJobIDs()) #Kill any jobs on the current stack
     logger.info("Going to sleep for a few seconds to kill any existing jobs")
     time.sleep(5) #Give batch system a second to sort itself out.
     logger.info("Removed any old jobs from the queue")
     #Reset the job queue and results
     self.parasolResultsFileHandle = open(self.parasolResultsFile, 'w')
     self.parasolResultsFileHandle.close() #We lose any previous state in this file, and ensure the files existence
     self.parasolResultsFileHandle = open(self.parasolResultsFile, 'r')
     logger.info("Reset the results queue")
Esempio n. 22
0
def processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats):
    from workflow.jobTree.lib.bioio import getTempFile
    from workflow.jobTree.lib.bioio import getTempDirectory
    from workflow.jobTree.lib.bioio import logger
    from workflow.jobTree.lib.bioio import system
    from workflow.jobTree.lib.bioio import getTotalCpuTime
    
    assert len(job.find("children").findall("child")) == 0
    assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
    command = jobToRun.attrib["command"]
    #Copy the job file to be edited
    
    tempJob = ET.Element("job")
    ET.SubElement(tempJob, "children")
    
    #Log for job
    tempJob.attrib["log_level"] = job.attrib["log_level"]
    
    #Time length of 'ideal' job before further parallelism is required
    tempJob.attrib["job_time"] = job.attrib["job_time"]
    
    #Dir to put all the temp files in.
    localSlaveTempDir = getTempDirectory()

    #Temp file dirs for job.
    localTempDir = getTempDirectory(rootDir=localSlaveTempDir)
    tempJob.attrib["local_temp_dir"] = localTempDir
    depth = len(job.find("followOns").findall("followOn"))
    tempJob.attrib["global_temp_dir"] = os.path.join(job.attrib["global_temp_dir"], str(depth))
    if not os.path.isdir(tempJob.attrib["global_temp_dir"]): #Ensures that the global temp dirs of each level are kept separate.
        os.mkdir(tempJob.attrib["global_temp_dir"])
        os.chmod(tempJob.attrib["global_temp_dir"], 0777)
    if os.path.isdir(os.path.join(job.attrib["global_temp_dir"], str(depth+1))):
        system("rm -rf %s" % os.path.join(job.attrib["global_temp_dir"], str(depth+1)))
    assert not os.path.isdir(os.path.join(job.attrib["global_temp_dir"], str(depth+2)))
    
    #Deal with memory and cpu requirements (this pass tells the running job how much cpu and memory they have,
    #according to the batch system
    tempJob.attrib["available_memory"] = str(memoryAvailable)
    tempJob.attrib["available_cpu"] = str(cpuAvailable)
    if stats != None:
        tempJob.attrib["stats"] = getTempFile(rootDir=localSlaveTempDir)
        os.remove(tempJob.attrib["stats"])
    
    #Now write the temp job file
    tempFile = getTempFile(rootDir=localSlaveTempDir)
    fileHandle = open(tempFile, 'w') 
    tree = ET.ElementTree(tempJob)
    tree.write(fileHandle)
    fileHandle.close()
    logger.info("Copied the jobs files ready for the job")
    
    if "JOB_FILE" not in command:
        logger.critical("There is no 'JOB_FILE' string in the command to be run to take the job-file argument: %s" % command)
        job.attrib["colour"] = "red" #Update the colour
    else:
        #First load the environment for the job.
        fileHandle = open(job.attrib["environment_file"], 'r')
        environment = cPickle.load(fileHandle)
        fileHandle.close()
        logger.info("Loaded the environment for the process")
        
        #Run the actual command
        tempLogFile = getTempFile(suffix=".log", rootDir=localSlaveTempDir)
        fileHandle = open(tempLogFile, 'w')
        finalCommand = command.replace("JOB_FILE", tempFile)
        if stats != None:
            startTime = time.time()
            startClock = getTotalCpuTime()
        process = subprocess.Popen(finalCommand, shell=True, stdout=fileHandle, stderr=subprocess.STDOUT, env=environment)
            
        sts = os.waitpid(process.pid, 0)
        fileHandle.close()
        truncateFile(tempLogFile, int(job.attrib["max_log_file_size"]))
        
        #Copy across the log file
        system("mv %s %s" % (tempLogFile, job.attrib["log_file"]))
        i = sts[1]
        
        logger.info("Ran the job command=%s with exit status %i" % (finalCommand, i))
        
        if i == 0:
            logger.info("Passed the job, okay")
            
            if stats != None:
                jobTag = ET.SubElement(stats, "job", { "time":str(time.time() - startTime), "clock":str(getTotalCpuTime() - startClock) })
                if os.path.exists(tempJob.attrib["stats"]):
                    jobTag.append(ET.parse(tempJob.attrib["stats"]).getroot())
            
            tempJob = ET.parse(tempFile).getroot()
            job.attrib["colour"] = "black" #Update the colour
            
            #Update the runtime of the stack..
            totalRuntime = float(job.attrib["total_time"])  #This is the estimate runtime of the jobs on the followon stack
            runtime = float(jobToRun.attrib["time"])
            totalRuntime -= runtime
            if totalRuntime < 0.0:
                totalRuntime = 0.0
            
            #The children
            children = job.find("children")
            assert len(children.findall("child")) == 0 #The children
            assert tempJob.find("children") != None
            for child in tempJob.find("children").findall("child"):
                memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, child)
                ET.SubElement(children, "child", { "command":child.attrib["command"], 
                        "time":str(compTime), "memory":str(memory), "cpu":str(cpu) })
                logger.info("Making a child with command: %s" % (child.attrib["command"]))
            
            #The follow on command
            followOns = job.find("followOns")
            followOns.remove(followOns.findall("followOn")[-1]) #Remove the old job
            if tempJob.attrib.has_key("command"):
                memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, tempJob)
                ET.SubElement(followOns, "followOn", { "command":tempJob.attrib["command"], 
                        "time":str(compTime), "memory":str(memory), "cpu":str(cpu) })
                ##Add the runtime to the total runtime..
                totalRuntime += compTime
                logger.info("Making a follow on job with command: %s" % tempJob.attrib["command"])
                
            elif len(tempJob.find("children").findall("child")) != 0: #This is to keep the stack of follow on jobs consistent.
                ET.SubElement(followOns, "followOn", { "command":"echo JOB_FILE", "time":"0", "memory":"1000000", "cpu":"1" })
                logger.info("Making a stub follow on job")
            #Write back the runtime, after addin the follow on time and subtracting the time of the run job.
            job.attrib["total_time"] = str(totalRuntime)
        else:
            logger.info("Failed the job")
            job.attrib["colour"] = "red" #Update the colour
    
    #Clean up
    system("rm -rf %s" % (localSlaveTempDir))
    logger.info("Cleaned up by removing temp jobfile (the copy), and the temporary file directory for the job")
Esempio n. 23
0
def main():
    sys.path +=  [ sys.argv[1] ]
    sys.argv.remove(sys.argv[1])
    
    #Now we can import all the stuff..
    from workflow.jobTree.lib.bioio import getBasicOptionParser
    from workflow.jobTree.lib.bioio import parseBasicOptions
    from workflow.jobTree.lib.bioio import logger
    from workflow.jobTree.lib.bioio import addLoggingFileHandler
    from workflow.jobTree.lib.bioio import setLogLevel
    from workflow.jobTree.lib.bioio import getTotalCpuTime
    
    from workflow.jobTree.lib.master import writeJobs
    
    ##########################################
    #Construct the arguments.
    ##########################################
    
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")
    
    parser.add_option("--job", dest="jobFile", 
                      help="Job file containing command to run",
                      default="None")
    
    options, args = parseBasicOptions(parser)
    assert len(args) == 0

    ##########################################
    #Parse the job.
    ##########################################
    
    job = ET.parse(options.jobFile).getroot()
    
    ##########################################
    #Setup the logging
    ##########################################
    
    #Setup the logging
    setLogLevel(job.attrib["log_level"])
    addLoggingFileHandler(job.attrib["slave_log_file"], rotatingLogging=False)
    logger.info("Parsed arguments and set up logging")
    
    ##########################################
    #Setup the stats, if requested
    ##########################################
    
    if job.attrib.has_key("stats"):
        startTime = time.time()
        startClock = time.clock()
        stats = ET.Element("slave")
    else:
        stats = None
    
    ##########################################
    #Run the script.
    ##########################################
    
    maxTime = float(job.attrib["job_time"])
    assert maxTime > 0.0
    assert maxTime < sys.maxint
    jobToRun = job.find("followOns").findall("followOn")[-1]
    memoryAvailable = int(jobToRun.attrib["memory"])
    cpuAvailable = int(jobToRun.attrib["cpu"])
    while True:
        processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats)
        
        if job.attrib["colour"] != "black":
            logger.info("Exiting the slave because of a failed job")
            break
   
        totalRuntime = float(job.attrib["total_time"])  #This is the estimate runtime of the jobs on the followon stack
        
        childrenNode = job.find("children")
        childrenList = childrenNode.findall("child")
        #childRuntime = sum([ float(child.attrib["time"]) for child in childrenList ])
            
        if len(childrenList) >= 2: # or totalRuntime + childRuntime > maxTime: #We are going to have to return to the parent
            logger.info("No more jobs can run in series by this slave, its got %i children" % len(childrenList))
            break
        
        followOns = job.find("followOns")
        while len(childrenList) > 0:
            child = childrenList.pop()
            childrenNode.remove(child)
            totalRuntime += float(child.attrib["time"])
            ET.SubElement(followOns, "followOn", child.attrib.copy())
        #assert totalRuntime <= maxTime + 1 #The plus one second to avoid unimportant rounding errors
        job.attrib["total_time"] = str(totalRuntime)
        assert len(childrenNode.findall("child")) == 0
        
        if len(followOns.findall("followOn")) == 0:
            logger.info("No more jobs can run by this slave as we have exhausted the follow ons")
            break
        
        #Get the next job and see if we have enough cpu and memory to run it..
        jobToRun = job.find("followOns").findall("followOn")[-1]
        if int(jobToRun.attrib["memory"]) > memoryAvailable:
            logger.info("We need more memory for the next job, so finishing")
            break
        if int(jobToRun.attrib["cpu"]) > cpuAvailable:
            logger.info("We need more cpus for the next job, so finishing")
            break
        
        ##Updated the job so we can start the next loop cycle
        job.attrib["colour"] = "grey"
        writeJobs([ job ])
        logger.info("Updated the status of the job to grey and starting the next job")
    
    #Write back the job file with the updated jobs, using the checkpoint method.
    writeJobs([ job ])
    logger.info("Written out an updated job file")
    
    logger.info("Finished running the chain of jobs on this node")
    
    ##########################################
    #Finish up the stats
    ##########################################
    
    if stats != None:
        stats.attrib["time"] = str(time.time() - startTime)
        stats.attrib["clock"] = str(getTotalCpuTime() - startClock)
        fileHandle = open(job.attrib["stats"], 'w')
        ET.ElementTree(stats).write(fileHandle)
        fileHandle.close()
Esempio n. 24
0
def main():
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")

    parser.add_option("--job", dest="jobFile", help="Job file containing command to run", default="None")

    parser.add_option(
        "--treePointer", dest="treePointerFile", help="File containing pointer to the tree data", default="None"
    )

    options, args = parseBasicOptions(parser)

    logger.info("Parsed the input arguments")

    job = ET.parse(options.jobFile).getroot()
    setLogLevel(job.attrib["log_level"])

    logger.info("Parsed the job XML")

    treePointer = ET.parse(options.treePointerFile).getroot()

    logger.info("Parsed the tree pointer XML")

    tree = ET.parse(treePointer.attrib["file"]).getroot()

    logger.info("Parsed the tree XML")

    for child in tree.find("children").findall("child"):
        # Make the chuld tree pointer
        childTreePointerFile = makeTreePointer(child.attrib["file"], getTempFile(rootDir=job.attrib["global_temp_dir"]))
        # Make the child command
        unbornChild = ET.SubElement(job.find("children"), "child")
        command = "jobTreeTest_CommandFirst.py --treePointer %s --job JOB_FILE" % (childTreePointerFile,)
        unbornChild.attrib["command"] = command
        if random.random() > 0.2:
            unbornChild.attrib["time"] = str(random.random() * 10)
        # Make the child tree pointer
        ET.SubElement(treePointer.find("children"), "child", {"file": childTreePointerFile})

    job.attrib["command"] = "jobTreeTest_CommandSecond.py --treePointer %s --job JOB_FILE" % (options.treePointerFile,)
    logger.info("Made new command")

    fileHandle = open(options.jobFile, "w")
    ET.ElementTree(job).write(fileHandle)
    fileHandle.close()

    logger.info("Updated the job file")

    print >> sys.stderr, "Checking that we can report to std err"  # These lines should end up in the logs
    print "Checking that we can report to std out"

    if random.random() > 0.9:
        logger.info("Going to fail the job")
        sys.exit(1)
    logger.info("Going to pass the job done okay")
    sys.exit(0)
Esempio n. 25
0
def main():
    """Reports the state of the job tree.
    """
    
    ##########################################
    #Construct the arguments.
    ##########################################  
    
    parser = getBasicOptionParser("usage: %prog [options] \nThe colours returned indicate the state of the job.\n\
\twhite: job has not been started yet\n\
\tgrey: job is issued to batch system\n\
\tred: job failed\n\
\tblue: job has children currently being processed\n\
\tblack: job has finished and will be processed (transient state)\n\
\tdead: job is totally finished and is awaiting deletion (transient state)", "%prog 0.1")
    
    parser.add_option("--jobTree", dest="jobTree", 
                      help="Directory containing the job tree")
    
    parser.add_option("--verbose", dest="verbose", action="store_true",
                      help="Print loads of information, particularly all the log files of errors",
                      default=False)
    
    parser.add_option("--failIfNotComplete", dest="failIfNotComplete", action="store_true",
                      help="Return exit value of 1 if job tree jobs not all completed",
                      default=False)
    
    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")
    assert len(args) == 0
    
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    
    ##########################################
    #Do some checks.
    ##########################################
    
    logger.info("Checking if we have files for job tree")
    assert options.jobTree != None
    assert os.path.isdir(options.jobTree) #The given job dir tree must exist.
    assert os.path.isfile(os.path.join(options.jobTree, "config.xml")) #A valid job tree must contain the config gile
    assert os.path.isdir(os.path.join(options.jobTree, "jobs")) #A job tree must have a directory of jobs.
    assert os.path.isdir(os.path.join(options.jobTree, "tempDirDir")) #A job tree must have a directory of temporary directories (for jobs to make temp files in).
    assert os.path.isdir(os.path.join(options.jobTree, "logFileDir")) #A job tree must have a directory of log files.
    assert os.path.isdir(os.path.join(options.jobTree, "slaveLogFileDir")) #A job tree must have a directory of slave log files.
    
    ##########################################
    #Read the total job number
    ##########################################  
    
    config = ET.parse(os.path.join(options.jobTree, "config.xml")).getroot()
    
    ##########################################
    #Survey the status of the job and report.
    ##########################################  
    
    colours = {}
    jobFiles = TempFileTree(config.attrib["job_file_dir"]).listFiles()
    if len(jobFiles) > 0:
        logger.info("Collating the colours of the job tree")
        for absFileName in jobFiles:
            job = parseJobFile(absFileName)
            if job != None:
                if not colours.has_key(job.attrib["colour"]):
                    colours[job.attrib["colour"]] = 0
                colours[job.attrib["colour"]] += 1
    else:
        logger.info("There are no jobs to collate")
    
    print "There are %i jobs currently in job tree: %s" % \
    (len(jobFiles), options.jobTree)
    
    for colour in colours.keys():
        print "\tColour: %s, number of jobs: %s" % (colour, colours[colour])
    
    if options.verbose: #Verbose currently means outputting the files that have failed.
        for absFileName in jobFiles:
            job = parseJobFile(absFileName)
            if job != None:
                if job.attrib["colour"] == "red":
                    if os.path.isfile(job.attrib["log_file"]):
                        def fn(string):
                            print string
                        logFile(job.attrib["log_file"], fn)
                    else:
                        logger.info("Log file for job %s is not present" % job.attrib["file"])
    
    if len(jobFiles) != 0 and options.failIfNotComplete:
        sys.exit(1)
Esempio n. 26
0
def main():
    """Reports stats on the job-tree, use in conjunction with --stats options to jobTree.
    """
    
    ##########################################
    #Construct the arguments.
    ##########################################  
    
    parser = getBasicOptionParser("usage: %prog", "%prog 0.1")
    
    parser.add_option("--jobTree", dest="jobTree", 
                      help="Directory containing the job tree")
    
    parser.add_option("--outputFile", dest="outputFile", default=None,
                      help="File in which to write results")
    
    
    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")
    assert len(args) == 0
    
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    
    ##########################################
    #Do some checks.
    ##########################################
    
    logger.info("Checking if we have files for job tree")
    
    if options.jobTree == None:
        raise RuntimeError("You did not specify the job-tree")
    
    if not os.path.isdir(options.jobTree):
        raise RuntimeError("The given job dir tree does not exist: %s" % options.jobTree)
    
    if not os.path.isfile(os.path.join(options.jobTree, "config.xml")):
        raise RuntimeError("A valid job tree must contain the config file")
    
    if not os.path.isfile(os.path.join(options.jobTree, "stats.xml")):
        raise RuntimeError("The job-tree was run without the --stats flag, so no stats were created")
    
    ##########################################
    #Read the stats and config
    ##########################################  
    
    config = ET.parse(os.path.join(options.jobTree, "config.xml")).getroot()
    stats = ET.parse(os.path.join(options.jobTree, "stats.xml")).getroot()
    
    ##########################################
    #Collate the stats and report
    ##########################################  
    
    def fn(element, items, itemName):
        itemTimes = [ float(item.attrib["time"]) for item in items ]
        itemTimes.sort()
        itemClocks = [ float(item.attrib["clock"]) for item in items ]
        itemClocks.sort()
        itemWaits = [ float(item.attrib["time"]) - float(item.attrib["clock"]) for item in items ]
        itemWaits.sort()
        if len(itemTimes) == 0:
            itemTimes.append(0)
            itemClocks.append(0)
            itemWaits.append(0)
        return ET.SubElement(element, itemName, { "total_number":str(len(items)),
                                               "total_time":str(sum(itemTimes)),
                                               "median_time":str(itemTimes[len(itemTimes)/2]),
                                               "average_time":str(sum(itemTimes)/len(itemTimes)),
                                               "min_time":str(min(itemTimes)),
                                               "max_time":str(max(itemTimes)),
                                               "total_clock":str(sum(itemClocks)),
                                               "median_clock":str(itemClocks[len(itemClocks)/2]),
                                               "average_clock":str(sum(itemClocks)/len(itemClocks)),
                                               "min_clock":str(min(itemClocks)),
                                               "max_clock":str(max(itemClocks)),
                                               "total_wait":str(sum(itemWaits)),
                                               "median_wait":str(itemWaits[len(itemWaits)/2]),
                                               "average_wait":str(sum(itemWaits)/len(itemWaits)),
                                               "min_wait":str(min(itemWaits)),
                                               "max_wait":str(max(itemWaits))
                                                })
    
    def fn2(element, containingItems, containingItemName, getFn):
        itemCounts = [ len(getFn(containingItem)) for containingItem in containingItems ]
        itemCounts.sort()
        if len(itemCounts) == 0: 
            itemCounts.append(0)
        element.attrib["median_number_per_%s" % containingItemName] = str(itemCounts[len(itemCounts)/2])
        element.attrib["average_number_per_%s" % containingItemName] = str(float(sum(itemCounts))/len(itemCounts))
        element.attrib["min_number_per_%s" % containingItemName] = str(min(itemCounts))
        element.attrib["max_number_per_%s" % containingItemName] = str(max(itemCounts))
    
    if stats.find("total_time") == None: #Hack to allow it to work on unfinished jobtrees.
        ET.SubElement(stats, "total_time", { "time":"0.0", "clock":"0.0"})
    
    collatedStatsTag = ET.Element("collated_stats", { "total_run_time":stats.find("total_time").attrib["time"],
                                                     "total_clock":stats.find("total_time").attrib["clock"],
                                                     "batch_system":config.attrib["batch_system"],
                                                     "job_time":config.attrib["job_time"],
                                                     "default_memory":config.attrib["default_memory"],
                                                     "default_cpu":config.attrib["default_cpu"],
                                                     "max_jobs":config.attrib["max_jobs"],
                                                     "max_threads":config.attrib["max_threads"] })
    
    #Add slave info
    slaves = stats.findall("slave")
    fn(collatedStatsTag, slaves, "slave")
    
    #Add job info
    jobs = []
    for slave in slaves:
        jobs += slave.findall("job")
    def fn3(slave):
        return slave.findall("job")
    fn2(fn(collatedStatsTag, jobs, "job"), slaves, "slave", fn3)
    
    #Add aggregated target info
    targets = []
    for job in jobs:
        for stack in job.findall("stack"):
            targets += stack.findall("target")
    def fn4(job):
        targets = []
        for stack in job.findall("stack"):
            targets += stack.findall("target")
        return targets
    fn2(fn(collatedStatsTag, targets, "target"), jobs, "job", fn4)   
    
    #Get info for each target
    targetNames = set()
    for target in targets:
        targetNames.add(target.attrib["class"])
    
    
    targetTypesTag = ET.SubElement(collatedStatsTag, "target_types")
    for targetName in targetNames:
        targetTypes = [ target for target in targets if target.attrib["class"] == targetName ]
        targetTypeTag = fn(targetTypesTag, targetTypes, targetName)
        estimatedRunTimes = [ float(target.attrib["e_time"]) for target in targetTypes ]
        targetTypeTag.attrib["estimated_time"] = str(sum(estimatedRunTimes)/len(estimatedRunTimes))
    
    def prettify(elem):
        """Return a pretty-printed XML string for the Element.
        """
        rough_string = ET.tostring(elem, 'utf-8')
        reparsed = minidom.parseString(rough_string)
        return reparsed.toprettyxml(indent="  ")
    
    #Now dump it all out to file
    if options.outputFile != None:
        fileHandle = open(options.outputFile, 'w')
        #ET.ElementTree(collatedStatsTag).write(fileHandle)
        fileHandle.write(prettify(collatedStatsTag))
        fileHandle.close()
    
    #Now dump onto the screen
    print prettify(collatedStatsTag)
Esempio n. 27
0
def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.
    """    
    waitDuration = float(config.attrib["wait_duration"])
    assert waitDuration >= 0
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters, wait duration %s, rescue jobs frequency: %s max job duration: %s" % \
                (waitDuration, rescueJobsFrequency, maxJobDuration))
    
    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0 #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")
    
    jobFiles = config.attrib["job_file_dir"].listFiles()
    logger.info("Got a list of job files")
    
    #Repair the job tree using any .old files
    fixJobsList(config, jobFiles)
    logger.info("Fixed the job files using any .old files")
    
    #Get jobs that were running, or that had failed reset to 'white' status
    restartFailedJobs(config, jobFiles)
    logger.info("Reworked failed jobs")
    
    updatedJobFiles = set() #Jobs whose status needs updating, either because they have finished, or because they need to be started.
    for jobFile in jobFiles:
        job = ET.parse(jobFile).getroot()
        if job.attrib["colour"] not in ("grey", "blue"):
            updatedJobFiles.add(jobFile)
    logger.info("Got the active (non grey/blue) job files")
    
    totalJobFiles = len(jobFiles) #Total number of job files we have.
    jobIDsToJobsHash = {} #A hash of the currently running jobs ids, made by the batch system.
    
    idealJobTime = float(config.attrib["job_time"]) 
    assert idealJobTime > 0.0
    
    maxIssuedJobs = int(config.attrib["max_jobs"]) #The maximum number of jobs to issue to the batch system
    assert maxIssuedJobs >= 1
    
    stats = config.attrib.has_key("stats")
    if stats:
        startTime = time.time()
        startClock = getTotalCpuTime()
    
    logger.info("Starting the main loop")
    timeSinceJobsLastRescued = time.time() - rescueJobsFrequency + 100 #We hack it so that we rescue jobs after the first 100 seconds to get around an apparent parasol bug
    while True:
        if len(updatedJobFiles) > 0:
            logger.debug("Built the jobs list, currently have %i job files, %i jobs to update and %i jobs currently issued" % (totalJobFiles, len(updatedJobFiles), len(jobIDsToJobsHash)))
        
	jobsToIssue = []
        for jobFile in list(updatedJobFiles):
            job = ET.parse(jobFile).getroot()
            assert job.attrib["colour"] not in ("grey", "blue")
            
            if job.attrib["colour"] == "white": #Get ready to start the job
                if len(jobIDsToJobsHash) < maxIssuedJobs:
                    logger.debug("Job: %s is being started" % job.attrib["file"])
                    updatedJobFiles.remove(job.attrib["file"])
                    
                    #Reset the log files for the job.
                    open(job.attrib["slave_log_file"], 'w').close()
                    open(job.attrib["log_file"], 'w').close()
                    
                    job.attrib["colour"] = "grey"
                    #writeJobs([ job ]) #Check point, do this before issuing job, so state is not read until issued
                    
                    #issueJobs([ job ], jobIDsToJobsHash, batchSystem)
		    jobsToIssue.append(job)
                else:
                    logger.debug("Job: %s is not being issued yet because we have %i jobs issued" % (job.attrib["file"], len(jobIDsToJobsHash)))
            elif job.attrib["colour"] == "black": #Job has finished okay
                logger.debug("Job: %s has finished okay" % job.attrib["file"])
                #Deal with stats
                if stats:
                    system("cat %s >> %s" % (job.attrib["stats"], config.attrib["stats"]))
                    open(job.attrib["stats"], 'w').close() #Reset the stats file
                childCount = int(job.attrib["child_count"])
                blackChildCount = int(job.attrib["black_child_count"])
                assert childCount == blackChildCount #Has no currently running child jobs
                #Launch any unborn children
                unbornChildren = job.find("children")
                unbornChild = unbornChildren.find("child")
                if unbornChild != None: #We must give birth to the unborn children
                    logger.debug("Job: %s has children to schedule" % job.attrib["file"])
                    newChildren = []
                    while unbornChild != None:
                        cummulativeChildTime = float(unbornChild.attrib["time"])
                        
                        newJob = createJob(unbornChild.attrib.copy(), job.attrib["file"], config)
                        
                        totalJobFiles += 1
                        updatedJobFiles.add(newJob.attrib["file"])
                        
                        newChildren.append(newJob)
                        unbornChildren.remove(unbornChild)
                        unbornChild = unbornChildren.find("child")
                        
                        #This was code to aggregate groups of children into one job, but we don't do this now
                        #while cummulativeChildTime < idealJobTime and unbornChild != None: #Cummulate a series of children into each job as a stack of jobs (to balance cost of parellelism with cost of running stuff in serially).
                        #    cummulativeChildTime += float(unbornChild.attrib["time"])
                        #    ET.SubElement(newJob.find("followOns"), "followOn", unbornChild.attrib.copy())    
                        #    unbornChildren.remove(unbornChild)
                        #    unbornChild = unbornChildren.find("child")
                        newJob.attrib["total_time"] = str(cummulativeChildTime)
                    
                    updatedJobFiles.remove(job.attrib["file"])
                    job.attrib["child_count"] = str(childCount + len(newChildren))
                    job.attrib["colour"] = "blue" #Blue - has children running.
                    writeJobs([ job ] + newChildren ) #Check point
                    
                elif len(job.find("followOns").findall("followOn")) != 0: #Has another job
                    logger.debug("Job: %s has a new command that we can now issue" % job.attrib["file"])
                    ##Reset the job run info
                    job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
                    job.attrib["colour"] = "white"
                    ##End resetting the job
                    writeJobs([ job ])
                    
                else: #Job has finished, so we can defer to any parent
                    logger.debug("Job: %s is now dead" % job.attrib["file"])
                    job.attrib["colour"] = "dead"
                    if job.attrib.has_key("parent"):
                        parent = ET.parse(job.attrib["parent"]).getroot()
                        assert parent.attrib["colour"] == "blue"
                        assert int(parent.attrib["black_child_count"]) < int(parent.attrib["child_count"])
                        parent.attrib["black_child_count"] = str(int(parent.attrib["black_child_count"]) + 1)
                        if int(parent.attrib["child_count"]) == int(parent.attrib["black_child_count"]):
                            parent.attrib["colour"] = "black"
                            assert parent.attrib["file"] not in updatedJobFiles
                            updatedJobFiles.add(parent.attrib["file"])
                        writeJobs([ job, parent ]) #Check point
                    else:
                        writeJobs([ job ])
                         
            elif job.attrib["colour"] == "red": #Job failed
                logger.critical("Job: %s failed" % job.attrib["file"])
                logger.critical("The log file of the failed job")
                logFile(job.attrib["log_file"], logger.critical)
                logger.critical("The log file of the slave for the failed job")
                logFile(job.attrib["slave_log_file"], logger.critical) #We log the job log file in the main loop
                #Checks
                assert len(job.find("children").findall("child")) == 0
                assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
                
                remainingRetyCount = int(job.attrib["remaining_retry_count"])
                if remainingRetyCount > 0: #Give it another try, maybe there is a bad node somewhere
                    job.attrib["remaining_retry_count"] = str(remainingRetyCount-1)
                    job.attrib["colour"] = "white"
                    logger.critical("Job: %s will be restarted, it has %s goes left" % (job.attrib["file"], job.attrib["remaining_retry_count"]))
                    writeJobs([ job ]) #Check point
                else:
                    assert remainingRetyCount == 0
                    updatedJobFiles.remove(job.attrib["file"])
                    logger.critical("Job: %s is completely failed" % job.attrib["file"])
                    
            else:
                logger.debug("Job: %s is already dead, we'll get rid of it" % job.attrib["file"])
                assert job.attrib["colour"] == "dead"
                updatedJobFiles.remove(job.attrib["file"])
                totalJobFiles -= 1
                deleteJob(job, config) #This could be done earlier, but I like it this way.

        ###End of for loop
        writeJobs(jobsToIssue) #Check point, do this before issuing job, so state is not read until issued
        issueJobs(jobsToIssue, jobIDsToJobsHash, batchSystem)

                
        if len(jobIDsToJobsHash) == 0 and len(updatedJobFiles) == 0:
            logger.info("Only failed jobs and their dependents (%i total) are remaining, so exiting." % totalJobFiles)
            break
        
        if len(updatedJobFiles) > 0:
            updatedJobs = batchSystem.getUpdatedJobs() #Asks the batch system what jobs have been completed.
        else:
            updatedJobs = pauseForUpdatedJobs(batchSystem.getUpdatedJobs) #Asks the batch system what jobs have been completed.
        for jobID in updatedJobs.keys(): #Runs through a map of updated jobs and there status, 
            result = updatedJobs[jobID]
            if jobIDsToJobsHash.has_key(jobID): 
                if result == 0:
                    logger.debug("Batch system is reporting that the job %s ended sucessfully" % jobIDsToJobsHash[jobID])   
                else:
                    logger.critical("Batch system is reporting that the job %s failed with exit value %i" % (jobIDsToJobsHash[jobID], result))  
                processFinishedJob(jobID, result, updatedJobFiles, jobIDsToJobsHash)
            else:
                logger.info("A result seems to already have been processed: %i" % jobID) #T
        
        if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency: #We only rescue jobs every N seconds
            reissueOverLongJobs(updatedJobFiles, jobIDsToJobsHash, config, batchSystem)
            logger.info("Reissued any over long jobs")
            
            reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem)
            logger.info("Rescued any (long) missing jobs")
            timeSinceJobsLastRescued = time.time()
        #Going to sleep to let the job system catch up.
        time.sleep(waitDuration)
    
    if stats:
        fileHandle = open(config.attrib["stats"], 'a')
        fileHandle.write("<total_time time='%s' clock='%s'/></stats>" % (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
        fileHandle.close()
    
    logger.info("Finished the main loop")     
    
    return totalJobFiles #Returns number of failed jobs
Esempio n. 28
0
def main():
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")
    
    parser.add_option("--job", dest="jobFile", 
                      help="Job file containing command to run",
                      default="None")
    
    parser.add_option("--treePointer", dest="treePointer", 
                      help="File containing pointer to the tree data",
                      default="None")
    
    options, args = parseBasicOptions(parser)
    
    logger.info("Parsed the input arguments")
    
    print >>sys.stderr, "Checking that we can report to std err" #These lines should end up in the logs
    print "Checking that we can report to std out"
    
    job = ET.parse(options.jobFile).getroot() 
    setLogLevel(job.attrib["log_level"])
    
    logger.info("Parsed the job XML")
    
    treePointer = ET.parse(options.treePointer).getroot() 
    
    logger.info("Parsed the tree pointer XML")
    
    tree = ET.parse(treePointer.attrib["file"]).getroot()
    
    logger.info("Parsed the tree XML")
    
    i = 0
    children = tree.find("children").findall("child")
    if len(children) > 0:
        for child in children:
            #Parse the child XML tree
            childTree = ET.parse(child.attrib["file"]).getroot()
            i += int(childTree.attrib["count"])
    else:
        i = 1
    
    tree.attrib["count"] = str(i)
    
    logger.info("Calculated the leaf count: %i" % i)
    
    fileHandle = open(treePointer.attrib["file"], 'w')
    ET.ElementTree(tree).write(fileHandle)
    fileHandle.close()
    
    logger.info("Updated the tree file: %s" % treePointer.attrib["file"])
        
    for childPointer in treePointer.find("children").findall("child"):
        if os.path.isfile(childPointer.attrib["file"]):
            os.remove(childPointer.attrib["file"])
    
    logger.info("Removed the child pointer files")
    
    logger.info("No need to update the job file, as we didn't make anything new!")

    if random.random() > 0.9:
        logger.info("Going to fail the job")
        sys.exit(1)
    logger.info("Going to pass the job done okay")
    sys.exit(0)
Esempio n. 29
0
def runJobTreeStats(jobTree, outputFile):
    system("jobTreeStats --jobTree %s --outputFile %s" % (jobTree, outputFile))
    logger.info("Ran the job-tree stats command apparently okay")
Esempio n. 30
0
from workflow.jobTree.scriptTree.stack import loadPickleFile

parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")

parser.add_option("--job", dest="jobFile", 
                  help="Job file containing command to run")

parser.add_option("--target", dest="target", 
                  help="File containing a pickled, wrapped instance of target classes")

options, args = parseBasicOptions(parser)

assert options.target != None

logger.info("Parsed the input arguments")      

#Naughty stuff to do the import of the target we need
for className in args:
    logger.info("Loading the class name", className)
    l = className.split(".")
    moduleName = ".".join(l[:-1])
    className = l[-1]
    _temp = __import__(moduleName, globals(), locals(), [ className ], -1)
    exec "%s = 1" % className
    vars()[className] = _temp.__dict__[className]

target = loadPickleFile(options.target)

target.execute(options.jobFile)