Python critical Examples, sonLib.bioio.logger.critical Python Examples

Example #1

0

Show file

File: slurmBatchSystem.py Project: hj1994412/jobTree

    def killJobs(self, jobIDs):
        """
        Kills the given job indexes and makes sure they're dead.
        """
        for jobID in jobIDs:
            slurmJobID = self.getSlurmJobID(jobID)
            logger.info("DEL: " + str(slurmJobID))
            self.currentjobs.remove(jobID)
            try:
                Slurm.killJob(slurmJobID)
            except Exception:
                pass

            #What is this????
            del self.jobIDs[self.slurmJobTasks[jobID]]
            del self.slurmJobTasks[jobID]

        toKill = set(jobIDs)
        maxattempts = 5
        attempts = 0
        while len(toKill) > 0 and attempts < maxattempts:
            for jobID in list(toKill):
                if SlurmBatchSystem.getJobExitCode(
                        self.slurmJobIDs[jobID]) is not None:
                    toKill.remove(jobID)

            if len(toKill) > 0:
                logger.critical(
                    "Tried to kill some jobs, but something happened and they are still going, so I'll try again"
                )
                time.sleep(5)
                attempts += 1

Example #2

0

Show file

    def killJobs(self):
        # Load hit list:
        killList = list()
        while not self.killQueue.empty():
            killList.append(self.killQueue.get())

        # Do the dirty job
        for jobID in list(killList):
            if jobID in self.runningJobs:
                process = subprocess.Popen(["qdel", self.getSgeID(jobID)])
            else:
                if jobID in self.waitingJobs:
                    self.waitingJobs.remove(jobID)
                self.killedJobsQueue.put(jobID)
                killList.remove(jobID)

        # Wait to confirm the kill
        while len(killList) > 0:
            for jobID in list(killList):
                if getjobexitcode(self.sgeJobIDs[jobID]) is not None:
                    self.killedJobsQueue.put(jobID)
                    killList.remove(jobID)
                    self.forgetJob(jobID)

            if len(killList) > 0:
                logger.critical(
                    "Tried to kill some jobs, but something happened and they are still going, so I'll try again"
                )
                time.sleep(5)

Example #3

0

Show file

File: gridengine.py Project: cooketho/jobTree

    def killJobs(self):
        # Load hit list:
        killList = list()
        while not self.killQueue.empty():
            killList.append(self.killQueue.get())

        # Do the dirty job
        for jobID in list(killList):
            if jobID in self.runningJobs:
                process = subprocess.Popen(["qdel", self.getSgeID(jobID)])
            else:
                if jobID in self.waitingJobs:
                    self.waitingJobs.remove(jobID)
                self.killedJobsQueue.put(jobID)
                killList.remove(jobID)

        # Wait to confirm the kill
        while len(killList) > 0:
            for jobID in list(killList):
                if getjobexitcode(self.sgeJobIDs[jobID]) is not None:
                    self.killedJobsQueue.put(jobID)
                    killList.remove(jobID)
                    self.forgetJob(jobID)

            if len(killList) > 0:
                logger.critical("Tried to kill some jobs, but something happened and they are still going, so I'll try again")
                time.sleep(5)

Example #4

0

Show file

File: blastTest.py Project: zqingyuan/cactus

    def testBlastParameters(self):
        """Tests if changing parameters of lastz creates results similar to the desired default.
        """
        encodeRegion = "ENm001"
        species = ("human", "mouse", "dog")
        #Other species to try "rat", "monodelphis", "macaque", "chimp"
        regionPath = os.path.join(self.encodePath, encodeRegion)
        for i in xrange(len(species)):
            species1 = species[i]
            for species2 in species[i + 1:]:
                seqFile1 = os.path.join(regionPath,
                                        "%s.%s.fa" % (species1, encodeRegion))
                seqFile2 = os.path.join(regionPath,
                                        "%s.%s.fa" % (species2, encodeRegion))

                #Run the random
                runNaiveBlast(seqFile1,
                              seqFile2,
                              self.tempOutputFile2,
                              self.tempDir,
                              lastzArguments=
                              "--nogapped --hspthresh=3000 --ambiguous=iupac")
                #Run the blast
                runNaiveBlast(
                    seqFile1,
                    seqFile2,
                    self.tempOutputFile,
                    self.tempDir,
                    lastzArguments=
                    "--nogapped --step=3 --hspthresh=3000 --ambiguous=iupac")

                logger.critical("Comparing blast settings")
                compareResultsFile(self.tempOutputFile, self.tempOutputFile2,
                                   0.7)

Example #5

0

Show file

def killKtServer(dbElem, killSwitchPath, killTimeout=10000):
    if not os.path.isfile(killSwitchPath):
        raise RuntimeError("Can't kill server because file" +
                           " not found %s" % killSwitchPath)
    logPath = __getLogPath(dbElem)
    isRunning =  __isKtServerRunning(dbElem, killSwitchPath)
    os.remove(killSwitchPath)
    logPath = __getLogPath(dbElem)
    if not isRunning:
        raise RuntimeError("Can't find running server to kill %s" % logPath)

    success = False
    for i in xrange(killTimeout):
        try:
            if pingKtServer(dbElem) or len(__scrapePids([logPath])) > 0:
                logger.critical("Waiting for ktserver to die, but still running with logPath: %s, pingKtServer returned %s, dB port: %s, __scrapePids returned: %s" % (logPath, pingKtServer(dbElem), dbElem.getDbPort(), __scrapePids([logPath])))
                sleep(1)
            else:
                success = True
                break
        except RuntimeError:
            logger.critical("Got runtime error while trying to kill ktserver, putting it down to bad luck and carrying on")
    if not success:
        raise RuntimeError("Failed to kill server within timeout. " +
                           "Server log is %s" % logPath)
    if os.path.exists(logPath):
        os.remove(logPath)
    return True

Example #6

0

Show file

File: blastTest.py Project: sanboissel/cactus

    def runComparisonOfBlastScriptVsNaiveBlast(self, blastMode):
        """We compare the output with a naive run of the blast program, to check the results are nearly
        equivalent.
        """
        encodeRegions = [ "ENm00" + str(i) for i in xrange(1,2) ] #, 2) ] #Could go to six
        species = ("human", "mouse", "dog")
        #Other species to try "rat", "monodelphis", "macaque", "chimp"
        for encodeRegion in encodeRegions:
            regionPath = os.path.join(self.encodePath, encodeRegion)
            for i in xrange(len(species)):
                species1 = species[i]
                for species2 in species[i+1:]:
                    seqFile1 = os.path.join(regionPath, "%s.%s.fa" % (species1, encodeRegion))
                    seqFile2 = os.path.join(regionPath, "%s.%s.fa" % (species2, encodeRegion))

                    #Run simple blast
                    runNaiveBlast(seqFile1, seqFile2, self.tempOutputFile, self.tempDir)
                    logger.info("Ran the naive blast okay")
                    
                    #Run cactus blast pipeline
                    toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
                    if blastMode == "allAgainstAll":
                        runCactusBlast(sequenceFiles=[ seqFile1, seqFile2 ],
                                       alignmentsFile=self.tempOutputFile2, toilDir=toilDir,
                                       chunkSize=500000, overlapSize=10000)
                    else:
                        runCactusBlast(sequenceFiles=[ seqFile1 ], alignmentsFile=self.tempOutputFile2,
                                       toilDir=toilDir, chunkSize=500000, overlapSize=10000,
                                       targetSequenceFiles=[ seqFile2 ])
                    logger.info("Ran cactus_blast okay")
                    logger.critical("Comparing cactus_blast and naive blast; using mode: %s" % blastMode)
                    checkCigar(self.tempOutputFile)
                    checkCigar(self.tempOutputFile2)
                    compareResultsFile(self.tempOutputFile, self.tempOutputFile2)

Example #7

0

Show file

File: master.py Project: decarlin/jobTree

def reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem, cpusUsed, killAfterNTimesMissing=3):
    """Check all the current job ids are in the list of currently running batch system jobs. 
    If a job is missing, we mark it as so, if it is missing for a number of runs of 
    this function (say 10).. then we try deleting the job (though its probably lost), we wait
    then we pass the job to processFinishedJob.
    """
    runningJobs = set(batchSystem.getIssuedJobIDs())
    jobIDsSet = set(jobIDsToJobsHash.keys())
    #Clean up the reissueMissingJobs_missingHash hash
    missingJobIDsSet = set(reissueMissingJobs_missingHash.keys())
    for jobID in missingJobIDsSet.difference(jobIDsSet):
        reissueMissingJobs_missingHash.pop(jobID)
    assert runningJobs.issubset(jobIDsSet) #Assert checks we have no unexpected jobs running
    for jobID in set(jobIDsSet.difference(runningJobs)):
        jobFile = jobIDsToJobsHash[jobID][0]
        if reissueMissingJobs_missingHash.has_key(jobID):
            reissueMissingJobs_missingHash[jobID] = reissueMissingJobs_missingHash[jobID]+1
        else:
            reissueMissingJobs_missingHash[jobID] = 1
        timesMissing = reissueMissingJobs_missingHash[jobID]
        logger.critical("Job %s with id %i is missing for the %i time" % (jobFile, jobID, timesMissing))
        if timesMissing == killAfterNTimesMissing:
            reissueMissingJobs_missingHash.pop(jobID)
            batchSystem.killJobs([ jobID ])
            cpusUsed = processFinishedJob(jobID, 1, updatedJobFiles, jobIDsToJobsHash, cpusUsed)
    return len(reissueMissingJobs_missingHash) == 0, cpusUsed #We use this to inform if there are missing jobs

Example #8

0

Show file

File: master.py Project: decarlin/jobTree

def fixJobsList(config, jobFiles):
    """Traverses through and finds any .old files, using there saved state to recover a 
    valid state of the job tree.
    """
    updatingFiles = []
    for fileName in jobFiles[:]:
        if "updating" in fileName:
            updatingFiles.append(fileName)
    
    for updatingFileName in updatingFiles: #Things crashed while the state was updating, so we should remove the 'updating' and '.new' files
        fileHandle = open(updatingFileName, 'r')
        for fileName in fileHandle.readline().split():
            if os.path.isfile(fileName):
                config.attrib["job_file_dir"].destroyTempFile(fileName)
                jobFiles.remove(fileName)
        fileHandle.close()
        config.attrib["job_file_dir"].destroyTempFile(updatingFileName)
        jobFiles.remove(updatingFileName)
    
    for fileName in jobFiles[:]: #Else any '.new' files will be switched in place of the original file. 
        if fileName[-3:] == 'new':
            originalFileName = fileName[:-4]
            os.rename(fileName, originalFileName)
            jobFiles.remove(fileName)
            if originalFileName not in jobFiles:
                jobFiles.append(originalFileName)
            logger.critical("Fixing the file: %s from %s" % (originalFileName, fileName))

Example #9

0

Show file

File: master.py Project: adamnovak/jobTree

def reissueMissingJobs(updatedJobFiles, jobBatcher, batchSystem, 
                       childJobFileToParentJob, childCounts, config,
                       killAfterNTimesMissing=3):
    """Check all the current job ids are in the list of currently running batch system jobs. 
    If a job is missing, we mark it as so, if it is missing for a number of runs of 
    this function (say 10).. then we try deleting the job (though its probably lost), we wait
    then we pass the job to processFinishedJob.
    """
    runningJobs = set(batchSystem.getIssuedJobIDs())
    jobIDsSet = set(jobBatcher.getJobIDs())
    #Clean up the reissueMissingJobs_missingHash hash, getting rid of jobs that have turned up
    missingJobIDsSet = set(reissueMissingJobs_missingHash.keys())
    for jobID in missingJobIDsSet.difference(jobIDsSet):
        reissueMissingJobs_missingHash.pop(jobID)
        logger.critical("Job id %s is no longer missing" % str(jobID))
    assert runningJobs.issubset(jobIDsSet) #Assert checks we have no unexpected jobs running
    jobsToKill = []
    for jobID in set(jobIDsSet.difference(runningJobs)):
        jobFile = jobBatcher.getJob(jobID)
        if reissueMissingJobs_missingHash.has_key(jobID):
            reissueMissingJobs_missingHash[jobID] = reissueMissingJobs_missingHash[jobID]+1
        else:
            reissueMissingJobs_missingHash[jobID] = 1
        timesMissing = reissueMissingJobs_missingHash[jobID]
        logger.critical("Job %s with id %s is missing for the %i time" % (jobFile, str(jobID), timesMissing))
        if timesMissing == killAfterNTimesMissing:
            reissueMissingJobs_missingHash.pop(jobID)
            jobsToKill.append(jobID)
    killJobs(jobsToKill, updatedJobFiles, jobBatcher, batchSystem, childJobFileToParentJob, childCounts, config)
    return len(reissueMissingJobs_missingHash) == 0 #We use this to inform if there are missing jobs

Example #10

0

Show file

File: blastTest.py Project: benedictpaten/cactus

    def runComparisonOfBlastScriptVsNaiveBlast(self, blastMode):
        """We compare the output with a naive run of the blast program, to check the results are nearly
        equivalent.
        """
        encodeRegions = [ "ENm00" + str(i) for i in xrange(1,2) ] #, 2) ] #Could go to six
        species = ("human", "mouse", "dog")
        #Other species to try "rat", "monodelphis", "macaque", "chimp"
        for encodeRegion in encodeRegions:
            regionPath = os.path.join(self.encodePath, encodeRegion)
            for i in xrange(len(species)):
                species1 = species[i]
                for species2 in species[i+1:]:
                    seqFile1 = os.path.join(regionPath, "%s.%s.fa" % (species1, encodeRegion))
                    seqFile2 = os.path.join(regionPath, "%s.%s.fa" % (species2, encodeRegion))

                    #Run simple blast
                    runNaiveBlast(seqFile1, seqFile2, self.tempOutputFile, self.tempDir)
                    logger.info("Ran the naive blast okay")
                    
                    #Run cactus blast pipeline
                    toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
                    if blastMode == "allAgainstAll":
                        runCactusBlast(sequenceFiles=[ seqFile1, seqFile2 ],
                                       alignmentsFile=self.tempOutputFile2, toilDir=toilDir,
                                       chunkSize=500000, overlapSize=10000)
                    else:
                        runCactusBlast(sequenceFiles=[ seqFile1 ], alignmentsFile=self.tempOutputFile2,
                                       toilDir=toilDir, chunkSize=500000, overlapSize=10000,
                                       targetSequenceFiles=[ seqFile2 ])
                    logger.info("Ran cactus_blast okay")
                    logger.critical("Comparing cactus_blast and naive blast; using mode: %s" % blastMode)
                    checkCigar(self.tempOutputFile)
                    checkCigar(self.tempOutputFile2)
                    compareResultsFile(self.tempOutputFile, self.tempOutputFile2)

Example #11

0

Show file

File: master.py Project: adamnovak/jobTree

def processAnyNewFile(jobFile):
    if os.path.isfile(jobFile + ".new"): #The job was not properly updated before crashing
        logger.critical("There was a .new file for the job and no .updating file %s" % jobFile)
        if os.path.isfile(jobFile):
            os.remove(jobFile)
        os.rename(jobFile + ".new", jobFile)
        return True
    return False

Example #12

0

Show file

File: master.py Project: tmfarrell/ont_dap

def processAnyNewFile(jobFile):
    if os.path.isfile(
            jobFile +
            ".new"):  #The job was not properly updated before crashing
        logger.critical(
            "There was a .new file for the job and no .updating file %s" %
            jobFile)
        if os.path.isfile(jobFile):
            os.remove(jobFile)
        os.rename(jobFile + ".new", jobFile)
        return True
    return False

Example #13

0

Show file

File: parasol.py Project: ArtRand/jobTree

 def killJobs(self, jobIDs):
     """Kills the given jobs, represented as Job ids, then checks they are dead by checking
     they are not in the list of issued jobs.
     """
     while True:
         for jobID in jobIDs:
             exitValue = popenParasolCommand("%s remove job %i" % (self.parasolCommand, jobID), runUntilSuccessful=False)[0]
             logger.info("Tried to remove jobID: %i, with exit value: %i" % (jobID, exitValue))
         runningJobs = self.getIssuedJobIDs()
         if set(jobIDs).difference(set(runningJobs)) == set(jobIDs):
             return
         time.sleep(5)
         logger.critical("Tried to kill some jobs, but something happened and they are still going, so I'll try again")

Example #14

0

Show file

File: master.py Project: adamnovak/jobTree

def processFinishedJob(jobID, resultStatus, updatedJobFiles, jobBatcher, childJobFileToParentJob, childCounts, config):
    """Function reads a processed job file and updates it state.
    """
    jobFile = jobBatcher.removeJobID(jobID)
    updatingFilePresent = processAnyUpdatingFile(jobFile)
    newFilePresent = processAnyNewFile(jobFile)
    jobDir = os.path.split(jobFile)[0]
    if os.path.exists(getJobLogFileName(jobDir)):
        logger.critical("The job seems to have left a log file, indicating failure: %s", jobFile)
        logFile(getJobLogFileName(jobDir), logger.critical)
    if os.path.isfile(jobFile):        
        job = Job.read(jobFile)
        assert job not in updatedJobFiles
        if resultStatus != 0 or newFilePresent or updatingFilePresent:
            if not os.path.exists(job.getLogFileName()):
                logger.critical("No log file is present, despite job failing: %s", jobFile)
            setupJobAfterFailure(job, config)
        if len(job.followOnCommands) > 0 or len(job.children) > 0:
            updatedJobFiles.add(job) #Now we know the job is done we can add it to the list of updated job files
            logger.debug("Added job: %s to active jobs" % jobFile)
        else:
            for message in job.messages: #This is here because jobs with no children or follow ons may log to master.
                logger.critical("Got message from job at time: %s : %s" % (time.time(), message))
            logger.debug("Job has no follow-ons or children despite job file being present so we'll consider it done: %s" % jobFile)
            updateParentStatus(jobFile, updatedJobFiles, childJobFileToParentJob, childCounts)
    else:  #The job is done
        if resultStatus != 0:
            logger.critical("Despite the batch system claiming failure the job %s seems to have finished and been removed" % jobFile)
        updateParentStatus(jobFile, updatedJobFiles, childJobFileToParentJob, childCounts)

Example #15

0

Show file

File: parasol.py Project: ArtRand/jobTree

def popenParasolCommand(command, runUntilSuccessful=True):
    """Issues a parasol command using popen to capture the output.
    If the command fails then it will try pinging parasol until it gets a response.
    When it gets a response it will recursively call the issue parasol command, repeating this pattern 
    for a maximum of N times. 
    The final exit value will reflect this.
    """
    while True:
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=sys.stderr, bufsize=-1)
        output, nothing = process.communicate() #process.stdout.read().strip()
        exitValue = process.wait()
        if exitValue == 0:
            return 0, output.split("\n")
        logger.critical("The following parasol command failed (exit value %s): %s" % (exitValue, command))
        if not runUntilSuccessful:
            return exitValue, None
        time.sleep(10)
        logger.critical("Waited for a few seconds, will try again")

Example #16

0

Show file

File: master.py Project: adamnovak/jobTree

def reissueOverLongJobs(updatedJobFiles, jobBatcher, config, batchSystem, childJobFileToParentJob, childCounts):
    """Check each issued job - if it is running for longer than desirable.. issue a kill instruction.
    Wait for the job to die then we pass the job to processFinishedJob.
    """
    maxJobDuration = float(config.attrib["max_job_duration"])
    idealJobTime = float(config.attrib["job_time"])
    if maxJobDuration < idealJobTime * 10:
        logger.info("The max job duration is less than 10 times the ideal the job time, so I'm setting it to the ideal job time, sorry, but I don't want to crash your jobs because of limitations in jobTree ")
        maxJobDuration = idealJobTime * 10
    jobsToKill = []
    if maxJobDuration < 10000000: #We won't both doing anything is the rescue time is more than 16 weeks.
        runningJobs = batchSystem.getRunningJobIDs()
        for jobID in runningJobs.keys():
            if runningJobs[jobID] > maxJobDuration:
                logger.critical("The job: %s has been running for: %s seconds, more than the max job duration: %s, we'll kill it" % \
                            (str(jobBatcher.getJob(jobID)), str(runningJobs[jobID]), str(maxJobDuration)))
                jobsToKill.append(jobID)
        killJobs(jobsToKill, updatedJobFiles, jobBatcher, batchSystem, childJobFileToParentJob, childCounts, config)

Example #17

0

Show file

File: parasol.py Project: tmfarrell/ont_dap

 def killJobs(self, jobIDs):
     """Kills the given jobs, represented as Job ids, then checks they are dead by checking
     they are not in the list of issued jobs.
     """
     while True:
         for jobID in jobIDs:
             exitValue = popenParasolCommand("%s remove job %i" %
                                             (self.parasolCommand, jobID),
                                             runUntilSuccessful=False)[0]
             logger.info("Tried to remove jobID: %i, with exit value: %i" %
                         (jobID, exitValue))
         runningJobs = self.getIssuedJobIDs()
         if set(jobIDs).difference(set(runningJobs)) == set(jobIDs):
             return
         time.sleep(5)
         logger.critical(
             "Tried to kill some jobs, but something happened and they are still going, so I'll try again"
         )

Example #18

0

Show file

File: lsf.py Project: benedictpaten/jobTree

    def killJobs(self, jobIDs):
        """Kills the given job IDs.
        """
        for jobID in jobIDs:
            logger.info("DEL: " + str(self.getLsfID(jobID)))
            self.currentjobs.remove(jobID)
            process = subprocess.Popen(["bkill", self.getLsfID(jobID)])
            del self.jobIDs[self.lsfJobIDs[jobID]]
            del self.lsfJobIDs[jobID]

        toKill = set(jobIDs)
        while len(toKill) > 0:
            for jobID in list(toKill):
                if getjobexitcode(self.lsfJobIDs[jobID]) is not None:
                    toKill.remove(jobID)

            if len(toKill) > 0:
                logger.critical("Tried to kill some jobs, but something happened and they are still going, so I'll try again")
                time.sleep(5)

Example #19

0

Show file

File: gridengine.py Project: adamnovak/jobTree

    def killJobs(self, jobIDs):
        """Kills the given jobs, represented as Job ids, then checks they are dead by checking
        they are not in the list of issued jobs.
        """
        for jobID in jobIDs:
            self.currentjobs.remove(jobID)
            process = subprocess.Popen(["qdel", self.getSgeID(jobID)])
            del self.jobIDs[self.sgeJobIDs[jobID]]
            del self.sgeJobIDs[jobID]

        toKill = set(jobIDs)
        while len(toKill) > 0:
            for jobID in list(toKill):
                if getjobexitcode(self.sgeJobIDs[jobID]) is not None:
                    toKill.remove(jobID)

            if len(toKill) > 0:
                logger.critical("Tried to kill some jobs, but something happened and they are still going, so I'll try again")
                time.sleep(5)

Example #20

0

Show file

File: parasol.py Project: decarlin/jobTree

def popenParasolCommand(command, tmpFileForStdOut, runUntilSuccessful=True):
    """Issues a parasol command using popen to capture the output.
    If the command fails then it will try pinging parasol until it gets a response.
    When it gets a response it will recursively call the issue parasol command, repeating this pattern 
    for a maximum of N times. 
    The final exit value will reflect this.
    """
    while True:
        fileHandle = open(tmpFileForStdOut, 'w')
        process = subprocess.Popen(command, shell=True, stdout=fileHandle)
        sts = os.waitpid(process.pid, 0)
        fileHandle.close()
        i = sts[1]
        if i != 0 and runUntilSuccessful:
            logger.critical("The following parasol command failed: %s" % command)
            time.sleep(10)
            logger.critical("Waited for a few seconds, will try again")
        else:
            return i

Example #21

0

Show file

File: lsf.py Project: hj1994412/jobTree

    def killJobs(self, jobIDs):
        """Kills the given job IDs.
        """
        for jobID in jobIDs:
            logger.info("DEL: " + str(self.getLsfID(jobID)))
            self.currentjobs.remove(jobID)
            process = subprocess.Popen(["bkill", self.getLsfID(jobID)])
            del self.jobIDs[self.lsfJobIDs[jobID]]
            del self.lsfJobIDs[jobID]

        toKill = set(jobIDs)
        while len(toKill) > 0:
            for jobID in list(toKill):
                if getjobexitcode(self.lsfJobIDs[jobID]) is not None:
                    toKill.remove(jobID)

            if len(toKill) > 0:
                logger.critical(
                    "Tried to kill some jobs, but something happened and they are still going, so I'll try again"
                )
                time.sleep(5)

Example #22

0

Show file

File: parasol.py Project: ArtRand/jobTree

 def __init__(self, config, maxCpus, maxMemory):
     AbstractBatchSystem.__init__(self, config, maxCpus, maxMemory) #Call the parent constructor
     if maxMemory != sys.maxint:
         logger.critical("A max memory has been specified for the parasol batch system class of %i, but currently this batchsystem interface does not support such limiting" % maxMemory)
     #Keep the name of the results file for the pstat2 command..
     self.parasolCommand = config.attrib["parasol_command"]
     self.parasolResultsFile = getParasolResultsFileName(config.attrib["job_tree"])
     #Reset the job queue and results (initially, we do this again once we've killed the jobs)
     self.queuePattern = re.compile("q\s+([0-9]+)")
     self.runningPattern = re.compile("r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+")
     self.killJobs(self.getIssuedJobIDs()) #Kill any jobs on the current stack
     logger.info("Going to sleep for a few seconds to kill any existing jobs")
     time.sleep(5) #Give batch system a second to sort itself out.
     logger.info("Removed any old jobs from the queue")
     #Reset the job queue and results
     exitValue = popenParasolCommand("%s -results=%s clear sick" % (self.parasolCommand, self.parasolResultsFile), False)[0]
     if exitValue != None:
         logger.critical("Could not clear sick status of the parasol batch %s" % self.parasolResultsFile)
     exitValue = popenParasolCommand("%s -results=%s flushResults" % (self.parasolCommand, self.parasolResultsFile), False)[0]
     if exitValue != None:
         logger.critical("Could not flush the parasol batch %s" % self.parasolResultsFile)
     open(self.parasolResultsFile, 'w').close()
     logger.info("Reset the results queue")
     #Stuff to allow max cpus to be work
     self.outputQueue1 = Queue()
     self.outputQueue2 = Queue()
     #worker = Thread(target=getUpdatedJob, args=(self.parasolResultsFileHandle, self.outputQueue1, self.outputQueue2))
     #worker.setDaemon(True)
     worker = Process(target=getUpdatedJob, args=(self.parasolResultsFile, self.outputQueue1, self.outputQueue2))
     worker.daemon = True
     worker.start()
     self.usedCpus = 0
     self.jobIDsToCpu = {}

Example #23

0

Show file

File: master.py Project: adamnovak/jobTree

def processAnyUpdatingFile(jobFile):
    if os.path.isfile(jobFile + ".updating"):
        logger.critical("There was an .updating file for job: %s" % jobFile)
        if os.path.isfile(jobFile + ".new"): #The job failed while writing the updated job file.
            logger.critical("There was a .new file for the job: %s" % jobFile)
            os.remove(jobFile + ".new") #The existance of the .updating file means it wasn't complete
        for f in listChildDirs(os.path.split(jobFile)[0]):
            logger.critical("Removing broken child %s\n" % f)
            system("rm -rf %s" % f)
        assert os.path.isfile(jobFile)
        os.remove(jobFile + ".updating") #Delete second the updating file second to preserve a correct state
        logger.critical("We've reverted to the original job file: %s" % jobFile)
        return True
    return False

Example #24

0

Show file

File: blastTest.py Project: sanboissel/cactus

 def testCompression(self):
     tempSeqFile = os.path.join(self.tempDir, "tempSeq.fa")
     tempSeqFile2 = os.path.join(self.tempDir, "tempSeq2.fa")
     self.tempFiles.append(tempSeqFile)
     self.tempFiles.append(tempSeqFile2)
     self.encodePath = os.path.join(self.encodePath, "ENm001")
     catFiles([ os.path.join(self.encodePath, fileName) for fileName in os.listdir(self.encodePath) ], tempSeqFile)
     startTime = time.time()
     compressFastaFile(tempSeqFile)
     logger.critical("It took %s seconds to compress the fasta file" % (time.time() - startTime))
     startTime = time.time()
     system("rm %s" % tempSeqFile + ".bz2")
     system("bzip2 --keep --fast %s" % tempSeqFile)
     logger.critical("It took %s seconds to compress the fasta file by system functions" % (time.time() - startTime))
     startTime = time.time()
     decompressFastaFile(tempSeqFile + ".bz2", tempSeqFile2)
     logger.critical("It took %s seconds to decompress the fasta file" % (time.time() - startTime))
     system("rm %s" % tempSeqFile2)
     startTime = time.time()
     system("bunzip2 --stdout %s > %s" % (tempSeqFile + ".bz2", tempSeqFile2))
     logger.critical("It took %s seconds to decompress the fasta file using system function" % (time.time() - startTime))
     logger.critical("File sizes, before: %s, compressed: %s, decompressed: %s" % (os.stat(tempSeqFile).st_size, os.stat(tempSeqFile + ".bz2").st_size, os.stat(tempSeqFile2).st_size))

Example #25

0

Show file

File: blastTest.py Project: benedictpaten/cactus

 def testBlastParameters(self):
     """Tests if changing parameters of lastz creates results similar to the desired default.
     """
     encodeRegion = "ENm001"
     species = ("human", "mouse", "dog")
     #Other species to try "rat", "monodelphis", "macaque", "chimp"
     regionPath = os.path.join(self.encodePath, encodeRegion)
     for i in xrange(len(species)):
         species1 = species[i]
         for species2 in species[i+1:]:
             seqFile1 = os.path.join(regionPath, "%s.%s.fa" % (species1, encodeRegion))
             seqFile2 = os.path.join(regionPath, "%s.%s.fa" % (species2, encodeRegion))
             
             #Run the random
             runNaiveBlast(seqFile1, seqFile2, self.tempOutputFile2, self.tempDir,
                           lastzArguments="--nogapped --hspthresh=3000 --ambiguous=iupac")
             #Run the blast
             runNaiveBlast(seqFile1, seqFile2, self.tempOutputFile, self.tempDir,
                           lastzArguments="--nogapped --step=3 --hspthresh=3000 --ambiguous=iupac")
             
             logger.critical("Comparing blast settings")
             compareResultsFile(self.tempOutputFile, self.tempOutputFile2, 0.7)

Example #26

0

Show file

File: master.py Project: tmfarrell/ont_dap

def reissueMissingJobs(updatedJobFiles,
                       jobBatcher,
                       batchSystem,
                       childJobFileToParentJob,
                       childCounts,
                       config,
                       killAfterNTimesMissing=3):
    """Check all the current job ids are in the list of currently running batch system jobs.
    If a job is missing, we mark it as so, if it is missing for a number of runs of
    this function (say 10).. then we try deleting the job (though its probably lost), we wait
    then we pass the job to processFinishedJob.
    """
    runningJobs = set(batchSystem.getIssuedJobIDs())
    jobIDsSet = set(jobBatcher.getJobIDs())
    #Clean up the reissueMissingJobs_missingHash hash, getting rid of jobs that have turned up
    missingJobIDsSet = set(reissueMissingJobs_missingHash.keys())
    for jobID in missingJobIDsSet.difference(jobIDsSet):
        reissueMissingJobs_missingHash.pop(jobID)
        logger.critical("Job id %s is no longer missing" % str(jobID))
    assert runningJobs.issubset(
        jobIDsSet)  #Assert checks we have no unexpected jobs running
    jobsToKill = []
    for jobID in set(jobIDsSet.difference(runningJobs)):
        jobFile = jobBatcher.getJob(jobID)
        if reissueMissingJobs_missingHash.has_key(jobID):
            reissueMissingJobs_missingHash[
                jobID] = reissueMissingJobs_missingHash[jobID] + 1
        else:
            reissueMissingJobs_missingHash[jobID] = 1
        timesMissing = reissueMissingJobs_missingHash[jobID]
        logger.critical("Job %s with id %s is missing for the %i time" %
                        (jobFile, str(jobID), timesMissing))
        if timesMissing == killAfterNTimesMissing:
            reissueMissingJobs_missingHash.pop(jobID)
            jobsToKill.append(jobID)
    killJobs(jobsToKill, updatedJobFiles, jobBatcher, batchSystem,
             childJobFileToParentJob, childCounts, config)
    return len(reissueMissingJobs_missingHash
               ) == 0  #We use this to inform if there are missing jobs

Example #27

0

Show file

File: master.py Project: tmfarrell/ont_dap

def reissueOverLongJobs(updatedJobFiles, jobBatcher, config, batchSystem,
                        childJobFileToParentJob, childCounts):
    """Check each issued job - if it is running for longer than desirable.. issue a kill instruction.
    Wait for the job to die then we pass the job to processFinishedJob.
    """
    maxJobDuration = float(config.attrib["max_job_duration"])
    idealJobTime = float(config.attrib["job_time"])
    if maxJobDuration < idealJobTime * 10:
        logger.info(
            "The max job duration is less than 10 times the ideal the job time, so I'm setting it to the ideal job time, sorry, but I don't want to crash your jobs because of limitations in jobTree "
        )
        maxJobDuration = idealJobTime * 10
    jobsToKill = []
    if maxJobDuration < 10000000:  #We won't both doing anything is the rescue time is more than 16 weeks.
        runningJobs = batchSystem.getRunningJobIDs()
        for jobID in runningJobs.keys():
            if runningJobs[jobID] > maxJobDuration:
                logger.critical("The job: %s has been running for: %s seconds, more than the max job duration: %s, we'll kill it" % \
                            (str(jobBatcher.getJob(jobID)), str(runningJobs[jobID]), str(maxJobDuration)))
                jobsToKill.append(jobID)
        killJobs(jobsToKill, updatedJobFiles, jobBatcher, batchSystem,
                 childJobFileToParentJob, childCounts, config)

Example #28

0

Show file

File: blastTest.py Project: benedictpaten/cactus

 def testCompression(self):
     tempSeqFile = os.path.join(self.tempDir, "tempSeq.fa")
     tempSeqFile2 = os.path.join(self.tempDir, "tempSeq2.fa")
     self.tempFiles.append(tempSeqFile)
     self.tempFiles.append(tempSeqFile2)
     self.encodePath = os.path.join(self.encodePath, "ENm001")
     catFiles([ os.path.join(self.encodePath, fileName) for fileName in os.listdir(self.encodePath) ], tempSeqFile)
     startTime = time.time()
     compressFastaFile(tempSeqFile)
     logger.critical("It took %s seconds to compress the fasta file" % (time.time() - startTime))
     startTime = time.time()
     system("rm %s" % tempSeqFile + ".bz2")
     system("bzip2 --keep --fast %s" % tempSeqFile)
     logger.critical("It took %s seconds to compress the fasta file by system functions" % (time.time() - startTime))
     startTime = time.time()
     decompressFastaFile(tempSeqFile + ".bz2", tempSeqFile2)
     logger.critical("It took %s seconds to decompress the fasta file" % (time.time() - startTime))
     system("rm %s" % tempSeqFile2)
     startTime = time.time()
     system("bunzip2 --stdout %s > %s" % (tempSeqFile + ".bz2", tempSeqFile2))
     logger.critical("It took %s seconds to decompress the fasta file using system function" % (time.time() - startTime))
     logger.critical("File sizes, before: %s, compressed: %s, decompressed: %s" % (os.stat(tempSeqFile).st_size, os.stat(tempSeqFile + ".bz2").st_size, os.stat(tempSeqFile2).st_size))

Example #29

0

Show file

File: parasol.py Project: tmfarrell/ont_dap

def popenParasolCommand(command, runUntilSuccessful=True):
    """Issues a parasol command using popen to capture the output.
    If the command fails then it will try pinging parasol until it gets a response.
    When it gets a response it will recursively call the issue parasol command, repeating this pattern 
    for a maximum of N times. 
    The final exit value will reflect this.
    """
    while True:
        process = subprocess.Popen(command,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=sys.stderr,
                                   bufsize=-1)
        output, nothing = process.communicate()  #process.stdout.read().strip()
        exitValue = process.wait()
        if exitValue == 0:
            return 0, output.split("\n")
        logger.critical(
            "The following parasol command failed (exit value %s): %s" %
            (exitValue, command))
        if not runUntilSuccessful:
            return exitValue, None
        time.sleep(10)
        logger.critical("Waited for a few seconds, will try again")

Example #30

0

Show file

File: master.py Project: ArtRand/jobTree

def setupJobAfterFailure(job, config):
    if len(job.followOnCommands) > 0:
        job.remainingRetryCount = max(0, job.remainingRetryCount-1)
        logger.critical("Due to failure we are reducing the remaining retry count of job %s to %s" % (job.getJobFileName(), job.remainingRetryCount))
        #Set the default memory to be at least as large as the default, in case this was a malloc failure (we do this because of the combined
        #batch system)
        job.followOnCommands[-1] = (job.followOnCommands[-1][0], max(job.followOnCommands[-1][1], float(config.attrib["default_memory"]))) + job.followOnCommands[-1][2:]
        logger.critical("We have set the default memory of the failed job to %s bytes" % job.followOnCommands[-1][1])
    else:
        logger.critical("The job %s has no follow on jobs to reset" % job.getJobFileName())

Example #31

0

Show file

File: master.py Project: tmfarrell/ont_dap

def processFinishedJob(jobID, resultStatus, updatedJobFiles, jobBatcher,
                       childJobFileToParentJob, childCounts, config):
    """Function reads a processed job file and updates it state.
    """
    jobFile = jobBatcher.removeJobID(jobID)
    updatingFilePresent = processAnyUpdatingFile(jobFile)
    newFilePresent = processAnyNewFile(jobFile)
    jobDir = os.path.split(jobFile)[0]
    if os.path.exists(getJobLogFileName(jobDir)):
        logger.critical(
            "The job seems to have left a log file, indicating failure: %s",
            jobFile)
        logFile(getJobLogFileName(jobDir), logger.critical)
    if os.path.isfile(jobFile):
        job = Job.read(jobFile)
        assert job not in updatedJobFiles
        if resultStatus != 0 or newFilePresent or updatingFilePresent:
            if not os.path.exists(job.getLogFileName()):
                logger.critical(
                    "No log file is present, despite job failing: %s", jobFile)
            setupJobAfterFailure(job, config)
        if len(job.followOnCommands) > 0 or len(job.children) > 0:
            updatedJobFiles.add(
                job
            )  #Now we know the job is done we can add it to the list of updated job files
            logger.debug("Added job: %s to active jobs" % jobFile)
        else:
            for message in job.messages:  #This is here because jobs with no children or follow ons may log to master.
                logger.critical("Got message from job at time: %s : %s" %
                                (time.time(), message))
            logger.debug(
                "Job has no follow-ons or children despite job file being present so we'll consider it done: %s"
                % jobFile)
            updateParentStatus(jobFile, updatedJobFiles,
                               childJobFileToParentJob, childCounts)
    else:  #The job is done
        if resultStatus != 0:
            logger.critical(
                "Despite the batch system claiming failure the job %s seems to have finished and been removed"
                % jobFile)
        updateParentStatus(jobFile, updatedJobFiles, childJobFileToParentJob,
                           childCounts)

Example #32

0

Show file

File: master.py Project: tmfarrell/ont_dap

def processAnyUpdatingFile(jobFile):
    if os.path.isfile(jobFile + ".updating"):
        logger.critical("There was an .updating file for job: %s" % jobFile)
        if os.path.isfile(
                jobFile +
                ".new"):  #The job failed while writing the updated job file.
            logger.critical("There was a .new file for the job: %s" % jobFile)
            os.remove(
                jobFile + ".new"
            )  #The existance of the .updating file means it wasn't complete
        for f in listChildDirs(os.path.split(jobFile)[0]):
            logger.critical("Removing broken child %s\n" % f)
            system("rm -rf %s" % f)
        assert os.path.isfile(jobFile)
        os.remove(
            jobFile + ".updating"
        )  #Delete second the updating file second to preserve a correct state
        logger.critical("We've reverted to the original job file: %s" %
                        jobFile)
        return True
    return False

Example #33

0

Show file

File: parasol.py Project: tmfarrell/ont_dap

 def __init__(self, config, maxCpus, maxMemory):
     AbstractBatchSystem.__init__(self, config, maxCpus,
                                  maxMemory)  #Call the parent constructor
     if maxMemory != sys.maxint:
         logger.critical(
             "A max memory has been specified for the parasol batch system class of %i, but currently this batchsystem interface does not support such limiting"
             % maxMemory)
     #Keep the name of the results file for the pstat2 command..
     self.parasolCommand = config.attrib["parasol_command"]
     self.parasolResultsFile = getParasolResultsFileName(
         config.attrib["job_tree"])
     #Reset the job queue and results (initially, we do this again once we've killed the jobs)
     self.queuePattern = re.compile("q\s+([0-9]+)")
     self.runningPattern = re.compile(
         "r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+")
     self.killJobs(
         self.getIssuedJobIDs())  #Kill any jobs on the current stack
     logger.info(
         "Going to sleep for a few seconds to kill any existing jobs")
     time.sleep(5)  #Give batch system a second to sort itself out.
     logger.info("Removed any old jobs from the queue")
     #Reset the job queue and results
     exitValue = popenParasolCommand(
         "%s -results=%s clear sick" %
         (self.parasolCommand, self.parasolResultsFile), False)[0]
     if exitValue != None:
         logger.critical(
             "Could not clear sick status of the parasol batch %s" %
             self.parasolResultsFile)
     exitValue = popenParasolCommand(
         "%s -results=%s flushResults" %
         (self.parasolCommand, self.parasolResultsFile), False)[0]
     if exitValue != None:
         logger.critical("Could not flush the parasol batch %s" %
                         self.parasolResultsFile)
     open(self.parasolResultsFile, 'w').close()
     logger.info("Reset the results queue")
     #Stuff to allow max cpus to be work
     self.outputQueue1 = Queue()
     self.outputQueue2 = Queue()
     #worker = Thread(target=getUpdatedJob, args=(self.parasolResultsFileHandle, self.outputQueue1, self.outputQueue2))
     #worker.setDaemon(True)
     worker = Process(target=getUpdatedJob,
                      args=(self.parasolResultsFile, self.outputQueue1,
                            self.outputQueue2))
     worker.daemon = True
     worker.start()
     self.usedCpus = 0
     self.jobIDsToCpu = {}

Example #34

0

Show file

File: master.py Project: hj1994412/jobTree

def processAnyNewFile(jobFile):
    if os.path.isfile(
            jobFile +
            ".new"):  #The job was not properly updated before crashing
        logger.critical(
            "There was a .new file for the job and no .updating file %s" %
            jobFile)
        os.remove(jobFile +
                  ".new")  #A new file should only exist if atomic update fails
        for f in listChildDirs(os.path.split(jobFile)[0]):
            logger.critical("Removing broken child %s\n" % f)
            system("rm -rf %s" % f)
        assert os.path.isfile(jobFile)
        logger.critical("We've reverted to the original job file: %s" %
                        jobFile)
        return True
    return False

Example #35

0

Show file

File: master.py Project: tmfarrell/ont_dap

def setupJobAfterFailure(job, config):
    if len(job.followOnCommands) > 0:
        job.remainingRetryCount = max(0, job.remainingRetryCount - 1)
        logger.critical(
            "Due to failure we are reducing the remaining retry count of job %s to %s"
            % (job.getJobFileName(), job.remainingRetryCount))
        #Set the default memory to be at least as large as the default, in case this was a malloc failure (we do this because of the combined
        #batch system)
        job.followOnCommands[-1] = (job.followOnCommands[-1][0],
                                    max(job.followOnCommands[-1][1],
                                        float(config.attrib["default_memory"]))
                                    ) + job.followOnCommands[-1][2:]
        logger.critical(
            "We have set the default memory of the failed job to %s bytes" %
            job.followOnCommands[-1][1])
    else:
        logger.critical("The job %s has no follow on jobs to reset" %
                        job.getJobFileName())

Example #36

0

Show file

File: master.py Project: decarlin/jobTree

def reportJobLogFiles(job):
    logger.critical("The log file of the job")
    logFile(job.attrib["log_file"], logger.critical)
    logger.critical("The log file of the slave for the job")
    logFile(job.attrib["slave_log_file"], logger.critical) #We log the job log file in the main loop

Example #37

0

Show file

File: blastTest.py Project: benedictpaten/cactus

def checkResultsAreApproximatelyEqual(resultsComparator, closeness=0.95):
    """Checks that the comparisons show the two blasts are suitably close together.
    """
    logger.critical("Results are: %s" % resultsComparator)
    assert resultsComparator.sensitivity >= closeness
    assert resultsComparator.specificity >= closeness

Example #38

0

Show file

File: jobTreeSlave.py Project: adamnovak/jobTree

def main():
    sys.path.append(sys.argv[1])
    sys.argv.remove(sys.argv[1])
    
    #Now we can import all the stuff..
    from sonLib.bioio import getBasicOptionParser
    from sonLib.bioio import parseBasicOptions
    from sonLib.bioio import logger
    from sonLib.bioio import addLoggingFileHandler, redirectLoggerStreamHandlers
    from sonLib.bioio import setLogLevel
    from sonLib.bioio import getTotalCpuTime, getTotalCpuTimeAndMemoryUsage
    from sonLib.bioio import getTempDirectory
    from sonLib.bioio import makeSubDir
    from jobTree.src.job import Job
    from jobTree.src.master import getEnvironmentFileName, getConfigFileName, listChildDirs, getTempStatsFile, setupJobAfterFailure
    from sonLib.bioio import system
    
    ########################################## 
    #Input args
    ##########################################
    
    jobTreePath = sys.argv[1]
    jobFile = sys.argv[2]
    
    ##########################################
    #Load the environment for the job
    ##########################################
    
    #First load the environment for the job.
    fileHandle = open(getEnvironmentFileName(jobTreePath), 'r')
    environment = cPickle.load(fileHandle)
    fileHandle.close()
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)
    #os.environ = environment
    #os.putenv(key, value)
        
    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in.
    localSlaveTempDir = getTempDirectory()
    localTempDir = makeSubDir(os.path.join(localSlaveTempDir, "localTempDir"))
    
    ##########################################
    #Setup the logging
    ##########################################
    
    #Setup the logging
    tempSlaveLogFile = os.path.join(localSlaveTempDir, "slave_log.txt")
    slaveHandle = open(tempSlaveLogFile, 'w')
    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    logger.addHandler(logging.StreamHandler(slaveHandle))
    origStdErr = sys.stderr
    origStdOut = sys.stdout
    sys.stderr = slaveHandle 
    sys.stdout = slaveHandle
    
    ##########################################
    #Parse input files
    ##########################################
    
    config = ET.parse(getConfigFileName(jobTreePath)).getroot()
    setLogLevel(config.attrib["log_level"])
    job = Job.read(jobFile)
    job.messages = [] #This is the only way to stop messages logging twice, as are read only in the master
    job.children = []
    if os.path.exists(job.getLogFileName()): #This cleans the old log file
        os.remove(job.getLogFileName())
    logger.info("Parsed arguments and set up logging")

     #Try loop for slave logging
    ##########################################
    #Setup the stats, if requested
    ##########################################
    
    if config.attrib.has_key("stats"):
        startTime = time.time()
        startClock = getTotalCpuTime()
        stats = ET.Element("slave")
    else:
        stats = None
    
    ##########################################
    #The max time 
    ##########################################
    
    maxTime = float(config.attrib["job_time"])
    assert maxTime > 0.0
    assert maxTime < sys.maxint

    ##########################################
    #Slave log file trapped from here on in
    ##########################################

    slaveFailed = False
    try:
        
        ##########################################
        #The next job
        ##########################################
        
        def globalTempDirName(job, depth):
            return job.getGlobalTempDirName() + str(depth)
        
        command, memoryAvailable, cpuAvailable, depth = job.followOnCommands[-1]
        defaultMemory = int(config.attrib["default_memory"])
        defaultCpu = int(config.attrib["default_cpu"])
        assert len(job.children) == 0
        
        startTime = time.time() 
        while True:
            job.followOnCommands.pop()
            
            ##########################################
            #Global temp dir
            ##########################################
            
            globalTempDir = makeSubDir(globalTempDirName(job, depth))
            i = 1
            while os.path.isdir(globalTempDirName(job, depth+i)):
                system("rm -rf %s" % globalTempDirName(job, depth+i))
                i += 1
                
            ##########################################
            #Old children, not yet deleted
            #
            #These may exist because of the lazy cleanup
            #we do
            ##########################################
        
            for childDir in listChildDirs(job.jobDir):
                logger.debug("Cleaning up old child %s" % childDir)
                system("rm -rf %s" % childDir)
        
            ##########################################
            #Run the job
            ##########################################
        
            if command != "": #Not a stub
                if command[:11] == "scriptTree ":
                    ##########################################
                    #Run the target
                    ##########################################
                    
                    loadStack(command).execute(job=job, stats=stats,
                                    localTempDir=localTempDir, globalTempDir=globalTempDir, 
                                    memoryAvailable=memoryAvailable, cpuAvailable=cpuAvailable, 
                                    defaultMemory=defaultMemory, defaultCpu=defaultCpu, depth=depth)
            
                else: #Is another command
                    system(command) 
            
            ##########################################
            #Cleanup/reset a successful job/checkpoint
            ##########################################
            
            job.remainingRetryCount = int(config.attrib["try_count"])
            system("rm -rf %s/*" % (localTempDir))
            job.update(depth=depth, tryCount=job.remainingRetryCount)
            
            ##########################################
            #Establish if we can run another job
            ##########################################
            
            if time.time() - startTime > maxTime:
                logger.info("We are breaking because the maximum time the job should run for has been exceeded")
                break
            
            #Deal with children
            if len(job.children) >= 1:  #We are going to have to return to the parent
                logger.info("No more jobs can run in series by this slave, its got %i children" % len(job.children))
                break
            
            if len(job.followOnCommands) == 0:
                logger.info("No more jobs can run by this slave as we have exhausted the follow ons")
                break
            
            #Get the next job and see if we have enough cpu and memory to run it..
            command, memory, cpu, depth = job.followOnCommands[-1]
            
            if memory > memoryAvailable:
                logger.info("We need more memory for the next job, so finishing")
                break
            if cpu > cpuAvailable:
                logger.info("We need more cpus for the next job, so finishing")
                break
            
            logger.info("Starting the next job")
        
        ##########################################
        #Finish up the stats
        ##########################################
        
        if stats != None:
            totalCpuTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            stats.attrib["time"] = str(time.time() - startTime)
            stats.attrib["clock"] = str(totalCpuTime - startClock)
            stats.attrib["memory"] = str(totalMemoryUsage)
            tempStatsFile = getTempStatsFile(jobTreePath)
            fileHandle = open(tempStatsFile + ".new", "w")
            ET.ElementTree(stats).write(fileHandle)
            fileHandle.close()
            os.rename(tempStatsFile + ".new", tempStatsFile) #This operation is atomic
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds" % (time.time() - startTime))
    
    ##########################################
    #Where slave goes wrong
    ##########################################
    except: #Case that something goes wrong in slave
        traceback.print_exc(file = slaveHandle)
        logger.critical("Exiting the slave because of a failed job on host %s", socket.gethostname())
        job = Job.read(jobFile)
        setupJobAfterFailure(job, config)
        job.write()
        slaveFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the slave logging
    slaveHandle.flush()
    sys.stderr = origStdErr
    sys.stdout = origStdOut
    redirectLoggerStreamHandlers(slaveHandle, sys.stderr)
    slaveHandle.close()
    
    #Copy back the log file to the global dir, if needed
    if slaveFailed:
        truncateFile(tempSlaveLogFile)
        system("mv %s %s" % (tempSlaveLogFile, job.getLogFileName()))
    #Remove the temp dir
    system("rm -rf %s" % localSlaveTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not slaveFailed) and len(job.followOnCommands) == 0 and len(job.children) == 0 and len(job.messages) == 0:
        ##########################################
        #Cleanup global files at the end of the chain
        ##########################################
        job.delete()

Example #39

0

Show file

File: master.py Project: tmfarrell/ont_dap

def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.
    """
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters,rescue jobs frequency: %s max job duration: %s" % \
                (rescueJobsFrequency, maxJobDuration))

    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()
               ) == 0  #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")

    childJobFileToParentJob, childCounts, updatedJobFiles = {}, {}, set()
    parseJobFiles(getJobFileDirName(config.attrib["job_tree"]),
                  updatedJobFiles, childJobFileToParentJob, childCounts,
                  config)
    jobBatcher = JobBatcher(config, batchSystem)
    logger.info(
        "Found %s jobs to start and %i parent jobs with children to run" %
        (len(updatedJobFiles), len(childCounts)))

    stats = config.attrib.has_key("stats")
    if stats:
        stop = Queue()
        worker = Process(target=statsAggregatorProcess,
                         args=(config.attrib["job_tree"],
                               makeTemporaryStatsDirs(
                                   config.attrib["job_tree"]), stop))
        worker.daemon = True
        worker.start()

    timeSinceJobsLastRescued = time.time(
    )  #Sets up the timing of the job rescuing method
    totalFailedJobs = 0
    logger.info("Starting the main loop")
    while True:
        if len(updatedJobFiles) > 0:
            logger.debug(
                "Built the jobs list, currently have %i jobs to update and %i jobs issued"
                % (len(updatedJobFiles), jobBatcher.getNumberOfJobsIssued()))

            for job in updatedJobFiles:
                for message in job.messages:
                    logger.critical("Got message from job at time: %s : %s" %
                                    (time.time(), message))
                job.messages = []

                if len(job.children) > 0:
                    logger.debug("Job: %s has %i children to schedule" %
                                 (job.getJobFileName(), len(job.children)))
                    children = job.children
                    job.children = []
                    for childJobFile, memory, cpu in children:
                        childJobFileToParentJob[childJobFile] = job
                    assert job not in childCounts
                    childCounts[job] = len(children)
                    jobBatcher.issueJobs(children)
                else:
                    assert len(job.followOnCommands) > 0
                    if job.remainingRetryCount > 0:
                        logger.debug(
                            "Job: %s has a new command that we can now issue" %
                            job.getJobFileName())
                        memory, cpu = job.followOnCommands[-1][1:3]
                        jobBatcher.issueJob(job.getJobFileName(), memory, cpu)
                    else:
                        totalFailedJobs += 1
                        logger.critical("Job: %s is completely failed" %
                                        job.getJobFileName())
            updatedJobFiles = set()  #We've considered them all, so reset

        if jobBatcher.getNumberOfJobsIssued() == 0:
            logger.info(
                "Only failed jobs and their dependents (%i total) are remaining, so exiting."
                % totalFailedJobs)
            break

        updatedJob = batchSystem.getUpdatedJob(
            10)  #Asks the batch system what jobs have been completed.
        if updatedJob != None:
            jobID, result = updatedJob
            if jobBatcher.hasJob(jobID):
                if result == 0:
                    logger.debug(
                        "Batch system is reporting that the job %s ended successfully"
                        % jobBatcher.getJob(jobID))
                else:
                    logger.critical(
                        "Batch system is reporting that the job %s %s failed with exit value %i"
                        % (jobID, jobBatcher.getJob(jobID), result))
                processFinishedJob(jobID, result, updatedJobFiles, jobBatcher,
                                   childJobFileToParentJob, childCounts,
                                   config)
            else:
                logger.critical(
                    "A result seems to already have been processed: %i" %
                    jobID)
        else:
            #logger.debug("Waited but no job was finished, still have %i jobs issued" % jobBatcher.getNumberOfJobsIssued())
            if time.time(
            ) - timeSinceJobsLastRescued >= rescueJobsFrequency:  #We only rescue jobs every N seconds, and when we have apparently exhausted the current job supply
                reissueOverLongJobs(updatedJobFiles, jobBatcher, config,
                                    batchSystem, childJobFileToParentJob,
                                    childCounts)
                logger.info("Reissued any over long jobs")

                hasNoMissingJobs = reissueMissingJobs(updatedJobFiles,
                                                      jobBatcher, batchSystem,
                                                      childJobFileToParentJob,
                                                      childCounts, config)
                if hasNoMissingJobs:
                    timeSinceJobsLastRescued = time.time()
                else:
                    timeSinceJobsLastRescued += 60  #This means we'll try again in a minute, providing things are quiet
                logger.info("Rescued any (long) missing jobs")

    logger.info("Finished the main loop")

    if stats:
        startTime = time.time()
        logger.info("Waiting for stats collator process to finish")
        stop.put(True)
        worker.join()
        logger.info("Stats finished collating in %s seconds" %
                    (time.time() - startTime))

    return totalFailedJobs  #Returns number of failed jobs

Example #40

0

Show file

File: jobTreeSlave.py Project: decarlin/jobTree

def processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats, environment, 
               localSlaveTempDir, localTempDir):
    """Runs a job.
    """
    from sonLib.bioio import logger
    from sonLib.bioio import system
    from sonLib.bioio import getTotalCpuTime
    from sonLib.bioio import redirectLoggerStreamHandlers
    
    assert len(job.find("children").findall("child")) == 0
    assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
    command = jobToRun.attrib["command"]
    #Copy the job file to be edited
    
    tempJob = ET.Element("job")
    ET.SubElement(tempJob, "children")
    
    #Log for job
    tempJob.attrib["log_level"] = job.attrib["log_level"]
    
    #Time length of 'ideal' job before further parallelism is required
    tempJob.attrib["job_time"] = job.attrib["job_time"]

    #Temp file dirs for job.
    tempJob.attrib["local_temp_dir"] = localTempDir
    depth = len(job.find("followOns").findall("followOn"))
    tempJob.attrib["global_temp_dir"] = os.path.join(job.attrib["global_temp_dir"], str(depth))
    if not os.path.isdir(tempJob.attrib["global_temp_dir"]): #Ensures that the global temp dirs of each level are kept separate.
        os.mkdir(tempJob.attrib["global_temp_dir"])
        os.chmod(tempJob.attrib["global_temp_dir"], 0777)
    if os.path.isdir(os.path.join(job.attrib["global_temp_dir"], str(depth+1))):
        system("rm -rf %s" % os.path.join(job.attrib["global_temp_dir"], str(depth+1)))
    assert not os.path.isdir(os.path.join(job.attrib["global_temp_dir"], str(depth+2)))
    
    #Deal with memory and cpu requirements (this pass tells the running job how much cpu and memory they have,
    #according to the batch system
    tempJob.attrib["available_memory"] = str(memoryAvailable) 
    tempJob.attrib["available_cpu"] = str(cpuAvailable)
    
    #Run the actual command
    tempLogFile = os.path.join(localSlaveTempDir, "temp.log")
    fileHandle = open(tempLogFile, 'w')
    
    if stats != None:
        tempJob.attrib["stats"] = ""
        startTime = time.time()
        startClock = getTotalCpuTime()
    
    #If you're a script tree python process, we don't need to python
    if command[:10] == "scriptTree":
        import jobTree.scriptTree.scriptTree
        savedStdErr = sys.stderr
        savedStdOut = sys.stdout
        exitValue = 0
        try: 
            sys.stderr = fileHandle 
            sys.stdout = fileHandle
            redirectLoggerStreamHandlers(savedStdErr, fileHandle)
            l = command.split()
            jobTree.scriptTree.scriptTree.run(tempJob, l[1], l[2:])
        except:
            traceback.print_exc(file = fileHandle)
            exitValue = 1
        sys.stderr = savedStdErr
        sys.stdout = savedStdOut
        redirectLoggerStreamHandlers(fileHandle, sys.stderr)
        if exitValue == 1:
            logger.critical("Caught an exception in the target being run")
    else:
        if "JOB_FILE" not in command:
            logger.critical("There is no 'JOB_FILE' string in the command to be run to take the job-file argument: %s" % command)
            job.attrib["colour"] = "red" #Update the colour
        
        #Now write the temp job file
        tempFile = os.path.join(localSlaveTempDir, "tempJob.xml")
        fileHandle2 = open(tempFile, 'w')  
        tree = ET.ElementTree(tempJob)
        tree.write(fileHandle2)
        fileHandle2.close()
        logger.info("Copied the jobs files ready for the job")
        
        process = subprocess.Popen(command.replace("JOB_FILE", tempFile), shell=True, stdout=fileHandle, stderr=subprocess.STDOUT, env=environment)
        sts = os.waitpid(process.pid, 0)
        exitValue = sts[1]
        if exitValue == 0:
            tempJob = ET.parse(tempFile).getroot()
        
    fileHandle.close()
    truncateFile(tempLogFile, int(job.attrib["max_log_file_size"]))
    
    logger.info("Ran the job command=%s with exit status %i" % (command, exitValue))
    
    if exitValue == 0:
        logger.info("Passed the job, okay")
        
        if stats != None:
            jobTag = ET.SubElement(stats, "job", { "time":str(time.time() - startTime), "clock":str(getTotalCpuTime() - startClock) })
            if tempJob.find("stack") != None:
                jobTag.append(tempJob.find("stack"))
        
        job.attrib["colour"] = "black" #Update the colour
        
        #Deal with any logging messages directed at the master
        if tempJob.find("messages") != None:
            messages = job.find("messages")
            if messages == None:
                messages = ET.SubElement(job, "messages")
            for messageTag in tempJob.find("messages").findall("message"):
                messages.append(messageTag)
        
        #Update the runtime of the stack..
        totalRuntime = float(job.attrib["total_time"])  #This is the estimate runtime of the jobs on the followon stack
        runtime = float(jobToRun.attrib["time"])
        totalRuntime -= runtime
        if totalRuntime < 0.0:
            totalRuntime = 0.0
        
        #The children
        children = job.find("children")
        assert len(children.findall("child")) == 0 #The children
        assert tempJob.find("children") != None
        for child in tempJob.find("children").findall("child"):
            memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, child)
            ET.SubElement(children, "child", { "command":child.attrib["command"], 
                    "time":str(compTime), "memory":str(memory), "cpu":str(cpu) })
            logger.info("Making a child with command: %s" % (child.attrib["command"]))
        
        #The follow on command
        followOns = job.find("followOns")
        followOns.remove(followOns.findall("followOn")[-1]) #Remove the old job
        if tempJob.attrib.has_key("command"):
            memory, cpu, compTime = getMemoryCpuAndTimeRequirements(job, tempJob)
            ET.SubElement(followOns, "followOn", { "command":tempJob.attrib["command"], 
                    "time":str(compTime), "memory":str(memory), "cpu":str(cpu) })
            ##Add the runtime to the total runtime..
            totalRuntime += compTime
            logger.info("Making a follow on job with command: %s" % tempJob.attrib["command"])
            
        elif len(tempJob.find("children").findall("child")) != 0: #This is to keep the stack of follow on jobs consistent.
            ET.SubElement(followOns, "followOn", { "command":"echo JOB_FILE", "time":"0", "memory":"1000000", "cpu":"1" })
            logger.info("Making a stub follow on job")
        #Write back the runtime, after addin the follow on time and subtracting the time of the run job.
        job.attrib["total_time"] = str(totalRuntime)
    else:
        logger.critical("Failed the job")
        job.attrib["colour"] = "red" #Update the colour
    
    #Clean up
    system("rm -rf %s/*" % (localTempDir))
    logger.info("Cleaned up by removing the contents of the local temporary file directory for the job")
    
    return tempLogFile

Example #41

0

Show file

File: jobTreeSlave.py Project: decarlin/jobTree

def main():
    sys.path.append(sys.argv[1])
    sys.argv.remove(sys.argv[1])
    
    #Now we can import all the stuff..
    from sonLib.bioio import getBasicOptionParser
    from sonLib.bioio import parseBasicOptions
    from sonLib.bioio import logger
    from sonLib.bioio import addLoggingFileHandler
    from sonLib.bioio import setLogLevel
    from sonLib.bioio import getTotalCpuTime
    from sonLib.bioio import getTempDirectory
    from jobTree.src.master import writeJobs
    from jobTree.src.master import readJob
    from sonLib.bioio import system
    
    ##########################################
    #Construct the arguments.
    ##########################################
    
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")
    
    parser.add_option("--job", dest="jobFile", 
                      help="Job file containing command to run",
                      default="None")
    
    options, args = parseBasicOptions(parser)
    assert len(args) == 0

    ##########################################
    #Parse the job.
    ##########################################
    
    job = readJob(options.jobFile)
    
    ##########################################
    #Setup the logging
    ##########################################
    
    #Setup the logging
    setLogLevel(job.attrib["log_level"])
    addLoggingFileHandler(job.attrib["slave_log_file"], rotatingLogging=False)
    logger.info("Parsed arguments and set up logging")
    
    ##########################################
    #Setup the stats, if requested
    ##########################################
    
    if job.attrib.has_key("stats"):
        startTime = time.time()
        startClock = getTotalCpuTime()
        stats = ET.Element("slave")
    else:
        stats = None
    
    ##########################################
    #Load the environment for the job
    ##########################################
    
    #First load the environment for the job.
    fileHandle = open(job.attrib["environment_file"], 'r')
    environment = cPickle.load(fileHandle)
    fileHandle.close()
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)
    #os.environ = environment
    #os.putenv(key, value)
    logger.info("Loaded the environment for the process")
        
    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in.
    localSlaveTempDir = getTempDirectory()
    localTempDir = os.path.join(localSlaveTempDir, "localTempDir") 
    os.mkdir(localTempDir)
    os.chmod(localTempDir, 0777)
    
    ##########################################
    #Run the script.
    ##########################################
    
    maxTime = float(job.attrib["job_time"])
    assert maxTime > 0.0
    assert maxTime < sys.maxint
    jobToRun = job.find("followOns").findall("followOn")[-1]
    memoryAvailable = int(jobToRun.attrib["memory"])
    cpuAvailable = int(jobToRun.attrib["cpu"])
    startTime = time.time()
    while True:
        tempLogFile = processJob(job, jobToRun, memoryAvailable, cpuAvailable, stats, environment, localSlaveTempDir, localTempDir)
        
        if job.attrib["colour"] != "black": 
            logger.critical("Exiting the slave because of a failed job")
            system("mv %s %s" % (tempLogFile, job.attrib["log_file"])) #Copy back the job log file, because we saw failure
            break
        elif job.attrib.has_key("reportAllJobLogFiles"):
            logger.info("Exiting because we've been asked to report all logs, and this involves returning to the master")
            #Copy across the log file
            system("mv %s %s" % (tempLogFile, job.attrib["log_file"]))
            break
   
        totalRuntime = float(job.attrib["total_time"])  #This is the estimate runtime of the jobs on the followon stack
        
        childrenNode = job.find("children")
        childrenList = childrenNode.findall("child")
        #childRuntime = sum([ float(child.attrib["time"]) for child in childrenList ])
            
        if len(childrenList) >= 2: # or totalRuntime + childRuntime > maxTime: #We are going to have to return to the parent
            logger.info("No more jobs can run in series by this slave, its got %i children" % len(childrenList))
            break
        
        if time.time() - startTime > maxTime:
            logger.info("We are breaking because the maximum time the job should run for has been exceeded")
            break
        
        followOns = job.find("followOns")
        while len(childrenList) > 0:
            child = childrenList.pop()
            childrenNode.remove(child)
            totalRuntime += float(child.attrib["time"])
            ET.SubElement(followOns, "followOn", child.attrib.copy())
        #assert totalRuntime <= maxTime + 1 #The plus one second to avoid unimportant rounding errors
        job.attrib["total_time"] = str(totalRuntime)
        assert len(childrenNode.findall("child")) == 0
        
        if len(followOns.findall("followOn")) == 0:
            logger.info("No more jobs can run by this slave as we have exhausted the follow ons")
            break
        
        #Get the next job and see if we have enough cpu and memory to run it..
        jobToRun = job.find("followOns").findall("followOn")[-1]
        if int(jobToRun.attrib["memory"]) > memoryAvailable:
            logger.info("We need more memory for the next job, so finishing")
            break
        if int(jobToRun.attrib["cpu"]) > cpuAvailable:
            logger.info("We need more cpus for the next job, so finishing")
            break
        
        ##Updated the job so we can start the next loop cycle
        job.attrib["colour"] = "grey"
        writeJobs([ job ])
        logger.info("Updated the status of the job to grey and starting the next job")
    
    #Write back the job file with the updated jobs, using the checkpoint method.
    writeJobs([ job ])
    logger.info("Written out an updated job file")
    
    logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds" % (time.time() - startTime))
    
    ##########################################
    #Cleanup the temporary directory
    ##########################################
    
    system("rm -rf %s" % localSlaveTempDir)
    
    ##########################################
    #Finish up the stats
    ##########################################
    
    if stats != None:
        stats.attrib["time"] = str(time.time() - startTime)
        stats.attrib["clock"] = str(getTotalCpuTime() - startClock)
        fileHandle = open(job.attrib["stats"], 'w')
        ET.ElementTree(stats).write(fileHandle)
        fileHandle.close()

Example #42

0

Show file

File: master.py Project: decarlin/jobTree

 def checkFileExists(fileName, type):
     if not os.path.isfile(fileName): #We need to keep these files in existence.
         open(fileName, 'w').close()
         logger.critical("The file %s of type %s for job %s had disappeared" % (fileName, type, jobFile))

Example #43

0

Show file

File: master.py Project: adamnovak/jobTree

def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.
    """
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters,rescue jobs frequency: %s max job duration: %s" % \
                (rescueJobsFrequency, maxJobDuration))
    
    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0 #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")
    
    childJobFileToParentJob, childCounts, updatedJobFiles = {}, {}, set()
    parseJobFiles(getJobFileDirName(config.attrib["job_tree"]), updatedJobFiles, childJobFileToParentJob, childCounts, config)
    jobBatcher = JobBatcher(config, batchSystem)
    logger.info("Found %s jobs to start and %i parent jobs with children to run" % (len(updatedJobFiles), len(childCounts)))
    
    stats = config.attrib.has_key("stats")
    if stats:
        stop = Queue()
        worker = Process(target=statsAggregatorProcess, args=(config.attrib["job_tree"], makeTemporaryStatsDirs(config.attrib["job_tree"]), stop))
        worker.daemon = True
        worker.start()
        
    timeSinceJobsLastRescued = time.time() #Sets up the timing of the job rescuing method
    totalFailedJobs = 0
    logger.info("Starting the main loop")
    while True: 
        if len(updatedJobFiles) > 0:
            logger.debug("Built the jobs list, currently have %i jobs to update and %i jobs issued" % (len(updatedJobFiles), jobBatcher.getNumberOfJobsIssued()))
        
            for job in updatedJobFiles:         
                for message in job.messages:
                    logger.critical("Got message from job at time: %s : %s" % (time.time(), message))
                job.messages = []
                
                if len(job.children) > 0:
                    logger.debug("Job: %s has %i children to schedule" % (job.getJobFileName(), len(job.children)))
                    children = job.children
                    job.children = []
                    for childJobFile, memory, cpu in children:
                        childJobFileToParentJob[childJobFile] = job
                    assert job not in childCounts
                    childCounts[job] = len(children)
                    jobBatcher.issueJobs(children)
                else:
                    assert len(job.followOnCommands) > 0
                    if job.remainingRetryCount > 0:
                        logger.debug("Job: %s has a new command that we can now issue" % job.getJobFileName())
                        memory, cpu = job.followOnCommands[-1][1:3]
                        jobBatcher.issueJob(job.getJobFileName(), memory, cpu)
                    else:
                        totalFailedJobs += 1
                        logger.critical("Job: %s is completely failed" % job.getJobFileName())
            updatedJobFiles = set() #We've considered them all, so reset
             
        if jobBatcher.getNumberOfJobsIssued() == 0:
            logger.info("Only failed jobs and their dependents (%i total) are remaining, so exiting." % totalFailedJobs)
            break 
        
        updatedJob = batchSystem.getUpdatedJob(10) #Asks the batch system what jobs have been completed.
        if updatedJob != None: 
            jobID, result = updatedJob
            if jobBatcher.hasJob(jobID): 
                if result == 0:
                    logger.debug("Batch system is reporting that the job %s ended successfully" % jobBatcher.getJob(jobID))   
                else:
                    logger.critical("Batch system is reporting that the job %s %s failed with exit value %i" % (jobID, jobBatcher.getJob(jobID), result))  
                processFinishedJob(jobID, result, updatedJobFiles, jobBatcher, childJobFileToParentJob, childCounts, config)
            else:
                logger.critical("A result seems to already have been processed: %i" % jobID)
        else:
            #logger.debug("Waited but no job was finished, still have %i jobs issued" % jobBatcher.getNumberOfJobsIssued())
            if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency: #We only rescue jobs every N seconds, and when we have apparently exhausted the current job supply
                reissueOverLongJobs(updatedJobFiles, jobBatcher, config, batchSystem, childJobFileToParentJob, childCounts)
                logger.info("Reissued any over long jobs")
                
                hasNoMissingJobs = reissueMissingJobs(updatedJobFiles, jobBatcher, batchSystem, childJobFileToParentJob, childCounts, config)
                if hasNoMissingJobs:
                    timeSinceJobsLastRescued = time.time()
                else:
                    timeSinceJobsLastRescued += 60 #This means we'll try again in a minute, providing things are quiet
                logger.info("Rescued any (long) missing jobs")
    
    logger.info("Finished the main loop")   
    
    if stats:
        startTime = time.time()
        logger.info("Waiting for stats collator process to finish")  
        stop.put(True)
        worker.join()
        logger.info("Stats finished collating in %s seconds" % (time.time() - startTime))  
    
    return totalFailedJobs #Returns number of failed jobs

Example #44

0

Show file

File: master.py Project: decarlin/jobTree

def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.
    """
    waitDuration = float(config.attrib["wait_duration"])
    assert waitDuration >= 0
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters, wait duration %s, rescue jobs frequency: %s max job duration: %s" % \
                (waitDuration, rescueJobsFrequency, maxJobDuration))
    
    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0 #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")
    
    jobFiles = config.attrib["job_file_dir"].listFiles()
    logger.info("Got a list of job files")
    
    #Repair the job tree using any .old files
    fixJobsList(config, jobFiles)
    logger.info("Fixed the job files using any .old files")
    
    #Get jobs that were running, or that had failed reset to 'grey' status
    restartFailedJobs(config, jobFiles)
    logger.info("Reworked failed jobs")
    
    updatedJobFiles = set() #Jobs whose status needs updating, either because they have finished, or because they need to be started.
    for jobFile in jobFiles:
        job = readJob(jobFile)
        if job.attrib["colour"] not in ("blue"):
            updatedJobFiles.add(jobFile)
    logger.info("Got the active (non blue) job files")
    
    totalJobFiles = len(jobFiles) #Total number of job files we have.
    jobIDsToJobsHash = {} #A hash of the currently running jobs ids, made by the batch system.
    
    idealJobTime = float(config.attrib["job_time"]) 
    assert idealJobTime > 0.0
    
    reportAllJobLogFiles = bool(int(config.attrib["reportAllJobLogFiles"]))
    
    stats = config.attrib.has_key("stats")
    if stats:
        startTime = time.time()
        startClock = getTotalCpuTime()
        
    #Stuff do handle the maximum number of issued jobs
    queueingJobs = []
    maxJobs = int(config.attrib["max_jobs"])
    cpusUsed = 0
    
    logger.info("Starting the main loop")
    timeSinceJobsLastRescued = time.time() - rescueJobsFrequency + 100 #We hack it so that we rescue jobs after the first 100 seconds to get around an apparent parasol bug
    while True: 
        if len(updatedJobFiles) > 0:
            logger.debug("Built the jobs list, currently have %i job files, %i jobs to update and %i jobs currently issued" % (totalJobFiles, len(updatedJobFiles), len(jobIDsToJobsHash)))
        
        for jobFile in list(updatedJobFiles):
            job = readJob(jobFile)
            assert job.attrib["colour"] is not "blue"
            
            ##Check the log files exist, because they must ultimately be cleaned up by their respective file trees.
            def checkFileExists(fileName, type):
                if not os.path.isfile(fileName): #We need to keep these files in existence.
                    open(fileName, 'w').close()
                    logger.critical("The file %s of type %s for job %s had disappeared" % (fileName, type, jobFile))
            checkFileExists(job.attrib["log_file"], "log_file")
            checkFileExists(job.attrib["slave_log_file"], "slave_log_file")
            if stats:
                checkFileExists(job.attrib["stats"], "stats")
            
            def reissueJob(job):
                #Reset the log files for the job.
                updatedJobFiles.remove(jobFile)
                open(job.attrib["slave_log_file"], 'w').close()
                open(job.attrib["log_file"], 'w').close()
                assert job.attrib["colour"] == "grey"
                return issueJobs([ job ], jobIDsToJobsHash, batchSystem, queueingJobs, maxJobs, cpusUsed)
                
            def makeGreyAndReissueJob(job):
                job.attrib["colour"] = "grey"
                writeJobs([ job ])
                return reissueJob(job)
            
            if job.attrib["colour"] == "grey": #Get ready to start the job
                cpusUsed = reissueJob(job)
            elif job.attrib["colour"] == "black": #Job has finished okay
                logger.debug("Job: %s has finished okay" % job.attrib["file"])
                if reportAllJobLogFiles:
                    reportJobLogFiles(job)
                #Deal with stats
                if stats:
                    system("cat %s >> %s" % (job.attrib["stats"], config.attrib["stats"]))
                    open(job.attrib["stats"], 'w').close() #Reset the stats file
                if job.find("messages") != None:
                    for message in job.find("messages").findall("message"):
                        logger.critical("Received the following message from job: %s" % message.attrib["message"])
                    job.remove(job.find("messages"))
                childCount = int(job.attrib["child_count"])
                blackChildCount = int(job.attrib["black_child_count"])
                assert childCount == blackChildCount #Has no currently running child jobs
                #Launch any unborn children
                unbornChildren = job.find("children")
                unbornChild = unbornChildren.find("child")
                if unbornChild != None: #We must give birth to the unborn children
                    logger.debug("Job: %s has %i children to schedule" % (job.attrib["file"], len(unbornChildren.findall("child"))))
                    newChildren = []
                    while unbornChild != None:
                        newJob = createJob(unbornChild.attrib, job.attrib["file"], config)
                        totalJobFiles += 1
                        newChildren.append(newJob)
                        unbornChildren.remove(unbornChild)
                        unbornChild = unbornChildren.find("child")
                    
                    updatedJobFiles.remove(job.attrib["file"])
                    job.attrib["child_count"] = str(childCount + len(newChildren))
                    job.attrib["colour"] = "blue" #Blue - has children running.
                    writeJobs([ job ] + newChildren ) #Check point
                    cpusUsed = issueJobs(newChildren, jobIDsToJobsHash, batchSystem, queueingJobs, maxJobs, cpusUsed) #Issue the new children directly
                    
                elif len(job.find("followOns").findall("followOn")) != 0: #Has another job
                    logger.debug("Job: %s has a new command that we can now issue" % job.attrib["file"])
                    ##Reset the job run info
                    job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
                    cpusUsed = makeGreyAndReissueJob(job)
                    
                else: #Job has finished, so we can defer to any parent
                    logger.debug("Job: %s is now dead" % job.attrib["file"])
                    job.attrib["colour"] = "dead"
                    if job.attrib.has_key("parent"):
                        parent = readJob(job.attrib["parent"])
                        assert job.attrib["parent"] != jobFile
                        assert parent.attrib["colour"] == "blue"
                        assert int(parent.attrib["black_child_count"]) < int(parent.attrib["child_count"])
                        parent.attrib["black_child_count"] = str(int(parent.attrib["black_child_count"]) + 1)
                        if int(parent.attrib["child_count"]) == int(parent.attrib["black_child_count"]):
                            parent.attrib["colour"] = "black"
                            assert parent.attrib["file"] not in updatedJobFiles
                            updatedJobFiles.add(parent.attrib["file"])
                        writeJobs([ job, parent ]) #Check point
                    updatedJobFiles.remove(job.attrib["file"])
                    totalJobFiles -= 1
                    deleteJob(job, config)
                         
            elif job.attrib["colour"] == "red": #Job failed
                logger.critical("Job: %s failed" % job.attrib["file"])
                reportJobLogFiles(job)
                #Checks
                assert len(job.find("children").findall("child")) == 0
                assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
                
                remainingRetyCount = int(job.attrib["remaining_retry_count"])
                if remainingRetyCount > 0: #Give it another try, maybe there is a bad node somewhere
                    job.attrib["remaining_retry_count"] = str(remainingRetyCount-1)
                    logger.critical("Job: %s will be restarted, it has %s goes left" % (job.attrib["file"], job.attrib["remaining_retry_count"]))
                    cpusUsed = makeGreyAndReissueJob(job)
                else:
                    assert remainingRetyCount == 0
                    updatedJobFiles.remove(job.attrib["file"]) #We remove the job and neither delete it or reissue it
                    logger.critical("Job: %s is completely failed" % job.attrib["file"])
                    
            else: #This case should only occur after failure
                logger.debug("Job: %s is already dead, we'll get rid of it" % job.attrib["file"])
                assert job.attrib["colour"] == "dead"
                updatedJobFiles.remove(job.attrib["file"])
                totalJobFiles -= 1
                deleteJob(job, config)
                
        #This command is issued to ensure any queing jobs are issued at the end of the loop
        cpusUsed = issueJobs([], jobIDsToJobsHash, batchSystem, queueingJobs, maxJobs, cpusUsed)
      
        if len(jobIDsToJobsHash) == 0 and len(updatedJobFiles) == 0:
            logger.info("Only failed jobs and their dependents (%i total) are remaining, so exiting." % totalJobFiles)
            assert cpusUsed == 0
            break
        
        if len(updatedJobFiles) > 0:
            updatedJobs = batchSystem.getUpdatedJobs() #Asks the batch system what jobs have been completed.
        else:
            updatedJobs = pauseForUpdatedJobs(batchSystem.getUpdatedJobs) #Asks the batch system what jobs have been completed.
        
        for jobID in updatedJobs.keys(): #Runs through a map of updated jobs and there status, 
            result = updatedJobs[jobID]
            if jobIDsToJobsHash.has_key(jobID): 
                if result == 0:
                    logger.debug("Batch system is reporting that the job %s ended successfully" % jobIDsToJobsHash[jobID][0])   
                else:
                    logger.critical("Batch system is reporting that the job %s failed with exit value %i" % (jobIDsToJobsHash[jobID][0], result))  
                cpusUsed = processFinishedJob(jobID, result, updatedJobFiles, jobIDsToJobsHash, cpusUsed)
            else:
                logger.info("A result seems to already have been processed: %i" % jobID) #T
        
        if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency: #We only rescue jobs every N seconds
            cpusUsed = reissueOverLongJobs(updatedJobFiles, jobIDsToJobsHash, config, batchSystem, cpusUsed)
            logger.info("Reissued any over long jobs")
            
            hasNoMissingJobs, cpusUsed = reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem, cpusUsed)
            if hasNoMissingJobs:
                timeSinceJobsLastRescued = time.time()
            else:
                timeSinceJobsLastRescued += 60 #This means we'll try again in 60 seconds
            logger.info("Rescued any (long) missing jobs")
        #Going to sleep to let the job system catch up.
        time.sleep(waitDuration)
        ##Check that the total number of cpus
        assert sum([ cpus for jobID, cpus in jobIDsToJobsHash.values() ]) == cpusUsed
        assert cpusUsed <= maxJobs
    
    if stats:
        fileHandle = open(config.attrib["stats"], 'a')
        fileHandle.write("<total_time time='%s' clock='%s'/></stats>" % (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
        fileHandle.close()
    
    logger.info("Finished the main loop")     
    
    return totalJobFiles #Returns number of failed jobs

Example #45

0

Show file

File: blastTest.py Project: sanboissel/cactus

def checkResultsAreApproximatelyEqual(resultsComparator, closeness=0.95):
    """Checks that the comparisons show the two blasts are suitably close together.
    """
    logger.critical("Results are: %s" % resultsComparator)
    assert resultsComparator.sensitivity >= closeness
    assert resultsComparator.specificity >= closeness

Example #46

0

Show file

File: jobTreeSlave.py Project: tmfarrell/ont_dap

def main():
    sys.path.append(sys.argv[1])
    sys.argv.remove(sys.argv[1])
    
    #Now we can import all the stuff..
    from sonLib.bioio import getBasicOptionParser
    from sonLib.bioio import parseBasicOptions
    from sonLib.bioio import logger
    from sonLib.bioio import addLoggingFileHandler, redirectLoggerStreamHandlers
    from sonLib.bioio import setLogLevel
    from sonLib.bioio import getTotalCpuTime, getTotalCpuTimeAndMemoryUsage
    from sonLib.bioio import getTempDirectory
    from sonLib.bioio import makeSubDir
    from jobTree.src.job import Job
    from jobTree.src.master import getEnvironmentFileName, getConfigFileName, listChildDirs, getTempStatsFile, setupJobAfterFailure
    from sonLib.bioio import system
    
    ########################################## 
    #Input args
    ##########################################
    
    jobTreePath = sys.argv[1]
    jobFile = sys.argv[2]
    
    ##########################################
    #Load the environment for the job
    ##########################################
    
    #First load the environment for the job.
    fileHandle = open(getEnvironmentFileName(jobTreePath), 'r')
    environment = cPickle.load(fileHandle)
    fileHandle.close()
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)
    #os.environ = environment
    #os.putenv(key, value)
        
    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in.
    localSlaveTempDir = getTempDirectory()
    localTempDir = makeSubDir(os.path.join(localSlaveTempDir, "localTempDir"))
    
    ##########################################
    #Setup the logging
    ##########################################
    
    #Setup the logging. This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?    
    tempSlaveLogFile = os.path.join(localSlaveTempDir, "slave_log.txt")
    
    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)
    
    #Open the file to send stdout/stderr to.
    logDescriptor = os.open(tempSlaveLogFile, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logDescriptor, 1)
    
    #Replace standard error with a descriptor for the log file
    os.dup2(logDescriptor, 2)
    
    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...
    
    #Close the descriptor we used to open the file
    os.close(logDescriptor)
    
    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    
    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    #Put a message at the top of the log, just to make sure it's working.
    print "---JOBTREE SLAVE OUTPUT LOG---"
    sys.stdout.flush()
    
    #Log the number of open file descriptors so we can tell if we're leaking
    #them.
    logger.debug("Next available file descriptor: {}".format(
        nextOpenDescriptor()))
    
    ##########################################
    #Parse input files
    ##########################################
    
    config = ET.parse(getConfigFileName(jobTreePath)).getroot()
    setLogLevel(config.attrib["log_level"])
    job = Job.read(jobFile)
    job.messages = [] #This is the only way to stop messages logging twice, as are read only in the master
    job.children = [] #Similarly, this is where old children are flushed out.
    job.write() #Update status, to avoid reissuing children after running a follow on below.
    if os.path.exists(job.getLogFileName()): #This cleans the old log file
        os.remove(job.getLogFileName())
    logger.info("Parsed arguments and set up logging")

     #Try loop for slave logging
    ##########################################
    #Setup the stats, if requested
    ##########################################
    
    if config.attrib.has_key("stats"):
        startTime = time.time()
        startClock = getTotalCpuTime()
        stats = ET.Element("slave")
    else:
        stats = None
    
    ##########################################
    #The max time 
    ##########################################
    
    maxTime = float(config.attrib["job_time"])
    assert maxTime > 0.0
    assert maxTime < sys.maxint

    ##########################################
    #Slave log file trapped from here on in
    ##########################################

    slaveFailed = False
    try:
        
        ##########################################
        #The next job
        ##########################################
        
        def globalTempDirName(job, depth):
            return job.getGlobalTempDirName() + str(depth)
        
        command, memoryAvailable, cpuAvailable, depth = job.followOnCommands[-1]
        defaultMemory = int(config.attrib["default_memory"])
        defaultCpu = int(config.attrib["default_cpu"])
        assert len(job.children) == 0
        
        startTime = time.time() 
        while True:
            job.followOnCommands.pop()
            
            ##########################################
            #Global temp dir
            ##########################################
            
            globalTempDir = makeSubDir(globalTempDirName(job, depth))
            i = 1
            while os.path.isdir(globalTempDirName(job, depth+i)):
                system("rm -rf %s" % globalTempDirName(job, depth+i))
                i += 1
                
            ##########################################
            #Old children, not yet deleted
            #
            #These may exist because of the lazy cleanup
            #we do
            ##########################################
        
            for childDir in listChildDirs(job.jobDir):
                logger.debug("Cleaning up old child %s" % childDir)
                system("rm -rf %s" % childDir)
        
            ##########################################
            #Run the job
            ##########################################
        
            if command != "": #Not a stub
                if command[:11] == "scriptTree ":
                    ##########################################
                    #Run the target
                    ##########################################
                    
                    loadStack(command).execute(job=job, stats=stats,
                                    localTempDir=localTempDir, globalTempDir=globalTempDir, 
                                    memoryAvailable=memoryAvailable, cpuAvailable=cpuAvailable, 
                                    defaultMemory=defaultMemory, defaultCpu=defaultCpu, depth=depth)
            
                else: #Is another command
                    system(command) 
            
            ##########################################
            #Cleanup/reset a successful job/checkpoint
            ##########################################
            
            job.remainingRetryCount = int(config.attrib["try_count"])
            system("rm -rf %s/*" % (localTempDir))
            job.update(depth=depth, tryCount=job.remainingRetryCount)
            
            ##########################################
            #Establish if we can run another job
            ##########################################
            
            if time.time() - startTime > maxTime:
                logger.info("We are breaking because the maximum time the job should run for has been exceeded")
                break
            
            #Deal with children
            if len(job.children) >= 1:  #We are going to have to return to the parent
                logger.info("No more jobs can run in series by this slave, its got %i children" % len(job.children))
                break
            
            if len(job.followOnCommands) == 0:
                logger.info("No more jobs can run by this slave as we have exhausted the follow ons")
                break
            
            #Get the next job and see if we have enough cpu and memory to run it..
            command, memory, cpu, depth = job.followOnCommands[-1]
            
            if memory > memoryAvailable:
                logger.info("We need more memory for the next job, so finishing")
                break
            if cpu > cpuAvailable:
                logger.info("We need more cpus for the next job, so finishing")
                break
            
            logger.info("Starting the next job")
        
        ##########################################
        #Finish up the stats
        ##########################################
        
        if stats != None:
            totalCpuTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            stats.attrib["time"] = str(time.time() - startTime)
            stats.attrib["clock"] = str(totalCpuTime - startClock)
            stats.attrib["memory"] = str(totalMemoryUsage)
            tempStatsFile = getTempStatsFile(jobTreePath)
            fileHandle = open(tempStatsFile + ".new", "w")
            ET.ElementTree(stats).write(fileHandle)
            fileHandle.close()
            os.rename(tempStatsFile + ".new", tempStatsFile) #This operation is atomic
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds" % (time.time() - startTime))
    
    ##########################################
    #Where slave goes wrong
    ##########################################
    except: #Case that something goes wrong in slave
        traceback.print_exc()
        logger.critical("Exiting the slave because of a failed job on host %s", socket.gethostname())
        job = Job.read(jobFile)
        setupJobAfterFailure(job, config)
        job.write()
        slaveFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the slave logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)
    
    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)
    
    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdOut, 2)
    
    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr
    
    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)
    
    #Now our file handles are in exactly the state they were in before.
    
    #Copy back the log file to the global dir, if needed
    if slaveFailed:
        truncateFile(tempSlaveLogFile)
        system("mv %s %s" % (tempSlaveLogFile, job.getLogFileName()))
    #Remove the temp dir
    system("rm -rf %s" % localSlaveTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not slaveFailed) and len(job.followOnCommands) == 0 and len(job.children) == 0 and len(job.messages) == 0:
        ##########################################
        #Cleanup global files at the end of the chain
        ##########################################
        job.delete()

Example #47

0

Show file

File: master.py Project: decarlin/jobTree

def processFinishedJob(jobID, resultStatus, updatedJobFiles, jobIDsToJobsHash, cpusUsed):
    """Function reads a processed job file and updates it state.
    """
    assert jobID in jobIDsToJobsHash
    jobFile, cpus = jobIDsToJobsHash.pop(jobID)
    cpusUsed -= cpus #Fix the tally of the total number of cpus being
    
    updatingFileIsPresent = os.path.isfile(jobFile + ".updating")
    newFileIsPresent = os.path.isfile(jobFile + ".new")
    
    if resultStatus == 0 and updatingFileIsPresent:
        logger.critical("Despite the batch system claiming success there is a .updating file present: %s", jobFile + ".updating")
        
    if resultStatus == 0 and newFileIsPresent:
        logger.critical("Despite the batch system claiming success there is a .new file present: %s", jobFile + ".new")

    if resultStatus != 0 or newFileIsPresent or updatingFileIsPresent: #Job not successful according to batchsystem, or according to the existance of a .new or .updating file
        if updatingFileIsPresent: #The job failed while attempting to write the job file.
            logger.critical("There was an .updating file for the crashed job: %s" % jobFile)
            if os.path.isfile(jobFile + ".new"): #The job failed while writing the updated job file.
                logger.critical("There was an .new file for the crashed job: %s" % jobFile)
                os.remove(jobFile + ".new") #The existance of the .updating file means it wasn't complete
            os.remove(jobFile + ".updating") #Delete second the updating file second to preserve a correct state
            assert os.path.isfile(jobFile)
            job = readJob(jobFile) #The original must still be there.
            assert job.find("children").find("child") == None #The original can not reflect the end state of the job.
            assert int(job.attrib["black_child_count"]) == int(job.attrib["child_count"])
            job.attrib["colour"] = "red" #It failed, so we mark it so and continue.
            writeJobs([ job ])
            logger.critical("We've reverted to the original job file and marked it as failed: %s" % jobFile)
        else:
            if newFileIsPresent: #The job was not properly updated before crashing
                logger.critical("There is a valid .new file %s" % jobFile)
                if os.path.isfile(jobFile):
                    os.remove(jobFile)
                os.rename(jobFile + ".new", jobFile)
                job = readJob(jobFile)
                if job.attrib["colour"] == "grey": #The job failed while preparing to run another job on the slave
                    assert job.find("children").find("child") == None #File 
                    job.attrib["colour"] = "red"
                    writeJobs([ job ])
                assert job.attrib["colour"] in ("black", "red")
            else:
                logger.critical("There was no valid .new file %s" % jobFile)
                assert os.path.isfile(jobFile)
                job = readJob(jobFile) #The job may have failed before or after creating this file, we check the state.
                if job.attrib["colour"] == "black": #The job completed okay, so we'll keep it
                    logger.critical("Despite the batch system job failing, the job appears to have completed okay")
                else:
                    assert job.attrib["colour"] in ("grey", "red")
                    assert job.find("children").find("child") == None #File 
                    assert int(job.attrib["black_child_count"]) == int(job.attrib["child_count"])
                    if job.attrib["colour"] == "grey":
                        job.attrib["colour"] = "red"
                        writeJobs([ job ])
                    logger.critical("We've reverted to the original job file and marked it as failed: %s" % jobFile)

    assert jobFile not in updatedJobFiles
    updatedJobFiles.add(jobFile) #Now we know the job is done we can add it to the list of updated job files
    logger.debug("Added job: %s to active jobs" % jobFile)
    
    return cpusUsed