Example #1
0
def updateParentStatus(jobFile, updatedJobFiles, childJobFileToParentJob,
                       childCounts):
    """Update status of parent for finished child job.
    """
    while True:
        if jobFile not in childJobFileToParentJob:
            assert len(updatedJobFiles) == 0
            assert len(childJobFileToParentJob) == 0
            assert len(childCounts) == 0
            break
        parentJob = childJobFileToParentJob.pop(jobFile)
        childCounts[parentJob] -= 1
        assert childCounts[parentJob] >= 0
        if childCounts[parentJob] == 0:  #Job is done
            childCounts.pop(parentJob)
            logger.debug("Parent job %s has all its children run successfully",
                         parentJob.getJobFileName())
            assert parentJob not in updatedJobFiles
            if len(parentJob.followOnCommands) > 0:
                updatedJobFiles.add(
                    parentJob
                )  #Now we know the job is done we can add it to the list of updated job files
                break
            else:
                jobFile = parentJob.getJobFileName()
        else:
            break
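To make the bookkeeping above concrete, here is a small self-contained sketch of driving updateParentStatus with toy data; the _ToyJob class, the file names, and the follow-on command are hypothetical stand-ins for the real Job objects, and updateParentStatus (with its module-level logger) is assumed to be importable.
class _ToyJob(object):
    #Minimal stand-in exposing only what updateParentStatus touches.
    def __init__(self, fileName, followOnCommands):
        self.fileName = fileName
        self.followOnCommands = followOnCommands
    def getJobFileName(self):
        return self.fileName

parent = _ToyJob("parent.job", followOnCommands=["followOnCommand"])
childJobFileToParentJob = {"child1.job": parent, "child2.job": parent}
childCounts = {parent: 2}
updatedJobFiles = set()

updateParentStatus("child1.job", updatedJobFiles, childJobFileToParentJob, childCounts)
assert parent not in updatedJobFiles  #One child is still outstanding
updateParentStatus("child2.job", updatedJobFiles, childJobFileToParentJob, childCounts)
assert parent in updatedJobFiles      #Both children done and the parent has follow-ons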
Example #2
0
 def issueJobs(self, jobCommands):
     """Issues job commands to parasol.
     """
     issuedJobs = {}
     for jobCommand, memory, cpu, logFile in jobCommands:
         assert memory != None
         assert cpu != None
         assert logFile != None
         pattern = re.compile("your job ([0-9]+).*")
         command = "parasol -verbose -ram=%i -cpu=%i -results=%s add job '%s'" % (memory, cpu, self.parasolResultsFile, jobCommand)
         while True:
             #time.sleep(0.1) #Sleep to let parasol catch up #Apparently unnecessary
             popenParasolCommand(command, self.scratchFile)
             fileHandle = open(self.scratchFile, 'r')
             line = fileHandle.readline()
             fileHandle.close()
             match = pattern.match(line)
             if match != None: #This is because parasol add job will return success, even if the job was not properly issued!
                 break
             else:
                 logger.info("We failed to properly add the job, we will try again after a sleep")
                 time.sleep(5)
         jobID = int(match.group(1))
         logger.debug("Got the job id: %s from line: %s" % (jobID, line))
         assert jobID not in issuedJobs.keys()
         issuedJobs[jobID] = jobCommand
         logger.debug("Issued the job command: %s with job id: %i " % (command, jobID))
     return issuedJobs
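For reference, a hedged sketch of the job-id extraction that the retry loop above relies on; the sample output line is made up to match the expected pattern, not captured from a real parasol run.
import re

pattern = re.compile("your job ([0-9]+).*")
sampleLine = "your job 12345 ..."  #Hypothetical parasol 'add job' output
match = pattern.match(sampleLine)
if match is not None:
    jobID = int(match.group(1))
    assert jobID == 12345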
Example #3
0
 def getUpdatedJobs(self):
     """We use the parasol results to update the status of jobs, adding them
     to the list of updated jobs.
     
     Results have the following structure.. (thanks Mark D!)
     
     int status;    /* Job status - wait() return format. 0 is good. */
     char *host;    /* Machine job ran on. */
     char *jobId;    /* Job queuing system job ID */
     char *exe;    /* Job executable file (no path) */
     int usrTicks;    /* 'User' CPU time in ticks. */
     int sysTicks;    /* 'System' CPU time in ticks. */
     unsigned submitTime;    /* Job submission time in seconds since 1/1/1970 */
     unsigned startTime;    /* Job start time in seconds since 1/1/1970 */
     unsigned endTime;    /* Job end time in seconds since 1/1/1970 */
     char *user;    /* User who ran job */
     char *errFile;    /* Location of stderr file on host */
     
     plus you finally have the command name..
     """
     line = self.parasolResultsFileHandle.readline()
     updatedJobs = {}
     while line != '':
         results = line.split()
         if line[-1] == '\n':
             line = line[:-1]
         logger.debug("Parasol completed a job, this is what we got: %s" % line)
         result = int(results[0])
         jobID = int(results[2])
         updatedJobs[jobID] = result
         line = self.parasolResultsFileHandle.readline()
     return updatedJobs
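To illustrate the results layout documented in the docstring above, a minimal sketch that splits one made-up results line into the two fields getUpdatedJobs actually uses; the sample values are invented to fit that layout.
sampleLine = "0 node01 4711 myJob.sh 12 3 1400000000 1400000005 1400000060 someUser /tmp/err.log myJob.sh arg1\n"
results = sampleLine.split()
status = int(results[0])  #wait()-style status, 0 is good
jobID = int(results[2])   #job queuing system job ID
assert (status, jobID) == (0, 4711)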
Example #4
0
 def issueJob(self, command, memory, cpu):
     """Issues job commands to parasol.
     """
     self.checkResourceRequest(memory, cpu)
     pattern = re.compile("your job ([0-9]+).*")
     parasolCommand = "%s -verbose -ram=%i -cpu=%i -results=%s add job '%s'" % (self.parasolCommand, memory, cpu, self.parasolResultsFile, command)
     #Deal with the cpus
     self.usedCpus += cpu
     while True: #Process finished results with no wait
         try:
            jobID = self.outputQueue1.get_nowait()
            self.usedCpus -= self.jobIDsToCpu.pop(jobID)
            assert self.usedCpus >= 0
            self.outputQueue1.task_done()
         except Empty:
             break
     while self.usedCpus > self.maxCpus: #If we are still waiting
         self.usedCpus -= self.jobIDsToCpu.pop(self.outputQueue1.get())
         assert self.usedCpus >= 0
         self.outputQueue1.task_done()
     #Now keep going
     while True:
         #time.sleep(0.1) #Sleep to let parasol catch up #Apparently unnecessary
         line = popenParasolCommand(parasolCommand)[1][0]
         match = pattern.match(line)
         if match != None: #This is because parasol add job will return success, even if the job was not properly issued!
             break
         else:
             logger.info("We failed to properly add the job, we will try again after a sleep")
             time.sleep(5)
     jobID = int(match.group(1))
     self.jobIDsToCpu[jobID] = cpu
     logger.debug("Got the parasol job id: %s from line: %s" % (jobID, line))
     logger.debug("Issued the job command: %s with (parasol) job id: %i " % (parasolCommand, jobID))
     return jobID
Example #5
0
 def fn2(header, sequence):
     nonRepetitiveSequence = sequence.replace("a", "").replace("c", '').replace("g", '').replace("t", "")
     logger.debug("Got a non-repetitive sequence of length %s for a sequence starting with length %s" % (len(nonRepetitiveSequence), len(sequence)))
     if len(nonRepetitiveSequence) >= lengthOfFragment:
         header.fragSize = len( sequence )
         return header.getStr(), sequence
     return None
Example #6
0
def processFinishedJob(jobID, resultStatus, updatedJobFiles, jobBatcher, childJobFileToParentJob, childCounts, config):
    """Function reads a processed job file and updates its state.
    """
    jobFile = jobBatcher.removeJobID(jobID)
    updatingFilePresent = processAnyUpdatingFile(jobFile)
    newFilePresent = processAnyNewFile(jobFile)
    jobDir = os.path.split(jobFile)[0]
    if os.path.exists(getJobLogFileName(jobDir)):
        logger.critical("The job seems to have left a log file, indicating failure: %s", jobFile)
        logFile(getJobLogFileName(jobDir), logger.critical)
    if os.path.isfile(jobFile):        
        job = Job.read(jobFile)
        assert job not in updatedJobFiles
        if resultStatus != 0 or newFilePresent or updatingFilePresent:
            if not os.path.exists(job.getLogFileName()):
                logger.critical("No log file is present, despite job failing: %s", jobFile)
            setupJobAfterFailure(job, config)
        if len(job.followOnCommands) > 0 or len(job.children) > 0:
            updatedJobFiles.add(job) #Now we know the job is done we can add it to the list of updated job files
            logger.debug("Added job: %s to active jobs" % jobFile)
        else:
            for message in job.messages: #This is here because jobs with no children or follow ons may log to master.
                logger.critical("Got message from job at time: %s : %s" % (time.time(), message))
            logger.debug("Job has no follow-ons or children despite job file being present so we'll consider it done: %s" % jobFile)
            updateParentStatus(jobFile, updatedJobFiles, childJobFileToParentJob, childCounts)
    else:  #The job is done
        if resultStatus != 0:
            logger.critical("Despite the batch system claiming failure the job %s seems to have finished and been removed" % jobFile)
        updateParentStatus(jobFile, updatedJobFiles, childJobFileToParentJob, childCounts)
Example #7
0
def issueJobs(jobs, jobIDsToJobsHash, batchSystem, queueingJobs, maxJobs, cpusUsed):
    """Issues jobs to the batch system.
    """
    for job in jobs:
        queueingJobs.append(job)
    jobCommands = {}
    #for i in xrange(min(maxJobs - len(jobIDsToJobsHash.keys()), len(queueingJobs))):
    while len(queueingJobs) > 0:
        job = queueingJobs[-1]
        jobCommand = os.path.join(workflowRootPath(), "bin", "jobTreeSlave")
        followOnJob = job.find("followOns").findall("followOn")[-1]
        memory = int(followOnJob.attrib["memory"])
        cpu = int(followOnJob.attrib["cpu"])
        if cpu > maxJobs:
            raise RuntimeError("A request was made for %i cpus but the maxJobs parameter is set to %i; try increasing maxJobs or lowering cpu demands" % (cpu, maxJobs))
        if cpu + cpusUsed > maxJobs:
            break
        cpusUsed += cpu
        jobCommands["%s -E %s %s --job %s" % (sys.executable, jobCommand, os.path.split(workflowRootPath())[0], job.attrib["file"])] = (job.attrib["file"], memory, cpu, job.attrib["slave_log_file"])
        queueingJobs.pop()
    issuedJobs = batchSystem.issueJobs([ (key, jobCommands[key][1], jobCommands[key][2], jobCommands[key][3]) for key in jobCommands.keys() ])
    assert len(issuedJobs.keys()) == len(jobCommands.keys())
    for jobID in issuedJobs.keys():
        command = issuedJobs[jobID]
        jobFile = jobCommands[command][0]
        cpu = jobCommands[command][2]
        assert jobID not in jobIDsToJobsHash
        jobIDsToJobsHash[jobID] = (jobFile, cpu)
        logger.debug("Issued the job: %s with job id: %i and cpus: %i" % (jobFile, jobID, cpu))
    return cpusUsed
Example #8
0
def bsub(bsubline):
    process = subprocess.Popen(" ".join(bsubline), shell=True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
    liney = process.stdout.readline()
    logger.info("BSUB: " + liney)
    result = int(liney.strip().split()[1].strip('<>'))
    logger.debug("Got the job id: %s" % (str(result)))
    return result
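As a hedged illustration of the parsing in bsub() above: LSF typically acknowledges a submission with a line of the form "Job <id> is submitted ...". The exact wording below is assumed, but the field positions match what the code expects.
liney = "Job <4242> is submitted to queue <normal>."  #Assumed LSF-style output
result = int(liney.strip().split()[1].strip('<>'))
assert result == 4242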
Example #9
0
def InitializeArguments(parser):
  logger.debug('Initializing arguments')
  parser.add_argument('--in_dir', type=str,
                      help='location of augustus results directory.')
  parser.add_argument('--out_dir', type=str,
                      help='location to write out merged results.')
  parser.add_argument('--merger', type=str,
                      help='location of augustus_gff_merge.py.')
Example #10
0
 def issueJob(self, jobFile, memory, cpu):
     """Add a job to the queue of jobs
     """
     self.jobsIssued += 1
     jobCommand = "%s -E %s %s %s %s" % (sys.executable, self.jobTreeSlavePath, self.rootPath, self.jobTree, jobFile)
     jobID = self.batchSystem.issueJob(jobCommand, memory, cpu)
     self.jobIDsToJobsHash[jobID] = jobFile
     logger.debug("Issued the job: %s with job id: %s and cpus: %i" % (jobFile, str(jobID), cpu))
Example #11
0
    def issueJob(self, command, memory, cpu):
        self.checkResourceRequest(memory, cpu)
        jobID = self.nextJobID
        self.nextJobID += 1

        self.currentjobs.add(jobID)
        self.newJobsQueue.put((jobID, cpu, memory, command))
        logger.debug("Issued the job command: %s with job id: %s " % (command, str(jobID)))
        return jobID
Example #12
0
    def issueJob(self, command, memory, cpu):
        self.checkResourceRequest(memory, cpu)
        jobID = self.nextJobID
        self.nextJobID += 1

        self.currentjobs.add(jobID)
        self.newJobsQueue.put((jobID, cpu, memory, command))
        logger.debug("Issued the job command: %s with job id: %s " % (command, str(jobID)))
        return jobID
Example #13
0
 def testNewickTreeParser_UnaryNodes(self):
     #tests with unary nodes
     for test in range(0, self.testNo):
         tree = getRandomTreeString()
         logger.debug("tree to try\t%s", tree)
         tree2 = newickTreeParser(tree, reportUnaryNodes=True)
         tree3 = printBinaryTree(tree2, True)
         logger.debug("tree found\t%s", tree3)
         assert tree == tree3
Example #14
0
 def addJob(self, command, sgeJobID, issuedJobs, index=None):
     jobID = self.nextJobID
     self.nextJobID += 1
     self.jobIDs[(sgeJobID, index)] = jobID
     self.sgeJobIDs[jobID] = (sgeJobID, index) 
     assert jobID not in issuedJobs.keys()
     issuedJobs[jobID] = command
     logger.debug("Issued the job command: %s with job id: %s " % (command, str(jobID)))
     self.currentjobs.add(jobID)
     self.newJobsQueue.put((sgeJobID, index))
Example #15
0
def bsub(bsubline):
    process = subprocess.Popen(" ".join(bsubline),
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
    liney = process.stdout.readline()
    logger.info("BSUB: " + liney)
    result = int(liney.strip().split()[1].strip('<>'))
    logger.debug("Got the job id: %s" % (str(result)))
    return result
Example #16
0
 def issueJob(self, jobFile, memory, cpu):
     """Add a job to the queue of jobs
     """
     self.jobsIssued += 1
     jobCommand = "%s -E %s %s %s %s" % (
         sys.executable, self.jobTreeSlavePath, self.rootPath, self.jobTree,
         jobFile)
     jobID = self.batchSystem.issueJob(jobCommand, memory, cpu)
     self.jobIDsToJobsHash[jobID] = jobFile
     logger.debug("Issued the job: %s with job id: %s and cpus: %i" %
                  (jobFile, str(jobID), cpu))
Example #17
0
def prepareQsub(cpu, mem):
    qsubline = list(["qsub","-b","y","-terse","-j" ,"y", "-cwd","-v", 
                     "LD_LIBRARY_PATH=%s" % os.environ["LD_LIBRARY_PATH"]])
    reqline = list()
    if cpu is not None:
        reqline.append("p="+str(cpu))
    if mem is not None:
        reqline.append("vf="+str(mem/ 1024)+"K")
    if len(reqline) > 0:
        qsubline.extend(["-hard","-l", ",".join(reqline)])
    logger.debug("**"+" ".join(qsubline))
    return qsubline
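A brief, hypothetical usage sketch of prepareQsub: the returned list is a qsub command prefix, so a caller might append the actual job command and hand the joined string to the shell, reading the job id back thanks to the -terse flag. The memory unit (bytes) and the sample script name are assumptions.
import subprocess

qsubline = prepareQsub(cpu=2, mem=4 * 1024 ** 3)  #2 cpus, 4 GiB, assuming mem is given in bytes
qsubline.append("myScript.sh")                    #Hypothetical job command
process = subprocess.Popen(" ".join(qsubline), shell=True, stdout=subprocess.PIPE)
sgeJobID = process.stdout.readline().strip()      #-terse makes qsub print just the job id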
Example #18
0
 def issueJobs(self, commands):
     """Runs the jobs right away.
     """
     issuedJobs = {}
     for command, memory, cpu, logFile in commands: #Add the commands to the queue
         assert memory != None
         assert cpu != None
         assert logFile != None
         logger.debug("Issuing the command: %s with memory: %i, cpu: %i" % (command, memory, cpu))
         self.jobs[self.jobIndex] = command
         issuedJobs[self.jobIndex] = command
         self.inputQueue.put((command, logFile, self.jobIndex))
         self.jobIndex += 1
     return issuedJobs
Example #19
0
 def getUpdatedJobs(self):
     """Returns a map of the run jobs and the return value of their processes.
     """
     runJobs = {}
     try:
         while True:
             command, exitValue, jobID = self.outputQueue.get_nowait()
             runJobs[jobID] = exitValue
             self.jobs.pop(jobID)
             logger.debug("Ran the command: %s with exit value: %i" % (command, exitValue))
             self.outputQueue.task_done()
     except Empty:
         pass
     return runJobs
Example #20
0
 def getUpdatedJob(self, maxWait):
     """Returns a (jobID, exitValue) tuple for a job that has finished, or None
     if no job finished within maxWait.
     """
     i = self.getFromQueueSafely(self.outputQueue, maxWait)
     if i == None:
         return None
     jobID, exitValue, threadsToStart = i
     self.jobs.pop(jobID)
     logger.debug("Ran jobID: %s with exit value: %i" % (jobID, exitValue))
     for j in xrange(threadsToStart):
         worker = Process(target=self.workerFn, args=(self.inputQueue, self.outputQueue))
         worker.daemon = True
         worker.start()
     self.outputQueue.task_done()
     return (jobID, exitValue)
Example #21
0
 def run(self):
   count = 0
   windows = glob(os.path.join(self.args.in_dir, 'window_*', ''))
   seqs = set()
   for w in windows:
     gffs = glob(os.path.join(w, '*gff'))
     for g in gffs:
       seq = os.path.basename(g).split('.')[0]
       seqs.add(seq)
   count = len(seqs)
   [self.addChildTarget(MergeCall(windows, s, self.args)) for s in seqs]
   logger.debug('There will be %d MergeCall children' % count)
   self.args.batch_start_time = CreateSummaryReport(
     self.args.out_dir, self.args.batch_start_time, count,
     self.args.calling_command)
Example #22
0
 def getUpdatedJob(self, maxWait):
     """Returns a (jobID, exitValue) tuple for a job that has finished, or None
     if no job finished within maxWait.
     """
     i = self.getFromQueueSafely(self.outputQueue, maxWait)
     if i == None:
         return None
     jobID, exitValue, threadsToStart = i
     self.jobs.pop(jobID)
     logger.debug("Ran jobID: %s with exit value: %i" % (jobID, exitValue))
     for j in xrange(threadsToStart):
         worker = Process(target=self.workerFn,
                          args=(self.inputQueue, self.outputQueue))
         worker.daemon = True
         worker.start()
     self.outputQueue.task_done()
     return (jobID, exitValue)
Example #23
0
 def issueJob(self, command, memory, cpu):
     """Runs the jobs right away.
     """
     self.checkResourceRequest(memory, cpu)
     logger.debug("Issuing the command: %s with memory: %i, cpu: %i" % (command, memory, cpu))
     self.jobs[self.jobIndex] = command
     i = self.jobIndex
     #Deal with the max cpus calculation
     k = 0
     while cpu > self.cpusPerThread or memory > self.memoryPerThread:
         self.inputQueue.put(None)
         cpu -= self.cpusPerThread
         memory -= self.memoryPerThread
         k += 1
     assert k < self.maxThreads
     self.inputQueue.put((command, self.jobIndex, k))
     self.jobIndex += 1
     return i
Example #24
0
 def issueJob(self, command, memory, cpu):
     """Runs the jobs right away.
     """
     self.checkResourceRequest(memory, cpu)
     logger.debug("Issuing the command: %s with memory: %i, cpu: %i" %
                  (command, memory, cpu))
     self.jobs[self.jobIndex] = command
     i = self.jobIndex
     #Deal with the max cpus calculation
     k = 0
     while cpu > self.cpusPerThread or memory > self.memoryPerThread:
         self.inputQueue.put(None)
         cpu -= self.cpusPerThread
         memory -= self.memoryPerThread
         k += 1
     assert k < self.maxThreads
     self.inputQueue.put((command, self.jobIndex, k))
     self.jobIndex += 1
     return i
Example #25
0
def processFinishedJob(jobID, resultStatus, updatedJobFiles, jobBatcher,
                       childJobFileToParentJob, childCounts, config):
    """Function reads a processed job file and updates its state.
    """
    jobFile = jobBatcher.removeJobID(jobID)
    updatingFilePresent = processAnyUpdatingFile(jobFile)
    newFilePresent = processAnyNewFile(jobFile)
    jobDir = os.path.split(jobFile)[0]
    if os.path.exists(getJobLogFileName(jobDir)):
        logger.critical(
            "The job seems to have left a log file, indicating failure: %s",
            jobFile)
        logFile(getJobLogFileName(jobDir), logger.critical)
    if os.path.isfile(jobFile):
        job = Job.read(jobFile)
        assert job not in updatedJobFiles
        if resultStatus != 0 or newFilePresent or updatingFilePresent:
            if not os.path.exists(job.getLogFileName()):
                logger.critical(
                    "No log file is present, despite job failing: %s", jobFile)
            setupJobAfterFailure(job, config)
        if len(job.followOnCommands) > 0 or len(job.children) > 0:
            updatedJobFiles.add(
                job
            )  #Now we know the job is done we can add it to the list of updated job files
            logger.debug("Added job: %s to active jobs" % jobFile)
        else:
            for message in job.messages:  #This is here because jobs with no children or follow ons may log to master.
                logger.critical("Got message from job at time: %s : %s" %
                                (time.time(), message))
            logger.debug(
                "Job has no follow-ons or children despite job file being present so we'll consider it done: %s"
                % jobFile)
            updateParentStatus(jobFile, updatedJobFiles,
                               childJobFileToParentJob, childCounts)
    else:  #The job is done
        if resultStatus != 0:
            logger.critical(
                "Despite the batch system claiming failure the job %s seems to have finished and been removed"
                % jobFile)
        updateParentStatus(jobFile, updatedJobFiles, childJobFileToParentJob,
                           childCounts)
Example #26
0
 def issueJob(self, command, memory, cpu):
     """Issues job commands to parasol.
     """
     self.checkResourceRequest(memory, cpu)
     pattern = re.compile("your job ([0-9]+).*")
     parasolCommand = "%s -verbose -ram=%i -cpu=%i -results=%s add job '%s'" % (
         self.parasolCommand, memory, cpu, self.parasolResultsFile, command)
     #Deal with the cpus
     self.usedCpus += cpu
     while True:  #Process finished results with no wait
         try:
             jobID = self.outputQueue1.get_nowait()
             self.usedCpus -= self.jobIDsToCpu.pop(jobID)
             assert self.usedCpus >= 0
             self.outputQueue1.task_done()
         except Empty:
             break
     while self.usedCpus > self.maxCpus:  #If we are still waiting
         self.usedCpus -= self.jobIDsToCpu.pop(self.outputQueue1.get())
         assert self.usedCpus >= 0
         self.outputQueue1.task_done()
     #Now keep going
     while True:
         #time.sleep(0.1) #Sleep to let parasol catch up #Apparently unnecessary
         line = popenParasolCommand(parasolCommand)[1][0]
         match = pattern.match(line)
         if match != None:  #This is because parasol add job will return success, even if the job was not properly issued!
             break
         else:
             logger.info(
                 "We failed to properly add the job, we will try again after a sleep"
             )
             time.sleep(5)
     jobID = int(match.group(1))
     self.jobIDsToCpu[jobID] = cpu
     logger.debug("Got the parasol job id: %s from line: %s" %
                  (jobID, line))
     logger.debug("Issued the job command: %s with (parasol) job id: %i " %
                  (parasolCommand, jobID))
     return jobID
Example #27
0
def updateParentStatus(jobFile, updatedJobFiles, childJobFileToParentJob, childCounts):
    """Update status of parent for finished child job.
    """
    while True:
        if jobFile not in childJobFileToParentJob:
            assert len(updatedJobFiles) == 0
            assert len(childJobFileToParentJob) == 0
            assert len(childCounts) == 0
            break
        parentJob = childJobFileToParentJob.pop(jobFile)
        childCounts[parentJob] -= 1
        assert childCounts[parentJob] >= 0
        if childCounts[parentJob] == 0: #Job is done
            childCounts.pop(parentJob)
            logger.debug("Parent job %s has all its children run successfully", parentJob.getJobFileName())
            assert parentJob not in updatedJobFiles
            if len(parentJob.followOnCommands) > 0:
                updatedJobFiles.add(parentJob) #Now we know the job is done we can add it to the list of updated job files  
                break
            else:
                jobFile = parentJob.getJobFileName()
        else:
            break
Example #28
0
 def testNewickIO(self):
     # Felsenstein's own... (http://evolution.genetics.washington.edu/phylip/newicktree.html)
     tree1 = '((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997, seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201, weasel:18.87953):2.0946):3.87382,dog:25.46154);'
     tree2 = '(Bovine:0.69395,(Gibbon:0.36079,(Orang:0.33636,(Gorilla:0.17147,(Chimp:0.19268, Human:0.11927):0.08386):0.06124):0.15057):0.54939,Mouse:1.2146):0.1;'
     tree3 = '(Bovine:0.69395,(Hylobates:0.36079,(Pongo:0.33636,(G._Gorilla:0.17147, (P._paniscus:0.19268,H._sapiens:0.11927):0.08386):0.06124):0.15057):0.54939, Rodent:1.2146);'
     tree4 = 'A;'
     tree5 = '((A,B):0.0,(C,D));'
     tree6 = '(Alpha,Beta,Gamma,Delta,,Epsilon,,,);'
     
     trees = [tree1, tree2, tree3, tree4, tree5, tree6]        
     newickParser = NXNewick()
     
     for tree in trees:
         newickParser.parseString(tree)
         answer = self.__cleanTree(tree)
         outputString = newickParser.writeString()
         logger.debug(" ***************** ")
         logger.debug(outputString)
         logger.debug(answer)
         assert outputString == answer
Example #29
0
 def __round(i):
     if i < 0:
         logger.debug("I got a less than 0 value: %s" % i)
         return 0.0
     return i
Example #30
0
    def loadConfig(cls, conffile=DEFAULT_SLURM_CONF_FILE):
        '''
        Constructs the object using the given slurm.conf file name.
        If this is called more than once, the values will be reloaded.
        
        If there are backslashes at the end of the line it's concatenated
        to the next one.
        
        NodeName lines are not saved because of the stupid DEFAULT stuff.  
        Maybe someday.
        '''
        logger.debug("Initializing Slurm config using %s" % conffile)
        Slurm.conf = dict()
        currline = ''
        m = re.compile(r'([^=]+)\s*=\s*(.*)')  #Used to extract name=value
        n = re.compile(r'(\S+)\s+(.*)')  #Parse values on PartitionName
        with open(conffile, 'rt') as f:
            for line in f:
                line = line.rstrip().lstrip()
                if line.startswith('#') or line.isspace() or not line:
                    continue

                # Concatenate lines with escaped line ending
                if line.endswith('\\'):
                    logger.debug("Concatenating line %s" % line)
                    currline += line.rstrip('\\')
                    continue

                currline += line

                # Skip nodename lines
                if currline.startswith('NodeName'):
                    currline = ''
                    continue

                # Split on first equal
                result = m.match(currline)
                if result is not None:
                    name = result.group(1)
                    value = result.group(2)

                    # For PartitionName lines, we need to extract the name
                    # and add it to the Partitions list
                    if name == 'PartitionName':
                        result2 = n.match(value)
                        if result2 is None:
                            logger.info("Bad PartitionName value %s.  Skipping." \
                                % value)
                            continue
                        pname = result2.group(1)
                        pvalue = result2.group(2)
                        if 'Partitions' not in Slurm.conf:
                            Slurm.conf['Partitions'] = dict()
                        Slurm.conf['Partitions'][pname] = pvalue
                    else:
                        Slurm.conf[name] = value
                else:
                    logger.error("Slurm config file %s has strange line '%s'" %
                                 (conffile, currline))

                currline = ''
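A small, hypothetical usage sketch of the loader above; loadConfig is assumed to be a classmethod, the file path is arbitrary, and 'ControlMachine' is just an example of a plain name=value key from slurm.conf.
Slurm.loadConfig('/etc/slurm/slurm.conf')               #Hypothetical path
controlMachine = Slurm.conf.get('ControlMachine')       #Plain name=value entry
for pname, pvalue in Slurm.conf.get('Partitions', {}).items():
    logger.debug("Partition %s -> %s" % (pname, pvalue))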
Example #31
0
    def runVanilla(self):
        logger.debug("Going to put the alignment in %s" % self.outputDir)
        if not os.path.isdir(self.outputDir):
            os.mkdir(self.outputDir)

        if not os.path.exists(os.path.join(self.outputDir, "cactusAlignmentVanilla")):
            xmlTree = ET.parse(os.path.join(getRootPathString(), "lib", "cactus_workflow_config.xml"))
            
            #Set the parameters
            tempLocalDir = os.path.join(self.outputDir, "tempVanillaCactusAlignment")
            system("rm -rf %s" % tempLocalDir)
            os.mkdir(tempLocalDir)
            
            #Set the config parameters
            self.params.applyToXml(xmlTree)
            config = xmlTree.getroot()
            assert config is not None
        
            #Write the config file
            tempConfigFile = os.path.join(tempLocalDir, "config.xml")
            fileHandle = open(tempConfigFile, 'w')
            assert fileHandle is not None
            tree = ET.ElementTree(config)
            tree.write(fileHandle)
            fileHandle.close()
         
            #Now do standard cactus..
            #Make the experiment file
            tempExperimentFile2 = os.path.join(tempLocalDir, "experiment.xml")

            cactusWorkflowExperiment = CactusWorkflowExperiment(
                                                 sequences=self.sequences, 
                                                 newickTreeString=self.newickTree, 
                                                 #requiredSpecies=self.requiredSpecies,
                                                 #singleCopySpecies=self.singleCopySpecies,
                                                 databaseName="cactusAlignmentVanilla",
                                                 outputDir=tempLocalDir,
                                                 configFile=tempConfigFile)
            tempExperimentDir2 = os.path.join(tempLocalDir, "cactusAlignmentVanilla")
            cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile2)
           
            # apply naming to the event tree to be consistent with progressive
            exp = ExperimentWrapper(ET.parse(tempExperimentFile2).getroot())
            cleanEventTree(exp)
            exp.writeXML(tempExperimentFile2)
            
            #We're done with the progressive, now run the vanilla cactus for comparison
            tempJobTreeDir2 = os.path.join(tempLocalDir, "jobTreeVanilla")
            runCactusWorkflow(tempExperimentFile2, tempJobTreeDir2,
                              jobTreeStats=True,
                              setupAndBuildAlignments=True,
                              buildReference=True,
                              maxThreads=4)
            
            runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir2)
            logger.info("Checked the job tree dir for the vanilla run")
            
            runCactusMAFGenerator(os.path.join(self.outputDir, "cactusVanilla.maf"), getCactusDiskString(tempExperimentDir2))
            
            #Run the cactus tree stats
            treeStatsFile = os.path.join(self.outputDir, "treeStats.xml")
            system("cactus_treeStats --cactusDisk \'%s\' --flowerName 0 --outputFile %s" %(exp.getDiskDatabaseString(),
                                                                                        treeStatsFile))
            
            system("jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml" % (tempJobTreeDir2, self.outputDir))
            system("mv %s %s" % (tempExperimentDir2, self.outputDir))
            system("mv %s %s/experiment.xml" % (tempExperimentFile2, self.outputDir))
Example #32
0
def initializeArguments(parser):
  logger.debug('Initializing arguments')
  parser.add_argument('--augustus_path', type=str,
                      help='location of augustus directory.')
  parser.add_argument('--hal_path', type=str,
                      help='location of hal tools directory.')
  parser.add_argument('--hal_file_path', type=str,
                      help='location of hal file.')
  parser.add_argument('--tree_path', type=str,
                      help='location of newick tree file.')
  parser.add_argument('--out_dir', type=str,
                      help='location to store output files.')
  parser.add_argument('--sqlite_db', type=str,
                      help='location of sqlite database.')
  parser.add_argument(
    '--softmasking', type=str, choices=['true', 'false'], default='true',
    help='penalize exons in softmasked regions. default=%(default)s')
  parser.add_argument('--extrinsicCfgFile', help='extrinsic hint config file.')
  parser.add_argument('--speciesfilenames', type=str,
                      help=('location of the species file (text, one line per '
                            'species and location of .fa).'))
  # parser.add_argument('--dbaccess_file', type=str,
  #                     help='location of dbaccess file containing login info.')
  parser.add_argument('--maf_file_path', type=str,
                      help=('location of maf file. Overrides all hal window '
                            'extraction. Debugging feature.'))
  parser.add_argument('--debug', default=False, action='store_true',
                      help='turns off execution of commands, just writes logs.')
  window = parser.add_argument_group('Window options')
  window.add_argument('--ref_genome', type=str,
                      help='reference genome to use for region extraction.')
  window.add_argument('--ref_sequence', type=str,
                      help=('reference sequence (chr) to use for '
                            'region extraction.'))
  window.add_argument('--window_start', type=int, default=0,
                      help='start of windowing region.')
  window.add_argument('--window_end', type=int, default=None,
                      help='end of windowing region.')
  window.add_argument('--window_length', type=int, default=2000000,
                      help='length of each window. default=%(default)s')
  window.add_argument('--window_overlap', type=int, default=1000000,
                      help='overlap of each window. default=%(default)s')
  augustus = parser.add_argument_group('Augustus options')
  augustus.add_argument('--species', default='human', type=str,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--temperature', default=3, type=int,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/x0_E', default=-1.25,
                        dest='_MeaPrediction_x0_E', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/x0_I', default=-0.78125,
                        dest='_MeaPrediction_x0_I', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/x1_E', default=5,
                        dest='_MeaPrediction_x1_E', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/x1_I', default=10,
                        dest='_MeaPrediction_x1_I', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/y0_E', default=0.5,
                        dest='_MeaPrediction_y0_E', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/y0_I', default=0.9,
                        dest='_MeaPrediction_y0_I', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/alpha_E', default=9.375,
                        dest='_MeaPrediction_alpha_E', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/alpha_I', default=2.5075,
                        dest='_MeaPrediction_alpha_I', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/i1_E', default=0.,
                        dest='_MeaPrediction_i1_E', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/i1_I', default=0.,
                        dest='_MeaPrediction_i1_I', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/i2_E', default=0.5,
                        dest='_MeaPrediction_i2_E', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/i2_I', default=0.9,
                        dest='_MeaPrediction_i2_I', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/j1_E', default=-1.25,
                        dest='_MeaPrediction_j1_E', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/j1_I', default=-0.78125,
                        dest='_MeaPrediction_j1_I', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/j2_E', default=0.,
                        dest='_MeaPrediction_j2_E', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/MeaPrediction/j2_I', default=0.,
                        dest='_MeaPrediction_j2_I', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/CompPred/ec_score', default=-13,
                        dest='_CompPred_ec_score', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/CompPred/ec_addend', default=-20.6,
                        dest='_CompPred_ec_addend', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/CompPred/ec_factor', default=6,
                        dest='_CompPred_ec_factor', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/CompPred/dd_factor', default=20,
                        dest='_CompPred_dd_factor', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/CompPred/exon_gain', default=0.0001,
                        dest='_CompPred_exon_gain', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/CompPred/exon_loss', default=0.0001,
                        dest='_CompPred_exon_loss', type=float,
                        help='Augustus option. default=%(default)s')
  augustus.add_argument('--/CompPred/phylo_factor', default=50.,
                        dest='_CompPred_phylo_factor', type=float,
                        help='Augustus option. default=%(default)s')
  #####
  augustus.add_argument('--/CompPred/only_species', default=None,
                        dest='_CompPred_only_species', type=str,
                        help='Augustus option. default=%(default)s')
Example #33
0
def processFinishedJob(jobID, resultStatus, updatedJobFiles, jobIDsToJobsHash, cpusUsed):
    """Function reads a processed job file and updates its state.
    """
    assert jobID in jobIDsToJobsHash
    jobFile, cpus = jobIDsToJobsHash.pop(jobID)
    cpusUsed -= cpus #Fix the tally of the total number of cpus being used
    
    updatingFileIsPresent = os.path.isfile(jobFile + ".updating")
    newFileIsPresent = os.path.isfile(jobFile + ".new")
    
    if resultStatus == 0 and updatingFileIsPresent:
        logger.critical("Despite the batch system claiming success there is a .updating file present: %s", jobFile + ".updating")
        
    if resultStatus == 0 and newFileIsPresent:
        logger.critical("Despite the batch system claiming success there is a .new file present: %s", jobFile + ".new")

    if resultStatus != 0 or newFileIsPresent or updatingFileIsPresent: #Job not successful according to the batch system, or according to the existence of a .new or .updating file
        if updatingFileIsPresent: #The job failed while attempting to write the job file.
            logger.critical("There was an .updating file for the crashed job: %s" % jobFile)
            if os.path.isfile(jobFile + ".new"): #The job failed while writing the updated job file.
                logger.critical("There was a .new file for the crashed job: %s" % jobFile)
                os.remove(jobFile + ".new") #The existence of the .updating file means it wasn't complete
            os.remove(jobFile + ".updating") #Delete the updating file second to preserve a correct state
            assert os.path.isfile(jobFile)
            job = readJob(jobFile) #The original must still be there.
            assert job.find("children").find("child") == None #The original can not reflect the end state of the job.
            assert int(job.attrib["black_child_count"]) == int(job.attrib["child_count"])
            job.attrib["colour"] = "red" #It failed, so we mark it so and continue.
            writeJobs([ job ])
            logger.critical("We've reverted to the original job file and marked it as failed: %s" % jobFile)
        else:
            if newFileIsPresent: #The job was not properly updated before crashing
                logger.critical("There is a valid .new file %s" % jobFile)
                if os.path.isfile(jobFile):
                    os.remove(jobFile)
                os.rename(jobFile + ".new", jobFile)
                job = readJob(jobFile)
                if job.attrib["colour"] == "grey": #The job failed while preparing to run another job on the slave
                    assert job.find("children").find("child") == None #File 
                    job.attrib["colour"] = "red"
                    writeJobs([ job ])
                assert job.attrib["colour"] in ("black", "red")
            else:
                logger.critical("There was no valid .new file %s" % jobFile)
                assert os.path.isfile(jobFile)
                job = readJob(jobFile) #The job may have failed before or after creating this file, we check the state.
                if job.attrib["colour"] == "black": #The job completed okay, so we'll keep it
                    logger.critical("Despite the batch system job failing, the job appears to have completed okay")
                else:
                    assert job.attrib["colour"] in ("grey", "red")
                    assert job.find("children").find("child") == None #File 
                    assert int(job.attrib["black_child_count"]) == int(job.attrib["child_count"])
                    if job.attrib["colour"] == "grey":
                        job.attrib["colour"] = "red"
                        writeJobs([ job ])
                    logger.critical("We've reverted to the original job file and marked it as failed: %s" % jobFile)

    assert jobFile not in updatedJobFiles
    updatedJobFiles.add(jobFile) #Now we know the job is done we can add it to the list of updated job files
    logger.debug("Added job: %s to active jobs" % jobFile)
    
    return cpusUsed
Example #34
0
def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.
    """
    waitDuration = float(config.attrib["wait_duration"])
    assert waitDuration >= 0
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters, wait duration %s, rescue jobs frequency: %s, max job duration: %s" % \
                (waitDuration, rescueJobsFrequency, maxJobDuration))
    
    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0 #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")
    
    jobFiles = config.attrib["job_file_dir"].listFiles()
    logger.info("Got a list of job files")
    
    #Repair the job tree using any .old files
    fixJobsList(config, jobFiles)
    logger.info("Fixed the job files using any .old files")
    
    #Get jobs that were running, or that had failed reset to 'grey' status
    restartFailedJobs(config, jobFiles)
    logger.info("Reworked failed jobs")
    
    updatedJobFiles = set() #Jobs whose status needs updating, either because they have finished, or because they need to be started.
    for jobFile in jobFiles:
        job = readJob(jobFile)
        if job.attrib["colour"] not in ("blue",):
            updatedJobFiles.add(jobFile)
    logger.info("Got the active (non blue) job files")
    
    totalJobFiles = len(jobFiles) #Total number of job files we have.
    jobIDsToJobsHash = {} #A hash of the currently running jobs ids, made by the batch system.
    
    idealJobTime = float(config.attrib["job_time"]) 
    assert idealJobTime > 0.0
    
    reportAllJobLogFiles = bool(int(config.attrib["reportAllJobLogFiles"]))
    
    stats = config.attrib.has_key("stats")
    if stats:
        startTime = time.time()
        startClock = getTotalCpuTime()
        
    #Stuff to handle the maximum number of issued jobs
    queueingJobs = []
    maxJobs = int(config.attrib["max_jobs"])
    cpusUsed = 0
    
    logger.info("Starting the main loop")
    timeSinceJobsLastRescued = time.time() - rescueJobsFrequency + 100 #We hack it so that we rescue jobs after the first 100 seconds to get around an apparent parasol bug
    while True: 
        if len(updatedJobFiles) > 0:
            logger.debug("Built the jobs list, currently have %i job files, %i jobs to update and %i jobs currently issued" % (totalJobFiles, len(updatedJobFiles), len(jobIDsToJobsHash)))
        
        for jobFile in list(updatedJobFiles):
            job = readJob(jobFile)
            assert job.attrib["colour"] != "blue"
            
            ##Check the log files exist, because they must ultimately be cleaned up by their respective file trees.
            def checkFileExists(fileName, type):
                if not os.path.isfile(fileName): #We need to keep these files in existence.
                    open(fileName, 'w').close()
                    logger.critical("The file %s of type %s for job %s had disappeared" % (fileName, type, jobFile))
            checkFileExists(job.attrib["log_file"], "log_file")
            checkFileExists(job.attrib["slave_log_file"], "slave_log_file")
            if stats:
                checkFileExists(job.attrib["stats"], "stats")
            
            def reissueJob(job):
                #Reset the log files for the job.
                updatedJobFiles.remove(jobFile)
                open(job.attrib["slave_log_file"], 'w').close()
                open(job.attrib["log_file"], 'w').close()
                assert job.attrib["colour"] == "grey"
                return issueJobs([ job ], jobIDsToJobsHash, batchSystem, queueingJobs, maxJobs, cpusUsed)
                
            def makeGreyAndReissueJob(job):
                job.attrib["colour"] = "grey"
                writeJobs([ job ])
                return reissueJob(job)
            
            if job.attrib["colour"] == "grey": #Get ready to start the job
                cpusUsed = reissueJob(job)
            elif job.attrib["colour"] == "black": #Job has finished okay
                logger.debug("Job: %s has finished okay" % job.attrib["file"])
                if reportAllJobLogFiles:
                    reportJobLogFiles(job)
                #Deal with stats
                if stats:
                    system("cat %s >> %s" % (job.attrib["stats"], config.attrib["stats"]))
                    open(job.attrib["stats"], 'w').close() #Reset the stats file
                if job.find("messages") != None:
                    for message in job.find("messages").findall("message"):
                        logger.critical("Received the following message from job: %s" % message.attrib["message"])
                    job.remove(job.find("messages"))
                childCount = int(job.attrib["child_count"])
                blackChildCount = int(job.attrib["black_child_count"])
                assert childCount == blackChildCount #Has no currently running child jobs
                #Launch any unborn children
                unbornChildren = job.find("children")
                unbornChild = unbornChildren.find("child")
                if unbornChild != None: #We must give birth to the unborn children
                    logger.debug("Job: %s has %i children to schedule" % (job.attrib["file"], len(unbornChildren.findall("child"))))
                    newChildren = []
                    while unbornChild != None:
                        newJob = createJob(unbornChild.attrib, job.attrib["file"], config)
                        totalJobFiles += 1
                        newChildren.append(newJob)
                        unbornChildren.remove(unbornChild)
                        unbornChild = unbornChildren.find("child")
                    
                    updatedJobFiles.remove(job.attrib["file"])
                    job.attrib["child_count"] = str(childCount + len(newChildren))
                    job.attrib["colour"] = "blue" #Blue - has children running.
                    writeJobs([ job ] + newChildren ) #Check point
                    cpusUsed = issueJobs(newChildren, jobIDsToJobsHash, batchSystem, queueingJobs, maxJobs, cpusUsed) #Issue the new children directly
                    
                elif len(job.find("followOns").findall("followOn")) != 0: #Has another job
                    logger.debug("Job: %s has a new command that we can now issue" % job.attrib["file"])
                    ##Reset the job run info
                    job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
                    cpusUsed = makeGreyAndReissueJob(job)
                    
                else: #Job has finished, so we can defer to any parent
                    logger.debug("Job: %s is now dead" % job.attrib["file"])
                    job.attrib["colour"] = "dead"
                    if job.attrib.has_key("parent"):
                        parent = readJob(job.attrib["parent"])
                        assert job.attrib["parent"] != jobFile
                        assert parent.attrib["colour"] == "blue"
                        assert int(parent.attrib["black_child_count"]) < int(parent.attrib["child_count"])
                        parent.attrib["black_child_count"] = str(int(parent.attrib["black_child_count"]) + 1)
                        if int(parent.attrib["child_count"]) == int(parent.attrib["black_child_count"]):
                            parent.attrib["colour"] = "black"
                            assert parent.attrib["file"] not in updatedJobFiles
                            updatedJobFiles.add(parent.attrib["file"])
                        writeJobs([ job, parent ]) #Check point
                    updatedJobFiles.remove(job.attrib["file"])
                    totalJobFiles -= 1
                    deleteJob(job, config)
                         
            elif job.attrib["colour"] == "red": #Job failed
                logger.critical("Job: %s failed" % job.attrib["file"])
                reportJobLogFiles(job)
                #Checks
                assert len(job.find("children").findall("child")) == 0
                assert int(job.attrib["child_count"]) == int(job.attrib["black_child_count"])
                
                remainingRetryCount = int(job.attrib["remaining_retry_count"])
                if remainingRetryCount > 0: #Give it another try, maybe there is a bad node somewhere
                    job.attrib["remaining_retry_count"] = str(remainingRetryCount-1)
                    logger.critical("Job: %s will be restarted, it has %s goes left" % (job.attrib["file"], job.attrib["remaining_retry_count"]))
                    cpusUsed = makeGreyAndReissueJob(job)
                else:
                    assert remainingRetryCount == 0
                    updatedJobFiles.remove(job.attrib["file"]) #We remove the job and neither delete it nor reissue it
                    logger.critical("Job: %s is completely failed" % job.attrib["file"])
                    
            else: #This case should only occur after failure
                logger.debug("Job: %s is already dead, we'll get rid of it" % job.attrib["file"])
                assert job.attrib["colour"] == "dead"
                updatedJobFiles.remove(job.attrib["file"])
                totalJobFiles -= 1
                deleteJob(job, config)
                
        #This call is made to ensure any queueing jobs are issued at the end of the loop
        cpusUsed = issueJobs([], jobIDsToJobsHash, batchSystem, queueingJobs, maxJobs, cpusUsed)
      
        if len(jobIDsToJobsHash) == 0 and len(updatedJobFiles) == 0:
            logger.info("Only failed jobs and their dependents (%i total) are remaining, so exiting." % totalJobFiles)
            assert cpusUsed == 0
            break
        
        if len(updatedJobFiles) > 0:
            updatedJobs = batchSystem.getUpdatedJobs() #Asks the batch system what jobs have been completed.
        else:
            updatedJobs = pauseForUpdatedJobs(batchSystem.getUpdatedJobs) #Asks the batch system what jobs have been completed.
        
        for jobID in updatedJobs.keys(): #Runs through a map of updated jobs and their statuses
            result = updatedJobs[jobID]
            if jobIDsToJobsHash.has_key(jobID): 
                if result == 0:
                    logger.debug("Batch system is reporting that the job %s ended successfully" % jobIDsToJobsHash[jobID][0])   
                else:
                    logger.critical("Batch system is reporting that the job %s failed with exit value %i" % (jobIDsToJobsHash[jobID][0], result))  
                cpusUsed = processFinishedJob(jobID, result, updatedJobFiles, jobIDsToJobsHash, cpusUsed)
            else:
                logger.info("A result seems to already have been processed: %i" % jobID) #T
        
        if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency: #We only rescue jobs every N seconds
            cpusUsed = reissueOverLongJobs(updatedJobFiles, jobIDsToJobsHash, config, batchSystem, cpusUsed)
            logger.info("Reissued any over long jobs")
            
            hasNoMissingJobs, cpusUsed = reissueMissingJobs(updatedJobFiles, jobIDsToJobsHash, batchSystem, cpusUsed)
            if hasNoMissingJobs:
                timeSinceJobsLastRescued = time.time()
            else:
                timeSinceJobsLastRescued += 60 #This means we'll try again in 60 seconds
            logger.info("Rescued any (long) missing jobs")
        #Going to sleep to let the job system catch up.
        time.sleep(waitDuration)
        ##Check that the tally of cpus in use matches the jobs currently issued
        assert sum([ cpus for jobID, cpus in jobIDsToJobsHash.values() ]) == cpusUsed
        assert cpusUsed <= maxJobs
    
    if stats:
        fileHandle = open(config.attrib["stats"], 'a')
        fileHandle.write("<total_time time='%s' clock='%s'/></stats>" % (str(time.time() - startTime), str(getTotalCpuTime() - startClock)))
        fileHandle.close()
    
    logger.info("Finished the main loop")     
    
    return totalJobFiles #Returns number of failed jobs
Example #35
0
    def testTempFileTree(self):
        for test in range(100):  #self.testNo):
            levels = random.choice(range(1, 4))
            fileNo = random.choice(range(1, 6))
            maxTempFiles = int(math.pow(fileNo, levels))

            print("Got %s levels, %s fileNo and %s maxTempFiles" %
                  (levels, fileNo, maxTempFiles))

            tempFileTreeRootDir = os.path.join(self.tempDir,
                                               getRandomAlphaNumericString())
            tempFileTree = TempFileTree(tempFileTreeRootDir, fileNo, levels)

            tempFiles = []
            tempDirs = []
            #Check we can make the maximum number of temp files.
            for i in range(maxTempFiles):
                if random.random() > 0.5:
                    tempFile = tempFileTree.getTempFile()
                    assert os.path.isfile(tempFile)
                    tempFiles.append(tempFile)
                else:
                    tempFile = tempFileTree.getTempDirectory()
                    assert os.path.isdir(tempFile)
                    tempDirs.append(tempFile)

            #Check an error is raised once the maximum number of temp files is exceeded
            try:
                tempFileTree.getTempFile()
                assert False
            except RuntimeError:
                logger.debug("Got expected error message")

            #Now remove a few temp files
            while random.random() > 0.1 and len(tempFiles) > 0:
                tempFile = tempFiles.pop()
                assert os.path.isfile(tempFile)
                tempFileTree.destroyTempFile(tempFile)
                assert not os.path.isfile(tempFile)

            #Now remove a few temp dirs
            while random.random() > 0.1 and len(tempDirs) > 0:
                tempDir = tempDirs.pop()
                assert os.path.isdir(tempDir)
                tempFileTree.destroyTempDir(tempDir)
                assert not os.path.isdir(tempDir)

            #Check the set of listed files matches what we created
            assert set(tempFileTree.listFiles()) == set(tempFiles + tempDirs)

            #Either remove all the temp files or just destroy the whole thing
            if random.random() > 0.5:
                #Remove all temp files and check thing is empty.
                for tempFile in tempFiles:
                    tempFileTree.destroyTempFile(tempFile)
                for tempDir in tempDirs:
                    tempFileTree.destroyTempDir(tempDir)
                os.remove(os.path.join(tempFileTreeRootDir, "lock"))
                os.rmdir(tempFileTreeRootDir)
            else:
                tempFileTree.destroyTempFiles()
                assert not os.path.isdir(tempFileTreeRootDir)
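A minimal usage sketch of the TempFileTree API exercised by the test above; the root path is hypothetical, the import location is an assumption, and the capacity bound fileNo**levels is the one the test relies on.
from sonLib.bioio import TempFileTree #Assumed import location

tree = TempFileTree("/tmp/exampleTempFileTree", 2, 3) #Hypothetical root dir; capacity is 2**3 == 8 entries
tempFile = tree.getTempFile()      #A new, empty temp file inside the tree
tempDir = tree.getTempDirectory()  #A new temp directory inside the tree
assert set(tree.listFiles()) == set([tempFile, tempDir])
tree.destroyTempFile(tempFile)     #Remove a single temp file
tree.destroyTempDir(tempDir)       #Remove a single temp directory
tree.destroyTempFiles()            #Remove everything left, including the root directory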
Example #36
0
def readJob(jobFile):
    logger.debug("Going to load the file %s" % jobFile)
    return ET.parse(jobFile).getroot()
Example #37
0
def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.
    """
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters,rescue jobs frequency: %s max job duration: %s" % \
                (rescueJobsFrequency, maxJobDuration))

    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0  #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")

    childJobFileToParentJob, childCounts, updatedJobFiles = {}, {}, set()
    parseJobFiles(getJobFileDirName(config.attrib["job_tree"]),
                  updatedJobFiles, childJobFileToParentJob, childCounts,
                  config)
    jobBatcher = JobBatcher(config, batchSystem)
    logger.info(
        "Found %s jobs to start and %i parent jobs with children to run" %
        (len(updatedJobFiles), len(childCounts)))

    stats = config.attrib.has_key("stats")
    if stats:
        stop = Queue()
        worker = Process(target=statsAggregatorProcess,
                         args=(config.attrib["job_tree"],
                               makeTemporaryStatsDirs(
                                   config.attrib["job_tree"]), stop))
        worker.daemon = True
        worker.start()

    timeSinceJobsLastRescued = time.time()  #Sets up the timing of the job rescuing method
    totalFailedJobs = 0
    logger.info("Starting the main loop")
    while True:
        if len(updatedJobFiles) > 0:
            logger.debug(
                "Built the jobs list, currently have %i jobs to update and %i jobs issued"
                % (len(updatedJobFiles), jobBatcher.getNumberOfJobsIssued()))

            for job in updatedJobFiles:
                for message in job.messages:
                    logger.critical("Got message from job at time: %s : %s" %
                                    (time.time(), message))
                job.messages = []

                if len(job.children) > 0:
                    logger.debug("Job: %s has %i children to schedule" %
                                 (job.getJobFileName(), len(job.children)))
                    children = job.children
                    job.children = []
                    for childJobFile, memory, cpu in children:
                        childJobFileToParentJob[childJobFile] = job
                    assert job not in childCounts
                    childCounts[job] = len(children)
                    jobBatcher.issueJobs(children)
                else:
                    assert len(job.followOnCommands) > 0
                    if job.remainingRetryCount > 0:
                        logger.debug(
                            "Job: %s has a new command that we can now issue" %
                            job.getJobFileName())
                        memory, cpu = job.followOnCommands[-1][1:3]
                        jobBatcher.issueJob(job.getJobFileName(), memory, cpu)
                    else:
                        totalFailedJobs += 1
                        logger.critical("Job: %s is completely failed" %
                                        job.getJobFileName())
            updatedJobFiles = set()  #We've considered them all, so reset

        if jobBatcher.getNumberOfJobsIssued() == 0:
            logger.info(
                "Only failed jobs and their dependents (%i total) are remaining, so exiting."
                % totalFailedJobs)
            break

        updatedJob = batchSystem.getUpdatedJob(
            10)  #Asks the batch system what jobs have been completed.
        if updatedJob != None:
            jobID, result = updatedJob
            if jobBatcher.hasJob(jobID):
                if result == 0:
                    logger.debug(
                        "Batch system is reporting that the job %s ended successfully"
                        % jobBatcher.getJob(jobID))
                else:
                    logger.critical(
                        "Batch system is reporting that the job %s %s failed with exit value %i"
                        % (jobID, jobBatcher.getJob(jobID), result))
                processFinishedJob(jobID, result, updatedJobFiles, jobBatcher,
                                   childJobFileToParentJob, childCounts,
                                   config)
            else:
                logger.critical(
                    "A result seems to already have been processed: %i" %
                    jobID)
        else:
            #logger.debug("Waited but no job was finished, still have %i jobs issued" % jobBatcher.getNumberOfJobsIssued())
            if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency:  #We only rescue jobs every N seconds, and when we have apparently exhausted the current job supply
                reissueOverLongJobs(updatedJobFiles, jobBatcher, config,
                                    batchSystem, childJobFileToParentJob,
                                    childCounts)
                logger.info("Reissued any over long jobs")

                hasNoMissingJobs = reissueMissingJobs(updatedJobFiles,
                                                      jobBatcher, batchSystem,
                                                      childJobFileToParentJob,
                                                      childCounts, config)
                if hasNoMissingJobs:
                    timeSinceJobsLastRescued = time.time()
                else:
                    timeSinceJobsLastRescued += 60  #This means we'll try again in a minute, providing things are quiet
                logger.info("Rescued any (long) missing jobs")

    logger.info("Finished the main loop")

    if stats:
        startTime = time.time()
        logger.info("Waiting for stats collator process to finish")
        stop.put(True)
        worker.join()
        logger.info("Stats finished collating in %s seconds" %
                    (time.time() - startTime))

    return totalFailedJobs  #Returns number of failed jobs
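A hedged sketch of how mainLoop might be driven. loadBatchSystem and jobTreeDir are hypothetical placeholders; the config location helper, the log level attribute and the meaning of the return value are taken from the code above.
config = ET.parse(getConfigFileName(jobTreeDir)).getroot() #jobTreeDir: path to the job tree (hypothetical variable)
setLogLevel(config.attrib["log_level"])
batchSystem = loadBatchSystem(config) #Hypothetical helper; the real code chooses a batch system from the options
failedJobs = mainLoop(config, batchSystem)
if failedJobs > 0:
    logger.critical("The job tree finished with %i failed jobs" % failedJobs)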
Example #38
0
def qsub(qsubline):
    logger.debug("**"+" ".join(qsubline))
    process = subprocess.Popen(qsubline, stdout=subprocess.PIPE)
    result = int(process.stdout.readline().strip().split('.')[0])
    logger.debug("Got the job id: %s" % (str(result)))
    return result
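An illustrative call to the qsub() helper above; the flags and script name are generic examples rather than a fixed jobTree convention, and the returned id is the integer prefix of the queueing system's reply (e.g. "12345.master" gives 12345).
qsubline = ["qsub", "-o", "/dev/null", "-e", "/dev/null", "jobScript.sh"] #jobScript.sh is hypothetical
jobID = qsub(qsubline)
logger.info("Submitted the script as job %i" % jobID)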
Example #39
0
def main():
    sys.path.append(sys.argv[1])
    sys.argv.remove(sys.argv[1])
    
    #Now we can import all the stuff..
    from sonLib.bioio import getBasicOptionParser
    from sonLib.bioio import parseBasicOptions
    from sonLib.bioio import logger
    from sonLib.bioio import addLoggingFileHandler, redirectLoggerStreamHandlers
    from sonLib.bioio import setLogLevel
    from sonLib.bioio import getTotalCpuTime, getTotalCpuTimeAndMemoryUsage
    from sonLib.bioio import getTempDirectory
    from sonLib.bioio import makeSubDir
    from jobTree.src.job import Job
    from jobTree.src.master import getEnvironmentFileName, getConfigFileName, listChildDirs, getTempStatsFile, setupJobAfterFailure
    from sonLib.bioio import system
    
    ########################################## 
    #Input args
    ##########################################
    
    jobTreePath = sys.argv[1]
    jobFile = sys.argv[2]
    
    ##########################################
    #Load the environment for the job
    ##########################################
    
    #First load the environment for the job.
    fileHandle = open(getEnvironmentFileName(jobTreePath), 'r')
    environment = cPickle.load(fileHandle)
    fileHandle.close()
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)
    #os.environ = environment
    #os.putenv(key, value)
        
    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in.
    localSlaveTempDir = getTempDirectory()
    localTempDir = makeSubDir(os.path.join(localSlaveTempDir, "localTempDir"))
    
    ##########################################
    #Setup the logging
    ##########################################
    
    #Setup the logging. This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?    
    tempSlaveLogFile = os.path.join(localSlaveTempDir, "slave_log.txt")
    
    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)
    
    #Open the file to send stdout/stderr to.
    logDescriptor = os.open(tempSlaveLogFile, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logDescriptor, 1)
    
    #Replace standard error with a descriptor for the log file
    os.dup2(logDescriptor, 2)
    
    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...
    
    #Close the descriptor we used to open the file
    os.close(logDescriptor)
    
    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    
    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    #Put a message at the top of the log, just to make sure it's working.
    print "---JOBTREE SLAVE OUTPUT LOG---"
    sys.stdout.flush()
    
    #Log the number of open file descriptors so we can tell if we're leaking
    #them.
    logger.debug("Next available file descriptor: {}".format(
        nextOpenDescriptor()))
    
    ##########################################
    #Parse input files
    ##########################################
    
    config = ET.parse(getConfigFileName(jobTreePath)).getroot()
    setLogLevel(config.attrib["log_level"])
    job = Job.read(jobFile)
    job.messages = [] #This is the only way to stop messages logging twice, as they are only read in the master
    job.children = [] #Similarly, this is where old children are flushed out.
    job.write() #Update status, to avoid reissuing children after running a follow on below.
    if os.path.exists(job.getLogFileName()): #This cleans the old log file
        os.remove(job.getLogFileName())
    logger.info("Parsed arguments and set up logging")

     #Try loop for slave logging
    ##########################################
    #Setup the stats, if requested
    ##########################################
    
    if config.attrib.has_key("stats"):
        startTime = time.time()
        startClock = getTotalCpuTime()
        stats = ET.Element("slave")
    else:
        stats = None
    
    ##########################################
    #The max time 
    ##########################################
    
    maxTime = float(config.attrib["job_time"])
    assert maxTime > 0.0
    assert maxTime < sys.maxint

    ##########################################
    #Slave log file trapped from here on in
    ##########################################

    slaveFailed = False
    try:
        
        ##########################################
        #The next job
        ##########################################
        
        def globalTempDirName(job, depth):
            return job.getGlobalTempDirName() + str(depth)
        
        command, memoryAvailable, cpuAvailable, depth = job.followOnCommands[-1]
        defaultMemory = int(config.attrib["default_memory"])
        defaultCpu = int(config.attrib["default_cpu"])
        assert len(job.children) == 0
        
        startTime = time.time() 
        while True:
            job.followOnCommands.pop()
            
            ##########################################
            #Global temp dir
            ##########################################
            
            globalTempDir = makeSubDir(globalTempDirName(job, depth))
            i = 1
            while os.path.isdir(globalTempDirName(job, depth+i)):
                system("rm -rf %s" % globalTempDirName(job, depth+i))
                i += 1
                
            ##########################################
            #Old children, not yet deleted
            #
            #These may exist because of the lazy cleanup
            #we do
            ##########################################
        
            for childDir in listChildDirs(job.jobDir):
                logger.debug("Cleaning up old child %s" % childDir)
                system("rm -rf %s" % childDir)
        
            ##########################################
            #Run the job
            ##########################################
        
            if command != "": #Not a stub
                if command[:11] == "scriptTree ":
                    ##########################################
                    #Run the target
                    ##########################################
                    
                    loadStack(command).execute(job=job, stats=stats,
                                    localTempDir=localTempDir, globalTempDir=globalTempDir, 
                                    memoryAvailable=memoryAvailable, cpuAvailable=cpuAvailable, 
                                    defaultMemory=defaultMemory, defaultCpu=defaultCpu, depth=depth)
            
                else: #Is another command
                    system(command) 
            
            ##########################################
            #Cleanup/reset a successful job/checkpoint
            ##########################################
            
            job.remainingRetryCount = int(config.attrib["try_count"])
            system("rm -rf %s/*" % (localTempDir))
            job.update(depth=depth, tryCount=job.remainingRetryCount)
            
            ##########################################
            #Establish if we can run another job
            ##########################################
            
            if time.time() - startTime > maxTime:
                logger.info("We are breaking because the maximum time the job should run for has been exceeded")
                break
            
            #Deal with children
            if len(job.children) >= 1:  #We are going to have to return to the parent
                logger.info("No more jobs can run in series by this slave, its got %i children" % len(job.children))
                break
            
            if len(job.followOnCommands) == 0:
                logger.info("No more jobs can run by this slave as we have exhausted the follow ons")
                break
            
            #Get the next job and see if we have enough cpu and memory to run it..
            command, memory, cpu, depth = job.followOnCommands[-1]
            
            if memory > memoryAvailable:
                logger.info("We need more memory for the next job, so finishing")
                break
            if cpu > cpuAvailable:
                logger.info("We need more cpus for the next job, so finishing")
                break
            
            logger.info("Starting the next job")
        
        ##########################################
        #Finish up the stats
        ##########################################
        
        if stats != None:
            totalCpuTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            stats.attrib["time"] = str(time.time() - startTime)
            stats.attrib["clock"] = str(totalCpuTime - startClock)
            stats.attrib["memory"] = str(totalMemoryUsage)
            tempStatsFile = getTempStatsFile(jobTreePath)
            fileHandle = open(tempStatsFile + ".new", "w")
            ET.ElementTree(stats).write(fileHandle)
            fileHandle.close()
            os.rename(tempStatsFile + ".new", tempStatsFile) #This operation is atomic
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds" % (time.time() - startTime))
    
    ##########################################
    #Where slave goes wrong
    ##########################################
    except: #Case that something goes wrong in slave
        traceback.print_exc()
        logger.critical("Exiting the slave because of a failed job on host %s", socket.gethostname())
        job = Job.read(jobFile)
        setupJobAfterFailure(job, config)
        job.write()
        slaveFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the slave logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)
    
    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)
    
    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdErr, 2)
    
    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr
    
    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)
    
    #Now our file handles are in exactly the state they were in before.
    
    #Copy back the log file to the global dir, if needed
    if slaveFailed:
        truncateFile(tempSlaveLogFile)
        system("mv %s %s" % (tempSlaveLogFile, job.getLogFileName()))
    #Remove the temp dir
    system("rm -rf %s" % localSlaveTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not slaveFailed) and len(job.followOnCommands) == 0 and len(job.children) == 0 and len(job.messages) == 0:
        ##########################################
        #Cleanup global files at the end of the chain
        ##########################################
        job.delete()            
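The slave main() above captures output by swapping the OS-level file descriptors rather than reassigning sys.stdout/sys.stderr, so output from child processes is caught too. A stand-alone sketch of that pattern, with a hypothetical log path:
import os, sys

logPath = "slave_log.txt" #Hypothetical log location
origStdOut, origStdErr = os.dup(1), os.dup(2) #Keep handles on the real streams
logFd = os.open(logPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)
os.dup2(logFd, 1) #Point stdout at the log file
os.dup2(logFd, 2) #Point stderr at the log file
os.close(logFd)   #FDs 1 and 2 keep the file open

os.system("echo this output comes from a child process and is captured too")

sys.stdout.flush()
sys.stderr.flush()
os.dup2(origStdOut, 1) #Restore stdout
os.dup2(origStdErr, 2) #Restore stderr
os.close(origStdOut)
os.close(origStdErr)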
Example #40
0
def mainLoop(config, batchSystem):
    """This is the main loop from which jobs are issued and processed.
    """
    rescueJobsFrequency = float(config.attrib["rescue_jobs_frequency"])
    maxJobDuration = float(config.attrib["max_job_duration"])
    assert maxJobDuration >= 0
    logger.info("Got parameters,rescue jobs frequency: %s max job duration: %s" % \
                (rescueJobsFrequency, maxJobDuration))
    
    #Kill any jobs on the batch system queue from the last time.
    assert len(batchSystem.getIssuedJobIDs()) == 0 #Batch system must start with no active jobs!
    logger.info("Checked batch system has no running jobs and no updated jobs")
    
    childJobFileToParentJob, childCounts, updatedJobFiles = {}, {}, set()
    parseJobFiles(getJobFileDirName(config.attrib["job_tree"]), updatedJobFiles, childJobFileToParentJob, childCounts, config)
    jobBatcher = JobBatcher(config, batchSystem)
    logger.info("Found %s jobs to start and %i parent jobs with children to run" % (len(updatedJobFiles), len(childCounts)))
    
    stats = config.attrib.has_key("stats")
    if stats:
        stop = Queue()
        worker = Process(target=statsAggregatorProcess, args=(config.attrib["job_tree"], makeTemporaryStatsDirs(config.attrib["job_tree"]), stop))
        worker.daemon = True
        worker.start()
        
    timeSinceJobsLastRescued = time.time() #Sets up the timing of the job rescuing method
    totalFailedJobs = 0
    logger.info("Starting the main loop")
    while True: 
        if len(updatedJobFiles) > 0:
            logger.debug("Built the jobs list, currently have %i jobs to update and %i jobs issued" % (len(updatedJobFiles), jobBatcher.getNumberOfJobsIssued()))
        
            for job in updatedJobFiles:         
                for message in job.messages:
                    logger.critical("Got message from job at time: %s : %s" % (time.time(), message))
                job.messages = []
                
                if len(job.children) > 0:
                    logger.debug("Job: %s has %i children to schedule" % (job.getJobFileName(), len(job.children)))
                    children = job.children
                    job.children = []
                    for childJobFile, memory, cpu in children:
                        childJobFileToParentJob[childJobFile] = job
                    assert job not in childCounts
                    childCounts[job] = len(children)
                    jobBatcher.issueJobs(children)
                else:
                    assert len(job.followOnCommands) > 0
                    if job.remainingRetryCount > 0:
                        logger.debug("Job: %s has a new command that we can now issue" % job.getJobFileName())
                        memory, cpu = job.followOnCommands[-1][1:3]
                        jobBatcher.issueJob(job.getJobFileName(), memory, cpu)
                    else:
                        totalFailedJobs += 1
                        logger.critical("Job: %s is completely failed" % job.getJobFileName())
            updatedJobFiles = set() #We've considered them all, so reset
             
        if jobBatcher.getNumberOfJobsIssued() == 0:
            logger.info("Only failed jobs and their dependents (%i total) are remaining, so exiting." % totalFailedJobs)
            break 
        
        updatedJob = batchSystem.getUpdatedJob(10) #Asks the batch system what jobs have been completed.
        if updatedJob != None: 
            jobID, result = updatedJob
            if jobBatcher.hasJob(jobID): 
                if result == 0:
                    logger.debug("Batch system is reporting that the job %s ended successfully" % jobBatcher.getJob(jobID))   
                else:
                    logger.critical("Batch system is reporting that the job %s %s failed with exit value %i" % (jobID, jobBatcher.getJob(jobID), result))  
                processFinishedJob(jobID, result, updatedJobFiles, jobBatcher, childJobFileToParentJob, childCounts, config)
            else:
                logger.critical("A result seems to already have been processed: %i" % jobID)
        else:
            #logger.debug("Waited but no job was finished, still have %i jobs issued" % jobBatcher.getNumberOfJobsIssued())
            if time.time() - timeSinceJobsLastRescued >= rescueJobsFrequency: #We only rescue jobs every N seconds, and when we have apparently exhausted the current job supply
                reissueOverLongJobs(updatedJobFiles, jobBatcher, config, batchSystem, childJobFileToParentJob, childCounts)
                logger.info("Reissued any over long jobs")
                
                hasNoMissingJobs = reissueMissingJobs(updatedJobFiles, jobBatcher, batchSystem, childJobFileToParentJob, childCounts, config)
                if hasNoMissingJobs:
                    timeSinceJobsLastRescued = time.time()
                else:
                    timeSinceJobsLastRescued += 60 #This means we'll try again in a minute, providing things are quiet
                logger.info("Rescued any (long) missing jobs")
    
    logger.info("Finished the main loop")   
    
    if stats:
        startTime = time.time()
        logger.info("Waiting for stats collator process to finish")  
        stop.put(True)
        worker.join()
        logger.info("Stats finished collating in %s seconds" % (time.time() - startTime))  
    
    return totalFailedJobs #Returns number of failed jobs
Example #41
0
def main():
    sys.path.append(sys.argv[1])
    sys.argv.remove(sys.argv[1])
    
    #Now we can import all the stuff..
    from sonLib.bioio import getBasicOptionParser
    from sonLib.bioio import parseBasicOptions
    from sonLib.bioio import logger
    from sonLib.bioio import addLoggingFileHandler, redirectLoggerStreamHandlers
    from sonLib.bioio import setLogLevel
    from sonLib.bioio import getTotalCpuTime, getTotalCpuTimeAndMemoryUsage
    from sonLib.bioio import getTempDirectory
    from sonLib.bioio import makeSubDir
    from jobTree.src.job import Job
    from jobTree.src.master import getEnvironmentFileName, getConfigFileName, listChildDirs, getTempStatsFile, setupJobAfterFailure
    from sonLib.bioio import system
    
    ########################################## 
    #Input args
    ##########################################
    
    jobTreePath = sys.argv[1]
    jobFile = sys.argv[2]
    
    ##########################################
    #Load the environment for the job
    ##########################################
    
    #First load the environment for the job.
    fileHandle = open(getEnvironmentFileName(jobTreePath), 'r')
    environment = cPickle.load(fileHandle)
    fileHandle.close()
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)
    #os.environ = environment
    #os.putenv(key, value)
        
    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in.
    localSlaveTempDir = getTempDirectory()
    localTempDir = makeSubDir(os.path.join(localSlaveTempDir, "localTempDir"))
    
    ##########################################
    #Setup the logging
    ##########################################
    
    #Setup the logging
    tempSlaveLogFile = os.path.join(localSlaveTempDir, "slave_log.txt")
    slaveHandle = open(tempSlaveLogFile, 'w')
    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    logger.addHandler(logging.StreamHandler(slaveHandle))
    origStdErr = sys.stderr
    origStdOut = sys.stdout
    sys.stderr = slaveHandle 
    sys.stdout = slaveHandle
    
    ##########################################
    #Parse input files
    ##########################################
    
    config = ET.parse(getConfigFileName(jobTreePath)).getroot()
    setLogLevel(config.attrib["log_level"])
    job = Job.read(jobFile)
    job.messages = [] #This is the only way to stop messages logging twice, as they are only read in the master
    job.children = []
    if os.path.exists(job.getLogFileName()): #This cleans the old log file
        os.remove(job.getLogFileName())
    logger.info("Parsed arguments and set up logging")

     #Try loop for slave logging
    ##########################################
    #Setup the stats, if requested
    ##########################################
    
    if config.attrib.has_key("stats"):
        startTime = time.time()
        startClock = getTotalCpuTime()
        stats = ET.Element("slave")
    else:
        stats = None
    
    ##########################################
    #The max time 
    ##########################################
    
    maxTime = float(config.attrib["job_time"])
    assert maxTime > 0.0
    assert maxTime < sys.maxint

    ##########################################
    #Slave log file trapped from here on in
    ##########################################

    slaveFailed = False
    try:
        
        ##########################################
        #The next job
        ##########################################
        
        def globalTempDirName(job, depth):
            return job.getGlobalTempDirName() + str(depth)
        
        command, memoryAvailable, cpuAvailable, depth = job.followOnCommands[-1]
        defaultMemory = int(config.attrib["default_memory"])
        defaultCpu = int(config.attrib["default_cpu"])
        assert len(job.children) == 0
        
        startTime = time.time() 
        while True:
            job.followOnCommands.pop()
            
            ##########################################
            #Global temp dir
            ##########################################
            
            globalTempDir = makeSubDir(globalTempDirName(job, depth))
            i = 1
            while os.path.isdir(globalTempDirName(job, depth+i)):
                system("rm -rf %s" % globalTempDirName(job, depth+i))
                i += 1
                
            ##########################################
            #Old children, not yet deleted
            #
            #These may exist because of the lazy cleanup
            #we do
            ##########################################
        
            for childDir in listChildDirs(job.jobDir):
                logger.debug("Cleaning up old child %s" % childDir)
                system("rm -rf %s" % childDir)
        
            ##########################################
            #Run the job
            ##########################################
        
            if command != "": #Not a stub
                if command[:11] == "scriptTree ":
                    ##########################################
                    #Run the target
                    ##########################################
                    
                    loadStack(command).execute(job=job, stats=stats,
                                    localTempDir=localTempDir, globalTempDir=globalTempDir, 
                                    memoryAvailable=memoryAvailable, cpuAvailable=cpuAvailable, 
                                    defaultMemory=defaultMemory, defaultCpu=defaultCpu, depth=depth)
            
                else: #Is another command
                    system(command) 
            
            ##########################################
            #Cleanup/reset a successful job/checkpoint
            ##########################################
            
            job.remainingRetryCount = int(config.attrib["try_count"])
            system("rm -rf %s/*" % (localTempDir))
            job.update(depth=depth, tryCount=job.remainingRetryCount)
            
            ##########################################
            #Establish if we can run another job
            ##########################################
            
            if time.time() - startTime > maxTime:
                logger.info("We are breaking because the maximum time the job should run for has been exceeded")
                break
            
            #Deal with children
            if len(job.children) >= 1:  #We are going to have to return to the parent
                logger.info("No more jobs can run in series by this slave, its got %i children" % len(job.children))
                break
            
            if len(job.followOnCommands) == 0:
                logger.info("No more jobs can run by this slave as we have exhausted the follow ons")
                break
            
            #Get the next job and see if we have enough cpu and memory to run it..
            command, memory, cpu, depth = job.followOnCommands[-1]
            
            if memory > memoryAvailable:
                logger.info("We need more memory for the next job, so finishing")
                break
            if cpu > cpuAvailable:
                logger.info("We need more cpus for the next job, so finishing")
                break
            
            logger.info("Starting the next job")
        
        ##########################################
        #Finish up the stats
        ##########################################
        
        if stats != None:
            totalCpuTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            stats.attrib["time"] = str(time.time() - startTime)
            stats.attrib["clock"] = str(totalCpuTime - startClock)
            stats.attrib["memory"] = str(totalMemoryUsage)
            tempStatsFile = getTempStatsFile(jobTreePath)
            fileHandle = open(tempStatsFile + ".new", "w")
            ET.ElementTree(stats).write(fileHandle)
            fileHandle.close()
            os.rename(tempStatsFile + ".new", tempStatsFile) #This operation is atomic
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds" % (time.time() - startTime))
    
    ##########################################
    #Where slave goes wrong
    ##########################################
    except: #Case that something goes wrong in slave
        traceback.print_exc(file = slaveHandle)
        logger.critical("Exiting the slave because of a failed job on host %s", socket.gethostname())
        job = Job.read(jobFile)
        setupJobAfterFailure(job, config)
        job.write()
        slaveFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the slave logging
    slaveHandle.flush()
    sys.stderr = origStdErr
    sys.stdout = origStdOut
    redirectLoggerStreamHandlers(slaveHandle, sys.stderr)
    slaveHandle.close()
    
    #Copy back the log file to the global dir, if needed
    if slaveFailed:
        truncateFile(tempSlaveLogFile)
        system("mv %s %s" % (tempSlaveLogFile, job.getLogFileName()))
    #Remove the temp dir
    system("rm -rf %s" % localSlaveTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not slaveFailed) and len(job.followOnCommands) == 0 and len(job.children) == 0 and len(job.messages) == 0:
        ##########################################
        #Cleanup global files at the end of the chain
        ##########################################
        job.delete()            
Example #42
0
def checkArguments(args, parser):
  # check for setting
  for name, value in [('augustus_path', args.augustus_path),
                      ('hal_path', args.hal_path),
                      ('hal_file_path', args.hal_file_path),
                      ('tree_path', args.tree_path),
                      ('out_dir', args.out_dir),
                      ('sqlite_db', args.sqlite_db),
                      ('speciesfilenames', args.speciesfilenames),
                      ('ref_genome', args.ref_genome),
                      ]:
    if value is None:
      parser.error('Specify --%s' % name)
    else:
      value = os.path.abspath(value)
  # check for path existence
  for name, value in [('augustus_path', args.augustus_path),
                      ('hal_path', args.hal_path),
                      ('hal_file_path', args.hal_file_path),
                      ('tree_path', args.tree_path),
                      ('sqlite_db', args.sqlite_db),
                      ('speciesfilenames', args.speciesfilenames),
                      ]:
    if not os.path.exists(value):
      parser.error('--%s %s does not exist' % (name, value))
  if not os.path.exists(args.out_dir):
    os.mkdir(args.out_dir)
  # check for directories
  for name, value in [('augustus_path', args.augustus_path),
                      ('hal_path', args.hal_path),
                      ('out_dir', args.out_dir)]:
    if not os.path.isdir(value):
      parser.error('--%s %s is not a directory' % (name, value))
  # check for files
  for value in [os.path.join(args.augustus_path, 'bin', 'augustus'),
                os.path.join(args.hal_path, 'bin', 'hal2maf'),
                args.hal_file_path,
                args.tree_path,
                args.sqlite_db,
                args.speciesfilenames,
                ]:
    if not os.path.isfile(value):
      parser.error('%s is not a file' % value)
  # check for executability
  for value in [(os.path.join(args.augustus_path, 'bin', 'augustus')),
                      (os.path.join(args.hal_path, 'bin', 'hal2maf'))]:
    if not os.access(value, os.X_OK):
      parser.error('%s is not executable' % value)
  if args.extrinsicCfgFile is not None:
    if not os.path.exists(args.extrinsicCfgFile):
      parser.error('--extrinsicCfgFile %s does not exist'
                   % args.extrinsicCfgFile)
    if not os.path.isfile(args.extrinsicCfgFile):
      parser.error('--extrinsicCfgFile %s is not a file'
                   % args.extrinsicCfgFile)
    args.extrinsicCfgFile = os.path.abspath(args.extrinsicCfgFile)
  args.augustus_path = os.path.abspath(args.augustus_path)
  args.hal_path = os.path.abspath(args.hal_path)
  args.hal_file_path = os.path.abspath(args.hal_file_path)
  args.tree_path = os.path.abspath(args.tree_path)
  args.out_dir = os.path.abspath(args.out_dir)
  if args.window_length - args.window_overlap < 1:
    parser.error('--window_length must be greater than --window_overlap!')
  # verifyMySQLServer(args.out_dir, args)  # Verify for head node
  logger.debug('Arguments checked.\n'
               'augustus_path:%s\n'
               'hal_path:%s\n'
               'hal_file_path:%s\n'
               'tree_path:%s\n'
               'out_dir:%s\n'
               % (args.augustus_path, args.hal_path, args.hal_file_path,
                  args.tree_path, args.out_dir))
  args.calling_command = '%s' % ' '.join(sys.argv[0:])
  if args.maf_file_path is not None:
    if args.ref_sequence is None:
      parser.error('You have selected --maf_file_path, you must also specify '
                   '--ref_sequence')
    if args.window_start is None:
      parser.error('You have selected --maf_file_path, you must also specify '
                   '--window_start')
    if args.window_end is None:
      parser.error('You have selected --maf_file_path, you must also specify '
                   '--window_end')
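A hedged sketch of the option parser checkArguments() expects. The use of argparse and the defaults shown are illustrative assumptions; the option names are simply the attributes the function reads.
import argparse

parser = argparse.ArgumentParser()
for name in ['augustus_path', 'hal_path', 'hal_file_path', 'tree_path',
             'out_dir', 'sqlite_db', 'speciesfilenames', 'ref_genome',
             'extrinsicCfgFile', 'maf_file_path', 'ref_sequence']:
  parser.add_argument('--%s' % name, default=None)
parser.add_argument('--window_length', type=int, default=500000)  #Illustrative default
parser.add_argument('--window_overlap', type=int, default=100000) #Illustrative default
parser.add_argument('--window_start', type=int, default=None)
parser.add_argument('--window_end', type=int, default=None)
args = parser.parse_args()
checkArguments(args, parser)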
Example #43
0
def makeBlastScoringMatrix(hmm, sequences):
    """Converts an hmm into a lastz style scoring matrix
    """
    #convert to a three state hmm
    hmm2 = Hmm("threeState")
    hmm2.transitions = hmm.transitions[:3] + hmm.transitions[
        hmm.stateNumber * 1:hmm.stateNumber * 1 +
        3] + hmm.transitions[hmm.stateNumber * 2:hmm.stateNumber * 2 + 3]
    hmm2.emissions = hmm.emissions[:3 * SYMBOL_NUMBER**2]
    hmm2.normalise()
    hmm = hmm2

    #Get the background base distribution; assuming we include reverse complement sequences it's determined by the fraction of GCs
    gcFraction = sum(
        map(lambda x: sum(map(lambda y: 1.0 if y in 'GC' else 0.0, x)),
            sequences)) / sum(map(len, sequences))
    logger.debug(
        "Got the GC fraction in the sequences for making the scoring matrix: %s"
        % gcFraction)
    baseProb = lambda x: gcFraction / 2.0 if x in (1, 2) else (1.0 - gcFraction
                                                               ) / 2.0

    #Calculate match matrix
    logger.debug("Original match probs: %s" %
                 " ".join(map(str, hmm.emissions[:SYMBOL_NUMBER**2])))
    matchProbs = [
        hmm.emissions[x * SYMBOL_NUMBER + y] / (baseProb(x) * baseProb(y))
        for x, y in product(range(SYMBOL_NUMBER), range(SYMBOL_NUMBER))
    ]
    logger.debug("Blast emission match probs: %s" %
                 " ".join(map(str, matchProbs)))
    matchContinue = hmm.transitions[0]
    #The 6.94 is 1/100th of (minus) the sum of the default lastz scoring matrix (-694); the normalisation below makes our match scores sum to -694 as well
    nProb = math.sqrt(
        math.exp(
            (6.94 +
             sum(map(lambda x: math.log(x * matchContinue), matchProbs))) /
            len(matchProbs)))
    logger.debug("N prob is: %s" % nProb)  #Note it may go above 1!
    weight = 100
    matchProbs = map(
        lambda x: weight * math.log((x * matchContinue) / nProb**2),
        matchProbs)
    logger.debug("Blast match probs, %s: %s" %
                 (sum(matchProbs) / 4.0, " ".join(map(str, matchProbs))))

    #Calculate gap open
    gapOpen = weight*math.log((0.5 * (hmm.transitions[1]/nProb + hmm.transitions[2]/nProb)) * \
    ((hmm.transitions[hmm.stateNumber*1 + 0] + hmm.transitions[hmm.stateNumber*2 + 0])/(2*nProb**2)) * \
    ((nProb**2)/matchContinue))
    logger.debug("Gap open: %s" % gapOpen)

    #Calculate gap extend
    gapContinue = weight * math.log(
        0.5 * (hmm.transitions[hmm.stateNumber * 1 + 1] / nProb +
               hmm.transitions[hmm.stateNumber * 2 + 2] / nProb))
    logger.debug("Gap continue: %s" % gapContinue)

    return matchProbs, gapOpen, gapContinue
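A small check of the normalisation the 6.94 constant enforces: by construction the sixteen substitution scores returned above sum to 100 * -6.94 = -694, matching the sum of the default lastz matrix. Here hmm and sequences stand for real inputs, and the usual ACGT symbol ordering is assumed.
matchProbs, gapOpen, gapContinue = makeBlastScoringMatrix(hmm, sequences)
assert abs(sum(matchProbs) - (-694.0)) < 1e-3 #Follows directly from the nProb normalisation above
for x in range(SYMBOL_NUMBER):
    row = matchProbs[x * SYMBOL_NUMBER:(x + 1) * SYMBOL_NUMBER]
    print "ACGT"[x], " ".join([ "%7.2f" % score for score in row ])
print "Gap open: %.2f, gap extend: %.2f" % (gapOpen, gapContinue)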
Example #44
0
    def runProgressive(self):
        logger.debug("Going to put the alignment in %s" % self.outputDir)
        if not os.path.isdir(self.outputDir):
            os.mkdir(self.outputDir)

        if not os.path.exists(os.path.join(self.outputDir, "progressiveCactusAlignment")):
            xmlTree = ET.parse(os.path.join(getRootPathString(), "lib", "cactus_workflow_config.xml"))
            
            #Set the parameters
            tempLocalDir = os.path.join(self.outputDir, "tempProgressiveCactusAlignment")
            system("rm -rf %s" % tempLocalDir)
            os.mkdir(tempLocalDir)
            
            #Set the config parameters
            self.params.applyToXml(xmlTree)
            config = xmlTree.getroot()
            assert config is not None
            
            #Write the config file
            tempConfigFile = os.path.join(tempLocalDir, "config.xml")
            fileHandle = open(tempConfigFile, 'w')
            assert fileHandle is not None
            tree = ET.ElementTree(config)
            tree.write(fileHandle)
            fileHandle.close()
         
            #Make the experiment file
            tempExperimentFile = os.path.join(tempLocalDir, "experiment.xml")
            
            if self.params.kyotoTycoon == True:
                dbConfElem = ET.Element("st_kv_database_conf", type="kyoto_tycoon")
                ktElem = ET.SubElement(dbConfElem, "kyoto_tycoon", host="localhost", port="1978", database_dir="dummy")
            else:
                dbConfElem = None
            
            cactusWorkflowExperiment = CactusWorkflowExperiment(
                                                 sequences=self.sequences, 
                                                 newickTreeString=self.newickTree, 
                                                 #requiredSpecies=self.requiredSpecies,
                                                 #singleCopySpecies=self.singleCopySpecies,
                                                 databaseName="cactusAlignment",
                                                 outputDir=tempLocalDir,
                                                 configFile=tempConfigFile,
                                                 databaseConf = dbConfElem)
            cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile)
            
            #The jobtree
            tempJobTreeDir = os.path.join(tempLocalDir, "jobTree")
            
            #The place to put the temporary experiment dir
            tempExperimentDir = os.path.join(tempLocalDir, "progressiveCactusAlignment")
            
      
            #The temporary experiment 
            runCactusCreateMultiCactusProject(tempExperimentFile, 
                                              tempExperimentDir)
            logger.info("Setup the cactus progressive experiment")
            
            runCactusProgressive(os.path.join(tempExperimentDir, "progressiveCactusAlignment_project.xml"), 
                                 tempJobTreeDir, 
                                 #batchSystem=batchSystem, 
                                 buildMaf=True,
                                 joinMaf=True,
                                 #buildTrees=buildTrees, buildFaces=buildFaces, buildReference=buildReference,
                                 jobTreeStats=True,
                                 maxThreads=4,
                                 logLevel="DEBUG")
            logger.info("Ran the progressive workflow")
            
            #Check if the jobtree completed successfully.
            runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
            logger.info("Checked the job tree dir for the progressive run")
            
            #Run the cactus tree stats
            expPath = os.path.join(tempExperimentDir, "Anc0", "Anc0_experiment.xml")
            exp = ExperimentWrapper(ET.parse(expPath).getroot())
            if exp.getDbType() == "kyoto_tycoon":
                ktserver = KtserverLauncher()
                ktserver.spawnServer(exp) 
            treeStatsFile = os.path.join(self.outputDir, "treeStats.xml")
            system("cactus_treeStats --cactusDisk \'%s\' --flowerName 0 --outputFile %s" %(exp.getDiskDatabaseString(),
                                                                                        treeStatsFile))
            if exp.getDbType() == "kyoto_tycoon":
                ktserver.killServer(exp)
                
            #Now copy the experiment, alignment and config results back to the output directory
            system("mv %s %s/experiment.xml" % (tempExperimentFile, self.outputDir))
            system("mv %s %s" % (tempExperimentDir, self.outputDir))
            system("jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml" % (tempJobTreeDir, self.outputDir))
            system("mv %s %s/config.xml" % (tempConfigFile, self.outputDir))
            
            #But keep a link to the multicactus project in its original path so we can navigate
            # the paths in the xml...
            actualResultsDir = os.path.join(os.path.abspath(self.outputDir), "progressiveCactusAlignment")
            tempResultsDir = os.path.join(self.outputDir, "tempProgressiveCactusAlignment")
            system("ln -s %s %s" % (actualResultsDir, tempResultsDir))
Example #45
0
    def testNewickIO(self):
        # Felsenstein's own... (http://evolution.genetics.washington.edu/phylip/newicktree.html)
        tree1 = '((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997, seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201, weasel:18.87953):2.0946):3.87382,dog:25.46154);'
        tree2 = '(Bovine:0.69395,(Gibbon:0.36079,(Orang:0.33636,(Gorilla:0.17147,(Chimp:0.19268, Human:0.11927):0.08386):0.06124):0.15057):0.54939,Mouse:1.2146):0.1;'
        tree3 = '(Bovine:0.69395,(Hylobates:0.36079,(Pongo:0.33636,(G._Gorilla:0.17147, (P._paniscus:0.19268,H._sapiens:0.11927):0.08386):0.06124):0.15057):0.54939, Rodent:1.2146);'
        tree4 = 'A;'
        tree5 = '((A,B):0.0,(C,D));'
        tree6 = '(Alpha,Beta,Gamma,Delta,,Epsilon,,,);'

        trees = [tree1, tree2, tree3, tree4, tree5, tree6]
        newickParser = NXNewick()

        # Parse newicks, adding implied roots
        for tree in trees:
            newickParser.parseString(tree, addImpliedRoots=True)
            answer = self.__cleanTree(tree)
            outputString = newickParser.writeString()
            logger.debug(" ***************** ")
            logger.debug(outputString)
            logger.debug(answer)
            assert outputString == answer

        # Parse newicks, not adding implied roots
        for tree in trees:
            newickParser.parseString(tree, addImpliedRoots=False)
            outputString = newickParser.writeString()
            answer = re.sub(r':[.0-9]+?;', ';', tree)
            answer = re.sub(r'\s+', '', answer)
            logger.debug(" ***************** ")
            logger.debug(outputString)
            logger.debug(answer)
            assert outputString == answer
Example #46
0
 def __round(i):
     if i < 0:
         logger.debug("I got a less than 0 value: %s" % i)
         return 0.0
     return i
Example #47
0
def qsub(qsubline):
    logger.debug("**" + " ".join(qsubline))
    process = subprocess.Popen(qsubline, stdout=subprocess.PIPE)
    result = int(process.stdout.readline().strip().split('.')[0])
    logger.debug("Got the job id: %s" % (str(result)))
    return result