def testScriptTree_Example(self):
    """Exercise the scriptTree Target wrapper through scriptTreeTest_Wrapper.py.

    The wrapper script is launched self.testNo times against
    self.jobTreeDir; after each launch the job tree is checked with
    runJobTreeStatusAndFailIfNotComplete.
    """
    commandTemplate = "scriptTreeTest_Wrapper.py --jobTree %s --logLevel=DEBUG --retryCount=99"
    for _ in xrange(self.testNo):
        system(commandTemplate % self.jobTreeDir)
        runJobTreeStatusAndFailIfNotComplete(self.jobTreeDir)
def testScriptTree_Example2(self):
    """Check the global and local temp dirs of a job via scriptTreeTest_Wrapper2.py.

    The wrapper script is launched self.testNo times against
    self.jobTreeDir; after each launch the job tree is checked with
    runJobTreeStatusAndFailIfNotComplete.
    """
    commandTemplate = "scriptTreeTest_Wrapper2.py --jobTree %s --logLevel=DEBUG --retryCount=99"
    for _ in xrange(self.testNo):
        system(commandTemplate % self.jobTreeDir)
        runJobTreeStatusAndFailIfNotComplete(self.jobTreeDir)
def runComparisonOfBlastScriptVsNaiveBlast(self, blastMode):
    """Compare cactus_blast output with a naive blast run on the same inputs.

    For every unordered pair of species in each encode region, the naive
    blast writes to self.tempOutputFile and cactus_blast writes to
    self.tempOutputFile2; the two files are then handed to
    compareResultsFile.

    blastMode -- "allAgainstAll" passes both sequence files as one input
                 set; any other value blasts the first file against the
                 second via targetSequenceFiles.
    """
    # Other species to try: "rat", "monodelphis", "macaque", "chimp"
    species = ("human", "mouse", "dog")
    # Only the first region is used; the range could go up to six.
    encodeRegions = ["ENm00" + str(regionNo) for regionNo in xrange(1, 2)]

    for encodeRegion in encodeRegions:
        regionPath = os.path.join(self.encodePath, encodeRegion)
        for index, species1 in enumerate(species):
            seqFile1 = os.path.join(regionPath, "%s.%s.fa" % (species1, encodeRegion))
            # Pair species1 only with the species that follow it, so each
            # unordered pair is visited exactly once.
            for species2 in species[index + 1:]:
                seqFile2 = os.path.join(regionPath, "%s.%s.fa" % (species2, encodeRegion))

                # Reference result from the naive blast.
                runNaiveBlast(seqFile1, seqFile2, self.tempOutputFile)
                logger.info("Ran the naive blast okay")

                # Result from cactus_blast, using a fresh job tree directory.
                jobTreeDir = os.path.join(getTempDirectory(self.tempDir), "jobTree")
                if blastMode != "allAgainstAll":
                    runCactusBlast([seqFile1], self.tempOutputFile2, jobTreeDir,
                                   chunkSize=500000, overlapSize=10000,
                                   targetSequenceFiles=[seqFile2])
                else:
                    runCactusBlast([seqFile1, seqFile2], self.tempOutputFile2, jobTreeDir,
                                   chunkSize=500000, overlapSize=10000)
                runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
                system("rm -rf %s " % jobTreeDir)
                logger.info("Ran cactus_blast okay")

                logger.critical("Comparing cactus_blast and naive blast; using mode: %s" % blastMode)
                compareResultsFile(self.tempOutputFile, self.tempOutputFile2)
def run(self):
    """Build the cactus alignment for the assembly, then schedule MakeStats1.

    If <outputDir>/cactusAlignment does not exist yet, the assembly file
    is copied (and gunzipped when its name ends in '.gz') into the local
    temp dir, an experiment file is written, the cactus workflow is run
    there, and the contents of the local temp dir are moved to
    self.outputDir. A MakeStats1 child target is added in either case.
    """
    cactusAlignmentName = "cactusAlignment"
    cactusAlignment = os.path.join(self.outputDir, cactusAlignmentName)

    if not os.path.exists(cactusAlignment):
        # Take a private copy of the assembly, decompressing it if needed.
        isGzipped = self.assemblyFile.endswith('.gz')
        tempAssemblyFile = getTempFile(rootDir=self.getLocalTempDir(),
                                       suffix=".gz" if isGzipped else "")
        system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))
        if isGzipped:
            system("gunzip %s" % tempAssemblyFile)
            # gunzip leaves the decompressed file under the name without '.gz'.
            tempAssemblyFile = tempAssemblyFile[:-3]
            assert os.path.exists(tempAssemblyFile)

        # Supporting temporary files.
        tempExperimentFile = getTempFile(rootDir=self.getLocalTempDir())
        tempJobTreeDir = os.path.join(self.getLocalTempDir(), "jobTree")

        # Describe the experiment: the haplotypes plus the assembly copy.
        experiment = ExperimentWrapper.createExperimentWrapper(
            sequences=self.haplotypeSequences + [tempAssemblyFile],
            newickTreeString=self.newickTree,
            outputDir=self.getLocalTempDir(),
            configFile=self.configFile)
        experiment.setDbName(cactusAlignmentName)
        # The db dir needs to be set so the database is put in the right directory.
        experiment.setDbDir(os.path.join(self.getLocalTempDir(), experiment.getDbName()))
        experiment.writeXML(tempExperimentFile)

        # Run the cactus workflow and make sure the job tree finished.
        runCactusWorkflow(experimentFile=tempExperimentFile,
                          jobTreeDir=tempJobTreeDir,
                          buildAvgs=False,
                          buildReference=True,
                          batchSystem="single_machine",
                          maxThreads=1,
                          jobTreeStats=True)
        logger.info("Ran the workflow")
        runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
        logger.info("Checked the job tree dir")

        # Compute the job tree stats.
        # cactusAlignmentDir is not read by anything below; it is kept so the
        # sequence of getLocalTempDir() calls matches the original.
        cactusAlignmentDir = os.path.join(self.getLocalTempDir(), cactusAlignmentName)
        tempJobTreeStatsFile = os.path.join(self.getLocalTempDir(), "jobTreeStats.xml")
        system("jobTreeStats --jobTree %s --outputFile %s" % (tempJobTreeDir, tempJobTreeStatsFile))

        # Move everything from the local temp dir back to the output dir.
        system("mv %s/* %s" % (self.getLocalTempDir(), self.outputDir))
        assert os.path.exists(cactusAlignment)

    # Alignment is in place: schedule the stats target.
    self.addChildTarget(MakeStats1(self.outputDir, cactusAlignment, self.options))
def run(self):
    """Create the cactus alignment if it is missing and queue MakeStats1.

    When <outputDir>/cactusAlignment is absent, the assembly is copied
    into the local temp dir (gunzipped when its name ends in '.gz'), the
    cactus workflow is run on the haplotype sequences plus that copy, and
    the contents of the local temp dir are moved into self.outputDir.
    MakeStats1 is added as a child target whether or not the alignment
    had to be built.
    """
    cactusAlignmentName = "cactusAlignment"
    cactusAlignment = os.path.join(self.outputDir, cactusAlignmentName)

    alignmentMissing = not os.path.exists(cactusAlignment)
    if alignmentMissing:
        # Prepare the assembly: first copy it, then decompress if required.
        gzipped = self.assemblyFile.endswith('.gz')
        if gzipped:
            copySuffix = ".gz"
        else:
            copySuffix = ""
        assemblyCopy = getTempFile(rootDir=self.getLocalTempDir(), suffix=copySuffix)
        system("cp %s %s" % (self.assemblyFile, assemblyCopy))
        if gzipped:
            system("gunzip %s" % assemblyCopy)
            # Drop the '.gz' extension to get the decompressed file's name.
            assemblyCopy = assemblyCopy[:-3]
            assert os.path.exists(assemblyCopy)

        # Make the supporting temporary files.
        experimentFile = getTempFile(rootDir=self.getLocalTempDir())
        jobTreeDir = os.path.join(self.getLocalTempDir(), "jobTree")

        # Make the experiment file.
        workflowExperiment = ExperimentWrapper.createExperimentWrapper(
            sequences=self.haplotypeSequences + [assemblyCopy],
            newickTreeString=self.newickTree,
            outputDir=self.getLocalTempDir(),
            configFile=self.configFile)
        workflowExperiment.setDbName(cactusAlignmentName)
        # This needs to be set to ensure the database lands in the right directory.
        dbDir = os.path.join(self.getLocalTempDir(), workflowExperiment.getDbName())
        workflowExperiment.setDbDir(dbDir)
        workflowExperiment.writeXML(experimentFile)

        # Now run the cactus workflow.
        runCactusWorkflow(experimentFile=experimentFile,
                          jobTreeDir=jobTreeDir,
                          buildAvgs=False,
                          buildReference=True,
                          batchSystem="single_machine",
                          maxThreads=1,
                          jobTreeStats=True)
        logger.info("Ran the workflow")

        # Check that the job tree completed successfully.
        runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
        logger.info("Checked the job tree dir")

        # Compute the stats.
        # cactusAlignmentDir is unused below; the assignment is retained so
        # the getLocalTempDir() call sequence is unchanged.
        cactusAlignmentDir = os.path.join(self.getLocalTempDir(), cactusAlignmentName)
        jobTreeStatsFile = os.path.join(self.getLocalTempDir(), "jobTreeStats.xml")
        system("jobTreeStats --jobTree %s --outputFile %s" % (jobTreeDir, jobTreeStatsFile))

        # Now move the results from the local temp dir to the output dir.
        system("mv %s/* %s" % (self.getLocalTempDir(), self.outputDir))
        assert os.path.exists(cactusAlignment)

    # We're done with the alignment; hand over to the stats target.
    statsTarget = MakeStats1(self.outputDir, cactusAlignment, self.options)
    self.addChildTarget(statsTarget)
def progressiveFunction(self, experimentFile, jobTreeDir, batchSystem, buildAvgs,
                        buildReference, buildHal, buildFasta, jobTreeStats,
                        subtreeRoot=None):
    """Run the progressive cactus pipeline for one experiment file.

    A multi-cactus project is created from experimentFile inside a fresh
    temporary directory under the current working directory,
    runCactusProgressive is run on its "exp_project.xml", the job tree is
    checked for completion, and the temporary directory is removed.

    buildReference, buildHal and buildFasta are accepted but are not used
    by this body. subtreeRoot is forwarded as `root` to
    runCactusCreateMultiCactusProject.
    """
    scratchDir = getTempDirectory(os.getcwd())
    projectDir = os.path.join(scratchDir, "exp")

    runCactusCreateMultiCactusProject(experimentFile, projectDir,
                                      fixNames=False, root=subtreeRoot)
    logger.info("Put the temporary files in %s" % projectDir)

    projectFile = os.path.join(projectDir, "exp_project.xml")
    runCactusProgressive(projectFile, jobTreeDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         jobTreeStats=jobTreeStats)
    runJobTreeStatusAndFailIfNotComplete(jobTreeDir)

    # Only reached when everything above succeeded.
    system("rm -rf %s" % scratchDir)
def testBlastRandom(self):
    """Make some sequences, put them in a file, call blast with random
    parameters and check it runs okay.

    Each of the self.testNo iterations writes between 0 and 9 mutated
    (and randomly reverse-complemented) copies of one random sequence to
    <self.tempDir>/tempSeq.fa, runs cactus_blast on that file with a
    random chunk size and overlap size, and verifies that the job tree
    completed.
    """
    tempSeqFile = os.path.join(self.tempDir, "tempSeq.fa")
    self.tempFiles.append(tempSeqFile)
    for test in xrange(self.testNo):
        seqNo = random.choice(xrange(0, 10))
        baseSeq = getRandomSequence(8000)[1]
        # Build all mutated copies up front, each derived from the same base
        # sequence. Distinct names avoid the loop variable shadowing the base.
        mutatedSeqs = [(str(i), mutateSequence(baseSeq, 0.3 * random.random()))
                       for i in xrange(seqNo)]
        fileHandle = open(tempSeqFile, 'w')
        try:
            for fastaHeader, mutatedSeq in mutatedSeqs:
                if random.random() > 0.5:
                    mutatedSeq = reverseComplement(mutatedSeq)
                fastaWrite(fileHandle, fastaHeader, mutatedSeq)
        finally:
            # Close the handle even when writing fails, so it is not leaked.
            fileHandle.close()
        chunkSize = random.choice(xrange(500, 9000))
        overlapSize = random.choice(xrange(2, 100))
        jobTreeDir = os.path.join(getTempDirectory(self.tempDir), "jobTree")
        runCactusBlast([tempSeqFile], self.tempOutputFile, jobTreeDir, chunkSize, overlapSize)
        runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
        if getLogLevelString() == "DEBUG":
            system("cat %s" % self.tempOutputFile)
        system("rm -rf %s " % jobTreeDir)
def runWorkflow_TestScript(sequences, newickTreeString,
                           outputDir=None,
                           batchSystem="single_machine",
                           buildAvgs=False, buildReference=False,
                           buildHal=False, buildFasta=False,
                           configFile=None,
                           buildJobTreeStats=False,
                           constraints=None,
                           progressive=False,
                           cactusWorkflowFunction=runCactusWorkflow):
    """Runs the workflow and various downstream utilities.

    sequences -- sequence dirs/files (strings; they are joined for logging).
    newickTreeString -- the tree, as a newick string.
    outputDir -- directory that receives "experiment.xml" and the
        "jobTree" dir. Mandatory despite the None default: an
        AssertionError is raised when it is None.
    batchSystem, buildAvgs, buildReference, buildHal, buildFasta --
        forwarded to cactusWorkflowFunction.
    configFile, constraints, progressive -- forwarded to
        getCactusWorkflowExperimentForTest.
    buildJobTreeStats -- forwarded as jobTreeStats; when true,
        "jobTreeStats.xml" is additionally written to outputDir.
    cactusWorkflowFunction -- callable invoked as
        f(experimentFile, jobTreeDir, **options) to run the workflow.

    Returns the experiment object so the calling function can clean up.
    The job tree dir and the experiment file are removed before returning.
    """
    logger.info("Running cactus workflow test script")
    logger.info("Got the following sequence dirs/files: %s" % " ".join(sequences))
    logger.info("Got the following tree %s" % newickTreeString)

    #Setup the output dir
    # Identity test: None is a singleton, and `!= None` can be overridden
    # by an object's comparison methods.
    assert outputDir is not None
    logger.info("Using the output dir: %s" % outputDir)

    #Setup the flower disk.
    experiment = getCactusWorkflowExperimentForTest(sequences, newickTreeString,
                                                    outputDir=outputDir,
                                                    configFile=configFile,
                                                    constraints=constraints,
                                                    progressive=progressive)
    experiment.cleanupDb()
    experimentFile = os.path.join(outputDir, "experiment.xml")
    experiment.writeXML(experimentFile)
    logger.info("The experiment file %s\n" % experimentFile)

    #Setup the job tree dir.
    jobTreeDir = os.path.join(outputDir, "jobTree")
    logger.info("Got a job tree dir for the test: %s" % jobTreeDir)

    #Run the actual workflow
    cactusWorkflowFunction(experimentFile, jobTreeDir,
                           batchSystem=batchSystem,
                           buildAvgs=buildAvgs,
                           buildReference=buildReference,
                           buildHal=buildHal,
                           buildFasta=buildFasta,
                           jobTreeStats=buildJobTreeStats)
    logger.info("Ran the the workflow")

    #Check if the jobtree completed successfully.
    runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
    logger.info("Checked the job tree dir")

    #Now run various utilities..
    if buildJobTreeStats:
        jobTreeStatsFile = os.path.join(outputDir, "jobTreeStats.xml")
        runJobTreeStats(jobTreeDir, jobTreeStatsFile)

    #Now remove everything we generate
    system("rm -rf %s %s" % (jobTreeDir, experimentFile))

    #Return so calling function can cleanup
    return experiment
def testCactusWorkflow_Blanchette(self):
    """Runs the workflow on blanchette's simulated (colinear) regions.

    The test returns immediately when SON_TRACE_DATASETS is not set in
    the environment. Otherwise each of the self.testNo iterations cuts a
    random 5000-column window out of the true alignment, writes the
    ungapped sequences of the first nine rows to FASTA files, and runs
    the cactus workflow on them. When mfaToMaf, cactus_MAFGenerator,
    mafComparator and cactus_treeStats are all runnable, the resulting
    MAF is also compared against the true alignment.
    """
    if "SON_TRACE_DATASETS" not in os.environ:
        return
    for test in xrange(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        trueAlignment = os.path.join(TestStatus.getPathToDataSets(),
                                     "blanchettesSimulation", "00.job", "true.mfa")

        #Load the true alignment.
        columnAlignment = list(fastaAlignmentRead(trueAlignment))
        fastaHeaders = list(fastaReadHeaders(trueAlignment))
        sequenceNumber = 9

        #The tree
        newickTreeString = "((((HUMAN:0.006969, CHIMP:0.009727):0.025291, BABOON:0.044568):0.11,(RAT:0.072818, MOUSE:0.081244):0.260342):0.023260,((DOG:0.07, CAT:0.07):0.087381,(PIG:0.06, COW:0.06):0.104728):0.04);"

        #Get random dir
        testDir = getTempDirectory(tempDir)

        #random alignment
        alignmentLength = 5000
        randomStart = random.choice(xrange(len(columnAlignment) - alignmentLength))
        subAlignment = columnAlignment[randomStart:randomStart + alignmentLength]
        logger.info("Got a sub alignment, it is %i columns long" % len(subAlignment))

        #Get sequences: each row of the window with its gap characters removed.
        sequences = [(fastaHeaders[seqNo],
                      "".join([column[seqNo] for column in subAlignment
                               if column[seqNo] != '-']))
                     for seqNo in xrange(sequenceNumber)]
        logger.info("Got the sequences")

        #Write sequences into temp files
        tempFastaFiles = []
        for seqNo, (header, sequence) in enumerate(sequences):
            logger.info("Making temp file for header: %s, seq: %s" % (header, sequence))
            tempFastaFile = os.path.join(testDir, "%i.fa" % seqNo)
            tempFastaFiles.append(tempFastaFile)
            fileHandle = open(tempFastaFile, "w")
            try:
                fastaWrite(fileHandle, header, sequence)
            finally:
                # Close the handle even if the write fails.
                fileHandle.close()
        logger.info("Got the temp sequence files")

        experiment = getCactusWorkflowExperimentForTest(tempFastaFiles, newickTreeString, testDir)
        experimentFile = os.path.join(testDir, "experiment.xml")
        experiment.writeXML(experimentFile)
        cactusDiskDatabaseString = experiment.getDiskDatabaseString()

        jobTree = os.path.join(testDir, "jobTree")

        runCactusWorkflow(experimentFile, jobTree)
        logger.info("Ran the the workflow")

        #Check the output alignment
        runJobTreeStatusAndFailIfNotComplete(jobTree)
        logger.info("Checked the job tree dir")

        #Output the 'TRUE' alignment file; only done when every external
        #tool used below can be run.
        if (os.system("mfaToMaf --help > /dev/null 2>&1") == 0 and
                os.system("cactus_MAFGenerator --help > /dev/null 2>&1") == 0 and
                os.system("mafComparator --help > /dev/null 2>&1") == 0 and
                os.system("cactus_treeStats --help > /dev/null 2>&1") == 0):
            trueMFAFile = os.path.join(testDir, "true.mfa")
            fastaAlignmentWrite(subAlignment, fastaHeaders, len(fastaHeaders), trueMFAFile)
            trueMAFFile = os.path.join(testDir, "true.maf")
            system("mfaToMaf --mfaFile %s --outputFile %s --logLevel %s" %
                   (trueMFAFile, trueMAFFile, getLogLevelString()))
            system("cat %s" % trueMAFFile)

            #Now get mafs for the region.
            mAFFile = os.path.join(testDir, "flower.maf")
            system("cactus_MAFGenerator --flowerName 0 --cactusDisk '%s' --outputFile %s --logLevel %s" %
                   (cactusDiskDatabaseString, mAFFile, getLogLevelString()))
            logger.info("Got the MAFs from the flower disk")
            system("cat %s" % mAFFile)

            statsFile = os.path.join(testDir, "stats.xml")
            system("cactus_treeStats --cactusDisk '%s' --flowerName 0 --outputFile %s --logLevel %s" %
                   (cactusDiskDatabaseString, statsFile, getLogLevelString()))
            system("cat %s" % statsFile)
            logger.info("Got the cactus tree stats")

            #Now compare the mafs to the output.
            resultsFile = os.path.join(testDir, "results.xml")
            system("mafComparator --mafFile1 %s --mafFile2 %s --outputFile %s --logLevel %s" %
                   (trueMAFFile, mAFFile, resultsFile, getLogLevelString()))
            logger.info("Ran the maf comparator")
            system("cat %s" % resultsFile)

        #Cleanup
        experiment.cleanupDb()
        system("rm -rf %s" % testDir)
        logger.info("Successfully ran test for the problem")

        # testDir lives inside tempDir; removing tempDir clears what is left.
        system("rm -rf %s" % tempDir)