def runComparisonOfBlastScriptVsNaiveBlast(self, blastMode):
    """Check that cactus_blast gives nearly the same results as a naive blast run.

    For each ENCODE region, every unordered pair of species is aligned
    twice: once with runNaiveBlast (written to self.tempOutputFile) and
    once with runCactusBlast (written to self.tempOutputFile2). The two
    output files are then handed to compareResultsFile.

    blastMode -- "allAgainstAll" passes both sequence files to
                 runCactusBlast as one set; any other value passes the
                 first file as the sequence set and the second via
                 targetSequenceFiles.
    """
    regionNames = ["ENm00" + str(n) for n in xrange(1, 2)]  # Could go to six
    # Other species to try: "rat", "monodelphis", "macaque", "chimp"
    speciesNames = ("human", "mouse", "dog")
    for regionName in regionNames:
        regionDir = os.path.join(self.encodePath, regionName)
        # Visit each unordered pair of species exactly once.
        for position, firstSpecies in enumerate(speciesNames):
            for secondSpecies in speciesNames[position + 1:]:
                firstFasta = os.path.join(
                    regionDir, "%s.%s.fa" % (firstSpecies, regionName))
                secondFasta = os.path.join(
                    regionDir, "%s.%s.fa" % (secondSpecies, regionName))

                # Run the naive blast
                runNaiveBlast(firstFasta, secondFasta, self.tempOutputFile)
                logger.info("Ran the naive blast okay")

                # Run cactus_blast in a fresh job tree directory
                jobTreeDir = os.path.join(
                    getTempDirectory(self.tempDir), "jobTree")
                if blastMode != "allAgainstAll":
                    runCactusBlast([firstFasta], self.tempOutputFile2,
                                   jobTreeDir,
                                   chunkSize=500000, overlapSize=10000,
                                   targetSequenceFiles=[secondFasta])
                else:
                    runCactusBlast([firstFasta, secondFasta],
                                   self.tempOutputFile2, jobTreeDir,
                                   chunkSize=500000, overlapSize=10000)
                runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
                system("rm -rf %s " % jobTreeDir)
                logger.info("Ran cactus_blast okay")

                logger.critical(
                    "Comparing cactus_blast and naive blast; using mode: %s"
                    % blastMode)
                compareResultsFile(self.tempOutputFile, self.tempOutputFile2)
def testProgressiveOutgroupsVsAllOutgroups(self):
    """Compare outgroup coverage on an ingroup between the two blast modes.

    Runs the human ENm001 sequence against three outgroups, first in
    "set against set" mode and then in "ingroups vs. outgroups" mode,
    and checks that:

    * total coverage on the ingroup in ingroup-vs-outgroup mode is at
      least 95% of the coverage in set-vs-set mode;
    * coverage contributed by the last outgroup in ingroup-vs-outgroup
      mode is at most 10% of its coverage in set-vs-set mode.
    """
    encodeRegion = "ENm001"
    ingroup = "human"
    outgroups = ["macaque", "rabbit", "dog"]
    regionPath = os.path.join(self.encodePath, encodeRegion)

    def fastaPath(speciesName):
        # Path of the FASTA file for one species in this region.
        return os.path.join(regionPath,
                            speciesName + "." + encodeRegion + ".fa")

    ingroupPath = fastaPath(ingroup)
    outgroupPaths = [fastaPath(name) for name in outgroups]
    lastOutgroup = outgroups[-1]

    # Shell pipelines that total the covered bases (column 3 minus
    # column 2 of each cactus_coverage output line).
    totalCoverageCmd = "cactus_coverage %s %s | awk '{ total += $3 - $2} END { print total }'"
    lastOutgroupCoverageCmd = "grep %s %s | cactus_coverage %s /dev/stdin | awk '{ total += $3 - $2} END { print total }'"

    # Run in "set against set" mode, aligning the entire ingroup
    # vs each outgroup
    setVsSetJobTree = os.path.join(self.tempDir, "setVsSetJobTree")
    runCactusBlast([ingroupPath], self.tempOutputFile, setVsSetJobTree,
                   chunkSize=500000, overlapSize=10000,
                   targetSequenceFiles=outgroupPaths)

    # Run in "ingroup vs outgroups" mode, aligning the ingroup vs
    # the outgroups in order, trimming away sequence that's
    # already been aligned.
    system("cactus_blast.py --ingroups %s --outgroups %s --cigars %s --jobTree %s/outgroupJobTree"
           % (ingroupPath, ",".join(outgroupPaths),
              self.tempOutputFile2, self.tempDir))

    # Get the coverage on the ingroup, in bases, from each run.
    coverageSetVsSet = int(popenCatch(
        totalCoverageCmd % (ingroupPath, self.tempOutputFile)))
    coverageIngroupVsOutgroups = int(popenCatch(
        totalCoverageCmd % (ingroupPath, self.tempOutputFile2)))

    print("total coverage on human (set vs set mode, %d outgroups): %d"
          % (len(outgroups), coverageSetVsSet))
    print("total coverage on human (ingroup vs outgroup mode, %d outgroups): %d"
          % (len(outgroups), coverageIngroupVsOutgroups))

    # Make sure we're getting a reasonable fraction of the
    # alignments when using the trimming strategy.
    overallFraction = float(coverageIngroupVsOutgroups) / coverageSetVsSet
    self.assertTrue(overallFraction >= 0.95)

    # Get the coverage on the ingroup, in bases, from just the
    # last outgroup. Obviously this should be much higher in set
    # vs set mode than in ingroup vs outgroup mode.
    coverageFromLastOutgroupSetVsSet = int(popenCatch(
        lastOutgroupCoverageCmd
        % (lastOutgroup, self.tempOutputFile, ingroupPath)))
    coverageFromLastOutgroupInVsOut = int(popenCatch(
        lastOutgroupCoverageCmd
        % (lastOutgroup, self.tempOutputFile2, ingroupPath)))

    print("total coverage on human from last outgroup in set (%s) (set vs set mode): %d"
          % (lastOutgroup, coverageFromLastOutgroupSetVsSet))
    print("total coverage on human from last outgroup in set (%s) (ingroup vs outgroup mode): %d"
          % (lastOutgroup, coverageFromLastOutgroupInVsOut))

    lastOutgroupFraction = (float(coverageFromLastOutgroupInVsOut)
                            / coverageFromLastOutgroupSetVsSet)
    self.assertTrue(lastOutgroupFraction <= 0.10)
def testBlastRandom(self):
    """Run cactus_blast on random sequences with random parameters.

    For each of self.testNo rounds, writes between 0 and 9 mutated
    (and randomly reverse-complemented) copies of one random 8000-base
    sequence to a FASTA file, runs runCactusBlast on it with a random
    chunk size and overlap size, and fails if the job tree did not
    complete.
    """
    tempSeqFile = os.path.join(self.tempDir, "tempSeq.fa")
    self.tempFiles.append(tempSeqFile)
    for test in xrange(self.testNo):
        # seqNo may be 0, in which case an empty FASTA file is written.
        seqNo = random.choice(xrange(0, 10))
        baseSeq = getRandomSequence(8000)[1]
        # Build every mutated copy before writing anything, so the
        # sequence of random draws matches mutate-all-then-flip order.
        mutatedSeqs = [mutateSequence(baseSeq, 0.3 * random.random())
                       for _ in xrange(seqNo)]
        # The with-block guarantees the handle is closed even if
        # writing one of the sequences raises.
        with open(tempSeqFile, 'w') as fileHandle:
            for index, mutatedSeq in enumerate(mutatedSeqs):
                if random.random() > 0.5:
                    mutatedSeq = reverseComplement(mutatedSeq)
                fastaWrite(fileHandle, str(index), mutatedSeq)

        chunkSize = random.choice(xrange(500, 9000))
        overlapSize = random.choice(xrange(2, 100))
        jobTreeDir = os.path.join(getTempDirectory(self.tempDir), "jobTree")
        runCactusBlast([tempSeqFile], self.tempOutputFile, jobTreeDir,
                       chunkSize, overlapSize)
        runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
        if getLogLevelString() == "DEBUG":
            # Dump the alignments to the console when debugging.
            system("cat %s" % self.tempOutputFile)
        system("rm -rf %s " % jobTreeDir)