Example #1
0
 def runComparisonOfBlastScriptVsNaiveBlast(self, blastMode):
     """We compare the output with a naive run of the blast program, to check the results are nearly
     equivalent.
     """
     encodeRegions = [ "ENm00" + str(i) for i in xrange(1,2) ] #, 2) ] #Could go to six
     species = ("human", "mouse", "dog")
     #Other species to try "rat", "monodelphis", "macaque", "chimp"
     for encodeRegion in encodeRegions:
         regionPath = os.path.join(self.encodePath, encodeRegion)
         for i in xrange(len(species)):
             species1 = species[i]
             for species2 in species[i+1:]:
                 seqFile1 = os.path.join(regionPath, "%s.%s.fa" % (species1, encodeRegion))
                 seqFile2 = os.path.join(regionPath, "%s.%s.fa" % (species2, encodeRegion))
                 
                 #Run the random
                 runNaiveBlast(seqFile1, seqFile2, self.tempOutputFile)
                 logger.info("Ran the naive blast okay")
                 
                 #Run the blast
                 jobTreeDir = os.path.join(getTempDirectory(self.tempDir), "jobTree")
                 if blastMode == "allAgainstAll":
                     runCactusBlast([ seqFile1, seqFile2 ], self.tempOutputFile2, jobTreeDir,
                                    chunkSize=500000, overlapSize=10000)
                 else:
                     runCactusBlast([ seqFile1 ], self.tempOutputFile2, jobTreeDir,
                                    chunkSize=500000, overlapSize=10000, targetSequenceFiles=[ seqFile2 ])
                 runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
                 system("rm -rf %s " % jobTreeDir)    
                 logger.info("Ran cactus_blast okay")
                 logger.critical("Comparing cactus_blast and naive blast; using mode: %s" % blastMode)
                 compareResultsFile(self.tempOutputFile, self.tempOutputFile2)
Example #2
0
    def testProgressiveOutgroupsVsAllOutgroups(self):
        """Tests the difference in outgroup coverage on an ingroup when
        running in "ingroups vs. outgroups" mode and "set against set"
        mode.
        """
        encodeRegion = "ENm001"
        ingroup = "human"
        outgroups = ["macaque", "rabbit", "dog"]
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPath = os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa")
        outgroupPaths = map(lambda x: os.path.join(regionPath, x + "." + encodeRegion + ".fa"), outgroups)
        # Run in "set against set" mode, aligning the entire ingroup
        # vs each outgroup
        runCactusBlast([ingroupPath], self.tempOutputFile, os.path.join(self.tempDir, "setVsSetJobTree"),
                       chunkSize=500000, overlapSize=10000,
                       targetSequenceFiles=outgroupPaths)
        # Run in "ingroup vs outgroups" mode, aligning the ingroup vs
        # the outgroups in order, trimming away sequence that's
        # already been aligned.
        system("cactus_blast.py --ingroups %s --outgroups %s --cigars %s --jobTree %s/outgroupJobTree" % (ingroupPath, ",".join(outgroupPaths), self.tempOutputFile2, self.tempDir))

        # Get the coverage on the ingroup, in bases, from each run.
        coverageSetVsSet = int(popenCatch("cactus_coverage %s %s | awk '{ total +=  $3 - $2} END { print total }'" % (ingroupPath, self.tempOutputFile)))
        coverageIngroupVsOutgroups = int(popenCatch("cactus_coverage %s %s | awk '{ total +=  $3 - $2} END { print total }'" % (ingroupPath, self.tempOutputFile2)))

        print "total coverage on human (set vs set mode, %d outgroups): %d" % (len(outgroups), coverageSetVsSet)
        print "total coverage on human (ingroup vs outgroup mode, %d outgroups): %d" % (len(outgroups), coverageIngroupVsOutgroups)

        # Make sure we're getting a reasonable fraction of the
        # alignments when using the trimming strategy.
        self.assertTrue(float(coverageIngroupVsOutgroups)/coverageSetVsSet >= 0.95)

        # Get the coverage on the ingroup, in bases, from just the
        # last outgroup. Obviously this should be much higher in set
        # vs set mode than in ingroup vs outgroup mode.
        coverageFromLastOutgroupSetVsSet = int(popenCatch("grep %s %s | cactus_coverage %s /dev/stdin | awk '{ total +=  $3 - $2} END { print total }'" % (outgroups[-1], self.tempOutputFile, ingroupPath)))
        coverageFromLastOutgroupInVsOut = int(popenCatch("grep %s %s | cactus_coverage %s /dev/stdin | awk '{ total +=  $3 - $2} END { print total }'" % (outgroups[-1], self.tempOutputFile2, ingroupPath)))

        print "total coverage on human from last outgroup in set (%s) (set vs set mode): %d" % (outgroups[-1], coverageFromLastOutgroupSetVsSet)
        print "total coverage on human from last outgroup in set (%s) (ingroup vs outgroup mode): %d" % (outgroups[-1], coverageFromLastOutgroupInVsOut)

        self.assertTrue(float(coverageFromLastOutgroupInVsOut)/coverageFromLastOutgroupSetVsSet <= 0.10)
Example #3
0
 def testBlastRandom(self):
     """Make some sequences, put them in a file, call blast with random parameters 
     and check it runs okay.
     """
     tempSeqFile = os.path.join(self.tempDir, "tempSeq.fa")
     self.tempFiles.append(tempSeqFile)
     for test in xrange(self.testNo):
         seqNo = random.choice(xrange(0, 10))
         seq = getRandomSequence(8000)[1]
         fileHandle = open(tempSeqFile, 'w')
         for fastaHeader, seq in [ (str(i), mutateSequence(seq, 0.3*random.random())) for i in xrange(seqNo) ]:
             if random.random() > 0.5:
                 seq = reverseComplement(seq)
             fastaWrite(fileHandle, fastaHeader, seq)
         fileHandle.close()
         chunkSize = random.choice(xrange(500, 9000))
         overlapSize = random.choice(xrange(2, 100))
         jobTreeDir = os.path.join(getTempDirectory(self.tempDir), "jobTree")
         runCactusBlast([ tempSeqFile ], self.tempOutputFile, jobTreeDir, chunkSize, overlapSize)
         runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
         if getLogLevelString() == "DEBUG":
             system("cat %s" % self.tempOutputFile)
         system("rm -rf %s " % jobTreeDir)