Example #1
0
    def testKeepingCoverageOnIngroups(self):
        """Check that the ingroup coverage files kept by the blast run
        match coverage computed independently from its alignments.

        Blast is run in "ingroup vs outgroups" mode on ENm001 with one
        output path per outgroup (outgroupFragmentPaths) and one per
        ingroup (ingroupCoveragePaths). For every ingroup, coverage from
        the combined outgroup fragments is then recomputed with
        calculateCoverage and must be byte-identical to the kept file.
        """
        encodeRegion = "ENm001"
        ingroups = ["human", "cow"]
        outgroups = ["macaque", "rabbit", "dog"]
        regionPath = os.path.join(self.encodePath, encodeRegion)
        # Real lists (rather than map results) because the paths are
        # handed to the blast run and then iterated again below.
        ingroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                        for name in ingroups]
        outgroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                         for name in outgroups]
        # Run blast in "ingroup vs outgroups" mode, requesting to keep
        # the bed files that show outgroup coverage on the ingroup.
        toilDir = os.path.join(self.tempDir, "tmp_toil")
        outgroupFragmentPaths = [getTempFile(rootDir=self.tempDir) for _ in outgroups]
        ingroupCoveragePaths = [getTempFile(rootDir=self.tempDir) for _ in ingroups]
        runCactusBlastIngroupsAndOutgroups(ingroups=ingroupPaths,
                                           outgroups=outgroupPaths,
                                           alignmentsFile=self.tempOutputFile,
                                           outgroupFragmentPaths=outgroupFragmentPaths,
                                           ingroupCoveragePaths=ingroupCoveragePaths,
                                           toilDir=toilDir)

        # To filter out alignments from the other ingroup and
        # self-alignments we need a fasta with all the outgroup
        # fragments in it. It does not depend on the ingroup, so it is
        # built once up front.
        outgroupsCombined = getTempFile(rootDir=self.tempDir)
        for outgroupFragmentPath in outgroupFragmentPaths:
            system("cat %s >> %s" % (outgroupFragmentPath, outgroupsCombined))

        # The kept coverage files are positional: entry i of
        # ingroupCoveragePaths belongs to entry i of ingroupPaths.
        for ingroupPath, keptCoverageFile in zip(ingroupPaths, ingroupCoveragePaths):
            # Get the coverage from the outgroups independently and
            # check that it's the same as the kept file.
            independentCoverageFile = getTempFile(rootDir=self.tempDir)
            coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
            calculateCoverage(work_dir=coverageWorkDir,
                              fromGenome=outgroupsCombined,
                              sequenceFile=ingroupPath,
                              cigarFile=self.tempOutputFile,
                              outputFile=independentCoverageFile)
            # shallow=False forces a content comparison; the default
            # may accept files purely on matching os.stat signatures.
            self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile, shallow=False))
Example #2
0
def run_ingroup_coverage(job, cactusWorkflowArguments, project):
    """Compute, for every ingroup genome, a coverage file versus the outgroups.

    Ingroups are the genomes returned by
    ``experimentWrapper.getGenomesWithSequence()`` that are not listed by
    ``getOutgroupGenomes()``. Their sequence files, the outgroup fragment
    files and the alignments file are read from the job's file store.

    Side effects on ``cactusWorkflowArguments``:
      * ``totalSequenceSize`` is set to the summed on-disk size of the
        ingroup sequence files (always, even without outgroups).
      * one file-store ID per ingroup is appended to
        ``ingroupCoverageIDs`` (only when there is at least one outgroup
        fragment file).

    Args:
        job: the running job; only ``job.fileStore`` is used.
        cactusWorkflowArguments: workflow state read and updated as
            described above.
        project: not used by this function.

    Returns:
        The same ``cactusWorkflowArguments`` object, updated in place.
    """
    file_store = job.fileStore
    work_dir = file_store.getLocalTempDir()
    exp = cactusWorkflowArguments.experimentWrapper

    # Query the outgroup genomes once instead of once per candidate genome.
    outgroup_genomes = list(exp.getOutgroupGenomes())
    ingroupsAndOriginalIDs = [(genome, exp.getSequenceID(genome))
                              for genome in exp.getGenomesWithSequence()
                              if genome not in outgroup_genomes]

    outgroups = [
        file_store.readGlobalFile(fragment_id)
        for fragment_id in cactusWorkflowArguments.outgroupFragmentIDs
    ]
    sequences = [
        file_store.readGlobalFile(sequence_id)
        for _, sequence_id in ingroupsAndOriginalIDs
    ]
    cactusWorkflowArguments.totalSequenceSize = sum(
        os.stat(path).st_size for path in sequences)
    cigar = file_store.readGlobalFile(cactusWorkflowArguments.alignmentsID)

    if outgroups:
        # should we parallelize with child jobs?
        for (ingroup, _), sequence in zip(ingroupsAndOriginalIDs, sequences):
            # Name the output after the genome rather than the local
            # sequence path: os.path.join discards work_dir when a later
            # component is absolute, which would place the file outside
            # the job's temp directory.
            coverage_path = os.path.join(work_dir,
                                         '{}.coverage'.format(ingroup))
            calculateCoverage(sequence,
                              cigar,
                              coverage_path,
                              fromGenome=outgroups,
                              work_dir=work_dir)
            cactusWorkflowArguments.ingroupCoverageIDs.append(
                file_store.writeGlobalFile(coverage_path))
    return cactusWorkflowArguments
Example #3
0
    def testKeepingCoverageOnIngroups(self):
        """Check that the ingroup coverage files kept by the blast run
        match coverage computed independently from its alignments.

        Blast is run in "ingroup vs outgroups" mode on ENm001 with one
        output path per outgroup (outgroupFragmentPaths) and one per
        ingroup (ingroupCoveragePaths). For every ingroup, coverage from
        the combined outgroup fragments is then recomputed with
        calculateCoverage and must be byte-identical to the kept file.
        """
        encodeRegion = "ENm001"
        ingroups = ["human", "cow"]
        outgroups = ["macaque", "rabbit", "dog"]
        regionPath = os.path.join(self.encodePath, encodeRegion)
        # Real lists (rather than map results) because the paths are
        # handed to the blast run and then iterated again below.
        ingroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                        for name in ingroups]
        outgroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                         for name in outgroups]
        # Run blast in "ingroup vs outgroups" mode, requesting to keep
        # the bed files that show outgroup coverage on the ingroup.
        toilDir = os.path.join(self.tempDir, "tmp_toil")
        outgroupFragmentPaths = [getTempFile(rootDir=self.tempDir) for _ in outgroups]
        ingroupCoveragePaths = [getTempFile(rootDir=self.tempDir) for _ in ingroups]
        runCactusBlastIngroupsAndOutgroups(ingroups=ingroupPaths,
                                           outgroups=outgroupPaths,
                                           alignmentsFile=self.tempOutputFile,
                                           outgroupFragmentPaths=outgroupFragmentPaths,
                                           ingroupCoveragePaths=ingroupCoveragePaths,
                                           toilDir=toilDir)

        # To filter out alignments from the other ingroup and
        # self-alignments we need a fasta with all the outgroup
        # fragments in it. It does not depend on the ingroup, so it is
        # built once up front.
        outgroupsCombined = getTempFile(rootDir=self.tempDir)
        for outgroupFragmentPath in outgroupFragmentPaths:
            system("cat %s >> %s" % (outgroupFragmentPath, outgroupsCombined))

        # The kept coverage files are positional: entry i of
        # ingroupCoveragePaths belongs to entry i of ingroupPaths.
        for ingroupPath, keptCoverageFile in zip(ingroupPaths, ingroupCoveragePaths):
            # Get the coverage from the outgroups independently and
            # check that it's the same as the kept file.
            independentCoverageFile = getTempFile(rootDir=self.tempDir)
            calculateCoverage(fromGenome=outgroupsCombined,
                              sequenceFile=ingroupPath,
                              cigarFile=self.tempOutputFile,
                              outputFile=independentCoverageFile)
            # shallow=False forces a content comparison; the default
            # may accept files purely on matching os.stat signatures.
            self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile, shallow=False))
Example #4
0
    def testAddingOutgroupsImprovesResult(self):
        """Run blast on "ingroup" and "outgroup" encode regions, and ensure
        that adding extra outgroups only adds alignments if possible,
        and doesn't lose any.

        For each region the ingroups are aligned against the first
        1..N of a random, order-preserving selection of outgroups. For
        every run with additional outgroups, the ResultComparator built
        from the single-outgroup results and that run's results must
        report a sensitivity of at least 0.99. Coverage and
        "re-covered bases" figures are printed as diagnostics only.
        """
        def humanPositions(alignedPairs):
            # Reduce each entry that mentions "human" in field 0 or 2 to
            # the (name, value) pair on its human side -- presumably
            # (sequence, position); confirm against loadResults.
            return set((x[0], x[1]) if "human" in x[0] else (x[2], x[3])
                       for x in alignedPairs
                       if "human" in x[0] or "human" in x[2])

        encodeRegions = ["ENm00" + str(i) for i in range(1, 2)]
        ingroups = ["human", "macaque"]
        outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
        # subselect 4 random ordered outgroups
        outgroups = [outgroups[i] for i in sorted(random.sample(range(len(outgroups)), 4))]
        for encodeRegion in encodeRegions:
            regionPath = os.path.join(self.encodePath, encodeRegion)
            ingroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                            for name in ingroups]
            outgroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                             for name in outgroups]
            results = []
            try:
                # The upper bound follows the number of selected
                # outgroups instead of repeating it as a literal.
                for numOutgroups in range(1, len(outgroupPaths) + 1):
                    # Align w/ increasing numbers of outgroups
                    subResults = getTempFile()
                    # Registered before the run so the file is cleaned
                    # up even if the alignment itself fails.
                    results.append(subResults)
                    subOutgroupPaths = outgroupPaths[:numOutgroups]
                    print("aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths)))
                    tmpToil = os.path.join(self.tempDir, "outgroupToil")
                    runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths, alignmentsFile=subResults, toilDir=tmpToil)

                # Print diagnostics about coverage
                for i, subResults in enumerate(results):
                    for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                        ingroupCoverage = getTempFile(rootDir=self.tempDir)
                        coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
                        calculateCoverage(work_dir=coverageWorkDir, sequenceFile=ingroupPath, cigarFile=subResults, outputFile=ingroupCoverage)
                        coveredBases = popenCatch("cat %s | awk '{ total += $3 - $2 } END { print total }'" % ingroupCoverage)
                        print("covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases))

                resultsSets = [loadResults(path) for path in results]
                for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
                    # Make sure the results with additional outgroups
                    # are (very nearly) a superset of the results from
                    # the first, single-outgroup run.
                    print("Using %d addl outgroup(s):" % (i + 1))
                    comparator = ResultComparator(resultsSets[0], moreOutgroupsResults)
                    print(comparator)
                    self.assertGreaterEqual(comparator.sensitivity, 0.99)

                # Report how much of the already existing alignments to
                # human is covered again by the new alignments.
                for i in range(1, len(resultsSets)):
                    prevResults = resultsSets[i-1][0]
                    curResults = resultsSets[i][0]
                    prevResultsHumanPos = humanPositions(prevResults)
                    newAlignments = curResults.difference(prevResults)
                    recovered = humanPositions(newAlignments).intersection(prevResultsHumanPos)
                    # Diagnostic only: avoid dividing by zero when the
                    # previous run had no human alignments at all.
                    if prevResultsHumanPos:
                        fraction = len(recovered) / float(len(prevResultsHumanPos))
                    else:
                        fraction = 0.0
                    print("addl outgroup %d:" % i)
                    print("bases re-covered: %f (%d)" % (fraction, len(recovered)))
            finally:
                # These files live outside self.tempDir, so remove them
                # explicitly even when a run or an assertion fails.
                for subResult in results:
                    if os.path.exists(subResult):
                        os.remove(subResult)
Example #5
0
    def testAddingOutgroupsImprovesResult(self):
        """Run blast on "ingroup" and "outgroup" encode regions, and ensure
        that adding extra outgroups only adds alignments if possible,
        and doesn't lose any.

        The ingroups are aligned against the first 1..N of a random,
        order-preserving selection of MAX_NUM_OUTGROUPS outgroups. For
        every run with additional outgroups, the ResultComparator built
        from the single-outgroup results and that run's results must
        report a sensitivity of at least 0.99. Coverage and
        "re-covered bases" figures are printed as diagnostics only.
        """
        def humanPositions(alignedPairs):
            # Reduce each entry that mentions "human" in field 0 or 2 to
            # the (name, value) pair on its human side -- presumably
            # (sequence, position); confirm against loadResults.
            return set((x[0], x[1]) if "human" in x[0] else (x[2], x[3])
                       for x in alignedPairs
                       if "human" in x[0] or "human" in x[2])

        encodeRegion = "ENm001"
        ingroups = ["human", "macaque"]
        outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
        MAX_NUM_OUTGROUPS = 3
        # subselect a random set of outgroups in the same order
        outgroups = [outgroups[i] for i in sorted(random.sample(range(len(outgroups)), MAX_NUM_OUTGROUPS))]
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                        for name in ingroups]
        outgroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                         for name in outgroups]
        results = []
        try:
            for numOutgroups in range(1, len(outgroups) + 1):
                # Align w/ increasing numbers of outgroups
                subResults = getTempFile()
                # Registered before the run so the file is cleaned up
                # even if the alignment itself fails.
                results.append(subResults)
                subOutgroupPaths = outgroupPaths[:numOutgroups]
                print("aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths)))
                tmpToil = os.path.join(self.tempDir, "outgroupToil")
                runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths, alignmentsFile=subResults, toilDir=tmpToil)

            # Print diagnostics about coverage
            for i, subResults in enumerate(results):
                for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                    ingroupCoverage = getTempFile(rootDir=self.tempDir)
                    calculateCoverage(sequenceFile=ingroupPath, cigarFile=subResults, outputFile=ingroupCoverage)
                    coveredBases = popenCatch("cat %s | awk '{ total += $3 - $2 } END { print total }'" % ingroupCoverage)
                    print("covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases))

            resultsSets = [loadResults(path) for path in results]
            for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
                # Make sure the results with additional outgroups are
                # (very nearly) a superset of the results from the
                # first, single-outgroup run.
                print("Using %d addl outgroup(s):" % (i + 1))
                comparator = ResultComparator(resultsSets[0], moreOutgroupsResults)
                print(comparator)
                self.assertGreaterEqual(comparator.sensitivity, 0.99)

            # Report how much of the already existing alignments to
            # human is covered again by the new alignments.
            for i in range(1, len(resultsSets)):
                prevResults = resultsSets[i-1][0]
                curResults = resultsSets[i][0]
                prevResultsHumanPos = humanPositions(prevResults)
                newAlignments = curResults.difference(prevResults)
                recovered = humanPositions(newAlignments).intersection(prevResultsHumanPos)
                # Diagnostic only: avoid dividing by zero when the
                # previous run had no human alignments at all.
                if prevResultsHumanPos:
                    fraction = len(recovered) / float(len(prevResultsHumanPos))
                else:
                    fraction = 0.0
                print("addl outgroup %d:" % i)
                print("bases re-covered: %f (%d)" % (fraction, len(recovered)))
        finally:
            # These files live outside self.tempDir, so remove them
            # explicitly even when a run or an assertion fails.
            for subResult in results:
                if os.path.exists(subResult):
                    os.remove(subResult)
Example #6
0
    def testProgressiveOutgroupsVsAllOutgroups(self):
        """Tests the difference in outgroup coverage on an ingroup when
        running in "ingroups vs. outgroups" mode and "set against set"
        mode.

        Two properties are asserted: total coverage on the ingroup in
        "ingroup vs outgroups" mode is at least 95% of the "set against
        set" coverage, and the coverage contributed by the last
        outgroup alone in "ingroup vs outgroups" mode is at most 10% of
        its "set against set" counterpart.
        """
        encodeRegion = "ENm001"
        ingroup = "human"
        outgroups = ["macaque", "rabbit", "dog"]
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPath = os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa")
        outgroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                         for name in outgroups]

        def totalCoverage(cigarFile):
            # Run calculateCoverage for the ingroup on the given
            # alignments and sum column 3 minus column 2 over the output
            # lines (presumably bed end - start, i.e. covered bases).
            coverageFile = getTempFile(rootDir=self.tempDir)
            calculateCoverage(sequenceFile=ingroupPath, cigarFile=cigarFile, outputFile=coverageFile)
            return int(popenCatch("cat %s | awk '{ total +=  $3 - $2} END { print total }'" % coverageFile))

        def coverageFromLastOutgroup(alignmentsFile):
            # Keep only the alignment lines that mention the last
            # outgroup, then measure the ingroup coverage they give.
            outgroupAlignments = getTempFile(rootDir=self.tempDir)
            system("grep %s %s > %s" % (outgroups[-1], alignmentsFile, outgroupAlignments))
            return totalCoverage(outgroupAlignments)

        # Run in "set against set" mode, aligning the entire ingroup
        # vs each outgroup
        runCactusBlast([ingroupPath], alignmentsFile=self.tempOutputFile,
                       toilDir=os.path.join(self.tempDir, "setVsSetToil"),
                       chunkSize=500000, overlapSize=10000,
                       targetSequenceFiles=outgroupPaths)
        # Run in "ingroup vs outgroups" mode, aligning the ingroup vs
        # the outgroups in order, trimming away sequence that's
        # already been aligned.
        runCactusBlastIngroupsAndOutgroups([ingroupPath], outgroupPaths, alignmentsFile=self.tempOutputFile2, toilDir=os.path.join(self.tempDir, "outgroupToil"))

        # Get the coverage on the ingroup, in bases, from each run.
        coverageSetVsSet = totalCoverage(self.tempOutputFile)
        coverageIngroupVsOutgroups = totalCoverage(self.tempOutputFile2)

        print("total coverage on human (set vs set mode, %d outgroups): %d" % (len(outgroups), coverageSetVsSet))
        print("total coverage on human (ingroup vs outgroup mode, %d outgroups): %d" % (len(outgroups), coverageIngroupVsOutgroups))

        # Make sure we're getting a reasonable fraction of the
        # alignments when using the trimming strategy.
        # assertGreaterEqual reports both values on failure.
        self.assertGreaterEqual(float(coverageIngroupVsOutgroups)/coverageSetVsSet, 0.95)

        # Get the coverage on the ingroup, in bases, from just the
        # last outgroup. Obviously this should be much higher in set
        # vs set mode than in ingroup vs outgroup mode.
        coverageFromLastOutgroupSetVsSet = coverageFromLastOutgroup(self.tempOutputFile)
        coverageFromLastOutgroupInVsOut = coverageFromLastOutgroup(self.tempOutputFile2)

        print("total coverage on human from last outgroup in set (%s) (set vs set mode): %d" % (outgroups[-1], coverageFromLastOutgroupSetVsSet))
        print("total coverage on human from last outgroup in set (%s) (ingroup vs outgroup mode): %d" % (outgroups[-1], coverageFromLastOutgroupInVsOut))

        self.assertLessEqual(float(coverageFromLastOutgroupInVsOut)/coverageFromLastOutgroupSetVsSet, 0.10)
Example #7
0
    def testProgressiveOutgroupsVsAllOutgroups(self):
        """Tests the difference in outgroup coverage on an ingroup when
        running in "ingroups vs. outgroups" mode and "set against set"
        mode.

        Two properties are asserted: total coverage on the ingroup in
        "ingroup vs outgroups" mode is at least 95% of the "set against
        set" coverage, and the coverage contributed by the last
        outgroup alone in "ingroup vs outgroups" mode is at most 10% of
        its "set against set" counterpart.
        """
        encodeRegion = "ENm001"
        ingroup = "human"
        outgroups = ["macaque", "rabbit", "dog"]
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPath = os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa")
        outgroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                         for name in outgroups]

        def totalCoverage(cigarFile):
            # Run calculateCoverage for the ingroup on the given
            # alignments, in a fresh work directory per call, and sum
            # column 3 minus column 2 over the output lines (presumably
            # bed end - start, i.e. covered bases).
            coverageFile = getTempFile(rootDir=self.tempDir)
            coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
            calculateCoverage(work_dir=coverageWorkDir, sequenceFile=ingroupPath, cigarFile=cigarFile, outputFile=coverageFile)
            return int(popenCatch("cat %s | awk '{ total +=  $3 - $2} END { print total }'" % coverageFile))

        def coverageFromLastOutgroup(alignmentsFile):
            # Keep only the alignment lines that mention the last
            # outgroup, then measure the ingroup coverage they give.
            outgroupAlignments = getTempFile(rootDir=self.tempDir)
            system("grep %s %s > %s" % (outgroups[-1], alignmentsFile, outgroupAlignments))
            return totalCoverage(outgroupAlignments)

        # Run in "set against set" mode, aligning the entire ingroup
        # vs each outgroup
        runCactusBlast([ingroupPath], alignmentsFile=self.tempOutputFile,
                       toilDir=os.path.join(self.tempDir, "setVsSetToil"),
                       chunkSize=500000, overlapSize=10000,
                       targetSequenceFiles=outgroupPaths)
        # Run in "ingroup vs outgroups" mode, aligning the ingroup vs
        # the outgroups in order, trimming away sequence that's
        # already been aligned.
        runCactusBlastIngroupsAndOutgroups([ingroupPath], outgroupPaths, alignmentsFile=self.tempOutputFile2, toilDir=os.path.join(self.tempDir, "outgroupToil"))

        # Get the coverage on the ingroup, in bases, from each run.
        coverageSetVsSet = totalCoverage(self.tempOutputFile)
        coverageIngroupVsOutgroups = totalCoverage(self.tempOutputFile2)

        print("total coverage on human (set vs set mode, %d outgroups): %d" % (len(outgroups), coverageSetVsSet))
        print("total coverage on human (ingroup vs outgroup mode, %d outgroups): %d" % (len(outgroups), coverageIngroupVsOutgroups))

        # Make sure we're getting a reasonable fraction of the
        # alignments when using the trimming strategy.
        # assertGreaterEqual reports both values on failure.
        self.assertGreaterEqual(float(coverageIngroupVsOutgroups)/coverageSetVsSet, 0.95)

        # Get the coverage on the ingroup, in bases, from just the
        # last outgroup. Obviously this should be much higher in set
        # vs set mode than in ingroup vs outgroup mode.
        coverageFromLastOutgroupSetVsSet = coverageFromLastOutgroup(self.tempOutputFile)
        coverageFromLastOutgroupInVsOut = coverageFromLastOutgroup(self.tempOutputFile2)

        print("total coverage on human from last outgroup in set (%s) (set vs set mode): %d" % (outgroups[-1], coverageFromLastOutgroupSetVsSet))
        print("total coverage on human from last outgroup in set (%s) (ingroup vs outgroup mode): %d" % (outgroups[-1], coverageFromLastOutgroupInVsOut))

        self.assertLessEqual(float(coverageFromLastOutgroupInVsOut)/coverageFromLastOutgroupSetVsSet, 0.10)