Ejemplo n.º 1
0
    def testScaffoldLengthStats(self):
        """Check per-scaffold and aggregate scaffold length statistics.

        Three scaffolds are passed to
        BinStatistics.calculateScaffoldLengthStats; 'S1' is built from two
        4 bp pieces joined by DefaultValues.CONTIG_BREAK. The test checks the
        entries recorded for 'S1' and four of the six returned values (the
        fourth and fifth return values are not examined).
        """
        breakLen = len(DefaultValues.CONTIG_BREAK)
        # 'ACGT' + break + 'ACGT'
        splitScaffoldLen = breakLen + 8

        scaffolds = {
            'S1': 'ACGT' + DefaultValues.CONTIG_BREAK + 'ACGT',
            'S2': 'ACGTACGT',
            'S3': 'TTtt'
        }

        perScaffold = defaultdict(dict)
        calculator = BinStatistics(threads=1)
        (longestScaffold, longestContig, totalBps, _, _,
         contigCount) = calculator.calculateScaffoldLengthStats(
             scaffolds, perScaffold)

        # statistics recorded for the scaffold containing a contig break
        self.assertAlmostEqual(perScaffold['S1']['Length'], splitScaffoldLen)
        self.assertAlmostEqual(perScaffold['S1']['Total contig length'], 8)
        self.assertAlmostEqual(perScaffold['S1']['# contigs'], 2)

        # statistics aggregated over all three scaffolds
        self.assertAlmostEqual(longestScaffold, splitScaffoldLen)
        self.assertAlmostEqual(longestContig, 8)
        self.assertAlmostEqual(totalBps, breakLen + 8 + 8 + 4)
        self.assertAlmostEqual(contigCount, 4)
Ejemplo n.º 2
0
    def tree(self, options):
        """Run the tree command: place bins in the reference genome tree.

        Steps, in order: check the bin sequences (protein sequences when
        options.bCalledGenes is set, nucleotide sequences otherwise), create
        the 'bins' and 'storage' directories under options.output_dir, find
        marker genes with the phylogenetic HMM models, write the model
        information file, calculate bin statistics, align the identified
        marker genes and finally run pplacer.

        Returns None; returns early, before any output is produced, when the
        sequence check reports a failure.
        """
        self.logger.info(
            '[CheckM - tree] Placing bins in reference genome tree.')

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # called genes are checked as proteins, everything else as nucleotides
        if options.bCalledGenes:
            seqsValid = checkProteinSeqs(binFiles)
        else:
            seqsValid = checkNuclotideSeqs(binFiles)
        if not seqsValid:
            return

        # setup directory structure
        for subDir in ('bins', 'storage'):
            makeSurePathExists(os.path.join(options.output_dir, subDir))

        # find phylogenetically informative genes in genome bins
        geneFinder = MarkerGeneFinder(options.threads)
        binIdToModels = geneFinder.find(
            binFiles, options.output_dir,
            DefaultValues.HMMER_TABLE_PHYLO_OUT,
            DefaultValues.HMMER_PHYLO_OUT,
            DefaultValues.PHYLO_HMM_MODELS,
            options.bKeepAlignment, options.bNucORFs, options.bCalledGenes)

        # write model information to file
        modelInfoPath = os.path.join(options.output_dir, 'storage',
                                     DefaultValues.PHYLO_HMM_MODEL_INFO)
        MarkerSetParser(options.threads).writeBinModels(binIdToModels,
                                                        modelInfoPath)

        # calculate statistics for each genome bin
        BinStatistics(options.threads).calculate(
            binFiles, options.output_dir, DefaultValues.BIN_STATS_PHYLO_OUT)

        # align identified marker genes
        alignmentDir = os.path.join(options.output_dir, 'storage', 'tree')
        aligner = HmmerAligner(options.threads)
        resultsParser = aligner.makeAlignmentToPhyloMarkers(
            options.output_dir, DefaultValues.PHYLO_HMM_MODELS,
            DefaultValues.HMMER_TABLE_PHYLO_OUT, binIdToModels, False,
            DefaultValues.E_VAL, DefaultValues.LENGTH, False, alignmentDir)

        # place bins into genome tree; pplacer gets its own thread count
        # (options.pplacer_threads) rather than options.threads
        placer = PplacerRunner(threads=options.pplacer_threads)
        placer.run(binFiles, resultsParser, options.output_dir,
                   options.bReducedTree)

        self.timeKeeper.printTimeStamp()
Ejemplo n.º 3
0
 def testCodingBases(self):
     """Check the coding base counts reported for a set of genes.

     The expected values count three coding bases per character of each
     gene sequence. Genes 'S1_C1' and 'S1_C2' are expected to be reported
     together under 'S1'.
     """
     genes = {'S1_C1': 'ACGTACGT', 'S1_C2': 'ACGTACGT', 'S3_C1': 'TTtt'}

     perSeq = defaultdict(dict)
     calculator = BinStatistics(threads=1)
     # name-mangled access to the private __calculateCodingBases method
     totalCoding = calculator._BinStatistics__calculateCodingBases(genes, perSeq)

     s1Coding = len(genes['S1_C1']) * 3 + len(genes['S1_C2']) * 3
     s3Coding = len(genes['S3_C1']) * 3

     self.assertAlmostEqual(perSeq['S1']['# ORFs'], 2)
     self.assertAlmostEqual(perSeq['S1']['Coding bases'], s1Coding)

     self.assertAlmostEqual(totalCoding, s1Coding + s3Coding)
Ejemplo n.º 4
0
    def testGC(self):
        """Check per-sequence and mean GC as reported by calculateGC.

        Mixed-case input is used, and 'S4' consists only of N characters.
        The expected mean of 6/12 implies that the bases of 'S4' are not
        part of the denominator.
        """
        seqs = {'S1': 'ACgt', 'S2': 'GGgg', 'S3': 'TTtt', 'S4': 'NNNN'}
        expectedGC = (('S1', 0.5), ('S2', 1.0), ('S3', 0.0), ('S4', 0.0))

        perSeq = defaultdict(dict)
        meanGC, _ = BinStatistics(threads=1).calculateGC(seqs, perSeq)

        for seqId, gc in expectedGC:
            self.assertAlmostEqual(perSeq[seqId]['GC'], gc)

        self.assertAlmostEqual(meanGC, 6.0 / 12.0)
Ejemplo n.º 5
0
 def testGC(self):
     """Check per-sequence and mean GC as reported by calculateGC.

     Mixed-case input is used, and 'S4' consists only of N characters.
     The expected mean of 6/12 implies that the bases of 'S4' are not
     part of the denominator.
     """
     seqs = {'S1': 'ACgt', 'S2': 'GGgg', 'S3': 'TTtt', 'S4': 'NNNN'}
     expectedGC = (('S1', 0.5), ('S2', 1.0), ('S3', 0.0), ('S4', 0.0))

     perSeq = defaultdict(dict)
     meanGC, _ = BinStatistics(threads=1).calculateGC(seqs, perSeq)

     for seqId, gc in expectedGC:
         self.assertAlmostEqual(perSeq[seqId]['GC'], gc)

     self.assertAlmostEqual(meanGC, 6.0 / 12.0)
Ejemplo n.º 6
0
 def testScaffoldLengthStats(self):
     """Check per-scaffold and aggregate scaffold length statistics.

     Three scaffolds are passed to
     BinStatistics.calculateScaffoldLengthStats; 'S1' is built from two
     4 bp pieces joined by DefaultValues.CONTIG_BREAK. The test checks the
     entries recorded for 'S1' and four of the six returned values (the
     fourth and fifth return values are not examined).
     """
     breakLen = len(DefaultValues.CONTIG_BREAK)
     # 'ACGT' + break + 'ACGT'
     splitScaffoldLen = breakLen + 8

     scaffolds = {
         'S1': 'ACGT' + DefaultValues.CONTIG_BREAK + 'ACGT',
         'S2': 'ACGTACGT',
         'S3': 'TTtt'
     }

     perScaffold = defaultdict(dict)
     calculator = BinStatistics(threads=1)
     (longestScaffold, longestContig, totalBps, _, _,
      contigCount) = calculator.calculateScaffoldLengthStats(
          scaffolds, perScaffold)

     # statistics recorded for the scaffold containing a contig break
     self.assertAlmostEqual(perScaffold['S1']['Length'], splitScaffoldLen)
     self.assertAlmostEqual(perScaffold['S1']['Total contig length'], 8)
     self.assertAlmostEqual(perScaffold['S1']['# contigs'], 2)

     # statistics aggregated over all three scaffolds
     self.assertAlmostEqual(longestScaffold, splitScaffoldLen)
     self.assertAlmostEqual(longestContig, 8)
     self.assertAlmostEqual(totalBps, breakLen + 8 + 8 + 4)
     self.assertAlmostEqual(contigCount, 4)
Ejemplo n.º 7
0
    def parallelCoordPlot(self, options):
        """Run the par_plot command: one parallel coordinate plot per bin.

        Reads coverage information from options.coverage_file, gathers
        sequence statistics for every bin file found in options.bin_dir
        (looked up via options.results_dir), then plots each bin and saves
        the image under options.output_dir using options.image_type and
        options.dpi.

        Returns None.
        """
        self.logger.info(
            '[CheckM - par_plot] Creating parallel coordinate plot of GC and coverage.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)
        checkFileExists(options.coverage_file)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # read coverage stats file
        coverageStats = Coverage(threads=1).parseCoverage(
            options.coverage_file)

        # calculate sequence stats for all bins
        self.logger.info('Calculating sequence statistics for each bin.')
        statsCalculator = BinStatistics()
        seqStats = {}
        for binFile in binFiles:
            binId = binIdFromFilename(binFile)
            seqStats[binId] = statsCalculator.sequenceStats(
                options.results_dir, binFile)

        # create plot for each bin; the progress counter is 1-based
        plotter = ParallelCoordPlot(options)
        numBins = len(binFiles)
        for fileNum, binFile in enumerate(binFiles, 1):
            binId = binIdFromFilename(binFile)
            self.logger.info(
                'Plotting parallel coordinates for %s (%d of %d)' %
                (binId, fileNum, numBins))

            plotter.plot(binId, seqStats, coverageStats)

            plotPrefix = os.path.join(options.output_dir, binId)
            outputFile = (plotPrefix + '.paralel_coord_plot.' +
                          options.image_type)
            plotter.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Ejemplo n.º 8
0
    def testCodingBases(self):
        """Check the coding base counts reported for a set of genes.

        The expected values count three coding bases per character of each
        gene sequence. Genes 'S1_C1' and 'S1_C2' are expected to be reported
        together under 'S1'.
        """
        genes = {'S1_C1': 'ACGTACGT', 'S1_C2': 'ACGTACGT', 'S3_C1': 'TTtt'}

        perSeq = defaultdict(dict)
        calculator = BinStatistics(threads=1)
        # name-mangled access to the private __calculateCodingBases method
        totalCoding = calculator._BinStatistics__calculateCodingBases(
            genes, perSeq)

        s1Coding = len(genes['S1_C1']) * 3 + len(genes['S1_C2']) * 3
        s3Coding = len(genes['S3_C1']) * 3

        self.assertAlmostEqual(perSeq['S1']['# ORFs'], 2)
        self.assertAlmostEqual(perSeq['S1']['Coding bases'], s1Coding)

        self.assertAlmostEqual(totalCoding, s1Coding + s3Coding)
Ejemplo n.º 9
0
    def analyze(self, options, db=None):
        """Run the analyze command: identify marker genes in bins.

        Steps, in order: check the bin sequences (protein sequences when
        options.bCalledGenes is set, nucleotide sequences otherwise), create
        the output directory structure, find marker genes, write the HMM
        model information file, align marker genes with multiple hits within
        a bin, calculate bin statistics and, when options.bAlignTopHit is
        set, align the top hit to each marker and write
        'alignment_info.tsv' into the 'storage/alignments' folder.

        Args:
            options: parsed command options; the attributes read here are
                bin_dir, extension, bCalledGenes, output_dir, threads,
                marker_file, bKeepAlignment, bNucORFs and bAlignTopHit.
            db: not used by this method.

        Returns:
            None. Returns early, before any output is produced, when the
            sequence check reports a failure.
        """
        self.logger.info(
            '[CheckM - analyze] Identifying marker genes in bins.')

        binFiles = self.binFiles(options.bin_dir, options.extension)

        if not options.bCalledGenes:
            if not checkNuclotideSeqs(binFiles):
                return
        else:
            if not checkProteinSeqs(binFiles):
                return

        # setup directory structure
        makeSurePathExists(options.output_dir)
        makeSurePathExists(os.path.join(options.output_dir, 'bins'))
        makeSurePathExists(os.path.join(options.output_dir, 'storage'))
        makeSurePathExists(
            os.path.join(options.output_dir, 'storage', 'aai_qa'))

        # find marker genes in genome bins
        mgf = MarkerGeneFinder(options.threads)
        binIdToModels = mgf.find(binFiles, options.output_dir,
                                 DefaultValues.HMMER_TABLE_OUT,
                                 DefaultValues.HMMER_OUT, options.marker_file,
                                 options.bKeepAlignment, options.bNucORFs,
                                 options.bCalledGenes)

        markerSetParser = MarkerSetParser(options.threads)
        binIdToBinMarkerSets = markerSetParser.getMarkerSets(
            options.output_dir, getBinIdsFromOutDir(options.output_dir),
            options.marker_file)

        hmmModelInfoFile = os.path.join(options.output_dir, 'storage',
                                        DefaultValues.CHECKM_HMM_MODEL_INFO)
        markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)

        self.timeKeeper.printTimeStamp()

        # HMM model file: use the marker file directly when it is itself a
        # set of HMM models, otherwise fall back to the default models
        if markerSetParser.markerFileType(
                options.marker_file) == BinMarkerSets.HMM_MODELS_SET:
            markerFile = options.marker_file
        else:
            markerFile = DefaultValues.HMM_MODELS

        # align marker genes with multiple hits within a bin
        HA = HmmerAligner(options.threads)
        HA.makeAlignmentsOfMultipleHits(
            options.output_dir, markerFile, DefaultValues.HMMER_TABLE_OUT,
            binIdToModels, binIdToBinMarkerSets, False, DefaultValues.E_VAL,
            DefaultValues.LENGTH,
            os.path.join(options.output_dir, 'storage', 'aai_qa'))

        self.timeKeeper.printTimeStamp()

        # calculate statistics for each genome bin
        binStats = BinStatistics(options.threads)
        binStats.calculate(binFiles, options.output_dir,
                           DefaultValues.BIN_STATS_OUT)

        self.timeKeeper.printTimeStamp()

        # align top hit to each marker if requested
        if options.bAlignTopHit:
            alignmentOutputFolder = os.path.join(options.output_dir, 'storage',
                                                 'alignments')
            makeSurePathExists(alignmentOutputFolder)

            HA = HmmerAligner(options.threads)
            resultsParser = HA.makeAlignmentTopHit(
                options.output_dir, options.marker_file,
                DefaultValues.HMMER_TABLE_OUT, binIdToModels, False,
                DefaultValues.E_VAL, DefaultValues.LENGTH, True,
                alignmentOutputFolder)

            # report marker gene data; the marker models of the first bin
            # are looked up once, and an empty parser yields a header-only
            # file instead of an IndexError
            binIds = list(resultsParser.models.keys())
            markerModels = resultsParser.models[binIds[0]] if binIds else {}

            alignmentInfoFile = os.path.join(alignmentOutputFolder,
                                             'alignment_info.tsv')
            # the context manager closes the file even if a write fails
            with open(alignmentInfoFile, 'w') as fout:
                fout.write('Marker Id\tLength (bp)\n')
                for markerId in markerModels.keys():
                    fout.write('%s\t%d\n' %
                               (markerId, markerModels[markerId].leng))

            self.logger.info('Alignments to top hits stored in: ' +
                             alignmentOutputFolder)

            self.timeKeeper.printTimeStamp()