Esempio n. 1
0
    def nxPlot(self, options):
        """Nx-plot command.

        Plots every bin file found in options.bin_dir (filtered by
        options.extension) and saves one image per bin, named
        '<binId>.nx_plot.<image_type>', into options.output_dir.
        """

        self.logger.info('[CheckM - nx_plot] Creating Nx-plots.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)
        numBins = len(binFiles)

        plotter = NxPlot(options)
        for fileNum, binFile in enumerate(binFiles, start=1):
            binId = binIdFromFilename(binFile)
            self.logger.info('Plotting Nx-plot for %s (%d of %d)' %
                             (binId, fileNum, numBins))
            plotter.plot(binFile)

            # the image is named after the bin and placed in the output directory
            stem = os.path.join(options.output_dir, binId)
            outputFile = stem + '.nx_plot.' + options.image_type
            plotter.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Esempio n. 2
0
    def distributionPlots(self, options):
        """Reference distribution plot command.

        Reads the tetranucleotide signatures from options.tetra_profile and,
        for each bin file in options.bin_dir, saves an image named
        '<binId>.ref_dist_plots.<image_type>' into options.output_dir.
        """
        self.logger.info(
            '[CheckM - dist_plot] Creating GC, CD, and TD distribution plots.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)
        numBins = len(binFiles)

        signatureReader = GenomicSignatures(K=4, threads=1)
        tetraSigs = signatureReader.read(options.tetra_profile)

        plotter = DistributionPlots(options)
        for fileNum, binFile in enumerate(binFiles, start=1):
            # progress is reported using the bin file path, not the bin id
            self.logger.info(
                'Plotting reference distribution plots for %s (%d of %d)' %
                (binFile, fileNum, numBins))

            binId = binIdFromFilename(binFile)
            plotter.plot(binFile, tetraSigs, options.distributions)

            stem = os.path.join(options.output_dir, binId)
            outputFile = stem + '.ref_dist_plots.' + options.image_type
            plotter.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Esempio n. 3
0
    def modify(self, options):
        """Modify command: add/remove sequences or strip outliers from a bin.

        Either 'add'/'remove' (using options.seq_file) or 'outlier_file' must
        be requested, but not both kinds at once; otherwise an error is
        logged and the process exits with status 1. The modified bin is
        written to options.output_file.
        """

        self.logger.info('[CheckM - modify] Modifying sequences in bin.')

        # a bare file name has an empty dirname; only create the output
        # directory when the path actually names one
        outputDir = os.path.dirname(options.output_file)
        if outputDir:
            makeSurePathExists(outputDir)

        if not (options.add or options.remove or options.outlier_file):
            self.logger.error('No modification to bin requested.\n')
            sys.exit(1)

        if (options.add or options.remove) and options.outlier_file:
            self.logger.error(
                "The 'outlier_file' option cannot be specified with 'add' or 'remove'.\n"
            )
            sys.exit(1)

        binTools = BinTools()

        if options.add or options.remove:
            binTools.modify(options.bin_file, options.seq_file, options.add,
                            options.remove, options.output_file)
        elif options.outlier_file:
            binTools.removeOutliers(options.bin_file, options.outlier_file,
                                    options.output_file)

        self.logger.info('Modified bin written to: ' + options.output_file)

        self.timeKeeper.printTimeStamp()
Esempio n. 4
0
    def __createMSA(self, resultsParser, binIdToBinMarkerSets, hmmModelFile, outDir, alignOutputDir, queueIn, queueOut):
        """Create multiple sequence alignment for markers with multiple hits in a bin.

        Worker loop: bin ids are taken from queueIn until a None sentinel is
        received. For every marker with multiple hits in the bin, its HMM is
        fetched into a temporary file and the hits are aligned into
        <alignOutputDir>/<binId>. Each processed bin id is put on queueOut.
        """

        HF = HMMERRunner(mode='fetch')

        while True:
            binId = queueIn.get(block=True, timeout=None)
            if binId is None:
                # sentinel: no more bins to process
                break

            markersWithMultipleHits = self.__extractMarkersWithMultipleHits(outDir, binId, resultsParser, binIdToBinMarkerSets[binId])

            if len(markersWithMultipleHits) != 0:
                # create multiple sequence alignments for markers with multiple hits
                binAlignOutputDir = os.path.join(alignOutputDir, binId)
                makeSurePathExists(binAlignOutputDir)
                for markerId in markersWithMultipleHits:
                    tempModelFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
                    try:
                        HF.fetch(hmmModelFile, markerId, tempModelFile)

                        self.__alignMarker(markerId, markersWithMultipleHits[markerId], None, False, binAlignOutputDir, tempModelFile, bKeepUnmaskedAlign=False)
                    finally:
                        # never leave the temporary model behind, even when
                        # fetching or aligning fails
                        if os.path.exists(tempModelFile):
                            os.remove(tempModelFile)

            queueOut.put(binId)
Esempio n. 5
0
    def codingDensityPlot(self, options):
        """Coding density plot command.

        For each bin file in options.bin_dir an image named
        '<binId>.coding_density_plots.<image_type>' is saved into
        options.output_dir.
        """
        self.logger.info(
            '[CheckM - coding_plot] Creating coding density histogram and delta-CD plot.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)
        numBins = len(binFiles)

        plotter = CodingDensityPlots(options)
        for fileNum, binFile in enumerate(binFiles, start=1):
            # progress is reported using the bin file path, not the bin id
            self.logger.info(
                'Plotting coding density plots for %s (%d of %d)' %
                (binFile, fileNum, numBins))

            plotter.plot(binFile, options.distributions)

            stem = os.path.join(options.output_dir, binIdFromFilename(binFile))
            outputFile = stem + '.coding_density_plots.' + options.image_type
            plotter.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Esempio n. 6
0
    def __createMSA(self, resultsParser, binIdToBinMarkerSets, hmmModelFile,
                    outDir, alignOutputDir, queueIn, queueOut):
        """Create multiple sequence alignment for markers with multiple hits in a bin.

        Worker loop: bin ids are read from queueIn until a None sentinel
        arrives. For each marker with multiple hits in the bin, the marker's
        HMM is fetched into a temporary file and its hits are aligned into
        <alignOutputDir>/<binId>. The bin id is put on queueOut once the bin
        has been processed.
        """

        HF = HMMERRunner(mode='fetch')

        while True:
            binId = queueIn.get(block=True, timeout=None)
            if binId is None:
                # sentinel: no more bins to process
                break

            markersWithMultipleHits = self.__extractMarkersWithMultipleHits(
                outDir, binId, resultsParser, binIdToBinMarkerSets[binId])

            if len(markersWithMultipleHits) != 0:
                # create multiple sequence alignments for markers with multiple hits
                binAlignOutputDir = os.path.join(alignOutputDir, binId)
                makeSurePathExists(binAlignOutputDir)
                for markerId in markersWithMultipleHits:
                    tempModelFile = os.path.join(tempfile.gettempdir(),
                                                 str(uuid.uuid4()))
                    try:
                        HF.fetch(hmmModelFile, markerId, tempModelFile)

                        self.__alignMarker(markerId,
                                           markersWithMultipleHits[markerId],
                                           None,
                                           False,
                                           binAlignOutputDir,
                                           tempModelFile,
                                           bKeepUnmaskedAlign=False)
                    finally:
                        # remove the temporary model even if fetching or
                        # aligning raised, so failures do not litter tmp
                        if os.path.exists(tempModelFile):
                            os.remove(tempModelFile)

            queueOut.put(binId)
Esempio n. 7
0
    def lengthHistogram(self, options):
        """Sequence length histogram command.

        For each bin file in options.bin_dir an image named
        '<binId>.len_hist.<image_type>' is saved into options.output_dir.
        """

        self.logger.info(
            '[CheckM - len_hist] Creating sequence length histogram.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)
        numBins = len(binFiles)

        histogram = LengthHistogram(options)
        for fileNum, binFile in enumerate(binFiles, start=1):
            binId = binIdFromFilename(binFile)
            self.logger.info(
                'Plotting sequence length histogram for %s (%d of %d)' %
                (binId, fileNum, numBins))
            histogram.plot(binFile)

            stem = os.path.join(options.output_dir, binId)
            outputFile = stem + '.len_hist.' + options.image_type
            histogram.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Esempio n. 8
0
    def coveragePcaPlot(self, options):
        """PCA plot of coverage profiles.

        Parses options.coverage_file, computes a PCA over the coverage
        profiles of all sequences, and for each bin file in options.bin_dir
        saves an image named '<binId>.cov_pca_plots.<image_type>' into
        options.output_dir. Exits with status 1 when the coverage file yields
        no profiles or profiles with fewer than two values per sequence.
        """
        self.logger.info(
            '[CheckM - cov_pca] Creating PCA plot of coverage profiles.')

        checkDirExists(options.bin_dir)
        checkFileExists(options.coverage_file)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        coverageParser = Coverage(threads=1)
        coverageStats = coverageParser.parseCoverage(options.coverage_file)

        # one row per sequence; the row holds that sequence's coverage values
        seqIds = []
        coverageProfiles = []
        for seqDict in coverageStats.values():
            for seqId, bamDict in seqDict.items():
                seqIds.append(seqId)
                coverageProfiles.append(list(bamDict.values()))

        coverageProfiles = np.array(coverageProfiles)
        if coverageProfiles.ndim < 2:
            # without any rows the array is 1-D and shape[1] would raise
            # an IndexError instead of a readable message
            self.logger.error('No coverage profiles found in: ' +
                              options.coverage_file)
            sys.exit(1)

        if coverageProfiles.shape[1] < 2:
            self.logger.error(
                'Coverage profile is 1 dimensional. PCA requires at least 2 dimensions.'
            )
            sys.exit(1)

        self.logger.info('Computing PCA of coverage profiles.\n')
        pca = PCA()
        pc, variance = pca.pcaMatrix(coverageProfiles,
                                     fraction=1.0,
                                     bCenter=True,
                                     bScale=False)

        plots = PcaPlot(options)
        numBins = len(binFiles)
        for fileNum, f in enumerate(binFiles, start=1):
            self.logger.info(
                'Plotting PCA of coverage profiles for %s (%d of %d)' %
                (f, fileNum, numBins))

            plots.plot(f, seqIds, pc, variance)

            binId = binIdFromFilename(f)
            outputFile = os.path.join(
                options.output_dir,
                binId) + '.cov_pca_plots.' + options.image_type
            plots.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Esempio n. 9
0
    def tree(self, options):
        """Tree command: place bins in the reference genome tree.

        Returns early, without producing output, when the bin files fail the
        sequence-type check.
        """
        self.logger.info(
            '[CheckM - tree] Placing bins in reference genome tree.')

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # called genes are validated with the protein check, anything else
        # with the nucleotide check
        seqCheck = checkProteinSeqs if options.bCalledGenes else checkNuclotideSeqs
        if not seqCheck(binFiles):
            return

        # setup directory structure
        storageDir = os.path.join(options.output_dir, 'storage')
        makeSurePathExists(os.path.join(options.output_dir, 'bins'))
        makeSurePathExists(storageDir)

        # find phylogenetically informative genes in genome bins
        geneFinder = MarkerGeneFinder(options.threads)
        binIdToModels = geneFinder.find(
            binFiles,
            options.output_dir,
            DefaultValues.HMMER_TABLE_PHYLO_OUT,
            DefaultValues.HMMER_PHYLO_OUT,
            DefaultValues.PHYLO_HMM_MODELS,
            options.bKeepAlignment,
            options.bNucORFs,
            options.bCalledGenes)

        # write model information to file
        modelInfoFile = os.path.join(storageDir,
                                     DefaultValues.PHYLO_HMM_MODEL_INFO)
        MarkerSetParser(options.threads).writeBinModels(binIdToModels,
                                                        modelInfoFile)

        # calculate statistics for each genome bin
        BinStatistics(options.threads).calculate(
            binFiles, options.output_dir, DefaultValues.BIN_STATS_PHYLO_OUT)

        # align identified marker genes
        aligner = HmmerAligner(options.threads)
        treeAlignDir = os.path.join(storageDir, 'tree')
        resultsParser = aligner.makeAlignmentToPhyloMarkers(
            options.output_dir,
            DefaultValues.PHYLO_HMM_MODELS,
            DefaultValues.HMMER_TABLE_PHYLO_OUT,
            binIdToModels,
            False,
            DefaultValues.E_VAL,
            DefaultValues.LENGTH,
            False,
            treeAlignDir)

        # place bins into genome tree; pplacer gets its own thread count
        # (options.pplacer_threads) rather than options.threads
        placer = PplacerRunner(threads=options.pplacer_threads)
        placer.run(binFiles, resultsParser, options.output_dir,
                   options.bReducedTree)

        self.timeKeeper.printTimeStamp()
Esempio n. 10
0
    def makeAlignmentsOfMultipleHits(self,
                                       outDir,
                                       markerFile,
                                       hmmTableFile,
                                       binIdToModels,
                                       binIdToBinMarkerSets,
                                       bIgnoreThresholds,
                                       evalueThreshold,
                                       lengthThreshold,
                                       alignOutputDir,
                                       ):
        """Align markers with multiple hits within a bin.

        Bins are processed in parallel by self.totalThreads worker processes
        plus one progress-reporting process. If anything goes wrong while the
        processes are running, all live processes are terminated and the
        original exception is re-raised.
        """

        makeSurePathExists(alignOutputDir)

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds, evalueThreshold, lengthThreshold)

        # align any markers with multiple hits in a bin
        self.logger.info('  Aligning marker genes with multiple hits in a single bin:')

        # process each bin in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for binId in binIdToModels:
            workerQueue.put(binId)

        # one None sentinel per worker so that every worker terminates
        for _ in range(self.totalThreads):
            workerQueue.put(None)

        # create the processes before entering the try block so the cleanup
        # handler can always refer to them
        calcProc = [mp.Process(target=self.__createMSA, args=(resultsParser, binIdToBinMarkerSets, markerFile, outDir, alignOutputDir, workerQueue, writerQueue)) for _ in range(self.totalThreads)]
        writeProc = mp.Process(target=self.__reportBinProgress, args=(len(binIdToModels), writerQueue))

        try:
            writeProc.start()

            for p in calcProc:
                p.start()

            for p in calcProc:
                p.join()

            # tell the progress reporter that all workers are done
            writerQueue.put(None)
            writeProc.join()
        except BaseException:
            # make sure all processes are terminated; processes that never
            # started (or already finished) are skipped
            for p in calcProc + [writeProc]:
                if p.is_alive():
                    p.terminate()

            # do not hide the failure (or a Ctrl-C) from the caller
            raise
Esempio n. 11
0
    def makeAlignmentsOfMultipleHits(self,
                                       outDir,
                                       markerFile,
                                       hmmTableFile,
                                       binIdToModels,
                                       binIdToBinMarkerSets,
                                       bIgnoreThresholds,
                                       evalueThreshold,
                                       lengthThreshold,
                                       alignOutputDir,
                                       ):
        """Align markers with multiple hits within a bin.

        Work is spread over self.totalThreads worker processes, with one
        extra process reporting progress. On any failure while the processes
        run, every live process is terminated and the exception is re-raised.
        """

        makeSurePathExists(alignOutputDir)

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds, evalueThreshold, lengthThreshold)

        # align any markers with multiple hits in a bin
        self.logger.info('  Aligning marker genes with multiple hits in a single bin:')

        # process each bin in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for binId in binIdToModels:
            workerQueue.put(binId)

        # one None sentinel per worker so that every worker terminates
        for _ in range(self.totalThreads):
            workerQueue.put(None)

        # build the processes outside the try block: the cleanup handler
        # below must be able to reference them whatever fails
        calcProc = [mp.Process(target=self.__createMSA, args=(resultsParser, binIdToBinMarkerSets, markerFile, outDir, alignOutputDir, workerQueue, writerQueue)) for _ in range(self.totalThreads)]
        writeProc = mp.Process(target=self.__reportBinProgress, args=(len(binIdToModels), writerQueue))

        try:
            writeProc.start()

            for p in calcProc:
                p.start()

            for p in calcProc:
                p.join()

            # signal the progress reporter that no more bins will arrive
            writerQueue.put(None)
            writeProc.join()
        except BaseException:
            # make sure all processes are terminated; only live processes
            # can be terminated safely
            for p in calcProc + [writeProc]:
                if p.is_alive():
                    p.terminate()

            # propagate the failure instead of silently continuing
            raise
Esempio n. 12
0
    def merge(self, options):
        """Merge command.

        Identifies bins with complementary sets of marker genes. Returns
        early when the bin files fail the sequence-type check or when the
        marker file is a tree marker set.
        """

        self.logger.info(
            '[CheckM - merge] Identifying bins with complementary sets of marker genes.'
        )

        checkDirExists(options.bin_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # called genes are validated with the protein check, anything else
        # with the nucleotide check
        seqCheck = checkProteinSeqs if options.bCalledGenes else checkNuclotideSeqs
        if not seqCheck(binFiles):
            return

        markerSetParser = MarkerSetParser()
        markerFileType = markerSetParser.markerFileType(options.marker_file)
        if markerFileType == BinMarkerSets.TREE_MARKER_SET:
            self.logger.error(
                'Merge command requires a taxonomic-specific marker set or a user-defined HMM file.\n'
            )
            return

        # setup directory structure
        storageDir = os.path.join(options.output_dir, 'storage')
        makeSurePathExists(options.output_dir)
        makeSurePathExists(os.path.join(options.output_dir, 'bins'))
        makeSurePathExists(storageDir)
        makeSurePathExists(os.path.join(storageDir, 'hmms'))

        binIds = [binIdFromFilename(binFile) for binFile in binFiles]

        # find marker genes in genome bins
        hmmTableFile = "merger.table.txt"
        geneFinder = MarkerGeneFinder(options.threads)
        binIdToModels = geneFinder.find(
            binFiles,
            options.output_dir,
            hmmTableFile,
            "merger.hmmer3",
            options.marker_file,
            False,
            False,
            options.bCalledGenes)

        # get HMM file for each bin
        markerSetParser = MarkerSetParser()
        binIdToBinMarkerSets = markerSetParser.getMarkerSets(
            options.output_dir, binIds, options.marker_file)

        # compare markers found in each bin
        outputFile = Merger().run(
            binFiles,
            options.output_dir,
            hmmTableFile,
            binIdToModels,
            binIdToBinMarkerSets,
            options.delta_comp,
            options.delta_cont,
            options.merged_comp,
            options.merged_cont)

        self.logger.info('Merger information written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Esempio n. 13
0
    def binUnion(self, options):
        """Bin union command.

        options.bin_or_checkm_qa_table is read as an alternating sequence:
        bin folder, QA table, bin folder, QA table, ... At least two bin
        folders are required, each paired with a QA table; otherwise an error
        is logged and the process exits with status 1. Results are written to
        'union.txt' and 'contigConflicts.csv' inside options.output_dir.
        """

        self.logger.info(
            '[CheckM - bin_union] Redundancy reduce multiple sets of bins into a single set.'
        )

        output_dir = options.output_dir
        makeSurePathExists(output_dir)

        # even positions hold bin folders, odd positions hold QA tables
        bin_dirs = []
        checkmQaTsvs = []
        for position, arg in enumerate(options.bin_or_checkm_qa_table):
            if position % 2:
                checkFileExists(arg)
                checkmQaTsvs.append(arg)
            else:
                checkDirExists(arg)
                bin_dirs.append(arg)

        numBinDirs = len(bin_dirs)
        numQaTables = len(checkmQaTsvs)

        if numBinDirs < 2:
            self.logger.error(
                "Need to specify at least two bin folders, found %i: " %
                numBinDirs)
            sys.exit(1)

        if numBinDirs != numQaTables:
            self.logger.error(
                "Need to specify the same number of bin folders as checkm_qa_tsv files, found %i and %i, respectively: "
                % (numBinDirs, numQaTables))
            sys.exit(1)

        binFileSets = []
        for bin_dir in bin_dirs:
            self.logger.info(
                "Reading fasta files with extension %s from bin folder %s" %
                (options.extension, bin_dir))
            binFileSets.append(self.binFiles(bin_dir, options.extension))

        unionTool = BinUnion()

        unionBinOutputFile = os.path.join(output_dir, 'union.txt')
        contigConflictsOutputFile = os.path.join(output_dir,
                                                 'contigConflicts.csv')
        unionTool.report(bin_dirs, binFileSets, checkmQaTsvs,
                         unionBinOutputFile, contigConflictsOutputFile,
                         options.min_completeness, options.max_contamination)
Esempio n. 14
0
    def tetraSignatures(self, options):
        """Tetranucleotide signature command.

        Calculates signatures for the sequences in options.seq_file and
        writes them to options.output_file.
        """

        self.logger.info(
            '[CheckM - tetra] Calculating tetranucleotide signature of sequences.'
        )

        checkFileExists(options.seq_file)

        # a bare file name has an empty dirname; only create the output
        # directory when the path actually names one
        outputDir = os.path.dirname(options.output_file)
        if outputDir:
            makeSurePathExists(outputDir)

        tetraSig = GenomicSignatures(4, options.threads)
        tetraSig.calculate(options.seq_file, options.output_file)

        self.logger.info('Tetranucleotide signatures written to: ' +
                         options.output_file)

        self.timeKeeper.printTimeStamp()
Esempio n. 15
0
    def ssuFinder(self, options):
        """SSU finder command.

        Runs the SSU finder on options.seq_file and the bin files found in
        options.bin_dir, with results going to options.output_dir.
        """

        self.logger.info(
            '[CheckM - ssu_finder] Identifying SSU (16S/18S) rRNAs in sequences.'
        )

        binFiles = self.binFiles(options.bin_dir, options.extension)

        checkFileExists(options.seq_file)
        makeSurePathExists(options.output_dir)

        finder = SSU_Finder(options.threads)
        finder.run(
            options.seq_file,
            binFiles,
            options.output_dir,
            options.evalue,
            options.concatenate,
        )

        self.timeKeeper.printTimeStamp()
Esempio n. 16
0
    def __processBin(self, outDir, tableOut, hmmerOut, markerFile,
                     bKeepAlignment, bNucORFs, bCalledGenes, queueIn,
                     queueOut):
        """Thread safe bin processing.

        Worker loop: bin files are read from queueIn until a None sentinel
        arrives. For each bin, genes are called with Prodigal (unless the bin
        already contains called genes), an HMM model file is created for the
        bin and searched with HMMER. A (binId, hmmModelFile) tuple is put on
        queueOut for every processed bin.
        """

        markerSetParser = MarkerSetParser(self.threadsPerSearch)

        while True:
            binFile = queueIn.get(block=True, timeout=None)
            if binFile is None:
                # sentinel: no more bins to process
                break

            binId = binIdFromFilename(binFile)
            binDir = os.path.join(outDir, 'bins', binId)
            makeSurePathExists(binDir)

            # run Prodigal
            if not bCalledGenes:
                prodigal = ProdigalRunner(binDir)
                if not prodigal.areORFsCalled(bNucORFs):
                    prodigal.run(binFile, bNucORFs)
                aaGeneFile = prodigal.aaGeneFile
            else:
                # the bin file already holds called genes; copy it to where
                # the Prodigal output would otherwise be found
                aaGeneFile = binFile
                shutil.copyfile(
                    aaGeneFile, os.path.join(binDir,
                                             DefaultValues.PRODIGAL_AA))

            # extract HMMs into temporary file
            hmmModelFile = markerSetParser.createHmmModelFile(
                binId, markerFile)

            # run HMMER
            hmmer = HMMERRunner()
            tableOutPath = os.path.join(binDir, tableOut)
            hmmerOutPath = os.path.join(binDir, hmmerOut)

            # suppress alignment output unless alignments were requested
            keepAlignStr = '' if bKeepAlignment else '--noali'
            hmmer.search(
                hmmModelFile, aaGeneFile, tableOutPath, hmmerOutPath,
                '--cpu ' + str(self.threadsPerSearch) +
                ' --notextw -E 0.1 --domE 0.1 ' + keepAlignStr, bKeepAlignment)

            queueOut.put((binId, hmmModelFile))
Esempio n. 17
0
    def parallelCoordPlot(self, options):
        """Parallel coordinate plot command.

        Sequence statistics are gathered for all bins first; then, for each
        bin file in options.bin_dir, an image named
        '<binId>.paralel_coord_plot.<image_type>' is saved into
        options.output_dir.
        """

        self.logger.info(
            '[CheckM - par_plot] Creating parallel coordinate plot of GC and coverage.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)
        checkFileExists(options.coverage_file)

        binFiles = self.binFiles(options.bin_dir, options.extension)
        numBins = len(binFiles)

        # read coverage stats file
        coverageParser = Coverage(threads=1)
        coverageStats = coverageParser.parseCoverage(options.coverage_file)

        # calculate sequence stats for all bins
        self.logger.info('Calculating sequence statistics for each bin.')
        binStats = BinStatistics()
        seqStats = {
            binIdFromFilename(binFile):
            binStats.sequenceStats(options.results_dir, binFile)
            for binFile in binFiles
        }

        # create plot for each bin
        plotter = ParallelCoordPlot(options)
        for fileNum, binFile in enumerate(binFiles, start=1):
            binId = binIdFromFilename(binFile)
            self.logger.info(
                'Plotting parallel coordinates for %s (%d of %d)' %
                (binId, fileNum, numBins))

            plotter.plot(binId, seqStats, coverageStats)

            stem = os.path.join(options.output_dir, binId)
            outputFile = stem + '.paralel_coord_plot.' + options.image_type
            plotter.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Esempio n. 18
0
    def taxonSet(self, options, db=None):
        """Taxon set command.

        Generates a marker set for options.rank / options.taxon and writes
        it to options.marker_file. The 'db' argument is not used here.
        """
        self.logger.info(
            '[CheckM - taxon_set] Generate taxonomic-specific marker set.')

        # only create a directory when the marker file path names one
        markerDir = os.path.dirname(options.marker_file)
        if markerDir:
            makeSurePathExists(markerDir)

        bValidSet = TaxonParser().markerSet(options.rank, options.taxon,
                                            options.marker_file)

        if bValidSet:
            self.logger.info('Marker set written to: ' + options.marker_file)

        self.timeKeeper.printTimeStamp()
Esempio n. 19
0
    def makeAlignmentToPhyloMarkers(self,
                                    outDir,
                                    hmmModelFile,
                                    hmmTableFile,
                                    binIdToModels,
                                    bIgnoreThresholds,
                                    evalueThreshold,
                                    lengthThreshold,
                                    bReportHitStats,
                                    alignOutputDir,
                                    bKeepUnmaskedAlign=False):
        """Align hits to a set of common marker genes.

        Alignments are written to alignOutputDir. The temporary per-marker
        HMM files created for the alignment are removed before returning,
        including when the alignment step raises.

        Returns the ResultsParser holding the parsed HMM hits for each bin.
        """

        self.logger.info("Extracting marker genes to align.")

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile, False,
                                   bIgnoreThresholds, evalueThreshold,
                                   lengthThreshold)

        # extract the ORFs to align
        markerSeqs, markerStats = self.__extractMarkerSeqsUnique(
            outDir, resultsParser)

        # generate individual HMMs required to create multiple sequence
        # alignments; the models of the first bin are used (presumably all
        # bins share the same marker models -- confirm against callers)
        binId = list(binIdToModels.keys())[0]
        hmmModelFiles = {}
        try:
            self.__makeAlignmentModels(hmmModelFile, binIdToModels[binId],
                                       hmmModelFiles)

            # align each of the marker genes
            makeSurePathExists(alignOutputDir)
            self.__alignMarkerGenes(markerSeqs, markerStats, bReportHitStats,
                                    hmmModelFiles, alignOutputDir,
                                    bKeepUnmaskedAlign)
        finally:
            # remove the temporary HMM files, even if alignment failed
            for tempModelFile in hmmModelFiles.values():
                if os.path.exists(tempModelFile):
                    os.remove(tempModelFile)

        return resultsParser
Esempio n. 20
0
    def __processBin(self, outDir, queueIn, queueOut):
        """Thread safe bin processing.

        Worker loop: bin files are read from queueIn until a None sentinel
        arrives. For each bin, GC, sequence-length and coding-density
        statistics are collected into a dict, and a (binId, binStats) tuple
        is put on queueOut.
        """
        while True:
            binFile = queueIn.get(block=True, timeout=None)
            if binFile is None:
                # sentinel: no more bins to process
                break

            binStats = {}

            binId = binIdFromFilename(binFile)
            binDir = os.path.join(outDir, 'bins', binId)
            makeSurePathExists(binDir)

            # read scaffolds
            scaffolds = readFasta(binFile)

            # calculate GC statistics
            GC, stdGC = self.calculateGC(scaffolds)
            binStats['GC'] = GC
            binStats['GC std'] = stdGC

            # calculate statistics related to contigs and scaffolds
            (maxScaffoldLen, maxContigLen, genomeSize, scaffold_N50,
             contig_N50, scaffoldAvgLen, contigAvgLen, numContigs,
             numAmbiguousBases) = self.calculateSeqStats(scaffolds)
            binStats['Genome size'] = genomeSize
            binStats['# ambiguous bases'] = numAmbiguousBases
            binStats['# scaffolds'] = len(scaffolds)
            binStats['# contigs'] = numContigs
            binStats['Longest scaffold'] = maxScaffoldLen
            binStats['Longest contig'] = maxContigLen
            binStats['N50 (scaffolds)'] = scaffold_N50
            binStats['N50 (contigs)'] = contig_N50
            binStats['Mean scaffold length'] = scaffoldAvgLen
            binStats['Mean contig length'] = contigAvgLen

            # calculate coding density statistics
            codingDensity, translationTable, numORFs = self.calculateCodingDensity(
                binDir, scaffolds, genomeSize)
            binStats['Coding density'] = codingDensity
            binStats['Translation table'] = translationTable
            binStats['# predicted genes'] = numORFs

            queueOut.put((binId, binStats))
Esempio n. 21
0
    def __processBin(self, outDir, queueIn, queueOut):
        """Thread safe bin processing.

        Worker loop: bin files are read from queueIn until a None sentinel
        arrives. For each bin, bin-level statistics are collected into
        binStats while scaffoldStats (one dict per sequence id) is handed to
        the calculate* helpers. A (binId, binStats, scaffoldStats) tuple is
        put on queueOut.
        """
        while True:
            binFile = queueIn.get(block=True, timeout=None)
            if binFile is None:
                # sentinel: no more bins to process
                break

            binStats = {}

            binId = binIdFromFilename(binFile)
            binDir = os.path.join(outDir, 'bins', binId)
            makeSurePathExists(binDir)

            # read scaffolds and start an empty stats record for each one
            scaffolds = readFasta(binFile)
            scaffoldStats = {seqId: {} for seqId in scaffolds}

            # calculate GC statistics
            GC, stdGC = self.calculateGC(scaffolds, scaffoldStats)
            binStats['GC'] = GC
            binStats['GC std'] = stdGC

            # calculate statistics related to scaffold lengths
            (maxScaffoldLen, maxContigLen, genomeSize, scaffold_N50,
             contig_N50, numContigs,
             numAmbiguousBases) = self.calculateSeqStats(scaffolds, scaffoldStats)
            binStats['Genome size'] = genomeSize
            binStats['# ambiguous bases'] = numAmbiguousBases
            binStats['# scaffolds'] = len(scaffolds)
            binStats['# contigs'] = numContigs
            binStats['Longest scaffold'] = maxScaffoldLen
            binStats['Longest contig'] = maxContigLen
            binStats['N50 (scaffolds)'] = scaffold_N50
            binStats['N50 (contigs)'] = contig_N50

            # calculate coding density statistics
            codingDensity, translationTable, numORFs = self.calculateCodingDensity(binDir, genomeSize, scaffoldStats)
            binStats['Coding density'] = codingDensity
            binStats['Translation table'] = translationTable
            binStats['# predicted genes'] = numORFs

            queueOut.put((binId, binStats, scaffoldStats))
Esempio n. 22
0
    def coverage(self, options):
        """Coverage command.

        Calculates coverage for the bin files in options.bin_dir from
        options.bam_files and writes the result to options.output_file.
        """

        self.logger.info(
            '[CheckM - coverage] Calculating coverage of sequences.')

        checkDirExists(options.bin_dir)

        # a bare file name has an empty dirname; only create the output
        # directory when the path actually names one
        outputDir = os.path.dirname(options.output_file)
        if outputDir:
            makeSurePathExists(outputDir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        coverage = Coverage(options.threads)
        coverage.run(binFiles, options.bam_files, options.output_file,
                     options.all_reads, options.min_align,
                     options.max_edit_dist, options.min_qc)

        self.logger.info('Coverage information written to: ' +
                         options.output_file)

        self.timeKeeper.printTimeStamp()
Esempio n. 23
0
    def outliers(self, options):
        """Outlier command.

        Validates the bin directory and tetranucleotide profile, then asks
        BinTools.identifyOutliers to examine the bins; the report is
        announced as written to options.output_file.
        """

        self.logger.info('[CheckM - outlier] Identifying outliers in bins.')

        # validate inputs before any output location is prepared
        checkDirExists(options.bin_dir)
        checkFileExists(options.tetra_profile)

        outputDir = os.path.dirname(options.output_file)
        makeSurePathExists(outputDir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        outlierFinder = BinTools()
        outlierFinder.identifyOutliers(options.results_dir,
                                       binFiles,
                                       options.tetra_profile,
                                       options.distributions,
                                       options.report_type,
                                       options.output_file)

        writtenMsg = 'Outlier information written to: ' + options.output_file
        self.logger.info(writtenMsg)

        self.timeKeeper.printTimeStamp()
Esempio n. 24
0
    def markerPlot(self, options):
        """Marker gene position plot command.

        For every bin file, plots marker gene positions using the marker
        gene statistics and extended bin statistics parsed from
        options.results_dir. Bins absent from either set of statistics are
        skipped; a plot file is only saved when the plotter reports that
        something was drawn.
        """

        self.logger.info(
            '[CheckM - marker_plot] Creating marker gene position plot.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # previously computed results needed by the plotter
        parser = ResultsParser(None)
        markerGeneStats = parser.parseMarkerGeneStats(options.results_dir)
        binStats = parser.parseBinStatsExt(options.results_dir)

        posPlot = MarkerGenePosPlot(options)
        for fileNum, binFile in enumerate(binFiles, 1):
            binId = binIdFromFilename(binFile)
            self.logger.info(
                'Plotting marker gene position plot for %s (%d of %d)' %
                (binId, fileNum, len(binFiles)))

            # bin has no marker genes unless it appears in both result sets
            if not (binId in markerGeneStats and binId in binStats):
                continue

            if not posPlot.plot(binFile, markerGeneStats[binId], binStats[binId]):
                self.logger.info('No marker genes found in bin.')
                continue

            outputFile = (os.path.join(options.output_dir, binId)
                          + '.marker_pos_plot.' + options.image_type)
            posPlot.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Esempio n. 25
0
    def __processBin(self, outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs, bCalledGenes, queueIn, queueOut):
        """Thread safe bin processing.

        Worker loop: repeatedly takes a bin file from `queueIn` until a
        `None` sentinel is received. For each bin it obtains an amino acid
        gene file (by running Prodigal, or by treating the bin file itself as
        called genes when `bCalledGenes` is set), builds an HMM model file for
        the bin and runs a HMMER search against it. A `(binId, hmmModelFile)`
        tuple is put on `queueOut` for every processed bin.

        Parameters:
            outDir: root output directory; per-bin results go to <outDir>/bins/<binId>.
            tableOut: file name of the HMMER table output within the bin directory.
            hmmerOut: file name of the HMMER full output within the bin directory.
            markerFile: marker file passed to MarkerSetParser.createHmmModelFile.
            bKeepAlignment: when False, '--noali' is added to the HMMER options.
            bNucORFs: forwarded to ProdigalRunner.areORFsCalled / run.
            bCalledGenes: when True, Prodigal is skipped and the bin file is
                copied to the bin directory as the amino acid gene file.
            queueIn: queue supplying bin file paths, terminated by None.
            queueOut: queue receiving (binId, hmmModelFile) tuples.
        """

        markerSetParser = MarkerSetParser(self.threadsPerSearch)

        while True:
            binFile = queueIn.get(block=True, timeout=None)
            if binFile is None:
                # sentinel value: no more bins to process
                break

            binId = binIdFromFilename(binFile)
            binDir = os.path.join(outDir, 'bins', binId)
            makeSurePathExists(binDir)

            # run Prodigal
            if not bCalledGenes:
                prodigal = ProdigalRunner(binDir)
                # only call genes when no previous result is available
                if not prodigal.areORFsCalled(bNucORFs):
                    prodigal.run(binFile, bNucORFs)
                aaGeneFile = prodigal.aaGeneFile
            else:
                # input already consists of called genes; store a copy under
                # the file name used for Prodigal amino acid output
                aaGeneFile = binFile
                shutil.copyfile(aaGeneFile, os.path.join(binDir, DefaultValues.PRODIGAL_AA))

            # extract HMMs into temporary file
            hmmModelFile = markerSetParser.createHmmModelFile(binId, markerFile)

            # run HMMER
            hmmer = HMMERRunner()
            tableOutPath = os.path.join(binDir, tableOut)
            hmmerOutPath = os.path.join(binDir, hmmerOut)

            keepAlignStr = '' if bKeepAlignment else '--noali'
            hmmer.search(hmmModelFile, aaGeneFile, tableOutPath, hmmerOutPath,
                         '--cpu ' + str(self.threadsPerSearch) + ' --notextw -E 0.1 --domE 0.1 ' + keepAlignStr,
                         bKeepAlignment)

            queueOut.put((binId, hmmModelFile))
Esempio n. 26
0
    def makeAlignmentToPhyloMarkers(self,
                                       outDir,
                                       hmmModelFile,
                                       hmmTableFile,
                                       binIdToModels,
                                       bIgnoreThresholds,
                                       evalueThreshold,
                                       lengthThreshold,
                                       bReportHitStats,
                                       alignOutputDir,
                                       bKeepUnmaskedAlign=False
                                       ):
        """Align hits to a set of common marker genes.

        Parses the HMM hits of every bin, extracts the marker sequences to
        align, builds one temporary HMM file per marker model and aligns the
        marker genes into `alignOutputDir`. The temporary HMM files are
        removed afterwards, including when model creation or alignment fails.

        The models of the first bin in `binIdToModels` are used to build the
        alignment models, so `binIdToModels` must not be empty (IndexError
        otherwise).

        Returns:
            The ResultsParser holding the parsed hits.
        """

        self.logger.info("  Extracting marker genes to align.")

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds, evalueThreshold, lengthThreshold)

        # extract the ORFs to align
        markerSeqs, markerStats = self.__extractMarkerSeqsUnique(outDir, resultsParser)

        # generate individual HMMs required to create multiple sequence alignments;
        # list(...) keeps the lookup valid where dict.keys() returns a view
        binId = list(binIdToModels.keys())[0]
        hmmModelFiles = {}
        try:
            self.__makeAlignmentModels(hmmModelFile, binIdToModels[binId], hmmModelFiles)

            # align each of the marker genes
            makeSurePathExists(alignOutputDir)
            self.__alignMarkerGenes(markerSeqs, markerStats, bReportHitStats, hmmModelFiles, alignOutputDir, bKeepUnmaskedAlign)
        finally:
            # remove the temporary HMM files even if alignment failed
            for fileName in hmmModelFiles:
                os.remove(hmmModelFiles[fileName])

        return resultsParser
Esempio n. 27
0
    def binQAPlot(self, options):
        """Bin QA plot command.

        Builds a single bar plot of bin quality from the extended bin
        statistics in options.results_dir. Unless options.bIgnoreHetero is
        set, amino acid identity results are computed first and passed to the
        plotter. The figure is saved only when the plotter reports success.
        """

        self.logger.info(
            '[CheckM - bin_qa_plot] Creating bar plot of bin quality.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # read sequence stats file
        parser = ResultsParser(None)
        binStatsExt = parser.parseBinStatsExt(options.results_dir)

        qaPlot = BinQAPlot(options)

        # heterogeneity information is only gathered when it will be shown
        aaiHetero = None
        if not options.bIgnoreHetero:
            aai = AminoAcidIdentity()
            aai.run(options.aai_strain, options.results_dir, None)
            aaiHetero = aai.aaiHetero

        bMakePlot = qaPlot.plot(binFiles, binStatsExt, options.bIgnoreHetero,
                                aaiHetero)

        if bMakePlot:
            outputFile = os.path.join(options.output_dir,
                                      'bin_qa_plot.' + options.image_type)
            qaPlot.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Esempio n. 28
0
    def tetraPcaPlot(self, options):
        """PCA plot of tetranucleotide signatures.

        Runs a PCA over the tetranucleotide profile file once, then produces
        and saves one plot per bin file in options.output_dir.
        """
        self.logger.info(
            '[CheckM - tetra_pca] Creating PCA plot of tetranucleotide signatures.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # the PCA is shared by all bins, so compute it a single time
        self.logger.info('Computing PCA of tetranuclotide signatures.\n')
        seqIds, pc, variance = PCA().pcaFile(options.tetra_profile,
                                             fraction=1.0,
                                             bCenter=True,
                                             bScale=False)

        pcaPlot = PcaPlot(options)
        for fileNum, binFile in enumerate(binFiles, 1):
            self.logger.info(
                'Plotting PCA of tetranuclotide signatures for %s (%d of %d)' %
                (binFile, fileNum, len(binFiles)))

            pcaPlot.plot(binFile, seqIds, pc, variance)

            binId = binIdFromFilename(binFile)
            outputFile = (os.path.join(options.output_dir, binId)
                          + '.tetra_pca_plots.' + options.image_type)
            pcaPlot.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Esempio n. 29
0
    def gcBiasPlot(self, options):
        """GC bias plot command.

        Obtains a coverage profile for the bins from CoverageWindows using
        options.bam_file, then creates and saves one GC bias plot per bin
        file in options.output_dir.
        """

        self.logger.info(
            '[CheckM - gc_bias_plot] Plotting bin coverage as a function of GC.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # coverage profile is computed once and reused for every bin
        windowCoverage = CoverageWindows(options.threads)
        coverageProfile = windowCoverage.run(binFiles,
                                             options.bam_file,
                                             options.all_reads,
                                             options.min_align,
                                             options.max_edit_dist,
                                             options.window_size)

        gcPlot = GcBiasPlot(options)
        for fileNum, binFile in enumerate(binFiles, 1):
            self.logger.info('Plotting GC plots for %s (%d of %d)' %
                             (binFile, fileNum, len(binFiles)))

            gcPlot.plot(binFile, coverageProfile)

            binId = binIdFromFilename(binFile)
            outputFile = (os.path.join(options.output_dir, binId)
                          + '.gc_bias_plot.' + options.image_type)
            gcPlot.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Esempio n. 30
0
    def run(self, parser, outputDir):
        """Run standard E. coli genome to verify operation of CheckM.

        Executes the tree, tree_qa, lineage_set, analyze and qa commands of
        `parser` in sequence on the bundled test genome, verifying the output
        of each step with the matching self.verify* method. Results are
        written beneath <outputDir>/results, which is deleted first if it
        already exists.

        Parameters:
            parser: object exposing tree, treeQA, lineageSet, analyze and qa
                methods that each accept an options object.
            outputDir: directory under which the 'results' folder is created.
        """

        ecoliFile = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'test_data',
                                 '637000110.fna')
        checkFileExists(ecoliFile)

        options = Options()
        options.threads = 1
        options.extension = 'fna'
        options.bQuiet = True
        options.out_folder = os.path.join(outputDir, 'results')
        # start from a clean results folder so stale output cannot pass a check
        if os.path.exists(options.out_folder):
            shutil.rmtree(options.out_folder)
        makeSurePathExists(options.out_folder)

        # NOTE: print is called with a single parenthesised argument so the
        # statements are valid, and produce the same output, under both the
        # print statement and the print function.
        print('[Step 1]: Verifying tree command.')
        options.bKeepAlignment = False
        options.bNucORFs = False
        options.bCalledGenes = False
        options.bReducedTree = True
        options.bin_folder = os.path.join(DefaultValues.CHECKM_DATA_DIR,
                                          'test_data')
        parser.tree(options)
        self.verifyTree(options.out_folder)
        print('\n  [Passed]')

        print('\n')
        print('[Step 2]: Verifying tree_qa command.')
        options.tree_folder = options.out_folder
        options.out_format = 1
        options.file = os.path.join(options.out_folder, 'tree_qa_test.tsv')
        options.bTabTable = True
        parser.treeQA(options)
        self.verifyTreeQA(options.file)
        print('\n  [Passed]')

        print('\n')
        print('[Step 3]: Verifying lineage_set command.')
        options.marker_file = os.path.join(options.out_folder,
                                           'lineage_set_test.tsv')
        options.bForceDomain = False
        options.bootstrap = 0
        options.num_genomes_markers = 30
        options.num_genomes_refine = 5
        options.bNoLineageSpecificRefinement = False

        # the lineage set is verified both without and with required taxonomy
        options.bRequireTaxonomy = False
        options.unique = 10
        options.multi = 10
        parser.lineageSet(options)
        self.verifyLineageSet(options.marker_file, options.bRequireTaxonomy)

        options.bRequireTaxonomy = True
        parser.lineageSet(options)
        self.verifyLineageSet(options.marker_file, options.bRequireTaxonomy)
        print('\n  [Passed]')

        print('\n')
        print('[Step 4]: Verifying analyze command.')
        options.bAlignTopHit = False
        parser.analyze(options)
        self.verifyAnalyze(options.out_folder)
        print('\n  [Passed]')

        print('\n')
        print('[Step 5]: Verifying qa command.')
        options.alignment_file = None
        options.analyze_folder = options.out_folder
        options.out_format = 1
        options.exclude_markers = None
        options.bSkipPseudoGeneCorrection = False
        options.bSkipAdjCorrection = False
        options.file = os.path.join(options.out_folder, 'qa_test.tsv')
        options.bIndividualMarkers = False
        options.bIgnoreThresholds = False
        options.aai_strain = 0.9
        options.e_value = 1e-10
        options.length = 0.7
        options.coverage_file = None
        options.bTabTable = True
        parser.qa(options)
        self.verifyQA(options.file)
        print('\n  [Passed]')
Esempio n. 31
0
    def analyze(self, options, db=None):
        """Analyze command.

        Identifies marker genes in the bins, writes the per-bin HMM model
        information, aligns marker genes with multiple hits within a bin and
        calculates statistics for each bin. When options.bAlignTopHit is set,
        the top hit to each marker is additionally aligned and a table of
        marker lengths is written to
        <output_dir>/storage/alignments/alignment_info.tsv.

        Returns early (None) without producing output when the bin files fail
        the nucleotide / protein sequence check.

        Parameters:
            options: command options (bin_dir, extension, output_dir,
                marker_file, threads, bCalledGenes, bKeepAlignment, bNucORFs,
                bAlignTopHit).
            db: accepted but not used by this method.
        """
        self.logger.info(
            '[CheckM - analyze] Identifying marker genes in bins.')

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # input must match the declared sequence type before any work is done
        if not options.bCalledGenes:
            if not checkNuclotideSeqs(binFiles):
                return
        else:
            if not checkProteinSeqs(binFiles):
                return

        # setup directory structure
        makeSurePathExists(options.output_dir)
        makeSurePathExists(os.path.join(options.output_dir, 'bins'))
        makeSurePathExists(os.path.join(options.output_dir, 'storage'))
        makeSurePathExists(
            os.path.join(options.output_dir, 'storage', 'aai_qa'))

        # find marker genes in genome bins
        mgf = MarkerGeneFinder(options.threads)
        binIdToModels = mgf.find(binFiles, options.output_dir,
                                 DefaultValues.HMMER_TABLE_OUT,
                                 DefaultValues.HMMER_OUT, options.marker_file,
                                 options.bKeepAlignment, options.bNucORFs,
                                 options.bCalledGenes)

        markerSetParser = MarkerSetParser(options.threads)
        binIdToBinMarkerSets = markerSetParser.getMarkerSets(
            options.output_dir, getBinIdsFromOutDir(options.output_dir),
            options.marker_file)

        hmmModelInfoFile = os.path.join(options.output_dir, 'storage',
                                        DefaultValues.CHECKM_HMM_MODEL_INFO)
        markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)

        self.timeKeeper.printTimeStamp()

        # HMM model file: use the user's file only when it is itself a set of
        # HMM models, otherwise fall back to the default models
        if markerSetParser.markerFileType(
                options.marker_file) == BinMarkerSets.HMM_MODELS_SET:
            markerFile = options.marker_file
        else:
            markerFile = DefaultValues.HMM_MODELS

        # align marker genes with multiple hits within a bin
        HA = HmmerAligner(options.threads)
        HA.makeAlignmentsOfMultipleHits(
            options.output_dir, markerFile, DefaultValues.HMMER_TABLE_OUT,
            binIdToModels, binIdToBinMarkerSets, False, DefaultValues.E_VAL,
            DefaultValues.LENGTH,
            os.path.join(options.output_dir, 'storage', 'aai_qa'))

        self.timeKeeper.printTimeStamp()

        # calculate statistics for each genome bin
        binStats = BinStatistics(options.threads)
        binStats.calculate(binFiles, options.output_dir,
                           DefaultValues.BIN_STATS_OUT)

        self.timeKeeper.printTimeStamp()

        # align top hit to each marker if requested
        if options.bAlignTopHit:
            alignmentOutputFolder = os.path.join(options.output_dir, 'storage',
                                                 'alignments')
            makeSurePathExists(alignmentOutputFolder)

            # NOTE(review): options.marker_file is passed here rather than the
            # markerFile resolved above -- confirm this is intended.
            HA = HmmerAligner(options.threads)
            resultsParser = HA.makeAlignmentTopHit(
                options.output_dir, options.marker_file,
                DefaultValues.HMMER_TABLE_OUT, binIdToModels, False,
                DefaultValues.E_VAL, DefaultValues.LENGTH, True,
                alignmentOutputFolder)

            # report marker gene data; the context manager guarantees the file
            # is closed even if a write fails
            alignmentInfoFile = os.path.join(alignmentOutputFolder,
                                             'alignment_info.tsv')
            with open(alignmentInfoFile, 'w') as fout:
                fout.write('Marker Id\tLength (bp)\n')

                # marker lengths are taken from the models of the first bin;
                # look them up once instead of on every iteration
                firstBinId = list(resultsParser.models.keys())[0]
                markerModels = resultsParser.models[firstBinId]
                for markerId in markerModels.keys():
                    fout.write('%s\t%d\n' %
                               (markerId, markerModels[markerId].leng))

            self.logger.info('Alignments to top hits stored in: ' +
                             alignmentOutputFolder)

            self.timeKeeper.printTimeStamp()
Esempio n. 32
0
    def run(self, parser, outputDir):
        """Run standard E. coli genome to verify operation of CheckM.

        Executes the tree, tree_qa, lineage_set, analyze and qa commands of
        `parser` in sequence on the bundled test genome, verifying the output
        of each step with the matching self.verify* method. Results are
        written beneath <outputDir>/results, which is deleted first if it
        already exists.

        Parameters:
            parser: object exposing tree, treeQA, lineageSet, analyze and qa
                methods that each accept an options object.
            outputDir: directory under which the 'results' folder is created.
        """

        ecoliFile = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'test_data', '637000110.fna')
        checkFileExists(ecoliFile)

        options = Options()
        options.threads = 1
        # indented with spaces like the rest of the body; a tab here is
        # rejected as inconsistent indentation by Python 3
        options.pplacer_threads = 1
        options.extension = 'fna'
        options.bQuiet = True
        options.out_folder = os.path.join(outputDir, 'results')
        # start from a clean results folder so stale output cannot pass a check
        if os.path.exists(options.out_folder):
            shutil.rmtree(options.out_folder)
        makeSurePathExists(options.out_folder)

        # NOTE: print is called with a single parenthesised argument so the
        # statements are valid, and produce the same output, under both the
        # print statement and the print function.
        print('[Step 1]: Verifying tree command.')
        options.bKeepAlignment = False
        options.bNucORFs = False
        options.bCalledGenes = False
        options.bReducedTree = True
        options.bin_folder = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'test_data')
        parser.tree(options)
        self.verifyTree(options.out_folder)
        print('\n  [Passed]')

        print('\n')
        print('[Step 2]: Verifying tree_qa command.')
        options.tree_folder = options.out_folder
        options.out_format = 1
        options.file = os.path.join(options.out_folder, 'tree_qa_test.tsv')
        options.bTabTable = True
        parser.treeQA(options)
        self.verifyTreeQA(options.file)
        print('\n  [Passed]')

        print('\n')
        print('[Step 3]: Verifying lineage_set command.')
        options.marker_file = os.path.join(options.out_folder, 'lineage_set_test.tsv')
        options.bForceDomain = False
        options.bootstrap = 0
        options.num_genomes_markers = 30
        options.num_genomes_refine = 5
        options.bNoLineageSpecificRefinement = False

        # the lineage set is verified both without and with required taxonomy
        options.bRequireTaxonomy = False
        options.unique = 10
        options.multi = 10
        parser.lineageSet(options)
        self.verifyLineageSet(options.marker_file, options.bRequireTaxonomy)

        options.bRequireTaxonomy = True
        parser.lineageSet(options)
        self.verifyLineageSet(options.marker_file, options.bRequireTaxonomy)
        print('\n  [Passed]')

        print('\n')
        print('[Step 4]: Verifying analyze command.')
        options.bAlignTopHit = False
        parser.analyze(options)
        self.verifyAnalyze(options.out_folder)
        print('\n  [Passed]')

        print('\n')
        print('[Step 5]: Verifying qa command.')
        options.alignment_file = None
        options.analyze_folder = options.out_folder
        options.out_format = 1
        options.exclude_markers = None
        options.bSkipPseudoGeneCorrection = False
        options.bSkipAdjCorrection = False
        options.file = os.path.join(options.out_folder, 'qa_test.tsv')
        options.bIndividualMarkers = False
        options.bIgnoreThresholds = False
        options.aai_strain = 0.9
        options.e_value = 1e-10
        options.length = 0.7
        options.coverage_file = None
        options.bTabTable = True
        parser.qa(options)
        self.verifyQA(options.file)
        print('\n  [Passed]')