def testScaffoldLengthStats(self):
    """Verify computation of scaffold length statistics.

    Uses three toy scaffolds: 'S1' is two 4 bp stretches joined by
    DefaultValues.CONTIG_BREAK (expected to count as two contigs),
    while 'S2' (8 bp) and 'S3' (4 bp) contain no break.
    """
    binStats = BinStatistics(threads=1)

    brokenScaffold = 'ACGT' + DefaultValues.CONTIG_BREAK + 'ACGT'
    scaffolds = {'S1': brokenScaffold, 'S2': 'ACGTACGT', 'S3': 'TTtt'}
    scaffoldStats = defaultdict(dict)

    results = binStats.calculateScaffoldLengthStats(scaffolds, scaffoldStats)
    maxScaffoldLen, maxContigLen, totalScaffoldBps, _, _, numContigs = results

    # per-scaffold values recorded for the scaffold containing a break
    s1Stats = scaffoldStats['S1']
    self.assertAlmostEqual(s1Stats['Length'],
                           len(DefaultValues.CONTIG_BREAK) + 8)
    self.assertAlmostEqual(s1Stats['Total contig length'], 8)
    self.assertAlmostEqual(s1Stats['# contigs'], 2)

    # summary values returned for the whole scaffold set
    self.assertAlmostEqual(maxScaffoldLen,
                           len(DefaultValues.CONTIG_BREAK) + 8)
    self.assertAlmostEqual(maxContigLen, 8)
    self.assertAlmostEqual(totalScaffoldBps,
                           len(DefaultValues.CONTIG_BREAK) + 8 + 8 + 4)
    self.assertAlmostEqual(numContigs, 4)
def tree(self, options):
    """Tree command.

    Validates the bin files, identifies phylogenetically informative
    marker genes in each bin, writes the HMM model information and bin
    statistics to the output directory, aligns the identified markers
    and finally places the bins into the reference genome tree.

    Returns early (None) without producing any output when the
    sequence validation of the bin files fails.
    """
    self.logger.info(
        '[CheckM - tree] Placing bins in reference genome tree.')

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # called genes are validated as protein sequences,
    # everything else as nucleotide sequences
    validateSeqs = (checkProteinSeqs if options.bCalledGenes
                    else checkNuclotideSeqs)
    if not validateSeqs(binFiles):
        return

    # setup directory structure
    storageDir = os.path.join(options.output_dir, 'storage')
    makeSurePathExists(os.path.join(options.output_dir, 'bins'))
    makeSurePathExists(storageDir)

    # find phylogenetically informative genes in genome bins
    markerGeneFinder = MarkerGeneFinder(options.threads)
    binIdToModels = markerGeneFinder.find(
        binFiles,
        options.output_dir,
        DefaultValues.HMMER_TABLE_PHYLO_OUT,
        DefaultValues.HMMER_PHYLO_OUT,
        DefaultValues.PHYLO_HMM_MODELS,
        options.bKeepAlignment,
        options.bNucORFs,
        options.bCalledGenes)

    # write model information to file
    modelInfoPath = os.path.join(storageDir,
                                 DefaultValues.PHYLO_HMM_MODEL_INFO)
    MarkerSetParser(options.threads).writeBinModels(binIdToModels,
                                                    modelInfoPath)

    # calculate statistics for each genome bin
    BinStatistics(options.threads).calculate(
        binFiles, options.output_dir, DefaultValues.BIN_STATS_PHYLO_OUT)

    # align identified marker genes
    aligner = HmmerAligner(options.threads)
    resultsParser = aligner.makeAlignmentToPhyloMarkers(
        options.output_dir,
        DefaultValues.PHYLO_HMM_MODELS,
        DefaultValues.HMMER_TABLE_PHYLO_OUT,
        binIdToModels,
        False,
        DefaultValues.E_VAL,
        DefaultValues.LENGTH,
        False,
        os.path.join(storageDir, 'tree'))

    # place bins into genome tree; pplacer gets its own thread count
    # (options.pplacer_threads) rather than options.threads
    placer = PplacerRunner(threads=options.pplacer_threads)
    placer.run(binFiles, resultsParser, options.output_dir,
               options.bReducedTree)

    self.timeKeeper.printTimeStamp()
def testCodingBases(self):
    """Verify computation of coding bases.

    The test expects every gene to contribute three coding bases per
    character of its sequence, and genes 'S1_C1' and 'S1_C2' to both be
    attributed to sequence 'S1'.
    """
    binStats = BinStatistics(threads=1)

    aaGenes = {'S1_C1': 'ACGTACGT', 'S1_C2': 'ACGTACGT', 'S3_C1': 'TTtt'}
    seqStats = defaultdict(dict)

    # the private method is reached through its name-mangled attribute
    calcCodingBases = binStats._BinStatistics__calculateCodingBases
    codingBasePairs = calcCodingBases(aaGenes, seqStats)

    self.assertAlmostEqual(seqStats['S1']['# ORFs'], 2)

    s1Expected = len(aaGenes['S1_C1']) * 3 + len(aaGenes['S1_C2']) * 3
    self.assertAlmostEqual(seqStats['S1']['Coding bases'], s1Expected)

    totalExpected = s1Expected + len(aaGenes['S3_C1']) * 3
    self.assertAlmostEqual(codingBasePairs, totalExpected)
def testGC(self):
    """Verify computation of GC.

    Covers mixed-case input ('ACgt', 'GGgg', 'TTtt') and an all-'N'
    sequence, which is expected to report a GC of 0.0. The expected
    mean of 6.0 / 12.0 corresponds to six G/C bases over the twelve
    bases of S1-S3.
    """
    binStats = BinStatistics(threads=1)

    seqs = {'S1': 'ACgt', 'S2': 'GGgg', 'S3': 'TTtt', 'S4': 'NNNN'}
    seqStats = defaultdict(dict)

    meanGC, _ = binStats.calculateGC(seqs, seqStats)

    # expected per-sequence GC, checked in S1..S4 order
    expectedGC = {'S1': 0.5, 'S2': 1.0, 'S3': 0.0, 'S4': 0.0}
    for seqId, gc in expectedGC.items():
        self.assertAlmostEqual(seqStats[seqId]['GC'], gc)

    self.assertAlmostEqual(meanGC, 6.0 / 12.0)
def testGC(self):
    """Verify computation of GC.

    Checks the per-sequence GC stored in the stats dictionary for four
    toy sequences (including lower-case bases and an all-'N' sequence)
    and the mean GC returned for the whole set.
    """
    binStats = BinStatistics(threads=1)

    seqs = {'S1': 'ACgt', 'S2': 'GGgg', 'S3': 'TTtt', 'S4': 'NNNN'}
    seqStats = defaultdict(dict)

    gcResults = binStats.calculateGC(seqs, seqStats)
    meanGC, _ = gcResults

    # (sequence id, expected GC fraction) pairs, asserted in order
    expectations = [('S1', 0.5), ('S2', 1.0), ('S3', 0.0), ('S4', 0.0)]
    for seqId, expectedGC in expectations:
        self.assertAlmostEqual(seqStats[seqId]['GC'], expectedGC)

    self.assertAlmostEqual(meanGC, 6.0 / 12.0)
def testScaffoldLengthStats(self):
    """Verify computation of scaffold length statistics.

    'S1' is built from two 4 bp pieces separated by
    DefaultValues.CONTIG_BREAK, so the test expects it to be reported
    as two contigs whose combined length excludes the break itself.
    """
    binStats = BinStatistics(threads=1)

    scaffolds = {
        'S1': 'ACGT' + DefaultValues.CONTIG_BREAK + 'ACGT',
        'S2': 'ACGTACGT',
        'S3': 'TTtt',
    }
    scaffoldStats = defaultdict(dict)

    (maxScaffoldLen,
     maxContigLen,
     totalScaffoldBps,
     _,
     _,
     numContigs) = binStats.calculateScaffoldLengthStats(scaffolds,
                                                         scaffoldStats)

    # statistics stored for scaffold S1
    self.assertAlmostEqual(scaffoldStats['S1']['Length'],
                           len(DefaultValues.CONTIG_BREAK) + 8)
    self.assertAlmostEqual(scaffoldStats['S1']['Total contig length'], 8)
    self.assertAlmostEqual(scaffoldStats['S1']['# contigs'], 2)

    # statistics returned for the complete set of scaffolds
    self.assertAlmostEqual(maxScaffoldLen,
                           len(DefaultValues.CONTIG_BREAK) + 8)
    self.assertAlmostEqual(maxContigLen, 8)
    expectedTotalBps = len(DefaultValues.CONTIG_BREAK) + 8 + 8 + 4
    self.assertAlmostEqual(totalScaffoldBps, expectedTotalBps)
    self.assertAlmostEqual(numContigs, 4)
def parallelCoordPlot(self, options):
    """Parallel coordinate plot command.

    Checks the input paths, parses the coverage file, gathers sequence
    statistics for every bin and then writes one parallel coordinate
    plot per bin into options.output_dir, logging progress as it goes.
    """
    self.logger.info(
        '[CheckM - par_plot] Creating parallel coordinate plot of GC and coverage.'
    )

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)
    checkFileExists(options.coverage_file)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # read coverage stats file
    coverageParser = Coverage(threads=1)
    coverageStats = coverageParser.parseCoverage(options.coverage_file)

    # calculate sequence stats for all bins
    self.logger.info('Calculating sequence statistics for each bin.')
    binStats = BinStatistics()
    seqStats = {}
    for binFile in binFiles:
        binId = binIdFromFilename(binFile)
        seqStats[binId] = binStats.sequenceStats(options.results_dir,
                                                 binFile)

    # create plot for each bin
    plot = ParallelCoordPlot(options)
    numBins = len(binFiles)
    for fileNum, binFile in enumerate(binFiles, start=1):
        binId = binIdFromFilename(binFile)
        self.logger.info(
            'Plotting parallel coordinates for %s (%d of %d)' %
            (binId, fileNum, numBins))

        plot.plot(binId, seqStats, coverageStats)

        # output name: <output_dir>/<binId>.paralel_coord_plot.<image_type>
        plotPrefix = os.path.join(options.output_dir, binId)
        outputFile = plotPrefix + '.paralel_coord_plot.' + options.image_type
        plot.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def testCodingBases(self):
    """Verify computation of coding bases.

    Checks the ORF count and coding bases recorded for sequence 'S1'
    (which owns genes 'S1_C1' and 'S1_C2') and the total number of
    coding base pairs returned across all three genes.
    """
    binStats = BinStatistics(threads=1)

    aaGenes = {
        'S1_C1': 'ACGTACGT',
        'S1_C2': 'ACGTACGT',
        'S3_C1': 'TTtt',
    }
    seqStats = defaultdict(dict)

    # name-mangled access to the private __calculateCodingBases method
    codingBasePairs = binStats._BinStatistics__calculateCodingBases(
        aaGenes, seqStats)

    s1Stats = seqStats['S1']
    self.assertAlmostEqual(s1Stats['# ORFs'], 2)

    # the test expects three coding bases per character of each gene
    expectedS1 = len(aaGenes['S1_C1']) * 3 + len(aaGenes['S1_C2']) * 3
    self.assertAlmostEqual(s1Stats['Coding bases'], expectedS1)

    expectedTotal = (len(aaGenes['S1_C1']) * 3
                     + len(aaGenes['S1_C2']) * 3
                     + len(aaGenes['S3_C1']) * 3)
    self.assertAlmostEqual(codingBasePairs, expectedTotal)
def analyze(self, options, db=None):
    """Analyze command.

    Identifies marker genes in each bin, stores the HMM model
    information, aligns marker genes with multiple hits within a bin,
    calculates bin statistics and, when options.bAlignTopHit is set,
    aligns the top hit to each marker and writes 'alignment_info.tsv'
    (marker id and length) next to those alignments.

    Args:
        options: parsed command-line options; the attributes read here
            are bin_dir, extension, bCalledGenes, output_dir, threads,
            marker_file, bKeepAlignment, bNucORFs and bAlignTopHit.
        db: not used by this method; kept so existing callers that
            pass it keep working.

    Returns:
        None. Returns early, before creating any output, when the
        sequence validation of the bin files fails.
    """
    self.logger.info(
        '[CheckM - analyze] Identifying marker genes in bins.')

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # called genes are validated as protein sequences,
    # everything else as nucleotide sequences
    if not options.bCalledGenes:
        if not checkNuclotideSeqs(binFiles):
            return
    else:
        if not checkProteinSeqs(binFiles):
            return

    # setup directory structure
    storageDir = os.path.join(options.output_dir, 'storage')
    aaiDir = os.path.join(options.output_dir, 'storage', 'aai_qa')
    makeSurePathExists(options.output_dir)
    makeSurePathExists(os.path.join(options.output_dir, 'bins'))
    makeSurePathExists(storageDir)
    makeSurePathExists(aaiDir)

    # find marker genes in genome bins
    mgf = MarkerGeneFinder(options.threads)
    binIdToModels = mgf.find(binFiles,
                             options.output_dir,
                             DefaultValues.HMMER_TABLE_OUT,
                             DefaultValues.HMMER_OUT,
                             options.marker_file,
                             options.bKeepAlignment,
                             options.bNucORFs,
                             options.bCalledGenes)

    markerSetParser = MarkerSetParser(options.threads)
    binIdToBinMarkerSets = markerSetParser.getMarkerSets(
        options.output_dir,
        getBinIdsFromOutDir(options.output_dir),
        options.marker_file)

    hmmModelInfoFile = os.path.join(options.output_dir,
                                    'storage',
                                    DefaultValues.CHECKM_HMM_MODEL_INFO)
    markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)

    self.timeKeeper.printTimeStamp()

    # HMM model file: use the marker file itself when it is a set of
    # HMM models, otherwise fall back to the default HMM models
    if markerSetParser.markerFileType(
            options.marker_file) == BinMarkerSets.HMM_MODELS_SET:
        markerFile = options.marker_file
    else:
        markerFile = DefaultValues.HMM_MODELS

    # align marker genes with multiple hits within a bin
    HA = HmmerAligner(options.threads)
    HA.makeAlignmentsOfMultipleHits(options.output_dir,
                                    markerFile,
                                    DefaultValues.HMMER_TABLE_OUT,
                                    binIdToModels,
                                    binIdToBinMarkerSets,
                                    False,
                                    DefaultValues.E_VAL,
                                    DefaultValues.LENGTH,
                                    aaiDir)

    self.timeKeeper.printTimeStamp()

    # calculate statistics for each genome bin
    binStats = BinStatistics(options.threads)
    binStats.calculate(binFiles, options.output_dir,
                       DefaultValues.BIN_STATS_OUT)

    self.timeKeeper.printTimeStamp()

    # align top hit to each marker if requested
    if options.bAlignTopHit:
        alignmentOutputFolder = os.path.join(options.output_dir,
                                             'storage',
                                             'alignments')
        makeSurePathExists(alignmentOutputFolder)

        HA = HmmerAligner(options.threads)
        resultsParser = HA.makeAlignmentTopHit(
            options.output_dir,
            options.marker_file,
            DefaultValues.HMMER_TABLE_OUT,
            binIdToModels,
            False,
            DefaultValues.E_VAL,
            DefaultValues.LENGTH,
            True,
            alignmentOutputFolder)

        # Marker lengths are read from the models of the first bin only.
        # Look that entry up once instead of once per marker, and fall
        # back to an empty table when no models were parsed so that the
        # report still gets its header instead of raising IndexError.
        binIds = list(resultsParser.models.keys())
        markerModels = resultsParser.models[binIds[0]] if binIds else {}

        # report marker gene data; the context manager guarantees the
        # file is flushed and closed even if a write fails
        alignmentInfoFile = os.path.join(alignmentOutputFolder,
                                         'alignment_info.tsv')
        with open(alignmentInfoFile, 'w') as fout:
            fout.write('Marker Id\tLength (bp)\n')
            for markerId, model in markerModels.items():
                fout.write('%s\t%d\n' % (markerId, model.leng))

        self.logger.info('Alignments to top hits stored in: ' +
                         alignmentOutputFolder)

        self.timeKeeper.printTimeStamp()