def qa(self, options):
    """Run the QA command and report genome statistics for each bin.

    Steps, in order: validate the analysis directory (and the optional
    exclude-markers file), run the amino acid identity calculation, load the
    per-bin HMM models and marker sets, analyse the results, print the
    summary and cache the results.

    Args:
        options: Parsed command options. The attributes read here are
            ``analyze_dir``, ``exclude_markers``, ``aai_strain``,
            ``alignment_file``, ``threads``, ``marker_file``,
            ``bIgnoreThresholds``, ``e_value``, ``length``,
            ``bSkipPseudoGeneCorrection``, ``bSkipAdjCorrection``,
            ``out_format``, ``bIndividualMarkers``, ``coverage_file``,
            ``bTabTable`` and ``file``.
    """
    self.logger.info('[CheckM - qa] Tabulating genome statistics.')

    analyzeDir = options.analyze_dir
    checkDirExists(analyzeDir)

    # the exclude file is optional; only validate it when one was given
    excludeMarkers = options.exclude_markers
    if excludeMarkers:
        checkFileExists(excludeMarkers)

    # AAI between markers with multiple hits in a single bin
    strainAai = AminoAcidIdentity()
    strainAai.run(options.aai_strain, analyzeDir, options.alignment_file)

    # HMM models and marker sets for each bin
    setParser = MarkerSetParser(options.threads)
    modelInfoPath = os.path.join(analyzeDir,
                                 'storage',
                                 DefaultValues.CHECKM_HMM_MODEL_INFO)
    modelsByBin = setParser.loadBinModels(modelInfoPath)
    binIds = getBinIdsFromOutDir(analyzeDir)
    markerSetsByBin = setParser.getMarkerSets(analyzeDir,
                                              binIds,
                                              options.marker_file,
                                              excludeMarkers)

    # parse, summarise and cache the results for each bin
    results = ResultsParser(modelsByBin)
    results.analyseResults(
        analyzeDir,
        DefaultValues.BIN_STATS_OUT,
        DefaultValues.HMMER_TABLE_OUT,
        bIgnoreThresholds=options.bIgnoreThresholds,
        evalueThreshold=options.e_value,
        lengthThreshold=options.length,
        bSkipPseudoGeneCorrection=options.bSkipPseudoGeneCorrection,
        bSkipAdjCorrection=options.bSkipAdjCorrection)

    results.printSummary(options.out_format,
                         strainAai,
                         markerSetsByBin,
                         options.bIndividualMarkers,
                         options.coverage_file,
                         options.bTabTable,
                         options.file,
                         anaFolder=analyzeDir)
    results.cacheResults(analyzeDir,
                         markerSetsByBin,
                         options.bIndividualMarkers)

    # only announce an output file when one was actually requested
    outputPath = options.file
    if outputPath != '':
        self.logger.info('QA information written to: ' + outputPath)

    self.timeKeeper.printTimeStamp()
def testMultiCopyStrainHetero(self):
    """Check strain heterogeneity scores when each gene has several AAI values.

    With a 0.9 threshold, the expected per-gene scores are 1.0, 0.0 and 1/3,
    and the expected bin-level score is 400/9.
    """
    # NOTE(review): another test with this exact name exists in this source;
    # if both are in the same class the later definition replaces the earlier.
    scoresByBin = defaultdict(dict)
    scoresByBin['b1'] = {
        'g1': [0.95, 0.95, 0.95],
        'g2': [0.1, 0.1, 0.1],
        'g3': [0.95, 0.1, 0.1],
    }

    calculator = AminoAcidIdentity()
    aaiHetero, aaiMeanBinHetero = calculator.strainHetero(scoresByBin, 0.9)

    expectedPerGene = (('g1', 1.0), ('g2', 0.0), ('g3', 1.0 / 3.0))
    for geneId, expectedScore in expectedPerGene:
        self.assertAlmostEqual(aaiHetero['b1'][geneId], expectedScore)

    self.assertAlmostEqual(aaiMeanBinHetero['b1'], 400.0 / 9.0)
def testMixedStrainHetero(self):
    """Check strain heterogeneity scores when only one gene exceeds the threshold.

    With a 0.9 threshold and one AAI value per gene (0.95, 0.1, 0.1), the
    expected per-gene scores are 1.0, 0.0 and 0.0, and the expected bin-level
    score is 100/3.
    """
    scoresByBin = defaultdict(dict)
    scoresByBin['b1'] = {'g1': [0.95], 'g2': [0.1], 'g3': [0.1]}

    calculator = AminoAcidIdentity()
    aaiHetero, aaiMeanBinHetero = calculator.strainHetero(scoresByBin, 0.9)

    expectedPerGene = (('g1', 1.0), ('g2', 0.0), ('g3', 0.0))
    for geneId, expectedScore in expectedPerGene:
        self.assertAlmostEqual(aaiHetero['b1'][geneId], expectedScore)

    self.assertAlmostEqual(aaiMeanBinHetero['b1'], 100.0 / 3.0)
def testAllStrainHetero(self):
    """Check strain heterogeneity scores when every gene exceeds the threshold.

    With a 0.9 threshold and an AAI value of 0.95 for each gene, every
    per-gene score is expected to be 1.0 and the bin-level score 100.0.
    """
    scoresByBin = defaultdict(dict)
    scoresByBin['b1'] = {'g1': [0.95], 'g2': [0.95], 'g3': [0.95]}

    calculator = AminoAcidIdentity()
    aaiHetero, aaiMeanBinHetero = calculator.strainHetero(scoresByBin, 0.9)

    for geneId in ('g1', 'g2', 'g3'):
        self.assertAlmostEqual(aaiHetero['b1'][geneId], 1.0)

    self.assertAlmostEqual(aaiMeanBinHetero['b1'], 100.0)
def testMultiCopyStrainHetero(self):
    """Check strain heterogeneity scoring for genes with multiple AAI values.

    Three genes each carry three AAI values; using a threshold of 0.9 the
    test expects per-gene scores of 1.0, 0.0 and 1/3 and a bin-level score
    of 400/9.
    """
    # NOTE(review): a test with the same name is defined elsewhere in this
    # source; within one class only the last definition would be run.
    geneScores = {
        'g1': [0.95, 0.95, 0.95],
        'g2': [0.1, 0.1, 0.1],
        'g3': [0.95, 0.1, 0.1],
    }
    aaiScores = defaultdict(dict)
    aaiScores['b1'] = geneScores

    heteroPerGene, heteroPerBin = AminoAcidIdentity().strainHetero(aaiScores, 0.9)

    expected = {'g1': 1.0, 'g2': 0.0, 'g3': 1.0 / 3.0}
    for geneId in ('g1', 'g2', 'g3'):
        self.assertAlmostEqual(heteroPerGene['b1'][geneId], expected[geneId])

    self.assertAlmostEqual(heteroPerBin['b1'], 400.0 / 9.0)
def binQAPlot(self, options):
    """Run the bin QA plot command and save a bar plot of bin quality.

    The plot is written to ``bin_qa_plot.<image_type>`` inside
    ``options.output_dir``, but only when ``BinQAPlot.plot`` returns a
    truthy value.

    Args:
        options: Parsed command options. The attributes read here are
            ``bin_dir``, ``output_dir``, ``extension``, ``results_dir``,
            ``bIgnoreHetero``, ``aai_strain``, ``image_type`` and ``dpi``;
            the whole object is also handed to ``BinQAPlot``.
    """
    self.logger.info(
        '[CheckM - bin_qa_plot] Creating bar plot of bin quality.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # extended bin statistics from the results directory
    statsParser = ResultsParser(None)
    binStatsExt = statsParser.parseBinStatsExt(options.results_dir)

    qaPlot = BinQAPlot(options)

    # strain heterogeneity is only computed when it has not been disabled
    if options.bIgnoreHetero:
        aaiHetero = None
    else:
        strainAai = AminoAcidIdentity()
        strainAai.run(options.aai_strain, options.results_dir, None)
        aaiHetero = strainAai.aaiHetero

    plotCreated = qaPlot.plot(binFiles,
                              binStatsExt,
                              options.bIgnoreHetero,
                              aaiHetero)

    if plotCreated:
        outputFile = os.path.join(options.output_dir,
                                  'bin_qa_plot.' + options.image_type)
        qaPlot.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def testDiffSeqs(self):
    """Check that AAI is 0.0 for two sequences that differ at every position."""
    calculator = AminoAcidIdentity()
    self.assertAlmostEqual(calculator.aai('ACGT', 'TGCA'), 0.0)
def testIdenticalInsertionSeqs(self):
    """Check that AAI is 1.0 for identical sequences containing '-' characters."""
    gappedSeq = 'A-C-G-T'
    calculator = AminoAcidIdentity()
    self.assertAlmostEqual(calculator.aai(gappedSeq, 'A-C-G-T'), 1.0)
def testDiffInsertionSeqs(self):
    """Check AAI of a sequence with several '-' characters against one without.

    'A-C-G-T' compared with 'AACCGGT' is expected to score 4/7.
    """
    calculator = AminoAcidIdentity()
    observed = calculator.aai('A-C-G-T', 'AACCGGT')
    expected = 4.0 / 7.0
    self.assertAlmostEqual(observed, expected)
def testIncompleteSeqs(self):
    """Check AAI when one sequence has leading and trailing '-' characters.

    'AAAACGTTTT' compared with '---ACGG---' is expected to score 3/4.
    """
    calculator = AminoAcidIdentity()
    observed = calculator.aai('AAAACGTTTT', '---ACGG---')
    expected = 3.0 / 4.0
    self.assertAlmostEqual(observed, expected)
def testSimilarSeqs(self):
    """Check AAI of two sequences that differ only in their last position.

    'ACGT' compared with 'ACGG' is expected to score 3/4.
    """
    calculator = AminoAcidIdentity()
    observed = calculator.aai('ACGT', 'ACGG')
    expected = 3.0 / 4.0
    self.assertAlmostEqual(observed, expected)
def testNonOverlappingSeqs(self):
    """Check that AAI is 0.0 when no position has a residue in both sequences."""
    calculator = AminoAcidIdentity()
    self.assertAlmostEqual(calculator.aai('ACGT----', '----TGCA'), 0.0)
def testPartialOverlappingSeqs(self):
    """Check AAI of two sequences that share only a short overlapping region.

    'ACGT--' and '--GTAC' have identical residues at the two positions where
    both contain one; the expected score is 1.0.
    """
    calculator = AminoAcidIdentity()
    self.assertAlmostEqual(calculator.aai('ACGT--', '--GTAC'), 1.0)
def testIdentiticalSeqs(self):
    """Check that AAI is 1.0 when a sequence is compared with an identical one."""
    # NOTE(review): the method name misspells "Identical"; it is left
    # unchanged so the existing test id keeps working.
    sequence = 'ACGT'
    calculator = AminoAcidIdentity()
    self.assertAlmostEqual(calculator.aai(sequence, 'ACGT'), 1.0)