Example #1
    def removeOutliers(self, binFile, outlierFile, outputFile):
        """Remove sequences specified as outliers in the provided file."""

        binSeqs = readFasta(binFile)
        binIdToModify = binIdFromFilename(binFile)

        # get files to remove
        checkFileExists(outlierFile)
        seqsToRemove = []
        bHeader = True
        for line in open(outlierFile):
            if bHeader:
                bHeader = False
                continue

            lineSplit = line.split('\t')
            binId = lineSplit[0]

            if binId == binIdToModify:
                seqId = lineSplit[1]
                seqsToRemove.append(seqId)

        # remove sequences from bin
        if len(seqsToRemove) > 0:
            self.__removeSeqs(binSeqs, seqsToRemove)

        # save modified bin
        writeFasta(binSeqs, outputFile)
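All of the examples in this listing guard their file I/O with CheckM's checkFileExists helper, which is not itself shown here. A minimal sketch of such a guard, assuming it simply logs an error and exits when the path is missing (the real CheckM helper may behave differently), could look like this:

    import os
    import sys
    import logging

    def checkFileExists(inputFile):
        """Abort with a log message if the given file does not exist (illustrative sketch only)."""
        # assumption: the real CheckM helper may log or format this differently
        if not os.path.exists(inputFile):
            logging.getLogger(__name__).error('Input file does not exist: ' + inputFile)
            sys.exit(1)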
Example #2
    def coveragePcaPlot(self, options):
        """PCA plot of coverage profiles"""
        self.logger.info(
            '[CheckM - cov_pca] Creating PCA plot of coverage profiles.')

        checkDirExists(options.bin_dir)
        checkFileExists(options.coverage_file)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        coverage = Coverage(threads=1)
        coverageStats = coverage.parseCoverage(options.coverage_file)

        seqIds = []
        coverageProfiles = []
        for binId, seqDict in coverageStats.items():
            for seqId, bamDict in seqDict.items():
                seqIds.append(seqId)

                coverages = []
                for _, coverage in bamDict.items():
                    coverages.append(coverage)

                coverageProfiles.append(coverages)

        coverageProfiles = np.array(coverageProfiles)
        if coverageProfiles.shape[1] < 2:
            self.logger.error(
                'Coverage profile is 1 dimensional. PCA requires at least 2 dimensions.'
            )
            sys.exit(1)

        self.logger.info('Computing PCA of coverage profiles.\n')
        pca = PCA()
        pc, variance = pca.pcaMatrix(coverageProfiles,
                                     fraction=1.0,
                                     bCenter=True,
                                     bScale=False)

        plots = PcaPlot(options)
        filesProcessed = 1
        for f in binFiles:
            self.logger.info(
                'Plotting PCA of coverage profiles for %s (%d of %d)' %
                (f, filesProcessed, len(binFiles)))
            filesProcessed += 1

            plots.plot(f, seqIds, pc, variance)

            binId = binIdFromFilename(f)
            outputFile = os.path.join(
                options.output_dir,
                binId) + '.cov_pca_plots.' + options.image_type
            plots.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Example #3
    def parseBinStatsExt(self, resultsFolder):
        """Read bin statistics from file."""
        binStatsExtFile = os.path.join(resultsFolder, 'storage', DefaultValues.BIN_STATS_EXT_OUT)

        checkFileExists(binStatsExtFile)

        with open(binStatsExtFile, 'r') as f:
            s = f.read()
            binStatsExt = ast.literal_eval(s)

        return binStatsExt
Example #4
    def __init__(self, filename):
        checkFileExists(filename)

        self.genes = {}
        self.lastCodingBase = {}

        self.__parseGFF(filename)

        self.codingBaseMasks = {}
        for seqId in self.genes:
            self.codingBaseMasks[seqId] = self.__buildCodingBaseMask(seqId)
Example #5
    def parseSeqStats(self, resultsFolder, seqStatsFile):
        """Read sequence statistics from file."""
        seqStatsFile = os.path.join(resultsFolder, 'storage', seqStatsFile)

        checkFileExists(seqStatsFile)

        with open(seqStatsFile, 'r') as f:
            s = f.read()
            seqStats = ast.literal_eval(s)

        return seqStats
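Several of the parse* methods above (parseBinStatsExt, parseSeqStats, parseBinStats) rely on the storage files being plain Python literals that ast.literal_eval can reconstruct. A small, self-contained round-trip illustrating that format; the file name and contents here are hypothetical:

    import ast

    # hypothetical stats dictionary and storage file name
    seqStats = {'bin1': {'contig_1': {'Length': 15000, 'GC': 0.52}}}

    # write the dictionary as its repr(), which is exactly what literal_eval expects back
    with open('seq_stats.txt', 'w') as f:
        f.write(repr(seqStats))

    with open('seq_stats.txt', 'r') as f:
        restored = ast.literal_eval(f.read())

    assert restored == seqStats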
Example #6
    def run(self, binFiles, seqFile, outSeqFile, outStatsFile, minSeqLen):
        checkFileExists(seqFile)

        # get list of sequences in bins
        self.logger.info('  Reading binned sequences.')

        binnedSeqs = {}
        totalBinnedBases = 0
        for binFile in binFiles:
            seqs = readFasta(binFile)
            binnedSeqs.update(seqs)
            for seq in seqs.values():
                totalBinnedBases += len(seq)

        self.logger.info('    Read %d (%.2f Mbp) binned sequences.' % (len(binnedSeqs), float(totalBinnedBases) / 1e6))

        # get list of all sequences
        self.logger.info('  Reading all sequences.')
        allSeqs = readFasta(seqFile)
        totalBases = 0
        for seq in allSeqs.values():
            totalBases += len(seq)
        self.logger.info('    Read %d (%.2f Mbp) sequences.' % (len(allSeqs), float(totalBases) / 1e6))

        # write all unbinned sequences
        self.logger.info('  Identifying unbinned sequences >= %d bp.' % minSeqLen)
        seqOut = open(outSeqFile, 'w')

        statsOut = open(outStatsFile, 'w')
        statsOut.write('Sequence Id\tLength\tGC\n')

        unbinnedCount = 0
        unbinnedBases = 0
        for seqId, seq in allSeqs.items():
            if seqId not in binnedSeqs:
                if len(seq) >= minSeqLen:
                    unbinnedCount += 1
                    seqOut.write('>' + seqId + '\n')
                    seqOut.write(seq + '\n')

                    unbinnedBases += len(seq)

                    a, c, g, t = baseCount(seq)

                    statsOut.write('%s\t%d\t%.2f\n' % (seqId, len(seq), float(g + c) * 100 / (a + c + g + t)))

        seqOut.close()
        statsOut.close()

        self.logger.info('    Identified %d (%.2f Mbp) unbinned sequences.' % (unbinnedCount, float(unbinnedBases) / 1e6))

        self.logger.info('')
        self.logger.info('  Percentage of unbinned sequences: %.2f%%' % (unbinnedCount * 100.0 / len(allSeqs)))
        self.logger.info('  Percentage of unbinned bases: %.2f%%' % (unbinnedBases * 100.0 / totalBases))
Example #7
    def __init__(self, filename):
        checkFileExists(filename)

        self.genes = {}
        self.lastCodingBase = {}

        self.__parseGFF(filename)

        self.codingBaseMasks = {}
        for seqId in self.genes:
            self.codingBaseMasks[seqId] = self.__buildCodingBaseMask(seqId)
Example #8
    def parseBinStats(self, resultsFolder, binStatsFile):
        """Read bin statistics from file."""
        binStatsFile = os.path.join(resultsFolder, 'storage', binStatsFile)

        checkFileExists(binStatsFile)

        with open(binStatsFile, 'r') as f:
            s = f.read()
            binStats = ast.literal_eval(s)

        return binStats
Example #9
    def parseMarkerGeneStats(self, resultsFolder):
        """Read bin statistics from file."""
        markerGeneStatsFile = os.path.join(resultsFolder, 'storage', DefaultValues.MARKER_GENE_STATS)

        checkFileExists(markerGeneStatsFile)

        with open(markerGeneStatsFile, 'r') as f:
            s = f.read()
            markerGeneStats = ast.literal_eval(s)

        return markerGeneStats
Example #10
    def qa(self, options):
        """QA command"""
        self.logger.info('[CheckM - qa] Tabulating genome statistics.')

        checkDirExists(options.analyze_dir)

        if options.exclude_markers:
            checkFileExists(options.exclude_markers)

        # calculate AAI between markers with multiple hits in a single bin
        aai = AminoAcidIdentity()
        aai.run(options.aai_strain, options.analyze_dir,
                options.alignment_file)

        # get HMM file for each bin

        markerSetParser = MarkerSetParser(options.threads)

        hmmModelInfoFile = os.path.join(options.analyze_dir, 'storage',
                                        DefaultValues.CHECKM_HMM_MODEL_INFO)
        binIdToModels = markerSetParser.loadBinModels(hmmModelInfoFile)

        binIdToBinMarkerSets = markerSetParser.getMarkerSets(
            options.analyze_dir, getBinIdsFromOutDir(options.analyze_dir),
            options.marker_file, options.exclude_markers)

        # get results for each bin
        RP = ResultsParser(binIdToModels)
        RP.analyseResults(
            options.analyze_dir,
            DefaultValues.BIN_STATS_OUT,
            DefaultValues.HMMER_TABLE_OUT,
            bIgnoreThresholds=options.bIgnoreThresholds,
            evalueThreshold=options.e_value,
            lengthThreshold=options.length,
            bSkipPseudoGeneCorrection=options.bSkipPseudoGeneCorrection,
            bSkipAdjCorrection=options.bSkipAdjCorrection)

        RP.printSummary(options.out_format,
                        aai,
                        binIdToBinMarkerSets,
                        options.bIndividualMarkers,
                        options.coverage_file,
                        options.bTabTable,
                        options.file,
                        anaFolder=options.analyze_dir)
        RP.cacheResults(options.analyze_dir, binIdToBinMarkerSets,
                        options.bIndividualMarkers)

        if options.file != '':
            self.logger.info('QA information written to: ' + options.file)

        self.timeKeeper.printTimeStamp()
Example #11
    def run(self, binFiles, seqFile, outSeqFile, outStatsFile, minSeqLen):
        checkFileExists(seqFile)

        # get list of sequences in bins
        self.logger.info('Reading binned sequences.')

        binnedSeqs = {}
        totalBinnedBases = 0
        for binFile in binFiles:
            seqs = readFasta(binFile)
            binnedSeqs.update(seqs)
            for seq in seqs.values():
                totalBinnedBases += len(seq)

        self.logger.info('  Read %d (%.2f Mbp) binned sequences.' % (len(binnedSeqs), float(totalBinnedBases) / 1e6))

        # get list of all sequences
        self.logger.info('Reading all sequences.')
        allSeqs = readFasta(seqFile)
        totalBases = 0
        for seq in allSeqs.values():
            totalBases += len(seq)
        self.logger.info('  Read %d (%.2f Mbp) sequences.' % (len(allSeqs), float(totalBases) / 1e6))

        # write all unbinned sequences
        self.logger.info('Identifying unbinned sequences >= %d bp.' % minSeqLen)
        seqOut = open(outSeqFile, 'w')

        statsOut = open(outStatsFile, 'w')
        statsOut.write('Sequence Id\tLength\tGC\n')

        unbinnedCount = 0
        unbinnedBases = 0
        for seqId, seq in allSeqs.items():
            if seqId not in binnedSeqs:
                if len(seq) >= minSeqLen:
                    unbinnedCount += 1
                    seqOut.write('>' + seqId + '\n')
                    seqOut.write(seq + '\n')

                    unbinnedBases += len(seq)

                    a, c, g, t = baseCount(seq)

                    statsOut.write('%s\t%d\t%.2f\n' % (seqId, len(seq), float(g + c) * 100 / (a + c + g + t)))

        seqOut.close()
        statsOut.close()

        self.logger.info('  Identified %d (%.2f Mbp) unbinned sequences.' % (unbinnedCount, float(unbinnedBases) / 1e6))

        self.logger.info('Percentage of unbinned sequences: %.2f%%' % (unbinnedCount * 100.0 / len(allSeqs)))
        self.logger.info('Percentage of unbinned bases: %.2f%%' % (unbinnedBases * 100.0 / totalBases))
Example #12
    def genesInClan(self):
        """Determine all genes within a each clan."""
        checkFileExists(self.pfamClanFile)

        d = defaultdict(set)
        for line in open(self.pfamClanFile):
            if '#=GF AC' in line:
                pfamAcc = line.split()[2].strip()
            elif '#=GF CL' in line:
                clanId = line.split()[2].strip()
                d[clanId].update([pfamAcc])

        return d
Example #13
    def parseMarkerGeneStats(self, resultsFolder):
        """Read bin statistics from file."""
        markerGeneStatsFile = os.path.join(resultsFolder, 'storage', DefaultValues.MARKER_GENE_STATS)

        checkFileExists(markerGeneStatsFile)

        markerGeneStats = {}
        with open(markerGeneStatsFile, 'r') as f:
            for line in f:
                lineSplit = line.split('\t')
                markerGeneStats[lineSplit[0]] = ast.literal_eval(lineSplit[1])

        return markerGeneStats
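Note that this variant (and the parseBinStats/parseBinStatsExt variants below) parses one "key&lt;TAB&gt;literal" record per line rather than literal_eval-ing the whole file. A hypothetical writer producing that layout, with made-up file name and values:

    # hypothetical writer for the one-record-per-line layout parsed above
    binStats = {
        'bin1': {'Completeness': 98.3, 'Contamination': 1.1},
        'bin2': {'Completeness': 75.0, 'Contamination': 0.4},
    }

    with open('bin_stats.tsv', 'w') as f:
        for binId, stats in binStats.items():
            f.write('%s\t%s\n' % (binId, repr(stats)))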
Example #14
    def parseBinStats(self, resultsFolder, binStatsFile):
        """Read bin statistics from file."""
        binStatsFile = os.path.join(resultsFolder, 'storage', binStatsFile)

        checkFileExists(binStatsFile)

        binStats = {}
        with open(binStatsFile, 'r') as f:
            for line in f:
                lineSplit = line.split('\t')
                binStats[lineSplit[0]] = ast.literal_eval(lineSplit[1])

        return binStats
Example #15
    def genesInClan(self):
        """Determine all genes within a each clan."""
        checkFileExists(self.pfamClanFile)

        d = defaultdict(set)
        for line in open(self.pfamClanFile):
            if '#=GF AC' in line:
                pfamAcc = line.split()[2].strip()
            elif '#=GF CL' in line:
                clanId = line.split()[2].strip()
                d[clanId].update([pfamAcc])

        return d
Example #16
    def parseMarkerGeneStats(self, resultsFolder):
        """Read bin statistics from file."""
        markerGeneStatsFile = os.path.join(resultsFolder, 'storage', DefaultValues.MARKER_GENE_STATS)

        checkFileExists(markerGeneStatsFile)

        markerGeneStats = {}
        with open(markerGeneStatsFile, 'r') as f:
            for line in f:
                lineSplit = line.split('\t')
                markerGeneStats[lineSplit[0]] = ast.literal_eval(lineSplit[1])

        return markerGeneStats
Example #17
    def parseBinStatsExt(self, resultsFolder):
        """Read bin statistics from file."""
        binStatsExtFile = os.path.join(resultsFolder, 'storage', DefaultValues.BIN_STATS_EXT_OUT)

        checkFileExists(binStatsExtFile)

        binStatsExt = {}
        with open(binStatsExtFile, 'r') as f:
            for line in f:
                lineSplit = line.split('\t')
                binStatsExt[lineSplit[0]] = ast.literal_eval(lineSplit[1])

        return binStatsExt
Example #18
    def parseBinStatsExt(self, resultsFolder):
        """Read bin statistics from file."""
        binStatsExtFile = os.path.join(resultsFolder, 'storage', DefaultValues.BIN_STATS_EXT_OUT)

        checkFileExists(binStatsExtFile)

        binStatsExt = {}
        with open(binStatsExtFile, 'r') as f:
            for line in f:
                lineSplit = line.split('\t')
                binStatsExt[lineSplit[0]] = ast.literal_eval(lineSplit[1])

        return binStatsExt
Example #19
    def pfamIdToClanId(self):
        """Determine clan of each pfam."""
        checkFileExists(self.pfamClanFile)

        d = {}
        for line in open(self.pfamClanFile):
            if '#=GF AC' in line:
                pfamAcc = line.split()[2].strip()
            elif '#=GF CL' in line:
                clanId = line.split()[2].strip()
                d[pfamAcc] = clanId

        return d
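The clan-related parsers (genesInClan, pfamIdToClanId, __readClansAndNesting) all scan Stockholm-style "#=GF" annotation lines from the Pfam clan file. A small, self-contained illustration of the mapping this loop builds, using a hypothetical file fragment:

    # hypothetical fragment of a Pfam clan file (Stockholm-style annotations)
    sample = [
        '#=GF ID   1-cysPrx_C',
        '#=GF AC   PF10417.8',
        '#=GF CL   CL0172',
    ]

    pfamToClan = {}
    for line in sample:
        if '#=GF AC' in line:
            pfamAcc = line.split()[2].strip()
        elif '#=GF CL' in line:
            pfamToClan[pfamAcc] = line.split()[2].strip()

    # pfamToClan is now {'PF10417.8': 'CL0172'}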
Example #20
    def parseBinStats(self, resultsFolder, binStatsFile):
        """Read bin statistics from file."""
        binStatsFile = os.path.join(resultsFolder, 'storage', binStatsFile)

        checkFileExists(binStatsFile)

        binStats = {}
        with open(binStatsFile, 'r') as f:
            for line in f:
                lineSplit = line.split('\t')
                binStats[lineSplit[0]] = ast.literal_eval(lineSplit[1])

        return binStats
Example #21
    def pfamIdToClanId(self):
        """Determine clan of each pfam."""
        checkFileExists(self.pfamClanFile)

        d = {}
        for line in open(self.pfamClanFile):
            if '#=GF AC' in line:
                pfamAcc = line.split()[2].strip()
            elif '#=GF CL' in line:
                clanId = line.split()[2].strip()
                d[pfamAcc] = clanId

        return d
Example #22
    def genePositions(self, filename):
        checkFileExists(filename)

        gp = {}
        for line in open(filename):
            if line[0] == '>':
                lineSplit = line[1:].split()

                geneId = lineSplit[0]
                startPos = int(lineSplit[2])
                endPos = int(lineSplit[4])

                gp[geneId] = [startPos, endPos]

        return gp
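genePositions pulls the gene identifier and its start/end coordinates from fixed token positions in the FASTA headers, which matches Prodigal-style headers of the form ">geneId # start # end # strand # ...". A hypothetical header and what the parser extracts from it (the header contents are made up):

    # hypothetical Prodigal-style gene header
    line = '>contig_1_1 # 3 # 1385 # 1 # ID=1_1;partial=00\n'

    lineSplit = line[1:].split()
    # lineSplit[0] -> 'contig_1_1', lineSplit[2] -> '3', lineSplit[4] -> '1385'
    geneId = lineSplit[0]
    startPos = int(lineSplit[2])
    endPos = int(lineSplit[4])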
Example #23
    def genePositions(self, filename):
        checkFileExists(filename)

        gp = {}
        for line in open(filename):
            if line[0] == '>':
                lineSplit = line[1:].split()

                geneId = lineSplit[0]
                startPos = int(lineSplit[2])
                endPos = int(lineSplit[4])

                gp[geneId] = [startPos, endPos]

        return gp
Example #24
    def profile(self, options):
        """Profile command"""

        self.logger.info(
            '[CheckM - profile] Calculating percentage of reads mapped to each bin.'
        )

        checkFileExists(options.coverage_file)

        profile = Profile()
        profile.run(options.coverage_file, options.file, options.bTabTable)

        if options.file != '':
            self.logger.info('Profile information written to: ' + options.file)

        self.timeKeeper.printTimeStamp()
Example #25
    def tetraSignatures(self, options):
        """Tetranucleotide signature command"""

        self.logger.info(
            '[CheckM - tetra] Calculating tetranucleotide signature of sequences.'
        )

        checkFileExists(options.seq_file)
        makeSurePathExists(os.path.dirname(options.output_file))

        tetraSig = GenomicSignatures(4, options.threads)
        tetraSig.calculate(options.seq_file, options.output_file)

        self.logger.info('Tetranucleotide signatures written to: ' +
                         options.output_file)

        self.timeKeeper.printTimeStamp()
Example #26
    def ssuFinder(self, options):
        """SSU finder command"""

        self.logger.info(
            '[CheckM - ssu_finder] Identifying SSU (16S/18S) rRNAs in sequences.'
        )

        binFiles = self.binFiles(options.bin_dir, options.extension)

        checkFileExists(options.seq_file)
        makeSurePathExists(options.output_dir)

        ssuFinder = SSU_Finder(options.threads)
        ssuFinder.run(options.seq_file, binFiles, options.output_dir,
                      options.evalue, options.concatenate)

        self.timeKeeper.printTimeStamp()
Example #27
    def binUnion(self, options):
        """Bin union command"""

        self.logger.info(
            '[CheckM - bin_union] Redundancy reduce multiple sets of bins into a single set.'
        )

        output_dir = options.output_dir
        makeSurePathExists(output_dir)

        bin_dirs = []
        checkmQaTsvs = []
        for i, arg in enumerate(options.bin_or_checkm_qa_table):
            if i % 2 == 0:
                checkDirExists(arg)
                bin_dirs.append(arg)
            else:
                checkFileExists(arg)
                checkmQaTsvs.append(arg)

        if len(bin_dirs) < 2:
            self.logger.error(
                "Need to specify at least two bin folders, found %i: " %
                len(bin_dirs))
            sys.exit(1)
        if len(bin_dirs) != len(checkmQaTsvs):
            self.logger.error(
                "Need to specify the same number of bin folders as checkm_qa_tsv files, found %i and %i, respectively: "
                % (len(bin_dirs), len(checkmQaTsvs)))
            sys.exit(1)

        binFileSets = []
        for bin_dir in bin_dirs:
            self.logger.info(
                "Reading fasta files with extension %s from bin folder %s" %
                (options.extension, bin_dir))
            binFileSets.append(self.binFiles(bin_dir, options.extension))

        binUnion = BinUnion()

        contigConflictsOutputFile = os.path.join(output_dir,
                                                 'contigConflicts.csv')
        unionBinOutputFile = os.path.join(output_dir, 'union.txt')
        binUnion.report(bin_dirs, binFileSets, checkmQaTsvs,
                        unionBinOutputFile, contigConflictsOutputFile,
                        options.min_completeness, options.max_contamination)
Example #28
    def parallelCoordPlot(self, options):
        """Parallel coordinate plot command"""

        self.logger.info(
            '[CheckM - par_plot] Creating parallel coordinate plot of GC and coverage.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)
        checkFileExists(options.coverage_file)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # read coverage stats file
        coverage = Coverage(threads=1)
        coverageStats = coverage.parseCoverage(options.coverage_file)

        # calculate sequence stats for all bins
        self.logger.info('Calculating sequence statistics for each bin.')
        binStats = BinStatistics()
        seqStats = {}
        for f in binFiles:
            binId = binIdFromFilename(f)
            seqStats[binId] = binStats.sequenceStats(options.results_dir, f)

        # create plot for each bin

        plot = ParallelCoordPlot(options)
        filesProcessed = 1
        for f in binFiles:
            binId = binIdFromFilename(f)
            self.logger.info(
                'Plotting parallel coordinates for %s (%d of %d)' %
                (binId, filesProcessed, len(binFiles)))
            filesProcessed += 1

            plot.plot(binId, seqStats, coverageStats)

            outputFile = os.path.join(
                options.output_dir,
                binId) + '.paralel_coord_plot.' + options.image_type
            plot.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Example #29
    def outliers(self, options):
        """Outlier command"""

        self.logger.info('[CheckM - outlier] Identifying outliers in bins.')

        checkDirExists(options.bin_dir)
        checkFileExists(options.tetra_profile)
        makeSurePathExists(os.path.dirname(options.output_file))

        binFiles = self.binFiles(options.bin_dir, options.extension)

        binTools = BinTools()
        binTools.identifyOutliers(options.results_dir, binFiles,
                                  options.tetra_profile, options.distributions,
                                  options.report_type, options.output_file)

        self.logger.info('Outlier information written to: ' +
                         options.output_file)

        self.timeKeeper.printTimeStamp()
Example #30
    def __readClansAndNesting(self):
        checkFileExists(self.pfamClanFile)

        idNested = defaultdict(list)
        for line in open(self.pfamClanFile):
            if '#=GF ID' in line:
                ID = line.split()[2].strip()
            elif '#=GF AC' in line:
                pfamAcc = line.split()[2].strip()
                pfamAcc = pfamAcc[0:pfamAcc.rfind('.')]
                self.idToAcc[ID] = pfamAcc
            elif '#=GF CL' in line:
                clanId = line.split()[2].strip()
                self.clan[pfamAcc] = clanId
            elif '#=GF NE' in line:
                nestedId = line.split()[2].strip()
                idNested[nestedId].append(ID)
                idNested[ID].append(nestedId)

        # set nested structure to use pfam accessions instead of IDs
        for ID, nested in idNested.items():
            pfamAcc = self.idToAcc[ID]
            self.nested[pfamAcc] = set([self.idToAcc[x] for x in nested])
Example #31
    def __readClansAndNesting(self):
        checkFileExists(self.pfamClanFile)

        idNested = defaultdict(list)
        for line in open(self.pfamClanFile):
            if '#=GF ID' in line:
                ID = line.split()[2].strip()
            elif '#=GF AC' in line:
                pfamAcc = line.split()[2].strip()
                pfamAcc = pfamAcc[0:pfamAcc.rfind('.')]
                self.idToAcc[ID] = pfamAcc
            elif '#=GF CL' in line:
                clanId = line.split()[2].strip()
                self.clan[pfamAcc] = clanId
            elif '#=GF NE' in line:
                nestedId = line.split()[2].strip()
                idNested[nestedId].append(ID)
                idNested[ID].append(nestedId)

        # set nested structure to use pfam accessions instead of IDs
        for ID, nested in idNested.items():
            pfamAcc = self.idToAcc[ID]
            self.nested[pfamAcc] = set([self.idToAcc[x] for x in nested])
Example #32
    def run(self, parser, outputDir):
        """Run standard E. coli genome to verify operation of CheckM."""

        ecoliFile = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'test_data',
                                 '637000110.fna')
        checkFileExists(ecoliFile)

        options = Options()
        options.threads = 1
        options.extension = 'fna'
        options.bQuiet = True
        options.out_folder = os.path.join(outputDir, 'results')
        if os.path.exists(options.out_folder):
            shutil.rmtree(options.out_folder)
        makeSurePathExists(options.out_folder)

        print('[Step 1]: Verifying tree command.')
        options.bKeepAlignment = False
        options.bNucORFs = False
        options.bCalledGenes = False
        options.bReducedTree = True
        options.bin_folder = os.path.join(DefaultValues.CHECKM_DATA_DIR,
                                          'test_data')
        parser.tree(options)
        self.verifyTree(options.out_folder)
        print('\n  [Passed]')

        print('\n')
        print('[Step 2]: Verifying tree_qa command.')
        options.tree_folder = options.out_folder
        options.out_format = 1
        options.file = os.path.join(options.out_folder, 'tree_qa_test.tsv')
        options.bTabTable = True
        parser.treeQA(options)
        self.verifyTreeQA(options.file)
        print('\n  [Passed]')

        print('\n')
        print('[Step 3]: Verifying lineage_set command.')
        options.marker_file = os.path.join(options.out_folder,
                                           'lineage_set_test.tsv')
        options.bForceDomain = False
        options.bootstrap = 0
        options.num_genomes_markers = 30
        options.num_genomes_refine = 5
        options.bNoLineageSpecificRefinement = False

        options.bRequireTaxonomy = False
        options.unique = 10
        options.multi = 10
        parser.lineageSet(options)
        self.verifyLineageSet(options.marker_file, options.bRequireTaxonomy)

        options.bRequireTaxonomy = True
        parser.lineageSet(options)
        self.verifyLineageSet(options.marker_file, options.bRequireTaxonomy)
        print('\n  [Passed]')

        print('\n')
        print('[Step 4]: Verifying analyze command.')
        options.bAlignTopHit = False
        parser.analyze(options)
        self.verifyAnalyze(options.out_folder)
        print('\n  [Passed]')

        print('\n')
        print('[Step 5]: Verifying qa command.')
        options.alignment_file = None
        options.analyze_folder = options.out_folder
        options.out_format = 1
        options.exclude_markers = None
        options.bSkipPseudoGeneCorrection = False
        options.bSkipAdjCorrection = False
        options.file = os.path.join(options.out_folder, 'qa_test.tsv')
        options.bIndividualMarkers = False
        options.bIgnoreThresholds = False
        options.aai_strain = 0.9
        options.e_value = 1e-10
        options.length = 0.7
        options.coverage_file = None
        options.bTabTable = True
        parser.qa(options)
        self.verifyQA(options.file)
        print('\n  [Passed]')
Example #33
    def run(self, coverageFile, outFile, bTabTable):
        checkFileExists(coverageFile)

        # get number of reads mapped to each bin
        self.logger.info('Determining number of reads mapped to each bin.')

        readsMappedToBin = {}
        binSize = {}
        totalMappedReads = {}
        bHeader = True
        for line in open(coverageFile):
            if bHeader:
                bHeader = False
                continue

            lineSplit = line.split('\t')

            # seqId = lineSplit[0]
            binId = lineSplit[1]

            seqLen = int(lineSplit[2])
            binSize[binId] = binSize.get(binId, 0) + seqLen

            if binId not in readsMappedToBin:
                readsMappedToBin[binId] = {}

            for i in range(3, len(lineSplit), 3):
                bamId = lineSplit[i]
                mappedReads = int(lineSplit[i + 2])

                totalMappedReads[bamId] = totalMappedReads.get(bamId,
                                                               0) + mappedReads
                readsMappedToBin[binId][bamId] = readsMappedToBin[binId].get(
                    bamId, 0) + mappedReads

        # calculate percentage of mapped reads to binned populations
        perMappedReads = {}
        normBinCoverage = {}
        sumNormBinCoverage = {}
        for binId, bamIds in readsMappedToBin.items():
            perMappedReads[binId] = {}
            normBinCoverage[binId] = {}

            for bamId in bamIds:
                perMR = float(
                    readsMappedToBin[binId][bamId]) / totalMappedReads[bamId]
                perMappedReads[binId][bamId] = perMR

                if binId == DefaultValues.UNBINNED:
                    continue

                normCoverage = perMR / binSize[binId]
                normBinCoverage[binId][bamId] = normCoverage
                sumNormBinCoverage[bamId] = sumNormBinCoverage.get(
                    bamId, 0) + normCoverage

        for binId, bamIds in normBinCoverage.items():
            for bamId in bamIds:
                if sumNormBinCoverage[bamId] != 0:
                    normBinCoverage[binId][bamId] /= sumNormBinCoverage[bamId]
                else:
                    normBinCoverage[binId][bamId] = 0

        # write community profile
        oldStdOut = reassignStdOut(outFile)

        sortedBinIds = sorted(readsMappedToBin.keys())
        sortedBamIds = sorted(readsMappedToBin[sortedBinIds[0]].keys())

        header = ['Bin Id', 'Bin size (Mbp)']
        for bamId in sortedBamIds:
            header += [bamId + ': mapped reads']
            header += [bamId + ': % mapped reads']
            header += [bamId + ': % binned populations']
            header += [bamId + ': % community']

        if bTabTable:
            print('\t'.join(header))
        else:
            pTable = prettytable.PrettyTable(header)
            pTable.float_format = '.2'
            pTable.align = 'c'
            pTable.align[header[0]] = 'l'
            pTable.hrules = prettytable.FRAME
            pTable.vrules = prettytable.NONE

        for binId in sortedBinIds:
            row = [binId]
            row += [float(binSize[binId]) / 1e6]

            for bamId in sortedBamIds:
                row += [readsMappedToBin[binId][bamId]]
                row += [perMappedReads[binId][bamId] * 100.0]

                if DefaultValues.UNBINNED in perMappedReads:
                    unbinnedPercentage = perMappedReads[
                        DefaultValues.UNBINNED][bamId]
                else:
                    unbinnedPercentage = 0

                if binId == DefaultValues.UNBINNED:
                    row += ['NA']
                    row += [unbinnedPercentage * 100.0]
                else:
                    row += [normBinCoverage[binId][bamId] * 100.0]
                    row += [
                        normBinCoverage[binId][bamId] * 100.0 *
                        (1.0 - unbinnedPercentage)
                    ]

            if bTabTable:
                print('\t'.join(list(map(str, row))))
            else:
                pTable.add_row(row)

        if not bTabTable:
            print(pTable.get_string())

        restoreStdOut(outFile, oldStdOut)
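The parser above implies a coverage file layout of "sequence id, bin id, sequence length" followed by repeating "(BAM id, coverage, mapped reads)" triples per row; this layout is inferred from the indexing in the loop rather than stated anywhere in this listing. A hypothetical row read with the same indexing:

    # hypothetical tab-separated coverage row (header line already skipped)
    line = 'contig_1\tbin1\t15000\treads1.bam\t12.5\t1875\treads2.bam\t3.1\t465\n'

    lineSplit = line.split('\t')
    binId = lineSplit[1]
    seqLen = int(lineSplit[2])
    for i in range(3, len(lineSplit), 3):
        bamId = lineSplit[i]                  # e.g. 'reads1.bam'
        coverage = float(lineSplit[i + 1])    # middle column, unused by Profile.run above
        mappedReads = int(lineSplit[i + 2])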
Example #34
    def run(self, parser, outputDir):
        """Run standard E. coli genome to verify operation of CheckM."""

        ecoliFile = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'test_data', '637000110.fna')
        checkFileExists(ecoliFile)

        options = Options()
        options.threads = 1
        options.pplacer_threads = 1
        options.extension = 'fna'
        options.bQuiet = True
        options.out_folder = os.path.join(outputDir, 'results')
        if os.path.exists(options.out_folder):
            shutil.rmtree(options.out_folder)
        makeSurePathExists(options.out_folder)

        print('[Step 1]: Verifying tree command.')
        options.bKeepAlignment = False
        options.bNucORFs = False
        options.bCalledGenes = False
        options.bReducedTree = True
        options.bin_folder = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'test_data')
        parser.tree(options)
        self.verifyTree(options.out_folder)
        print('\n  [Passed]')

        print('\n')
        print('[Step 2]: Verifying tree_qa command.')
        options.tree_folder = options.out_folder
        options.out_format = 1
        options.file = os.path.join(options.out_folder, 'tree_qa_test.tsv')
        options.bTabTable = True
        parser.treeQA(options)
        self.verifyTreeQA(options.file)
        print('\n  [Passed]')

        print('\n')
        print('[Step 3]: Verifying lineage_set command.')
        options.marker_file = os.path.join(options.out_folder, 'lineage_set_test.tsv')
        options.bForceDomain = False
        options.bootstrap = 0
        options.num_genomes_markers = 30
        options.num_genomes_refine = 5
        options.bNoLineageSpecificRefinement = False

        options.bRequireTaxonomy = False
        options.unique = 10
        options.multi = 10
        parser.lineageSet(options)
        self.verifyLineageSet(options.marker_file, options.bRequireTaxonomy)

        options.bRequireTaxonomy = True
        parser.lineageSet(options)
        self.verifyLineageSet(options.marker_file, options.bRequireTaxonomy)
        print('\n  [Passed]')

        print('\n')
        print('[Step 4]: Verifying analyze command.')
        options.bAlignTopHit = False
        parser.analyze(options)
        self.verifyAnalyze(options.out_folder)
        print('\n  [Passed]')

        print('\n')
        print('[Step 5]: Verifying qa command.')
        options.alignment_file = None
        options.analyze_folder = options.out_folder
        options.out_format = 1
        options.exclude_markers = None
        options.bSkipPseudoGeneCorrection = False
        options.bSkipAdjCorrection = False
        options.file = os.path.join(options.out_folder, 'qa_test.tsv')
        options.bIndividualMarkers = False
        options.bIgnoreThresholds = False
        options.aai_strain = 0.9
        options.e_value = 1e-10
        options.length = 0.7
        options.coverage_file = None
        options.bTabTable = True
        parser.qa(options)
        self.verifyQA(options.file)
        print('\n  [Passed]')