Example #1
    def sequenceStats(self, outDir, binFile):
        """Calculate statistics for all sequences within a bin."""

        # read scaffolds
        seqs = readFasta(binFile)

        seqStats = {}
        for seqId in seqs:
            seqStats[seqId] = {}

        self.calculateGC(seqs, seqStats)
        self.calculateSeqStats(seqs, seqStats)

        binId = binIdFromFilename(binFile)
        aaFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
        if os.path.exists(aaFile):
            aaGenes = readFasta(aaFile)
            for geneId, gene in aaGenes.items():
                seqId = geneId[0:geneId.rfind('_')]
                seqStats[seqId]['# ORFs'] = seqStats[seqId].get('# ORFs',
                                                                0) + 1
                seqStats[seqId]['Coding bases'] = seqStats[seqId].get(
                    'Coding bases', 0) + len(gene) * 3
        else:
            # missing amino acid file likely indicates users used a pre-called gene file, so
            # just set some defaults
            for seqId in seqStats:
                seqStats[seqId]['# ORFs'] = 0
                seqStats[seqId]['Coding bases'] = 0

        return seqStats
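Every example on this page assumes that readFasta(path) returns a plain dict mapping each sequence ID (the first token of the FASTA header) to its sequence string. CheckM ships its own implementation; the following is only a minimal stand-in, under that assumption, for readers who want to run the snippets in isolation:

def readFasta(fastaFile):
    """Read a FASTA file into a dict: {seqId: sequence}."""
    seqs = {}
    seqId = None
    for line in open(fastaFile):
        line = line.rstrip()
        if not line:
            continue
        if line.startswith('>'):
            seqId = line[1:].split(None, 1)[0]  # sequence ID is the first token of the header
            seqs[seqId] = []
        else:
            seqs[seqId].append(line)
    return {sid: ''.join(parts) for sid, parts in seqs.items()}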
Example #2
    def sequenceStats(self, outDir, binFile):
        """Calculate statistics for all sequences within a bin."""

        # read scaffolds
        seqs = readFasta(binFile)

        seqStats = {}
        for seqId in seqs:
            seqStats[seqId] = {}

        self.calculateGC(seqs, seqStats)
        self.calculateSeqStats(seqs, seqStats)

        binId = binIdFromFilename(binFile)
        aaFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
        if os.path.exists(aaFile):
            aaGenes = readFasta(aaFile)
            for geneId, gene in aaGenes.iteritems():
                seqId = geneId[0:geneId.rfind('_')]
                seqStats[seqId]['# ORFs'] = seqStats[seqId].get('# ORFs', 0) + 1
                seqStats[seqId]['Coding bases'] = seqStats[seqId].get('Coding bases', 0) + len(gene) * 3
        else:
            # missing amino acid file likely indicates users used a pre-called gene file, so
            # just set some defaults
            for seqId in seqStats:
                seqStats[seqId]['# ORFs'] = 0
                seqStats[seqId]['Coding bases'] = 0

        return seqStats
Example #3
    def run(self, binFiles, seqFile, outSeqFile, outStatsFile, minSeqLen):
        checkFileExists(seqFile)

        # get list of sequences in bins
        self.logger.info('  Reading binned sequences.')

        binnedSeqs = {}
        totalBinnedBases = 0
        for binFile in binFiles:
            seqs = readFasta(binFile)
            binnedSeqs.update(seqs)
            for seq in seqs.values():
                totalBinnedBases += len(seq)

        self.logger.info('    Read %d (%.2f Mbp) binned sequences.' % (len(binnedSeqs), float(totalBinnedBases) / 1e6))

        # get list of all sequences
        self.logger.info('  Reading all sequences.')
        allSeqs = readFasta(seqFile)
        totalBases = 0
        for seq in allSeqs.values():
            totalBases += len(seq)
        self.logger.info('    Read %d (%.2f Mbp) sequences.' % (len(allSeqs), float(totalBases) / 1e6))

        # write all unbinned sequences
        self.logger.info('  Identifying unbinned sequences >= %d bp.' % minSeqLen)
        seqOut = open(outSeqFile, 'w')

        statsOut = open(outStatsFile, 'w')
        statsOut.write('Sequence Id\tLength\tGC\n')

        unbinnedCount = 0
        unbinnedBases = 0
        for seqId, seq in allSeqs.iteritems():
            if seqId not in binnedSeqs:
                if len(seq) >= minSeqLen:
                    unbinnedCount += 1
                    seqOut.write('>' + seqId + '\n')
                    seqOut.write(seq + '\n')

                    unbinnedBases += len(seq)

                    a, c, g, t = baseCount(seq)

                    statsOut.write('%s\t%d\t%.2f\n' % (seqId, len(seq), float(g + c) * 100 / (a + c + g + t)))

        seqOut.close()
        statsOut.close()

        self.logger.info('    Identified %d (%.2f Mbp) unbinned sequences.' % (unbinnedCount, float(unbinnedBases) / 1e6))

        self.logger.info('')
        self.logger.info('  Percentage of unbinned sequences: %.2f%%' % (unbinnedCount * 100.0 / len(allSeqs)))
        self.logger.info('  Percentage of unbinned bases: %.2f%%' % (unbinnedBases * 100.0 / totalBases))
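The GC value written to the stats file above comes from a baseCount helper that is not shown. A straightforward version consistent with how it is used (counting only the four unambiguous bases, case-insensitively) would be:

def baseCount(seq):
    """Return the counts of A, C, G and T in a sequence (case-insensitive)."""
    s = seq.upper()
    return s.count('A'), s.count('C'), s.count('G'), s.count('T')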
Example #4
    def run(self, binFiles, seqFile, outSeqFile, outStatsFile, minSeqLen):
        checkFileExists(seqFile)

        # get list of sequences in bins
        self.logger.info('Reading binned sequences.')

        binnedSeqs = {}
        totalBinnedBases = 0
        for binFile in binFiles:
            seqs = readFasta(binFile)
            binnedSeqs.update(seqs)
            for seq in seqs.values():
                totalBinnedBases += len(seq)

        self.logger.info('  Read %d (%.2f Mbp) binned sequences.' % (len(binnedSeqs), float(totalBinnedBases) / 1e6))

        # get list of all sequences
        self.logger.info('Reading all sequences.')
        allSeqs = readFasta(seqFile)
        totalBases = 0
        for seq in allSeqs.values():
            totalBases += len(seq)
        self.logger.info('  Read %d (%.2f Mbp) sequences.' % (len(allSeqs), float(totalBases) / 1e6))

        # write all unbinned sequences
        self.logger.info('Identifying unbinned sequences >= %d bp.' % minSeqLen)
        seqOut = open(outSeqFile, 'w')

        statsOut = open(outStatsFile, 'w')
        statsOut.write('Sequence Id\tLength\tGC\n')

        unbinnedCount = 0
        unbinnedBases = 0
        for seqId, seq in allSeqs.iteritems():
            if seqId not in binnedSeqs:
                if len(seq) >= minSeqLen:
                    unbinnedCount += 1
                    seqOut.write('>' + seqId + '\n')
                    seqOut.write(seq + '\n')

                    unbinnedBases += len(seq)

                    a, c, g, t = baseCount(seq)

                    statsOut.write('%s\t%d\t%.2f\n' % (seqId, len(seq), float(g + c) * 100 / (a + c + g + t)))

        seqOut.close()
        statsOut.close()

        self.logger.info('  Identified %d (%.2f Mbp) unbinned sequences.' % (unbinnedCount, float(unbinnedBases) / 1e6))

        self.logger.info('Percentage of unbinned sequences: %.2f%%' % (unbinnedCount * 100.0 / len(allSeqs)))
        self.logger.info('Percentage of unbinned bases: %.2f%%' % (unbinnedBases * 100.0 / totalBases))
Example #5
    def modify(self, binFile, seqFile, seqsToAdd, seqsToRemove, outputFile):
        """Add and remove sequences from a file."""
        binSeqs = readFasta(binFile)

        # add sequences to bin
        if seqsToAdd != None:
            refSeqs = readFasta(seqFile)
            self.__addSeqs(binSeqs, refSeqs, seqsToAdd)

        # remove sequences from bin
        if seqsToRemove != None:
            self.__removeSeqs(binSeqs, seqsToRemove)

        # save modified bin
        writeFasta(binSeqs, outputFile)
Example #6
    def modify(self, binFile, seqFile, seqsToAdd, seqsToRemove, outputFile):
        """Add and remove sequences from a file."""
        binSeqs = readFasta(binFile)

        # add sequences to bin
        if seqsToAdd != None:
            refSeqs = readFasta(seqFile)
            self.__addSeqs(binSeqs, refSeqs, seqsToAdd)

        # remove sequences from bin
        if seqsToRemove != None:
            self.__removeSeqs(binSeqs, seqsToRemove)

        # save modified bin
        writeFasta(binSeqs, outputFile)
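The modified bin is saved with writeFasta. Assuming the same {seqId: sequence} dict convention used by readFasta, a minimal sketch is:

def writeFasta(seqs, outputFile):
    """Write a {seqId: sequence} dict to a FASTA file, wrapping sequences at 80 columns."""
    with open(outputFile, 'w') as fout:
        for seqId, seq in seqs.items():
            fout.write('>' + seqId + '\n')
            for i in range(0, len(seq), 80):
                fout.write(seq[i:i + 80] + '\n')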
Example #7
 def __init__(self, binningIndex, completeness, contamination, binFile):
     self.binningIndex = binningIndex
     self.completeness = completeness
     self.contamination = contamination
     self.binId = binIdFromFilename(binFile)
     self.seqs = readFasta(binFile)
     self.binFile = binFile
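binIdFromFilename, used here and in several other examples, maps a bin's FASTA file name to a bin ID. A plausible stand-in, assuming the bin ID is simply the base file name without its extension, is:

import os

def binIdFromFilename(binFile):
    """Derive a bin ID from a file name, e.g. '/path/bin_042.fna' -> 'bin_042'."""
    return os.path.splitext(os.path.basename(binFile))[0]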
Example #8
    def __runHmmAlign(self, allTrustedGenomeIds, genesInGenomes, outputGeneDir,
                      outputModelDir, queueIn, queueOut):
        """Run each marker gene in a separate thread."""

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId == None:
                break

            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            fout = open(markerSeqFile, 'w')
            for genomeId in allTrustedGenomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' +
                                 genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')
            fout.close()

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'),
                        markerSeqFile,
                        os.path.join(outputGeneDir, modelName + '.aln.faa'),
                        trim=False,
                        outputFormat='Pfam')
            self.__maskAlignment(
                os.path.join(outputGeneDir, modelName + '.aln.faa'),
                os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))

            queueOut.put(modelName)
Example #9
    def run(self, aaiStrainThreshold, outDir, alignmentOutputFile):
        """Calculate AAI between input alignments."""

        self.logger.info('Calculating AAI between multi-copy marker genes.')

        if alignmentOutputFile:
            fout = open(alignmentOutputFile, 'w')

        # calculate AAI for duplicate marker genes
        binIds = getBinIdsFromOutDir(outDir)
        aaiOutputDir = os.path.join(outDir, 'storage', 'aai_qa')
        for binId in binIds:
            binPath = os.path.join(aaiOutputDir, binId)
            if not os.path.exists(binPath):
                continue

            for f in os.listdir(binPath):
                if not f.endswith('.masked.faa'):
                    continue

                markerId = f[0:f.find('.')]

                seqs = readFasta(os.path.join(binPath, f))

                # calculate AAI between all pairs of seqs
                for i in range(0, len(seqs)):
                    seqIdI = list(seqs.keys())[i]
                    binIdI = seqIdI[0:seqIdI.find(DefaultValues.SEQ_CONCAT_CHAR)]

                    seqI = seqs[seqIdI]

                    for j in range(i + 1, len(seqs)):
                        seqIdJ = list(seqs.keys())[j]
                        binIdJ = seqIdJ[0:seqIdJ.find(DefaultValues.SEQ_CONCAT_CHAR)]

                        seqJ = seqs[seqIdJ]

                        if binIdI == binIdJ:
                            aai = self.aai(seqI, seqJ)

                            if alignmentOutputFile:
                                fout.write(binId + ',' + markerId + '\n')
                                fout.write(seqIdI + '\t' + seqI + '\n')
                                fout.write(seqIdJ + '\t' + seqJ + '\n')
                                fout.write('AAI: %.3f\n' % aai)
                                fout.write('\n')

                            if binIdI not in self.aaiRawScores:
                                self.aaiRawScores[binIdI] = defaultdict(list)
                            self.aaiRawScores[binIdI][markerId].append(aai)
                        else:
                            # something is wrong as the bin Ids should always be the same
                            self.logger.error('  [Error] Bin ids do not match.')
                            sys.exit(1)

        if alignmentOutputFile:
            fout.close()

        # calculate strain heterogeneity for each marker gene in each bin
        self.aaiHetero, self.aaiMeanBinHetero = self.strainHetero(self.aaiRawScores, aaiStrainThreshold)
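The self.aai() call above compares two masked, equal-length marker gene alignments. CheckM's exact definition is not visible in this snippet; a hedged sketch that scores the fraction of identical residues over all columns where at least one sequence has a residue is:

def aai(seqI, seqJ):
    """Amino acid identity between two aligned sequences of equal length,
    ignoring columns where both sequences have a gap."""
    matches = 0
    alignedCols = 0
    for a, b in zip(seqI, seqJ):
        if a == '-' and b == '-':
            continue
        alignedCols += 1
        if a == b:
            matches += 1
    return float(matches) / alignedCols if alignedCols else 0.0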
Example #10
    def removeOutliers(self, binFile, outlierFile, outputFile):
        """Remove sequences specified as outliers in the provided file."""

        binSeqs = readFasta(binFile)
        binIdToModify = binIdFromFilename(binFile)

        # get files to remove
        checkFileExists(outlierFile)
        seqsToRemove = []
        bHeader = True
        for line in open(outlierFile):
            if bHeader:
                bHeader = False
                continue

            lineSplit = line.split('\t')
            binId = lineSplit[0]

            if binId == binIdToModify:
                seqId = lineSplit[1]
                seqsToRemove.append(seqId)

        # remove sequences from bin
        if len(seqsToRemove) > 0:
            self.__removeSeqs(binSeqs, seqsToRemove)

        # save modified bin
        writeFasta(binSeqs, outputFile)
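The private __removeSeqs helper is not shown. A minimal stand-in that simply drops the listed sequence IDs from the bin dict (the real helper may report or reject IDs that are missing from the bin) is:

def removeSeqs(binSeqs, seqsToRemove):
    """Remove the given sequence IDs from a {seqId: sequence} dict in place."""
    for seqId in seqsToRemove:
        binSeqs.pop(seqId, None)  # silently ignore IDs not present in the bin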
Example #11
    def removeOutliers(self, binFile, outlierFile, outputFile):
        """Remove sequences specified as outliers in the provided file."""

        binSeqs = readFasta(binFile)
        binIdToModify = binIdFromFilename(binFile)

        # get files to remove
        checkFileExists(outlierFile)
        seqsToRemove = []
        bHeader = True
        for line in open(outlierFile):
            if bHeader:
                bHeader = False
                continue

            lineSplit = line.split('\t')
            binId = lineSplit[0]

            if binId == binIdToModify:
                seqId = lineSplit[1]
                seqsToRemove.append(seqId)

        # remove sequences from bin
        if len(seqsToRemove) > 0:
            self.__removeSeqs(binSeqs, seqsToRemove)

        # save modified bin
        writeFasta(binSeqs, outputFile)
Example #12
    def run(self, aaiStrainThreshold, outDir, alignmentOutputFile):
        """Calculate AAI between input alignments."""

        self.logger.info('  Calculating AAI between multi-copy marker genes.')

        if alignmentOutputFile:
            fout = open(alignmentOutputFile, 'w')

        # calculate AAI for duplicate marker genes
        binIds = getBinIdsFromOutDir(outDir)
        aaiOutputDir = os.path.join(outDir, 'storage', 'aai_qa')
        for binId in binIds:
            binPath = os.path.join(aaiOutputDir, binId)
            if not os.path.exists(binPath):
                continue

            for f in os.listdir(binPath):
                if not f.endswith('.masked.faa'):
                    continue

                markerId = f[0:f.find('.')]

                seqs = readFasta(os.path.join(binPath, f))

                # calculate AAI between all pairs of seqs
                for i in xrange(0, len(seqs)):
                    seqIdI = seqs.keys()[i]
                    binIdI = seqIdI[0:seqIdI.find(DefaultValues.SEQ_CONCAT_CHAR)]

                    seqI = seqs[seqIdI]

                    for j in xrange(i+1, len(seqs)):
                        seqIdJ = seqs.keys()[j]
                        binIdJ = seqIdJ[0:seqIdJ.find(DefaultValues.SEQ_CONCAT_CHAR)]

                        seqJ = seqs[seqIdJ]

                        if binIdI == binIdJ:
                            aai = self.aai(seqI, seqJ)

                            if alignmentOutputFile:
                                fout.write(binId + ',' + markerId + '\n')
                                fout.write(seqIdI + '\t' + seqI + '\n')
                                fout.write(seqIdJ + '\t' + seqJ + '\n')
                                fout.write('AAI: %.3f\n' % aai)
                                fout.write('\n')

                            if binIdI not in self.aaiRawScores:
                                self.aaiRawScores[binIdI] = defaultdict(list)
                            self.aaiRawScores[binIdI][markerId].append(aai)
                        else:
                            # something is wrong as the bin Ids should always be the same
                            self.logger.error('  [Error] Bin ids do not match.')
                            sys.exit()

        if alignmentOutputFile:
            fout.close()

        # calculate strain heterogeneity for each marker gene in each bin
        self.aaiHetero, self.aaiMeanBinHetero = self.strainHetero(self.aaiRawScores, aaiStrainThreshold)
Example #13
    def reportFullMSA(self, outDir, outFile):
        """Create MSA with all reference and bin alignments."""

        # write bin alignments to file
        oldStdOut = reassignStdOut(outFile)
        for line in open(
                os.path.join(outDir, 'storage', 'tree',
                             DefaultValues.PPLACER_CONCAT_SEQ_OUT)):
            print((line.rstrip()))

        # read duplicate seqs
        duplicateNodes = self.__readDuplicateSeqs()

        # write reference alignments to file
        seqs = readFasta(
            os.path.join(DefaultValues.PPLACER_REF_PACKAGE_FULL,
                         DefaultValues.GENOME_TREE_FASTA))
        for seqId, seq in seqs.items():
            print(('>' + seqId))
            print(seq)

            if seqId in duplicateNodes:
                for dupSeqId in duplicateNodes[seqId]:
                    print(('>' + dupSeqId))
                    print(seq)

        restoreStdOut(outFile, oldStdOut)
Example #14
    def __extractMarkerSeqsTopHits(self, outDir, resultsParser):
        """Extract marker sequences from top hits (assume all bins use the same HMM file)."""

        markerSeqs = defaultdict(dict)
        markerStats = defaultdict(dict)
        for binId in resultsParser.results:
            # read ORFs for bin
            aaGeneFile = os.path.join(outDir, 'bins', binId,
                                      DefaultValues.PRODIGAL_AA)
            binORFs = readFasta(aaGeneFile)

            # extract ORFs hitting a marker
            for markerId, hits in resultsParser.results[
                    binId].markerHits.items():
                markerSeqs[markerId][binId] = {}
                markerStats[markerId][binId] = {}

                # sort hits from highest to lowest e-value in order to ensure only the best hit
                # to a given target is retained
                hits.sort(key=lambda x: x.full_e_value, reverse=True)
                topHit = hits[0]
                markerSeqs[markerId][binId][
                    topHit.target_name] = self.__extractSeq(
                        topHit.target_name, binORFs)
                markerStats[markerId][binId][topHit.target_name] = [
                    topHit.full_e_value, topHit.full_score
                ]

        return markerSeqs, markerStats
Example #15

    def __runHmmAlign(self, allTrustedGenomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
        """Run each marker gene in a separate thread."""
        
        while True:
            markerId = queueIn.get(block=True, timeout=None) 
            if markerId == None:
                break 
            
            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            fout = open(markerSeqFile, 'w')
            for genomeId in allTrustedGenomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')
            fout.close()
            
            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, os.path.join(outputGeneDir, modelName + '.aln.faa'), trim=False, outputFormat='Pfam')
            self.__maskAlignment(os.path.join(outputGeneDir, modelName + '.aln.faa'), os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))
            
            queueOut.put(modelName)
Example #16
    def __extractMarkerSeqsUnique(self, outDir, resultsParser):
        """Extract marker sequences with a single unique hit."""

        markerSeqs = defaultdict(dict)
        markerStats = defaultdict(dict)
        for binId in resultsParser.results:
            # read ORFs for bin
            aaGeneFile = os.path.join(outDir, 'bins', binId,
                                      DefaultValues.PRODIGAL_AA)
            binORFs = readFasta(aaGeneFile)

            # extract ORFs hitting a marker
            for markerId, hits in resultsParser.results[
                    binId].markerHits.items():
                markerSeqs[markerId][binId] = {}
                markerStats[markerId][binId] = {}

                # only record hits which are unique
                if len(hits) == 1:
                    hit = hits[0]
                    markerSeqs[markerId][binId][
                        hit.target_name] = self.__extractSeq(
                            hit.target_name, binORFs)
                    markerStats[markerId][binId][hit.target_name] = [
                        hit.full_e_value, hit.full_score
                    ]

        return markerSeqs, markerStats
Example #17
 def __init__(self, binningIndex, completeness, contamination, binFile):
     self.binningIndex = binningIndex
     self.completeness = completeness
     self.contamination = contamination
     self.binId = binIdFromFilename(binFile)
     self.seqs = readFasta(binFile)
     self.binFile = binFile
Example #18
    def __extractMarkersWithMultipleHits(self, outDir, binId, resultsParser,
                                         binMarkerSet):
        """Extract markers with multiple hits within a single bin."""

        markersWithMultipleHits = defaultdict(dict)

        aaGeneFile = os.path.join(outDir, 'bins', binId,
                                  DefaultValues.PRODIGAL_AA)
        binORFs = readFasta(aaGeneFile)

        markerGenes = binMarkerSet.selectedMarkerSet().getMarkerGenes()
        for markerId, hits in resultsParser.results[binId].markerHits.items():
            if markerId not in markerGenes or len(hits) < 2:
                continue

            # sort hits from highest to lowest e-value in order to ensure only the best hit
            # to a given target is retained
            hits.sort(key=lambda x: x.full_e_value, reverse=True)

            # Note: this data structure is used to mimic that used by __extractMarkerSeqsTopHits()
            markersWithMultipleHits[markerId][binId] = {}
            for hit in hits:
                markersWithMultipleHits[markerId][binId][
                    hit.target_name] = self.__extractSeq(
                        hit.target_name, binORFs)

        return markersWithMultipleHits
Example #19
    def run(self, outputDir):
        # make sure output directory exists
        if not os.path.exists(outputDir):
            os.mkdir(outputDir)

        # remove similar taxa
        print 'Filtering out highly similar taxa in order to reduce size of tree:'
        seqs = readFasta(self.derepConcatenatedAlignFile)

        nearlyIdentical = self.__nearlyIdenticalGenomes(seqs, outputDir)

        reducedSeqs = {}
        for s in nearlyIdentical:
            rndGenome = random.choice(tuple(s))
            reducedSeqs[rndGenome] = seqs[rndGenome]

        # write out reduced alignment
        reducedAlignmentFile = os.path.join(outputDir, "genome_tree.fasta")
        writeFasta(reducedSeqs, reducedAlignmentFile)

        # prune tree to retained taxa
        print ''
        print 'Pruning tree:'
        tree = dendropy.Tree.get_from_path(self.tree, schema='newick', as_rooted=False, preserve_underscores=True)

        for seqId in reducedSeqs:
            node = tree.find_node_with_taxon_label(seqId)
            if not node:
                print 'Missing taxa: %s' % seqId

        tree.retain_taxa_with_labels(reducedSeqs.keys())

        outputTree = os.path.join(outputDir, 'genome_tree.tre')
        tree.write_to_path(outputTree, schema='newick', suppress_rooting=True, unquoted_underscores=True)

        for t in tree.internal_nodes():
            t.label = None

        for t in tree.leaf_nodes():
            if t.taxon.label not in reducedSeqs:
                print 'missing in sequence file: %s' % t.taxon.label

        outputTreeWithoutLabels = os.path.join(outputDir, 'genome_tree.small.no_internal_labels.tre')
        tree.write_to_path(outputTreeWithoutLabels, schema='newick', suppress_rooting=True, unquoted_underscores=True)
        print '  Pruned tree written to: %s' % outputTree

        # calculate model parameters for pruned tree
        print ''
        print 'Determining model parameters for new tree.'
        outputTreeLog = os.path.join(outputDir, 'genome_tree.log')
        fastTreeOutput = os.path.join(outputDir, 'genome_tree.no_internal_labels.fasttree.tre')
        # os.system('FastTreeMP -nome -mllen -intree %s -log %s < %s > %s' % (outputTreeWithoutLabels, outputTreeLog, reducedAlignmentFile, fastTreeOutput))

        # calculate reference package for pruned tree
        print ''
        print 'Creating reference package.'
        os.system('taxit create -l %s -P %s --aln-fasta %s --tree-stats %s --tree-file %s' % ('genome_tree_reduced', os.path.join(outputDir, 'genome_tree_reduced.refpkg'), reducedAlignmentFile, outputTreeLog, outputTree))
Example #20
    def __genomeSeqLens(self, genomeId):
        """Determine length of contigs/scaffolds comprising genome."""
        genomeFile = os.path.join(self.genomeDir, genomeId, genomeId + '.fna')
        seqs = readFasta(genomeFile)

        seqLens = {}
        for seqId, seq in seqs.iteritems():
            seqLens[seqId] = len(seq)

        return seqLens
Example #21
    def __genomeSeqLens(self, genomeId):
        """Determine length of contigs/scaffolds comprising genome."""
        genomeFile = os.path.join(self.genomeDir, genomeId, genomeId + '.fna')
        seqs = readFasta(genomeFile)

        seqLens = {}
        for seqId, seq in seqs.items():
            seqLens[seqId] = len(seq)

        return seqLens
Example #22
    def plot(self, fastaFile):
        # Set size of figure
        self.fig.clear()
        self.fig.set_size_inches(self.options.width, self.options.height)
        axes = self.fig.add_subplot(111)

        # calculate sequence lengths (in kb)
        seqs = readFasta(fastaFile)

        seqLens = []
        for seq in seqs.values():
            seqLens.append(float(len(seq)) / 1e3)

        # set unequal bin sizes (in kb)
        bins = [0, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1e12]
        counts, _edges = np.histogram(seqLens, bins=bins)

        # create histogram
        axes.bar(x=np.arange(0.1, len(counts)), height=counts, width=0.8, color=(0.5, 0.5, 0.5))
        axes.set_xlabel('Sequence length (kbp)')
        axes.set_ylabel('Number sequences (out of %d)' % len(seqs))

        # ensure y-axis include zero
        _, end = axes.get_ylim()
        axes.set_ylim([0, end])
        axes.get_yaxis().set_major_locator(MaxNLocator(integer=True))

        # Change sequence lengths from bp to kbp
        axes.set_xlim([0, len(counts)])
        axes.set_xticks(np.arange(0.5, len(counts)))
        axes.set_xticklabels(['<1', '1-2', '2-5', '5-10', '10-20', '20-50', '50-100', '100-200', '200-500', '>500'])

        # Prettify plot
        for a in axes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        self.fig.tight_layout(pad=1)
        self.draw()
Example #23
    def plot(self, fastaFile):
        # Set size of figure
        self.fig.clear()
        self.fig.set_size_inches(self.options.width, self.options.height)
        axes = self.fig.add_subplot(111)

        # calculate sequence lengths (in kb)
        seqs = readFasta(fastaFile)

        seqLens = []
        for seq in seqs.values():
            seqLens.append(float(len(seq)) / 1e3)

        # set unequal bin sizes (in kb)
        bins = [0, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1e12]
        counts, _edges = np.histogram(seqLens, bins=bins)

        # create histogram
        axes.bar(left=np.arange(0.1, len(counts)), height=counts, width=0.8, color=(0.5, 0.5, 0.5))
        axes.set_xlabel('Sequence length (kbp)')
        axes.set_ylabel('Number sequences (out of %d)' % len(seqs))

        # ensure y-axis include zero
        _, end = axes.get_ylim()
        axes.set_ylim([0, end])
        axes.get_yaxis().set_major_locator(MaxNLocator(integer=True))

        # Change sequence lengths from bp to kbp
        axes.set_xlim([0, len(counts)])
        axes.set_xticks(np.arange(0.5, len(counts)))
        axes.set_xticklabels(['<1', '1-2', '2-5', '5-10', '10-20', '20-50', '50-100', '100-200', '200-500', '>500'])

        # Prettify plot
        for a in axes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axes.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        self.fig.tight_layout(pad=1)
        self.draw()
Example #24
    def __createConcatenatedAlignment(self, binFiles, resultsParser,
                                      alignOutputDir):
        """Create a concatenated alignment of marker genes for each bin."""

        # read alignment files
        self.logger.info('  Reading marker alignment files.')
        alignments = defaultdict(dict)
        files = os.listdir(alignOutputDir)
        binIds = set()
        for f in files:
            if f.endswith('.masked.faa'):
                markerId = f[0:f.find('.masked.faa')]
                seqs = readFasta(os.path.join(alignOutputDir, f))

                for seqId, seq in seqs.items():
                    binId = seqId[0:seqId.find(DefaultValues.SEQ_CONCAT_CHAR)]

                    alignments[markerId][binId] = seq
                    binIds.add(binId)

        # get all markers and their lengths
        markerIds = list(resultsParser.models[list(
            resultsParser.models.keys())[0]].keys())
        markerIdLens = {}
        for markerId in markerIds:
            markerIdLens[markerId] = resultsParser.models[list(
                resultsParser.models.keys())[0]][markerId].leng

        # create concatenated alignment
        self.logger.info('  Concatenating alignments.')
        concatenatedSeqs = {}
        for markerId in sorted(markerIds):
            seqs = alignments[markerId]

            for binId in binIds:
                if binId in seqs:
                    # append alignment
                    concatenatedSeqs[binId] = concatenatedSeqs.get(
                        binId, '') + seqs[binId]
                else:
                    # missing gene
                    concatenatedSeqs[binId] = concatenatedSeqs.get(
                        binId, '') + '-' * markerIdLens[markerId]

        # save concatenated alignment
        concatenatedAlignFile = os.path.join(
            alignOutputDir, DefaultValues.PPLACER_CONCAT_SEQ_OUT)
        writeFasta(concatenatedSeqs, concatenatedAlignFile)

        return concatenatedAlignFile
Example #25
    def plot(self, fastaFile):
        # Set size of figure
        self.fig.clear()
        self.fig.set_size_inches(self.options.width, self.options.height)
        axes = self.fig.add_subplot(111)

        # calculate Nx
        seqs = readFasta(fastaFile)
        x = np.arange(0, 1.0 + 0.5 * self.options.step_size,
                      self.options.step_size)
        nx = self.calculateNx(x, seqs)

        # Create plot
        axes.plot(x, nx, 'ko-', ms=4)
        axes.set_xlabel('Nx')
        axes.set_ylabel('Sequence length (kbp)')

        # Change sequence lengths from bp to kbp
        yticks = axes.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = '%.1f' % (float(seqLen) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axes.set_yticklabels(kbpLabels)

        # Prettify plot
        for a in axes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axes.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        self.fig.tight_layout(pad=1)
        self.draw()
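calculateNx is defined elsewhere in this plotting class. The sketch below shows one reasonable interpretation, returning values in bp as the axis-relabelling code above expects: for each fraction x, the length of the shortest sequence among the longest sequences whose combined length first reaches x of the total bases.

def calculateNx(x, seqs):
    """Nx values (in bp) for a list of fractions x and a {seqId: sequence} dict."""
    seqLens = sorted((len(seq) for seq in seqs.values()), reverse=True)
    totalBases = sum(seqLens)
    nx = []
    for frac in x:
        target = frac * totalBases
        cumulative = 0
        nxLen = 0
        for seqLen in seqLens:
            cumulative += seqLen
            nxLen = seqLen
            if cumulative >= target:
                break
        nx.append(nxLen)
    return nx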
Example #26
    def plot(self, fastaFile):
        # Set size of figure
        self.fig.clear()
        self.fig.set_size_inches(self.options.width, self.options.height)
        axes = self.fig.add_subplot(111)

        # calculate Nx
        seqs = readFasta(fastaFile)
        x = np.arange(0, 1.0 + 0.5 * self.options.step_size, self.options.step_size)
        nx = self.calculateNx(x, seqs)

        # Create plot
        axes.plot(x, nx, 'ko-', ms=4)
        axes.set_xlabel('Nx')
        axes.set_ylabel('Sequence length (kbp)')

        # Change sequence lengths from bp to kbp
        yticks = axes.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = '%.1f' % (float(seqLen) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axes.set_yticklabels(kbpLabels)

        # Prettify plot
        for a in axes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axes.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        self.fig.tight_layout(pad=1)
        self.draw()
Example #27
    def calculateCodingDensity(self, outDir, genomeSize, seqStats):
        """Calculate coding density of putative genome bin."""
        gffFile = os.path.join(outDir, DefaultValues.PRODIGAL_GFF)
        if os.path.exists(gffFile):
            prodigalParserGFF = ProdigalGeneFeatureParser(gffFile)

            aaFile = os.path.join(outDir, DefaultValues.PRODIGAL_AA)  # use AA file as nucleotide file is optional
            aaGenes = readFasta(aaFile)

            codingBasePairs = self.__calculateCodingBases(aaGenes, seqStats)

            return float(codingBasePairs) / genomeSize, prodigalParserGFF.translationTable, len(aaGenes)
        else:
            # there is no gene feature file (perhaps the user specified pre-calculated genes)
            # so calculating the coding density is not possible
            return -1, -1, -1
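The private __calculateCodingBases helper is not shown in Example #27. Given that sequenceStats (Example #1) counts three coding bases per residue of each predicted protein, a consistent stand-in (the real helper also receives seqStats and may restrict the count to the scaffolds recorded there) would be:

def calculateCodingBases(aaGenes):
    """Total coding bases implied by predicted protein sequences
    (three nucleotides per amino acid)."""
    return sum(len(gene) * 3 for gene in aaGenes.values())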
Example #28
    def __createConcatenatedAlignment(self, binFiles, resultsParser, alignOutputDir):
        """Create a concatenated alignment of marker genes for each bin."""

        # read alignment files
        self.logger.info('  Reading marker alignment files.')
        alignments = defaultdict(dict)
        files = os.listdir(alignOutputDir)
        binIds = set()
        for f in files:
            if f.endswith('.masked.faa'):
                markerId = f[0:f.find('.masked.faa')]
                seqs = readFasta(os.path.join(alignOutputDir, f))

                for seqId, seq in seqs.iteritems():
                    binId = seqId[0:seqId.find(DefaultValues.SEQ_CONCAT_CHAR)]

                    alignments[markerId][binId] = seq
                    binIds.add(binId)

        # get all markers and their lengths
        markerIds = resultsParser.models[resultsParser.models.keys()[0]].keys()
        markerIdLens = {}
        for markerId in markerIds:
            markerIdLens[markerId] = resultsParser.models[resultsParser.models.keys()[0]][markerId].leng

        # create concatenated alignment
        self.logger.info('  Concatenating alignments.')
        concatenatedSeqs = {}
        for markerId in sorted(markerIds):
            seqs = alignments[markerId]

            for binId in binIds:
                if binId in seqs:
                    # append alignment
                    concatenatedSeqs[binId] = concatenatedSeqs.get(binId, '') + seqs[binId]
                else:
                    # missing gene
                    concatenatedSeqs[binId] = concatenatedSeqs.get(binId, '') + '-' * markerIdLens[markerId]

        # save concatenated alignment
        concatenatedAlignFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_CONCAT_SEQ_OUT)
        writeFasta(concatenatedSeqs, concatenatedAlignFile)

        return concatenatedAlignFile
Example #29
    def calculate(self, seqFile, outputFile):
        """Calculate genomic signature of each sequence."""

        self.logger.info(
            '  Determining tetranucleotide signature of each sequence.')

        # process each sequence in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        seqs = readFasta(seqFile)

        for seqId, seq in seqs.items():
            workerQueue.put((seqId, seq))

        for _ in range(self.totalThreads):
            workerQueue.put((None, None))

        try:
            calcProc = [
                mp.Process(target=self.__calculateResults,
                           args=(workerQueue, writerQueue))
                for _ in range(self.totalThreads)
            ]
            writeProc = mp.Process(target=self.__storeResults,
                                   args=(seqFile, outputFile, len(seqs),
                                         writerQueue))

            writeProc.start()

            for p in calcProc:
                p.start()

            for p in calcProc:
                p.join()

            writerQueue.put((None, None))
            writeProc.join()
        except:
            # make sure all processes are terminated
            for p in calcProc:
                p.terminate()

            writeProc.terminate()
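The worker processes ultimately compute a tetranucleotide signature for each sequence. As a rough, self-contained illustration of what such a signature is (CheckM's real GenomicSignatures class may additionally collapse each k-mer with its reverse complement, which this sketch does not):

from itertools import product

def seqSignature(seq, k=4):
    """Normalized k-mer frequency vector for a sequence; k-mers containing
    ambiguous bases are skipped."""
    kmers = [''.join(p) for p in product('ACGT', repeat=k)]
    index = {kmer: i for i, kmer in enumerate(kmers)}
    counts = [0] * len(kmers)
    s = seq.upper()
    for i in range(len(s) - k + 1):
        kmer = s[i:i + k]
        if kmer in index:
            counts[index[kmer]] += 1
    totalKmers = float(sum(counts))
    return [c / totalKmers if totalKmers else 0.0 for c in counts]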
Example #30
    def __processBin(self, outDir, queueIn, queueOut):
        """Thread safe bin processing."""
        while True:
            binFile = queueIn.get(block=True, timeout=None)
            if binFile == None:
                break

            binStats = {}

            binId = binIdFromFilename(binFile)
            binDir = os.path.join(outDir, 'bins', binId)
            makeSurePathExists(binDir)

            # read scaffolds
            scaffolds = readFasta(binFile)

            # calculate GC statistics
            GC, stdGC = self.calculateGC(scaffolds)
            binStats['GC'] = GC
            binStats['GC std'] = stdGC

            # calculate statistics related to contigs and scaffolds
            maxScaffoldLen, maxContigLen, genomeSize, scaffold_N50, contig_N50, scaffoldAvgLen, contigAvgLen, numContigs, numAmbiguousBases = self.calculateSeqStats(
                scaffolds)
            binStats['Genome size'] = genomeSize
            binStats['# ambiguous bases'] = numAmbiguousBases
            binStats['# scaffolds'] = len(scaffolds)
            binStats['# contigs'] = numContigs
            binStats['Longest scaffold'] = maxScaffoldLen
            binStats['Longest contig'] = maxContigLen
            binStats['N50 (scaffolds)'] = scaffold_N50
            binStats['N50 (contigs)'] = contig_N50
            binStats['Mean scaffold length'] = scaffoldAvgLen
            binStats['Mean contig length'] = contigAvgLen

            # calculate coding density statistics
            codingDensity, translationTable, numORFs = self.calculateCodingDensity(
                binDir, scaffolds, genomeSize)
            binStats['Coding density'] = codingDensity
            binStats['Translation table'] = translationTable
            binStats['# predicted genes'] = numORFs

            queueOut.put((binId, binStats))
Example #31
    def __processBin(self, outDir, queueIn, queueOut):
        """Thread safe bin processing."""
        while True:
            binFile = queueIn.get(block=True, timeout=None)
            if binFile == None:
                break

            binStats = {}
            scaffoldStats = {}

            binId = binIdFromFilename(binFile)
            binDir = os.path.join(outDir, 'bins', binId)
            makeSurePathExists(binDir)

            # read scaffolds
            scaffolds = readFasta(binFile)
            for seqId in scaffolds:
                scaffoldStats[seqId] = {}

            # calculate GC statistics
            GC, stdGC = self.calculateGC(scaffolds, scaffoldStats)
            binStats['GC'] = GC
            binStats['GC std'] = stdGC

            # calculate statistics related to scaffold lengths
            maxScaffoldLen, maxContigLen, genomeSize, scaffold_N50, contig_N50, numContigs, numAmbiguousBases = self.calculateSeqStats(scaffolds, scaffoldStats)
            binStats['Genome size'] = genomeSize
            binStats['# ambiguous bases'] = numAmbiguousBases
            binStats['# scaffolds'] = len(scaffolds)
            binStats['# contigs'] = numContigs
            binStats['Longest scaffold'] = maxScaffoldLen
            binStats['Longest contig'] = maxContigLen
            binStats['N50 (scaffolds)'] = scaffold_N50
            binStats['N50 (contigs)'] = contig_N50

            # calculate coding density statistics
            codingDensity, translationTable, numORFs = self.calculateCodingDensity(binDir, genomeSize, scaffoldStats)
            binStats['Coding density'] = codingDensity
            binStats['Translation table'] = translationTable
            binStats['# predicted genes'] = numORFs

            queueOut.put((binId, binStats, scaffoldStats))
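calculateGC fills the per-scaffold stats dict and returns the bin-wide GC together with a measure of its spread. The exact weighting CheckM uses is not visible from these snippets; one simple interpretation, reusing the baseCount sketch shown earlier, is:

import numpy as np

def calculateGC(seqs, seqStats=None):
    """Bin-wide GC (over all unambiguous bases) and the standard deviation of per-sequence GC."""
    perSeqGC = []
    totalGC = 0
    totalAT = 0
    for seqId, seq in seqs.items():
        a, c, g, t = baseCount(seq)
        gc = float(g + c) / max(a + c + g + t, 1)
        perSeqGC.append(gc)
        totalGC += g + c
        totalAT += a + t
        if seqStats is not None:
            seqStats[seqId]['GC'] = gc
    binGC = float(totalGC) / max(totalGC + totalAT, 1)
    return binGC, np.std(perSeqGC)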
Example #32
    def calculateCodingDensity(self, outDir, scaffolds, genomeSize):
        """Calculate coding density of putative genome bin."""
        gffFile = os.path.join(outDir, DefaultValues.PRODIGAL_GFF)
        if os.path.exists(gffFile):
            prodigalParserGFF = ProdigalGeneFeatureParser(gffFile)

            aaFile = os.path.join(
                outDir, DefaultValues.PRODIGAL_AA
            )  # use AA file as nucleotide file is optional
            aaGenes = readFasta(aaFile)

            codingBasePairs = 0  # self.__calculateCodingBases(aaGenes)
            for scaffold_id in scaffolds.keys():
                codingBasePairs += prodigalParserGFF.codingBases(scaffold_id)

            return float(
                codingBasePairs
            ) / genomeSize, prodigalParserGFF.translationTable, len(aaGenes)
        else:
            # there is no gene feature file (perhaps the user specified pre-calculated genes)
            # so calculating the coding density is not possible
            return -1, -1, -1
Example #33
    def __extractMarkerSeqsUnique(self, outDir, resultsParser):
        """Extract marker sequences with a single unique hit."""

        markerSeqs = defaultdict(dict)
        markerStats = defaultdict(dict)
        for binId in resultsParser.results:
            # read ORFs for bin
            aaGeneFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
            binORFs = readFasta(aaGeneFile)

            # extract ORFs hitting a marker
            for markerId, hits in resultsParser.results[binId].markerHits.iteritems():
                markerSeqs[markerId][binId] = {}
                markerStats[markerId][binId] = {}

                # only record hits which are unique
                if len(hits) == 1:
                    hit = hits[0]
                    markerSeqs[markerId][binId][hit.target_name] = self.__extractSeq(hit.target_name, binORFs)
                    markerStats[markerId][binId][hit.target_name] = [hit.full_e_value, hit.full_score]

        return markerSeqs, markerStats
Example #34
    def __extractMarkersWithMultipleHits(self, outDir, binId, resultsParser, binMarkerSet):
        """Extract markers with multiple hits within a single bin."""

        markersWithMultipleHits = defaultdict(dict)

        aaGeneFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
        binORFs = readFasta(aaGeneFile)

        markerGenes = binMarkerSet.selectedMarkerSet().getMarkerGenes()
        for markerId, hits in resultsParser.results[binId].markerHits.iteritems():
            if markerId not in markerGenes or len(hits) < 2:
                continue

            # sort hits from highest to lowest e-value in order to ensure only the best hit
            # to a given target is retained
            hits.sort(key=lambda x: x.full_e_value, reverse=True)

            # Note: this data structure is used to mimic that used by __extractMarkerSeqsTopHits()
            markersWithMultipleHits[markerId][binId] = {}
            for hit in hits:
                markersWithMultipleHits[markerId][binId][hit.target_name] = self.__extractSeq(hit.target_name, binORFs)

        return markersWithMultipleHits
Example #35
    def reportFullMSA(self, outDir, outFile):
        """Create MSA with all reference and bin alignments."""

        # write bin alignments to file
        oldStdOut = reassignStdOut(outFile)
        for line in open(os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_CONCAT_SEQ_OUT)):
            print(line.rstrip())

        # read duplicate seqs
        duplicateNodes = self.__readDuplicateSeqs()

        # write reference alignments to file
        seqs = readFasta(os.path.join(DefaultValues.PPLACER_REF_PACKAGE, 'genome_tree.concatenated.derep.fasta'))
        for seqId, seq in seqs.iteritems():
            print('>' + seqId)
            print(seq)

            if seqId in duplicateNodes:
                for dupSeqId in duplicateNodes[seqId]:
                    print('>' + dupSeqId)
                    print(seq)

        restoreStdOut(outFile, oldStdOut)
Example #36
    def __extractMarkerSeqsTopHits(self, outDir, resultsParser):
        """Extract marker sequences from top hits (assume all bins use the same HMM file)."""

        markerSeqs = defaultdict(dict)
        markerStats = defaultdict(dict)
        for binId in resultsParser.results:
            # read ORFs for bin
            aaGeneFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
            binORFs = readFasta(aaGeneFile)

            # extract ORFs hitting a marker
            for markerId, hits in resultsParser.results[binId].markerHits.iteritems():
                markerSeqs[markerId][binId] = {}
                markerStats[markerId][binId] = {}

                # sort hits from highest to lowest e-value in order to ensure only the best hit
                # to a given target is retained
                hits.sort(key=lambda x: x.full_e_value, reverse=True)
                topHit = hits[0]
                markerSeqs[markerId][binId][topHit.target_name] = self.__extractSeq(topHit.target_name, binORFs)
                markerStats[markerId][binId][topHit.target_name] = [topHit.full_e_value, topHit.full_score]

        return markerSeqs, markerStats
Example #37
    def calculate(self, seqFile, outputFile):
        """Calculate genomic signature of each sequence."""

        self.logger.info('  Determining tetranucleotide signature of each sequence.')

        # process each sequence in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        seqs = readFasta(seqFile)

        for seqId, seq in seqs.iteritems():
            workerQueue.put((seqId, seq))

        for _ in range(self.totalThreads):
            workerQueue.put((None, None))

        try:
            calcProc = [mp.Process(target=self.__calculateResults, args=(workerQueue, writerQueue)) for _ in range(self.totalThreads)]
            writeProc = mp.Process(target=self.__storeResults, args=(seqFile, outputFile, len(seqs), writerQueue))

            writeProc.start()

            for p in calcProc:
                p.start()

            for p in calcProc:
                p.join()

            writerQueue.put((None, None))
            writeProc.join()
        except:
            # make sure all processes are terminated
            for p in calcProc:
                p.terminate()

            writeProc.terminate()
Example #38
    def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDeltaTD):
        # Read reference distributions from file
        dist = readDistribution("td_dist")

        # get tetranucleotide signature for bin
        seqs = readFasta(fastaFile)

        binTools = BinTools()
        binSig = binTools.binTetraSig(seqs, tetraSigs)

        # get tetranucleotide distances for windows
        genomicSig = GenomicSignatures(K=4, threads=1)

        data = []
        seqLens = []
        deltaTDs = []
        for seqId, seq in seqs.iteritems():
            start = 0
            end = self.options.td_window_size

            seqLen = len(seq)
            seqLens.append(seqLen)
            deltaTDs.append(genomicSig.distance(tetraSigs[seqId], binSig))

            while end < seqLen:
                windowSig = genomicSig.seqSignature(seq[start:end])
                data.append(genomicSig.distance(windowSig, binSig))

                start = end
                end += self.options.td_window_size

        if len(data) == 0:
            axesHist.set_xlabel("[Error] No seqs >= %d, the specified window size" % self.options.td_window_size)
            return

        deltaTDs = np.array(deltaTDs)

        # Histogram plot
        bins = [0.0]
        binWidth = self.options.td_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel(r"$\Delta$ TD")
        axesHist.set_ylabel("% windows (" + str(self.options.td_window_size) + " bp)")

        # Prettify plot
        for a in axesHist.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesHist.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesHist.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesHist.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axesHist.spines.iteritems():
            if loc in ["right", "top"]:
                spine.set_color("none")
            else:
                spine.set_color(self.axesColour)

        # get TD bin statistics
        meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

        # Delta-TD vs Sequence length plot
        axesDeltaTD.scatter(deltaTDs, seqLens, c=abs(deltaTDs), s=10, lw=0.5, cmap="gray_r")
        axesDeltaTD.set_xlabel(r"$\Delta$ TD (mean TD = %.2f)" % meanTD)
        axesDeltaTD.set_ylabel("Sequence length (kbp)")

        _, yMaxSeqs = axesDeltaTD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaTD.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            boundKey = findNearest(dist[dist.keys()[0]].keys(), distToPlot)

            x = []
            y = []
            for windowSize in dist:
                x.append(dist[windowSize][boundKey])
                y.append(windowSize)

            # sort by y-values
            sortIndexY = np.argsort(y)
            x = np.array(x)[sortIndexY]
            y = np.array(y)[sortIndexY]

            # make sure x-values are strictly decreasing as y increases
            # as this is conservative and visually satisfying
            for i in xrange(0, len(x) - 1):
                for j in xrange(i + 1, len(x)):
                    if x[j] > x[i]:
                        if j == len(x) - 1:
                            x[j] = x[i]
                        else:
                            x[j] = (x[j - 1] + x[j + 1]) / 2  # interpolate values from neighbours

                        if x[j] > x[i]:
                            x[j] = x[i]

            axesDeltaTD.plot(x, y, "r--", lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaTD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaTD.vlines(0, 0, yMaxSeqs, linestyle="dashed", color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp
        yticks = axesDeltaTD.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = "%.1f" % (float(seqLen) / 1000)
            label = label.replace(".0", "")  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaTD.set_yticklabels(kbpLabels)

        # Prettify plot
        for a in axesDeltaTD.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesDeltaTD.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesDeltaTD.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesDeltaTD.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axesDeltaTD.spines.iteritems():
            if loc in ["right", "top"]:
                spine.set_color("none")
            else:
                spine.set_color(self.axesColour)
Example #39
    def __workerThread(self, tree, metadata, genomeIdsToTest,
                       ubiquityThreshold, singleCopyThreshold, numReplicates,
                       queueIn, queueOut):
        """Process each data item in parallel."""

        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                break

            # build marker sets for evaluating test genome
            testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(
                tree,
                testNode.parent_node,
                ubiquityThreshold,
                singleCopyThreshold,
                bMarkerSet=True,
                genomeIdsToRemove=[testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable(
                [testGenomeId],
                binMarkerSets.getMarkerGenes(),
                spacingBetweenContigs=0)

            # estimate completeness of unmodified genome
            unmodifiedComp = {}
            unmodifiedCont = {}
            for ms in binMarkerSets.markerSetIter():
                hits = {}
                for mg in ms.getMarkerGenes():
                    if mg in geneDistTable[testGenomeId]:
                        hits[mg] = geneDistTable[testGenomeId][mg]
                completeness, contamination = ms.genomeCheck(
                    hits, bIndividualMarkers=True)
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination

            # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets
            testSeqs = readFasta(
                os.path.join(self.img.genomeDir, testGenomeId,
                             testGenomeId + '.fna'))
            testSeqLens, genomeSize = self.__seqLens(testSeqs)

            for contigLen in self.contigLens:
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)

                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)

                        trueComps = []
                        trueConts = []

                        numDescendants = {}

                        for i in xrange(0, numReplicates):
                            # generate test genome with a specific level of completeness, by randomly sampling scaffolds to remove
                            # (this will sample >= the desired level of completeness)
                            retainedTestSeqs, trueComp = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(
                                percentComp, testSeqLens, genomeSize)
                            trueComps.append(trueComp)

                            # select a random genome to use as a source of contamination
                            contGenomeId = random.sample(
                                genomeIdsToTest - set([testGenomeId]), 1)[0]
                            contSeqs = readFasta(
                                os.path.join(self.img.genomeDir, contGenomeId,
                                             contGenomeId + '.fna'))
                            contSeqLens, contGenomeSize = self.__seqLens(
                                contSeqs)
                            seqsToRetain, trueRetainedPer = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(
                                1 - percentCont, contSeqLens, contGenomeSize)

                            contSampledSeqIds = set(
                                contSeqs.keys()).difference(seqsToRetain)
                            trueCont = 100.0 - trueRetainedPer
                            trueConts.append(trueCont)

                            for ms in binMarkerSets.markerSetIter():
                                numDescendants[ms.lineageStr] = ms.numGenomes
                                containedMarkerGenes = defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(
                                    ms.getMarkerGenes(), testGenomeId,
                                    retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(
                                    ms.getMarkerGenes(), contGenomeId,
                                    contSampledSeqIds, containedMarkerGenes)

                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes,
                                    bIndividualMarkers=True)
                                deltaComp[ms.lineageStr].append(completeness -
                                                                trueComp)
                                deltaCont[ms.lineageStr].append(contamination -
                                                                trueCont)

                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes,
                                    bIndividualMarkers=False)
                                deltaCompSet[ms.lineageStr].append(
                                    completeness - trueComp)
                                deltaContSet[ms.lineageStr].append(
                                    contamination - trueCont)

                            for ms in refinedBinMarkerSet.markerSetIter():
                                containedMarkerGenes = defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(
                                    ms.getMarkerGenes(), testGenomeId,
                                    retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(
                                    ms.getMarkerGenes(), contGenomeId,
                                    contSampledSeqIds, containedMarkerGenes)

                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes,
                                    bIndividualMarkers=True)
                                deltaCompRefined[ms.lineageStr].append(
                                    completeness - trueComp)
                                deltaContRefined[ms.lineageStr].append(
                                    contamination - trueCont)

                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes,
                                    bIndividualMarkers=False)
                                deltaCompSetRefined[ms.lineageStr].append(
                                    completeness - trueComp)
                                deltaContSetRefined[ms.lineageStr].append(
                                    contamination - trueCont)

                        taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])
                        queueOut.put(
                            (testGenomeId, contigLen, percentComp, percentCont,
                             taxonomy, numDescendants, unmodifiedComp,
                             unmodifiedCont, deltaComp, deltaCont,
                             deltaCompSet, deltaContSet, deltaCompRefined,
                             deltaContRefined, deltaCompSetRefined,
                             deltaContSetRefined, trueComps, trueConts))
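This worker pulls genome IDs from queueIn until it receives a None sentinel and pushes one result tuple per genome onto queueOut; the dispatching side is not shown above. A minimal, hedged sketch of that producer/consumer pattern (illustrative names only, not CheckM's manager code):

import multiprocessing as mp

def worker(queueIn, queueOut):
    while True:
        item = queueIn.get(block=True)
        if item is None:  # sentinel: no more work
            break
        queueOut.put((item, item * 2))  # stand-in for the real per-genome analysis

if __name__ == '__main__':
    queueIn, queueOut = mp.Queue(), mp.Queue()
    numWorkers = 2
    jobs = list(range(10))
    for jobId in jobs:
        queueIn.put(jobId)
    for _ in range(numWorkers):
        queueIn.put(None)  # one sentinel per worker
    workers = [mp.Process(target=worker, args=(queueIn, queueOut)) for _ in range(numWorkers)]
    for p in workers:
        p.start()
    results = [queueOut.get() for _ in jobs]
    for p in workers:
        p.join()
    print(sorted(results))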
Example no. 40
0
    def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist,
                   axesDeltaTD):
        # Read reference distributions from file
        dist = readDistribution('td_dist')

        # get tetranucleotide signature for bin
        seqs = readFasta(fastaFile)

        binTools = BinTools()
        binSig = binTools.binTetraSig(seqs, tetraSigs)

        # get tetranucleotide distances for windows
        genomicSig = GenomicSignatures(K=4, threads=1)

        data = []
        seqLens = []
        deltaTDs = []
        for seqId, seq in seqs.iteritems():
            start = 0
            end = self.options.td_window_size

            seqLen = len(seq)
            seqLens.append(seqLen)
            deltaTDs.append(genomicSig.distance(tetraSigs[seqId], binSig))

            while (end < seqLen):
                windowSig = genomicSig.seqSignature(seq[start:end])
                data.append(genomicSig.distance(windowSig, binSig))

                start = end
                end += self.options.td_window_size

        if len(data) == 0:
            axesHist.set_xlabel(
                '[Error] No seqs >= %d, the specified window size' %
                self.options.td_window_size)
            return

        deltaTDs = np.array(deltaTDs)

        # Histogram plot
        bins = [0.0]
        binWidth = self.options.td_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel(r'$\Delta$ TD')
        axesHist.set_ylabel('% windows (' + str(self.options.td_window_size) +
                            ' bp)')

        # Prettify plot
        for a in axesHist.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesHist.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesHist.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesHist.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axesHist.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        # get CD bin statistics
        meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs,
                                                  binSig)

        # Delta-TD vs Sequence length plot
        axesDeltaTD.scatter(deltaTDs,
                            seqLens,
                            c=abs(deltaTDs),
                            s=10,
                            lw=0.5,
                            cmap='gray_r')
        axesDeltaTD.set_xlabel(r'$\Delta$ TD (mean TD = %.2f)' % meanTD)
        axesDeltaTD.set_ylabel('Sequence length (kbp)')

        _, yMaxSeqs = axesDeltaTD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaTD.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            boundKey = findNearest(dist[dist.keys()[0]].keys(), distToPlot)

            x = []
            y = []
            for windowSize in dist:
                x.append(dist[windowSize][boundKey])
                y.append(windowSize)

            # sort by y-values
            sortIndexY = np.argsort(y)
            x = np.array(x)[sortIndexY]
            y = np.array(y)[sortIndexY]

            # make sure x-values are strictly decreasing as y increases
            # as this is conservative and visually satisfying
            for i in xrange(0, len(x) - 1):
                for j in xrange(i + 1, len(x)):
                    if x[j] > x[i]:
                        if j == len(x) - 1:
                            x[j] = x[i]
                        else:
                            x[j] = (x[j - 1] + x[j + 1]
                                    ) / 2  # interpolate values from neighbours

                        if x[j] > x[i]:
                            x[j] = x[i]

            axesDeltaTD.plot(x, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis includes zero and covers all sequences
        axesDeltaTD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaTD.vlines(0,
                           0,
                           yMaxSeqs,
                           linestyle='dashed',
                           color=self.axesColour,
                           zorder=0)

        # Change sequence lengths from bp to kbp
        yticks = axesDeltaTD.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = '%.1f' % (float(seqLen) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaTD.set_yticklabels(kbpLabels)

        # Prettify plot
        for a in axesDeltaTD.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesDeltaTD.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesDeltaTD.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesDeltaTD.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axesDeltaTD.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)
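GenomicSignatures and binTetraSig are used above but not shown. A hedged illustration of the underlying idea, a k-mer frequency vector compared with a Manhattan-style distance (CheckM's real implementation canonicalises reverse complements and may differ in detail):

from itertools import product

def tetraSignature(seq, k=4):
    kmers = [''.join(p) for p in product('ACGT', repeat=k)]
    counts = dict.fromkeys(kmers, 0)
    seq = seq.upper()
    for i in range(len(seq) - k + 1):
        kmer = seq[i:i + k]
        if kmer in counts:  # skip windows containing ambiguous bases
            counts[kmer] += 1
    total = float(sum(counts.values())) or 1.0
    return [counts[kmer] / total for kmer in kmers]

def manhattanDist(sig1, sig2):
    return sum(abs(a - b) for a, b in zip(sig1, sig2))

print(manhattanDist(tetraSignature('ACGT' * 50), tetraSignature('AAGT' * 50)))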
Example no. 41
0
    def run(self, contigFile, binFiles, outputDir, evalueThreshold, concatenateThreshold):
        # make sure output directory exists
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        # get bin id of binned contigs
        self.logger.info('  Determining bin assignment of sequences.')
        seqIdToBinId = {}
        for f in binFiles:
            binId = binIdFromFilename(f)
            seqIds = readFastaSeqIds(f)
            for seqId in seqIds:
                seqIdToBinId[seqId] = binId

        # identify 16S reads from contigs/scaffolds
        self.logger.info('  Identifying SSU rRNAs on sequences.')
        self.__hmmSearch(contigFile, evalueThreshold, os.path.join(outputDir, 'ssu'))

        # read HMM hits
        hitsPerDomain = {}
        for domain in ['archaea', 'bacteria', 'euk']:
            hits = {}

            seqInfo = self.__readHits(os.path.join(outputDir, 'ssu' + '.' + domain + '.txt'), domain, evalueThreshold)
            if len(seqInfo) > 0:
                for seqId, seqHits in seqInfo.iteritems():
                    for hit in seqHits:
                        self.__addHit(hits, seqId, hit, concatenateThreshold)

            hitsPerDomain[domain] = hits

        # find best domain hit for each sequence
        bestHits = {}
        for _, hits in hitsPerDomain.iteritems():
            for seqId, info in hits.iteritems():
                if '-#' in seqId:
                    seqId = seqId[0:seqId.rfind('-#')]

                self.__addDomainHit(bestHits, seqId, info)

        # write summary file and putative SSU rRNAs to file
        summaryFile = os.path.join(outputDir, 'ssu_summary.tsv')
        summaryOut = open(summaryFile, 'w')
        summaryOut.write('Bin Id\tSeq. Id\tHMM\ti-Evalue\tStart hit\tEnd hit\t16S/18S gene length\tRev. Complement\tSequence length\n')

        seqFile = os.path.join(outputDir, 'ssu.fna')
        seqOut = open(seqFile, 'w')

        seqs = readFasta(contigFile)

        hitsToBins = {}
        for seqId in bestHits:
            origSeqId = seqId
            if '-#' in seqId:
                seqId = seqId[0:seqId.rfind('-#')]

            if seqId in seqIdToBinId:
                binId = seqIdToBinId[seqId]
            else:
                binId = DefaultValues.UNBINNED

            seqInfo = [origSeqId] + bestHits[origSeqId]
            hitsToBins[binId] = hitsToBins.get(binId, []) + [seqInfo]

        for binId in sorted(hitsToBins.keys()):
            for seqInfo in hitsToBins[binId]:
                seqId = seqInfo[0]
                if '-#' in seqId:
                    seqId = seqId[0:seqId.rfind('-#')]

                seq = seqs[seqId]
                summaryOut.write(binId + '\t' + '\t'.join(seqInfo) + '\t' + str(len(seq)) + '\n')
                seqOut.write('>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqInfo[0] + '\n')
                seqOut.write(seq[int(seqInfo[3]):int(seqInfo[4])] + '\n')

        summaryOut.close()
        seqOut.close()

        self.logger.info('')
        self.logger.info('  Identified ' + str(len(bestHits)) + ' putative SSU genes:')
        self.logger.info('    Summary of identified hits written to: ' + summaryFile)
        self.logger.info('    SSU sequences written to: ' + seqFile)
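The best-hit bookkeeping (__addHit and __addDomainHit) is not shown above. A hedged toy illustration of the core selection step, keeping the hit with the lowest e-value when a sequence matches more than one domain model (the tuple layout here is assumed, not CheckM's):

# assumed hit layout: (model name, i-Evalue, start, end)
hitsPerDomain = {
    'archaea': {'contig_1': ('SSU_archaea', 1e-30, 5, 1500)},
    'bacteria': {'contig_1': ('SSU_bacteria', 1e-80, 10, 1520),
                 'contig_2': ('SSU_bacteria', 1e-60, 1, 1400)},
    'euk': {},
}

bestHits = {}
for domain, hits in hitsPerDomain.items():
    for seqId, info in hits.items():
        if seqId not in bestHits or info[1] < bestHits[seqId][1]:
            bestHits[seqId] = info  # keep the hit with the lowest e-value

print(bestHits)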
Example no. 42
0
    def report(self, binFiles1, binFiles2, seqFile, outputFile):
        # determine total number of sequences
        self.logger.info('  Reading sequences.')
        seqs = readFasta(seqFile)

        seqLens = {}
        totalBases = 0
        numSeq1K = 0
        totalBases1K = 0
        numSeq5K = 0
        totalBases5K = 0
        for seqId, seq in seqs.items():
            seqLen = len(seq)
            seqLens[seqId] = seqLen
            totalBases += seqLen
            if seqLen >= 1000:
                numSeq1K += 1
                totalBases1K += seqLen
            if seqLen >= 5000:
                numSeq5K += 1
                totalBases5K += seqLen

        # determine sequences in each bin
        bins1 = self.__readBins(binFiles1)
        bins2 = self.__readBins(binFiles2)

        # determine bin stats
        binStats1, totalUniqueBinnedSeqs1, totalUniqueBinnedBases1, numRepeats1 = self.__binningStats(
            bins1, seqLens)
        binStats2, totalUniqueBinnedSeqs2, totalUniqueBinnedBases2, numRepeats2 = self.__binningStats(
            bins2, seqLens)

        # sort bins by size
        binStats1 = sorted(iter(binStats1.items()),
                           key=lambda x: x[1][1],
                           reverse=True)
        binStats2 = sorted(iter(binStats2.items()),
                           key=lambda x: x[1][1],
                           reverse=True)

        # report summary results
        self.logger.info('    Total seqs = %d (%.2f Mbp)' %
                         (len(seqs), float(totalBases) / 1e6))
        self.logger.info('      # seqs > 1 kbp = %d (%.2f Mbp)' %
                         (numSeq1K, float(totalBases1K) / 1e6))
        self.logger.info('      # seqs > 5 kbp = %d (%.2f Mbp)' %
                         (numSeq5K, float(totalBases5K) / 1e6))
        self.logger.info('')
        self.logger.info('  Binned seqs statistics:')
        self.logger.info(
            '    1) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
            % (len(bins1), totalUniqueBinnedSeqs1,
               float(totalUniqueBinnedSeqs1) * 100 / len(seqs),
               float(totalUniqueBinnedBases1) / 1e6,
               float(totalUniqueBinnedBases1) * 100 / totalBases, numRepeats1))
        self.logger.info(
            '    2) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
            % (len(bins2), totalUniqueBinnedSeqs2,
               float(totalUniqueBinnedSeqs2) * 100 / len(seqs),
               float(totalUniqueBinnedBases2) / 1e6,
               float(totalUniqueBinnedBases2) * 100 / totalBases, numRepeats2))

        # output report
        fout = open(outputFile, 'w')
        for data in binStats2:
            fout.write('\t' + data[0])
        fout.write(
            '\tunbinned\t# seqs\t# bases (Mbp)\tBest match\t% bases in common\t% seqs in common\n'
        )

        totalSeqsInCommon2 = defaultdict(int)
        maxBasesInCommon2 = defaultdict(int)
        maxSeqsInCommon2 = defaultdict(int)
        bestMatchingBin2 = {}
        binnedSeqs2 = defaultdict(set)
        for data1 in binStats1:
            binId1 = data1[0]
            fout.write(binId1)

            seqs1 = bins1[binId1]

            maxBasesInCommon = 0
            maxSeqsInCommon = 0
            bestMatchingBin = 'n/a'
            binnedSeqs = set()
            for data2 in binStats2:
                binId2 = data2[0]
                seqs2 = bins2[binId2]

                seqsInCommon = seqs1.intersection(seqs2)
                binnedSeqs.update(seqsInCommon)
                numSeqsInCommon = len(seqsInCommon)
                fout.write('\t' + str(numSeqsInCommon))

                basesInCommon = 0
                for seqId in seqsInCommon:
                    basesInCommon += seqLens[seqId]

                if basesInCommon > maxBasesInCommon:
                    maxBasesInCommon = basesInCommon
                    maxSeqsInCommon = numSeqsInCommon
                    bestMatchingBin = binId2

                if basesInCommon > maxBasesInCommon2[binId2]:
                    maxBasesInCommon2[binId2] = basesInCommon
                    maxSeqsInCommon2[binId2] = numSeqsInCommon
                    bestMatchingBin2[binId2] = binId1

                binnedSeqs2[binId2].update(seqsInCommon)
            fout.write('\t%d\t%d\t%.2f\t%s\t%.2f\t%.2f\n' % (
                len(seqs1) - len(binnedSeqs),
                data1[1][0],
                float(data1[1][1]) / 1e6,
                bestMatchingBin,
                float(maxBasesInCommon) * 100 / data1[1][1],
                float(maxSeqsInCommon) * 100 / data1[1][0],
            ))

        fout.write('unbinned')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%d' % (len(bins2[binId]) - len(binnedSeqs2[binId])))
        fout.write('\n')

        fout.write('# seqs')
        for data in binStats2:
            fout.write('\t%d' % data[1][0])
        fout.write('\n')

        fout.write('# bases (Mbp)')
        for data in binStats2:
            fout.write('\t%.2f' % (float(data[1][1]) / 1e6))
        fout.write('\n')

        fout.write('Best match')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%s' % bestMatchingBin2.get(binId, 'n/a'))
        fout.write('\n')

        fout.write('% bases in common')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%.2f' %
                       (float(maxBasesInCommon2[binId]) * 100 / data[1][1]))
        fout.write('\n')

        fout.write('% seqs in common')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%.2f' %
                       (float(maxSeqsInCommon2[binId]) * 100 / data[1][0]))
        fout.write('\n')

        fout.close()
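A minimal illustration of the comparison at the heart of this report: for each pair of bins from the two binnings, count the shared sequences and sum their lengths (toy data only):

seqLens = {'c1': 5000, 'c2': 3000, 'c3': 8000, 'c4': 1000}
bins1 = {'A': {'c1', 'c2'}, 'B': {'c3'}}
bins2 = {'X': {'c1'}, 'Y': {'c2', 'c3', 'c4'}}

for binId1, seqs1 in bins1.items():
    for binId2, seqs2 in bins2.items():
        common = seqs1 & seqs2
        basesInCommon = sum(seqLens[s] for s in common)
        print('%s vs %s: %d seqs, %d bp in common' % (binId1, binId2, len(common), basesInCommon))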
Example no. 43
0
    def plot(self, f, seqIds, pc, variance):
        # ensure pc matrix has at least 3 dimensions
        if pc.shape[1] == 1:
            pc = np.append(pc, np.zeros((pc.shape[0], 2)), 1)
            variance = np.append(variance[0], np.ones(2))
        elif pc.shape[1] == 2:
            pc = np.append(pc, np.zeros((pc.shape[0], 1)), 1)
            variance = np.append(variance[0:2], np.ones(1))

        # Set size of figure
        self.fig.clear()
        self.fig.set_size_inches(self.options.width, self.options.height)

        axesPC1vsPC2 = self.fig.add_subplot(221)
        axesPC2vsPC3 = self.fig.add_subplot(222)
        axesPC1vsPC3 = self.fig.add_subplot(223)
        axesVariance = self.fig.add_subplot(224)

        # get sequence in bin
        seqs = readFasta(f)

        binIndices = []
        for rowIndex, seqId in enumerate(seqIds):
            if seqId in seqs.keys():
                binIndices.append(rowIndex)

        # plot sequence in bin
        axesPC1vsPC2.scatter(pc[:, 0], pc[:, 1], s=10, lw=0.5, facecolor=(0.8, 0.8, 0.8), marker="o")
        axesPC1vsPC2.scatter(pc[binIndices, 0], pc[binIndices, 1], s=10, lw=0.5, facecolor="r", marker="o")
        axesPC1vsPC2.set_xlabel('PC1 (%.1f%%)' % (variance[0] * 100))
        axesPC1vsPC2.set_ylabel('PC2 (%.1f%%)' % (variance[1] * 100))

        axesPC2vsPC3.scatter(pc[:, 2], pc[:, 1], s=10, lw=0.5, facecolor=(0.8, 0.8, 0.8), marker="o")
        axesPC2vsPC3.scatter(pc[binIndices, 2], pc[binIndices, 1], s=10, lw=0.5, facecolor="r", marker="o")
        axesPC2vsPC3.set_xlabel('PC3 (%.1f%%)' % (variance[2] * 100))
        axesPC2vsPC3.set_ylabel('PC2 (%.1f%%)' % (variance[1] * 100))

        axesPC1vsPC3.scatter(pc[:, 0], pc[:, 2], s=10, lw=0.5, facecolor=(0.8, 0.8, 0.8), marker="o")
        axesPC1vsPC3.scatter(pc[binIndices, 0], pc[binIndices, 2], s=10, lw=0.5, facecolor="r", marker="o")
        axesPC1vsPC3.set_xlabel('PC1 (%.1f%%)' % (variance[0] * 100))
        axesPC1vsPC3.set_ylabel('PC3 (%.1f%%)' % (variance[2] * 100))

        axesVariance.plot(np.arange(len(variance), dtype=int) + 1, np.cumsum(variance))
        axesVariance.set_xlabel('Principal Component')
        axesVariance.set_ylabel('Percentage of Cumulative Variance')
        # axesVariance.vlines(3, 0, 1.0, linestyle='dashed', color=self.axesColour, zorder=0, lw=0.5)
        axesVariance.set_ylim([0, 1.02])
        axesVariance.set_xlim([0, len(variance)])

        axesVariance.get_xaxis().set_major_locator(MaxNLocator(integer=True))
        xticks = axesVariance.get_xticks()
        if 0 in xticks and 1 not in xticks:
            xticks = np.append(np.array([1]), xticks[1:])
        axesVariance.set_xticks(xticks)

        # Prettify plot
        for axes in [axesPC1vsPC2, axesPC2vsPC3, axesPC1vsPC3, axesVariance]:
            for a in axes.yaxis.majorTicks:
                a.tick1On = True
                a.tick2On = False

            for a in axes.xaxis.majorTicks:
                a.tick1On = True
                a.tick2On = False

            for line in axes.yaxis.get_ticklines():
                line.set_color(self.axesColour)

            for line in axes.xaxis.get_ticklines():
                line.set_color(self.axesColour)

            for loc, spine in axes.spines.iteritems():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(self.axesColour)

        self.fig.tight_layout(pad=1, w_pad=2, h_pad=2)
        self.draw()
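The (pc, variance) inputs are computed elsewhere. A hedged sketch of how such a matrix could be produced from per-sequence features with a plain eigendecomposition PCA (this is not CheckM's own PCA routine):

import numpy as np

def pca(features):
    centred = features - features.mean(axis=0)
    cov = np.cov(centred, rowvar=False)
    eigVals, eigVecs = np.linalg.eigh(cov)
    order = np.argsort(eigVals)[::-1]      # largest eigenvalue first
    eigVals, eigVecs = eigVals[order], eigVecs[:, order]
    pc = centred.dot(eigVecs)              # per-sequence scores
    variance = eigVals / eigVals.sum()     # fraction of variance per component
    return pc, variance

features = np.random.rand(20, 5)           # e.g. 20 sequences x 5 features
pc, variance = pca(features)
print(pc.shape, variance[:3])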
Example no. 44
0
    def plotOnAxes(self, binFile, coverageProfile, windowAxes, seqAxes):

        # get GC for windows
        seqs = readFasta(binFile)

        gcProfile = {}
        for seqId, seq in seqs.items():
            start = 0
            end = self.options.window_size

            windowGCs = []
            while (end < len(seq)):
                a, c, g, t = baseCount(seq[start:end])
                windowGCs.append(float(g + c) / (a + c + g + t))

                start = end
                end += self.options.window_size

            a, c, g, t = baseCount(seq)
            seqGC = float(g + c) / (a + c + g + t)
            gcProfile[seqId] = [seqGC, windowGCs]

        # plot GC vs coverage for windows
        gc = []
        coverage = []
        for seqId, gcInfo in gcProfile.items():
            gc += gcInfo[1]
            coverage += coverageProfile[seqId][1]

        windowAxes.scatter(gc,
                           coverage,
                           c=abs(array(coverage)),
                           s=10,
                           lw=0.5,
                           cmap='gray_r')
        windowAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(gc) * 100))
        windowAxes.set_ylabel('Coverage (mean = %.1f)' % mean(coverage))

        # plot linear regression line
        if len(gc) > 1:
            slope, inter = polyfit(gc, coverage, 1)
            fit_fn = poly1d(
                [slope, inter]
            )  # fit_fn is now a function which takes in x and returns an estimate for y
            windowAxes.plot([min(gc), max(gc)],
                            fit_fn([min(gc), max(gc)]),
                            '--r',
                            lw=0.5)
            windowAxes.set_title(
                'GC vs. Coverage\n(window size = %d bp, slope = %.2f)' %
                (self.options.window_size, slope))
        else:
            # not possible to calculate best fit line
            windowAxes.set_title(
                'GC vs. Coverage\n(window size = %d bp, no best fit line)' %
                self.options.window_size)

        # Prettify plot
        for a in windowAxes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in windowAxes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in windowAxes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in windowAxes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in windowAxes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        # plot GC vs coverage for entire sequences
        gc = []
        coverage = []
        seqLen = []
        for seqId, gcInfo in gcProfile.items():
            gc.append(gcInfo[0])
            coverage.append(coverageProfile[seqId][0])
            seqLen.append(len(seqs[seqId]))

        # set marker size proportional to sequence length
        markerSize = log(array(seqLen))  # log-scale
        markerSize = (markerSize - min(markerSize)) / max(
            markerSize)  # normalize between 0 and 1
        markerSize = markerSize * 200 + 10  # normalize between 10 and 200

        seqAxes.scatter(gc,
                        coverage,
                        c=abs(array(coverage)),
                        s=markerSize,
                        lw=0.5,
                        cmap='gray_r')
        seqAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(gc) * 100))
        seqAxes.set_ylabel('Coverage (mean = %.1f)' % mean(coverage))
        seqAxes.set_title('GC vs. Coverage\nIndividual Sequences')

        # Prettify plot
        for a in seqAxes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in seqAxes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in seqAxes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in seqAxes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in seqAxes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)
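A self-contained sketch of the per-window GC calculation used above, with a tiny stand-in for baseCount so it runs on its own (window handling is simplified slightly relative to the loop above):

def baseCount(seq):
    seq = seq.upper()
    return seq.count('A'), seq.count('C'), seq.count('G'), seq.count('T')

def windowGC(seq, windowSize):
    gcs = []
    for start in range(0, len(seq) - windowSize + 1, windowSize):
        a, c, g, t = baseCount(seq[start:start + windowSize])
        gcs.append(float(g + c) / (a + c + g + t))
    return gcs

print(windowGC('ACGT' * 2500, 1000))  # ten 1 kbp windows, each 50% GC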
Example no. 45
0
    def plotOnAxes(self, binFile, coverageProfile, windowAxes, seqAxes):

        # get GC for windows
        seqs = readFasta(binFile)

        gcProfile = {}
        for seqId, seq in seqs.iteritems():
            start = 0
            end = self.options.window_size

            windowGCs = []
            while(end < len(seq)):
                a, c, g, t = baseCount(seq[start:end])
                windowGCs.append(float(g + c) / (a + c + g + t))

                start = end
                end += self.options.window_size

            a, c, g, t = baseCount(seq)
            seqGC = float(g + c) / (a + c + g + t)
            gcProfile[seqId] = [seqGC, windowGCs]

        # plot GC vs coverage for windows
        gc = []
        coverage = []
        for seqId, gcInfo in gcProfile.iteritems():
            gc += gcInfo[1]
            coverage += coverageProfile[seqId][1]

        windowAxes.scatter(gc, coverage, c=abs(array(coverage)), s=10, lw=0.5, cmap=pylab.cm.Greys)
        windowAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(gc)*100))
        windowAxes.set_ylabel('Coverage (mean = %.1f)' % mean(coverage))

        # plot linear regression line
        if len(gc) > 1:
            slope, inter = polyfit(gc, coverage,1)
            fit_fn = poly1d([slope, inter]) # fit_fn is now a function which takes in x and returns an estimate for y
            windowAxes.plot([min(gc), max(gc)], fit_fn([min(gc), max(gc)]), '--r', lw=0.5)
            windowAxes.set_title('GC vs. Coverage\n(window size = %d bp, slope = %.2f)' % (self.options.window_size, slope))
        else:
            # not possible to calculate best fit line
            windowAxes.set_title('GC vs. Coverage\n(window size = %d bp, no best fit line)' % self.options.window_size)

        # Prettify plot
        for a in windowAxes.yaxis.majorTicks:
            a.tick1On=True
            a.tick2On=False

        for a in windowAxes.xaxis.majorTicks:
            a.tick1On=True
            a.tick2On=False

        for line in windowAxes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in windowAxes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in windowAxes.spines.iteritems():
            if loc in ['right','top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        # plot GC vs coverage for entire sequences
        gc = []
        coverage = []
        seqLen = []
        for seqId, gcInfo in gcProfile.iteritems():
            gc.append(gcInfo[0])
            coverage.append(coverageProfile[seqId][0])
            seqLen.append(len(seqs[seqId]))

        # set marker size proportional to sequence length
        markerSize = log(array(seqLen)) # log-scale
        markerSize = (markerSize - min(markerSize)) / max(markerSize) # normalize between 0 and 1
        markerSize = markerSize*200 + 10 # normalize between 10 and 200

        seqAxes.scatter(gc, coverage, c=abs(array(coverage)), s=markerSize, lw=0.5, cmap=pylab.cm.Greys)
        seqAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(gc)*100))
        seqAxes.set_ylabel('Coverage (mean = %.1f)' % mean(coverage))
        seqAxes.set_title('GC vs. Coverage\nIndividual Sequences')

        # Prettify plot
        for a in seqAxes.yaxis.majorTicks:
            a.tick1On=True
            a.tick2On=False

        for a in seqAxes.xaxis.majorTicks:
            a.tick1On=True
            a.tick2On=False

        for line in seqAxes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in seqAxes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in seqAxes.spines.iteritems():
            if loc in ['right','top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)
Example no. 46
0
    def plot(self, fastaFile):
        # Set size of figure
        self.fig.clear()
        self.fig.set_size_inches(self.options.width, self.options.height)
        axes = self.fig.add_subplot(111)

        # calculate cumulative sequence length
        seqs = readFasta(fastaFile)

        seqLens = []
        for seq in seqs.values():
            seqLens.append(len(seq))

        seqLens.sort(reverse=True)
        x = np.arange(0, len(seqLens))

        y = []
        cumLen = 0
        for seqLen in seqLens:
            cumLen += seqLen
            y.append(cumLen)

        # Create plot
        axes.plot(x, y, 'k-',)
        axes.set_xlabel('Sequence index')
        axes.set_ylabel('Cumulative sequence length (Mbp)')

        # ensure y-axis includes zero
        _, end = axes.get_ylim()
        axes.set_ylim([0, end])

        # Change sequence lengths from bp to kbp
        yticks = axes.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = '%.2f' % (float(seqLen) / 1e6)
            label = label.replace('.00', '')  # remove trailing zeros
            if label[-1] == '0':
                label = label[0:-1]
            kbpLabels.append(label)
        axes.set_yticklabels(kbpLabels)

        # Prettify plot
        for a in axes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axes.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        self.fig.tight_layout(pad=1)
        self.draw()
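The cumulative curve above can also be built with numpy.cumsum once the lengths are sorted longest-first; a small sketch with toy lengths:

import numpy as np

seqLens = np.sort(np.array([48000, 12000, 9000, 2500, 800]))[::-1]  # longest first
x = np.arange(len(seqLens))
y = np.cumsum(seqLens)
print(list(zip(x, y)))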
Example no. 47
0
    def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer,
            maxEditDistPer, minQC):
        """Calculate coverage of sequences for each BAM file."""

        # determine bin assignment of each sequence
        self.logger.info('  Determining bin assignment of each sequence.')

        seqIdToBinId = {}
        seqIdToSeqLen = {}
        for binFile in binFiles:
            binId = binIdFromFilename(binFile)

            seqs = readFasta(binFile)
            for seqId, seq in seqs.iteritems():
                seqIdToBinId[seqId] = binId
                seqIdToSeqLen[seqId] = len(seq)

        # process each fasta file
        self.logger.info("  Processing %d file(s) with %d threads.\n" %
                         (len(bamFiles), self.totalThreads))

        # make sure all BAM files are sorted
        self.numFiles = len(bamFiles)
        for bamFile in bamFiles:
            if not os.path.exists(bamFile + '.bai'):
                self.logger.error(
                    '  [Error] BAM file is either unsorted or not indexed: ' +
                    bamFile + '\n')
                sys.exit()

        # calculate coverage of each BAM file
        coverageInfo = {}
        numFilesStarted = 0
        for bamFile in bamFiles:
            numFilesStarted += 1
            self.logger.info(
                '  Processing %s (%d of %d):' %
                (ntpath.basename(bamFile), numFilesStarted, len(bamFiles)))

            coverageInfo[bamFile] = mp.Manager().dict()
            coverageInfo[bamFile] = self.__processBam(bamFile, bAllReads,
                                                      minAlignPer,
                                                      maxEditDistPer, minQC,
                                                      coverageInfo[bamFile])

        # redirect output
        self.logger.info('  Writing coverage information to file.')
        oldStdOut = reassignStdOut(outFile)

        header = 'Sequence Id\tBin Id\tSequence length (bp)'
        for bamFile in bamFiles:
            header += '\tBam Id\tCoverage\tMapped reads'

        print(header)

        # get length of all seqs
        for bamFile, seqIds in coverageInfo.iteritems():
            for seqId in seqIds.keys():
                seqIdToSeqLen[seqId] = seqIds[seqId].seqLen

        # write coverage stats for all scaffolds to file
        for seqId, seqLen in seqIdToSeqLen.iteritems():
            rowStr = seqId + '\t' + seqIdToBinId.get(
                seqId, DefaultValues.UNBINNED) + '\t' + str(seqLen)
            for bamFile in bamFiles:
                bamId = binIdFromFilename(bamFile)

                if seqId in coverageInfo[bamFile]:
                    rowStr += '\t%s\t%f\t%d' % (
                        bamId, coverageInfo[bamFile][seqId].coverage,
                        coverageInfo[bamFile][seqId].mappedReads)
                else:
                    rowStr += '\t%s\t%f\t%d' % (bamId, 0, 0)

            print(rowStr)

        # restore stdout
        restoreStdOut(outFile, oldStdOut)
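The per-BAM coverage calculation itself (__processBam) is not shown. A hedged sketch of the same idea using pysam, approximating coverage as total aligned bases divided by sequence length for a sorted, indexed BAM (this is not CheckM's implementation):

import pysam

def meanCoverage(bamPath):
    """Approximate per-sequence coverage and mapped read counts from a BAM file."""
    coverage = {}
    with pysam.AlignmentFile(bamPath, 'rb') as bam:
        for ref, refLen in zip(bam.references, bam.lengths):
            alignedBases = 0
            mappedReads = 0
            for read in bam.fetch(ref):  # requires a .bai index, as checked above
                if read.is_unmapped or read.is_secondary:
                    continue
                alignedBases += read.query_alignment_length
                mappedReads += 1
            coverage[ref] = (float(alignedBases) / refLen, mappedReads)
    return coverage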
Example no. 48
0
    def __workerThread(self, tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
        """Process each data item in parallel."""

        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                break
                        
            # build marker sets for evaluating test genome
            testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet = True, genomeIdsToRemove = [testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0)
                
            # estimate completeness of unmodified genome
            unmodifiedComp = {}
            unmodifiedCont = {}
            for ms in binMarkerSets.markerSetIter():     
                hits = {}
                for mg in ms.getMarkerGenes():
                    if mg in geneDistTable[testGenomeId]:
                        hits[mg] = geneDistTable[testGenomeId][mg]
                completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True) 
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination

            # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets 
            testSeqs = readFasta(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
            testSeqLens, genomeSize = self.__seqLens(testSeqs)
            
            
            for contigLen in self.contigLens: 
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)
                        
                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)
                        
                        trueComps = []
                        trueConts = []
                        
                        numDescendants = {}
            
                        for i in xrange(0, numReplicates):
                            # generate test genome with a specific level of completeness, by randomly sampling scaffolds to remove 
                            # (this will sample >= the desired level of completeness)
                            retainedTestSeqs, trueComp = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(percentComp, testSeqLens, genomeSize)
                            trueComps.append(trueComp)
    
                            # select a random genome to use as a source of contamination
                            contGenomeId = random.sample(genomeIdsToTest - set([testGenomeId]), 1)[0]
                            contSeqs = readFasta(os.path.join(self.img.genomeDir, contGenomeId, contGenomeId + '.fna'))
                            contSeqLens, contGenomeSize = self.__seqLens(contSeqs) 
                            seqsToRetain, trueRetainedPer = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(1 - percentCont, contSeqLens, contGenomeSize) 
                            
                            contSampledSeqIds = set(contSeqs.keys()).difference(seqsToRetain)
                            trueCont = 100.0 - trueRetainedPer
                            trueConts.append(trueCont)
              
                            for ms in binMarkerSets.markerSetIter():  
                                numDescendants[ms.lineageStr] = ms.numGenomes
                                containedMarkerGenes= defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)

                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                deltaComp[ms.lineageStr].append(completeness - trueComp)
                                deltaCont[ms.lineageStr].append(contamination - trueCont)
                                
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                deltaCompSet[ms.lineageStr].append(completeness - trueComp)
                                deltaContSet[ms.lineageStr].append(contamination - trueCont)
                                
                            for ms in refinedBinMarkerSet.markerSetIter():  
                                containedMarkerGenes= defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)
                                
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                deltaCompRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContRefined[ms.lineageStr].append(contamination - trueCont)
                                
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                deltaCompSetRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContSetRefined[ms.lineageStr].append(contamination - trueCont)
                                
                        taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])
                        queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts))
Example no. 49
0
    def plot(self, f, seqIds, pc, variance):
        # ensure pc matrix has at least 3 dimensions
        if pc.shape[1] == 1:
            pc = np.append(pc, np.zeros((pc.shape[0], 2)), 1)
            variance = np.append(variance[0], np.ones(2))
        elif pc.shape[1] == 2:
            pc = np.append(pc, np.zeros((pc.shape[0], 1)), 1)
            variance = np.append(variance[0:2], np.ones(1))

        # Set size of figure
        self.fig.clear()
        self.fig.set_size_inches(self.options.width, self.options.height)

        axesPC1vsPC2 = self.fig.add_subplot(221)
        axesPC2vsPC3 = self.fig.add_subplot(222)
        axesPC1vsPC3 = self.fig.add_subplot(223)
        axesVariance = self.fig.add_subplot(224)

        # get sequence in bin
        seqs = readFasta(f)

        binIndices = []
        for rowIndex, seqId in enumerate(seqIds):
            if seqId in seqs.keys():
                binIndices.append(rowIndex)

        # plot sequence in bin
        axesPC1vsPC2.scatter(pc[:, 0],
                             pc[:, 1],
                             s=10,
                             lw=0.5,
                             facecolor=(0.8, 0.8, 0.8),
                             marker="o")
        axesPC1vsPC2.scatter(pc[binIndices, 0],
                             pc[binIndices, 1],
                             s=10,
                             lw=0.5,
                             facecolor="r",
                             marker="o")
        axesPC1vsPC2.set_xlabel('PC1 (%.1f%%)' % (variance[0] * 100))
        axesPC1vsPC2.set_ylabel('PC2 (%.1f%%)' % (variance[1] * 100))

        axesPC2vsPC3.scatter(pc[:, 2],
                             pc[:, 1],
                             s=10,
                             lw=0.5,
                             facecolor=(0.8, 0.8, 0.8),
                             marker="o")
        axesPC2vsPC3.scatter(pc[binIndices, 2],
                             pc[binIndices, 1],
                             s=10,
                             lw=0.5,
                             facecolor="r",
                             marker="o")
        axesPC2vsPC3.set_xlabel('PC3 (%.1f%%)' % (variance[2] * 100))
        axesPC2vsPC3.set_ylabel('PC2 (%.1f%%)' % (variance[1] * 100))

        axesPC1vsPC3.scatter(pc[:, 0],
                             pc[:, 2],
                             s=10,
                             lw=0.5,
                             facecolor=(0.8, 0.8, 0.8),
                             marker="o")
        axesPC1vsPC3.scatter(pc[binIndices, 0],
                             pc[binIndices, 2],
                             s=10,
                             lw=0.5,
                             facecolor="r",
                             marker="o")
        axesPC1vsPC3.set_xlabel('PC1 (%.1f%%)' % (variance[0] * 100))
        axesPC1vsPC3.set_ylabel('PC3 (%.1f%%)' % (variance[2] * 100))

        axesVariance.plot(
            np.arange(len(variance), dtype=int) + 1, np.cumsum(variance))
        axesVariance.set_xlabel('Principal Component')
        axesVariance.set_ylabel('Percentage of Cumulative Variance')
        # axesVariance.vlines(3, 0, 1.0, linestyle='dashed', color=self.axesColour, zorder=0, lw=0.5)
        axesVariance.set_ylim([0, 1.02])
        axesVariance.set_xlim([0, len(variance)])

        axesVariance.get_xaxis().set_major_locator(MaxNLocator(integer=True))
        xticks = axesVariance.get_xticks()
        if 0 in xticks and 1 not in xticks:
            xticks = np.append(np.array([1]), xticks[1:])
        axesVariance.set_xticks(xticks)

        # Prettify plot
        for axes in [axesPC1vsPC2, axesPC2vsPC3, axesPC1vsPC3, axesVariance]:
            for a in axes.yaxis.majorTicks:
                a.tick1On = True
                a.tick2On = False

            for a in axes.xaxis.majorTicks:
                a.tick1On = True
                a.tick2On = False

            for line in axes.yaxis.get_ticklines():
                line.set_color(self.axesColour)

            for line in axes.xaxis.get_ticklines():
                line.set_color(self.axesColour)

            for loc, spine in axes.spines.iteritems():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(self.axesColour)

        self.fig.tight_layout(pad=1, w_pad=2, h_pad=2)
        self.draw()
Example no. 50
0
    def report(self, binFiles1, binFiles2, seqFile, outputFile):
        # determine total number of sequences
        seqs = readFasta(seqFile)

        seqLens = {}
        totalBases = 0
        numSeq1K = 0
        totalBases1K = 0
        numSeq5K = 0
        totalBases5K = 0
        for seqId, seq in seqs.iteritems():
            seqLen = len(seq)
            seqLens[seqId] = seqLen
            totalBases += seqLen
            if seqLen >= 1000:
                numSeq1K += 1
                totalBases1K += seqLen
            if seqLen >= 5000:
                numSeq5K += 1
                totalBases5K += seqLen


        # determine sequences in each bin
        bins1 = self.__readBins(binFiles1)
        bins2 = self.__readBins(binFiles2)

        # determine bin stats
        binStats1, totalBinnedSeqs1, totalBinnedBases1 = self.__binningStats(bins1, seqLens)
        binStats2, totalBinnedSeqs2, totalBinnedBases2  = self.__binningStats(bins2, seqLens)

        # sort bins by size
        binStats1 = sorted(binStats1.iteritems(), key = lambda x: x[1][1], reverse = True)
        binStats2 = sorted(binStats2.iteritems(), key = lambda x: x[1][1], reverse = True)

        # report summary results
        self.logger.info('')
        self.logger.info('  Total seqs = %d (%.2f Mbp)' % (len(seqs), float(totalBases)/1e6))
        self.logger.info('    # seqs > 1 kbp = %d (%.2f Mbp)' % (numSeq1K, float(totalBases1K)/1e6))
        self.logger.info('    # seqs > 5 kbp = %d (%.2f Mbp)' % (numSeq5K, float(totalBases5K)/1e6))
        self.logger.info('')
        self.logger.info('  Binned seqs statistics:')
        self.logger.info('    1) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%)' % (len(bins1), totalBinnedSeqs1, float(totalBinnedSeqs1)*100 / len(seqs), float(totalBinnedBases1)/1e6, float(totalBinnedBases1)*100/totalBases))
        self.logger.info('    2) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%)' % (len(bins2), totalBinnedSeqs2, float(totalBinnedSeqs2)*100 / len(seqs), float(totalBinnedBases2)/1e6, float(totalBinnedBases2)*100/totalBases))

        # output report
        fout = open(outputFile, 'w')
        for data in binStats2:
            fout.write('\t' + data[0])
        fout.write('\tunbinned\t% bases in common\t% seqs in common\tBest match\t# seqs\t# bases (Mbp)\n')

        totalSeqsInCommon2 = defaultdict(int)
        maxBasesInCommon2 = defaultdict(int)
        maxSeqsInCommon2 = defaultdict(int)
        bestMatchingBin2 = {}
        for data1 in binStats1:
            binId1 = data1[0]
            fout.write(binId1)

            seqs1 = bins1[binId1]

            totalSeqsInCommon = 0
            maxBasesInCommon = 0
            maxSeqsInCommon = 0
            bestMatchingBin = 'n/a'
            for data2 in binStats2:
                binId2 = data2[0]
                seqs2 = bins2[binId2]

                seqsInCommon = seqs1.intersection(seqs2)
                numSeqsInCommon = len(seqsInCommon)
                fout.write('\t' + str(numSeqsInCommon))

                basesInCommon = 0
                for seqId in seqsInCommon:
                    basesInCommon += seqLens[seqId]

                if basesInCommon > maxBasesInCommon:
                    maxBasesInCommon = basesInCommon
                    maxSeqsInCommon = numSeqsInCommon
                    bestMatchingBin = binId2

                if basesInCommon > maxBasesInCommon2[binId2]:
                    maxBasesInCommon2[binId2] = basesInCommon
                    maxSeqsInCommon2[binId2] = numSeqsInCommon
                    bestMatchingBin2[binId2] = binId1

                totalSeqsInCommon += numSeqsInCommon
                totalSeqsInCommon2[binId2] += numSeqsInCommon
            fout.write('\t%d\t%.2f\t%.2f\t%s\t%d\t%.2f\n' % (len(seqs1) - totalSeqsInCommon,
                                                             float(maxBasesInCommon)*100 / data1[1][1],
                                                             float(maxSeqsInCommon)*100 / data1[1][0],
                                                             bestMatchingBin,
                                                             data1[1][0],
                                                             float(data1[1][1])/1e6))

        fout.write('unbinned')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%d' % (len(bins2[binId]) - totalSeqsInCommon2[binId]))
        fout.write('\n')

        fout.write('% bases in common')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%.2f' % (float(maxBasesInCommon2[binId])*100 / data[1][1]))
        fout.write('\n')

        fout.write('% seqs in common')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%.2f' % (float(maxSeqsInCommon2[binId])*100 / data[1][0]))
        fout.write('\n')

        fout.write('Best match')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%s' % bestMatchingBin2.get(binId, 'n/a'))
        fout.write('\n')

        fout.write('# seqs')
        for data in binStats2:
            fout.write('\t%d' % data[1][0])
        fout.write('\n')

        fout.write('# bases (Mbp)')
        for data in binStats2:
            fout.write('\t%.2f' % (float(data[1][1])/1e6))
        fout.write('\n')

        fout.close()
Example no. 51
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
        # parse Prodigal output
        gffFile = os.path.join(self.options.results_dir, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
        if not os.path.exists(gffFile):
            self.logger.error('Missing gene feature file (%s). This plot is not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
            sys.exit()

        prodigalParser = ProdigalGeneFeatureParser(gffFile)

        # Read reference distributions from file
        dist = readDistribution('cd_dist')

        # get coding density for windows
        seqs = readFasta(fastaFile)

        data = []
        seqLens = []
        for seqId, seq in seqs.iteritems():
            start = 0
            end = self.options.cd_window_size

            seqLen = len(seq)
            seqLens.append(seqLen)

            while(end < seqLen):
                codingBases = prodigalParser.codingBases(seqId, start, end)

                a, c, g, t = baseCount(seq[start:end])
                data.append(float(codingBases) / (a + c + g + t))

                start = end
                end += self.options.cd_window_size

        if len(data) == 0:
            axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
            return

        # Histogram plot
        bins = [0.0]
        binWidth = self.options.cd_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% coding density')
        axesHist.set_ylabel('% windows (' + str(self.options.cd_window_size) + ' bp)')

        # Prettify plot
        for a in axesHist.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesHist.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesHist.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesHist.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axesHist.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        # get CD bin statistics
        binTools = BinTools()
        meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

        # Delta-CD vs sequence length plot
        axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap='gray_r')
        axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
        axesDeltaCD.set_ylabel('Sequence length (kbp)')

        _, yMaxSeqs = axesDeltaCD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            closestCD = findNearest(np.array(dist.keys()), meanCD)

            # find closest distribution values
            sampleSeqLen = dist[closestCD].keys()[0]
            d = dist[closestCD][sampleSeqLen]
            cdLowerBoundKey = findNearest(d.keys(), (100 - distToPlot) / 2.0)
            cdUpperBoundKey = findNearest(d.keys(), (100 + distToPlot) / 2.0)

            xL = []
            xU = []
            y = []
            for windowSize in dist[closestCD]:
                xL.append(dist[closestCD][windowSize][cdLowerBoundKey])
                xU.append(dist[closestCD][windowSize][cdUpperBoundKey])
                y.append(windowSize)

            # sort by y-values
            sortIndexY = np.argsort(y)
            xL = np.array(xL)[sortIndexY]
            xU = np.array(xU)[sortIndexY]
            y = np.array(y)[sortIndexY]
            axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaCD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp
        yticks = axesDeltaCD.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = '%.1f' % (float(seqLen) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaCD.set_yticklabels(kbpLabels)

        # Prettify plot
        for a in axesDeltaCD.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesDeltaCD.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesDeltaCD.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesDeltaCD.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axesDeltaCD.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)
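
The window loop in the example above steps through each sequence in fixed, non-overlapping chunks of cd_window_size bases and records, per window, the fraction of called bases that fall within predicted genes. A minimal standalone sketch of that calculation, using a plain set of 0-based coding positions in place of ProdigalGeneFeatureParser (an assumption made purely for illustration; the function name is hypothetical):

def codingDensityPerWindow(seq, codingPositions, windowSize):
    # codingPositions: set of 0-based indices covered by predicted genes (assumed input)
    densities = []
    start, end = 0, windowSize
    while end < len(seq):
        window = seq[start:end].upper()
        calledBases = sum(window.count(b) for b in 'ACGT')  # ambiguous bases excluded
        codingBases = sum(1 for i in range(start, end) if i in codingPositions)
        if calledBases > 0:
            densities.append(float(codingBases) / calledBases)
        start = end
        end += windowSize
    return densities

As in plotOnAxes, any partial window at the end of a sequence is simply skipped.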
Example No. 52
0
    def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None):
        """Print out information about bin."""
        if outputFormat == 1:
            selectedMarkerSet = binMarkerSets.selectedMarkerSet()

            lineageStr = selectedMarkerSet.lineageStr
            if selectedMarkerSet.UID != '0':
                lineageStr += ' (' + str(selectedMarkerSet.UID) + ')'

            data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers)
            row = "%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (self.binId, lineageStr,
                                                selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets(),
                                                "\t".join([str(data[i]) for i in range(6)]),
                                                data[6],
                                                data[7],
                                                aai.aaiMeanBinHetero.get(self.binId, 0.0)
                                                )
            if table == None:
                print(row)
            else:
                table.add_row([self.binId, lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)])
        elif outputFormat == 2:
            selectedMarkerSet = binMarkerSets.selectedMarkerSet()

            lineageStr = selectedMarkerSet.lineageStr
            if selectedMarkerSet.UID != '0':
                lineageStr += ' (' + str(selectedMarkerSet.UID) + ')'

            data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers)

            if table == None:
                row = self.binId
                row += '\t%s\t%d\t%d\t%d' % (lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets())
                row += '\t%0.2f\t%0.2f\t%0.2f' % (data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0))
                row += '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (self.binStats['Genome size'], self.binStats['# ambiguous bases'],
                                                                         self.binStats['# scaffolds'], self.binStats['# contigs'],
                                                                         self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'],
                                                                         self.binStats['Mean scaffold length'], self.binStats['Mean contig length'],
                                                                         self.binStats['Longest scaffold'], self.binStats['Longest contig'])
                row += '\t%.1f\t%.2f' % (self.binStats['GC'] * 100, self.binStats['GC std'] * 100)
                row += '\t%.2f\t%d\t%d' % (self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes'])
                row += '\t' + '\t'.join([str(data[i]) for i in range(6)])

                if coverageBinProfiles:
                    for _, coverageStats in coverageBinProfiles[self.binId].items():
                        row += '\t%.2f\t%.2f' % (coverageStats[0], coverageStats[1])

                print(row)
            else:
                row = [self.binId, lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()]
                row.extend([data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0)])
                row.extend([self.binStats['Genome size'], self.binStats['# ambiguous bases'], self.binStats['# scaffolds'],
                                                 self.binStats['# contigs'], self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'],
                                                 int(self.binStats['Mean scaffold length']), int(self.binStats['Mean contig length']),
                                                 self.binStats['Longest scaffold'], self.binStats['Longest contig']])
                row.extend([self.binStats['GC'] * 100, self.binStats['GC std'] * 100])
                row.extend([self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes']])
                row.extend(data[0:6])

                if coverageBinProfiles:
                    for _, coverageStats in coverageBinProfiles[self.binId].items():
                        row.extend(coverageStats)

                table.add_row(row)
        elif outputFormat == 3:
            for ms in binMarkerSets.markerSetIter():
                data = self.geneCounts(ms, self.markerHits, bIndividualMarkers)
                row = "%s\t%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (self.binId, ms.UID, ms.lineageStr, ms.numGenomes,
                                                    ms.numMarkers(), ms.numSets(),
                                                    "\t".join([str(data[i]) for i in range(6)]),
                                                    data[6],
                                                    data[7],
                                                    aai.aaiMeanBinHetero.get(self.binId, 0.0)
                                                    )
                if table == None:
                    print(row)
                else:
                    table.add_row([self.binId, ms.UID, ms.lineageStr, ms.numGenomes, ms.numMarkers(), ms.numSets()] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)])

        elif outputFormat == 4:
            selectedMarkerSet = binMarkerSets.selectedMarkerSet()
            data = self.hitsToMarkerGene(binMarkerSets.selectedMarkerSet())
            row = "Node Id: %s; Marker lineage: %s" % (selectedMarkerSet.UID, selectedMarkerSet.lineageStr)
            for marker in data:
                row += '\t' + marker
            print(row)

            row = self.binId
            for count in data.values():
                row += '\t' + str(count)
            print(row)

            print()
        elif outputFormat == 5:
            # tabular of bin_id, marker, contig_id
            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()

            for marker, hit_list in self.markerHits.items():
                if marker not in markerGenes:
                    continue

                for hit in hit_list:
                    print(self.binId, marker, hit.target_name, sep='\t', end='\n')

        elif outputFormat == 6:
            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()

            seqsReported = 0
            for marker, hitList in self.markerHits.items():
                if marker not in markerGenes:
                    continue

                if len(hitList) >= 2:
                    print(self.binId, marker, sep='\t', end='\t')

                    scaffoldIds = []
                    for hit in hitList:
                        scaffoldIds.append(hit.target_name)

                    print(','.join(sorted(scaffoldIds)), end='\n')

                    seqsReported += 1

            return seqsReported

        elif outputFormat == 7:
            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()

            seqsReported = 0
            for marker, hitList in self.markerHits.items():
                if marker not in markerGenes:
                    continue

                if len(hitList) >= 2:
                    scaffoldsWithMultipleHits = set()
                    for i in range(0, len(hitList)):
                        scaffoldId = hitList[i].target_name[0:hitList[i].target_name.rfind('_')]
                        for j in range(i + 1, len(hitList)):
                            if scaffoldId == hitList[j].target_name[0:hitList[j].target_name.rfind('_')]:
                                scaffoldsWithMultipleHits.add(hitList[i].target_name)
                                scaffoldsWithMultipleHits.add(hitList[j].target_name)

                    if len(scaffoldsWithMultipleHits) >= 2:
                        print(self.binId, marker, sep='\t', end='\t')
                        print(','.join(sorted(list(scaffoldsWithMultipleHits))), end='\n')
                        seqsReported += 1

            return seqsReported

        elif outputFormat == 8:
            # tabular - print only position of marker genes
            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()

            genesWithMarkers = {}
            for marker, hit_list in self.markerHits.items():
                if marker not in markerGenes:
                    continue

                for hit in hit_list:
                    genesWithMarkers[hit.target_name] = genesWithMarkers.get(hit.target_name, []) + [hit]

            for geneId, hits in genesWithMarkers.items():
                rowStr = self.binId + '\t' + geneId
                for hit in hits:
                    rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to)
                print(rowStr)

        # Hunter Cameron, May 29, 2015 - print a fasta of marker genes
        elif outputFormat == 9:
            # tabular of bin_id, marker, contig_id

            # check for the analyze folder for later use
            if anaFolder is None:
                raise ValueError("AnaFolder must not be None for outputFormat 9")

            # ## build a dict to link target_names with marker gene alignment information
            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
            hitInfo = {}
            for marker, hit_list in self.markerHits.items():
                if marker not in markerGenes:
                    continue

                for hit in hit_list:
                    name = hit.target_name
                    hitInfo[name] = {
                            "marker": marker,
                            "ali_from": str(hit.ali_from),
                            "ali_to": str(hit.ali_to)
                            }

            # ## Open genes.faa and print the ones that were found with some descriptive info in the header
            path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"])

            # get only the seqs we need and their information as a dict
            seqs = readFasta(path_to_genes, trimHeader=False)

            filt_seqs = []
            # remove seqs without markers
            for header in seqs.keys():
                gene_name = header.split(" # ")[0]
                if gene_name in hitInfo:
                    filt_seqs.append(header)

            def sort_header(header):
                """ sorts headers by contig and gene number """
                name = header.split(" # ")[0]
                ctg_name, gene_num = name.rsplit("_", 1)
                return ctg_name, int(gene_num)

            for header in sorted(filt_seqs, key=sort_header):
                elems = header.split(" # ")
                gene_name = elems[0]

                # remove the gene number from Prodigal to get the original contig name
                contig_name, gene_num = gene_name.rsplit("_", 1)

                # parse some info about the gene from the header line
                gene_start = elems[1]
                gene_end = elems[2]
                gene_strand = elems[3]

                # if table output not specified, print FASTA
                if table != None:
                    gene_info = "geneId={};start={};end={};strand={};protlen={}".format(
                            gene_num, gene_start, gene_end, gene_strand, str(len(seqs[header])))

                    marker_info = "marker={};mstart={};mend={}".format(
                            hitInfo[gene_name]["marker"],
                            hitInfo[gene_name]["ali_from"],
                            hitInfo[gene_name]["ali_to"])

                    # new header will be the bin name, contig name, gene info, and marker info separated by spaces
                    new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info])

                    print(new_header, seqs[header], sep="\n")
                # otherwise, print a table
                else:
                    print("\t".join([
                            self.binId,
                            contig_name,
                            gene_num,
                            gene_start,
                            gene_end,
                            gene_strand,
                            str(len(seqs[header])),
                            hitInfo[gene_name]["marker"],
                            hitInfo[gene_name]["ali_from"],
                            hitInfo[gene_name]["ali_to"],
                            seqs[header]
                            ]))
        else:
            self.logger.error("Unknown output format: %d", outputFormat)

        return 0

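In the rows written above, data[0:6] hold the number of marker genes observed 0, 1, 2, 3, 4 and 5+ times, while data[6] and data[7] are the completeness and contamination estimates. CheckM derives those estimates from collocated marker sets; the sketch below deliberately ignores that weighting and only illustrates how multi-copy markers translate into contamination (the function name and its input are hypothetical, for illustration only):

def naiveCompletenessContamination(markerCopyCounts):
    # markerCopyCounts: dict of marker gene id -> number of copies found in the bin (assumed input)
    numMarkers = len(markerCopyCounts)
    if numMarkers == 0:
        return 0.0, 0.0
    present = sum(1 for c in markerCopyCounts.values() if c >= 1)
    extraCopies = sum(c - 1 for c in markerCopyCounts.values() if c > 1)
    completeness = 100.0 * present / numMarkers
    contamination = 100.0 * extraCopies / numMarkers
    return completeness, contamination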
Example No. 53
0
    def plot(self, binFile, markerGeneStats, binStats):
        binId = binIdFromFilename(binFile)

        markerGenesPerSeq, _markerGeneNum = self.getMarkerGenesPerSeq(
            markerGeneStats)

        if len(markerGenesPerSeq) == 0:
            return False

        # Get length of sequences with one or more marker genes
        seqs = readFasta(binFile)
        seqLens = {}
        longestSeq = 0
        binSize = 0
        for seqId, seq in seqs.iteritems():
            seqLen = len(seq)
            binSize += seqLen

            if seqId not in markerGenesPerSeq:
                continue

            seqLens[seqId] = seqLen
            if seqLen > longestSeq:
                longestSeq = seqLen

        sortedSeqLens = sorted(seqLens.iteritems(),
                               key=operator.itemgetter(1),
                               reverse=True)

        MAX_BINS = 100
        plotBinSize = self.roundUpToNearest100(float(longestSeq) / MAX_BINS)
        yLabels = [x[0] for x in sortedSeqLens]

        # get position of genes in bin
        prodigalFastaParser = ProdigalFastaParser()
        geneFile = os.path.join(self.options.results_dir, 'bins', binId,
                                DefaultValues.PRODIGAL_AA)
        genePos = prodigalFastaParser.genePositions(geneFile)

        # Set size of figure
        self.fig.clear()
        self.fig.set_size_inches(self.options.width, self.options.height)
        yLabelBounds = self.yLabelExtents(yLabels, self.options.font_size)

        heightBottomLabels = 0.4 + self.options.fig_padding  # inches
        widthSideLabel = yLabelBounds.width * self.options.width + self.options.fig_padding  # inches

        widthPerBin = (self.options.width - widthSideLabel -
                       self.options.fig_padding) / MAX_BINS

        titleHeight = 0.2
        HEIGHT_PER_ROW = 0.2
        height = HEIGHT_PER_ROW * len(
            sortedSeqLens
        ) + heightBottomLabels + self.options.fig_padding + titleHeight
        rowBinHeight = widthPerBin / HEIGHT_PER_ROW

        self.fig.set_size_inches(self.options.width, height)
        axes = self.fig.add_axes([widthSideLabel / self.options.width,
                                  heightBottomLabels / height,
                                  1.0 - (widthSideLabel + self.options.fig_padding) / self.options.width,
                                  1.0 - (heightBottomLabels + self.options.fig_padding + titleHeight) / height])

        # set plot axis
        axes.set_xlim([0, MAX_BINS + 0.1])
        axes.set_xlabel('Position (' + str(plotBinSize) + ' bp/bin)')

        axes.set_ylim([0, len(sortedSeqLens)])
        axes.set_yticks(np.arange(0.5, len(sortedSeqLens) + 0.5, 1.0))

        axes.set_yticklabels(yLabels)

        # legend
        colours = [(1.0, 1.0, 1.0), (127 / 255.0, 201 / 255.0, 127 / 255.0),
                   (255 / 255.0, 192 / 255.0, 134 / 255.0),
                   (190 / 255.0, 174 / 255.0, 212 / 255.0), (0.0, 0.0, 0.0)]
        discreteColourMap = mpl.colors.ListedColormap(colours)
        axisColourMap = self.fig.add_axes([
            self.options.fig_padding / self.options.width,
            self.options.fig_padding / height, 0.15,
            0.03 * (self.options.width / height)
        ])
        colourBar = mpl.colorbar.ColorbarBase(axisColourMap,
                                              cmap=discreteColourMap,
                                              norm=mpl.colors.Normalize(
                                                  vmin=0, vmax=1),
                                              orientation='horizontal',
                                              drawedges=True)
        colourBar.set_ticks([0.1, 0.3, 0.5, 0.7, 0.9])
        colourBar.set_ticklabels(['0', '1', '2', '3', '4+'])
        # colourBar.outline.set_color(self.axesColour)
        colourBar.outline.set_linewidth(0.5)
        # colourBar.dividers.set_color(self.axesColour)
        colourBar.dividers.set_linewidth(0.5)

        for a in axisColourMap.xaxis.majorTicks:
            a.tick1On = False
            a.tick2On = False

        # plot each bin
        binPosX = 0.5
        for seqId, seqLen in sortedSeqLens:
            markerCount = [0] * int(math.ceil(float(seqLen) / plotBinSize))
            for geneId, _markerGeneId, geneStartPos, _geneEndPos in markerGenesPerSeq[
                    seqId]:
                binPos = int(
                    float(genePos[geneId][0] + geneStartPos) / plotBinSize)
                markerCount[binPos] += 1

            for i in xrange(0, len(markerCount)):
                if markerCount[i] < len(colours):
                    axes.add_patch(
                        Rectangle((i + 0.1, binPosX - 0.4 * rowBinHeight),
                                  0.8,
                                  0.8 * rowBinHeight,
                                  facecolor=colours[markerCount[i]],
                                  lw=0.2))
                else:
                    axes.add_patch(
                        Rectangle((i + 0.1, binPosX - 0.4 * rowBinHeight),
                                  0.8,
                                  0.8 * rowBinHeight,
                                  facecolor=colours[-1],
                                  lw=0.2))

            binPosX += 1.0

        # set plot title
        titleStr = binId + '\n'
        titleStr += '(%.2f Mbp, %d seqs, %.2f%% complete, %.2f%% contamination)' % (
            float(binSize) / 1e6, len(seqs), binStats['Completeness'],
            binStats['Contamination'])
        axes.set_title(titleStr)

        # Prettify plot
        for a in axes.yaxis.majorTicks:
            a.tick1On = False
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)
            line.set_ms(2)

        for loc, spine in axes.spines.iteritems():
            if loc in ['left', 'right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        self.draw()

        return True
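
The plot above relies on a roundUpToNearest100 helper that is not included in the snippet; a plausible implementation, assuming it simply rounds the per-bin size up to the next multiple of 100 bp:

import math

def roundUpToNearest100(value):
    # round a (possibly fractional) length up to the next multiple of 100
    return int(math.ceil(value / 100.0)) * 100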
Example No. 54
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
        # Read reference distributions from file
        dist = readDistribution('gc_dist')

        # get GC for windows
        seqs = readFasta(fastaFile)

        data = []
        seqLens = []
        for _, seq in seqs.iteritems():
            start = 0
            end = self.options.gc_window_size

            seqLen = len(seq)
            seqLens.append(seqLen)

            while(end < seqLen):
                a, c, g, t = baseCount(seq[start:end])
                try:
                    data.append(float(g + c) / (a + c + g + t))
                except:
                    # it is possible to reach a long stretch of
                    # N's that causes a division by zero error

                    pass

                start = end
                end += self.options.gc_window_size

        if len(data) == 0:
            axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.gc_window_size)
            return

        # Histogram plot
        bins = [0.0]
        binWidth = self.options.gc_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% GC')
        axesHist.set_ylabel('% windows (' + str(self.options.gc_window_size) + ' bp)')

        # Prettify plot
        for a in axesHist.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesHist.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesHist.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesHist.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axesHist.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        # get GC bin statistics
        binTools = BinTools()
        meanGC, deltaGCs, _ = binTools.gcDist(seqs)

        # Delta-GC vs Sequence length plot
        axesDeltaGC.scatter(deltaGCs, seqLens, c=abs(deltaGCs), s=10, lw=0.5, cmap=pylab.cm.Greys)
        axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100))
        axesDeltaGC.set_ylabel('Sequence length (kbp)')

        _, yMaxSeqs = axesDeltaGC.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaGC.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            closestGC = findNearest(np.array(dist.keys()), meanGC)

            # find closest distribution values
            sampleSeqLen = dist[closestGC].keys()[0]
            d = dist[closestGC][sampleSeqLen]
            gcLowerBoundKey = findNearest(d.keys(), (100 - distToPlot) / 2.0)
            gcUpperBoundKey = findNearest(d.keys(), (100 + distToPlot) / 2.0)

            xL = []
            xU = []
            y = []
            for windowSize in dist[closestGC]:
                xL.append(dist[closestGC][windowSize][gcLowerBoundKey])
                xU.append(dist[closestGC][windowSize][gcUpperBoundKey])
                y.append(windowSize)

            # sort by y-values
            sortIndexY = np.argsort(y)
            xL = np.array(xL)[sortIndexY]
            xU = np.array(xU)[sortIndexY]
            y = np.array(y)[sortIndexY]
            axesDeltaGC.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaGC.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaGC.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaGC.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp
        yticks = axesDeltaGC.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = '%.1f' % (float(seqLen) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaGC.set_yticklabels(kbpLabels)

        # Prettify plot
        for a in axesDeltaGC.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesDeltaGC.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesDeltaGC.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesDeltaGC.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axesDeltaGC.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)
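
Both the GC and coding-density plots call a baseCount helper whose implementation is not shown in these snippets; a minimal stand-in consistent with how its return value is used above (per-nucleotide counts, so ambiguous bases drop out of the denominator):

def baseCount(seq):
    # counts of A, C, G and T; anything else (e.g. N) is ignored
    s = seq.upper()
    return s.count('A'), s.count('C'), s.count('G'), s.count('T')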
Example No. 55
0
    def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer, maxEditDistPer, minQC):
        """Calculate coverage of sequences for each BAM file."""

        # determine bin assignment of each sequence
        self.logger.info('  Determining bin assignment of each sequence.')

        seqIdToBinId = {}
        seqIdToSeqLen = {}
        for binFile in binFiles:
            binId = binIdFromFilename(binFile)

            seqs = readFasta(binFile)
            for seqId, seq in seqs.iteritems():
                seqIdToBinId[seqId] = binId
                seqIdToSeqLen[seqId] = len(seq)

        # process each fasta file
        self.logger.info("  Processing %d file(s) with %d threads.\n" % (len(bamFiles), self.totalThreads))

        # make sure all BAM files are sorted
        self.numFiles = len(bamFiles)
        for bamFile in bamFiles:
            if not os.path.exists(bamFile + '.bai'):
                self.logger.error('  [Error] BAM file is either unsorted or not indexed: ' + bamFile + '\n')
                sys.exit(1)

        # calculate coverage of each BAM file
        coverageInfo = {}
        numFilesStarted = 0
        for bamFile in bamFiles:
            numFilesStarted += 1
            self.logger.info('  Processing %s (%d of %d):' % (ntpath.basename(bamFile), numFilesStarted, len(bamFiles)))

            coverageInfo[bamFile] = mp.Manager().dict()
            coverageInfo[bamFile] = self.__processBam(bamFile, bAllReads, minAlignPer, maxEditDistPer, minQC, coverageInfo[bamFile])

        # redirect output
        self.logger.info('  Writing coverage information to file.')
        oldStdOut = reassignStdOut(outFile)

        header = 'Sequence Id\tBin Id\tSequence length (bp)'
        for bamFile in bamFiles:
            header += '\tBam Id\tCoverage\tMapped reads'

        print(header)

        # get length of all seqs
        for bamFile, seqIds in coverageInfo.iteritems():
            for seqId in seqIds.keys():
                seqIdToSeqLen[seqId] = seqIds[seqId].seqLen

        # write coverage stats for all scaffolds to file
        for seqId, seqLen in seqIdToSeqLen.iteritems():
            rowStr = seqId + '\t' + seqIdToBinId.get(seqId, DefaultValues.UNBINNED) + '\t' + str(seqLen)
            for bamFile in bamFiles:
                bamId = binIdFromFilename(bamFile)

                if seqId in coverageInfo[bamFile]:
                    rowStr += '\t%s\t%f\t%d' % (bamId, coverageInfo[bamFile][seqId].coverage, coverageInfo[bamFile][seqId].mappedReads)
                else:
                    rowStr += '\t%s\t%f\t%d' % (bamId, 0, 0)

            print(rowStr)

        # restore stdout
        restoreStdOut(outFile, oldStdOut)
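
run() only verifies that a '.bai' file sits next to each BAM before computing coverage. A slightly more permissive check, assuming pysam is available (as elsewhere in CheckM) and also accepting '.csi' indexes, could look like the following; bamIsIndexed is a hypothetical helper, not part of CheckM:

import os
import pysam

def bamIsIndexed(bamFile):
    # accept both BAI and CSI indexes, then fall back on pysam's own check
    if os.path.exists(bamFile + '.bai') or os.path.exists(bamFile + '.csi'):
        return True
    bam = pysam.AlignmentFile(bamFile, 'rb')
    try:
        return bam.has_index()
    finally:
        bam.close()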
Example No. 56
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
        # parse Prodigal output
        gffFile = os.path.join(self.options.out_folder, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
        if not os.path.exists(gffFile):
            print 'Missing gene feature file (%s). This plot is not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF
            sys.exit()

        prodigalParser = ProdigalGeneFeatureParser(gffFile)

        # Read reference distributions from file
        dist = readDistribution('cd_dist')

        # get coding density for windows
        seqs = readFasta(fastaFile)

        data = []
        seqLens = []
        for seqId, seq in seqs.iteritems():
            start = 0
            end = self.options.cd_window_size

            seqLen = len(seq)
            seqLens.append(seqLen)

            while(end < seqLen):
                codingBases = prodigalParser.codingBases(seqId, start, end)

                a, c, g, t = baseCount(seq[start:end])
                data.append(float(codingBases) / (a + c + g + t))

                start = end
                end += self.options.cd_window_size

        if len(data) == 0:
            axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
            return

        # Histogram plot
        bins = [0.0]
        binWidth = self.options.cd_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% coding density')
        axesHist.set_ylabel('% windows (' + str(self.options.cd_window_size) + ' bp)')

        # Prettify plot
        for a in axesHist.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesHist.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesHist.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesHist.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axesHist.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        # get CD bin statistics
        binTools = BinTools()
        meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

        # Delta-CD vs sequence length plot
        axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap=pylab.cm.Greys)
        axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
        axesDeltaCD.set_ylabel('Sequence length (kbp)')

        _, yMaxSeqs = axesDeltaCD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            closestCD = findNearest(np.array(dist.keys()), meanCD)

            # find closest distribution values
            sampleSeqLen = dist[closestCD].keys()[0]
            d = dist[closestCD][sampleSeqLen]
            cdLowerBoundKey = findNearest(d.keys(), (100 - distToPlot) / 2.0)
            cdUpperBoundKey = findNearest(d.keys(), (100 + distToPlot) / 2.0)

            xL = []
            xU = []
            y = []
            for windowSize in dist[closestCD]:
                xL.append(dist[closestCD][windowSize][cdLowerBoundKey])
                xU.append(dist[closestCD][windowSize][cdUpperBoundKey])
                y.append(windowSize)

            # sort by y-values
            sortIndexY = np.argsort(y)
            xL = np.array(xL)[sortIndexY]
            xU = np.array(xU)[sortIndexY]
            y = np.array(y)[sortIndexY]
            axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaCD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp
        yticks = axesDeltaCD.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = '%.1f' % (float(seqLen) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaCD.set_yticklabels(kbpLabels)

        # Prettify plot
        for a in axesDeltaCD.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesDeltaCD.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesDeltaCD.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesDeltaCD.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axesDeltaCD.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)
Esempio n. 57
0
    def printSummary(self,
                     outputFormat,
                     aai,
                     binMarkerSets,
                     bIndividualMarkers,
                     coverageBinProfiles=None,
                     table=None,
                     anaFolder=None):
        """Print out information about bin."""
        if outputFormat == 1:
            selectedMarkerSet = binMarkerSets.selectedMarkerSet()

            lineageStr = selectedMarkerSet.lineageStr
            if selectedMarkerSet.UID != '0':
                lineageStr += ' (' + str(selectedMarkerSet.UID) + ')'

            data = self.geneCountsForSelectedMarkerSet(binMarkerSets,
                                                       bIndividualMarkers)
            row = "%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (
                self.binId, lineageStr, selectedMarkerSet.numGenomes,
                selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets(),
                "\t".join([str(data[i]) for i in range(6)]), data[6], data[7],
                aai.aaiMeanBinHetero.get(self.binId, 0.0))
            if table == None:
                print(row)
            else:
                table.add_row([
                    self.binId, lineageStr, selectedMarkerSet.numGenomes,
                    selectedMarkerSet.numMarkers(),
                    selectedMarkerSet.numSets()
                ] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)])
        elif outputFormat == 2:
            selectedMarkerSet = binMarkerSets.selectedMarkerSet()

            lineageStr = selectedMarkerSet.lineageStr
            if selectedMarkerSet.UID != '0':
                lineageStr += ' (' + str(selectedMarkerSet.UID) + ')'

            data = self.geneCountsForSelectedMarkerSet(binMarkerSets,
                                                       bIndividualMarkers)

            if table == None:
                row = self.binId
                row += '\t%s\t%d\t%d\t%d' % (lineageStr,
                                             selectedMarkerSet.numGenomes,
                                             selectedMarkerSet.numMarkers(),
                                             selectedMarkerSet.numSets())
                row += '\t%0.2f\t%0.2f\t%0.2f' % (data[6], data[7],
                                                  aai.aaiMeanBinHetero.get(
                                                      self.binId, 0.0))
                row += '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (
                    self.binStats['Genome size'],
                    self.binStats['# ambiguous bases'],
                    self.binStats['# scaffolds'], self.binStats['# contigs'],
                    self.binStats['N50 (scaffolds)'],
                    self.binStats['N50 (contigs)'],
                    self.binStats['Mean scaffold length'],
                    self.binStats['Mean contig length'],
                    self.binStats['Longest scaffold'],
                    self.binStats['Longest contig'])
                row += '\t%.1f\t%.2f' % (self.binStats['GC'] * 100,
                                         self.binStats['GC std'] * 100)
                row += '\t%.2f\t%d\t%d' % (self.binStats['Coding density'] *
                                           100,
                                           self.binStats['Translation table'],
                                           self.binStats['# predicted genes'])
                row += '\t' + '\t'.join([str(data[i]) for i in range(6)])

                if coverageBinProfiles:
                    for _, coverageStats in coverageBinProfiles[
                            self.binId].items():
                        row += '\t%.2f\t%.2f' % (coverageStats[0],
                                                 coverageStats[1])

                print(row)
            else:
                row = [
                    self.binId, lineageStr, selectedMarkerSet.numGenomes,
                    selectedMarkerSet.numMarkers(),
                    selectedMarkerSet.numSets()
                ]
                row.extend([
                    data[6], data[7],
                    aai.aaiMeanBinHetero.get(self.binId, 0.0)
                ])
                row.extend([
                    self.binStats['Genome size'],
                    self.binStats['# ambiguous bases'],
                    self.binStats['# scaffolds'], self.binStats['# contigs'],
                    self.binStats['N50 (scaffolds)'],
                    self.binStats['N50 (contigs)'],
                    int(self.binStats['Mean scaffold length']),
                    int(self.binStats['Mean contig length']),
                    self.binStats['Longest scaffold'],
                    self.binStats['Longest contig']
                ])
                row.extend(
                    [self.binStats['GC'] * 100, self.binStats['GC std'] * 100])
                row.extend([
                    self.binStats['Coding density'] * 100,
                    self.binStats['Translation table'],
                    self.binStats['# predicted genes']
                ])
                row.extend(data[0:6])

                if coverageBinProfiles:
                    for _, coverageStats in coverageBinProfiles[
                            self.binId].items():
                        row.extend(coverageStats)

                table.add_row(row)
        elif outputFormat == 3:
            for ms in binMarkerSets.markerSetIter():
                data = self.geneCounts(ms, self.markerHits, bIndividualMarkers)
                row = "%s\t%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (
                    self.binId, ms.UID, ms.lineageStr, ms.numGenomes,
                    ms.numMarkers(), ms.numSets(), "\t".join(
                        [str(data[i]) for i in range(6)]), data[6], data[7],
                    aai.aaiMeanBinHetero.get(self.binId, 0.0))
                if table == None:
                    print(row)
                else:
                    table.add_row([
                        self.binId, ms.UID, ms.lineageStr, ms.numGenomes,
                        ms.numMarkers(),
                        ms.numSets()
                    ] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)])

        elif outputFormat == 4:
            selectedMarkerSet = binMarkerSets.selectedMarkerSet()
            data = self.hitsToMarkerGene(binMarkerSets.selectedMarkerSet())
            row = "Node Id: %s; Marker lineage: %s" % (
                selectedMarkerSet.UID, selectedMarkerSet.lineageStr)
            for marker in data:
                row += '\t' + marker
            print(row)

            row = self.binId
            for count in data.values():
                row += '\t' + str(count)
            print(row)

            print()
        elif outputFormat == 5:
            # tabular of bin_id, marker, contig_id
            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()

            for marker, hit_list in self.markerHits.items():
                if marker not in markerGenes:
                    continue

                for hit in hit_list:
                    print(self.binId,
                          marker,
                          hit.target_name,
                          sep='\t',
                          end='\n')

        elif outputFormat == 6:
            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()

            seqsReported = 0
            for marker, hitList in self.markerHits.items():
                if marker not in markerGenes:
                    continue

                if len(hitList) >= 2:
                    print(self.binId, marker, sep='\t', end='\t')

                    scaffoldIds = []
                    for hit in hitList:
                        scaffoldIds.append(hit.target_name)

                    print(','.join(sorted(scaffoldIds)), end='\n')

                    seqsReported += 1

            return seqsReported

        elif outputFormat == 7:
            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()

            seqsReported = 0
            for marker, hitList in self.markerHits.items():
                if marker not in markerGenes:
                    continue

                if len(hitList) >= 2:
                    scaffoldsWithMultipleHits = set()
                    for i in range(0, len(hitList)):
                        scaffoldId = hitList[i].target_name[
                            0:hitList[i].target_name.rfind('_')]
                        for j in range(i + 1, len(hitList)):
                            if scaffoldId == hitList[j].target_name[
                                    0:hitList[j].target_name.rfind('_')]:
                                scaffoldsWithMultipleHits.add(
                                    hitList[i].target_name)
                                scaffoldsWithMultipleHits.add(
                                    hitList[j].target_name)

                    if len(scaffoldsWithMultipleHits) >= 2:
                        print(self.binId, marker, sep='\t', end='\t')
                        print(','.join(sorted(
                            list(scaffoldsWithMultipleHits))),
                              end='\n')
                        seqsReported += 1

            return seqsReported

        elif outputFormat == 8:
            # tabular - print only position of marker genes
            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()

            genesWithMarkers = {}
            for marker, hit_list in self.markerHits.items():
                if marker not in markerGenes:
                    continue

                for hit in hit_list:
                    genesWithMarkers[hit.target_name] = genesWithMarkers.get(
                        hit.target_name, []) + [hit]

            for geneId, hits in genesWithMarkers.items():
                rowStr = self.binId + '\t' + geneId
                for hit in hits:
                    rowStr += '\t' + hit.query_accession + ',' + str(
                        hit.ali_from) + ',' + str(hit.ali_to)
                print(rowStr)

        # Hunter Cameron, May 29, 2015 - print a fasta of marker genes
        elif outputFormat == 9:
            # tabular of bin_id, marker, contig_id

            # check for the analyze folder for later use
            if anaFolder is None:
                raise ValueError(
                    "AnaFolder must not be None for outputFormat 9")

            # ## build a dict to link target_names with marker gene alignment information
            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
            hitInfo = {}
            for marker, hit_list in self.markerHits.items():
                if marker not in markerGenes:
                    continue

                for hit in hit_list:
                    name = hit.target_name
                    hitInfo[name] = {
                        "marker": marker,
                        "ali_from": str(hit.ali_from),
                        "ali_to": str(hit.ali_to)
                    }

            # ## Open genes.faa and print the ones that were found with some descriptive info in the header
            path_to_genes = "/".join(
                [anaFolder, "bins", self.binId, "genes.faa"])

            # get only the seqs we need and their information as a dict
            seqs = readFasta(path_to_genes, trimHeader=False)

            filt_seqs = []
            # remove seqs without markers
            for header in seqs.keys():
                gene_name = header.split(" # ")[0]
                if gene_name in hitInfo:
                    filt_seqs.append(header)

            def sort_header(header):
                """ sorts headers by contig and gene number """
                name = header.split(" # ")[0]
                ctg_name, gene_num = name.rsplit("_", 1)
                return ctg_name, int(gene_num)

            for header in sorted(filt_seqs, key=sort_header):
                elems = header.split(" # ")
                gene_name = elems[0]

                # remove the gene number from Prodigal to get the original contig name
                contig_name, gene_num = gene_name.rsplit("_", 1)

                # parse some info about the gene from the header line
                gene_start = elems[1]
                gene_end = elems[2]
                gene_strand = elems[3]

                # if table output not specified, print FASTA
                if table != None:
                    gene_info = "geneId={};start={};end={};strand={};protlen={}".format(
                        gene_num, gene_start, gene_end, gene_strand,
                        str(len(seqs[header])))

                    marker_info = "marker={};mstart={};mend={}".format(
                        hitInfo[gene_name]["marker"],
                        hitInfo[gene_name]["ali_from"],
                        hitInfo[gene_name]["ali_to"])

                    # new header will be the bin name, contig name, gene info, and marker info separated by spaces
                    new_header = ">" + " ".join(
                        [self.binId, contig_name, gene_info, marker_info])

                    print(new_header, seqs[header], sep="\n")
                # otherwise, print a table
                else:
                    print("\t".join([
                        self.binId, contig_name, gene_num, gene_start,
                        gene_end, gene_strand,
                        str(len(seqs[header])), hitInfo[gene_name]["marker"],
                        hitInfo[gene_name]["ali_from"],
                        hitInfo[gene_name]["ali_to"], seqs[header]
                    ]))
        else:
            self.logger.error("Unknown output format: %d", outputFormat)

        return 0
Example No. 58
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist,
                   axesDeltaGC):
        # Read reference distributions from file
        dist = readDistribution('gc_dist')

        # get GC for windows
        seqs = readFasta(fastaFile)

        data = []
        seqLens = []
        for _, seq in seqs.iteritems():
            start = 0
            end = self.options.gc_window_size

            seqLen = len(seq)
            seqLens.append(seqLen)

            while (end < seqLen):
                a, c, g, t = baseCount(seq[start:end])
                try:
                    data.append(float(g + c) / (a + c + g + t))
                except:
                    # it is possible to reach a long stretch of
                    # N's that causes a division by zero error

                    pass

                start = end
                end += self.options.gc_window_size

        if len(data) == 0:
            axesHist.set_xlabel(
                '[Error] No seqs >= %d, the specified window size' %
                self.options.gc_window_size)
            return

        # Histogram plot
        bins = [0.0]
        binWidth = self.options.gc_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% GC')
        axesHist.set_ylabel('% windows (' + str(self.options.gc_window_size) +
                            ' bp)')

        # Prettify plot
        for a in axesHist.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesHist.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesHist.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesHist.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axesHist.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        # get GC bin statistics
        binTools = BinTools()
        meanGC, deltaGCs, _ = binTools.gcDist(seqs)

        # Delta-GC vs Sequence length plot
        axesDeltaGC.scatter(deltaGCs,
                            seqLens,
                            c=abs(deltaGCs),
                            s=10,
                            lw=0.5,
                            cmap='gray_r')
        axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' %
                               (meanGC * 100))
        axesDeltaGC.set_ylabel('Sequence length (kbp)')

        _, yMaxSeqs = axesDeltaGC.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaGC.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            closestGC = findNearest(np.array(dist.keys()), meanGC)

            # find closest distribution values
            sampleSeqLen = dist[closestGC].keys()[0]
            d = dist[closestGC][sampleSeqLen]
            gcLowerBoundKey = findNearest(d.keys(), (100 - distToPlot) / 2.0)
            gcUpperBoundKey = findNearest(d.keys(), (100 + distToPlot) / 2.0)

            xL = []
            xU = []
            y = []
            for windowSize in dist[closestGC]:
                xL.append(dist[closestGC][windowSize][gcLowerBoundKey])
                xU.append(dist[closestGC][windowSize][gcUpperBoundKey])
                y.append(windowSize)

            # sort by y-values
            sortIndexY = np.argsort(y)
            xL = np.array(xL)[sortIndexY]
            xU = np.array(xU)[sortIndexY]
            y = np.array(y)[sortIndexY]
            axesDeltaGC.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaGC.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaGC.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaGC.vlines(0,
                           0,
                           yMaxSeqs,
                           linestyle='dashed',
                           color=self.axesColour,
                           zorder=0)

        # Change sequence lengths from bp to kbp
        yticks = axesDeltaGC.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = '%.1f' % (float(seqLen) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaGC.set_yticklabels(kbpLabels)

        # Prettify plot
        for a in axesDeltaGC.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesDeltaGC.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesDeltaGC.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesDeltaGC.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axesDeltaGC.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)
Example No. 59
0
    def plot(self, fastaFile):
        # Set size of figure
        self.fig.clear()
        self.fig.set_size_inches(self.options.width, self.options.height)
        axes = self.fig.add_subplot(111)

        # calculate cumulative sequence length
        seqs = readFasta(fastaFile)

        seqLens = []
        for seq in seqs.values():
            seqLens.append(len(seq))

        seqLens.sort(reverse=True)
        x = np.arange(0, len(seqLens))

        y = []
        cumLen = 0
        for seqLen in seqLens:
            cumLen += seqLen
            y.append(cumLen)

        # Create plot
        axes.plot(
            x,
            y,
            'k-',
        )
        axes.set_xlabel('Sequence index')
        axes.set_ylabel('Cumulative sequence length (Mbp)')

        # ensure y-axis include zero
        _, end = axes.get_ylim()
        axes.set_ylim([0, end])

        # Change sequence lengths from bp to kbp
        yticks = axes.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = '%.2f' % (float(seqLen) / 1e6)
            label = label.replace('.00', '')  # remove trailing zeros
            if label[-1] == '0':
                label = label[0:-1]
            kbpLabels.append(label)
        axes.set_yticklabels(kbpLabels)

        # Prettify plot
        for a in axes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axes.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        self.fig.tight_layout(pad=1)
        self.draw()
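
The cumulative-length curve plotted above is the same quantity that underlies the N50 statistics reported by printSummary. A standalone sketch of N50 from a list of sequence lengths (not necessarily CheckM's own implementation):

def calcN50(seqLens):
    # length of the shortest sequence in the smallest set covering >= 50% of the total length
    totalLen = sum(seqLens)
    cumLen = 0
    for seqLen in sorted(seqLens, reverse=True):
        cumLen += seqLen
        if cumLen >= 0.5 * totalLen:
            return seqLen
    return 0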