def sequenceStats(self, outDir, binFile):
    """Calculate statistics for all sequences within a bin.

    The sequences in `binFile` are read, `calculateGC` and
    `calculateSeqStats` are given the chance to fill in the per-sequence
    statistics, and ORF counts and coding bases are then added from the
    bin's amino acid gene file when that file exists.

    Parameters
    ----------
    outDir : str
        Output directory; the gene file is looked up under
        `<outDir>/bins/<binId>/`.
    binFile : str
        FASTA file with the sequences of the bin.

    Returns
    -------
    dict
        Maps each sequence id to a dict of statistics that includes the
        keys '# ORFs' and 'Coding bases' for sequences with called genes
        (or for every sequence when no gene file is available).
    """
    # read scaffolds
    seqs = readFasta(binFile)

    seqStats = {seqId: {} for seqId in seqs}

    self.calculateGC(seqs, seqStats)
    self.calculateSeqStats(seqs, seqStats)

    binId = binIdFromFilename(binFile)
    aaFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
    if os.path.exists(aaFile):
        aaGenes = readFasta(aaFile)
        for geneId, gene in aaGenes.items():
            # the sequence id is everything before the last underscore of the gene id
            seqId = geneId[0:geneId.rfind('_')]
            stats = seqStats[seqId]
            stats['# ORFs'] = stats.get('# ORFs', 0) + 1
            # each amino acid corresponds to three nucleotides
            stats['Coding bases'] = stats.get('Coding bases', 0) + len(gene) * 3
    else:
        # missing amino acid file likely indicates users used a pre-called gene file, so
        # just set some defaults
        # NOTE(review): this branch previously referenced loop variables that only
        # exist in the branch above and failed with a NameError; zero is used as
        # the default here - confirm this is the desired placeholder.
        for stats in seqStats.values():
            stats.setdefault('# ORFs', 0)
            stats.setdefault('Coding bases', 0)

    return seqStats
def sequenceStats(self, outDir, binFile):
    """Calculate statistics for all sequences within a bin.

    Reads the sequences in `binFile`, passes them to `calculateGC` and
    `calculateSeqStats` together with the per-sequence statistics dict,
    and adds ORF counts and coding bases from the bin's amino acid gene
    file when that file exists.

    Parameters
    ----------
    outDir : str
        Output directory; the gene file is looked up under
        `<outDir>/bins/<binId>/`.
    binFile : str
        FASTA file with the sequences of the bin.

    Returns
    -------
    dict
        Maps each sequence id to its dict of statistics.
    """
    # read scaffolds
    seqs = readFasta(binFile)

    seqStats = {}
    for seqId in seqs:
        seqStats[seqId] = {}

    self.calculateGC(seqs, seqStats)
    self.calculateSeqStats(seqs, seqStats)

    binId = binIdFromFilename(binFile)
    aaFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
    if os.path.exists(aaFile):
        aaGenes = readFasta(aaFile)
        # items() works on both Python 2 and Python 3 (iteritems() is Python 2 only)
        for geneId, gene in aaGenes.items():
            # the sequence id is everything before the last underscore of the gene id
            seqId = geneId[0:geneId.rfind('_')]
            seqStats[seqId]['# ORFs'] = seqStats[seqId].get('# ORFs', 0) + 1
            # each amino acid corresponds to three nucleotides
            seqStats[seqId]['Coding bases'] = seqStats[seqId].get('Coding bases', 0) + len(gene) * 3
    else:
        # missing amino acid file likely indicates users used a pre-called gene file, so
        # just set some defaults
        # NOTE(review): the previous code here used `gene`, which is undefined in
        # this branch (NameError); zero defaults are an assumption - confirm.
        for seqId in seqStats:
            seqStats[seqId].setdefault('# ORFs', 0)
            seqStats[seqId].setdefault('Coding bases', 0)

    return seqStats
def run(self, binFiles, seqFile, outSeqFile, outStatsFile, minSeqLen):
    """Write all sequences of `seqFile` that are not contained in any bin.

    Parameters
    ----------
    binFiles : iterable of str
        FASTA files of the bins.
    seqFile : str
        FASTA file with all sequences; must exist.
    outSeqFile : str
        Output FASTA file receiving the unbinned sequences.
    outStatsFile : str
        Output tab-separated file with id, length and GC percentage of
        each unbinned sequence.
    minSeqLen : int
        Unbinned sequences shorter than this are not reported.
    """
    checkFileExists(seqFile)

    # get list of sequences in bins
    self.logger.info(' Reading binned sequences.')

    binnedSeqs = {}
    totalBinnedBases = 0
    for binFile in binFiles:
        seqs = readFasta(binFile)
        binnedSeqs.update(seqs)
        totalBinnedBases += sum(len(seq) for seq in seqs.values())

    self.logger.info(' Read %d (%.2f Mbp) binned sequences.' % (len(binnedSeqs), float(totalBinnedBases) / 1e6))

    # get list of all sequences
    self.logger.info(' Reading all sequences.')
    allSeqs = readFasta(seqFile)
    totalBases = sum(len(seq) for seq in allSeqs.values())
    self.logger.info(' Read %d (%.2f Mbp) sequences.' % (len(allSeqs), float(totalBases) / 1e6))

    # write all unbinned sequences
    self.logger.info(' Identifying unbinned sequences >= %d bp.' % minSeqLen)

    unbinnedCount = 0
    unbinnedBases = 0
    # the context manager closes both files even if writing fails part way
    with open(outSeqFile, 'w') as seqOut, open(outStatsFile, 'w') as statsOut:
        statsOut.write('Sequence Id\tLength\tGC\n')

        for seqId, seq in allSeqs.items():
            if seqId in binnedSeqs or len(seq) < minSeqLen:
                continue

            unbinnedCount += 1
            seqOut.write('>' + seqId + '\n')
            seqOut.write(seq + '\n')

            unbinnedBases += len(seq)

            # baseCount is expected to return four counts in the order a, c, g, t
            a, c, g, t = baseCount(seq)
            acgt = a + c + g + t
            # a sequence without any counted base would otherwise divide by zero
            gcPercent = float(g + c) * 100 / acgt if acgt else 0.0

            statsOut.write('%s\t%d\t%.2f\n' % (seqId, len(seq), gcPercent))

    self.logger.info(' Identified %d (%.2f Mbp) unbinned sequences.' % (unbinnedCount, float(unbinnedBases) / 1e6))

    # guard the percentages against an empty sequence file
    percSeqs = unbinnedCount * 100.0 / len(allSeqs) if allSeqs else 0.0
    percBases = unbinnedBases * 100.0 / totalBases if totalBases else 0.0

    self.logger.info('')
    self.logger.info(' Percentage of unbinned sequences: %.2f%%' % percSeqs)
    self.logger.info(' Percentage of unbinned bases: %.2f%%' % percBases)
def run(self, binFiles, seqFile, outSeqFile, outStatsFile, minSeqLen):
    """Identify the sequences of `seqFile` that are absent from all bins.

    Unbinned sequences of at least `minSeqLen` bases are written to
    `outSeqFile` in FASTA format, and their id, length and GC percentage
    are written to the tab-separated `outStatsFile`. Summary counts are
    reported through `self.logger`.

    Parameters
    ----------
    binFiles : iterable of str
        FASTA files of the bins.
    seqFile : str
        FASTA file with all sequences; must exist.
    outSeqFile : str
        Output FASTA file.
    outStatsFile : str
        Output statistics file.
    minSeqLen : int
        Minimum length of a reported sequence.
    """
    checkFileExists(seqFile)

    # get list of sequences in bins
    self.logger.info('Reading binned sequences.')

    binnedSeqs = {}
    totalBinnedBases = 0
    for binFile in binFiles:
        seqs = readFasta(binFile)
        binnedSeqs.update(seqs)
        for seq in seqs.values():
            totalBinnedBases += len(seq)

    self.logger.info(' Read %d (%.2f Mbp) binned sequences.' % (len(binnedSeqs), float(totalBinnedBases) / 1e6))

    # get list of all sequences
    self.logger.info('Reading all sequences.')
    allSeqs = readFasta(seqFile)
    totalBases = 0
    for seq in allSeqs.values():
        totalBases += len(seq)
    self.logger.info(' Read %d (%.2f Mbp) sequences.' % (len(allSeqs), float(totalBases) / 1e6))

    # write all unbinned sequences
    self.logger.info('Identifying unbinned sequences >= %d bp.' % minSeqLen)

    unbinnedCount = 0
    unbinnedBases = 0
    # both output files are closed by the context manager, also on errors
    with open(outSeqFile, 'w') as seqOut, open(outStatsFile, 'w') as statsOut:
        statsOut.write('Sequence Id\tLength\tGC\n')

        # items() works on both Python 2 and Python 3
        for seqId, seq in allSeqs.items():
            if seqId not in binnedSeqs and len(seq) >= minSeqLen:
                unbinnedCount += 1
                seqOut.write('>' + seqId + '\n')
                seqOut.write(seq + '\n')

                unbinnedBases += len(seq)

                # baseCount is expected to return four counts in the order a, c, g, t
                a, c, g, t = baseCount(seq)
                totalCounted = a + c + g + t
                if totalCounted:
                    gc = float(g + c) * 100 / totalCounted
                else:
                    # no counted bases at all; avoid a division by zero
                    gc = 0.0

                statsOut.write('%s\t%d\t%.2f\n' % (seqId, len(seq), gc))

    self.logger.info(' Identified %d (%.2f Mbp) unbinned sequences.' % (unbinnedCount, float(unbinnedBases) / 1e6))

    # an empty sequence file would otherwise raise ZeroDivisionError here
    percSeqs = unbinnedCount * 100.0 / len(allSeqs) if allSeqs else 0.0
    percBases = unbinnedBases * 100.0 / totalBases if totalBases else 0.0

    self.logger.info('Percentage of unbinned sequences: %.2f%%' % percSeqs)
    self.logger.info('Percentage of unbinned bases: %.2f%%' % percBases)
def modify(self, binFile, seqFile, seqsToAdd, seqsToRemove, outputFile):
    """Add and remove sequences from a file.

    Parameters
    ----------
    binFile : str
        FASTA file of the bin to modify.
    seqFile : str
        FASTA file providing the sequences that can be added; only read
        when `seqsToAdd` is given.
    seqsToAdd : collection or None
        Sequences to add to the bin; None skips the addition step.
    seqsToRemove : collection or None
        Sequences to remove from the bin; None skips the removal step.
    outputFile : str
        File the modified bin is written to.
    """
    binSeqs = readFasta(binFile)

    # add sequences to bin
    if seqsToAdd is not None:
        refSeqs = readFasta(seqFile)
        self.__addSeqs(binSeqs, refSeqs, seqsToAdd)

    # remove sequences from bin
    if seqsToRemove is not None:
        self.__removeSeqs(binSeqs, seqsToRemove)

    # save modified bin
    writeFasta(binSeqs, outputFile)
def __init__(self, binningIndex, completeness, contamination, binFile):
    """Store the given binning index, completeness and contamination and load the bin.

    The bin id is derived from `binFile` with `binIdFromFilename`, and the
    sequences of the bin are read from `binFile` with `readFasta`.
    """
    self.binFile = binFile

    self.binningIndex, self.completeness, self.contamination = (
        binningIndex,
        completeness,
        contamination,
    )

    # derived from the bin file itself
    self.binId = binIdFromFilename(binFile)
    self.seqs = readFasta(binFile)
def __runHmmAlign(self, allTrustedGenomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
    """Run each marker gene in a separate thread.

    Marker ids are taken from `queueIn` until a None sentinel is received.
    For each marker, the genes assigned to it in every trusted genome are
    written to `<modelName>.faa`, aligned to the marker's HMM and the
    alignment is masked. The model name is put on `queueOut` when done.

    Parameters
    ----------
    allTrustedGenomeIds : iterable of str
        Genome ids whose genes are collected.
    genesInGenomes : dict
        Maps genome id -> marker id -> gene ids.
    outputGeneDir : str
        Directory receiving the gene and alignment files.
    outputModelDir : str
        Directory containing `<modelName>.hmm`.
    queueIn, queueOut : queue-like
        Work queue of marker ids and queue for finished model names.
    """
    while True:
        markerId = queueIn.get(block=True, timeout=None)
        if markerId is None:
            # sentinel: no more markers to process
            break

        modelName = markerId
        if modelName.startswith('pfam'):
            modelName = modelName.replace('pfam', 'PF')

        markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
        # the context manager closes the file even if a genome cannot be read
        with open(markerSeqFile, 'w') as fout:
            for genomeId in allTrustedGenomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')

        alignFile = os.path.join(outputGeneDir, modelName + '.aln.faa')

        hmmer = HMMERRunner('align')
        hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, alignFile, trim=False, outputFormat='Pfam')
        self.__maskAlignment(alignFile, os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))

        queueOut.put(modelName)
def run(self, aaiStrainThreshold, outDir, alignmentOutputFile):
    """Calculate AAI between input alignments.

    For every bin directory under `<outDir>/storage/aai_qa`, each
    `*.masked.faa` file is read and the AAI of every pair of sequences in
    it is appended to `self.aaiRawScores[binId][markerId]`. Afterwards the
    strain heterogeneity is derived with `self.strainHetero`.

    Parameters
    ----------
    aaiStrainThreshold : number
        Threshold passed on to `self.strainHetero`.
    outDir : str
        Output directory holding `storage/aai_qa`.
    alignmentOutputFile : str or None
        When set, the compared sequence pairs and their AAI are written
        to this file.

    Raises
    ------
    SystemExit
        If two sequences of the same file carry different bin ids.
    """
    self.logger.info('Calculating AAI between multi-copy marker genes.')

    fout = open(alignmentOutputFile, 'w') if alignmentOutputFile else None

    # try/finally makes sure the report is closed even when sys.exit() is hit
    try:
        # calculate AAI for duplicate marker genes
        binIds = getBinIdsFromOutDir(outDir)
        aaiOutputDir = os.path.join(outDir, 'storage', 'aai_qa')
        for binId in binIds:
            binPath = os.path.join(aaiOutputDir, binId)
            if not os.path.exists(binPath):
                continue

            for f in os.listdir(binPath):
                if not f.endswith('.masked.faa'):
                    continue

                markerId = f[0:f.find('.')]

                seqs = readFasta(os.path.join(binPath, f))

                # materialise the ids once instead of once per loop iteration
                seqIds = list(seqs.keys())

                # calculate AAI between all pairs of seqs
                for i, seqIdI in enumerate(seqIds):
                    binIdI = seqIdI[0:seqIdI.find(DefaultValues.SEQ_CONCAT_CHAR)]
                    seqI = seqs[seqIdI]

                    for seqIdJ in seqIds[i + 1:]:
                        binIdJ = seqIdJ[0:seqIdJ.find(DefaultValues.SEQ_CONCAT_CHAR)]
                        seqJ = seqs[seqIdJ]

                        if binIdI != binIdJ:
                            # something is wrong as the bin Ids should always be the same
                            self.logger.error(' [Error] Bin ids do not match.')
                            sys.exit(1)

                        aai = self.aai(seqI, seqJ)

                        if fout:
                            fout.write(binId + ',' + markerId + '\n')
                            fout.write(seqIdI + '\t' + seqI + '\n')
                            fout.write(seqIdJ + '\t' + seqJ + '\n')
                            fout.write('AAI: %.3f\n' % aai)
                            fout.write('\n')

                        if binIdI not in self.aaiRawScores:
                            self.aaiRawScores[binIdI] = defaultdict(list)
                        self.aaiRawScores[binIdI][markerId].append(aai)
    finally:
        if fout:
            fout.close()

    # calculate strain heterogeneity for each marker gene in each bin
    self.aaiHetero, self.aaiMeanBinHetero = self.strainHetero(self.aaiRawScores, aaiStrainThreshold)
def removeOutliers(self, binFile, outlierFile, outputFile):
    """Remove sequences specified as outliers in the provided file.

    Parameters
    ----------
    binFile : str
        FASTA file of the bin to modify.
    outlierFile : str
        Tab-separated file with a header line; the first column holds a
        bin id and the second column a sequence id. Must exist.
    outputFile : str
        File the modified bin is written to.
    """
    binSeqs = readFasta(binFile)
    binIdToModify = binIdFromFilename(binFile)

    # get files to remove
    checkFileExists(outlierFile)
    seqsToRemove = []
    # the context manager closes the outlier file once it has been parsed
    with open(outlierFile) as fin:
        next(fin, None)  # skip the header line (if any)

        for line in fin:
            lineSplit = line.split('\t')
            if lineSplit[0] == binIdToModify:
                # strip the line ending in case the sequence id is the last column
                seqsToRemove.append(lineSplit[1].rstrip('\r\n'))

    # remove sequences from bin
    if seqsToRemove:
        self.__removeSeqs(binSeqs, seqsToRemove)

    # save modified bin
    writeFasta(binSeqs, outputFile)
def run(self, aaiStrainThreshold, outDir, alignmentOutputFile):
    """Calculate AAI between input alignments.

    Each `*.masked.faa` file found in a bin directory under
    `<outDir>/storage/aai_qa` is read and the AAI of all sequence pairs in
    the file is stored in `self.aaiRawScores[binId][markerId]`. The strain
    heterogeneity is then computed with `self.strainHetero`.

    Parameters
    ----------
    aaiStrainThreshold : number
        Threshold handed to `self.strainHetero`.
    outDir : str
        Output directory holding `storage/aai_qa`.
    alignmentOutputFile : str or None
        Optional file receiving the compared pairs and their AAI.

    Raises
    ------
    SystemExit
        With a non-zero status if the bin ids of a sequence pair differ.
    """
    self.logger.info(' Calculating AAI between multi-copy marker genes.')

    fout = None
    if alignmentOutputFile:
        fout = open(alignmentOutputFile, 'w')

    try:
        # calculate AAI for duplicate marker genes
        binIds = getBinIdsFromOutDir(outDir)
        aaiOutputDir = os.path.join(outDir, 'storage', 'aai_qa')
        for binId in binIds:
            binPath = os.path.join(aaiOutputDir, binId)
            if not os.path.exists(binPath):
                continue

            for f in os.listdir(binPath):
                if not f.endswith('.masked.faa'):
                    continue

                markerId = f[0:f.find('.')]

                seqs = readFasta(os.path.join(binPath, f))

                # dict key views cannot be indexed on Python 3, so take a list once
                seqIds = list(seqs.keys())

                # calculate AAI between all pairs of seqs
                for i in range(0, len(seqIds)):
                    seqIdI = seqIds[i]
                    binIdI = seqIdI[0:seqIdI.find(DefaultValues.SEQ_CONCAT_CHAR)]

                    seqI = seqs[seqIdI]

                    for j in range(i + 1, len(seqIds)):
                        seqIdJ = seqIds[j]
                        binIdJ = seqIdJ[0:seqIdJ.find(DefaultValues.SEQ_CONCAT_CHAR)]

                        seqJ = seqs[seqIdJ]

                        if binIdI == binIdJ:
                            aai = self.aai(seqI, seqJ)

                            if fout:
                                fout.write(binId + ',' + markerId + '\n')
                                fout.write(seqIdI + '\t' + seqI + '\n')
                                fout.write(seqIdJ + '\t' + seqJ + '\n')
                                fout.write('AAI: %.3f\n' % aai)
                                fout.write('\n')

                            if binIdI not in self.aaiRawScores:
                                self.aaiRawScores[binIdI] = defaultdict(list)
                            self.aaiRawScores[binIdI][markerId].append(aai)
                        else:
                            # something is wrong as the bin Ids should always be the same
                            self.logger.error(' [Error] Bin ids do not match.')
                            # exit with a failure status; a bare sys.exit() reports success
                            sys.exit(1)
    finally:
        # also reached when sys.exit() raises SystemExit
        if fout:
            fout.close()

    # calculate strain heterogeneity for each marker gene in each bin
    self.aaiHetero, self.aaiMeanBinHetero = self.strainHetero(self.aaiRawScores, aaiStrainThreshold)
def reportFullMSA(self, outDir, outFile):
    """Create MSA with all reference and bin alignments.

    Standard output is redirected with `reassignStdOut(outFile)` while
    the bin alignments stored under `<outDir>/storage/tree` and the
    reference alignments are printed; every reference sequence is
    repeated for each of its duplicate sequence ids.

    Parameters
    ----------
    outDir : str
        Output directory holding `storage/tree`.
    outFile : str
        Destination passed to `reassignStdOut` / `restoreStdOut`.
    """
    oldStdOut = reassignStdOut(outFile)

    # restore standard output even if one of the input files cannot be read
    try:
        # write bin alignments to file
        binAlignFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_CONCAT_SEQ_OUT)
        with open(binAlignFile) as fin:
            for line in fin:
                print(line.rstrip())

        # read duplicate seqs
        duplicateNodes = self.__readDuplicateSeqs()

        # write reference alignments to file
        seqs = readFasta(os.path.join(DefaultValues.PPLACER_REF_PACKAGE_FULL, DefaultValues.GENOME_TREE_FASTA))
        for seqId, seq in seqs.items():
            print('>' + seqId)
            print(seq)

            if seqId in duplicateNodes:
                for dupSeqId in duplicateNodes[seqId]:
                    print('>' + dupSeqId)
                    print(seq)
    finally:
        restoreStdOut(outFile, oldStdOut)
def __extractMarkerSeqsTopHits(self, outDir, resultsParser):
    """Extract marker sequences from top hits (assume all bins use the same HMM file).

    Parameters
    ----------
    outDir : str
        Output directory; gene files are read from `<outDir>/bins/<binId>/`.
    resultsParser : object
        Provides `results[binId].markerHits`, mapping marker ids to lists
        of hits with `target_name`, `full_e_value` and `full_score`.

    Returns
    -------
    (dict, dict)
        `markerSeqs[markerId][binId][target_name]` holds the extracted
        sequence and `markerStats[markerId][binId][target_name]` the list
        `[full_e_value, full_score]` of the top hit.
    """
    markerSeqs = defaultdict(dict)
    markerStats = defaultdict(dict)
    for binId in resultsParser.results:
        # read ORFs for bin
        aaGeneFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
        binORFs = readFasta(aaGeneFile)

        # extract ORFs hitting a marker
        for markerId, hits in resultsParser.results[binId].markerHits.items():
            markerSeqs[markerId][binId] = {}
            markerStats[markerId][binId] = {}

            if not hits:
                # nothing to report for a marker without hits
                continue

            # sort hits from highest to lowest e-value (in place, as before); the
            # most significant hit, i.e. the one with the lowest e-value, is then
            # the LAST element. Taking the first element selected the worst hit.
            hits.sort(key=lambda x: x.full_e_value, reverse=True)
            topHit = hits[-1]

            markerSeqs[markerId][binId][topHit.target_name] = self.__extractSeq(topHit.target_name, binORFs)
            markerStats[markerId][binId][topHit.target_name] = [topHit.full_e_value, topHit.full_score]

    return markerSeqs, markerStats
def __runHmmAlign(self, allTrustedGenomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
    """Run each marker gene in a separate thread.

    Consumes marker ids from `queueIn` until None is received. The genes
    of each marker are collected from all trusted genomes into
    `<modelName>.faa`, aligned against `<modelName>.hmm` and masked; the
    model name is then reported on `queueOut`.

    Parameters
    ----------
    allTrustedGenomeIds : iterable of str
        Genome ids whose genes are collected.
    genesInGenomes : dict
        Maps genome id -> marker id -> gene ids.
    outputGeneDir : str
        Directory receiving the gene and alignment files.
    outputModelDir : str
        Directory containing the HMM files.
    queueIn, queueOut : queue-like
        Input queue of marker ids and output queue of model names.
    """
    while True:
        markerId = queueIn.get(block=True, timeout=None)
        if markerId is None:
            # sentinel value signalling the end of the work queue
            break

        modelName = markerId
        if modelName.startswith('pfam'):
            modelName = modelName.replace('pfam', 'PF')

        markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
        alignedFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
        maskedFile = os.path.join(outputGeneDir, modelName + '.aln.masked.faa')

        # `with` guarantees the gene file is flushed and closed before alignment
        with open(markerSeqFile, 'w') as fout:
            for genomeId in allTrustedGenomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')

        hmmer = HMMERRunner('align')
        hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, alignedFile, trim=False, outputFormat='Pfam')
        self.__maskAlignment(alignedFile, maskedFile)

        queueOut.put(modelName)
def __extractMarkerSeqsUnique(self, outDir, resultsParser):
    """Extract marker sequences with a single unique hit.

    Returns the pair `(markerSeqs, markerStats)`. Both are keyed by
    marker id and then bin id; an entry stays empty unless the marker has
    exactly one hit in that bin, in which case it maps the hit's target
    name to the extracted sequence (`markerSeqs`) or to the list
    `[full_e_value, full_score]` (`markerStats`).
    """
    markerSeqs = defaultdict(dict)
    markerStats = defaultdict(dict)

    for binId in resultsParser.results:
        # read ORFs for bin
        binORFs = readFasta(os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA))

        # extract ORFs hitting a marker
        markerHits = resultsParser.results[binId].markerHits
        for markerId, hits in markerHits.items():
            seqsForBin = {}
            statsForBin = {}
            markerSeqs[markerId][binId] = seqsForBin
            markerStats[markerId][binId] = statsForBin

            # only record hits which are unique
            if len(hits) != 1:
                continue

            uniqueHit = hits[0]
            target = uniqueHit.target_name
            seqsForBin[target] = self.__extractSeq(target, binORFs)
            statsForBin[target] = [uniqueHit.full_e_value, uniqueHit.full_score]

    return markerSeqs, markerStats
def __extractMarkersWithMultipleHits(self, outDir, binId, resultsParser, binMarkerSet):
    """Extract markers with multiple hits within a single bin.

    Only markers that belong to the selected marker set of `binMarkerSet`
    and have at least two hits are reported. The result maps
    marker id -> bin id -> target name -> extracted sequence.
    """
    binORFs = readFasta(os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA))
    markerGenes = binMarkerSet.selectedMarkerSet().getMarkerGenes()

    markersWithMultipleHits = defaultdict(dict)
    for markerId, hits in resultsParser.results[binId].markerHits.items():
        isSelected = markerId in markerGenes
        if not isSelected or len(hits) < 2:
            continue

        # sort hits from highest to lowest e-value in order to ensure only the best hit
        # to a given target is retained
        hits.sort(key=lambda hit: hit.full_e_value, reverse=True)

        # Note: this data structure is used to mimic that used by __extractMarkerSeqsTopHits()
        seqsByTarget = {}
        markersWithMultipleHits[markerId][binId] = seqsByTarget
        for hit in hits:
            seqsByTarget[hit.target_name] = self.__extractSeq(hit.target_name, binORFs)

    return markersWithMultipleHits
def run(self, outputDir):
    """Reduce the genome tree to one randomly chosen genome per group of nearly identical genomes.

    The reduced alignment, the pruned tree (with and without internal
    labels) and a reference package created with `taxit` are written to
    `outputDir`, which is created when missing.

    Parameters
    ----------
    outputDir : str
        Directory receiving all output files.
    """
    # make sure output directory exists
    if not os.path.exists(outputDir):
        os.mkdir(outputDir)

    # remove similar taxa
    # (single-argument print() calls behave identically on Python 2 and 3)
    print('Filtering out highly similar taxa in order to reduce size of tree:')
    seqs = readFasta(self.derepConcatenatedAlignFile)

    nearlyIdentical = self.__nearlyIdenticalGenomes(seqs, outputDir)

    reducedSeqs = {}
    for s in nearlyIdentical:
        # keep a single, randomly selected representative of each group
        rndGenome = random.choice(tuple(s))
        reducedSeqs[rndGenome] = seqs[rndGenome]

    # write out reduced alignment
    reducedAlignmentFile = os.path.join(outputDir, "genome_tree.fasta")
    writeFasta(reducedSeqs, reducedAlignmentFile)

    # prune tree to retained taxa
    print('')
    print('Pruning tree:')
    tree = dendropy.Tree.get_from_path(self.tree, schema='newick', as_rooted=False, preserve_underscores=True)

    for seqId in reducedSeqs:
        node = tree.find_node_with_taxon_label(seqId)
        if not node:
            print('Missing taxa: %s' % seqId)

    # pass a real list so the call does not depend on dict view semantics
    tree.retain_taxa_with_labels(list(reducedSeqs.keys()))

    outputTree = os.path.join(outputDir, 'genome_tree.tre')
    tree.write_to_path(outputTree, schema='newick', suppress_rooting=True, unquoted_underscores=True)

    for t in tree.internal_nodes():
        t.label = None

    for t in tree.leaf_nodes():
        if t.taxon.label not in reducedSeqs:
            print('missing in sequence file: %s' % t.taxon.label)

    outputTreeWithoutLabels = os.path.join(outputDir, 'genome_tree.small.no_internal_labels.tre')
    tree.write_to_path(outputTreeWithoutLabels, schema='newick', suppress_rooting=True, unquoted_underscores=True)
    print(' Pruned tree written to: %s' % outputTree)

    # calculate model parameters for pruned tree
    print('')
    print('Determining model parameters for new tree.')
    outputTreeLog = os.path.join(outputDir, 'genome_tree.log')
    # only referenced by the disabled FastTree command below
    fastTreeOutput = os.path.join(outputDir, 'genome_tree.no_internal_labels.fasttree.tre')
    # os.system('FastTreeMP -nome -mllen -intree %s -log %s < %s > %s' % (outputTreeWithoutLabels, outputTreeLog, reducedAlignmentFile, fastTreeOutput))
    # NOTE(review): with the FastTree call disabled nothing in this method writes
    # outputTreeLog, yet it is passed to taxit below - presumably it is expected
    # to exist already; confirm.

    # calculate reference package for pruned tree
    print('')
    print('Creating reference package.')
    # NOTE: the command goes through the shell unquoted, so paths containing
    # spaces or shell metacharacters are not supported
    os.system('taxit create -l %s -P %s --aln-fasta %s --tree-stats %s --tree-file %s' % ('genome_tree_reduced', os.path.join(outputDir, 'genome_tree_reduced.refpkg'), reducedAlignmentFile, outputTreeLog, outputTree))
def __genomeSeqLens(self, genomeId):
    """Determine length of contigs/scaffolds comprising genome.

    Parameters
    ----------
    genomeId : str
        Genome whose `<genomeDir>/<genomeId>/<genomeId>.fna` file is read.

    Returns
    -------
    dict
        Maps each sequence id in the file to the length of its sequence.
    """
    genomeFile = os.path.join(self.genomeDir, genomeId, genomeId + '.fna')
    seqs = readFasta(genomeFile)

    # items() is available on both Python 2 and Python 3
    return {seqId: len(seq) for seqId, seq in seqs.items()}
def __genomeSeqLens(self, genomeId):
    """Return the length of every contig/scaffold of a genome.

    The sequences are read from `<genomeDir>/<genomeId>/<genomeId>.fna`;
    the result maps each sequence id to the length of its sequence.
    """
    fnaName = genomeId + '.fna'
    sequences = readFasta(os.path.join(self.genomeDir, genomeId, fnaName))
    return {name: len(bases) for name, bases in sequences.items()}
def plot(self, fastaFile):
    """Draw a histogram of the sequence lengths found in `fastaFile`.

    Lengths are converted to kbp and counted in ten unequal length
    classes ('<1' up to '>500'); the bars are drawn on `self.fig` and the
    figure is rendered with `self.draw()`.
    """
    # Set size of figure
    self.fig.clear()
    self.fig.set_size_inches(self.options.width, self.options.height)
    axes = self.fig.add_subplot(111)

    # calculate sequence lengths (in kb)
    seqs = readFasta(fastaFile)
    seqLens = [float(len(seq)) / 1e3 for seq in seqs.values()]

    # set unequal bin sizes (in kb)
    binEdges = [0, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1e12]
    counts, _edges = np.histogram(seqLens, bins=binEdges)
    numClasses = len(counts)

    # create histogram
    axes.bar(x=np.arange(0.1, numClasses), height=counts, width=0.8, color=(0.5, 0.5, 0.5))
    axes.set_xlabel('Sequence length (kbp)')
    axes.set_ylabel('Number sequences (out of %d)' % len(seqs))

    # ensure y-axis include zero
    upper = axes.get_ylim()[1]
    axes.set_ylim([0, upper])
    axes.get_yaxis().set_major_locator(MaxNLocator(integer=True))

    # Change sequence lengths from bp to kbp
    classLabels = ['<1', '1-2', '2-5', '5-10', '10-20', '20-50', '50-100', '100-200', '200-500', '>500']
    axes.set_xlim([0, numClasses])
    axes.set_xticks(np.arange(0.5, numClasses))
    axes.set_xticklabels(classLabels)

    # Prettify plot
    bothAxes = (axes.yaxis, axes.xaxis)
    for axis in bothAxes:
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in bothAxes:
        for tickLine in axis.get_ticklines():
            tickLine.set_color(self.axesColour)

    for loc, spine in axes.spines.items():
        # hide the right and top borders, colour the remaining ones
        spine.set_color('none' if loc in ['right', 'top'] else self.axesColour)

    self.fig.tight_layout(pad=1)
    self.draw()
def plot(self, fastaFile):
    """Plot a histogram of the sequence lengths in `fastaFile`.

    Sequence lengths are converted to kbp and counted in ten unequal
    length classes; the histogram is drawn on `self.fig` and rendered
    with `self.draw()`.

    Parameters
    ----------
    fastaFile : str
        FASTA file whose sequence lengths are plotted.
    """
    # Set size of figure
    self.fig.clear()
    self.fig.set_size_inches(self.options.width, self.options.height)
    axes = self.fig.add_subplot(111)

    # calculate sequence lengths (in kb)
    seqs = readFasta(fastaFile)

    seqLens = [float(len(seq)) / 1e3 for seq in seqs.values()]

    # set unequal bin sizes (in kb)
    bins = [0, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1e12]
    counts, _edges = np.histogram(seqLens, bins=bins)

    # create histogram
    # the bar positions and heights are passed positionally because the name of
    # the first keyword differs between matplotlib versions (`left` vs. `x`)
    axes.bar(np.arange(0.1, len(counts)), counts, width=0.8, color=(0.5, 0.5, 0.5))
    axes.set_xlabel('Sequence length (kbp)')
    axes.set_ylabel('Number sequences (out of %d)' % len(seqs))

    # ensure y-axis include zero
    _, end = axes.get_ylim()
    axes.set_ylim([0, end])
    axes.get_yaxis().set_major_locator(MaxNLocator(integer=True))

    # Change sequence lengths from bp to kbp
    axes.set_xlim([0, len(counts)])
    axes.set_xticks(np.arange(0.5, len(counts)))
    axes.set_xticklabels(['<1', '1-2', '2-5', '5-10', '10-20', '20-50', '50-100', '100-200', '200-500', '>500'])

    # Prettify plot
    for a in axes.yaxis.majorTicks:
        a.tick1On = True
        a.tick2On = False

    for a in axes.xaxis.majorTicks:
        a.tick1On = True
        a.tick2On = False

    for line in axes.yaxis.get_ticklines():
        line.set_color(self.axesColour)

    for line in axes.xaxis.get_ticklines():
        line.set_color(self.axesColour)

    # items() works on both Python 2 and Python 3
    for loc, spine in axes.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    self.fig.tight_layout(pad=1)
    self.draw()
def __createConcatenatedAlignment(self, binFiles, resultsParser, alignOutputDir):
    """Create a concatenated alignment of marker genes for each bin.

    Every `*.masked.faa` file in `alignOutputDir` is read and, for each
    bin, the alignments of all markers are concatenated in sorted marker
    order. A marker missing from a bin is filled with gap characters
    using the marker length taken from the first model set of
    `resultsParser.models`.

    Parameters
    ----------
    binFiles : list
        Not used by this method.
    resultsParser : object
        Provides `models`, mapping a key to a dict of marker id -> model
        with a `leng` attribute.
    alignOutputDir : str
        Directory with the masked alignments; also receives the output.

    Returns
    -------
    str
        Path of the concatenated alignment file.
    """
    # read alignment files
    self.logger.info(' Reading marker alignment files.')
    alignments = defaultdict(dict)
    binIds = set()
    for f in os.listdir(alignOutputDir):
        if f.endswith('.masked.faa'):
            markerId = f[0:f.find('.masked.faa')]
            seqs = readFasta(os.path.join(alignOutputDir, f))

            for seqId, seq in seqs.items():
                binId = seqId[0:seqId.find(DefaultValues.SEQ_CONCAT_CHAR)]

                alignments[markerId][binId] = seq
                binIds.add(binId)

    # get all markers and their lengths
    # (the first model set is looked up once instead of once per marker)
    firstModels = resultsParser.models[list(resultsParser.models.keys())[0]]
    markerIds = list(firstModels.keys())
    markerIdLens = {markerId: firstModels[markerId].leng for markerId in markerIds}

    # create concatenated alignment
    self.logger.info(' Concatenating alignments.')
    # collect the pieces per bin and join once, rather than growing strings
    seqParts = defaultdict(list)
    for markerId in sorted(markerIds):
        seqs = alignments[markerId]
        for binId in binIds:
            if binId in seqs:
                # append alignment
                seqParts[binId].append(seqs[binId])
            else:
                # missing gene
                seqParts[binId].append('-' * markerIdLens[markerId])

    concatenatedSeqs = {binId: ''.join(parts) for binId, parts in seqParts.items()}

    # save concatenated alignment
    concatenatedAlignFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_CONCAT_SEQ_OUT)
    writeFasta(concatenatedSeqs, concatenatedAlignFile)

    return concatenatedAlignFile
def plot(self, fastaFile):
    """Plot the Nx values of the sequences in `fastaFile`.

    The x values run from 0 to 1.0 in steps of `self.options.step_size`;
    the corresponding values come from `self.calculateNx`. The y tick
    labels are rewritten from bp to kbp and the figure is rendered with
    `self.draw()`.

    Parameters
    ----------
    fastaFile : str
        FASTA file with the sequences to analyse.
    """
    # Set size of figure
    self.fig.clear()
    self.fig.set_size_inches(self.options.width, self.options.height)
    axes = self.fig.add_subplot(111)

    # calculate Nx
    seqs = readFasta(fastaFile)
    # half a step is added to the stop value so that 1.0 itself is included
    x = np.arange(0, 1.0 + 0.5 * self.options.step_size, self.options.step_size)
    nx = self.calculateNx(x, seqs)

    # Create plot
    axes.plot(x, nx, 'ko-', ms=4)
    axes.set_xlabel('Nx')
    axes.set_ylabel('Sequence length (kbp)')

    # Change sequence lengths from bp to kbp
    yticks = axes.get_yticks()
    kbpLabels = []
    for seqLen in yticks:
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axes.set_yticklabels(kbpLabels)

    # Prettify plot
    for a in axes.yaxis.majorTicks:
        a.tick1On = True
        a.tick2On = False

    for a in axes.xaxis.majorTicks:
        a.tick1On = True
        a.tick2On = False

    for line in axes.yaxis.get_ticklines():
        line.set_color(self.axesColour)

    for line in axes.xaxis.get_ticklines():
        line.set_color(self.axesColour)

    # items() works on both Python 2 and Python 3 (iteritems() is Python 2 only)
    for loc, spine in axes.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    self.fig.tight_layout(pad=1)
    self.draw()
def calculateCodingDensity(self, outDir, genomeSize, seqStats):
    """Calculate coding density of putative genome bin.

    Returns the tuple (coding density, translation table, number of
    genes in the amino acid file), or (-1, -1, -1) when `outDir` holds no
    gene feature file.
    """
    gffFile = os.path.join(outDir, DefaultValues.PRODIGAL_GFF)
    if not os.path.exists(gffFile):
        # there is no gene feature file (perhaps the user specified precalculated genes)
        # so calculating the coding density is not possible
        return -1, -1, -1

    prodigalParserGFF = ProdigalGeneFeatureParser(gffFile)

    # use AA file as nucleotide file is optional
    aaGenes = readFasta(os.path.join(outDir, DefaultValues.PRODIGAL_AA))

    codingBasePairs = self.__calculateCodingBases(aaGenes, seqStats)
    codingDensity = float(codingBasePairs) / genomeSize

    return codingDensity, prodigalParserGFF.translationTable, len(aaGenes)
def __createConcatenatedAlignment(self, binFiles, resultsParser, alignOutputDir):
    """Create a concatenated alignment of marker genes for each bin.

    The `*.masked.faa` alignments in `alignOutputDir` are concatenated
    per bin in sorted marker order; markers missing from a bin are
    replaced by gaps of the marker's model length (`leng`), which is
    taken from the first entry of `resultsParser.models`.

    Parameters
    ----------
    binFiles : list
        Not used by this method.
    resultsParser : object
        Provides `models`, mapping a key to a dict of marker id -> model.
    alignOutputDir : str
        Directory with the masked alignments; also receives the output.

    Returns
    -------
    str
        Path of the concatenated alignment file.
    """
    # read alignment files
    self.logger.info(' Reading marker alignment files.')
    alignments = defaultdict(dict)
    files = os.listdir(alignOutputDir)
    binIds = set()
    for f in files:
        if f.endswith('.masked.faa'):
            markerId = f[0:f.find('.masked.faa')]
            seqs = readFasta(os.path.join(alignOutputDir, f))

            # items() works on both Python 2 and Python 3
            for seqId, seq in seqs.items():
                binId = seqId[0:seqId.find(DefaultValues.SEQ_CONCAT_CHAR)]

                alignments[markerId][binId] = seq
                binIds.add(binId)

    # get all markers and their lengths
    # (dict key views cannot be indexed on Python 3, hence the explicit list)
    firstModels = resultsParser.models[list(resultsParser.models.keys())[0]]
    markerIds = list(firstModels.keys())
    markerIdLens = {}
    for markerId in markerIds:
        markerIdLens[markerId] = firstModels[markerId].leng

    # create concatenated alignment
    self.logger.info(' Concatenating alignments.')
    concatenatedSeqs = {}
    for markerId in sorted(markerIds):
        seqs = alignments[markerId]
        for binId in binIds:
            if binId in seqs:
                # append alignment
                concatenatedSeqs[binId] = concatenatedSeqs.get(binId, '') + seqs[binId]
            else:
                # missing gene
                concatenatedSeqs[binId] = concatenatedSeqs.get(binId, '') + '-' * markerIdLens[markerId]

    # save concatenated alignment
    concatenatedAlignFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_CONCAT_SEQ_OUT)
    writeFasta(concatenatedSeqs, concatenatedAlignFile)

    return concatenatedAlignFile
def calculate(self, seqFile, outputFile):
    """Calculate genomic signature of each sequence.

    Every sequence of `seqFile` is queued for `self.totalThreads` worker
    processes running `__calculateResults`; a separate process running
    `__storeResults` receives the results through a second queue.

    Parameters
    ----------
    seqFile : str
        FASTA file with the sequences to process.
    outputFile : str
        Output file passed on to `__storeResults`.

    Raises
    ------
    BaseException
        Any error raised while the processes run (including
        KeyboardInterrupt) is re-raised after the child processes have
        been terminated.
    """
    self.logger.info(' Determining tetranucleotide signature of each sequence.')

    # process each sequence in parallel
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    seqs = readFasta(seqFile)

    for seqId, seq in seqs.items():
        workerQueue.put((seqId, seq))

    # one sentinel per worker so that every worker terminates
    for _ in range(self.totalThreads):
        workerQueue.put((None, None))

    # create the processes before entering the try block so the cleanup code
    # below can always refer to them
    calcProc = [mp.Process(target=self.__calculateResults, args=(workerQueue, writerQueue)) for _ in range(self.totalThreads)]
    writeProc = mp.Process(target=self.__storeResults, args=(seqFile, outputFile, len(seqs), writerQueue))

    try:
        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # all workers are done; tell the writer to finish
        writerQueue.put((None, None))
        writeProc.join()
    except BaseException:
        # make sure all processes are terminated
        # (only started processes can be terminated)
        for p in calcProc + [writeProc]:
            if p.is_alive():
                p.terminate()
        # do not hide the failure: the output would silently be incomplete
        raise
def __processBin(self, outDir, queueIn, queueOut):
    """Thread safe bin processing.

    Bin files are taken from `queueIn` until a None sentinel is received.
    For each bin the GC, sequence length and coding density statistics
    are calculated and the tuple `(binId, binStats)` is put on `queueOut`.

    Parameters
    ----------
    outDir : str
        Output directory; `<outDir>/bins/<binId>` is created when missing.
    queueIn : queue-like
        Provides the bin files to process.
    queueOut : queue-like
        Receives the results.
    """
    while True:
        binFile = queueIn.get(block=True, timeout=None)
        if binFile is None:
            # sentinel: no more bins to process
            break

        binStats = {}

        binId = binIdFromFilename(binFile)
        binDir = os.path.join(outDir, 'bins', binId)
        makeSurePathExists(binDir)

        # read scaffolds
        scaffolds = readFasta(binFile)

        # calculate GC statistics
        GC, stdGC = self.calculateGC(scaffolds)
        binStats['GC'] = GC
        binStats['GC std'] = stdGC

        # calculate statistics related to contigs and scaffolds
        (maxScaffoldLen, maxContigLen, genomeSize,
         scaffold_N50, contig_N50,
         scaffoldAvgLen, contigAvgLen,
         numContigs, numAmbiguousBases) = self.calculateSeqStats(scaffolds)

        binStats['Genome size'] = genomeSize
        binStats['# ambiguous bases'] = numAmbiguousBases
        binStats['# scaffolds'] = len(scaffolds)
        binStats['# contigs'] = numContigs
        binStats['Longest scaffold'] = maxScaffoldLen
        binStats['Longest contig'] = maxContigLen
        binStats['N50 (scaffolds)'] = scaffold_N50
        binStats['N50 (contigs)'] = contig_N50
        binStats['Mean scaffold length'] = scaffoldAvgLen
        binStats['Mean contig length'] = contigAvgLen

        # calculate coding density statistics
        codingDensity, translationTable, numORFs = self.calculateCodingDensity(binDir, scaffolds, genomeSize)
        binStats['Coding density'] = codingDensity
        binStats['Translation table'] = translationTable
        binStats['# predicted genes'] = numORFs

        queueOut.put((binId, binStats))
def __processBin(self, outDir, queueIn, queueOut):
    """Thread safe bin processing.

    Bin files are consumed from `queueIn` until None is received. For
    each bin the GC, sequence length and coding density statistics are
    calculated and the tuple `(binId, binStats, scaffoldStats)` is put on
    `queueOut`; `scaffoldStats` holds one dict per sequence and is passed
    to the individual calculation methods.

    Parameters
    ----------
    outDir : str
        Output directory; `<outDir>/bins/<binId>` is created when missing.
    queueIn : queue-like
        Provides the bin files to process.
    queueOut : queue-like
        Receives the results.
    """
    while True:
        binFile = queueIn.get(block=True, timeout=None)
        if binFile is None:
            # sentinel: no more bins to process
            break

        binStats = {}

        binId = binIdFromFilename(binFile)
        binDir = os.path.join(outDir, 'bins', binId)
        makeSurePathExists(binDir)

        # read scaffolds
        scaffolds = readFasta(binFile)
        scaffoldStats = {seqId: {} for seqId in scaffolds}

        # calculate GC statistics
        GC, stdGC = self.calculateGC(scaffolds, scaffoldStats)
        binStats['GC'] = GC
        binStats['GC std'] = stdGC

        # calculate statistics related to scaffold lengths
        (maxScaffoldLen, maxContigLen, genomeSize,
         scaffold_N50, contig_N50,
         numContigs, numAmbiguousBases) = self.calculateSeqStats(scaffolds, scaffoldStats)

        binStats['Genome size'] = genomeSize
        binStats['# ambiguous bases'] = numAmbiguousBases
        binStats['# scaffolds'] = len(scaffolds)
        binStats['# contigs'] = numContigs
        binStats['Longest scaffold'] = maxScaffoldLen
        binStats['Longest contig'] = maxContigLen
        binStats['N50 (scaffolds)'] = scaffold_N50
        binStats['N50 (contigs)'] = contig_N50

        # calculate coding density statistics
        codingDensity, translationTable, numORFs = self.calculateCodingDensity(binDir, genomeSize, scaffoldStats)
        binStats['Coding density'] = codingDensity
        binStats['Translation table'] = translationTable
        binStats['# predicted genes'] = numORFs

        queueOut.put((binId, binStats, scaffoldStats))
def calculateCodingDensity(self, outDir, scaffolds, genomeSize):
    """Calculate coding density of putative genome bin.

    The coding bases reported by the gene feature parser are summed over
    all scaffold ids and divided by `genomeSize`. Returns the tuple
    (coding density, translation table, number of genes in the amino acid
    file), or (-1, -1, -1) when `outDir` holds no gene feature file.
    """
    gffFile = os.path.join(outDir, DefaultValues.PRODIGAL_GFF)
    if not os.path.exists(gffFile):
        # there is no gene feature file (perhaps the user specified pre-calculated genes)
        # so calculating the coding density is not possible
        return -1, -1, -1

    gffParser = ProdigalGeneFeatureParser(gffFile)

    # use AA file as nucleotide file is optional
    aaGenes = readFasta(os.path.join(outDir, DefaultValues.PRODIGAL_AA))

    # self.__calculateCodingBases(aaGenes)
    codingBasePairs = sum(gffParser.codingBases(scaffoldId) for scaffoldId in scaffolds.keys())
    codingDensity = float(codingBasePairs) / genomeSize

    return codingDensity, gffParser.translationTable, len(aaGenes)
def __extractMarkerSeqsUnique(self, outDir, resultsParser):
    """Extract marker sequences with a single unique hit.

    Parameters
    ----------
    outDir : str
        Output directory; gene files are read from `<outDir>/bins/<binId>/`.
    resultsParser : object
        Provides `results[binId].markerHits`, mapping marker ids to lists
        of hits with `target_name`, `full_e_value` and `full_score`.

    Returns
    -------
    (dict, dict)
        `markerSeqs[markerId][binId]` and `markerStats[markerId][binId]`
        are empty unless the marker has exactly one hit in the bin; then
        they map the hit's target name to the extracted sequence and to
        `[full_e_value, full_score]`, respectively.
    """
    markerSeqs = defaultdict(dict)
    markerStats = defaultdict(dict)
    for binId in resultsParser.results:
        # read ORFs for bin
        aaGeneFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
        binORFs = readFasta(aaGeneFile)

        # extract ORFs hitting a marker
        # (items() works on both Python 2 and Python 3)
        for markerId, hits in resultsParser.results[binId].markerHits.items():
            markerSeqs[markerId][binId] = {}
            markerStats[markerId][binId] = {}

            # only record hits which are unique
            if len(hits) == 1:
                hit = hits[0]
                markerSeqs[markerId][binId][hit.target_name] = self.__extractSeq(hit.target_name, binORFs)
                markerStats[markerId][binId][hit.target_name] = [hit.full_e_value, hit.full_score]

    return markerSeqs, markerStats
def __extractMarkersWithMultipleHits(self, outDir, binId, resultsParser, binMarkerSet):
    """Extract markers with multiple hits within a single bin.

    Parameters
    ----------
    outDir : str
        Output directory; the gene file is read from `<outDir>/bins/<binId>/`.
    binId : str
        Bin to inspect.
    resultsParser : object
        Provides `results[binId].markerHits`, mapping marker ids to lists
        of hits.
    binMarkerSet : object
        Its selected marker set defines which markers are considered.

    Returns
    -------
    dict
        Maps marker id -> bin id -> target name -> extracted sequence for
        every considered marker with at least two hits.
    """
    markersWithMultipleHits = defaultdict(dict)

    aaGeneFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
    binORFs = readFasta(aaGeneFile)

    markerGenes = binMarkerSet.selectedMarkerSet().getMarkerGenes()
    # items() works on both Python 2 and Python 3 (iteritems() is Python 2 only)
    for markerId, hits in resultsParser.results[binId].markerHits.items():
        if markerId not in markerGenes or len(hits) < 2:
            continue

        # sort hits from highest to lowest e-value in order to ensure only the best hit
        # to a given target is retained
        hits.sort(key=lambda x: x.full_e_value, reverse=True)

        # Note: this data structure is used to mimic that used by __extractMarkerSeqsTopHits()
        markersWithMultipleHits[markerId][binId] = {}
        for hit in hits:
            markersWithMultipleHits[markerId][binId][hit.target_name] = self.__extractSeq(hit.target_name, binORFs)

    return markersWithMultipleHits
def reportFullMSA(self, outDir, outFile):
    """Create MSA with all reference and bin alignments.

    outDir: output directory holding the concatenated bin alignments under
        storage/tree/<DefaultValues.PPLACER_CONCAT_SEQ_OUT>.
    outFile: destination handed to reassignStdOut(); everything printed
        here ends up there.

    Bin alignments are written first, followed by every reference
    sequence and, under their own ids, the duplicates of each reference
    sequence.
    """
    oldStdOut = reassignStdOut(outFile)
    try:
        # write bin alignments to file
        binAlignmentFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_CONCAT_SEQ_OUT)
        with open(binAlignmentFile) as fin:
            for line in fin:
                print(line.rstrip())

        # read duplicate seqs
        duplicateNodes = self.__readDuplicateSeqs()

        # write reference alignments to file
        seqs = readFasta(os.path.join(DefaultValues.PPLACER_REF_PACKAGE, 'genome_tree.concatenated.derep.fasta'))
        for seqId, seq in seqs.items():
            print('>' + seqId)
            print(seq)

            # duplicates share the sequence of their representative
            if seqId in duplicateNodes:
                for dupSeqId in duplicateNodes[seqId]:
                    print('>' + dupSeqId)
                    print(seq)
    finally:
        # always undo the redirection, even if writing fails part-way
        restoreStdOut(outFile, oldStdOut)
def __extractMarkerSeqsTopHits(self, outDir, resultsParser):
    """Extract marker sequences from top hits (assume all bins use the same HMM file).

    outDir: output directory; the ORFs of each bin are read from
        <outDir>/bins/<binId>/<DefaultValues.PRODIGAL_AA>.
    resultsParser: object whose `results` maps bin id to an entry with a
        `markerHits` mapping of marker id -> list of hits.

    Returns (markerSeqs, markerStats), both indexed as
    [markerId][binId][target name] and holding a single entry per marker
    and bin: the hit with the lowest full-sequence e-value. Each stats
    entry is the list [full e-value, full score]. Note that each hit list
    is sorted in place.
    """
    markerSeqs = defaultdict(dict)
    markerStats = defaultdict(dict)
    for binId in resultsParser.results:
        # read ORFs for bin
        aaGeneFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
        binORFs = readFasta(aaGeneFile)

        # extract ORFs hitting a marker
        # (items() works on both Python 2 and 3; iteritems() is Python 2 only)
        for markerId, hits in resultsParser.results[binId].markerHits.items():
            markerSeqs[markerId][binId] = {}
            markerStats[markerId][binId] = {}

            if not hits:
                # nothing to report for a marker without hits
                continue

            # hits are sorted from highest to lowest e-value, so the best
            # (lowest e-value) hit is the LAST element; taking hits[0]
            # would report the weakest hit as the top hit
            hits.sort(key=lambda x: x.full_e_value, reverse=True)
            topHit = hits[-1]

            markerSeqs[markerId][binId][topHit.target_name] = self.__extractSeq(topHit.target_name, binORFs)
            markerStats[markerId][binId][topHit.target_name] = [topHit.full_e_value, topHit.full_score]

    return markerSeqs, markerStats
def calculate(self, seqFile, outputFile):
    """Calculate genomic signature of each sequence.

    seqFile: FASTA file with the sequences to process.
    outputFile: file handed to the writer process for the results.

    Every sequence is queued for `self.totalThreads` worker processes and
    a single writer process collects their results. If anything goes wrong
    (including a keyboard interrupt) all running child processes are
    terminated and the error is re-raised so that the caller does not
    continue with an incomplete output file.
    """
    self.logger.info(' Determining tetranucleotide signature of each sequence.')

    # process each sequence in parallel
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    seqs = readFasta(seqFile)
    for seqId, seq in seqs.items():
        workerQueue.put((seqId, seq))

    # one end-of-work sentinel per worker
    for _ in range(self.totalThreads):
        workerQueue.put((None, None))

    # create the processes before entering the try block so the clean-up
    # code below can always refer to them
    calcProc = [mp.Process(target=self.__calculateResults, args=(workerQueue, writerQueue)) for _ in range(self.totalThreads)]
    writeProc = mp.Process(target=self.__storeResults, args=(seqFile, outputFile, len(seqs), writerQueue))

    try:
        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # all workers are done: tell the writer to finish
        writerQueue.put((None, None))
        writeProc.join()
    except BaseException:
        # make sure all processes are terminated; terminate() must not be
        # called on a process that was never started
        for p in calcProc + [writeProc]:
            if p.is_alive():
                p.terminate()
        raise
def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDeltaTD):
    """Plot the tetranucleotide distance (TD) profile of a bin.

    A histogram of the delta-TD of fixed-size windows is drawn on
    `axesHist`, and a scatter plot of per-sequence delta-TD against
    sequence length on `axesDeltaTD`, overlaid with reference
    distributions.

    fastaFile: FASTA file with the sequences of the bin.
    tetraSigs: tetranucleotide signatures handed to BinTools
        (presumably indexed by sequence id).
    distributionsToPlot: reference bounds to draw; each one is matched to
        the nearest key available in the reference data.
    axesHist, axesDeltaTD: matplotlib axes to draw on.

    If no sequence is longer than the window size, only an error message
    is set as the x-label of `axesHist` and nothing else is drawn.
    """

    def prettify(axes):
        """Show only left/bottom ticks and spines, in the axes colour."""
        for a in axes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axes.spines.items():
            if loc in ["right", "top"]:
                spine.set_color("none")
            else:
                spine.set_color(self.axesColour)

    # Read reference distributions from file
    dist = readDistribution("td_dist")

    # get tetranucleotide signature for bin
    seqs = readFasta(fastaFile)
    binTools = BinTools()
    binSig = binTools.binTetraSig(seqs, tetraSigs)

    # get tetranucleotide distances for windows
    genomicSig = GenomicSignatures(K=4, threads=1)

    data = []
    seqLens = []
    for seq in seqs.values():
        start = 0
        end = self.options.td_window_size

        seqLen = len(seq)
        seqLens.append(seqLen)

        # only complete windows that end before the last base are used
        while end < seqLen:
            windowSig = genomicSig.seqSignature(seq[start:end])
            data.append(genomicSig.distance(windowSig, binSig))

            start = end
            end += self.options.td_window_size

    if len(data) == 0:
        axesHist.set_xlabel("[Error] No seqs >= %d, the specified window size" % self.options.td_window_size)
        return

    # Histogram plot
    bins = [0.0]
    binWidth = self.options.td_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    # NOTE(review): `normed` was removed from recent matplotlib releases in
    # favour of `density`; confirm the matplotlib version this must support
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel(r"$\Delta$ TD")
    axesHist.set_ylabel("% windows (" + str(self.options.td_window_size) + " bp)")

    # Prettify plot
    prettify(axesHist)

    # get CD bin statistics
    meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

    # Delta-TD vs Sequence length plot
    axesDeltaTD.scatter(deltaTDs, seqLens, c=abs(deltaTDs), s=10, lw=0.5, cmap="gray_r")
    axesDeltaTD.set_xlabel(r"$\Delta$ TD (mean TD = %.2f)" % meanTD)
    axesDeltaTD.set_ylabel("Sequence length (kbp)")

    _, yMaxSeqs = axesDeltaTD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaTD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # dictionary views cannot be indexed on Python 3, hence the lists
        firstWindowSize = list(dist.keys())[0]
        boundKey = findNearest(list(dist[firstWindowSize].keys()), distToPlot)

        x = []
        y = []
        for windowSize in dist:
            x.append(dist[windowSize][boundKey])
            y.append(windowSize)

        # sort by y-values
        sortIndexY = np.argsort(y)
        x = np.array(x)[sortIndexY]
        y = np.array(y)[sortIndexY]

        # make sure x-values are strictly decreasing as y increases
        # as this is conservative and visually satisfying
        for i in range(0, len(x) - 1):
            for j in range(i + 1, len(x)):
                if x[j] > x[i]:
                    if j == len(x) - 1:
                        x[j] = x[i]
                    else:
                        x[j] = (x[j - 1] + x[j + 1]) / 2  # interpolate values from neighbours

                    if x[j] > x[i]:
                        x[j] = x[i]

        axesDeltaTD.plot(x, y, "r--", lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaTD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaTD.vlines(0, 0, yMaxSeqs, linestyle="dashed", color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    yticks = axesDeltaTD.get_yticks()
    kbpLabels = []
    for seqLen in yticks:
        label = "%.1f" % (float(seqLen) / 1000)
        label = label.replace(".0", "")  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaTD.set_yticklabels(kbpLabels)

    # Prettify plot
    prettify(axesDeltaTD)
def __workerThread(self, tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
    """Process each data item in parallel.

    Test genome ids are taken from `queueIn` until a None sentinel is
    received. For every test genome, and for every combination of
    `self.contigLens`, `self.percentComps` and `self.percentConts`,
    `numReplicates` simulated genomes are built by subsampling the test
    genome and adding scaffolds from a randomly chosen contaminating
    genome. The differences between estimated and true completeness and
    contamination are collected per marker set lineage and one result
    tuple per parameter combination is put on `queueOut`.

    genomeIdsToTest: set of genome ids; the contaminating genome is drawn
        from it (excluding the test genome itself).
    metadata: indexed as metadata[genomeId]['taxonomy'].
    """
    while True:
        testGenomeId = queueIn.get(block=True, timeout=None)
        if testGenomeId is None:
            break

        # build marker sets for evaluating test genome
        testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
        binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(
            tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold,
            bMarkerSet=True, genomeIdsToRemove=[testGenomeId])

        # determine distribution of all marker genes within the test genome
        geneDistTable = self.img.geneDistTable(
            [testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0)

        # estimate completeness of unmodified genome
        unmodifiedComp = {}
        unmodifiedCont = {}
        for ms in binMarkerSets.markerSetIter():
            hits = {}
            for mg in ms.getMarkerGenes():
                if mg in geneDistTable[testGenomeId]:
                    hits[mg] = geneDistTable[testGenomeId][mg]

            completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True)
            unmodifiedComp[ms.lineageStr] = completeness
            unmodifiedCont[ms.lineageStr] = contamination

        # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets
        testSeqs = readFasta(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
        testSeqLens, genomeSize = self.__seqLens(testSeqs)

        # genomes that may act as contamination source; random.sample() no
        # longer accepts a set on recent Python versions, so the candidates
        # are materialised once as a tuple
        contCandidates = tuple(genomeIdsToTest - set([testGenomeId]))

        for contigLen in self.contigLens:
            for percentComp in self.percentComps:
                for percentCont in self.percentConts:
                    deltaComp = defaultdict(list)
                    deltaCont = defaultdict(list)
                    deltaCompSet = defaultdict(list)
                    deltaContSet = defaultdict(list)

                    deltaCompRefined = defaultdict(list)
                    deltaContRefined = defaultdict(list)
                    deltaCompSetRefined = defaultdict(list)
                    deltaContSetRefined = defaultdict(list)

                    trueComps = []
                    trueConts = []

                    numDescendants = {}

                    for _ in range(numReplicates):
                        # generate test genome with a specific level of completeness, by randomly sampling scaffolds to remove
                        # (this will sample >= the desired level of completeness)
                        retainedTestSeqs, trueComp = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(
                            percentComp, testSeqLens, genomeSize)
                        trueComps.append(trueComp)

                        # select a random genome to use as a source of contamination
                        contGenomeId = random.sample(contCandidates, 1)[0]
                        contSeqs = readFasta(os.path.join(self.img.genomeDir, contGenomeId, contGenomeId + '.fna'))
                        contSeqLens, contGenomeSize = self.__seqLens(contSeqs)
                        seqsToRetain, trueRetainedPer = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(
                            1 - percentCont, contSeqLens, contGenomeSize)

                        # the scaffolds NOT retained form the contamination
                        contSampledSeqIds = set(contSeqs.keys()).difference(seqsToRetain)
                        trueCont = 100.0 - trueRetainedPer
                        trueConts.append(trueCont)

                        for ms in binMarkerSets.markerSetIter():
                            numDescendants[ms.lineageStr] = ms.numGenomes

                            containedMarkerGenes = defaultdict(list)
                            self.markerSetBuilder.markerGenesOnScaffolds(
                                ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
                            self.markerSetBuilder.markerGenesOnScaffolds(
                                ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)

                            completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                            deltaComp[ms.lineageStr].append(completeness - trueComp)
                            deltaCont[ms.lineageStr].append(contamination - trueCont)

                            completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                            deltaCompSet[ms.lineageStr].append(completeness - trueComp)
                            deltaContSet[ms.lineageStr].append(contamination - trueCont)

                        for ms in refinedBinMarkerSet.markerSetIter():
                            containedMarkerGenes = defaultdict(list)
                            self.markerSetBuilder.markerGenesOnScaffolds(
                                ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
                            self.markerSetBuilder.markerGenesOnScaffolds(
                                ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)

                            completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                            deltaCompRefined[ms.lineageStr].append(completeness - trueComp)
                            deltaContRefined[ms.lineageStr].append(contamination - trueCont)

                            completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                            deltaCompSetRefined[ms.lineageStr].append(completeness - trueComp)
                            deltaContSetRefined[ms.lineageStr].append(contamination - trueCont)

                    taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])
                    queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy,
                                  numDescendants, unmodifiedComp, unmodifiedCont,
                                  deltaComp, deltaCont, deltaCompSet, deltaContSet,
                                  deltaCompRefined, deltaContRefined,
                                  deltaCompSetRefined, deltaContSetRefined,
                                  trueComps, trueConts))
def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDeltaTD):
    """Plot the tetranucleotide distance (TD) profile of a bin.

    A histogram of the delta-TD of fixed-size windows is drawn on
    `axesHist`, and a scatter plot of per-sequence delta-TD against
    sequence length on `axesDeltaTD`, overlaid with reference
    distributions.

    fastaFile: FASTA file with the sequences of the bin.
    tetraSigs: tetranucleotide signatures handed to BinTools
        (presumably indexed by sequence id).
    distributionsToPlot: reference bounds to draw; each one is matched to
        the nearest key available in the reference data.
    axesHist, axesDeltaTD: matplotlib axes to draw on.

    If no sequence is longer than the window size, only an error message
    is set as the x-label of `axesHist` and nothing else is drawn.
    """

    def prettify(axes):
        """Show only left/bottom ticks and spines, in the axes colour."""
        for a in axes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

    # Read reference distributions from file
    dist = readDistribution('td_dist')

    # get tetranucleotide signature for bin
    seqs = readFasta(fastaFile)
    binTools = BinTools()
    binSig = binTools.binTetraSig(seqs, tetraSigs)

    # get tetranucleotide distances for windows
    genomicSig = GenomicSignatures(K=4, threads=1)

    data = []
    seqLens = []
    for seq in seqs.values():
        start = 0
        end = self.options.td_window_size

        seqLen = len(seq)
        seqLens.append(seqLen)

        # only complete windows that end before the last base are used
        while end < seqLen:
            windowSig = genomicSig.seqSignature(seq[start:end])
            data.append(genomicSig.distance(windowSig, binSig))

            start = end
            end += self.options.td_window_size

    if len(data) == 0:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.td_window_size)
        return

    # Histogram plot
    bins = [0.0]
    binWidth = self.options.td_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    # NOTE(review): `normed` was removed from recent matplotlib releases in
    # favour of `density`; confirm the matplotlib version this must support
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel(r'$\Delta$ TD')
    axesHist.set_ylabel('% windows (' + str(self.options.td_window_size) + ' bp)')

    # Prettify plot
    prettify(axesHist)

    # get CD bin statistics
    meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

    # Delta-TD vs Sequence length plot
    axesDeltaTD.scatter(deltaTDs, seqLens, c=abs(deltaTDs), s=10, lw=0.5, cmap='gray_r')
    axesDeltaTD.set_xlabel(r'$\Delta$ TD (mean TD = %.2f)' % meanTD)
    axesDeltaTD.set_ylabel('Sequence length (kbp)')

    _, yMaxSeqs = axesDeltaTD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaTD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # dictionary views cannot be indexed on Python 3, hence the lists
        firstWindowSize = list(dist.keys())[0]
        boundKey = findNearest(list(dist[firstWindowSize].keys()), distToPlot)

        x = []
        y = []
        for windowSize in dist:
            x.append(dist[windowSize][boundKey])
            y.append(windowSize)

        # sort by y-values
        sortIndexY = np.argsort(y)
        x = np.array(x)[sortIndexY]
        y = np.array(y)[sortIndexY]

        # make sure x-values are strictly decreasing as y increases
        # as this is conservative and visually satisfying
        for i in range(0, len(x) - 1):
            for j in range(i + 1, len(x)):
                if x[j] > x[i]:
                    if j == len(x) - 1:
                        x[j] = x[i]
                    else:
                        x[j] = (x[j - 1] + x[j + 1]) / 2  # interpolate values from neighbours

                    if x[j] > x[i]:
                        x[j] = x[i]

        axesDeltaTD.plot(x, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaTD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaTD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    yticks = axesDeltaTD.get_yticks()
    kbpLabels = []
    for seqLen in yticks:
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaTD.set_yticklabels(kbpLabels)

    # Prettify plot
    prettify(axesDeltaTD)
def run(self, contigFile, binFiles, outputDir, evalueThreshold, concatenateThreshold):
    """Identify putative SSU rRNA genes and report them per bin.

    contigFile: FASTA file with all contigs/scaffolds to search.
    binFiles: FASTA files of the bins, used to assign sequences to bins.
    outputDir: directory for all results (created if missing).
    evalueThreshold: passed to the HMM search and to the hit parser.
    concatenateThreshold: passed on when collecting the hits of a sequence.

    Writes <outputDir>/ssu_summary.tsv (one row per hit) and
    <outputDir>/ssu.fna (the matched region of each sequence). Sequences
    that belong to no bin are reported under DefaultValues.UNBINNED.
    """
    # make sure output directory exists
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    # get bin id of binned contigs
    self.logger.info(' Determining bin assignment of sequences.')
    seqIdToBinId = {}
    for f in binFiles:
        binId = binIdFromFilename(f)
        seqIds = readFastaSeqIds(f)
        for seqId in seqIds:
            seqIdToBinId[seqId] = binId

    # identify 16S reads from contigs/scaffolds
    self.logger.info(' Identifying SSU rRNAs on sequences.')
    self.__hmmSearch(contigFile, evalueThreshold, os.path.join(outputDir, 'ssu'))

    # read HMM hits
    hitsPerDomain = {}
    for domain in ['archaea', 'bacteria', 'euk']:
        hits = {}

        seqInfo = self.__readHits(os.path.join(outputDir, 'ssu' + '.' + domain + '.txt'), domain, evalueThreshold)
        if len(seqInfo) > 0:
            for seqId, seqHits in seqInfo.items():
                for hit in seqHits:
                    self.__addHit(hits, seqId, hit, concatenateThreshold)

        hitsPerDomain[domain] = hits

    # find best domain hit for each sequence
    bestHits = {}
    for hits in hitsPerDomain.values():
        for seqId, info in hits.items():
            # drop the '-#<suffix>' part that marks one of several hits
            if '-#' in seqId:
                seqId = seqId[0:seqId.rfind('-#')]

            self.__addDomainHit(bestHits, seqId, info)

    # write summary file and putative SSU rRNAs to file
    summaryFile = os.path.join(outputDir, 'ssu_summary.tsv')
    seqFile = os.path.join(outputDir, 'ssu.fna')

    # the context manager closes both files even if writing fails
    with open(summaryFile, 'w') as summaryOut, open(seqFile, 'w') as seqOut:
        summaryOut.write('Bin Id\tSeq. Id\tHMM\ti-Evalue\tStart hit\tEnd hit\t16S/18S gene length\tRev. Complement\tSequence length\n')

        seqs = readFasta(contigFile)

        # group hits by the bin of the sequence they were found on
        hitsToBins = {}
        for seqId in bestHits:
            origSeqId = seqId
            if '-#' in seqId:
                seqId = seqId[0:seqId.rfind('-#')]

            if seqId in seqIdToBinId:
                binId = seqIdToBinId[seqId]
            else:
                binId = DefaultValues.UNBINNED

            seqInfo = [origSeqId] + bestHits[origSeqId]
            hitsToBins.setdefault(binId, []).append(seqInfo)

        for binId in sorted(hitsToBins):
            for seqInfo in hitsToBins[binId]:
                seqId = seqInfo[0]
                if '-#' in seqId:
                    seqId = seqId[0:seqId.rfind('-#')]
                seq = seqs[seqId]

                summaryOut.write(binId + '\t' + '\t'.join(seqInfo) + '\t' + str(len(seq)) + '\n')

                # seqInfo[3] and seqInfo[4] fill the 'Start hit' and
                # 'End hit' columns of the summary
                seqOut.write('>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqInfo[0] + '\n')
                seqOut.write(seq[int(seqInfo[3]):int(seqInfo[4])] + '\n')

    self.logger.info('')
    self.logger.info(' Identified ' + str(len(bestHits)) + ' putative SSU genes:')
    self.logger.info(' Summary of identified hits written to: ' + summaryFile)
    self.logger.info(' SSU sequences written to: ' + seqFile)
def report(self, binFiles1, binFiles2, seqFile, outputFile):
    """Compare two sets of bins and write a bin-by-bin overlap table.

    binFiles1, binFiles2: FASTA files of the two binning results.
    seqFile: FASTA file with all sequences that were binned.
    outputFile: tab-separated report. Rows are the bins of the first set,
        columns the bins of the second set, cells the number of shared
        sequences. Summary columns/rows give the unbinned count, the size
        of each bin, its best matching bin and the percentage of bases and
        sequences shared with that bin.

    Summary statistics are also written to the logger. Percentages with a
    zero denominator (no sequences, empty bins) are reported as 0.
    """

    def percent(part, whole):
        """Return part as a percentage of whole (0.0 if whole is zero)."""
        if not whole:
            return 0.0
        return float(part) * 100 / whole

    # determine total number of sequences
    self.logger.info(' Reading sequences.')
    seqs = readFasta(seqFile)

    seqLens = {}
    totalBases = 0
    numSeq1K = 0
    totalBases1K = 0
    numSeq5K = 0
    totalBases5K = 0
    for seqId, seq in seqs.items():
        seqLen = len(seq)
        seqLens[seqId] = seqLen
        totalBases += seqLen
        if seqLen >= 1000:
            numSeq1K += 1
            totalBases1K += seqLen
        if seqLen >= 5000:
            numSeq5K += 1
            totalBases5K += seqLen

    # determine sequences in each bin
    bins1 = self.__readBins(binFiles1)
    bins2 = self.__readBins(binFiles2)

    # determine bin stats
    binStats1, totalUniqueBinnedSeqs1, totalUniqueBinnedBases1, numRepeats1 = self.__binningStats(bins1, seqLens)
    binStats2, totalUniqueBinnedSeqs2, totalUniqueBinnedBases2, numRepeats2 = self.__binningStats(bins2, seqLens)

    # sort bins by size; each entry becomes (bin id, stats) where stats[0]
    # is used as the number of sequences and stats[1] as the number of bases
    binStats1 = sorted(binStats1.items(), key=lambda x: x[1][1], reverse=True)
    binStats2 = sorted(binStats2.items(), key=lambda x: x[1][1], reverse=True)

    # report summary results
    self.logger.info(' Total seqs = %d (%.2f Mbp)' % (len(seqs), float(totalBases) / 1e6))
    self.logger.info(' # seqs > 1 kbp = %d (%.2f Mbp)' % (numSeq1K, float(totalBases1K) / 1e6))
    self.logger.info(' # seqs > 5 kbp = %d (%.2f Mbp)' % (numSeq5K, float(totalBases5K) / 1e6))
    self.logger.info('')
    self.logger.info(' Binned seqs statistics:')
    self.logger.info(
        ' 1) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
        % (len(bins1), totalUniqueBinnedSeqs1, percent(totalUniqueBinnedSeqs1, len(seqs)),
           float(totalUniqueBinnedBases1) / 1e6, percent(totalUniqueBinnedBases1, totalBases), numRepeats1))
    self.logger.info(
        ' 2) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
        % (len(bins2), totalUniqueBinnedSeqs2, percent(totalUniqueBinnedSeqs2, len(seqs)),
           float(totalUniqueBinnedBases2) / 1e6, percent(totalUniqueBinnedBases2, totalBases), numRepeats2))

    # output report; the context manager closes the file even on errors
    with open(outputFile, 'w') as fout:
        for data in binStats2:
            fout.write('\t' + data[0])
        fout.write('\tunbinned\t# seqs\t# bases (Mbp)\tBest match\t% bases in common\t% seqs in common\n')

        # best match statistics for the bins of the second set, filled in
        # while walking over the bins of the first set
        maxBasesInCommon2 = defaultdict(int)
        maxSeqsInCommon2 = defaultdict(int)
        bestMatchingBin2 = {}
        binnedSeqs2 = defaultdict(set)
        for data1 in binStats1:
            binId1 = data1[0]
            fout.write(binId1)

            seqs1 = bins1[binId1]

            maxBasesInCommon = 0
            maxSeqsInCommon = 0
            bestMatchingBin = 'n/a'
            binnedSeqs = set()
            for data2 in binStats2:
                binId2 = data2[0]
                seqs2 = bins2[binId2]

                seqsInCommon = seqs1.intersection(seqs2)
                binnedSeqs.update(seqsInCommon)
                numSeqsInCommon = len(seqsInCommon)
                fout.write('\t' + str(numSeqsInCommon))

                basesInCommon = sum(seqLens[seqId] for seqId in seqsInCommon)

                if basesInCommon > maxBasesInCommon:
                    maxBasesInCommon = basesInCommon
                    maxSeqsInCommon = numSeqsInCommon
                    bestMatchingBin = binId2

                if basesInCommon > maxBasesInCommon2[binId2]:
                    maxBasesInCommon2[binId2] = basesInCommon
                    maxSeqsInCommon2[binId2] = numSeqsInCommon
                    bestMatchingBin2[binId2] = binId1

                binnedSeqs2[binId2].update(seqsInCommon)

            fout.write('\t%d\t%d\t%.2f\t%s\t%.2f\t%.2f\n' % (
                len(seqs1) - len(binnedSeqs),
                data1[1][0],
                float(data1[1][1]) / 1e6,
                bestMatchingBin,
                percent(maxBasesInCommon, data1[1][1]),
                percent(maxSeqsInCommon, data1[1][0]),
            ))

        fout.write('unbinned')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%d' % (len(bins2[binId]) - len(binnedSeqs2[binId])))
        fout.write('\n')

        fout.write('# seqs')
        for data in binStats2:
            fout.write('\t%d' % data[1][0])
        fout.write('\n')

        fout.write('# bases (Mbp)')
        for data in binStats2:
            fout.write('\t%.2f' % (float(data[1][1]) / 1e6))
        fout.write('\n')

        fout.write('Best match')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%s' % bestMatchingBin2.get(binId, 'n/a'))
        fout.write('\n')

        fout.write('% bases in common')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%.2f' % percent(maxBasesInCommon2[binId], data[1][1]))
        fout.write('\n')

        fout.write('% seqs in common')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%.2f' % percent(maxSeqsInCommon2[binId], data[1][0]))
        fout.write('\n')
def plot(self, f, seqIds, pc, variance):
    """Plot the sequences of a bin in principal component space.

    f: FASTA file of the bin to highlight.
    seqIds: sequence ids in the row order of `pc`.
    pc: matrix of principal component coordinates, one row per sequence;
        padded with zero columns if it has fewer than three columns.
    variance: per-component values used for the axis labels (multiplied
        by 100 and shown as a percentage) and, accumulated, for the
        fourth panel.

    Three scatter plots (PC1 vs PC2, PC3 vs PC2, PC1 vs PC3) show all
    sequences in grey with the sequences of the bin in red on top; the
    fourth panel shows the cumulative sum of `variance`.
    """
    # ensure pc matrix has at least 3 dimensions
    if pc.shape[1] == 1:
        pc = np.append(pc, np.zeros((pc.shape[0], 2)), 1)
        variance = np.append(variance[0], np.ones(2))
    elif pc.shape[1] == 2:
        pc = np.append(pc, np.zeros((pc.shape[0], 1)), 1)
        variance = np.append(variance[0:2], np.ones(1))

    # Set size of figure
    self.fig.clear()
    self.fig.set_size_inches(self.options.width, self.options.height)

    axesPC1vsPC2 = self.fig.add_subplot(221)
    axesPC2vsPC3 = self.fig.add_subplot(222)
    axesPC1vsPC3 = self.fig.add_subplot(223)
    axesVariance = self.fig.add_subplot(224)

    # get sequence in bin (membership is tested on the dictionary itself,
    # which is a constant-time lookup)
    seqs = readFasta(f)
    binIndices = [rowIndex for rowIndex, seqId in enumerate(seqIds) if seqId in seqs]

    # plot sequence in bin: (axes, column on x-axis, column on y-axis)
    panels = [(axesPC1vsPC2, 0, 1), (axesPC2vsPC3, 2, 1), (axesPC1vsPC3, 0, 2)]
    for axes, xCol, yCol in panels:
        axes.scatter(pc[:, xCol], pc[:, yCol], s=10, lw=0.5, facecolor=(0.8, 0.8, 0.8), marker="o")
        axes.scatter(pc[binIndices, xCol], pc[binIndices, yCol], s=10, lw=0.5, facecolor="r", marker="o")
        axes.set_xlabel('PC%d (%.1f%%)' % (xCol + 1, variance[xCol] * 100))
        axes.set_ylabel('PC%d (%.1f%%)' % (yCol + 1, variance[yCol] * 100))

    axesVariance.plot(np.arange(len(variance), dtype=int) + 1, np.cumsum(variance))
    axesVariance.set_xlabel('Principal Component')
    axesVariance.set_ylabel('Percentage of Cumulative Variance')
    axesVariance.set_ylim([0, 1.02])
    axesVariance.set_xlim([0, len(variance)])
    axesVariance.get_xaxis().set_major_locator(MaxNLocator(integer=True))

    # components are numbered from 1, so replace a first tick at 0 by 1
    xticks = axesVariance.get_xticks()
    if 0 in xticks and 1 not in xticks:
        xticks = np.append(np.array([1]), xticks[1:])
    axesVariance.set_xticks(xticks)

    # Prettify plot
    for axes in [axesPC1vsPC2, axesPC2vsPC3, axesPC1vsPC3, axesVariance]:
        for a in axes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        # items() works on both Python 2 and 3; iteritems() is Python 2 only
        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

    self.fig.tight_layout(pad=1, w_pad=2, h_pad=2)
    self.draw()
def plotOnAxes(self, binFile, coverageProfile, windowAxes, seqAxes):
    """Plot GC content against coverage for a bin.

    binFile: FASTA file with the sequences of the bin.
    coverageProfile: indexed by sequence id; element 0 is used as the
        coverage of the whole sequence and element 1 as the list of
        window coverages.
    windowAxes: axes for the window-level scatter plot and its linear fit.
    seqAxes: axes for the per-sequence scatter plot, with marker size
        growing with the logarithm of the sequence length.
    """

    def gcFraction(bases):
        """Return (G + C) / (A + C + G + T) for the given bases."""
        a, c, g, t = baseCount(bases)
        return float(g + c) / (a + c + g + t)

    def tidyAxes(axes):
        """Show only left/bottom ticks and spines, in the axes colour."""
        for tick in axes.yaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for tick in axes.xaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for tickLine in axes.yaxis.get_ticklines():
            tickLine.set_color(self.axesColour)

        for tickLine in axes.xaxis.get_ticklines():
            tickLine.set_color(self.axesColour)

        for side, spine in axes.spines.items():
            colour = 'none' if side in ['right', 'top'] else self.axesColour
            spine.set_color(colour)

    # GC of every complete window and of every whole sequence
    seqs = readFasta(binFile)
    gcProfile = {}
    for seqId, seq in seqs.items():
        windowGCs = []
        windowStart = 0
        windowEnd = self.options.window_size
        # windows must end before the last base of the sequence
        while windowEnd < len(seq):
            windowGCs.append(gcFraction(seq[windowStart:windowEnd]))
            windowStart = windowEnd
            windowEnd += self.options.window_size

        gcProfile[seqId] = [gcFraction(seq), windowGCs]

    # GC vs coverage for windows
    windowGC = []
    windowCoverage = []
    for seqId, gcInfo in gcProfile.items():
        windowGC.extend(gcInfo[1])
        windowCoverage.extend(coverageProfile[seqId][1])

    windowAxes.scatter(windowGC, windowCoverage, c=abs(array(windowCoverage)), s=10, lw=0.5, cmap='gray_r')
    windowAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(windowGC) * 100))
    windowAxes.set_ylabel('Coverage (mean = %.1f)' % mean(windowCoverage))

    if len(windowGC) <= 1:
        # a best fit line needs at least two points
        windowAxes.set_title('GC vs. Coverage\n(window size = %d bp, no best fit line)' % self.options.window_size)
    else:
        # linear regression line through the window points
        slope, inter = polyfit(windowGC, windowCoverage, 1)
        fitLine = poly1d([slope, inter])  # maps a GC value to the fitted coverage
        gcRange = [min(windowGC), max(windowGC)]
        windowAxes.plot(gcRange, fitLine(gcRange), '--r', lw=0.5)
        windowAxes.set_title('GC vs. Coverage\n(window size = %d bp, slope = %.2f)' % (self.options.window_size, slope))

    tidyAxes(windowAxes)

    # GC vs coverage for entire sequences
    seqGC = [gcInfo[0] for gcInfo in gcProfile.values()]
    seqCoverage = [coverageProfile[seqId][0] for seqId in gcProfile]
    seqLengths = [len(seqs[seqId]) for seqId in gcProfile]

    # marker size grows with the log of the sequence length: the smallest
    # log-length is shifted to 0, the result is divided by the largest
    # log-length, then scaled by 200 and offset by 10
    logLengths = log(array(seqLengths))
    markerSize = (logLengths - min(logLengths)) / max(logLengths)
    markerSize = markerSize * 200 + 10

    seqAxes.scatter(seqGC, seqCoverage, c=abs(array(seqCoverage)), s=markerSize, lw=0.5, cmap='gray_r')
    seqAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(seqGC) * 100))
    seqAxes.set_ylabel('Coverage (mean = %.1f)' % mean(seqCoverage))
    seqAxes.set_title('GC vs. Coverage\nIndividual Sequences')

    tidyAxes(seqAxes)
def plotOnAxes(self, binFile, coverageProfile, windowAxes, seqAxes):
    """Plot GC content against coverage for a bin.

    binFile: FASTA file with the sequences of the bin.
    coverageProfile: indexed by sequence id; element 0 is used as the
        coverage of the whole sequence and element 1 as the list of
        window coverages.
    windowAxes: axes for the window-level scatter plot and its linear fit.
    seqAxes: axes for the per-sequence scatter plot, with marker size
        growing with the logarithm of the sequence length.
    """

    def prettify(axes):
        """Show only left/bottom ticks and spines, in the axes colour."""
        for a in axes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        # items() works on both Python 2 and 3; iteritems() is Python 2 only
        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

    # get GC for windows
    seqs = readFasta(binFile)
    gcProfile = {}
    for seqId, seq in seqs.items():
        start = 0
        end = self.options.window_size

        windowGCs = []
        # only complete windows that end before the last base are used
        while end < len(seq):
            a, c, g, t = baseCount(seq[start:end])
            windowGCs.append(float(g + c) / (a + c + g + t))

            start = end
            end += self.options.window_size

        a, c, g, t = baseCount(seq)
        seqGC = float(g + c) / (a + c + g + t)

        gcProfile[seqId] = [seqGC, windowGCs]

    # plot GC vs coverage for windows
    gc = []
    coverage = []
    for seqId, gcInfo in gcProfile.items():
        gc += gcInfo[1]
        coverage += coverageProfile[seqId][1]

    windowAxes.scatter(gc, coverage, c=abs(array(coverage)), s=10, lw=0.5, cmap=pylab.cm.Greys)
    windowAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(gc) * 100))
    windowAxes.set_ylabel('Coverage (mean = %.1f)' % mean(coverage))

    # plot linear regression line
    if len(gc) > 1:
        slope, inter = polyfit(gc, coverage, 1)
        fit_fn = poly1d([slope, inter])  # fit_fn is now a function which takes in x and returns an estimate for y
        windowAxes.plot([min(gc), max(gc)], fit_fn([min(gc), max(gc)]), '--r', lw=0.5)
        windowAxes.set_title('GC vs. Coverage\n(window size = %d bp, slope = %.2f)' % (self.options.window_size, slope))
    else:
        # not possible to calculate best fit line
        windowAxes.set_title('GC vs. Coverage\n(window size = %d bp, no best fit line)' % self.options.window_size)

    # Prettify plot
    prettify(windowAxes)

    # plot GC vs coverage for entire sequences
    gc = []
    coverage = []
    seqLen = []
    for seqId, gcInfo in gcProfile.items():
        gc.append(gcInfo[0])
        coverage.append(coverageProfile[seqId][0])
        seqLen.append(len(seqs[seqId]))

    # set marker size proportional to the log of the sequence length: the
    # smallest log-length is shifted to 0, the result is divided by the
    # largest log-length, then scaled by 200 and offset by 10
    markerSize = log(array(seqLen))
    markerSize = (markerSize - min(markerSize)) / max(markerSize)
    markerSize = markerSize * 200 + 10

    seqAxes.scatter(gc, coverage, c=abs(array(coverage)), s=markerSize, lw=0.5, cmap=pylab.cm.Greys)
    seqAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(gc) * 100))
    seqAxes.set_ylabel('Coverage (mean = %.1f)' % mean(coverage))
    seqAxes.set_title('GC vs. Coverage\nIndividual Sequences')

    # Prettify plot
    prettify(seqAxes)
def plot(self, fastaFile):
    """Plot the cumulative sequence length of a FASTA file.

    fastaFile: FASTA file whose sequences are plotted.

    Sequences are ordered from longest to shortest; the x-axis is the
    index of a sequence in that order and the y-axis the total length of
    all sequences up to and including it, labelled in Mbp.
    """
    # Set size of figure
    self.fig.clear()
    self.fig.set_size_inches(self.options.width, self.options.height)
    axes = self.fig.add_subplot(111)

    # calculate cumulative sequence length
    seqs = readFasta(fastaFile)
    seqLens = sorted((len(seq) for seq in seqs.values()), reverse=True)

    x = np.arange(0, len(seqLens))
    y = []
    cumLen = 0
    for seqLen in seqLens:
        cumLen += seqLen
        y.append(cumLen)

    # Create plot
    axes.plot(x, y, 'k-',)
    axes.set_xlabel('Sequence index')
    axes.set_ylabel('Cumulative sequence length (Mbp)')

    # ensure y-axis include zero
    _, end = axes.get_ylim()
    axes.set_ylim([0, end])

    # Change sequence lengths from bp to Mbp
    yticks = axes.get_yticks()
    mbpLabels = []
    for seqLen in yticks:
        label = '%.2f' % (float(seqLen) / 1e6)
        # remove trailing zeros of the fractional part only: '1.50' -> '1.5'
        # and '2.00' -> '2', while '10.00' must stay '10' and '0.00' '0'
        label = label.rstrip('0').rstrip('.')
        mbpLabels.append(label)
    axes.set_yticklabels(mbpLabels)

    # Prettify plot
    for a in axes.yaxis.majorTicks:
        a.tick1On = True
        a.tick2On = False

    for a in axes.xaxis.majorTicks:
        a.tick1On = True
        a.tick2On = False

    for line in axes.yaxis.get_ticklines():
        line.set_color(self.axesColour)

    for line in axes.xaxis.get_ticklines():
        line.set_color(self.axesColour)

    # items() works on both Python 2 and 3; iteritems() is Python 2 only
    for loc, spine in axes.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    self.fig.tight_layout(pad=1)
    self.draw()
def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer, maxEditDistPer, minQC):
    """Calculate coverage of sequences for each BAM file.

    Each BAM file is processed with ``self.__processBam`` and one
    tab-separated row per sequence is written to ``outFile`` holding the
    sequence id, its bin id (``DefaultValues.UNBINNED`` when the sequence is
    in none of ``binFiles``), its length, and the BAM id, coverage and number
    of mapped reads for every BAM file.

    Parameters
    ----------
    binFiles : iterable of str
        FASTA files, one per bin.
    bamFiles : list of str
        BAM files; each must have a ``.bai`` index next to it.
    outFile : str
        Destination passed to ``reassignStdOut``.
    bAllReads, minAlignPer, maxEditDistPer, minQC
        Passed through unchanged to ``self.__processBam``.

    Raises
    ------
    SystemExit
        With a non-zero status when a BAM file has no ``.bai`` index.
    """
    # determine bin assignment of each sequence
    self.logger.info(' Determining bin assignment of each sequence.')

    seqIdToBinId = {}
    seqIdToSeqLen = {}
    for binFile in binFiles:
        binId = binIdFromFilename(binFile)
        seqs = readFasta(binFile)
        for seqId, seq in seqs.items():
            seqIdToBinId[seqId] = binId
            seqIdToSeqLen[seqId] = len(seq)

    # process each BAM file
    self.logger.info(" Processing %d file(s) with %d threads.\n" % (len(bamFiles), self.totalThreads))

    # make sure all BAM files are indexed
    self.numFiles = len(bamFiles)
    for bamFile in bamFiles:
        if not os.path.exists(bamFile + '.bai'):
            self.logger.error(' [Error] BAM file is either unsorted or not indexed: ' + bamFile + '\n')
            # a bare sys.exit() would report success to the calling shell
            sys.exit(1)

    # calculate coverage of each BAM file
    coverageInfo = {}
    for fileIndex, bamFile in enumerate(bamFiles, 1):
        self.logger.info(' Processing %s (%d of %d):' % (ntpath.basename(bamFile), fileIndex, len(bamFiles)))

        coverageInfo[bamFile] = mp.Manager().dict()
        coverageInfo[bamFile] = self.__processBam(bamFile, bAllReads, minAlignPer, maxEditDistPer, minQC, coverageInfo[bamFile])

    # redirect output
    self.logger.info(' Writing coverage information to file.')
    oldStdOut = reassignStdOut(outFile)
    try:
        header = 'Sequence Id\tBin Id\tSequence length (bp)'
        header += '\tBam Id\tCoverage\tMapped reads' * len(bamFiles)
        print(header)

        # get length of all seqs (including those found only in the BAM files)
        for seqStats in coverageInfo.values():
            for seqId in seqStats.keys():
                seqIdToSeqLen[seqId] = seqStats[seqId].seqLen

        bamIds = [binIdFromFilename(bamFile) for bamFile in bamFiles]

        # write coverage stats for all scaffolds to file
        for seqId, seqLen in seqIdToSeqLen.items():
            rowStr = seqId + '\t' + seqIdToBinId.get(seqId, DefaultValues.UNBINNED) + '\t' + str(seqLen)
            for bamFile, bamId in zip(bamFiles, bamIds):
                if seqId in coverageInfo[bamFile]:
                    stats = coverageInfo[bamFile][seqId]
                    rowStr += '\t%s\t%f\t%d' % (bamId, stats.coverage, stats.mappedReads)
                else:
                    rowStr += '\t%s\t%f\t%d' % (bamId, 0, 0)
            print(rowStr)
    finally:
        # restore stdout even if writing fails, so later output is not
        # silently sent to the coverage file
        restoreStdOut(outFile, oldStdOut)
def __workerThread(self, tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
    """Process each data item in parallel.

    Genome ids are read from ``queueIn`` until ``None`` is received. For each
    genome, and for every combination of ``self.contigLens``,
    ``self.percentComps`` and ``self.percentConts``, ``numReplicates``
    subsampled genomes are generated (scaffolds removed from the test genome,
    scaffolds added from a randomly chosen other genome) and the difference
    between the estimated and the sampled completeness/contamination is
    recorded per marker set. One result tuple per parameter combination is
    put on ``queueOut``.

    Parameters
    ----------
    tree
        Tree queried with ``find_node_with_taxon_label``.
    metadata : dict
        Maps genome id to a dict with a ``'taxonomy'`` entry.
    genomeIdsToTest : set
        Genome ids from which the contaminating genome is drawn.
    ubiquityThreshold, singleCopyThreshold
        Passed to ``self.markerSetBuilder.buildBinMarkerSet``.
    numReplicates : int
        Number of random subsamples per parameter combination.
    queueIn, queueOut
        Input queue of genome ids and output queue of result tuples.
    """
    while True:
        testGenomeId = queueIn.get(block=True, timeout=None)
        if testGenomeId is None:
            break

        # build marker sets for evaluating test genome
        testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
        binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(
            tree, testNode.parent_node,
            ubiquityThreshold, singleCopyThreshold,
            bMarkerSet=True, genomeIdsToRemove=[testGenomeId])

        # determine distribution of all marker genes within the test genome
        geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0)

        # estimate completeness of unmodified genome
        unmodifiedComp = {}
        unmodifiedCont = {}
        for ms in binMarkerSets.markerSetIter():
            hits = {}
            for mg in ms.getMarkerGenes():
                if mg in geneDistTable[testGenomeId]:
                    hits[mg] = geneDistTable[testGenomeId][mg]
            completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True)
            unmodifiedComp[ms.lineageStr] = completeness
            unmodifiedCont[ms.lineageStr] = contamination

        # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets
        testSeqs = readFasta(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
        testSeqLens, genomeSize = self.__seqLens(testSeqs)

        # Genomes that may act as the contamination source. Built once per
        # test genome, and as a list because random sampling from a set is
        # not supported on recent Python versions.
        contGenomeCandidates = list(genomeIdsToTest - set([testGenomeId]))

        for contigLen in self.contigLens:
            for percentComp in self.percentComps:
                for percentCont in self.percentConts:
                    deltaComp = defaultdict(list)
                    deltaCont = defaultdict(list)
                    deltaCompSet = defaultdict(list)
                    deltaContSet = defaultdict(list)

                    deltaCompRefined = defaultdict(list)
                    deltaContRefined = defaultdict(list)
                    deltaCompSetRefined = defaultdict(list)
                    deltaContSetRefined = defaultdict(list)

                    trueComps = []
                    trueConts = []

                    numDescendants = {}

                    # (marker sets, per-marker deltas, per-set deltas, record # genomes?)
                    evaluations = (
                        (binMarkerSets, deltaComp, deltaCont, deltaCompSet, deltaContSet, True),
                        (refinedBinMarkerSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, False),
                    )

                    for _ in range(0, numReplicates):
                        # generate test genome with a specific level of completeness, by randomly sampling scaffolds to remove
                        # (this will sample >= the desired level of completeness)
                        retainedTestSeqs, trueComp = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(percentComp, testSeqLens, genomeSize)
                        trueComps.append(trueComp)

                        # select a random genome to use as a source of contamination
                        contGenomeId = random.choice(contGenomeCandidates)
                        contSeqs = readFasta(os.path.join(self.img.genomeDir, contGenomeId, contGenomeId + '.fna'))
                        contSeqLens, contGenomeSize = self.__seqLens(contSeqs)
                        seqsToRetain, trueRetainedPer = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(1 - percentCont, contSeqLens, contGenomeSize)

                        # the scaffolds NOT retained are the ones added as contamination
                        contSampledSeqIds = set(contSeqs.keys()).difference(seqsToRetain)
                        trueCont = 100.0 - trueRetainedPer
                        trueConts.append(trueCont)

                        for markerSets, dCompInd, dContInd, dCompMs, dContMs, bRecordGenomes in evaluations:
                            for ms in markerSets.markerSetIter():
                                if bRecordGenomes:
                                    numDescendants[ms.lineageStr] = ms.numGenomes

                                containedMarkerGenes = defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)

                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                dCompInd[ms.lineageStr].append(completeness - trueComp)
                                dContInd[ms.lineageStr].append(contamination - trueCont)

                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                dCompMs[ms.lineageStr].append(completeness - trueComp)
                                dContMs[ms.lineageStr].append(contamination - trueCont)

                    taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])
                    queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants,
                                  unmodifiedComp, unmodifiedCont,
                                  deltaComp, deltaCont, deltaCompSet, deltaContSet,
                                  deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined,
                                  trueComps, trueConts))
def plot(self, f, seqIds, pc, variance):
    """Plot principal components of all sequences, highlighting one bin.

    Three scatter plots (PC1 vs PC2, PC3 vs PC2, PC1 vs PC3) are drawn with
    every row of ``pc`` in grey and the rows belonging to the bin in red. A
    fourth panel shows the cumulative sum of ``variance``.

    Parameters
    ----------
    f : str
        FASTA file of the bin; its sequence ids select the highlighted rows.
    seqIds : sequence of str
        Sequence id of each row of ``pc``.
    pc : numpy.ndarray
        2-D array with one row per sequence and one column per component.
        Padded with zero columns when it has fewer than three columns.
    variance : numpy.ndarray
        Variance value of each component, used for axis labels and the
        cumulative plot.
    """
    # ensure pc matrix has at least 3 dimensions
    if pc.shape[1] == 1:
        pc = np.append(pc, np.zeros((pc.shape[0], 2)), 1)
        variance = np.append(variance[0], np.ones(2))
    elif pc.shape[1] == 2:
        pc = np.append(pc, np.zeros((pc.shape[0], 1)), 1)
        variance = np.append(variance[0:2], np.ones(1))

    # Set size of figure
    self.fig.clear()
    self.fig.set_size_inches(self.options.width, self.options.height)

    axesPC1vsPC2 = self.fig.add_subplot(221)
    axesPC2vsPC3 = self.fig.add_subplot(222)
    axesPC1vsPC3 = self.fig.add_subplot(223)
    axesVariance = self.fig.add_subplot(224)

    # get rows of sequences in bin (dict membership, not a scan of keys())
    seqs = readFasta(f)
    binIndices = [rowIndex for rowIndex, seqId in enumerate(seqIds) if seqId in seqs]

    # plot all sequences in grey and sequences in bin in red;
    # each entry is (axes, column on x-axis, column on y-axis)
    panels = (
        (axesPC1vsPC2, 0, 1),
        (axesPC2vsPC3, 2, 1),
        (axesPC1vsPC3, 0, 2),
    )
    for panelAxes, xCol, yCol in panels:
        panelAxes.scatter(pc[:, xCol], pc[:, yCol], s=10, lw=0.5, facecolor=(0.8, 0.8, 0.8), marker="o")
        panelAxes.scatter(pc[binIndices, xCol], pc[binIndices, yCol], s=10, lw=0.5, facecolor="r", marker="o")
        panelAxes.set_xlabel('PC%d (%.1f%%)' % (xCol + 1, variance[xCol] * 100))
        panelAxes.set_ylabel('PC%d (%.1f%%)' % (yCol + 1, variance[yCol] * 100))

    # cumulative variance
    axesVariance.plot(np.arange(len(variance), dtype=int) + 1, np.cumsum(variance))
    axesVariance.set_xlabel('Principal Component')
    axesVariance.set_ylabel('Percentage of Cumulative Variance')
    axesVariance.set_ylim([0, 1.02])
    axesVariance.set_xlim([0, len(variance)])
    axesVariance.get_xaxis().set_major_locator(MaxNLocator(integer=True))

    # components are numbered from 1, so replace a leading tick at 0 with 1
    xticks = axesVariance.get_xticks()
    if 0 in xticks and 1 not in xticks:
        xticks = np.append(np.array([1]), xticks[1:])
        axesVariance.set_xticks(xticks)

    # Prettify plot
    for axes in [axesPC1vsPC2, axesPC2vsPC3, axesPC1vsPC3, axesVariance]:
        for a in axes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        # items() works on both Python 2 and 3, unlike iteritems()
        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

    self.fig.tight_layout(pad=1, w_pad=2, h_pad=2)
    self.draw()
def report(self, binFiles1, binFiles2, seqFile, outputFile):
    """Compare two sets of bins and write a bin-by-bin overlap table.

    Summary statistics are logged. ``outputFile`` receives a tab-separated
    matrix with one row per bin of the first set and one column per bin of
    the second set (both ordered by decreasing bin size), holding the number
    of sequences in common, followed by summary columns and summary rows
    (unbinned count, best match, % bases/seqs in common, bin size).

    Parameters
    ----------
    binFiles1, binFiles2
        The two sets of bin files, read with ``self.__readBins``.
    seqFile : str
        FASTA file with all sequences; provides the sequence lengths.
    outputFile : str
        Path of the report to write.
    """
    # determine total number of sequences
    seqs = readFasta(seqFile)

    seqLens = {}
    totalBases = 0
    numSeq1K = 0
    totalBases1K = 0
    numSeq5K = 0
    totalBases5K = 0
    for seqId, seq in seqs.items():
        seqLen = len(seq)
        seqLens[seqId] = seqLen
        totalBases += seqLen
        if seqLen >= 1000:
            numSeq1K += 1
            totalBases1K += seqLen
        if seqLen >= 5000:
            numSeq5K += 1
            totalBases5K += seqLen

    # determine sequences in each bin
    bins1 = self.__readBins(binFiles1)
    bins2 = self.__readBins(binFiles2)

    # determine bin stats
    binStats1, totalBinnedSeqs1, totalBinnedBases1 = self.__binningStats(bins1, seqLens)
    binStats2, totalBinnedSeqs2, totalBinnedBases2 = self.__binningStats(bins2, seqLens)

    # sort bins by size; each entry becomes (binId, stats) where stats[0]
    # is used below as the sequence count and stats[1] as the base count
    binStats1 = sorted(binStats1.items(), key=lambda x: x[1][1], reverse=True)
    binStats2 = sorted(binStats2.items(), key=lambda x: x[1][1], reverse=True)

    # report summary results
    self.logger.info('')
    self.logger.info(' Total seqs = %d (%.2f Mbp)' % (len(seqs), float(totalBases)/1e6))
    self.logger.info(' # seqs > 1 kbp = %d (%.2f Mbp)' % (numSeq1K, float(totalBases1K)/1e6))
    self.logger.info(' # seqs > 5 kbp= %d (%.2f Mbp)' % (numSeq5K, float(totalBases5K)/1e6))
    self.logger.info('')
    self.logger.info(' Binned seqs statistics:')
    self.logger.info(' 1) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%)' % (len(bins1), totalBinnedSeqs1, float(totalBinnedSeqs1)*100 / len(seqs), float(totalBinnedBases1)/1e6, float(totalBinnedBases1)*100/totalBases))
    self.logger.info(' 2) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%)' % (len(bins2), totalBinnedSeqs2, float(totalBinnedSeqs2)*100 / len(seqs), float(totalBinnedBases2)/1e6, float(totalBinnedBases2)*100/totalBases))

    # output report; the context manager closes the file even if a row fails
    with open(outputFile, 'w') as fout:
        for data in binStats2:
            fout.write('\t' + data[0])
        fout.write('\tunbinned\t% bases in common\t% seqs in common\tBest match\t# seqs\t# bases (Mbp)\n')

        # per-column (second bin set) summaries, filled in while writing rows
        totalSeqsInCommon2 = defaultdict(int)
        maxBasesInCommon2 = defaultdict(int)
        maxSeqsInCommon2 = defaultdict(int)
        bestMatchingBin2 = {}
        for binId1, stats1 in binStats1:
            fout.write(binId1)

            seqs1 = bins1[binId1]

            totalSeqsInCommon = 0
            maxBasesInCommon = 0
            maxSeqsInCommon = 0
            bestMatchingBin = 'n/a'
            for data2 in binStats2:
                binId2 = data2[0]
                seqs2 = bins2[binId2]

                seqsInCommon = seqs1.intersection(seqs2)
                numSeqsInCommon = len(seqsInCommon)
                fout.write('\t' + str(numSeqsInCommon))

                basesInCommon = sum(seqLens[seqId] for seqId in seqsInCommon)

                # best match is decided on shared bases, not shared sequences
                if basesInCommon > maxBasesInCommon:
                    maxBasesInCommon = basesInCommon
                    maxSeqsInCommon = numSeqsInCommon
                    bestMatchingBin = binId2

                if basesInCommon > maxBasesInCommon2[binId2]:
                    maxBasesInCommon2[binId2] = basesInCommon
                    maxSeqsInCommon2[binId2] = numSeqsInCommon
                    bestMatchingBin2[binId2] = binId1

                totalSeqsInCommon += numSeqsInCommon
                totalSeqsInCommon2[binId2] += numSeqsInCommon

            fout.write('\t%d\t%.2f\t%.2f\t%s\t%d\t%.2f\n' % (len(seqs1) - totalSeqsInCommon,
                                                          float(maxBasesInCommon)*100 / stats1[1],
                                                          float(maxSeqsInCommon)*100 / stats1[0],
                                                          bestMatchingBin,
                                                          stats1[0],
                                                          float(stats1[1])/1e6))

        fout.write('unbinned')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%d' % (len(bins2[binId]) - totalSeqsInCommon2[binId]))
        fout.write('\n')

        fout.write('% bases in common')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%.2f' % (float(maxBasesInCommon2[binId])*100 / data[1][1]))
        fout.write('\n')

        fout.write('% seqs in common')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%.2f' % (float(maxSeqsInCommon2[binId])*100 / data[1][0]))
        fout.write('\n')

        fout.write('Best match')
        for data in binStats2:
            binId = data[0]
            fout.write('\t%s' % bestMatchingBin2.get(binId, 'n/a'))
        fout.write('\n')

        fout.write('# seqs')
        for data in binStats2:
            fout.write('\t%d' % data[1][0])
        fout.write('\n')

        fout.write('# bases (Mbp)')
        for data in binStats2:
            fout.write('\t%.2f' % (float(data[1][1])/1e6))
        fout.write('\n')
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
    """Draw coding density (CD) plots for the sequences of a bin.

    ``axesHist`` receives a histogram of the coding density of consecutive
    windows of ``self.options.cd_window_size`` bp. ``axesDeltaCD`` receives a
    scatter plot of delta-CD against sequence length, overlaid with the
    reference distribution bounds requested in ``distributionsToPlot``.

    Parameters
    ----------
    fastaFile : str
        FASTA file of the bin.
    distributionsToPlot : iterable of numbers
        For each value ``p`` the reference curves nearest to ``(100 - p) / 2``
        and ``(100 + p) / 2`` are drawn.
    axesHist, axesDeltaCD
        Matplotlib axes to draw on.

    Raises
    ------
    SystemExit
        With a non-zero status when the Prodigal gene feature file is missing.
    """
    def prettify(axes):
        # show ticks on the left/bottom only and hide the right/top spines
        for a in axes.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axes.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

    # parse Prodigal output
    gffFile = os.path.join(self.options.results_dir, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
    if not os.path.exists(gffFile):
        self.logger.error('Missing gene feature file (%s). This plot is not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
        # a bare sys.exit() would report success to the calling shell
        sys.exit(1)

    prodigalParser = ProdigalGeneFeatureParser(gffFile)

    # Read reference distributions from file
    dist = readDistribution('cd_dist')

    # get coding density for windows
    seqs = readFasta(fastaFile)

    data = []
    seqLens = []
    for seqId, seq in seqs.items():
        start = 0
        end = self.options.cd_window_size

        seqLen = len(seq)
        seqLens.append(seqLen)

        while end < seqLen:
            codingBases = prodigalParser.codingBases(seqId, start, end)

            a, c, g, t = baseCount(seq[start:end])
            numBases = a + c + g + t
            # a window made up entirely of ambiguous bases has no defined
            # coding density; skip it instead of dividing by zero
            if numBases > 0:
                data.append(float(codingBases) / numBases)

            start = end
            end += self.options.cd_window_size

    if len(data) == 0:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
        return

    # Histogram plot
    bins = [0.0]
    binWidth = self.options.cd_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% coding density')
    axesHist.set_ylabel('% windows (' + str(self.options.cd_window_size) + ' bp)')

    # Prettify plot
    prettify(axesHist)

    # get CD bin statistics
    binTools = BinTools()
    meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

    # Delta-CD vs sequence length plot
    axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap='gray_r')
    axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
    axesDeltaCD.set_ylabel('Sequence length (kbp)')

    # remember the limits implied by the sequences before adding references
    _, yMaxSeqs = axesDeltaCD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # list(...) and next(iter(...)) instead of keys() / keys()[0], which
        # only behave as lists on Python 2
        closestCD = findNearest(np.array(list(dist)), meanCD)

        # find closest distribution values
        sampleSeqLen = next(iter(dist[closestCD]))
        d = dist[closestCD][sampleSeqLen]
        boundKeys = list(d)
        cdLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
        cdUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for windowSize in dist[closestCD]:
            xL.append(dist[closestCD][windowSize][cdLowerBoundKey])
            xU.append(dist[closestCD][windowSize][cdUpperBoundKey])
            y.append(windowSize)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaCD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    yticks = axesDeltaCD.get_yticks()
    kbpLabels = []
    for seqLen in yticks:
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaCD.set_yticklabels(kbpLabels)

    # Prettify plot
    prettify(axesDeltaCD)
def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None):
    """Print out information about bin.

    Parameters
    ----------
    outputFormat : int
        Selects what is reported:
        1 - marker set summary row;
        2 - summary row extended with bin statistics (and coverage);
        3 - one summary row per marker set;
        4 - hit count for each marker gene of the selected marker set;
        5 - bin id, marker, gene id for every marker hit;
        6 - markers hit two or more times, with the hit gene ids;
        7 - markers hit two or more times on the same scaffold;
        8 - genes with marker hits and the alignment positions;
        9 - marker genes read from ``genes.faa`` (FASTA or tab-separated).
    aai
        Object whose ``aaiMeanBinHetero`` dict is read for formats 1-3.
    binMarkerSets
        Marker sets of the bin.
    bIndividualMarkers : bool
        Passed to the gene counting helpers for formats 1-3.
    coverageBinProfiles : dict, optional
        Per-bin coverage statistics appended to format 2 rows.
    table : optional
        Object with an ``add_row`` method. For formats 1-3 rows are added to
        it instead of being printed; for format 9 it selects FASTA output
        (when given) over tab-separated output (when ``None``).
    anaFolder : str, optional
        Analysis folder containing ``bins/<binId>/genes.faa``; required for
        format 9.

    Returns
    -------
    int
        Number of markers reported for formats 6 and 7, otherwise 0.

    Raises
    ------
    ValueError
        If ``outputFormat`` is 9 and ``anaFolder`` is ``None``.
    """
    if outputFormat == 1:
        selectedMarkerSet = binMarkerSets.selectedMarkerSet()

        lineageStr = selectedMarkerSet.lineageStr
        if selectedMarkerSet.UID != '0':
            lineageStr += ' (' + str(selectedMarkerSet.UID) + ')'

        data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers)
        row = "%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (
            self.binId, lineageStr,
            selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets(),
            "\t".join([str(data[i]) for i in range(6)]),
            data[6],
            data[7],
            aai.aaiMeanBinHetero.get(self.binId, 0.0)
        )
        if table is None:
            print(row)
        else:
            table.add_row([self.binId, lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)])
    elif outputFormat == 2:
        selectedMarkerSet = binMarkerSets.selectedMarkerSet()

        lineageStr = selectedMarkerSet.lineageStr
        if selectedMarkerSet.UID != '0':
            lineageStr += ' (' + str(selectedMarkerSet.UID) + ')'

        data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers)

        if table is None:
            row = self.binId
            row += '\t%s\t%d\t%d\t%d' % (lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets())
            row += '\t%0.2f\t%0.2f\t%0.2f' % (data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0))
            row += '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (
                self.binStats['Genome size'], self.binStats['# ambiguous bases'],
                self.binStats['# scaffolds'], self.binStats['# contigs'],
                self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'],
                self.binStats['Mean scaffold length'], self.binStats['Mean contig length'],
                self.binStats['Longest scaffold'], self.binStats['Longest contig'])
            row += '\t%.1f\t%.2f' % (self.binStats['GC'] * 100, self.binStats['GC std'] * 100)
            row += '\t%.2f\t%d\t%d' % (self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes'])
            row += '\t' + '\t'.join([str(data[i]) for i in range(6)])

            if coverageBinProfiles:
                for coverageStats in coverageBinProfiles[self.binId].values():
                    row += '\t%.2f\t%.2f' % (coverageStats[0], coverageStats[1])

            print(row)
        else:
            row = [self.binId, lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()]
            row.extend([data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0)])
            row.extend([self.binStats['Genome size'], self.binStats['# ambiguous bases'],
                        self.binStats['# scaffolds'], self.binStats['# contigs'],
                        self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'],
                        int(self.binStats['Mean scaffold length']), int(self.binStats['Mean contig length']),
                        self.binStats['Longest scaffold'], self.binStats['Longest contig']])
            row.extend([self.binStats['GC'] * 100, self.binStats['GC std'] * 100])
            row.extend([self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes']])
            row.extend(data[0:6])

            if coverageBinProfiles:
                for coverageStats in coverageBinProfiles[self.binId].values():
                    row.extend(coverageStats)

            table.add_row(row)
    elif outputFormat == 3:
        for ms in binMarkerSets.markerSetIter():
            data = self.geneCounts(ms, self.markerHits, bIndividualMarkers)
            row = "%s\t%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (
                self.binId, ms.UID, ms.lineageStr, ms.numGenomes,
                ms.numMarkers(), ms.numSets(),
                "\t".join([str(data[i]) for i in range(6)]),
                data[6],
                data[7],
                aai.aaiMeanBinHetero.get(self.binId, 0.0)
            )
            if table is None:
                print(row)
            else:
                table.add_row([self.binId, ms.UID, ms.lineageStr, ms.numGenomes, ms.numMarkers(), ms.numSets()] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)])
    elif outputFormat == 4:
        selectedMarkerSet = binMarkerSets.selectedMarkerSet()
        data = self.hitsToMarkerGene(binMarkerSets.selectedMarkerSet())

        # header row with the marker ids, then a row with the matching counts
        row = "Node Id: %s; Marker lineage: %s" % (selectedMarkerSet.UID, selectedMarkerSet.lineageStr)
        for marker in data:
            row += '\t' + marker
        print(row)

        row = self.binId
        for count in data.values():
            row += '\t' + str(count)
        print(row)

        print()
    elif outputFormat == 5:
        # tabular of bin_id, marker, contig_id
        markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
        for marker, hit_list in self.markerHits.items():
            if marker not in markerGenes:
                continue

            for hit in hit_list:
                print(self.binId, marker, hit.target_name, sep='\t', end='\n')
    elif outputFormat == 6:
        markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
        seqsReported = 0
        for marker, hitList in self.markerHits.items():
            if marker not in markerGenes:
                continue

            if len(hitList) >= 2:
                print(self.binId, marker, sep='\t', end='\t')

                scaffoldIds = [hit.target_name for hit in hitList]
                print(','.join(sorted(scaffoldIds)), end='\n')

                seqsReported += 1

        return seqsReported
    elif outputFormat == 7:
        markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
        seqsReported = 0
        for marker, hitList in self.markerHits.items():
            if marker not in markerGenes:
                continue

            if len(hitList) >= 2:
                # collect hits whose gene ids share the same prefix before
                # the final '_' (i.e. genes on the same scaffold)
                scaffoldsWithMultipleHits = set()
                for i in range(0, len(hitList)):
                    scaffoldId = hitList[i].target_name[0:hitList[i].target_name.rfind('_')]
                    for j in range(i + 1, len(hitList)):
                        if scaffoldId == hitList[j].target_name[0:hitList[j].target_name.rfind('_')]:
                            scaffoldsWithMultipleHits.add(hitList[i].target_name)
                            scaffoldsWithMultipleHits.add(hitList[j].target_name)

                if len(scaffoldsWithMultipleHits) >= 2:
                    print(self.binId, marker, sep='\t', end='\t')
                    print(','.join(sorted(list(scaffoldsWithMultipleHits))), end='\n')
                    seqsReported += 1

        return seqsReported
    elif outputFormat == 8:
        # tabular - print only position of marker genes
        markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()

        genesWithMarkers = {}
        for marker, hit_list in self.markerHits.items():
            if marker not in markerGenes:
                continue

            for hit in hit_list:
                genesWithMarkers[hit.target_name] = genesWithMarkers.get(hit.target_name, []) + [hit]

        for geneId, hits in genesWithMarkers.items():
            rowStr = self.binId + '\t' + geneId
            for hit in hits:
                rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to)
            print(rowStr)

    # Hunter Cameron, May 29, 2015 - print a fasta of marker genes
    elif outputFormat == 9:
        # tabular of bin_id, marker, contig_id

        # check for the analyze folder for later use
        if anaFolder is None:
            raise ValueError("AnaFolder must not be None for outputFormat 9")

        # ## build a dict to link target_names with marker gene alignment information
        markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
        hitInfo = {}
        for marker, hit_list in self.markerHits.items():
            if marker not in markerGenes:
                continue

            for hit in hit_list:
                name = hit.target_name
                hitInfo[name] = {
                    "marker": marker,
                    "ali_from": str(hit.ali_from),
                    "ali_to": str(hit.ali_to)
                }

        # ## Open genes.faa and print the ones that were found with some descriptive info in the header
        path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"])

        # get only the seqs we need and their information as a dict
        seqs = readFasta(path_to_genes, trimHeader=False)

        # remove seqs without markers
        filt_seqs = [header for header in seqs if header.split(" # ")[0] in hitInfo]

        def sort_header(header):
            """ sorts headers by contig and gene number """
            name = header.split(" # ")[0]
            ctg_name, gene_num = name.rsplit("_", 1)
            return ctg_name, int(gene_num)

        for header in sorted(filt_seqs, key=sort_header):
            elems = header.split(" # ")
            gene_name = elems[0]

            # remove the gene number from Prodigal to get the original contig name
            contig_name, gene_num = gene_name.rsplit("_", 1)

            # parse some info about the gene from the header line
            gene_start = elems[1]
            gene_end = elems[2]
            gene_strand = elems[3]

            # FASTA is printed when a table object was supplied
            # NOTE(review): presumably the caller passes table=None exactly
            # when tab-separated output was requested - confirm against caller
            if table is not None:
                gene_info = "geneId={};start={};end={};strand={};protlen={}".format(
                    gene_num, gene_start, gene_end, gene_strand, str(len(seqs[header])))

                marker_info = "marker={};mstart={};mend={}".format(
                    hitInfo[gene_name]["marker"],
                    hitInfo[gene_name]["ali_from"],
                    hitInfo[gene_name]["ali_to"])

                # new header will be the bin name, contig name, gene info, and marker info separated by spaces
                new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info])

                print(new_header, seqs[header], sep="\n")
            # otherwise, print a table
            else:
                print("\t".join([
                    self.binId,
                    contig_name,
                    gene_num,
                    gene_start,
                    gene_end,
                    gene_strand,
                    str(len(seqs[header])),
                    hitInfo[gene_name]["marker"],
                    hitInfo[gene_name]["ali_from"],
                    hitInfo[gene_name]["ali_to"],
                    seqs[header]
                ]))
    else:
        self.logger.error("Unknown output format: %d", outputFormat)

    return 0
def plot(self, binFile, markerGeneStats, binStats):
    """Plot the position of marker genes along the sequences of a bin.

    Every sequence carrying at least one marker gene becomes a row (longest
    sequence first). Each row is divided into position bins of equal size
    and every bin is coloured by the number of marker genes starting in it
    (0, 1, 2, 3, or 4 and more).

    Parameters
    ----------
    binFile : str
        FASTA file of the bin.
    markerGeneStats
        Passed to ``self.getMarkerGenesPerSeq`` to obtain the marker genes
        found on each sequence.
    binStats : dict
        Must provide ``'Completeness'`` and ``'Contamination'`` for the title.

    Returns
    -------
    bool
        ``False`` when no sequence has a marker gene (nothing is drawn),
        otherwise ``True``.
    """
    binId = binIdFromFilename(binFile)

    markerGenesPerSeq, _markerGeneNum = self.getMarkerGenesPerSeq(markerGeneStats)

    if len(markerGenesPerSeq) == 0:
        return False

    # Get length of sequences with one or more marker genes
    seqs = readFasta(binFile)
    seqLens = {}
    longestSeq = 0
    binSize = 0
    for seqId, seq in seqs.items():
        seqLen = len(seq)
        binSize += seqLen  # bin size counts every sequence, marker or not

        if seqId not in markerGenesPerSeq:
            continue

        seqLens[seqId] = seqLen
        if seqLen > longestSeq:
            longestSeq = seqLen

    sortedSeqLens = sorted(seqLens.items(), key=operator.itemgetter(1), reverse=True)

    MAX_BINS = 100
    plotBinSize = self.roundUpToNearest100(float(longestSeq) / MAX_BINS)
    yLabels = [x[0] for x in sortedSeqLens]

    # get position of genes in bin
    prodigalFastaParser = ProdigalFastaParser()
    geneFile = os.path.join(self.options.results_dir, 'bins', binId, DefaultValues.PRODIGAL_AA)
    genePos = prodigalFastaParser.genePositions(geneFile)

    # Set size of figure
    self.fig.clear()
    self.fig.set_size_inches(self.options.width, self.options.height)
    yLabelBounds = self.yLabelExtents(yLabels, self.options.font_size)

    heightBottomLabels = 0.4 + self.options.fig_padding  # inches
    widthSideLabel = yLabelBounds.width * self.options.width + self.options.fig_padding  # inches

    widthPerBin = (self.options.width - widthSideLabel - self.options.fig_padding) / MAX_BINS

    titleHeight = 0.2
    HEIGHT_PER_ROW = 0.2
    height = HEIGHT_PER_ROW * len(sortedSeqLens) + heightBottomLabels + self.options.fig_padding + titleHeight

    rowBinHeight = widthPerBin / HEIGHT_PER_ROW

    # figure height depends on the number of rows
    self.fig.set_size_inches(self.options.width, height)
    axes = self.fig.add_axes([widthSideLabel / self.options.width,
                              heightBottomLabels / height,
                              1.0 - (widthSideLabel + self.options.fig_padding) / self.options.width,
                              1.0 - (heightBottomLabels + self.options.fig_padding + titleHeight) / height])

    # set plot axis
    axes.set_xlim([0, MAX_BINS + 0.1])
    axes.set_xlabel('Position (' + str(plotBinSize) + ' bp/bin)')

    axes.set_ylim([0, len(sortedSeqLens)])
    axes.set_yticks(np.arange(0.5, len(sortedSeqLens) + 0.5, 1.0))
    axes.set_yticklabels(yLabels)

    # legend
    colours = [(1.0, 1.0, 1.0),
               (127 / 255.0, 201 / 255.0, 127 / 255.0),
               (255 / 255.0, 192 / 255.0, 134 / 255.0),
               (190 / 255.0, 174 / 255.0, 212 / 255.0),
               (0.0, 0.0, 0.0)]
    discreteColourMap = mpl.colors.ListedColormap(colours)
    axisColourMap = self.fig.add_axes([self.options.fig_padding / self.options.width,
                                       self.options.fig_padding / height,
                                       0.15,
                                       0.03 * (self.options.width / height)])
    colourBar = mpl.colorbar.ColorbarBase(axisColourMap,
                                          cmap=discreteColourMap,
                                          norm=mpl.colors.Normalize(vmin=0, vmax=1),
                                          orientation='horizontal',
                                          drawedges=True)
    colourBar.set_ticks([0.1, 0.3, 0.5, 0.7, 0.9])
    colourBar.set_ticklabels(['0', '1', '2', '3', '4+'])
    colourBar.outline.set_linewidth(0.5)
    colourBar.dividers.set_linewidth(0.5)

    for a in axisColourMap.xaxis.majorTicks:
        a.tick1On = False
        a.tick2On = False

    # plot each bin
    rowCentreY = 0.5
    for seqId, seqLen in sortedSeqLens:
        # number of marker genes starting in each position bin of this sequence
        markerCount = [0] * int(math.ceil(float(seqLen) / plotBinSize))
        for geneId, _markerGeneId, geneStartPos, _geneEndPos in markerGenesPerSeq[seqId]:
            binPos = int(float(genePos[geneId][0] + geneStartPos) / plotBinSize)
            markerCount[binPos] += 1

        for i, count in enumerate(markerCount):
            # counts beyond the palette share its last colour ('4+')
            colour = colours[min(count, len(colours) - 1)]
            axes.add_patch(Rectangle((i + 0.1, rowCentreY - 0.4 * rowBinHeight),
                                     0.8, 0.8 * rowBinHeight,
                                     facecolor=colour, lw=0.2))

        rowCentreY += 1.0

    # set plot title
    titleStr = binId + '\n'
    titleStr += '(%.2f Mbp, %d seqs, %.2f%% complete, %.2f%% contamination)' % (
        float(binSize) / 1e6, len(seqs), binStats['Completeness'], binStats['Contamination'])
    axes.set_title(titleStr)

    # Prettify plot
    for a in axes.yaxis.majorTicks:
        a.tick1On = False
        a.tick2On = False

    for a in axes.xaxis.majorTicks:
        a.tick1On = True
        a.tick2On = False

    for line in axes.yaxis.get_ticklines():
        line.set_color(self.axesColour)

    for line in axes.xaxis.get_ticklines():
        line.set_color(self.axesColour)
        line.set_ms(2)

    # items() works on both Python 2 and 3, unlike iteritems()
    for loc, spine in axes.spines.items():
        if loc in ['left', 'right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    self.draw()

    return True
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
    """Plot windowed GC content and per-sequence GC deviation for a bin.

    Args:
        fastaFile: FASTA file with the sequences to plot.
        distributionsToPlot: percentages of the reference distribution to
            outline; for each value d the bounds nearest to (100 - d) / 2
            and (100 + d) / 2 are drawn as dashed red lines.
        axesHist: axes that receive the histogram of per-window GC.
        axesDeltaGC: axes that receive the delta-GC vs. sequence length
            scatter plot.

    Returns:
        None. If no sequence is longer than the GC window size, only an
        error message is written as the x-label of axesHist.
    """
    # Read reference distributions from file
    dist = readDistribution('gc_dist')

    # get GC for windows
    seqs = readFasta(fastaFile)
    gcWindowSize = self.options.gc_window_size

    data = []
    seqLens = []
    for seq in seqs.values():
        seqLen = len(seq)
        seqLens.append(seqLen)

        # consecutive, non-overlapping windows; a window is only counted
        # when it ends strictly before the end of the sequence
        start = 0
        end = gcWindowSize
        while end < seqLen:
            a, c, g, t = baseCount(seq[start:end])
            countedBases = a + c + g + t
            # a long stretch of N's leaves no countable bases; skip the
            # window instead of dividing by zero
            if countedBases:
                data.append(float(g + c) / countedBases)

            start = end
            end += gcWindowSize

    if not data:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.gc_window_size)
        return

    # Histogram plot
    bins = [0.0]
    binWidth = self.options.gc_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    # NOTE(review): Matplotlib >= 3.1 removed `normed` in favour of
    # `density`; switch once the minimum supported Matplotlib allows it.
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% GC')
    axesHist.set_ylabel('% windows (' + str(self.options.gc_window_size) + ' bp)')

    # Prettify plot
    for tick in axesHist.yaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for tick in axesHist.xaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for line in axesHist.yaxis.get_ticklines():
        line.set_color(self.axesColour)

    for line in axesHist.xaxis.get_ticklines():
        line.set_color(self.axesColour)

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for loc, spine in axesHist.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # get GC bin statistics
    binTools = BinTools()
    meanGC, deltaGCs, _ = binTools.gcDist(seqs)

    # Delta-GC vs Sequence length plot
    axesDeltaGC.scatter(deltaGCs, seqLens, c=abs(deltaGCs), s=10, lw=0.5, cmap=pylab.cm.Greys)
    axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100))
    axesDeltaGC.set_ylabel('Sequence length (kbp)')

    _, yMaxSeqs = axesDeltaGC.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaGC.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # key views are materialised so this also works on Python 3
        closestGC = findNearest(np.array(list(dist.keys())), meanGC)

        # find closest distribution values
        sampleSeqLen = next(iter(dist[closestGC]))
        d = dist[closestGC][sampleSeqLen]
        gcLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0)
        gcUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for windowSize in dist[closestGC]:
            xL.append(dist[closestGC][windowSize][gcLowerBoundKey])
            xU.append(dist[closestGC][windowSize][gcUpperBoundKey])
            y.append(windowSize)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaGC.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaGC.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaGC.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaGC.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    kbpLabels = []
    for seqLen in axesDeltaGC.get_yticks():
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaGC.set_yticklabels(kbpLabels)

    # Prettify plot
    for tick in axesDeltaGC.yaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for tick in axesDeltaGC.xaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for line in axesDeltaGC.yaxis.get_ticklines():
        line.set_color(self.axesColour)

    for line in axesDeltaGC.xaxis.get_ticklines():
        line.set_color(self.axesColour)

    for loc, spine in axesDeltaGC.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer, maxEditDistPer, minQC):
    """Calculate coverage of sequences for each BAM file.

    Args:
        binFiles: FASTA files of the bins; used to map each sequence id to
            its bin id and length.
        bamFiles: BAM files to process; each must have a '.bai' file next
            to it, otherwise the process exits with status 1.
        outFile: passed to reassignStdOut()/restoreStdOut(); the report is
            written with print() while stdout is reassigned.
        bAllReads, minAlignPer, maxEditDistPer, minQC: forwarded unchanged
            to the per-BAM processing routine.

    The report has one row per sequence: sequence id, bin id (or
    DefaultValues.UNBINNED), length, then BAM id, coverage and mapped read
    count for every BAM file.
    """
    # determine bin assignment of each sequence
    self.logger.info(' Determining bin assignment of each sequence.')

    seqIdToBinId = {}
    seqIdToSeqLen = {}
    for binFile in binFiles:
        binId = binIdFromFilename(binFile)

        seqs = readFasta(binFile)
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
        for seqId, seq in seqs.items():
            seqIdToBinId[seqId] = binId
            seqIdToSeqLen[seqId] = len(seq)

    # process each fasta file
    self.logger.info(" Processing %d file(s) with %d threads.\n" % (len(bamFiles), self.totalThreads))

    # make sure all BAM files are sorted
    self.numFiles = len(bamFiles)
    for bamFile in bamFiles:
        if not os.path.exists(bamFile + '.bai'):
            self.logger.error(' [Error] BAM file is either unsorted or not indexed: ' + bamFile + '\n')
            sys.exit(1)

    # calculate coverage of each BAM file
    coverageInfo = {}
    for numFilesStarted, bamFile in enumerate(bamFiles, 1):
        self.logger.info(' Processing %s (%d of %d):' % (ntpath.basename(bamFile), numFilesStarted, len(bamFiles)))

        coverageInfo[bamFile] = mp.Manager().dict()
        coverageInfo[bamFile] = self.__processBam(bamFile, bAllReads, minAlignPer, maxEditDistPer, minQC, coverageInfo[bamFile])

    # redirect output
    self.logger.info(' Writing coverage information to file.')
    oldStdOut = reassignStdOut(outFile)
    try:
        header = 'Sequence Id\tBin Id\tSequence length (bp)'
        header += '\tBam Id\tCoverage\tMapped reads' * len(bamFiles)
        print(header)

        # get length of all seqs
        for seqIds in coverageInfo.values():
            for seqId in seqIds.keys():
                seqIdToSeqLen[seqId] = seqIds[seqId].seqLen

        # the BAM ids do not depend on the sequence, so derive them once
        bamIds = [binIdFromFilename(bamFile) for bamFile in bamFiles]

        # write coverage stats for all scaffolds to file
        for seqId, seqLen in seqIdToSeqLen.items():
            rowStr = seqId + '\t' + seqIdToBinId.get(seqId, DefaultValues.UNBINNED) + '\t' + str(seqLen)
            for bamFile, bamId in zip(bamFiles, bamIds):
                if seqId in coverageInfo[bamFile]:
                    seqCoverage = coverageInfo[bamFile][seqId]
                    rowStr += '\t%s\t%f\t%d' % (bamId, seqCoverage.coverage, seqCoverage.mappedReads)
                else:
                    rowStr += '\t%s\t%f\t%d' % (bamId, 0, 0)

            print(rowStr)
    finally:
        # restore stdout even if writing the report fails part-way
        restoreStdOut(outFile, oldStdOut)
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
    """Plot windowed coding density and per-sequence coding-density deviation.

    Args:
        fastaFile: FASTA file with the sequences to plot; its bin id is
            used to locate the Prodigal gene feature file under
            self.options.out_folder.
        distributionsToPlot: percentages of the reference distribution to
            outline; for each value d the bounds nearest to (100 - d) / 2
            and (100 + d) / 2 are drawn as dashed red lines.
        axesHist: axes that receive the histogram of per-window coding
            density.
        axesDeltaCD: axes that receive the delta-CD vs. sequence length
            scatter plot.

    Returns:
        None. If no sequence is longer than the window size, only an error
        message is written as the x-label of axesHist.

    Exits the process with status 1 when the gene feature file is missing.
    """
    # parse Prodigal output
    gffFile = os.path.join(self.options.out_folder, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
    if not os.path.exists(gffFile):
        # single-argument print() behaves the same on Python 2 and 3
        print('Missing gene feature file (%s). This plot is not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
        # non-zero status so callers can tell the plot was not produced
        sys.exit(1)

    prodigalParser = ProdigalGeneFeatureParser(gffFile)

    # Read reference distributions from file
    dist = readDistribution('cd_dist')

    # get coding density for windows
    seqs = readFasta(fastaFile)
    cdWindowSize = self.options.cd_window_size

    data = []
    seqLens = []
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for seqId, seq in seqs.items():
        seqLen = len(seq)
        seqLens.append(seqLen)

        # consecutive, non-overlapping windows; a window is only counted
        # when it ends strictly before the end of the sequence
        start = 0
        end = cdWindowSize
        while end < seqLen:
            codingBases = prodigalParser.codingBases(seqId, start, end)

            a, c, g, t = baseCount(seq[start:end])
            countedBases = a + c + g + t
            # a long stretch of N's leaves no countable bases; skip the
            # window instead of dividing by zero
            if countedBases:
                data.append(float(codingBases) / countedBases)

            start = end
            end += cdWindowSize

    if not data:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
        return

    # Histogram plot
    bins = [0.0]
    binWidth = self.options.cd_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    # NOTE(review): Matplotlib >= 3.1 removed `normed` in favour of
    # `density`; switch once the minimum supported Matplotlib allows it.
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% coding density')
    axesHist.set_ylabel('% windows (' + str(self.options.cd_window_size) + ' bp)')

    # Prettify plot
    for tick in axesHist.yaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for tick in axesHist.xaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for line in axesHist.yaxis.get_ticklines():
        line.set_color(self.axesColour)

    for line in axesHist.xaxis.get_ticklines():
        line.set_color(self.axesColour)

    for loc, spine in axesHist.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # get CD bin statistics
    binTools = BinTools()
    meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

    # Delta-CD vs sequence length plot
    axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap=pylab.cm.Greys)
    axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
    axesDeltaCD.set_ylabel('Sequence length (kbp)')

    _, yMaxSeqs = axesDeltaCD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # key views are materialised so this also works on Python 3
        closestCD = findNearest(np.array(list(dist.keys())), meanCD)

        # find closest distribution values
        sampleSeqLen = next(iter(dist[closestCD]))
        d = dist[closestCD][sampleSeqLen]
        cdLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0)
        cdUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for windowSize in dist[closestCD]:
            xL.append(dist[closestCD][windowSize][cdLowerBoundKey])
            xU.append(dist[closestCD][windowSize][cdUpperBoundKey])
            y.append(windowSize)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaCD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    kbpLabels = []
    for seqLen in axesDeltaCD.get_yticks():
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaCD.set_yticklabels(kbpLabels)

    # Prettify plot
    for tick in axesDeltaCD.yaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for tick in axesDeltaCD.xaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for line in axesDeltaCD.yaxis.get_ticklines():
        line.set_color(self.axesColour)

    for line in axesDeltaCD.xaxis.get_ticklines():
        line.set_color(self.axesColour)

    for loc, spine in axesDeltaCD.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None):
    """Print out information about bin.

    Args:
        outputFormat: integer selecting what is reported:
            1 - summary row for the selected marker set
            2 - summary row extended with bin statistics and, when given,
                coverage profiles
            3 - one summary row per marker set of the bin
            4 - hit counts per marker gene of the selected marker set
            5 - bin id, marker and target name of every marker hit
            6 - markers hit two or more times, with their target names
            7 - markers with two or more hits whose target names share the
                same prefix before the last '_'
            8 - alignment positions of marker hits, grouped by target name
            9 - sequences from genes.faa that carry a marker hit
        aai: object whose aaiMeanBinHetero dict is looked up by bin id.
        binMarkerSets: marker sets of this bin.
        bIndividualMarkers: forwarded to the gene counting helpers.
        coverageBinProfiles: optional dict keyed by bin id (format 2).
        table: optional table object; formats 1-3 call table.add_row()
            instead of printing when it is given.
        anaFolder: folder containing bins/<bin id>/genes.faa (format 9).

    Returns:
        The number of markers reported for formats 6 and 7, otherwise 0.

    Raises:
        ValueError: if outputFormat is 9 and anaFolder is None.
    """
    if outputFormat == 1:
        selectedMarkerSet = binMarkerSets.selectedMarkerSet()

        lineageStr = selectedMarkerSet.lineageStr
        if selectedMarkerSet.UID != '0':
            lineageStr += ' (' + str(selectedMarkerSet.UID) + ')'

        data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers)
        row = "%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (
            self.binId, lineageStr,
            selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets(),
            "\t".join([str(data[i]) for i in range(6)]),
            data[6], data[7],
            aai.aaiMeanBinHetero.get(self.binId, 0.0))
        if table is None:
            print(row)
        else:
            table.add_row([self.binId, lineageStr,
                           selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()]
                          + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)])
    elif outputFormat == 2:
        selectedMarkerSet = binMarkerSets.selectedMarkerSet()

        lineageStr = selectedMarkerSet.lineageStr
        if selectedMarkerSet.UID != '0':
            lineageStr += ' (' + str(selectedMarkerSet.UID) + ')'

        data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers)

        if table is None:
            row = self.binId
            row += '\t%s\t%d\t%d\t%d' % (lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets())
            row += '\t%0.2f\t%0.2f\t%0.2f' % (data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0))
            row += '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (
                self.binStats['Genome size'], self.binStats['# ambiguous bases'],
                self.binStats['# scaffolds'], self.binStats['# contigs'],
                self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'],
                self.binStats['Mean scaffold length'], self.binStats['Mean contig length'],
                self.binStats['Longest scaffold'], self.binStats['Longest contig'])
            row += '\t%.1f\t%.2f' % (self.binStats['GC'] * 100, self.binStats['GC std'] * 100)
            row += '\t%.2f\t%d\t%d' % (self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes'])
            # range()/values() work on both Python 2 and 3
            row += '\t' + '\t'.join([str(data[i]) for i in range(6)])

            if coverageBinProfiles:
                for coverageStats in coverageBinProfiles[self.binId].values():
                    row += '\t%.2f\t%.2f' % (coverageStats[0], coverageStats[1])

            print(row)
        else:
            row = [self.binId, lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()]
            row.extend([data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0)])
            row.extend([self.binStats['Genome size'], self.binStats['# ambiguous bases'],
                        self.binStats['# scaffolds'], self.binStats['# contigs'],
                        self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'],
                        int(self.binStats['Mean scaffold length']), int(self.binStats['Mean contig length']),
                        self.binStats['Longest scaffold'], self.binStats['Longest contig']])
            row.extend([self.binStats['GC'] * 100, self.binStats['GC std'] * 100])
            row.extend([self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes']])
            row.extend(data[0:6])

            if coverageBinProfiles:
                for coverageStats in coverageBinProfiles[self.binId].values():
                    row.extend(coverageStats)

            table.add_row(row)
    elif outputFormat == 3:
        for ms in binMarkerSets.markerSetIter():
            data = self.geneCounts(ms, self.markerHits, bIndividualMarkers)
            row = "%s\t%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (
                self.binId, ms.UID, ms.lineageStr,
                ms.numGenomes, ms.numMarkers(), ms.numSets(),
                "\t".join([str(data[i]) for i in range(6)]),
                data[6], data[7],
                aai.aaiMeanBinHetero.get(self.binId, 0.0))
            if table is None:
                print(row)
            else:
                table.add_row([self.binId, ms.UID, ms.lineageStr, ms.numGenomes, ms.numMarkers(), ms.numSets()]
                              + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)])
    elif outputFormat == 4:
        selectedMarkerSet = binMarkerSets.selectedMarkerSet()
        data = self.hitsToMarkerGene(binMarkerSets.selectedMarkerSet())
        row = "Node Id: %s; Marker lineage: %s" % (selectedMarkerSet.UID, selectedMarkerSet.lineageStr)
        for marker in data:
            row += '\t' + marker
        print(row)

        row = self.binId
        for count in data.values():
            row += '\t' + str(count)
        print(row)

        print()
    elif outputFormat == 5:
        # tabular of bin_id, marker, contig_id
        markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
        for marker, hit_list in self.markerHits.items():
            if marker not in markerGenes:
                continue

            for hit in hit_list:
                print(self.binId, marker, hit.target_name, sep='\t', end='\n')
    elif outputFormat == 6:
        markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
        seqsReported = 0
        for marker, hitList in self.markerHits.items():
            if marker not in markerGenes:
                continue

            if len(hitList) >= 2:
                print(self.binId, marker, sep='\t', end='\t')

                scaffoldIds = [hit.target_name for hit in hitList]
                print(','.join(sorted(scaffoldIds)), end='\n')

                seqsReported += 1

        return seqsReported
    elif outputFormat == 7:
        markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
        seqsReported = 0
        for marker, hitList in self.markerHits.items():
            if marker not in markerGenes:
                continue

            if len(hitList) >= 2:
                # collect hits whose target names agree up to the last '_'
                scaffoldsWithMultipleHits = set()
                for i in range(0, len(hitList)):
                    scaffoldId = hitList[i].target_name[0:hitList[i].target_name.rfind('_')]
                    for j in range(i + 1, len(hitList)):
                        if scaffoldId == hitList[j].target_name[0:hitList[j].target_name.rfind('_')]:
                            scaffoldsWithMultipleHits.add(hitList[i].target_name)
                            scaffoldsWithMultipleHits.add(hitList[j].target_name)

                if len(scaffoldsWithMultipleHits) >= 2:
                    print(self.binId, marker, sep='\t', end='\t')
                    print(','.join(sorted(scaffoldsWithMultipleHits)), end='\n')
                    seqsReported += 1

        return seqsReported
    elif outputFormat == 8:
        # tabular - print only position of marker genes
        markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()

        genesWithMarkers = {}
        for marker, hit_list in self.markerHits.items():
            if marker not in markerGenes:
                continue

            for hit in hit_list:
                genesWithMarkers.setdefault(hit.target_name, []).append(hit)

        for geneId, hits in genesWithMarkers.items():
            rowStr = self.binId + '\t' + geneId
            for hit in hits:
                rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to)
            print(rowStr)

    # Hunter Cameron, May 29, 2015 - print a fasta of marker genes
    elif outputFormat == 9:
        # tabular of bin_id, marker, contig_id

        # check for the analyze folder for later use
        if anaFolder is None:
            raise ValueError("AnaFolder must not be None for outputFormat 9")

        # ## build a dict to link target_names with marker gene alignment information
        markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
        hitInfo = {}
        for marker, hit_list in self.markerHits.items():
            if marker not in markerGenes:
                continue

            for hit in hit_list:
                name = hit.target_name
                hitInfo[name] = {
                    "marker": marker,
                    "ali_from": str(hit.ali_from),
                    "ali_to": str(hit.ali_to)
                }

        # ## Open genes.faa and print the ones that were found with some descriptive info in the header
        path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"])

        # get only the seqs we need and their information as a dict
        seqs = readFasta(path_to_genes, trimHeader=False)

        # remove seqs without markers
        filt_seqs = [header for header in seqs.keys() if header.split(" # ")[0] in hitInfo]

        def sort_header(header):
            """ sorts headers by contig and gene number """
            name = header.split(" # ")[0]
            ctg_name, gene_num = name.rsplit("_", 1)
            return ctg_name, int(gene_num)

        for header in sorted(filt_seqs, key=sort_header):
            elems = header.split(" # ")
            gene_name = elems[0]

            # remove the gene number from Prodigal to get the original contig name
            contig_name, gene_num = gene_name.rsplit("_", 1)

            # parse some info about the gene from the header line
            gene_start = elems[1]
            gene_end = elems[2]
            gene_strand = elems[3]

            # a table object was supplied: print FASTA records
            # NOTE(review): presumably `table` is non-None exactly when
            # tab-separated output was not requested - confirm with caller.
            if table is not None:
                gene_info = "geneId={};start={};end={};strand={};protlen={}".format(
                    gene_num, gene_start, gene_end, gene_strand, str(len(seqs[header])))

                marker_info = "marker={};mstart={};mend={}".format(
                    hitInfo[gene_name]["marker"],
                    hitInfo[gene_name]["ali_from"],
                    hitInfo[gene_name]["ali_to"])

                # new header will be the bin name, contig name, gene info, and marker info separated by spaces
                new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info])

                print(new_header, seqs[header], sep="\n")
            # otherwise, print a table
            else:
                print("\t".join([
                    self.binId,
                    contig_name,
                    gene_num,
                    gene_start,
                    gene_end,
                    gene_strand,
                    str(len(seqs[header])),
                    hitInfo[gene_name]["marker"],
                    hitInfo[gene_name]["ali_from"],
                    hitInfo[gene_name]["ali_to"],
                    seqs[header]
                ]))
    else:
        self.logger.error("Unknown output format: %d", outputFormat)

    return 0
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
    """Plot windowed GC content and per-sequence GC deviation for a bin.

    Args:
        fastaFile: FASTA file with the sequences to plot.
        distributionsToPlot: percentages of the reference distribution to
            outline; for each value d the bounds nearest to (100 - d) / 2
            and (100 + d) / 2 are drawn as dashed red lines.
        axesHist: axes that receive the histogram of per-window GC.
        axesDeltaGC: axes that receive the delta-GC vs. sequence length
            scatter plot.

    Returns:
        None. If no sequence is longer than the GC window size, only an
        error message is written as the x-label of axesHist.
    """
    # Read reference distributions from file
    dist = readDistribution('gc_dist')

    # get GC for windows
    seqs = readFasta(fastaFile)
    gcWindowSize = self.options.gc_window_size

    data = []
    seqLens = []
    for seq in seqs.values():
        seqLen = len(seq)
        seqLens.append(seqLen)

        # consecutive, non-overlapping windows; a window is only counted
        # when it ends strictly before the end of the sequence
        start = 0
        end = gcWindowSize
        while end < seqLen:
            a, c, g, t = baseCount(seq[start:end])
            countedBases = a + c + g + t
            # a long stretch of N's leaves no countable bases; skip the
            # window instead of dividing by zero
            if countedBases:
                data.append(float(g + c) / countedBases)

            start = end
            end += gcWindowSize

    if not data:
        axesHist.set_xlabel(
            '[Error] No seqs >= %d, the specified window size' % self.options.gc_window_size)
        return

    # Histogram plot
    bins = [0.0]
    binWidth = self.options.gc_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    # NOTE(review): Matplotlib >= 3.1 removed `normed` in favour of
    # `density`; switch once the minimum supported Matplotlib allows it.
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% GC')
    axesHist.set_ylabel('% windows (' + str(self.options.gc_window_size) + ' bp)')

    # Prettify plot
    for tick in axesHist.yaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for tick in axesHist.xaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for line in axesHist.yaxis.get_ticklines():
        line.set_color(self.axesColour)

    for line in axesHist.xaxis.get_ticklines():
        line.set_color(self.axesColour)

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for loc, spine in axesHist.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # get GC bin statistics
    binTools = BinTools()
    meanGC, deltaGCs, _ = binTools.gcDist(seqs)

    # Delta-GC vs Sequence length plot
    axesDeltaGC.scatter(deltaGCs, seqLens, c=abs(deltaGCs), s=10, lw=0.5, cmap='gray_r')
    axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100))
    axesDeltaGC.set_ylabel('Sequence length (kbp)')

    _, yMaxSeqs = axesDeltaGC.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaGC.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # key views are materialised so this also works on Python 3
        closestGC = findNearest(np.array(list(dist.keys())), meanGC)

        # find closest distribution values
        sampleSeqLen = next(iter(dist[closestGC]))
        d = dist[closestGC][sampleSeqLen]
        gcLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0)
        gcUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for windowSize in dist[closestGC]:
            xL.append(dist[closestGC][windowSize][gcLowerBoundKey])
            xU.append(dist[closestGC][windowSize][gcUpperBoundKey])
            y.append(windowSize)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaGC.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaGC.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaGC.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaGC.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    kbpLabels = []
    for seqLen in axesDeltaGC.get_yticks():
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaGC.set_yticklabels(kbpLabels)

    # Prettify plot
    for tick in axesDeltaGC.yaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for tick in axesDeltaGC.xaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for line in axesDeltaGC.yaxis.get_ticklines():
        line.set_color(self.axesColour)

    for line in axesDeltaGC.xaxis.get_ticklines():
        line.set_color(self.axesColour)

    for loc, spine in axesDeltaGC.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def plot(self, fastaFile):
    """Plot the cumulative length of a bin's sequences, longest first.

    Args:
        fastaFile: FASTA file with the sequences to plot.

    The x-axis is the index of each sequence after sorting by decreasing
    length; the y-axis is the running total length, labelled in Mbp.
    """
    # Set size of figure
    self.fig.clear()
    self.fig.set_size_inches(self.options.width, self.options.height)
    axes = self.fig.add_subplot(111)

    # calculate cumulative sequence length
    seqs = readFasta(fastaFile)
    seqLens = sorted((len(seq) for seq in seqs.values()), reverse=True)

    x = np.arange(0, len(seqLens))
    y = []
    cumLen = 0
    for seqLen in seqLens:
        cumLen += seqLen
        y.append(cumLen)

    # Create plot
    axes.plot(x, y, 'k-')
    axes.set_xlabel('Sequence index')
    axes.set_ylabel('Cumulative sequence length (Mbp)')

    # ensure y-axis include zero
    _, end = axes.get_ylim()
    axes.set_ylim([0, end])

    # Change sequence lengths from bp to Mbp
    mbpLabels = []
    for tickValue in axes.get_yticks():
        label = '%.2f' % (float(tickValue) / 1e6)
        # Strip trailing zeros only from the fractional part, then a
        # dangling decimal point. Stripping a bare trailing '0' would turn
        # '10' into '1' and '0' into an empty label.
        label = label.rstrip('0').rstrip('.')
        mbpLabels.append(label)
    axes.set_yticklabels(mbpLabels)

    # Prettify plot
    for tick in axes.yaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for tick in axes.xaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for line in axes.yaxis.get_ticklines():
        line.set_color(self.axesColour)

    for line in axes.xaxis.get_ticklines():
        line.set_color(self.axesColour)

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for loc, spine in axes.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    self.fig.tight_layout(pad=1)
    self.draw()