def addSAM(self, filename, filterAlignment=None):
    """
    Read a SAM file and add information about the reads that match our
    reference id.

    @param filename: A C{str} SAM filename.
    @param filterAlignment: A 1-argument function to be used for filtering
        reads in the SAM file. If C{None}, all alignments will be
        examined.
    """
    self.samFiles.append(filename)
    referenceId = self.genomeAccession
    with samfile(filename) as sam:
        for column in sam.pileup():
            for read in column.pileups:
                alignment = read.alignment
                if (alignment.reference_name == referenceId and
                        (filterAlignment is None or
                         filterAlignment(alignment))):
                    readId = alignment.query_name
                    self.readIdsMatchingGenome.add(readId)
                    offset = column.reference_pos
                    self.coveredOffsetCount[offset] += 1
                    try:
                        offsetInfo = self.offsets[offset]
                    except KeyError:
                        pass
                    else:
                        # This offset corresponds to one or more proteins.
                        self.coveredProteins.update(
                            offsetInfo['proteinAccessions'])
                        offsetInfo['readIds'].add(readId)
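# A minimal usage sketch for addSAM, assuming an already-constructed
# analysis object. The object name, the SAM file name, and the mapping
# quality cutoff are hypothetical, for illustration only.
def highQuality(alignment):
    # Keep only alignments with a mapping quality of at least 30.
    return alignment.mapping_quality >= 30

analysis.addSAM('sample.sam', filterAlignment=highQuality)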
def _getAlignedReferences(self, alignmentFiles):
    """
    Get the ids of all reference sequences in all alignment files.

    @param alignmentFiles: A C{list} of C{str} alignment file names.
    @return: A C{set} of C{str} reference ids as found in all passed
        alignment files.
    """
    # Get the names of all references in all alignment files.
    alignedReferences = set()
    for filename in alignmentFiles:
        with samfile(filename) as sam:
            for i in range(sam.nreferences):
                alignedReferences.add(sam.get_reference_name(i))

    return alignedReferences
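# The samfile context manager used throughout this code is not defined in
# this section. A minimal sketch of a plausible implementation, assuming
# it simply wraps pysam.AlignmentFile and guarantees the file is closed:
from contextlib import contextmanager

from pysam import AlignmentFile


@contextmanager
def samfile(filename):
    # Open a SAM/BAM file, yield it, and close it on exit (even if an
    # exception is raised inside the with block).
    f = AlignmentFile(filename)
    try:
        yield f
    finally:
        f.close()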
def referenceInfo():
    return {
        'duplicateCount': 0,
        'primaryCount': 0,
        'qcFailCount': 0,
        'nonDuplicateCount': 0,
        'readIds': set(),
        'secondaryCount': 0,
        'supplementaryCount': 0,
    }


referenceReads = defaultdict(referenceInfo)
mappedCount = unmappedCount = 0
readIds = set()

with samfile(args.samFile) as fp:
    for read in fp.fetch():
        readIds.add(read.query_name)
        if read.is_unmapped:
            unmappedCount += 1
        else:
            mappedCount += 1
            stats = referenceReads[read.reference_name]
            stats['readIds'].add(read.query_name)
            if read.is_secondary:
                stats['secondaryCount'] += 1
            elif read.is_supplementary:
                stats['supplementaryCount'] += 1
            else:
                stats['primaryCount'] += 1
            if read.is_duplicate:
                stats['duplicateCount'] += 1
            else:
                stats['nonDuplicateCount'] += 1
            if read.is_qcfail:
                stats['qcFailCount'] += 1
if args.noFilter:
    # Do not do our custom SAM filtering.
    def filterRead(read):
        return not (read.is_del or read.is_refskip)
else:
    def filterRead(read):
        return (not (read.is_del or read.is_refskip) and
                samFilter.filterAlignment(read.alignment))

if printStats:
    counts = []

with samfile(args.samfile) as sam:
    if samFilter.referenceIds:
        # No need to check if the given reference id is in
        # referenceLengths because the samFilter.referenceLengths call
        # above catches that.
        referenceId = samFilter.referenceIds.pop()
    else:
        if len(referenceLengths) == 1:
            referenceId = list(referenceLengths)[0]
        else:
            print('SAM file %r contains %d references (%s). Only one '
                  'reference id can be analyzed at a time. Please use '
                  '--referenceId to specify the one you want examined.'
                  % (args.samfile, len(referenceLengths),
                     ', '.join(sorted(referenceLengths))),
                  file=sys.stderr)
            # Cannot continue without a single reference id.
            sys.exit(1)
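# For context: filterRead above operates on pysam PileupRead objects,
# which expose the is_del / is_refskip flags and the underlying
# alignment. A hypothetical sketch of how it might be applied:
with samfile(args.samfile) as sam:
    for column in sam.pileup():
        for read in column.pileups:
            if filterRead(read):
                # Use column.reference_pos and read.alignment here.
                pass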
def initialReferenceIdAnalysis(self, referenceId, alignmentFile, outputDir):
    """
    Analyze the given reference id in the given alignment file (if an
    alignment to the reference id is present).

    @param referenceId: The C{str} id of the reference sequence to analyze.
    @param alignmentFile: The C{str} name of an alignment file.
    @param outputDir: The C{str} name of the output directory.
    @return: C{None} if C{referenceId} is not present in C{alignmentFile}
        or if no significant offsets are found. Else, a C{tuple} of
        intermediate results: (genomeLength, alignedReads,
        readCountAtOffset, baseCountAtOffset, readsAtOffset,
        significantOffsets, samFilter, paddedSAM).
    """
    # Make sure this reference id is in this alignment file and if so
    # get its length (and check it's the same as the length of the
    # sequence given in the reference file).
    with samfile(alignmentFile) as sam:
        tid = sam.get_tid(referenceId)
        if tid == -1:
            # This referenceId is not in this alignment file.
            self.report(' Reference %s not in alignment file.' %
                        referenceId)
            return
        else:
            genomeLength = sam.lengths[tid]
            # Sanity check.
            assert genomeLength == len(self.referenceGenomes[referenceId])

    if self.plotSAM:
        filename = join(outputDir, 'reads.html')
        self.report(' Saving reads alignment plot to %s' % filename)
        plotSAM(SAMFilter(alignmentFile, referenceIds={referenceId}),
                filename, title=referenceId, jitter=0.45)

    alignedReads = []
    samFilter = SAMFilter(
        alignmentFile, referenceIds={referenceId},
        dropDuplicates=True, dropSupplementary=True,
        # dropSecondary=True,
        storeQueryIds=True)
    paddedSAM = PaddedSAM(samFilter)
    for query in paddedSAM.queries(addAlignment=True):
        assert len(query) == genomeLength
        alignedReads.append(
            AlignedRead(query.id, query.sequence, query.alignment))

    # Sanity check that all aligned reads have different ids. This
    # should be the case because the padded SAM queries method adds /2,
    # /3 etc to queries that have more than one alignment.
    assert len(alignedReads) == len(set(read.id for read in alignedReads))

    readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData(
        genomeLength, alignedReads)

    significantOffsets = list(
        findSignificantOffsets(baseCountAtOffset, readCountAtOffset,
                               self.minReads, self.homogeneousCutoff))

    self.report(
        ' %d alignment%s (of %d unique %s) read from %s' %
        (samFilter.alignmentCount, s(samFilter.alignmentCount),
         len(samFilter.queryIds),
         'query' if len(samFilter.queryIds) == 1 else 'queries',
         alignmentFile))
    self.report(' %d of which %s aligned to %s' %
                (len(alignedReads),
                 'was' if len(alignedReads) == 1 else 'were', referenceId))
    self.report(' Reference genome length %d' % genomeLength)
    self.report(' Found %d significant location%s' %
                (len(significantOffsets), s(len(significantOffsets))))

    self.saveBaseFrequencies(outputDir, genomeLength, baseCountAtOffset)

    if not significantOffsets:
        self.report(' No significant locations found.')
        return

    if self.saveReducedFASTA:
        self.saveReducedFasta(significantOffsets, outputDir)

    self._plotCoverageAndSignificantLocations(
        referenceId, alignmentFile, readCountAtOffset, genomeLength,
        significantOffsets, outputDir)

    self.saveSignificantOffsets(significantOffsets, outputDir)

    for read in alignedReads:
        read.setSignificantOffsets(significantOffsets)

    self.saveReferenceBaseFrequencyPlot(
        referenceId, genomeLength, significantOffsets, baseCountAtOffset,
        readCountAtOffset, outputDir)

    # Save the reference.
    filename = join(outputDir, 'reference.fasta')
    self.report(' Saving reference to', filename)
    reference = self.referenceGenomes[referenceId]
    Reads([reference]).save(filename)

    # Extract a consensus according to bcftools.
    self.writeBcftoolsConsensus(referenceId, alignmentFile, outputDir)

    return (genomeLength, alignedReads, readCountAtOffset,
            baseCountAtOffset, readsAtOffset, significantOffsets,
            samFilter, paddedSAM)
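# The s pluralization helper used in the report calls above is not shown
# in this section. A plausible minimal sketch (an assumption, not
# necessarily the project's actual definition):
def s(count, suffix='s'):
    # Return a plural suffix, so that '%d read%s' % (n, s(n)) reads
    # naturally for both n == 1 ('1 read') and n != 1 ('2 reads').
    return '' if count == 1 else suffix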
def analyzeReferenceId(self, referenceId, alignmentFile, outputDir):
    """
    Analyze the given reference id in the given alignment file (if an
    alignment to the reference id is present).

    @param referenceId: The C{str} id of the reference sequence to analyze.
    @param alignmentFile: The C{str} name of an alignment file.
    @param outputDir: The C{str} name of the output directory.
    @return: C{None} if C{referenceId} is not present in C{alignmentFile}
        or if no significant offsets are found. Else, a C{dict} containing
        the significant offsets and the consensus sequence that best
        matches C{referenceId}.
    """
    analysis = self.initialReferenceIdAnalysis(
        referenceId, alignmentFile, outputDir)

    if analysis:
        (genomeLength, alignedReads, readCountAtOffset, baseCountAtOffset,
         readsAtOffset, significantOffsets, samFilter,
         paddedSAM) = analysis
    else:
        return

    insignificantOffsets = set(
        range(genomeLength)) - set(significantOffsets)

    reference = self.referenceGenomes[referenceId]
    referenceSequence = reference.sequence

    consensus = []
    for base in referenceSequence:
        ob = OffsetBases()
        ob.incorporateBase(base)
        consensus.append(ob)

    readQueue = PriorityQueue()
    self.updatePriorityQueue(readQueue, alignedReads, consensus,
                             significantOffsets)

    consensusFilename = join(outputDir, 'reference-consensus.sam')
    nonConsensusFilename = join(outputDir, 'reference-non-consensus.sam')
    self.report(' Writing consensus SAM to', consensusFilename)
    self.report(' Writing non-consensus SAM to', nonConsensusFilename)

    with samfile(alignmentFile) as sam:
        consensusAlignment = AlignmentFile(consensusFilename, mode='w',
                                           template=sam)
        nonConsensusAlignment = AlignmentFile(nonConsensusFilename,
                                              mode='w', template=sam)

    # Reads with no significant offsets get written to both output files.
    readsWithNoSignificantOffsetsCount = 0
    for read in alignedReads:
        if not read.significantOffsets:
            readsWithNoSignificantOffsetsCount += 1
            consensusAlignment.write(read.alignment)
            nonConsensusAlignment.write(read.alignment)

            for offset in insignificantOffsets:
                base = read.base(offset)
                if base is not None:
                    consensus[offset].incorporateBase(base)

    self.report(' %d read%s did not overlap any significant offsets' %
                (readsWithNoSignificantOffsetsCount,
                 s(readsWithNoSignificantOffsetsCount)))

    readsMatchingConsensusCount = readsNotMatchingConsensusCount = 0
    cutoff = self.cutoff
    while readQueue:
        mismatchFraction, _ = readQueue.lowestPriority()
        read = readQueue.pop()
        if mismatchFraction <= cutoff:
            # We want this read. Incorporate it into the consensus.
            readsMatchingConsensusCount += 1
            consensusAlignment.write(read.alignment)
            affectedReads = set()
            for offset in read.significantOffsets:
                readBase = read.base(offset)
                consensus[offset].incorporateBase(readBase)
                for readAtOffset in readsAtOffset[offset]:
                    if readAtOffset in readQueue:
                        affectedReads.add(readAtOffset)
            self.updatePriorityQueue(readQueue, affectedReads,
                                     consensus, significantOffsets)
        else:
            readsNotMatchingConsensusCount += 1
            nonConsensusAlignment.write(read.alignment)

    consensusAlignment.close()
    nonConsensusAlignment.close()

    self.report(
        ' %d read%s matched the consensus, %d did not.' %
        (readsMatchingConsensusCount, s(readsMatchingConsensusCount),
         readsNotMatchingConsensusCount))

    # Remove the reference bases from the consensus.
    for offset, base in enumerate(referenceSequence):
        consensus[offset].unincorporateBase(base)

    consensusInfoFilename = join(outputDir, 'reference-consensus.txt')
    self.report(' Writing consensus info to', consensusInfoFilename)

    with open(consensusInfoFilename, 'w') as fp:
        consensusSequence = []
        for offset in range(genomeLength):
            # Take a copy of the commonest set because we may pop from
            # it below.
            commonest = set(consensus[offset].commonest)
            referenceBase = referenceSequence[offset]

            if len(commonest) > 1:
                nucleotides = ' Nucleotides: %s' % (
                    consensus[offset].baseCountsToStr())
            else:
                nucleotides = ''

            if referenceBase in commonest:
                consensusBase = referenceBase
            elif commonest:
                # Pop an arbitrary most-common base. If this is a draw,
                # the reference base is not in the commonest set (tested
                # above) and so cannot be used to break the draw.
                consensusBase = commonest.pop()
            else:
                # Nothing in the included reads covers this offset.
                consensusBase = '-'

            consensusSequence.append(consensusBase)

            mismatch = '' if referenceBase == consensusBase else (
                ' Mismatch (reference has %s)' % referenceBase)

            print('%d: %s%s%s' % (offset + 1, consensusBase, mismatch,
                                  nucleotides), file=fp)

    consensusRead = Read('greedy-consensus-%s' % referenceId,
                         ''.join(consensusSequence))
    consensusFilename = join(outputDir, 'reference-consensus.fasta')
    self.report(' Writing greedy consensus info to', consensusFilename)
    Reads([consensusRead]).save(consensusFilename)

    return {
        'consensusRead': consensusRead,
        'significantOffsets': significantOffsets,
    }
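# OffsetBases is not defined in this section. A minimal sketch of the
# interface the code above relies on (incorporateBase, unincorporateBase,
# commonest, baseCountsToStr), assuming a simple Counter-backed
# implementation; the project's actual class may differ:
from collections import Counter


class OffsetBases:
    def __init__(self):
        self._counts = Counter()

    def incorporateBase(self, base):
        self._counts[base] += 1

    def unincorporateBase(self, base):
        self._counts[base] -= 1

    @property
    def commonest(self):
        # The set of bases tied for the maximum (positive) count, or the
        # empty set if no bases are currently incorporated.
        positive = {b: c for b, c in self._counts.items() if c > 0}
        if not positive:
            return set()
        maxCount = max(positive.values())
        return {b for b, c in positive.items() if c == maxCount}

    def baseCountsToStr(self):
        return ' '.join(
            '%s:%d' % (base, count)
            for base, count in sorted(self._counts.items()) if count > 0)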
def main(args):
    """
    Print SAM/BAM file reference read counts.

    @param args: An argparse namespace with information about parsed
        command-line options.
    """
    if args.topReferenceIdsFile and args.sortBy != 'count':
        print('--topReferenceIdsFile only makes sense when using '
              '--sortBy count', file=sys.stderr)
        sys.exit(1)

    referenceReads = defaultdict(referenceInfo)
    mapped = set()
    unmapped = set()
    readIds = set()
    referenceLengths = SAMFilter(args.samFile).referenceLengths()

    with samfile(args.samFile) as fp:
        for read in fp.fetch():
            id_ = read.query_name
            readIds.add(id_)
            if read.is_unmapped:
                unmapped.add(id_)
            else:
                mapped.add(id_)
                stats = referenceReads[read.reference_name]
                stats['readIds'].add(id_)

                if read.is_secondary:
                    stats['secondary'].add(id_)
                elif read.is_supplementary:
                    stats['supplementary'].add(id_)
                else:
                    stats['primary'].add(id_)

                if read.is_duplicate:
                    stats['duplicate'].add(id_)
                else:
                    stats['nonDuplicate'].add(id_)

                if read.is_qcfail:
                    stats['qcFail'].add(id_)

    totalReads = len(readIds)

    print('Found a total of %d read%s (%d mapped, %d unmapped) mapping '
          'against %d of %d reference%s.' %
          (totalReads, '' if totalReads == 1 else 's',
           len(mapped), len(unmapped), len(referenceReads),
           len(referenceLengths),
           '' if len(referenceLengths) == 1 else 's'))

    if args.sortBy == 'count':
        def key(referenceId):
            return len(referenceReads[referenceId]['readIds'])
        sortedReferenceReads = sorted(referenceReads, key=key, reverse=True)
        topReference = sortedReferenceReads[0]
    else:
        # Sort the references by name.
        sortedReferenceReads = sorted(referenceReads)

    cumulativeReadIds = set()

    for count, referenceId in enumerate(sortedReferenceReads, start=1):
        stats = referenceReads[referenceId]
        readCount = len(stats['readIds'])
        if readCount == 0 and args.excludeZeroes:
            continue
        newReadCount = len(stats['readIds'] - cumulativeReadIds)
        if newReadCount == 0 and args.excludeIfNoAdditional:
            continue
        cumulativeReadIds.update(stats['readIds'])
        print('\nReference %d: %s (%d nt):\n'
              ' Overall reads mapped to the reference: %s\n'
              ' Non-duplicates: %s, Duplicates: %s, QC fails: %s\n'
              ' Primary: %s, Secondary: %s, Supplementary: %s\n'
              ' Reads not matching any reference above: %s\n'
              ' Previously unmatched reads for this reference: %s' %
              (count, referenceId, referenceLengths[referenceId],
               pct(readCount, totalReads),
               pct(len(stats['nonDuplicate']), readCount),
               pct(len(stats['duplicate']), readCount),
               pct(len(stats['qcFail']), readCount),
               pct(len(stats['primary']), readCount),
               pct(len(stats['secondary']), readCount),
               pct(len(stats['supplementary']), readCount),
               pct(newReadCount, totalReads),
               pct(newReadCount, readCount)))

    # Write out the (sorted) read ids of the reference with the most reads.
    if args.topReferenceIdsFile:
        with open(args.topReferenceIdsFile, 'w') as fp:
            print('\n'.join(
                sorted(referenceReads[topReference]['readIds'])), file=fp)
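# The pct helper used above is not defined in this section. A plausible
# minimal sketch (an assumption about its behavior, inferred from how it
# is called with a count and a total):
def pct(a, b):
    # Format a count alongside the percentage of the total it
    # represents, guarding against division by zero.
    return ('%d/%d (%.2f%%)' % (a, b, a / b * 100.0) if b else
            '%d/%d (0.00%%)' % (a, b))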
def referenceInfo():
    return {
        'readIds': set(),
        'primaryCount': 0,
        'secondaryCount': 0,
        'supplementaryCount': 0,
    }


referenceReads = defaultdict(referenceInfo)
unmappedCount = 0
readIds = set()
mappingCount = 0

with samfile(args.samFile) as fp:
    for read in fp.fetch():
        mappingCount += 1
        readIds.add(read.query_name)
        if read.is_unmapped:
            unmappedCount += 1
        else:
            stats = referenceReads[read.reference_name]
            stats['readIds'].add(read.query_name)
            if read.is_secondary:
                stats['secondaryCount'] += 1
            elif read.is_supplementary:
                stats['supplementaryCount'] += 1
            else:
                stats['primaryCount'] += 1
def consensusFromBAM(bamFilename, bamId=None, referenceFasta=None,
                     fastaId=None, consensusId=None, idLambda=None,
                     threshold=0.8, minCoverage=1, lowCoverage='reference',
                     noCoverage='reference', deletionSymbol='-',
                     deletionThreshold=0.5, ignoreQuality=False,
                     insertionCountThreshold=5, strategy='fetch',
                     includeSoftClipped=False, compareWithPileupFile=None,
                     progress=False, quiet=False):
    """
    Build a consensus sequence from a BAM file.

    @param bamFilename: the BAM file.
    @param bamId: A C{str} BAM file reference name indicating which aligned
        reads to make a consensus from. If not given, will be inferred
        from the BAM file header.
    @param referenceFasta: A C{str} file name containing the sequence that
        was aligned to in making the BAM file.
    @param fastaId: A C{str} reference name indicating which sequence in
        C{referenceFasta} to use as a reference. Only considered if
        C{referenceFasta} is given. If not given and C{referenceFasta} is,
        the reference id will be inferred from reference names in the BAM
        header, or will be taken as the id of the first sequence in
        C{referenceFasta}.
    @param consensusId: The C{str} id to use in the consensus sequence. If
        not given, the BAM reference id with '-consensus' appended will be
        used.
    @param idLambda: A one-argument function taking and returning a
        sequence id. This can be used to set the id of the consensus
        sequence based on the id of the reference sequence. The function
        will be called with the id of the BAM reference sequence.
    @param threshold: A C{float} threshold. This fraction, at least, of the
        most-common nucleotides at a site are used to determine the
        consensus nucleotide (or ambiguous symbol if more than one
        nucleotide is required to achieve this threshold). If there is a
        tie in nucleotide counts at a site that causes the threshold to be
        met, all nucleotides of equal frequency will be included in the
        ambiguous symbol for that site. This is perhaps better explained
        with an example. See
        https://assets.geneious.com/manual/2020.1/static/GeneiousManualse43.html
        and the corresponding testGeneiousExamplesTie test in
        test/test_dna.py
    @param minCoverage: An C{int} minimum number of reads that must cover a
        site for a consensus base to be called. If zero reads cover a
        site, the C{noCoverage} value is used, or if the number is greater
        than zero but less than C{minCoverage}, the C{lowCoverage} value
        is used.
    @param lowCoverage: A C{str} indicating what to do when some reads
        cover a site, but fewer than C{minCoverage}. Either 'reference' or
        a single character (e.g., 'N').
    @param noCoverage: A C{str} indicating what to do when no reads cover a
        reference base. Either 'reference' or a single character (e.g.,
        'N').
    @param deletionSymbol: The C{str} to insert in the consensus when a
        deleted site is detected.
    @param deletionThreshold: If some reads have a deletion at a site and
        some do not, call the site as a deletion if the C{float} fraction
        of reads with the deletion is at least this value.
    @param ignoreQuality: If C{True}, ignore quality scores.
    @param insertionCountThreshold: The C{int} number of reads that must
        have an insertion at an offset in order for the insertion to be
        called in the consensus.
    @param strategy: A C{str} consensus-making strategy.
    @param includeSoftClipped: Include information from read bases that
        were marked as soft-clipped by the algorithm that made the BAM
        file.
    @param compareWithPileupFile: If not C{None}, compare the base counts
        from the pysam fetch method with those of the pileup method. This
        pays no attention to insertions.
        A summary of the result is written to this file.
    @param progress: If C{True}, display a progress bar on standard error.
    @param quiet: If C{True}, suppress diagnostic output. Note that this
        will silence warnings about differing reference names.
    @raise UnspecifiedReference: If no id is provided to indicate which BAM
        file reference to call a consensus for.
    @raise UnknownReference: If a requested reference id is unknown.
    @raise UnequalReferenceLengthError: If the passed reference does not
        have a length identical to the length mentioned in the BAM file.
    @raise ReferenceNameMismatchError: If the name of the FASTA reference
        sequence and the BAM reference do not agree (this is not raised if
        both ids are given explicitly).
    @return: A C{Read} instance with the consensus sequence.
    """
    if referenceFasta is None:
        if lowCoverage == 'reference':
            raise UnspecifiedReference('lowCoverage is "reference" but no '
                                       'reference FASTA file was given.')
        if noCoverage == 'reference':
            raise UnspecifiedReference('noCoverage is "reference" but no '
                                       'reference FASTA file was given.')

    with samfile(bamFilename) as bam:

        bamId, reference, referenceLength = getReferenceInfo(
            bam, bamFilename, bamId, referenceFasta, fastaId, quiet)

        if consensusId is None:
            consensusId = getConsensusId(bamId, idLambda)

        correspondences, deletions, insertions = getPairs(
            bam, bamId, referenceLength, ignoreQuality, includeSoftClipped,
            progress)

        if strategy == 'fetch':
            correspondences, consensusBases, otherBases, originalOffsets = (
                fetchConsensus(bam, correspondences, deletions, insertions,
                               reference, referenceLength, noCoverage,
                               deletionThreshold, ignoreQuality,
                               insertionCountThreshold, includeSoftClipped,
                               progress))
        else:
            raise ConsensusError(
                f'Unknown consensus strategy {strategy!r}.')

        if compareWithPileupFile:
            with openOr(compareWithPileupFile, 'w', sys.stderr) as fp:
                compareCorrespondences(
                    fp, correspondences,
                    pileupCorrespondences(bam, bamId, referenceLength,
                                          includeSoftClipped, progress),
                    threshold, minCoverage)

    consensus = basesToConsensus(consensusBases, otherBases,
                                 originalOffsets, reference,
                                 referenceLength, threshold, minCoverage,
                                 lowCoverage, noCoverage, deletionSymbol,
                                 progress)

    return DNARead(consensusId, consensus)
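# A hypothetical invocation of consensusFromBAM; the file names and the
# reference id are illustrative only. Following the threshold semantics
# in the docstring, with threshold=0.8 a site with base counts A:9 C:1 is
# called 'A' (0.9 >= 0.8), while a site with A:7 C:3 needs both bases to
# reach the threshold and is called with the IUPAC ambiguity code 'M'
# (A or C).
consensus = consensusFromBAM(
    'sample.bam',
    bamId='reference-1',
    referenceFasta='reference.fasta',
    threshold=0.8,
    minCoverage=5,
    lowCoverage='N',
    noCoverage='N')

print(consensus.id)
print(consensus.sequence)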