Example #1
    def addSAM(self, filename, filterAlignment=None):
        """
        Read a SAM file and add information about the reads that match our
        reference id.

        @param filename: A C{str} SAM filename.
        @param filterAlignment: A 1-argument function to be used for filtering
            reads in the SAM file. If C{None}, all alignments will be examined.
        """
        self.samFiles.append(filename)
        referenceId = self.genomeAccession
        with samfile(filename) as sam:
            for column in sam.pileup():
                for read in column.pileups:
                    alignment = read.alignment
                    if (alignment.reference_name == referenceId
                            and (filterAlignment is None
                                 or filterAlignment(alignment))):
                        readId = alignment.query_name
                        self.readIdsMatchingGenome.add(readId)
                        offset = column.reference_pos
                        self.coveredOffsetCount[offset] += 1

                        try:
                            offsetInfo = self.offsets[offset]
                        except KeyError:
                            pass
                        else:
                            # This offset corresponds to one or more proteins.
                            self.coveredProteins.update(
                                offsetInfo['proteinAccessions'])
                            offsetInfo['readIds'].add(readId)
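A hedged usage sketch: the class that defines addSAM is not shown on this
page, so 'analysis' below is a hypothetical instance of it. The filter
function receives a pysam AlignedSegment, whose flag attributes are used.

def primaryNonDuplicate(alignment):
    # Keep only primary, non-duplicate alignments.
    return not (alignment.is_secondary or alignment.is_supplementary
                or alignment.is_duplicate)

analysis.addSAM('sample.bam', filterAlignment=primaryNonDuplicate)
print('%d reads matched the genome.' % len(analysis.readIdsMatchingGenome))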
Example #2
    def _getAlignedReferences(self, alignmentFiles):
        """
        Get the ids of all reference sequences in all alignment files.

        @param alignmentFiles: A C{list} of C{str} alignment file names.
        @return: A C{set} of C{str} reference ids as found in all passed
            alignment files.
        """
        # Get the names of all references in all alignment files.
        alignedReferences = set()
        for filename in alignmentFiles:
            with samfile(filename) as sam:
                for i in range(sam.nreferences):
                    alignedReferences.add(sam.get_reference_name(i))

        return alignedReferences
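Since a pysam AlignmentFile also exposes all of its reference names at once,
the loop over indices can be avoided. A minimal standalone sketch, assuming
the samfile context manager comes from dark.sam as elsewhere on this page:

from dark.sam import samfile


def alignedReferences(alignmentFiles):
    names = set()
    for filename in alignmentFiles:
        with samfile(filename) as sam:
            # pysam's references property holds all reference names.
            names.update(sam.references)
    return names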
Example #3
def referenceInfo():
    return {
        'duplicateCount': 0,
        'primaryCount': 0,
        'qcFailCount': 0,
        'nonDuplicateCount': 0,
        'readIds': set(),
        'secondaryCount': 0,
        'supplementaryCount': 0,
    }


referenceReads = defaultdict(referenceInfo)
mappedCount = unmappedCount = 0
readIds = set()

with samfile(args.samFile) as fp:
    for read in fp.fetch():
        readIds.add(read.query_name)
        if read.is_unmapped:
            unmappedCount += 1
        else:
            mappedCount += 1
            stats = referenceReads[read.reference_name]
            stats['readIds'].add(read.query_name)
            if read.is_secondary:
                stats['secondaryCount'] += 1
            elif read.is_supplementary:
                stats['supplementaryCount'] += 1
            else:
                stats['primaryCount'] += 1
            if read.is_duplicate:
                stats['duplicateCount'] += 1
            else:
                stats['nonDuplicateCount'] += 1
            if read.is_qcfail:
                stats['qcFailCount'] += 1
Example #4
if args.noFilter:
    # Do not do our custom SAM filtering.
    def filterRead(read):
        return not (read.is_del or read.is_refskip)
else:

    def filterRead(read):
        return (not (read.is_del or read.is_refskip)
                and samFilter.filterAlignment(read.alignment))


if printStats:
    counts = []

with samfile(args.samfile) as sam:

    if samFilter.referenceIds:
        # No need to check if the given reference id is in referenceLengths
        # because the samFilter.referenceLengths call above catches that.
        referenceId = samFilter.referenceIds.pop()
    else:
        if len(referenceLengths) == 1:
            referenceId = list(referenceLengths)[0]
        else:
            print('SAM file %r contains %d references (%s). Only one '
                  'reference id can be analyzed at a time. Please use '
                  '--referenceId to specify the one you want examined.' %
                  (args.samfile, len(referenceLengths), ', '.join(
                      sorted(referenceLengths))),
                  file=sys.stderr)
            sys.exit(1)
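The filterRead functions above receive pysam PileupRead instances (hence
is_del and is_refskip). A hedged sketch of how such a filter is typically
applied in a pileup loop, reusing args, referenceId and filterRead from the
example above; the counting structure is illustrative only:

from collections import Counter, defaultdict

baseCounts = defaultdict(Counter)

with samfile(args.samfile) as sam:
    for column in sam.pileup(contig=referenceId):
        for read in column.pileups:
            if filterRead(read):
                # filterRead excludes deletions and reference skips, so
                # query_position is not None here.
                base = read.alignment.query_sequence[read.query_position]
                baseCounts[column.reference_pos][base] += 1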
Example #5
    def initialReferenceIdAnalysis(self, referenceId, alignmentFile,
                                   outputDir):
        """
        Analyze the given reference id in the given alignment file (if an
        alignment to the reference id is present).

        @param referenceId: The C{str} id of the reference sequence to analyze.
        @param alignmentFile: The C{str} name of an alignment file.
        @param outputDir: The C{str} name of the output directory.
        @return: C{None} if C{referenceId} is not present in C{alignmentFile}
            or if no significant offsets are found. Else, a C{tuple} of
            (genome length, aligned reads, read count at each offset, base
            count at each offset, reads at each offset, significant offsets,
            the C{SAMFilter}, the C{PaddedSAM}).
        """

        # Make sure this reference id is in this alignment file and if so
        # get its length (and check it's the same as the length of the
        # sequence given in the reference file).
        with samfile(alignmentFile) as sam:
            tid = sam.get_tid(referenceId)
            if tid == -1:
                # This referenceId is not in this alignment file.
                self.report('    Reference %s not in alignment file.' %
                            referenceId)
                return
            else:
                genomeLength = sam.lengths[tid]
                # Sanity check.
                assert genomeLength == len(self.referenceGenomes[referenceId])

        if self.plotSAM:
            filename = join(outputDir, 'reads.html')
            self.report('    Saving reads alignment plot to %s' % filename)
            plotSAM(SAMFilter(alignmentFile, referenceIds={referenceId}),
                    filename,
                    title=referenceId,
                    jitter=0.45)

        alignedReads = []
        samFilter = SAMFilter(
            alignmentFile,
            referenceIds={referenceId},
            dropDuplicates=True,
            dropSupplementary=True,
            # dropSecondary=True,
            storeQueryIds=True)
        paddedSAM = PaddedSAM(samFilter)
        for query in paddedSAM.queries(addAlignment=True):
            assert len(query) == genomeLength
            alignedReads.append(
                AlignedRead(query.id, query.sequence, query.alignment))

        # Sanity check that all aligned reads have different ids. This
        # should be the case because the padded SAM queries method adds /2,
        # /3 etc to queries that have more than one alignment.
        assert len(alignedReads) == len(set(read.id for read in alignedReads))

        readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData(
            genomeLength, alignedReads)

        significantOffsets = list(
            findSignificantOffsets(baseCountAtOffset, readCountAtOffset,
                                   self.minReads, self.homogeneousCutoff))

        self.report(
            '    %d alignment%s (of %d unique %s) read from %s' %
            (samFilter.alignmentCount, s(samFilter.alignmentCount),
             len(samFilter.queryIds),
             'query' if len(samFilter.queryIds) == 1 else 'queries',
             alignmentFile))
        self.report('    %d of which %s aligned to %s' %
                    (len(alignedReads),
                     'was' if len(alignedReads) == 1 else 'were', referenceId))
        self.report('    Reference genome length %d' % genomeLength)
        self.report('    Found %d significant location%s' %
                    (len(significantOffsets), s(len(significantOffsets))))

        self.saveBaseFrequencies(outputDir, genomeLength, baseCountAtOffset)

        if not significantOffsets:
            self.report('    No significant locations found.')
            return

        if self.saveReducedFASTA:
            self.saveReducedFasta(significantOffsets, outputDir)

        self._plotCoverageAndSignificantLocations(referenceId, alignmentFile,
                                                  readCountAtOffset,
                                                  genomeLength,
                                                  significantOffsets,
                                                  outputDir)

        self.saveSignificantOffsets(significantOffsets, outputDir)

        for read in alignedReads:
            read.setSignificantOffsets(significantOffsets)

        self.saveReferenceBaseFrequencyPlot(referenceId, genomeLength,
                                            significantOffsets,
                                            baseCountAtOffset,
                                            readCountAtOffset, outputDir)

        # Save the reference.
        filename = join(outputDir, 'reference.fasta')
        self.report('    Saving reference to', filename)
        reference = self.referenceGenomes[referenceId]
        Reads([reference]).save(filename)

        # Extract a consensus according to bcftools.
        self.writeBcftoolsConsensus(referenceId, alignmentFile, outputDir)

        return (genomeLength, alignedReads, readCountAtOffset,
                baseCountAtOffset, readsAtOffset, significantOffsets,
                samFilter, paddedSAM)
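A hedged usage sketch of the tuple return value; 'analysis' is a
hypothetical instance of the (unshown) containing class, and Example #6
below performs exactly this unpacking:

result = analysis.initialReferenceIdAnalysis('ref-id', 'sample.bam',
                                             'output-dir')
if result is None:
    print('Reference not in alignment file, or no significant offsets.')
else:
    (genomeLength, alignedReads, readCountAtOffset, baseCountAtOffset,
     readsAtOffset, significantOffsets, samFilter, paddedSAM) = result
    print('%d significant offsets in a genome of length %d.' %
          (len(significantOffsets), genomeLength))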
Example #6
    def analyzeReferenceId(self, referenceId, alignmentFile, outputDir):
        """
        Analyze the given reference id in the given alignment file (if an
        alignment to the reference id is present).

        @param referenceId: The C{str} id of the reference sequence to analyze.
        @param alignmentFile: The C{str} name of an alignment file.
        @param outputDir: The C{str} name of the output directory.
        @return: C{None} if C{referenceId} is not present in C{alignmentFile}
            or if no significant offsets are found. Else, a C{dict} containing
            the significant offsets and the consensus sequence that best matches
            C{referenceId}.
        """
        analysis = self.initialReferenceIdAnalysis(referenceId, alignmentFile,
                                                   outputDir)

        if analysis:
            (genomeLength, alignedReads, readCountAtOffset, baseCountAtOffset,
             readsAtOffset, significantOffsets, samFilter,
             paddedSAM) = analysis
        else:
            return

        insignificantOffsets = set(
            range(genomeLength)) - set(significantOffsets)

        reference = self.referenceGenomes[referenceId]
        referenceSequence = reference.sequence

        consensus = []
        for base in referenceSequence:
            ob = OffsetBases()
            ob.incorporateBase(base)
            consensus.append(ob)

        readQueue = PriorityQueue()
        self.updatePriorityQueue(readQueue, alignedReads, consensus,
                                 significantOffsets)

        consensusFilename = join(outputDir, 'reference-consensus.sam')
        nonConsensusFilename = join(outputDir, 'reference-non-consensus.sam')
        self.report('    Writing consensus SAM to', consensusFilename)
        self.report('    Writing non-consensus SAM to', nonConsensusFilename)

        with samfile(alignmentFile) as sam:
            consensusAlignment = AlignmentFile(consensusFilename,
                                               mode='w',
                                               template=sam)
            nonConsensusAlignment = AlignmentFile(nonConsensusFilename,
                                                  mode='w',
                                                  template=sam)

        # Reads with no significant offsets get written to both output files.
        readsWithNoSignificantOffsetsCount = 0
        for read in alignedReads:
            if not read.significantOffsets:
                readsWithNoSignificantOffsetsCount += 1
                consensusAlignment.write(read.alignment)
                nonConsensusAlignment.write(read.alignment)

                for offset in insignificantOffsets:
                    base = read.base(offset)
                    if base is not None:
                        consensus[offset].incorporateBase(base)

        self.report('    %d read%s did not overlap any significant offsets' %
                    (readsWithNoSignificantOffsetsCount,
                     s(readsWithNoSignificantOffsetsCount)))

        readsMatchingConsensusCount = readsNotMatchingConsensusCount = 0
        cutoff = self.cutoff
        while readQueue:
            mismatchFraction, _ = readQueue.lowestPriority()
            read = readQueue.pop()
            if mismatchFraction <= cutoff:
                # We want this read. Incorporate it into the consensus.
                readsMatchingConsensusCount += 1
                consensusAlignment.write(read.alignment)
                affectedReads = set()
                for offset in read.significantOffsets:
                    readBase = read.base(offset)
                    consensus[offset].incorporateBase(readBase)
                    for readAtOffset in readsAtOffset[offset]:
                        if readAtOffset in readQueue:
                            affectedReads.add(readAtOffset)
                self.updatePriorityQueue(readQueue, affectedReads, consensus,
                                         significantOffsets)
            else:
                readsNotMatchingConsensusCount += 1
                nonConsensusAlignment.write(read.alignment)

        consensusAlignment.close()
        nonConsensusAlignment.close()

        self.report(
            '    %d read%s matched the consensus, %d did not.' %
            (readsMatchingConsensusCount, s(readsMatchingConsensusCount),
             readsNotMatchingConsensusCount))

        # Remove the reference bases from the consensus.
        for offset, base in enumerate(referenceSequence):
            consensus[offset].unincorporateBase(base)

        consensusInfoFilename = join(outputDir, 'reference-consensus.txt')
        self.report('    Writing consensus info to', consensusInfoFilename)

        with open(consensusInfoFilename, 'w') as fp:
            consensusSequence = []
            for offset in range(genomeLength):
                # Take a copy of the commonest set because we may pop from
                # it below.
                commonest = set(consensus[offset].commonest)
                referenceBase = referenceSequence[offset]

                if len(commonest) > 1:
                    nucleotides = ' Nucleotides: %s' % (
                        consensus[offset].baseCountsToStr())
                else:
                    nucleotides = ''

                if referenceBase in commonest:
                    consensusBase = referenceBase
                else:
                    if len(commonest) > 1:
                        # A draw (in which the reference base is not
                        # included and so cannot be used to break the
                        # draw). Pick one of the drawn bases arbitrarily.
                        consensusBase = commonest.pop()
                    elif len(commonest) == 1:
                        consensusBase = commonest.pop()
                    else:
                        # Nothing in the included reads covers this offset.
                        consensusBase = '-'

                consensusSequence.append(consensusBase)

                mismatch = '' if referenceBase == consensusBase else (
                    ' Mismatch (reference has %s)' % referenceBase)

                print('%d: %s%s%s' %
                      (offset + 1, consensusBase, mismatch, nucleotides),
                      file=fp)

        consensusRead = Read('greedy-consensus-%s' % referenceId,
                             ''.join(consensusSequence))
        consensusFilename = join(outputDir, 'reference-consensus.fasta')
        self.report('    Writing greedy consensus to', consensusFilename)
        Reads([consensusRead]).save(consensusFilename)

        return {
            'consensusRead': consensusRead,
            'significantOffsets': significantOffsets,
        }
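A hedged usage sketch; 'analysis' is again a hypothetical instance, and the
dict keys match the return statement above:

result = analysis.analyzeReferenceId('ref-id', 'sample.bam', 'output-dir')
if result:
    consensusRead = result['consensusRead']
    print('Consensus %s covers %d significant offsets.' %
          (consensusRead.id, len(result['significantOffsets'])))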
Example #7
def main(args):
    """
    Print SAM/BAM file reference read counts.

    @param args: An argparse namespace with information about parsed
        command-line options.
    """
    if args.topReferenceIdsFile and args.sortBy != 'count':
        print(
            '--topReferenceIdsFile only makes sense when using --sortBy '
            'count',
            file=sys.stderr)
        sys.exit(1)

    referenceReads = defaultdict(referenceInfo)
    mapped = set()
    unmapped = set()
    readIds = set()

    referenceLengths = SAMFilter(args.samFile).referenceLengths()

    with samfile(args.samFile) as fp:
        for read in fp.fetch():
            id_ = read.query_name
            readIds.add(id_)
            if read.is_unmapped:
                unmapped.add(id_)
            else:
                mapped.add(id_)
                stats = referenceReads[read.reference_name]
                stats['readIds'].add(id_)

                if read.is_secondary:
                    stats['secondary'].add(id_)
                elif read.is_supplementary:
                    stats['supplementary'].add(id_)
                else:
                    stats['primary'].add(id_)

                if read.is_duplicate:
                    stats['duplicate'].add(id_)
                else:
                    stats['nonDuplicate'].add(id_)

                if read.is_qcfail:
                    stats['qcFail'].add(id_)

    totalReads = len(readIds)

    print('Found a total of %d read%s (%d mapped, %d unmapped) mapping '
          'against %d of %d reference%s.' %
          (totalReads, '' if totalReads == 1 else 's', len(mapped),
           len(unmapped), len(referenceReads), len(referenceLengths),
           '' if len(referenceLengths) == 1 else 's'))

    if args.sortBy == 'count':

        def key(referenceId):
            return len(referenceReads[referenceId]['readIds'])

        sortedReferenceReads = sorted(referenceReads, key=key, reverse=True)
        topReference = sortedReferenceReads[0]
    else:
        # Sort the references by name
        sortedReferenceReads = sorted(referenceReads)

    cumulativeReadIds = set()

    for count, referenceId in enumerate(sortedReferenceReads, start=1):
        stats = referenceReads[referenceId]
        readCount = len(stats['readIds'])
        if readCount == 0 and args.excludeZeroes:
            continue
        newReadCount = len(stats['readIds'] - cumulativeReadIds)
        if newReadCount == 0 and args.excludeIfNoAdditional:
            continue
        cumulativeReadIds.update(stats['readIds'])
        print('\nReference %d: %s (%d nt):\n'
              '  Overall reads mapped to the reference: %s\n'
              '  Non-duplicates: %s, Duplicates: %s, QC fails: %s\n'
              '  Primary: %s, Secondary: %s, Supplementary: %s\n'
              '  Reads not matching any reference above: %s\n'
              '  Previously unmatched reads for this reference: %s' %
              (count, referenceId, referenceLengths[referenceId],
               pct(readCount, totalReads),
               pct(len(stats['nonDuplicate']), readCount),
               pct(len(stats['duplicate']), readCount),
               pct(len(stats['qcFail']), readCount),
               pct(len(stats['primary']), readCount),
               pct(len(stats['secondary']), readCount),
               pct(len(stats['supplementary']), readCount),
               pct(newReadCount, totalReads),
               pct(newReadCount, readCount)))

    # Write out the (sorted) read ids of the reference with the most reads.
    if args.topReferenceIdsFile:
        with open(args.topReferenceIdsFile, 'w') as fp:
            print('\n'.join(sorted(referenceReads[topReference]['readIds'])),
                  file=fp)
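The pct helper used in the report above is not shown on this page. A minimal
sketch of what it presumably does, namely format a count together with its
percentage of a total, guarding against a zero denominator:

def pct(count, total):
    # E.g., pct(3, 12) -> '3 (25.00%)'.
    if total:
        return '%d (%.2f%%)' % (count, count / total * 100.0)
    return '%d (0.00%%)' % count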
Example #8
def referenceInfo():
    return {
        'readIds': set(),
        'primaryCount': 0,
        'secondaryCount': 0,
        'supplementaryCount': 0,
    }


referenceReads = defaultdict(referenceInfo)
unmappedCount = 0
readIds = set()
mappingCount = 0

with samfile(args.samFile) as fp:
    for read in fp.fetch():
        mappingCount += 1
        readIds.add(read.query_name)
        if read.is_unmapped:
            unmappedCount += 1
        else:
            stats = referenceReads[read.reference_name]
            stats['readIds'].add(read.query_name)
            if read.is_secondary:
                stats['secondaryCount'] += 1
            elif read.is_supplementary:
                stats['supplementaryCount'] += 1
            else:
                stats['primaryCount'] += 1
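A hedged sketch of reporting the per-reference counts gathered above (the
layout is illustrative, not from the original script):

for referenceId in sorted(referenceReads):
    stats = referenceReads[referenceId]
    print('%s: %d reads (%d primary, %d secondary, %d supplementary)' %
          (referenceId, len(stats['readIds']), stats['primaryCount'],
           stats['secondaryCount'], stats['supplementaryCount']))
print('%d mappings in total, %d unmapped.' % (mappingCount, unmappedCount))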
Example #9
def consensusFromBAM(bamFilename,
                     bamId=None,
                     referenceFasta=None,
                     fastaId=None,
                     consensusId=None,
                     idLambda=None,
                     threshold=0.8,
                     minCoverage=1,
                     lowCoverage='reference',
                     noCoverage='reference',
                     deletionSymbol='-',
                     deletionThreshold=0.5,
                     ignoreQuality=False,
                     insertionCountThreshold=5,
                     strategy='fetch',
                     includeSoftClipped=False,
                     compareWithPileupFile=None,
                     progress=False,
                     quiet=False):
    """
    Build a consensus sequence from a BAM file.

    @param bamFilename: The C{str} name of the BAM file.
    @param bamId: A C{str} BAM file reference name indicating which aligned
        reads to make a consensus from. If not given, will be inferred
        from the BAM file header.
    @param referenceFasta: A C{str} file name containing the sequence that was
        aligned to in making the BAM file.
    @param fastaId: A C{str} reference name indicating which sequence in
        C{referenceFasta} to use as a reference. Only considered if
        C{referenceFasta} is given. If not given and C{referenceFasta} is,
        the reference id will be inferred from reference names in the BAM
        header, or will be taken as the id of the first sequence in
        C{referenceFasta}.
    @param consensusId: The C{str} id to use in the consensus sequence. If not
        given, the BAM reference id with '-consensus' appended will be used.
    @param idLambda: A one-argument function taking and returning a sequence
        id. This can be used to set the id of the consensus sequence based
        on the id of the reference sequence. The function will be called with
        the id of the BAM reference sequence.
    @param threshold: A C{float} threshold. This fraction, at least, of the
        most-common nucleotides at a site are used to determine the consensus
        nucleotide (or ambiguous symbol if more than one nucleotide is
        required to achieve this threshold). If there is a tie in nucleotide
        counts at a site that causes the threshold to be met, all nucleotides
        of equal frequency will be included in the ambiguous symbol for that
        site. This is perhaps better explained with an example. See
        https://assets.geneious.com/manual/2020.1/static/GeneiousManualse43.html
        and the corresponding testGeneiousExamplesTie test in test/test_dna.py
    @param minCoverage: An C{int} minimum number of reads that must cover a
        site for a consensus base to be called. If zero reads cover a site, the
        C{noCoverage} value is used or if the number is greater than zero but
        less than C{minCoverage}, the C{lowCoverage} value is used.
    @param lowCoverage: A C{str} indicating what to do when some reads cover a
        site, but fewer than C{minCoverage}. Either 'reference' or a single
        character (e.g., 'N').
    @param noCoverage: A C{str} indicating what to do when no reads cover a
        reference base. Either 'reference' or a single character (e.g., 'N').
    @param deletionSymbol: The C{str} to insert in the consensus when a deleted
        site is detected.
    @param deletionThreshold: If some reads have a deletion at a site and some
        do not, call the site as a deletion if the C{float} fraction of reads
        with the deletion is at least this value.
    @param ignoreQuality: If C{True}, ignore quality scores.
    @param insertionCountThreshold: The C{int} number of reads that must have
        an insertion at an offset in order for the insertion to be called in
        the consensus.
    @param strategy: A C{str} consensus-making strategy.
    @param includeSoftClipped: Include information from read bases that were
        marked as soft-clipped by the algorithm that made the BAM file.
    @param compareWithPileupFile: If C{True}, compare the base counts from the
        pysam fetch method with those of the pileup methods. This pays no
        attention to insertions. A summary of the result is written to this
        file.
    @param progress: If C{True}, display a progress bar on standard error.
    @param quiet: If C{True}, suppress diagnostic output. Note that this will
        silence warnings about differing reference names.
    @raise UnspecifiedReference: If no id is provided to indicate which BAM
        file reference to call a consensus for.
    @raise UnknownReference: If a requested reference id is unknown.
    @raise UnequalReferenceLengthError: If the passed reference does not have a
        length identical to the length mentioned in the BAM file.
    @raise ReferenceNameMismatchError: If the name of the FASTA reference
        sequence and the BAM reference do not agree (this is not raised if both
        ids are given explicitly).
    @return: A C{Read} instance with the consensus sequence.
    """
    if referenceFasta is None:
        if lowCoverage == 'reference':
            raise UnspecifiedReference('lowCoverage is "reference" but no '
                                       'reference FASTA file was given.')

        if noCoverage == 'reference':
            raise UnspecifiedReference('noCoverage is "reference" but no '
                                       'reference FASTA file was given.')

    with samfile(bamFilename) as bam:

        bamId, reference, referenceLength = getReferenceInfo(
            bam, bamFilename, bamId, referenceFasta, fastaId, quiet)

        if consensusId is None:
            consensusId = getConsensusId(bamId, idLambda)

        correspondences, deletions, insertions = getPairs(
            bam, bamId, referenceLength, ignoreQuality, includeSoftClipped,
            progress)

        if strategy == 'fetch':
            correspondences, consensusBases, otherBases, originalOffsets = (
                fetchConsensus(bam, correspondences, deletions, insertions,
                               reference, referenceLength, noCoverage,
                               deletionThreshold, ignoreQuality,
                               insertionCountThreshold, includeSoftClipped,
                               progress))
        else:
            raise ConsensusError(f'Unknown consensus strategy {strategy!r}.')

        if compareWithPileupFile:
            with openOr(compareWithPileupFile, 'w', sys.stderr) as fp:
                compareCorrespondences(
                    fp, correspondences,
                    pileupCorrespondences(bam, bamId, referenceLength,
                                          includeSoftClipped, progress),
                    threshold, minCoverage)

    consensus = basesToConsensus(consensusBases, otherBases, originalOffsets,
                                 reference, referenceLength, threshold,
                                 minCoverage, lowCoverage, noCoverage,
                                 deletionSymbol, progress)

    return DNARead(consensusId, consensus)
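A hedged usage sketch (the file names are hypothetical; the returned DNARead
exposes id and sequence attributes, as used elsewhere on this page):

consensus = consensusFromBAM('sample.bam',
                             referenceFasta='reference.fasta',
                             threshold=0.7,
                             minCoverage=5,
                             lowCoverage='N',
                             noCoverage='N')
print('>%s\n%s' % (consensus.id, consensus.sequence))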