def ReadAdaptersFromScraps(bam):
    handles = []
    if bam.lower().endswith(".scraps.bam"):
        handles.append(IndexedBamReader(bam))
    else:
        # Iterate through each external resource, looking for scraps files to read
        ds = openDataSet(bam)
        for er in ds.externalResources:
            try:
                handle = IndexedBamReader(er.scraps)
            except Exception:
                # Skip resources without a readable, indexed scraps BAM
                continue
            handles.append(handle)

    adps = defaultdict(int)
    polyA = defaultdict(int)
    for handle in handles:
        # Parse the scraps.bam as usual
        for record in handle:
            if record.scrapType != "A":
                continue
            hn = record.holeNumber
            seq = record.peer.seq
            adps[hn] += 1
            tFrac = sum(1 for b in seq if b == "T") / float(len(seq))
            if tFrac > MIN_T:
                polyA[hn] += 1

    # Convert our counts into a T/F depending on whether there are polyAs
    res = {}
    for hn, v in adps.items():
        if v >= 2:
            res[hn] = "T" if polyA[hn] >= 1 else "F"
    return res
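
A minimal usage sketch for the function above; the scraps path is an assumption, and MIN_T is the module-level threshold the function already relies on:

# Hypothetical usage: count ZMWs flagged as carrying a polyA adapter
polyA_flags = ReadAdaptersFromScraps("movie1.scraps.bam")
n_polyA = sum(1 for flag in polyA_flags.values() if flag == "T")
print("{0} of {1} ZMWs carry a polyA adapter".format(n_polyA, len(polyA_flags)))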
Example No. 2
 def __init__(self, subreadsFname, scrapsFname=None):
     if not subreadsFname.endswith(".subreads.bam"):
         raise Exception, "Expecting a subreads.bam"
     if scrapsFname is None:
         scrapsFname = subreadsFname.replace("subreads.bam", "scraps.bam")
     self.subreadsF = IndexedBamReader(subreadsFname)
     self.scrapsF   = IndexedBamReader(scrapsFname)
     if (len(self.subreadsF.movieNames) != 1 or
         self.scrapsF.movieNames != self.subreadsF.movieNames):
         raise Exception, "Requires single movie BAM file, and matching scraps"
Example No. 3
 def __init__(self, subreadsFname, scrapsFname=None):
     if not subreadsFname.endswith(".subreads.bam"):
         raise Exception, "Expecting a subreads.bam"
     if scrapsFname is None:
         scrapsFname = subreadsFname.replace("subreads.bam", "scraps.bam")
     self.subreadsF = IndexedBamReader(subreadsFname)
     self.scrapsF = IndexedBamReader(scrapsFname)
     if (len(self.subreadsF.movieNames) != 1
             or self.scrapsF.movieNames != self.subreadsF.movieNames):
         raise Exception, "Requires single movie BAM file, and matching scraps"
Example No. 4
 def test_reheader_bam(self):
     ofn = "subreads_out.bam"
     bam_file = pbtestdata.get_file("subreads-bam")
     reheader_bam(bam_file, ofn, self.BIOSAMPLE_NAME, self.LIBRARY_NAME)
     assert op.isfile(ofn) and op.isfile(ofn + ".pbi")
     with IndexedBamReader(ofn) as bam_out:
         self._validate_bam(bam_out)
         with IndexedBamReader(bam_file) as bam_in:
             self._validate_input_bam(bam_in)
             self._validate_records(bam_in, bam_out)
Example No. 5
def ReadAdaptersFromScraps(bam, windows):
    handles = []
    if bam.lower().endswith(".scraps.bam"):
        handles.append(IndexedBamReader(bam))
    else:
        # Iterate through each external resource, looking for scraps files to read
        ds = openDataSet(bam)
        for er in ds.externalResources:
            try:
                handle = IndexedBamReader(er.scraps)
            except Exception:
                # Skip resources without a readable, indexed scraps BAM
                continue
            handles.append(handle)

    adps = defaultdict(list)
    for handle in handles:
        for record in handle:
            if record.scrapType != "A":
                continue
            hn = record.holeNumber
            # Skip records without alignments that passed QC
            try:
                qS, qE, _, _, _, _, _ = windows[hn]
            except KeyError:
                continue
            # Skip adapter records that don't abut the subread selected for this ZMW's alignment
            if record.qStart not in [qS, qE] and record.qEnd not in [qS, qE]:
                continue
            # If we made it this far, record the position and type of adapter
            seq = record.peer.seq
            tFrac = sum(1 for b in seq if b == "T") / float(len(seq))
            if tFrac < MIN_T:
                adps[hn].append((record.qStart, "TC6"))
            else:
                adps[hn].append((record.qStart, "POLYA"))

    # Convert each ZMW's adapter records into ordered type/location calls
    results = {}
    for hn, adpData in adps.items():
        if len(adpData) != 2:
            print("ERROR! ERROR! {0} adps for hn #{1}".format(
                len(adpData), hn))
            continue
        # Using the strand, sort the adps left-to-right (by alignment)
        _, _, _, _, _, strand, _ = windows[hn]
        if strand == 0:
            adpData = sorted(adpData)
        else:
            adpData = sorted(adpData, reverse=True)
        # Now ordered we can record both ADP types and locations
        leftTc6 = "T" if adpData[0][1] == "TC6" else "F"
        rightTc6 = "T" if adpData[1][1] == "TC6" else "F"
        leftPolyA = "T" if adpData[0][1] == "POLYA" else "F"
        rightPolyA = "T" if adpData[1][1] == "POLYA" else "F"
        results[hn] = (leftTc6, rightTc6, leftPolyA, rightPolyA)
    return results
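
The windows argument above is assumed to map hole numbers to 7-tuples whose first two fields are the selected subread's query start/end and whose sixth field is strand, matching the unpacking in the loop; a hypothetical sketch:

# Hypothetical sketch: tuple layout inferred from the unpacking above
windows = {4391: (250, 1870, None, None, None, 0, None)}
calls = ReadAdaptersFromScraps("movie1.scraps.bam", windows)
for hn, (lTc6, rTc6, lPolyA, rPolyA) in calls.items():
    print("{0}: TC6={1}/{2} polyA={3}/{4}".format(hn, lTc6, rTc6, lPolyA, rPolyA))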
Example No. 6
 def test_empty_bam_reads_in_range(self):
     with IndexedBamReader(data.getEmptyAlignedBam()) as bam:
         reads = bam.readsInRange("lambda_NEB3011",
                                  0,
                                  50000,
                                  justIndices=True)
         assert len(reads) == 0
Example No. 7
 def __read_bam(fn):
     # Open the reader without a 'with' block: the original returned from
     # inside one, which closes the handle before the caller can use it.
     if op.exists(fn + ".pbi"):
         return IndexedBamReader(fn)
     else:
         return BamReader(fn)
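
A hypothetical caller for the helper above (file name assumed); the caller owns the handle and must close it:

bam = __read_bam("movie1.subreads.bam")
try:
    print("{0} records".format(sum(1 for _ in bam)))
finally:
    bam.close()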
Example No. 8
def _verify_write_compare_ccs(testobj, inbamfns, zmws, outbamfn,
                              expected_movies, expected_len):
    """First verify input.bam and input.bam.pbi exist,
    next, extract zmws from input and write to an output bam,
    then compare ccs reads and zmws in input and output.
    """
    testobj.assertTrue(all(op.exists(fn) for fn in inbamfns))
    testobj.assertTrue(all(op.exists(fn + ".pbi") for fn in inbamfns))

    reader = BamCollection(*inbamfns)
    # verify movie names and length of reader.

    testobj.assertTrue(set(reader.movieNames) == set(expected_movies))
    testobj.assertTrue(
        len(reader) == expected_len, "%d != %d" % (len(reader), expected_len))

    # write ccs reads.
    with BamWriter(outbamfn, reader.header) as writer:
        for zmw in zmws:
            writer.write(reader[zmw].ccsRead)

    # make pbi and check
    make_pbi(outbamfn)
    testobj.assertTrue(op.exists(outbamfn + ".pbi"))

    # compare ccs reads in input and output.
    reader2 = IndexedBamReader(outbamfn)
    outzmws = []
    for r in reader2:
        outzmws.append(r.zmwName)
        other = reader[r.readName]
        testobj.assertTrue(compareBamRecords(r, other))

    # compare ccs zmws in input and output
    testobj.assertTrue(set(zmws) == set(outzmws))
Example No. 9
def readAlignments( alnFile, adps, minAlnLength=MIN_ALN_LENGTH, nReads=N_READS ):
    # Using that reader, parse the regions aligned to known adapters
    queryAdps = defaultdict(list)
    queryData = {}
    count = 0
    for record in IndexedBamReader( alnFile ):
        if record.tEnd - record.tStart < minAlnLength:
            continue
        count += 1
        if nReads and count > nReads:
            break
        zmw    = "{0}/{1}".format(record.movieName, record.holeNumber)
        refAdps = adps[record.referenceName]
        alnAdps = [adp for adp in refAdps
                   if adp[0] < record.tEnd and adp[1] > record.tStart]
        queryData[zmw] = AlignmentData( record )
        read   = record.read(aligned=False, orientation="genomic")
        for adpStart, adpEnd, adpType in alnAdps:
            clip = record.clippedTo(adpStart, adpEnd)
            # Skip adapters in SVs / large deletions, since we never had a chance
            if clip.aStart == clip.aEnd:
                continue
            aStart = clip.aStart - record.aStart
            aEnd   = clip.aEnd - record.aStart
            adpSeq = read[aStart:aEnd]
            alnAdp = AlignmentAdapter(zmw, clip.aStart, clip.aEnd, adpType, adpSeq)
            queryAdps[zmw].append( alnAdp )
    return (queryData, queryAdps)
Example No. 10
def alignment_info_from_bam(bam_file_name):
    """
    Extract subread information from an indexed BAM file.  This should be
    relatively fast since it will not access the BAM records directly.
    """
    by_movie = {}
    last_zmw_id = None
    with IndexedBamReader(bam_file_name) as bam:
        if len(bam) > 0:
            identities = bam.identity
            subread_lengths = bam.aEnd - bam.aStart
            for i_aln, rgId in enumerate(bam.qId):
                movie_name = bam.readGroupInfo(rgId).MovieName
                if movie_name not in by_movie:
                    by_movie[movie_name] = MovieAlignmentInfo(bam_file_name,
                                                              movie_name)
                m = by_movie[movie_name]
                hole_number = bam.holeNumber[i_aln]
                qs, qe = bam.qStart[i_aln], bam.qEnd[i_aln]
                rstart, rend = bam.aStart[i_aln], bam.aEnd[i_aln]
                identity = None
                if (qs, qe) == (-1, -1):
                    qs = 0
                    # XXX This is only used to key subreads so the exact value is
                    # not important - still clumsy though
                    qe = rend - rstart

                # Compound ids
                zmw_id = (movie_name, hole_number)
                subread_id = (movie_name, hole_number, qs, qe)

                this_a = []
                this_a.append(subread_lengths[i_aln])

                this_a.append(identities[i_aln])
                this_a.append(bam.readQual[i_aln])

                this_a.append(1.0 if zmw_id != last_zmw_id else 0.0)  # isFirst

                # modStart, a value without a clear meaning, so just write some
                # garbage
                this_a.append(99999)

                last_zmw_id = zmw_id

                if subread_id in m.datum:
                    warnings.warn("Duplicate subread %s" % str(subread_id))

                # No Z-score
                m.datum[subread_id] = tuple(this_a)

                if zmw_id not in m.max_subread or subread_lengths[i_aln] > m.max_subread[zmw_id][1]:
                    m.max_subread[zmw_id] = (subread_id, subread_lengths[i_aln])

                m.unrolled.setdefault(zmw_id, [99999, 0])
                m.unrolled[zmw_id][0] = min(m.unrolled[zmw_id][0], rstart)
                m.unrolled[zmw_id][1] = max(m.unrolled[zmw_id][1], rend)

    return by_movie #datum, unrolled, max_subread
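
A usage sketch for the function above (the BAM path is an assumption); MovieAlignmentInfo exposes the datum, unrolled, and max_subread tables noted in the return comment:

# Hypothetical usage: summarize per-movie subread counts and unrolled spans
by_movie = alignment_info_from_bam("aligned.subreads.bam")
for movie_name, info in by_movie.items():
    spans = [end - start for start, end in info.unrolled.values()]
    print("{0}: {1} subreads, widest unrolled span {2}".format(
        movie_name, len(info.datum), max(spans) if spans else 0))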
Example No. 11
def ReadAlignedBamFile(fns, tList):
    # Dictionaries for tracking adapter results
    adps = defaultdict(int)
    altAdps = defaultdict(int)

    # Dictionaries for tracking ZMW-level results
    cov = defaultdict(int)
    windows = {}
    for fn in fns:
        for record in IndexedBamReader(fn):
            # Skip secondary alignments
            if record.MapQV == 0:
                continue

            # We have nothing to learn from subreads with no adapters
            leftT, rightT = ParseAdapterTypes(record)
            if leftT is None and rightT is None:
                continue

            hn = record.holeNumber

            # Adapter stats are duplicated between the subreads that sandwich them, so
            #  we arbitrarily pick the right side here to avoid double-counts when
            #  recording information about our adapters
            if rightT is not None:
                adps[hn] += 1
                if rightT == 1:
                    altAdps[hn] += 1

            # If we made it past all of our filters, parse the rest of the data we want:
            tId = record.tId
            tStart = record.tStart
            tEnd = record.tEnd
            tCov = tEnd - tStart

            # Search our target list for targets fully spanned by our current subread
            target = "OFF"
            for tName, _, tTid, _, tRS, tRE, _ in tList:
                if tStart < tRS and tEnd > tRE:
                    target = tName
                    break

            # If our coverage for this subread is better than anything we've already seen
            #  for this ZMW, keep it instead
            if tCov > cov[hn]:
                cov[hn] = tCov
                windows[hn] = (hn, tId, tStart, tEnd, target)

    # Convert our dictionary of windows to a flat, sorted list
    windowResults = sorted(v for k, v in windows.items())

    # Convert our adapter counts into a T/F depending on whether there are polyAs
    adpResults = {}
    for hn, v in adps.items():
        if v >= 2:
            adpResults[hn] = "T" if altAdps[hn] >= 1 else "F"

    # Return a tuple containing both our cleaned up Window and Adapter results
    return (windowResults, adpResults)
Example No. 12
 def test_alignment_identity(self):
     """
     Check that the values of the 'identity' property are consistent
     between IndexedBamReader (numpy array) and BamAlignment (float)
     """
     fn = data.getAlignedBam()
     with IndexedBamReader(fn) as bam_in:
         i1 = bam_in.identity
         i2 = np.array([rec.identity for rec in bam_in])
         assert (i2 == i1).all()
Example No. 13
 def test_alignment_identity_unindexed(self):
     """
     Check that the value of the 'identity' property is the same whether
     or not the .pbi index was used to calculate it.
     """
     fn1 = data.getAlignedBam()
     fn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
     shutil.copyfile(fn1, fn2)
     with IndexedBamReader(fn1) as bam_pbi:
         with BamReader(fn2) as bam_noindex:
             i1 = np.array([rec.identity for rec in bam_pbi])
             i2 = np.array([rec.identity for rec in bam_noindex])
             assert (i2 == i1).all()
Example No. 14
def _iter_bam_files(input_file):
    if input_file.endswith(".xml"):
        with openDataFile(input_file) as ds_in:
            if not ds_in.isIndexed:
                log.warning("Unindexed file(s), this may be very slow")
            for rr in ds_in.resourceReaders():
                yield rr
    else:
        if op.exists(input_file + ".pbi"):
            with IndexedBamReader(input_file) as bam_in:
                yield bam_in
        else:
            with BamReader(input_file) as bam_in:
                yield bam_in
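
A minimal sketch of consuming the generator above (the dataset path is an assumption):

# Hypothetical usage: total record count across all yielded readers
n_records = 0
for reader in _iter_bam_files("mapped.alignmentset.xml"):
    n_records += sum(1 for _ in reader)
print("{0} records total".format(n_records))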
Example No. 15
 def test_split_bam(self):
     bam_file1 = self._get_bam_path(self.DS1)
     CHUNKS_IN = [1, 2, 3, 4]
     CHUNKS_OUT = [1, 2, 3, 3]
     for n_in, n_expected in zip(CHUNKS_IN, CHUNKS_OUT):
         nchunks = split_bam(bam_file1, n_in)
         assert nchunks == n_expected
         bam_in = IndexedBamReader(bam_file1)
         records_in = [rec.qName for rec in bam_in]
         records_out = []
         for i in range(n_expected):
             bam_out = BamReader("reads.chunk%d.bam" % i)
             records_out.extend([rec.qName for rec in bam_out])
         assert records_in == records_out
         self._remove_all()
Example No. 16
def openIndexedAlignmentFile(fname,
                             referenceFastaFname=None,
                             sharedIndex=None):
    """
    Factory function to get a handle to a reader for an alignment file (BAM);
    requires a bam.pbi index.

    The reference FASTA, if provided, must have a FASTA index
    (fasta.fai).
    """
    if fname.endswith(".h5"):
        raise_no_h5()
    elif fname.endswith("bam"):
        return IndexedBamReader(fname,
                                referenceFastaFname=referenceFastaFname,
                                sharedIndex=sharedIndex)
    else:
        raise ValueError("Invalid alignment file suffix")
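
A hedged usage sketch (file and reference names are assumptions); the returned IndexedBamReader can be used as a context manager:

# Hypothetical usage: fetch reads overlapping a reference window
with openIndexedAlignmentFile("aligned.bam",
                              referenceFastaFname="reference.fasta") as f:
    hits = list(f.readsInRange("chr1", 0, 1000))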
Example No. 17
 def getMetrics(cls):
     cls.subreads_file = None
     for file_id, file_info in cls.datastore.get_file_dict().items():
         if file_info.file_type_id == FileTypes.DS_SUBREADS.file_type_id:
             cls.subreads_file = file_info.path
             break
     with SubreadSet(cls.subreads_file) as ds_in:
         cls.metric_dict["n_subreads"] = len(ds_in)
         cls.zmws = set()
         for bam in ds_in.resourceReaders():
             cls.zmws.update(set(list(bam.holeNumber)))
         cls.metric_dict["n_reads"] = len(cls.zmws)
         n_bases = 0
         for er in ds_in.externalResources:
             for bam_file in [er.bam, er.scraps]:
                 bam = IndexedBamReader(bam_file)
                 n_bases += int((bam.qEnd - bam.qStart).sum())
         cls.metric_dict["n_bases"] = n_bases
Example No. 18
    def __init__(self, subread_set_path, zmws=None, subsampleto=None):

        self.subread_set_path = subread_set_path
        self.subread_set = SubreadSet(subread_set_path)
        self.framerate = (self.subread_set.resourceReaders()[0]
                          .readGroupTable.FrameRate[0])
        self.subsampleto = subsampleto

        dsets = [(self.subread_set, 'subreads')]
        # grab path to scraps if available
        if self.subread_set.externalResources[0].scraps:
            self.scraps = IndexedBamReader(
                self.subread_set.externalResources[0].scraps)
            dsets.append((self.scraps, 'scraps'))

        # column info of the burst and reads tables
        self.ppa_burst_dtypes = self._set_ppa_burst_dtypes()
        self.reads_dtypes = self._set_reads_dtypes()

        if self._hasPpaBurstInfo(self.subread_set):
            if zmws is None:
                self.zmws = self._subsample_zmws()
            else:
                self.zmws = zmws
                log.info('Number of ZMWs ' + str(len(zmws)))

            results = []
            # if scraps info was present, scrape that for burst info, too
            for dset in reversed(dsets):
                ppa_bursts, reads = self.retrieve_classifier_bursts(
                    dset[0], dset[1])
                results.append((ppa_bursts, reads))
            if len(results) == 1:
                self.ppa_bursts = results[0][0]
                self.reads = results[0][1]
            elif len(results) == 2:
                subread_ppa_bursts = results[0][0]
                subread_reads = results[0][1]
                scraps_ppa_bursts = results[1][0]
                scraps_reads = results[1][1]
                self.ppa_bursts = np.hstack(
                    (subread_ppa_bursts, scraps_ppa_bursts))
                self.reads = np.hstack((subread_reads, scraps_reads))
Example No. 19
def openIndexedAlignmentFile(fname,
                             referenceFastaFname=None,
                             sharedIndex=None):
    """
    Factory function to get a handle to a reader for an alignment file
    (cmp.h5 or BAM), requiring index capability (built-in for cmp.h5;
    requires a bam.pbi index for BAM).

    The reference FASTA, if provided, must have a FASTA index
    (fasta.fai).
    """
    if fname.endswith("cmp.h5"):
        return CmpH5Reader(fname, sharedIndex=sharedIndex)
    elif fname.endswith("bam"):
        return IndexedBamReader(fname,
                                referenceFastaFname=referenceFastaFname,
                                sharedIndex=sharedIndex)
    else:
        raise ValueError, "Invalid alignment file suffix"
Example No. 20
def ReadAdaptersFromScraps(bam):
    adps = defaultdict(int)
    polyA = defaultdict(int)
    with IndexedBamReader(bam) as handle:
        for record in handle:
            if record.scrapType != "A":
                continue
            hn = record.holeNumber
            seq = record.peer.seq
            adps[hn] += 1
            tFrac = sum(1 for b in seq if b == "T") / float(len(seq))
            if tFrac > MIN_T:
                polyA[hn] += 1

    # Convert our counts into a T/F depending on whether there are polyAs
    res = {}
    for hn, v in adps.items():
        if v >= 2:
            res[hn] = "T" if polyA[hn] >= 1 else "F"
    return res
Example No. 21
 def test_combine_with_header(self):
     bam_file = self._get_bam_path(self.DS1)
     bam_size = op.getsize(bam_file)
     # see above - these are known boundaries for this particular input
     byte_ranges = [(396, 26575), (26575, 77209), (77209, bam_size)]
     with open(bam_file, "rb") as bam_in:
         with open("header.bam", "wb") as header_out:
             header_out.write(bam_in.read(396))
         for i, (start, end) in enumerate(byte_ranges):
             with open("tmp.chunk%d.bam" % i, "wb") as chunk_out:
                 bam_in.seek(start)
                 nbytes = end - start
                 chunk_out.write(bam_in.read(nbytes))
     for i in range(3):
         combine_with_header("header.bam", "tmp.chunk%d.bam" % i, "combined.chunk%d.bam" % i)
     bam_in = IndexedBamReader(bam_file)
     records_in = [rec.qName for rec in bam_in]
     records_out = []
     for i in range(3):
         bam_out = BamReader("combined.chunk%d.bam" % i)
         records_out.extend([rec.qName for rec in bam_out])
     assert records_in == records_out
Example No. 22
def ReadAlignedBamFile(genome, fns):
    # Dictionaries for tracking ZMW-level results
    tDict = genome.targetDictionary()
    cov = defaultdict(int)
    adps = {}
    windows = {}
    for fn in fns:
        for record in IndexedBamReader(fn):
            # Skip secondary alignments
            if record.MapQV == 0:
                continue

            hn = record.holeNumber
            tId = record.tId
            tStart = record.tStart
            tEnd = record.tEnd
            tCov = tEnd - tStart

            adpTypes = ParseAdapterTypes(record)

            # Search our target list for targets fully spanned by our current subread
            target = "OFF"
            for tName, _, tTid, _, tRS, tRE, _ in tDict[tId]:
                if tStart < tRS and tEnd > tRE:
                    target = tName
                    break

            # If our coverage for this subread is better than anything we've already seen
            #  for this ZMW, keep it instead
            if tCov > cov[hn]:
                cov[hn] = tCov
                windows[hn] = (hn, tId, tStart, tEnd, target)
                adps[hn] = adpTypes

    # Return a tuple containing both our cleaned up Window and Adapter results
    return (windows, adps)
Example No. 23
def main(parser):

    args = parser.parse_args()

    def makeFqName(bcPair):
        return '{}/{}--{}.fastq'.format(args.outDir,
                                        *[bcNames[i] for i in bcPair])

    bcNames = {
        i: rec.name
        for i, rec in enumerate(FastaReader(args.barcodeFasta))
    }
    bcNames[-1] = 'NoBC'
    bam = IndexedBamReader(args.ccsBAM)

    for bcPair in set(zip(bam.bcForward, bam.bcReverse)):
        with FastqWriter(makeFqName(bcPair)) as writer:
            for rec in bam[(bam.bcForward == bcPair[0])
                           & (bam.bcReverse == bcPair[1])]:
                header = rec.readName
                if not args.noBcQual:
                    header += ' bq=%i' % rec.bcQual
                writer.writeRecord(header, rec.read(aligned=False),
                                   rec.peer.query_qualities)
Example No. 24
def _verify_write_compare_subreads(testobj, inbamfns, zmws, outbamfn):
    """First verify that input bam and pbi files exist,
    next extract zmws from inputs and write to outbamfn,
    then compare bam records in input and output."""
    # Verify that input.bam and input.bam.pbi exist
    testobj.assertTrue(all(op.exists(fn) for fn in inbamfns))
    testobj.assertTrue(all(op.exists(fn + ".pbi") for fn in inbamfns))

    reader = BamCollection(*inbamfns)
    writer = BamWriter(outbamfn, reader.header)
    for zmw in zmws:
        for sr in reader[zmw].subreads:
            writer.write(sr)
    writer.close()

    # make pbi for outbamfn
    make_pbi(outbamfn)
    testobj.assertTrue(op.exists(outbamfn + ".pbi"))

    # Read subreads from outbamfn and compare.
    reader2 = IndexedBamReader(outbamfn)
    for r in reader2:
        other = reader[r.readName]
        testobj.assertTrue(compareBamRecords(r, other))
Example No. 25
class ZmwReadStitcher(object):
    """
    A reader class that enables viewing the read records corresponding
    to a given ZMW, as present in a paired subreads.bam and
    scraps.bam, as if they were a contiguous ZMW read record.
    """
    def __init__(self, subreadsFname, scrapsFname=None):
        if not subreadsFname.endswith(".subreads.bam"):
            raise Exception, "Expecting a subreads.bam"
        if scrapsFname is None:
            scrapsFname = subreadsFname.replace("subreads.bam", "scraps.bam")
        self.subreadsF = IndexedBamReader(subreadsFname)
        self.scrapsF   = IndexedBamReader(scrapsFname)
        if (len(self.subreadsF.movieNames) != 1 or
            self.scrapsF.movieNames != self.subreadsF.movieNames):
            raise Exception, "Requires single movie BAM file, and matching scraps"

    @property
    def filename(self):
        return self.subreadsF.filename

    @property
    def hasPulseFeatures(self):
        return (self.subreadsF.hasPulseFeatures() and
                self.scrapsF.hasPulseFeatures())

    @property
    @cached
    def sequencingZmws(self):
        """
        Hole numbers for which we have basecalls and an HQ region
        """
        return sorted(set(self.subreadsF.holeNumber))

    @property
    @cached
    def allSequencingZmws(self):
        """
        Hole numbers for which we have basecalls
        """
        return sorted(set.union(set(self.subreadsF.holeNumber),
                                set(self.scrapsF.holeNumber)))

    def __getitem__(self, holeNumber):
        if holeNumber not in self.allSequencingZmws:
            raise IndexError, "Requested hole number has no entry in this BAM file"
        subreads = self.subreadsF.readsByHoleNumber(holeNumber)
        scraps = self.scrapsF.readsByHoleNumber(holeNumber)
        combined = sorted(subreads + scraps, key=lambda x: x.qStart)
        return StitchedZmw(self, combined)

    @property
    @cached
    def featureDescs(self):
        rgs = self.subreadsF.peer.header["RG"]
        assert len(rgs) == 1
        rg = rgs[0]
        dsEntries = set(pair.split("=")[0]
                        for pair in rg["DS"].split(";"))
        manifestNames = dsEntries.intersection(_possibleFeatureManifestNames)
        return {desc.accessorName: desc
                for desc in FEATURE_DESCS
                if desc.nameInManifest in manifestNames}

    @property
    @cached
    def frameRate(self):
        return self.subreadsF.readGroupTable[0].FrameRate

    @property
    @cached
    def movieName(self):
        mns = list(self.subreadsF.movieNames)
        assert len(mns) == 1
        return mns[0]
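
A minimal sketch of using the stitcher above, assuming a single-movie movie1.subreads.bam with a matching movie1.scraps.bam beside it:

# Hypothetical usage: look up the stitched record for one hole number
stitcher = ZmwReadStitcher("movie1.subreads.bam")
print("movie {0}: {1} ZMWs with an HQ region".format(
    stitcher.movieName, len(stitcher.sequencingZmws)))
zmw = stitcher[stitcher.sequencingZmws[0]]  # a StitchedZmw instance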
Example No. 26
 def test_empty_bam(self):
     fn = data.getEmptyBam()
     bam = IndexedBamReader(fn)
     EQ(len(bam), 0)
Example No. 27
 def test_empty_bam(self):
     fn = data.getEmptyBam()
     bam = IndexedBamReader(fn)
     assert len(bam) == 0
Example No. 28
 def setup_class(cls):
     cls.f = IndexedBamReader(cls.BAM_FILE)
Example No. 29
 def test_read_lima_demultiplexed_bam(self):
     fn = "/pbi/dept/secondary/siv/testdata/pbcore-unittest/data/demultiplex.lbc1--lbc1.bam"
     bam = IndexedBamReader(fn)
     assert str(
         bam[0]
     ) == "Unmapped BAM record: m54008_160219_003234/74056024/1184_3910"
Example No. 30
def filter_reads(input_bam,
                 output_bam,
                 whitelist=None,
                 blacklist=None,
                 percentage=None,
                 count=None,
                 seed=None,
                 ignore_metadata=False,
                 relative=None,
                 anonymize=False,
                 use_barcodes=False,
                 sample_scraps=False,
                 keep_original_uuid=False,
                 use_subreads=False,
                 min_adapters=None):
    _validate_settings(output_bam, whitelist, blacklist, percentage, count,
                       min_adapters)
    output_bam = op.abspath(output_bam)
    if seed is not None:
        random.seed(seed)
    output_ds = base_name = None
    if output_bam.endswith(".xml"):
        if not input_bam.endswith(".xml"):
            raise UserError(
                "DataSet output only supported for DataSet inputs.")
        ds_type = output_bam.split(".")[-2]
        ext2 = OrderedDict([("subreadset", "subreads"),
                            ("alignmentset", "subreads"),
                            ("consensusreadset", "ccs"),
                            ("consensusalignmentset", "ccs"),
                            ("transcriptset", "transcripts"),
                            ("transcriptalignmentset", "transcripts")])
        if ds_type not in ext2:
            raise ValueError(
                "Invalid output file extension '{t}.xml'; valid extensions are:\n{e}"
                .format(t=ds_type,
                        e="\n".join(["  %s.xml" % e for e in ext2.keys()])))
        output_ds = output_bam
        base_name = ".".join(output_ds.split(".")[:-2])
        output_bam = base_name + "." + ".".join([ext2[ds_type], "bam"])
    if output_bam == input_bam:
        raise UserError("Input and output files must not be the same path")
    elif not output_bam.endswith(".bam"):
        raise UserError("Output file name must end in either '.bam' or '.xml'")
    n_file_reads = 0
    have_zmws = set()
    scraps_bam = barcode_set = sts_xml = None
    with openDataFile(input_bam) as ds_in:
        if not isinstance(ds_in, ReadSet):
            raise UserError("{t} is not an allowed dataset type".format(
                t=type(ds_in).__name__))
        # TODO(nechols)(2016-03-11): refactor this to enable propagation of
        # filtered scraps
        if not ds_in.isIndexed:
            raise UserError("Input BAM must have accompanying .pbi index")
        for ext_res in ds_in.externalResources:
            if ext_res.barcodes is not None:
                assert barcode_set is None or barcode_set == ext_res.barcodes
                barcode_set = ext_res.barcodes
            if ext_res.sts is not None:
                if sts_xml is None:
                    sts_xml = ext_res.sts
                else:
                    log.warning("Multiple sts.xml files, will not propagate")
        f1 = ds_in.resourceReaders()[0]
        if percentage is not None or count is not None or min_adapters is not None:
            bam_readers = list(ds_in.resourceReaders())
            if sample_scraps:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                        bam_readers.append(scraps_in)
            whitelist = _create_whitelist(bam_readers, percentage, count,
                                          min_adapters)
        # convert these to Python sets
        if use_subreads:
            _whitelist = _process_subread_list(whitelist)
            _blacklist = _process_subread_list(blacklist)
        else:
            _whitelist = _process_zmw_list(whitelist)
            _blacklist = _process_zmw_list(blacklist)
        scraps_in = None
        if output_ds is not None and output_ds.endswith(".subreadset.xml"):
            for ext_res in ds_in.externalResources:
                if ext_res.scraps is not None:
                    if use_barcodes:
                        log.warning("Scraps BAM is present but lacks " +
                                    "barcodes - will not be propagated " +
                                    "to output SubreadSet")
                    else:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                    break
        with AlignmentFile(output_bam, 'wb', template=f1.peer) as bam_out:
            for bam_in in ds_in.resourceReaders():
                n_records, have_zmws_ = _process_bam_whitelist(
                    bam_in,
                    bam_out,
                    whitelist=_whitelist,
                    blacklist=_blacklist,
                    use_barcodes=use_barcodes,
                    anonymize=anonymize,
                    use_subreads=use_subreads,
                    qid2mov=ds_in.qid2mov)
                n_file_reads += n_records
                have_zmws.update(have_zmws_)
        if scraps_in is not None:
            scraps_bam = re.sub("subreads.bam$", "scraps.bam", output_bam)
            with AlignmentFile(scraps_bam, 'wb',
                               template=scraps_in.peer) as scraps_out:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        scraps_in_ = IndexedBamReader(ext_res.scraps)
                        n_records, have_zmws_ = _process_bam_whitelist(
                            scraps_in_,
                            scraps_out,
                            _whitelist,
                            _blacklist,
                            use_barcodes=use_barcodes,
                            anonymize=anonymize,
                            use_subreads=use_subreads)
                        have_zmws.update(have_zmws_)
    if n_file_reads == 0:
        log.warn("No reads written")
    else:
        log.info("{n} records from {z} ZMWs written".format(n=n_file_reads,
                                                            z=len(have_zmws)))

    def _run_pbindex(bam_file):
        try:
            subprocess.call(["pbindex", bam_file])
        except OSError as e:
            if e.errno == 2:
                log.warning("pbindex not present, will not create .pbi file")
            else:
                raise

    _run_pbindex(output_bam)
    if output_ds is not None:
        with openDataSet(input_bam) as ds_in:
            ds_out = ds_in.__class__(output_bam)
            if scraps_bam is not None:
                _run_pbindex(scraps_bam)
                ds_out.externalResources[0].scraps = scraps_bam
                # XXX it doesn't pick up the .pbi file - sort of annoying
                # but since the pbcore API doesn't provide a read for the
                # scraps automatically anyway, the impact is minimal
            if barcode_set is not None:
                ds_out.externalResources[0].barcodes = barcode_set
            if sts_xml is not None:
                sts_xml_out = base_name + ".sts.xml"
                log.info("Copying {s} to {d}".format(s=sts_xml, d=sts_xml_out))
                shutil.copyfile(sts_xml, sts_xml_out)
                ds_out.externalResources[0].sts = sts_xml_out
            if not ignore_metadata:
                ds_out.metadata = ds_in.metadata
                ds_out.updateCounts()
            ds_out.name = ds_in.name + " (bamsieve)"
            ds_out.tags = ds_in.tags
            if relative:
                ds_out.makePathsRelative(op.dirname(output_ds))
            if keep_original_uuid:
                log.warning("Keeping input UUID {u}".format(u=ds_in.uuid))
                ds_out.objMetadata["UniqueId"] = ds_in.uuid
            ds_out.write(output_ds)
            log.info("wrote {t} XML to {x}".format(t=ds_out.__class__.__name__,
                                                   x=output_ds))
    return 0
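
A hedged invocation sketch for filter_reads; paths and parameter values are assumptions (subsample 1% of ZMWs from a SubreadSet with a fixed seed):

# Hypothetical invocation
filter_reads(input_bam="movie1.subreadset.xml",
             output_bam="sampled.subreadset.xml",
             percentage=1,
             seed=12345)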
Example No. 31
class ZmwReadStitcher(object):
    """
    A reader class that enables viewing the read records corresponding
    to a given ZMW, as present in a paired subreads.bam and
    scraps.bam, as if they were a contiguous ZMW read record.
    """
    def __init__(self, subreadsFname, scrapsFname=None):
        if not subreadsFname.endswith(".subreads.bam"):
            raise Exception, "Expecting a subreads.bam"
        if scrapsFname is None:
            scrapsFname = subreadsFname.replace("subreads.bam", "scraps.bam")
        self.subreadsF = IndexedBamReader(subreadsFname)
        self.scrapsF = IndexedBamReader(scrapsFname)
        if (len(self.subreadsF.movieNames) != 1
                or self.scrapsF.movieNames != self.subreadsF.movieNames):
            raise Exception, "Requires single movie BAM file, and matching scraps"

    @property
    def filename(self):
        return self.subreadsF.filename

    @property
    def hasPulseFeatures(self):
        return (self.subreadsF.hasPulseFeatures()
                and self.scrapsF.hasPulseFeatures())

    @property
    @cached
    def sequencingZmws(self):
        """
        Hole numbers for which we have basecalls and an HQ region
        """
        return sorted(set(self.subreadsF.holeNumber))

    @property
    @cached
    def allSequencingZmws(self):
        """
        Hole numbers for which we have basecalls
        """
        return sorted(
            set.union(set(self.subreadsF.holeNumber),
                      set(self.scrapsF.holeNumber)))

    def __getitem__(self, holeNumber):
        if holeNumber not in self.allSequencingZmws:
            raise IndexError, "Requested hole number has no entry in this BAM file"
        subreads = self.subreadsF.readsByHoleNumber(holeNumber)
        scraps = self.scrapsF.readsByHoleNumber(holeNumber)
        combined = sorted(subreads + scraps, key=lambda x: x.qStart)
        return StitchedZmw(self, combined)

    @property
    @cached
    def featureDescs(self):
        rgs = self.subreadsF.peer.header["RG"]
        assert len(rgs) == 1
        rg = rgs[0]
        dsEntries = set(pair.split("=")[0] for pair in rg["DS"].split(";"))
        manifestNames = dsEntries.intersection(_possibleFeatureManifestNames)
        return {
            desc.accessorName: desc
            for desc in FEATURE_DESCS if desc.nameInManifest in manifestNames
        }

    @property
    @cached
    def frameRate(self):
        return self.subreadsF.readGroupTable[0].FrameRate

    @property
    @cached
    def movieName(self):
        mns = list(self.subreadsF.movieNames)
        assert len(mns) == 1
        return mns[0]
Example No. 32
 def __init__(self):
     if not op.isfile(self.BAM_FILE):
         raise SkipTest("Testdata not present")
     self.f = IndexedBamReader(self.BAM_FILE)