Code example #1
File: barcode.py Project: tkerelska/pbreports
def resolved_tool_contract_runner(rtc):
    log.info("Starting {f} version {v} report generation".format(
        f=__file__, v=__version__))
    dataset_uuids = [
        openDataFile(rtc.task.input_files[0]).uuid,
        openDataFile(rtc.task.input_files[1]).uuid
    ]
    report = run_to_report_bam(
        reads=rtc.task.input_files[0],
        barcodes=rtc.task.input_files[1],
        subreads=True,
        dataset_uuids=dataset_uuids)
    log.info(pformat(report.to_dict()))
    report.write_json(rtc.task.output_files[0])
    return 0
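The examples in this list all rely on the same basic pattern. A minimal sketch of that pattern follows; it is not taken from any of the projects listed here, the import path (pbcore.io) is an assumption, and the file name is a placeholder.

from pbcore.io import openDataFile  # assumed import path

def peek_dataset(path):
    # openDataFile() inspects the input file and returns the matching DataSet
    # subclass (e.g. SubreadSet, AlignmentSet, ReferenceSet), which can be
    # used as a context manager.
    with openDataFile(path) as ds:
        print(type(ds).__name__)
        for reader in ds.resourceReaders():
            # one reader per external resource, typically a BAM file
            print(reader.filename, len(reader))

# Hypothetical call with a placeholder file name.
peek_dataset("example.subreadset.xml")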
Code example #2
 def test_incorrect_len_getitem(self):
     types = [AlignmentSet(data.getXml(8)),
              ReferenceSet(data.getXml(9)),
              SubreadSet(data.getXml(10)),
              HdfSubreadSet(data.getXml(19))]
     fn = tempfile.NamedTemporaryFile(suffix=".xml").name
     for ds in types:
         explen = -2
         with openDataFile(ds.toExternalFiles()[0]) as mystery:
             # try to avoid crashes...
             explen = len(mystery)
             mystery.numRecords = 1000000000
             mystery.write(fn)
         with openDataFile(fn) as mystery:
             self.assertEqual(len(list(mystery)), explen)
Code example #3
 def test_incorrect_len_getitem(self):
     types = [
         AlignmentSet(data.getXml(7)),
         ReferenceSet(data.getXml(8)),
         SubreadSet(data.getXml(9))
     ]
     fn = tempfile.NamedTemporaryFile(suffix=".xml").name
     for ds in types:
         explen = -2
         with openDataFile(ds.toExternalFiles()[0]) as mystery:
             # try to avoid crashes...
             explen = len(mystery)
             mystery.numRecords = 1000000000
             mystery.write(fn)
         with openDataFile(fn) as mystery:
             assert len(list(mystery)) == explen
Code example #4
File: test_bamsieve.py Project: mpkocher/pbcoretools
    def test_whitelist(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
        WHITELIST = set([24962, 32901, 30983])

        def _run_with_whitelist(wl):
            rc = bamSieve.filter_reads(
                input_bam=SUBREADS3,
                output_bam=ofn,
                whitelist=wl)
            self.assertEqual(rc, 0)
            with openDataFile(ofn, strict=False) as bam_out:
                have_zmws = set([rec.HoleNumber for rec in bam_out])
                self.assertEqual(have_zmws, WHITELIST)
        _run_with_whitelist(WHITELIST)
        _run_with_whitelist(",".join([str(x) for x in list(WHITELIST)]))
        tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
        with open(tmp_wl, "w") as wl_out:
            wl_out.write("\n".join([str(x) for x in list(WHITELIST)]))
        _run_with_whitelist(tmp_wl)
        # now with a BAM file as whitelist
        rc = bamSieve.filter_reads(
            input_bam=SUBREADS3,
            output_bam=ofn,
            whitelist=SUBREADS4)
        with openDataFile(ofn, strict=False) as bam_out:
            self.assertEqual(117, len([rec for rec in bam_out]))
Code example #5
File: rainbow.py Project: lpp1985/lpp_Script
def _read_in_indexed_alignmentset(in_fn, reference=None):
    """
    Extract data from the .pbi files in an AlignmentSet using numpy array
    operations.
    """
    lengths, percent_accs, map_qvs = [], [], []
    with openDataFile(in_fn) as ds:
        for bam in ds.resourceReaders():
            if len(bam) == 0:
                continue
            identities = bam.identity
            ref_name_to_id = {r.Name: r.ID for r in bam.referenceInfoTable}
            sel = np.full(len(identities), True, dtype=bool)
            bam_lengths = bam.pbi.aEnd - bam.pbi.aStart
            if reference is not None:
                ref_id = None
                # FIXME there must be a cleaner way to do this...
                for ref_info in bam.referenceInfoTable:
                    if ref_info.Name == reference:
                        ref_id = ref_info.ID
                        break
                sel = bam.pbi.tId == ref_id
            lengths.extend(bam_lengths[sel])
            percent_accs.extend(identities[sel])
            map_qvs.extend(bam.pbi.mapQV[sel])
    data = np.array([lengths, percent_accs, map_qvs])
    data = data.transpose()
    return data
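One possible way to consume the array returned by the helper above; this snippet is not part of rainbow.py, and the file name is a placeholder. Each row holds an aligned length, a percent accuracy, and a MapQV.

import numpy as np  # already assumed by the helper above

# Hypothetical call with a placeholder path.
data = _read_in_indexed_alignmentset("mapped.alignmentset.xml")
lengths, accuracies, map_qvs = data[:, 0], data[:, 1], data[:, 2]
print("{} alignments, median length {}".format(len(lengths), np.median(lengths)))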
Code example #6
    def test_split_zmws_targetsize(self):
        N_RECORDS = 117
        N_ZMWS = 48
        test_file = upstreamdata.getUnalignedBam()
        ds1 = openDataFile(test_file)
        assert len([r for r in ds1]) == N_RECORDS
        assert len(ds1) == N_RECORDS
        assert len(set(ds1.index.holeNumber)) == N_ZMWS

        # with no split
        dss = list(ds1.split(targetSize=1000, zmws=True))
        assert len(dss) == 1
        assert sum([len([r for r in ds_]) for ds_ in dss]) == N_RECORDS
        assert sum([len(ds_) for ds_ in dss]) == N_RECORDS
        exp = [48]
        obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
        assert exp == obs

        # with a split
        dss = list(ds1.split(targetSize=25, zmws=True))
        assert len(dss) == 2
        assert sum([len([r for r in ds_]) for ds_ in dss]) == N_RECORDS
        assert sum([len(ds_) for ds_ in dss]) == N_RECORDS
        exp = [24, 24]
        obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
        assert exp == obs

        # with a split
        dss = list(ds1.split(targetSize=5, zmws=True))
        assert len(dss) == 10
        assert sum([len([r for r in ds_]) for ds_ in dss]) == N_RECORDS
        assert sum([len(ds_) for ds_ in dss]) == N_RECORDS
        exp = [4, 4, 5, 5, 5, 5, 5, 5, 5, 5]
        obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
        assert exp == obs
Code example #7
def createXml(args):
    if args.dsType is None:
        dset = openDataFile(*args.infile,
                            strict=args.strict,
                            skipCounts=args.skipCounts,
                            generateIndices=args.generateIndices)
    else:
        dsTypes = DataSet.castableTypes()
        dset = dsTypes[args.dsType](*args.infile,
                                    strict=args.strict,
                                    skipCounts=args.skipCounts,
                                    generateIndices=args.generateIndices)
    if args.generateIndices:
        # we generated the indices with the last open, lets capture them with
        # this one:
        dset = dsTypes[args.dsType](*args.infile,
                                    strict=args.strict,
                                    skipCounts=args.skipCounts)
    if args.dsName != '':
        dset.name = args.dsName
    log.debug("Dataset created")
    dset.write(args.outfile,
               validate=args.novalidate,
               modPaths=True,
               relPaths=args.relative)
    log.debug("Dataset written")
    return 0
Code example #8
 def test_large_split_zmws(self):
     N_RECORDS = 959539
     test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                  "2372215/0007/Analysis_Results/m150404_101626_42"
                  "267_c100807920800000001823174110291514_s1_p0.al"
                  "l.subreadset.xml")
     ds1 = openDataFile(test_file)
     assert len(ds1) == N_RECORDS
     dss = list(ds1.split(chunks=1, zmws=True))
     assert len(dss) == 1
     assert sum([len(ds_) for ds_ in dss]) == N_RECORDS
     dss = list(ds1.split(chunks=12, zmws=True))
     assert len(dss) == 12
     assert sum([len(ds_) for ds_ in dss]) == N_RECORDS
     assert dss[0].zmwRanges == [
         ('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
          7, 14009)
     ]
     assert dss[-1].zmwRanges == [
         ('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
          149881, 163475)
     ]
     ranges = sorted([c.zmwRanges[0][1:] for c in dss])
     interspans = []
     last = None
     for rg in ranges:
         if not last is None:
             interspans.append((last, rg[0]))
             assert not last == rg[0]
         last = rg[1]
     for rg in interspans:
         assert len(
             np.nonzero(
                 np.logical_and(ds1.index.holeNumber < rg[1],
                                ds1.index.holeNumber > rg[0]))[0]) == 0
Code example #9
 def test_anonymize(self):
     ofn1 = tempfile.NamedTemporaryFile(suffix=".bam").name
     ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
     rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                output_bam=ofn1,
                                whitelist=set([24962]))
     assert rc == 0
     rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                output_bam=ofn2,
                                whitelist=set([24962]),
                                anonymize=True)
     assert rc == 0
     with openDataFile(ofn1) as bam1:
         with openDataFile(ofn2) as bam2:
             for rec1, rec2 in zip(bam1, bam2):
                 assert rec1.qName == rec2.qName
                 assert rec1.peer.seq != rec2.peer.seq
Code example #10
 def test_anonymize(self):
     ofn1 = tempfile.NamedTemporaryFile(suffix=".bam").name
     ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
     rc = bamSieve.filter_reads(input_bam=SUBREADS3,
                                output_bam=ofn1,
                                whitelist=set([24962]))
     self.assertEqual(rc, 0)
     rc = bamSieve.filter_reads(input_bam=SUBREADS3,
                                output_bam=ofn2,
                                whitelist=set([24962]),
                                anonymize=True)
     self.assertEqual(rc, 0)
     with openDataFile(ofn1) as bam1:
         with openDataFile(ofn2) as bam2:
             for rec1, rec2 in zip(bam1, bam2):
                 self.assertEqual(rec1.qName, rec2.qName)
                 self.assertNotEqual(rec1.peer.seq, rec2.peer.seq)
Code example #11
File: test_bamsieve.py Project: mpkocher/pbcoretools
 def _run_with_blacklist(bl):
     rc = bamSieve.filter_reads(
         input_bam=SUBREADS2,
         output_bam=ofn,
         blacklist=bl)
     self.assertEqual(rc, 0)
     with openDataFile(ofn, strict=False) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         self.assertEqual(have_zmws, set([9]))
Code example #12
File: test_bamsieve.py Project: mpkocher/pbcoretools
 def _run_with_whitelist(wl):
     rc = bamSieve.filter_reads(
         input_bam=SUBREADS3,
         output_bam=ofn,
         whitelist=wl)
     self.assertEqual(rc, 0)
     with openDataFile(ofn, strict=False) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         self.assertEqual(have_zmws, WHITELIST)
Code example #13
 def test_file_factory(self):
     # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
     types = [AlignmentSet(data.getXml(8)),
              ReferenceSet(data.getXml(9)),
              SubreadSet(data.getXml(10)),
              #ConsensusAlignmentSet(data.getXml(20)),
              HdfSubreadSet(data.getXml(19))]
     for ds in types:
         mystery = openDataFile(ds.toExternalFiles()[0])
         self.assertEqual(type(mystery), type(ds))
Code example #14
File: test_bamsieve.py Project: mpkocher/pbcoretools
 def test_anonymize(self):
     ofn1 = tempfile.NamedTemporaryFile(suffix=".bam").name
     ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
     rc = bamSieve.filter_reads(
         input_bam=SUBREADS3,
         output_bam=ofn1,
         whitelist=set([24962]))
     self.assertEqual(rc, 0)
     rc = bamSieve.filter_reads(
         input_bam=SUBREADS3,
         output_bam=ofn2,
         whitelist=set([24962]),
         anonymize=True)
     self.assertEqual(rc, 0)
     with openDataFile(ofn1, strict=False) as bam1:
         with openDataFile(ofn2, strict=False) as bam2:
             for rec1, rec2 in zip(bam1, bam2):
                 self.assertEqual(rec1.qName, rec2.qName)
                 self.assertNotEqual(rec1.peer.seq, rec2.peer.seq)
Code example #15
 def test_trust_counts(self):
     import pbtestdata
     f1 = pbtestdata.get_file("aligned-xml")
     f2 = pbtestdata.get_file("aligned-ds-2")
     ds = openDataFile(f1, f2, trustCounts=True)
     assert ds.numRecords == 133
     assert len(ds) == 133
     assert ds.totalLength == 274217
     assert ds._index is None
     assert len(ds._openReaders) == 0
Code example #16
 def test_file_factory(self):
     # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
     types = [
         AlignmentSet(data.getXml(7)),
         ReferenceSet(data.getXml(8)),
         SubreadSet(data.getXml(9))
     ]
     for ds in types:
         mystery = openDataFile(ds.toExternalFiles()[0])
         assert type(mystery) == type(ds)
Code example #17
File: test_bamsieve.py Project: mpkocher/pbcoretools
 def test_count(self):
     ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
     rc = bamSieve.filter_reads(
         input_bam=SUBREADS3,
         output_bam=ofn,
         count=1,
         seed=12345)
     self.assertEqual(rc, 0)
     with openDataFile(ofn, strict=False) as bam_out:
         zmws = set([rec.HoleNumber for rec in bam_out])
         self.assertEqual(len(zmws), 1)
Code example #18
 def test_file_factory(self):
     # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
     types = [
         AlignmentSet(data.getXml(8)),
         ReferenceSet(data.getXml(9)),
         SubreadSet(data.getXml(10)),
         #ConsensusAlignmentSet(data.getXml(20)),
         HdfSubreadSet(data.getXml(19))
     ]
     for ds in types:
         mystery = openDataFile(ds.toExternalFiles()[0])
         self.assertEqual(type(mystery), type(ds))
Code example #19
File: bamSieve.py Project: Debian/pbcoretools
def show_zmws(input_file):
    zmws = []
    with openDataFile(input_file) as ds_in:
        is_indexed = ds_in.isIndexed
        if not is_indexed:
            log.warning("Unindexed file(s), this may be very slow")
        for rr in ds_in.resourceReaders():
            if is_indexed:
                zmws.extend(list([int(x) for x in rr.holeNumber]))
            else:
                zmws.extend([int(rec.HoleNumber) for rec in rr])
    print "\n".join([str(x) for x in sorted(list(set(zmws)))])
Code example #20
File: bamSieve.py Project: pezmaster31/pbcoretools
def show_zmws(input_file):
    zmws = []
    with openDataFile(input_file) as ds_in:
        is_indexed = ds_in.isIndexed
        if not is_indexed:
            log.warning("Unindexed file(s), this may be very slow")
        for rr in ds_in.resourceReaders():
            if is_indexed:
                zmws.extend(list([int(x) for x in rr.holeNumber]))
            else:
                zmws.extend([int(rec.HoleNumber) for rec in rr])
    print "\n".join([str(x) for x in sorted(list(set(zmws)))])
Code example #21
 def test_generate_indices(self):
     import pbtestdata
     tmp_bam = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
     tmp_pbi = tmp_bam + ".pbi"
     tmp_bai = tmp_bam + ".bai"
     shutil.copyfile(pbtestdata.get_file("subreads-bam"), tmp_bam)
     ds = openDataFile(tmp_bam, strict=False, generateIndices=True)
     assert ds.externalResources[0].pbi == tmp_pbi
     assert ds.externalResources[0].bai == tmp_bai
     assert os.path.isfile(tmp_pbi)
     assert os.path.isfile(tmp_bai)
     assert len(ds) == 117
Code example #22
def createXml(args):
    if os.path.exists(args.outfile) and not args.force:
        raise IOError("Output file {} already exists. Use --force to "
                      "clobber".format(args.outfile))
    if args.dsType is None:
        dset = openDataFile(*args.infile,
                            strict=args.strict,
                            skipCounts=args.skipCounts,
                            trustCounts=args.trustCounts,
                            generateIndices=args.generateIndices,
                            referenceFastaFname=args.reference_fasta_fname)
    else:
        dsTypes = DataSet.castableTypes()
        dset = dsTypes[args.dsType](
            *args.infile,
            strict=args.strict,
            skipCounts=args.skipCounts,
            trustCounts=args.trustCounts,
            generateIndices=args.generateIndices,
            referenceFastaFname=args.reference_fasta_fname)
    if args.dsName != '':
        dset.name = args.dsName
    if args.metadata:
        dset.loadMetadata(args.metadata)
    if args.well_sample_name or args.bio_sample_name:
        if args.metadata:
            log.warning(
                "Setting the WellSample or BioSample name will overwrite fields pulled from %s",
                args.metadata)
        n_new_collections = add_mock_collection_metadata(dset)
        if n_new_collections > 0:
            log.warning(
                "Created new CollectionMetadata from blank template for %d movies",
                n_new_collections)
        if args.well_sample_name:
            force_set_all_well_sample_names(dset, args.well_sample_name)
        if args.bio_sample_name:
            force_set_all_bio_sample_names(dset, args.bio_sample_name)
    log.debug("Dataset created")
    if isinstance(dset, ContigSet):
        if args.organism:
            dset.metadata.organism = args.organism
        if args.ploidy:
            dset.metadata.ploidy = args.ploidy
    dset.newUuid()
    if args.no_sub_datasets:
        dset.subdatasets = []
    if args.unique_collections:
        uniqueify_collections(dset.metadata)
    dset.write(args.outfile, validate=args.novalidate, relPaths=args.relative)
    log.debug("Dataset written")
    return 0
Code example #23
File: mapping_stats.py Project: natechols/pbreports
 def _get_subread_length_histogram_bin_width(self):
     BIN_SIZES = [100, 200, 500]
     subread_length_max = 0
     with openDataFile(self.alignment_file) as ds:
         for rr in ds.resourceReaders():
             if len(rr) == 0:
                 continue
             subread_length_max = max(subread_length_max,
                                      (rr.pbi.aEnd - rr.pbi.aStart).max())
     for bin_width in BIN_SIZES:
         if (subread_length_max / float(bin_width)) < 100:
             return bin_width
     return BIN_SIZES[-1]
Code example #24
File: mapping_stats.py Project: lpp1985/lpp_Script
 def _get_subread_length_histogram_bin_width(self):
     BIN_SIZES = [100, 200, 500]
     subread_length_max = 0
     with openDataFile(self.alignment_file) as ds:
         for rr in ds.resourceReaders():
             if len(rr) == 0:
                 continue
             subread_length_max = max(subread_length_max,
                                      (rr.pbi.aEnd - rr.pbi.aStart).max())
     for bin_width in BIN_SIZES:
         if (subread_length_max / float(bin_width)) < 100:
             return bin_width
     return BIN_SIZES[-1]
Code example #25
File: bamSieve.py Project: lpp1985/lpp_Script
def _iter_bam_files(input_file):
    if input_file.endswith(".xml"):
        with openDataFile(input_file) as ds_in:
            if not ds_in.isIndexed:
                log.warning("Unindexed file(s), this may be very slow")
            for rr in ds_in.resourceReaders():
                yield rr
    else:
        if op.exists(input_file + ".pbi"):
            with IndexedBamReader(input_file) as bam_in:
                yield bam_in
        else:
            with BamReader(input_file) as bam_in:
                yield bam_in
Code example #26
File: PbiBamIO.py Project: lpp1985/lpp_Script
 def __init__(self, file_name):
     self.file_name = file_name
     self._is_fasta = False
     self.ext = op.splitext(file_name)[1].upper()
     if self.ext in [".FA", ".FASTA"]:
         self._dataset = FastaReader(file_name)
         self._is_fasta = True
     elif self.ext == ".BAM":
         self._dataset = openDataFile(file_name)
     else: # either contigset.xml or consensusreadset.xml
         assert self.ext == ".XML"
         self._dataset = openDataSet(file_name)
         if isinstance(self._dataset, ContigSet):
             self._is_fasta = True
Code example #27
 def __init__(self, file_name):
     self.file_name = file_name
     self._is_fasta = False
     self.ext = op.splitext(file_name)[1].upper()
     if self.ext in [".FA", ".FASTA"]:
         self._dataset = FastaReader(file_name)
         self._is_fasta = True
     elif self.ext == ".BAM":
         self._dataset = openDataFile(file_name)
     else:  # either contigset.xml or consensusreadset.xml
         assert self.ext == ".XML"
         self._dataset = openDataSet(file_name)
         if isinstance(self._dataset, ContigSet):
             self._is_fasta = True
Code example #28
def _iter_bam_files(input_file):
    if input_file.endswith(".xml"):
        with openDataFile(input_file) as ds_in:
            if not ds_in.isIndexed:
                log.warning("Unindexed file(s), this may be very slow")
            for rr in ds_in.resourceReaders():
                yield rr
    else:
        if op.exists(input_file + ".pbi"):
            with IndexedBamReader(input_file) as bam_in:
                yield bam_in
        else:
            with BamReader(input_file) as bam_in:
                yield bam_in
Code example #29
 def _get_subreads_from_dataset(subread_list):
     with openDataFile(subread_list) as ds_in:
         if ds_in.isIndexed:
             qid = ds_in.index.qId
             mname = [ds_in.qid2mov[q] for q in qid]
             zmws = ds_in.index.holeNumber
             start = ds_in.index.qStart
             stop = ds_in.index.qEnd
             return set(
                 [_make_qname(*x) for x in zip(mname, zmws, start, stop)])
         else:
             subreads = set()
             for record in ds_in:
                 subreads.add(record.qName)
             return subreads
Code example #30
File: pbi2csv.py Project: MShaffar19/pbcoretools
def run_args(args):
    ds = openDataFile(args.dataset)
    get_full_bam = args.load_snr or args.load_numpasses
    is_barcoded = ds.isBarcoded
    headers = list(HEADERS)
    if is_barcoded:
        headers += HEADERS_BC
    if args.load_snr:
        headers += HEADERS_SNR
    if args.load_numpasses:
        headers += HEADERS_NPASSES
    rows = []
    for rr in ds.resourceReaders():
        identity = rr.pbi.identity
        for i, holeNumber in enumerate(rr.pbi.holeNumber):
            reference = rr.referenceInfo(rr.pbi.tId[i])[2]
            aLen = rr.pbi.aEnd[i] - rr.pbi.aStart[i]
            if aLen <= 0 or identity[i] < 0:
                log.warning(
                    "ZMW %s has negative-length alignment or negative computed identity, skipping",
                    holeNumber)
                continue
            rc = "FALSE"
            if rr.pbi.isReverseStrand[i]:
                rc = "TRUE"
            row = [
                rr.filename, holeNumber, rr.pbi.qStart[i], rr.pbi.qEnd[i],
                rr.pbi.readQual[i], rr.pbi.virtualFileOffset[i],
                rr.pbi.contextFlag[i], reference, rr.pbi.tStart[i],
                rr.pbi.tEnd[i], rr.pbi.aStart[i], rr.pbi.aEnd[i], rc,
                rr.pbi.nM[i], rr.pbi.nMM[i], rr.pbi.mapQV[i], rr.pbi.nIns[i],
                rr.pbi.nDel[i]
            ]
            if is_barcoded:
                row.extend([
                    rr.pbi.bcForward[i], rr.pbi.bcReverse[i], rr.pbi.bcQual[i]
                ])
            if get_full_bam:
                rec = rr[i]
                if args.load_snr:
                    snr = rec.peer.get_tag("sn")
                    row.extend(snr)
                if args.load_numpasses:
                    row.append(rec.peer.get_tag("np"))
            rows.append(row)
    _write_csv(rows, args.csv_out, headers=headers)
    log.info("Wrote %s", args.csv_out)
    return 0
Code example #31
def _iter_bam_files(input_file):
    def __read_bam(fn):
        if op.exists(fn + ".pbi"):
            with IndexedBamReader(fn) as bam_in:
                return bam_in
        else:
            with BamReader(fn) as bam_in:
                return bam_in

    if input_file.endswith(".xml"):
        with openDataFile(input_file) as ds_in:
            if not ds_in.isIndexed:
                log.warning("Unindexed file(s), this may be very slow")
            for er in ds_in.externalResources:
                for bam in [er.bam, er.scraps]:
                    if bam is not None:
                        yield __read_bam(bam)
    else:
        yield __read_bam(input_file)
Code example #32
File: test_bamsieve.py Project: mpkocher/pbcoretools
 def test_integration(self):
     args = ["bamSieve", "--help"]
     with tempfile.TemporaryFile() as stdout:
         with tempfile.TemporaryFile() as stderr:
             rc = subprocess.call(args, stdout=stdout, stderr=stderr)
             self.assertEqual(rc, 0)
     ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
     args = [
         "bamSieve",
         "--log-level", "ERROR",
         "--whitelist", "8,233",
         SUBREADS2,
         ofn
     ]
     rc = subprocess.call(args)
     self.assertEqual(rc, 0)
     with openDataFile(ofn, strict=False) as bam_out:
         have_zmws = set([rec.HoleNumber for rec in bam_out])
         self.assertEqual(have_zmws, set([8]))
Code example #33
File: PbiBamIO.py Project: lpp1985/lpp_Script
    def __init__(self, *args):
        if len(args) == 1:
            args = get_files_from_file_or_fofn(args[0])
        self._dataset = openDataFile(*args)
        # Implementation notes: find all the bam files, and group
        # them together by movieName
        self._header = BamHeader(ignore_pg=True)

        for bam in self._dataset.resourceReaders():
            if not isinstance(bam, IndexedBamReader):
                raise ValueError("%s in %s must have pbi index generated",
                                 bam.filename, str(self._dataset))
            self._header.add(bam.peer.header)
            for rg in bam.peer.header["RG"]: #readGroupTable:
                if rg['PL'] != "PACBIO":
                    raise IOError("Input BAM file %s for %s must be PacBio BAM.",
                                  bam.filename, self.__class__.__name__)
            for rg in bam.readGroupTable:
                assert rg.ReadType in ["CCS", "SUBREAD"]
Code example #34
    def __init__(self, *args):
        if len(args) == 1:
            args = get_files_from_file_or_fofn(args[0])
        self._dataset = openDataFile(*args)
        # Implementation notes: find all the bam files, and group
        # them together by movieName
        self._header = BamHeader(ignore_pg=True)

        for bam in self._dataset.resourceReaders():
            if not isinstance(bam, IndexedBamReader):
                raise ValueError("%s in %s must have pbi index generated",
                                 bam.filename, str(self._dataset))
            self._header.add(bam.peer.header)
            for rg in bam.peer.header["RG"]:  #readGroupTable:
                if rg['PL'] != "PACBIO":
                    raise IOError(
                        "Input BAM file %s for %s must be PacBio BAM.",
                        bam.filename, self.__class__.__name__)
            for rg in bam.readGroupTable:
                assert rg.ReadType in ["CCS", "SUBREAD"]
Code example #35
def _process_zmw_list(zmw_list):
    zmws = set()
    if zmw_list is None:
        return zmws
    elif isinstance(zmw_list, set):
        return zmw_list
    elif isinstance(zmw_list, (list, tuple)):
        return set(zmw_list)
    elif op.isfile(zmw_list):
        base, ext = op.splitext(zmw_list)
        if ext in [".bam", ".xml"]:
            with openDataFile(zmw_list) as ds_zmw:
                zmws.update(ds_zmw.index.holeNumber)
        else:
            with open(zmw_list) as f:
                lines = f.read().splitlines()
                zmws.update(set([int(x) for x in lines]))
    else:
        zmws.update(set([int(x) for x in zmw_list.split(",")]))
    return zmws
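For illustration only, a few hypothetical inputs the helper above accepts (the ZMW numbers are made up): None yields an empty set, while sets, lists, comma-separated strings, plain-text whitelist files, and indexed .bam/.xml datasets are all reduced to a set of hole numbers.

# Hypothetical calls; the hole numbers are illustrative only.
assert _process_zmw_list(None) == set()
assert _process_zmw_list({24962, 32901}) == {24962, 32901}
assert _process_zmw_list([24962, 32901]) == {24962, 32901}
assert _process_zmw_list("24962,32901") == {24962, 32901}
# A path to a text file with one ZMW per line, or to an indexed .bam/.xml
# dataset, is also accepted and expanded to the hole numbers it contains.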
Code example #36
def run_args(args):
    sample_name = None
    if not args.single_sample and not args.all_samples:
        bam = openDataFile(args.samples_file)
        sample_name = bam.readGroupTable[0].SampleName
        log.info("Sample name is {}".format(sample_name))
    elif args.all_samples:
        sample_name = "All Samples"
    files = []
    for file_id, file_type, label in FILE_IDS_AND_NAMES:
        file_path = getattr(args, file_id)
        if file_path is None:
            log.info("Skipping {}".format(file_id))
            continue
        assert file_path is not None and op.exists(file_path)
        if sample_name:
            label += " ({})".format(sample_name)
        files.append(to_datastore_file(file_path, file_id, file_type, label))
    DataStore(files).write_json(args.datastore)
    return 0
Code example #37
    def test_split_zmws_targetsize(self):
        N_RECORDS = 117
        N_ZMWS = 48
        test_file = upstreamdata.getUnalignedBam()
        ds1 = openDataFile(test_file)
        self.assertEqual(len([r for r in ds1]), N_RECORDS)
        self.assertEqual(len(ds1), N_RECORDS)
        self.assertEqual(len(set(ds1.index.holeNumber)), N_ZMWS)

        # with no split
        dss = ds1.split(targetSize=1000, zmws=True)
        self.assertEqual(len(dss), 1)
        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(sum([len(ds_) for ds_ in dss]),
                         N_RECORDS)
        exp = [48]
        obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
        self.assertListEqual(exp, obs)

        # with a split
        dss = ds1.split(targetSize=25, zmws=True)
        self.assertEqual(len(dss), 2)
        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(sum([len(ds_) for ds_ in dss]),
                         N_RECORDS)
        exp = [24, 24]
        obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
        self.assertListEqual(exp, obs)

        # with a split
        dss = ds1.split(targetSize=5, zmws=True)
        self.assertEqual(len(dss), 10)
        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(sum([len(ds_) for ds_ in dss]),
                         N_RECORDS)
        exp = [4, 4, 5, 5, 5, 5, 5, 5, 5, 5]
        obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
        self.assertListEqual(exp, obs)
Code example #38
def _process_zmw_list(zmw_list):
    zmws = set()
    if zmw_list is None:
        return zmws
    elif isinstance(zmw_list, set):
        return zmw_list
    elif isinstance(zmw_list, (list, tuple)):
        return set(zmw_list)
    elif op.isfile(zmw_list):
        base, ext = op.splitext(zmw_list)
        if ext in [".bam", ".xml"]:
            with openDataFile(zmw_list) as ds_zmw:
                for f in ds_zmw.resourceReaders():
                    zmws.update(set(list(f.holeNumber)))
        else:
            with open(zmw_list) as f:
                lines = f.read().splitlines()
                zmws.update(set([int(x) for x in lines]))
    else:
        zmws.update(set([int(x) for x in zmw_list.split(",")]))
    return zmws
Code example #39
    def test_split_zmws(self):
        N_RECORDS = 117
        test_file = upstreamdata.getUnalignedBam()
        ds1 = openDataFile(test_file)
        self.assertEqual(len([r for r in ds1]), N_RECORDS)
        self.assertEqual(len(ds1), N_RECORDS)
        dss = ds1.split(chunks=1, zmws=True)
        self.assertEqual(len(dss), 1)
        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)

        # We have a lower limit on the number of zmws, now
        dss = ds1.split(chunks=12, zmws=True)
        self.assertEqual(len(dss), 2)
        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
        self.assertEqual(
            dss[0].zmwRanges,
            [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
              1650, 32328)])
        self.assertEqual(
            dss[-1].zmwRanges,
            [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
              32560, 54396)])
        ranges = sorted([c.zmwRanges[0][1:] for c in dss])
        interspans = []
        last = None
        for rg in ranges:
            if not last is None:
                interspans.append((last, rg[0]))
                self.assertFalse(last == rg[0])
            last = rg[1]
        for rg in interspans:
            self.assertEqual(
                len(
                    np.nonzero(
                        np.logical_and(ds1.index.holeNumber < rg[1],
                                       ds1.index.holeNumber > rg[0]))[0]), 0)
Code example #40
    def test_split_zmws(self):
        N_RECORDS = 117
        test_file = upstreamdata.getUnalignedBam()
        ds1 = openDataFile(test_file)
        self.assertEqual(len([r for r in ds1]), N_RECORDS)
        self.assertEqual(len(ds1), N_RECORDS)
        dss = ds1.split(chunks=1, zmws=True)
        self.assertEqual(len(dss), 1)
        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(sum([len(ds_) for ds_ in dss]),
                         N_RECORDS)

        # We have a lower limit on the number of zmws, now
        dss = ds1.split(chunks=12, zmws=True)
        self.assertEqual(len(dss), 2)
        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(sum([len(ds_) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(
            dss[0].zmwRanges,
            [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
              1650, 32328)])
        self.assertEqual(
            dss[-1].zmwRanges,
            [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
              32560, 54396)])
        ranges = sorted([c.zmwRanges[0][1:] for c in dss])
        interspans = []
        last = None
        for rg in ranges:
            if not last is None:
                interspans.append((last, rg[0]))
                self.assertFalse(last == rg[0])
            last = rg[1]
        for rg in interspans:
            self.assertEqual(len(np.nonzero(np.logical_and(
                ds1.index.holeNumber < rg[1],
                ds1.index.holeNumber > rg[0]))[0]), 0)
Code example #41
def createXml(args):
    if args.dsType is None:
        dset = openDataFile(*args.infile, strict=args.strict,
                            skipCounts=args.skipCounts,
                            generateIndices=args.generateIndices)
    else:
        dsTypes = DataSet.castableTypes()
        dset = dsTypes[args.dsType](*args.infile, strict=args.strict,
                                    skipCounts=args.skipCounts,
                                    generateIndices=args.generateIndices)
    if args.generateIndices:
        # we generated the indices with the last open, lets capture them with
        # this one:
        dset = dsTypes[args.dsType](*args.infile, strict=args.strict,
                                    skipCounts=args.skipCounts)
    if args.dsName != '':
        dset.name = args.dsName
    log.debug("Dataset created")
    dset.write(args.outfile, validate=args.novalidate, modPaths=True,
               relPaths=args.relative)
    log.debug("Dataset written")
    return 0
Code example #42
    def test_split_zmws_targetsize(self):
        N_RECORDS = 117
        N_ZMWS = 48
        test_file = upstreamdata.getUnalignedBam()
        ds1 = openDataFile(test_file)
        self.assertEqual(len([r for r in ds1]), N_RECORDS)
        self.assertEqual(len(ds1), N_RECORDS)
        self.assertEqual(len(set(ds1.index.holeNumber)), N_ZMWS)

        # with no split
        dss = ds1.split(targetSize=1000, zmws=True)
        self.assertEqual(len(dss), 1)
        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
        exp = [48]
        obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
        self.assertListEqual(exp, obs)

        # with a split
        dss = ds1.split(targetSize=25, zmws=True)
        self.assertEqual(len(dss), 2)
        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
        exp = [24, 24]
        obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
        self.assertListEqual(exp, obs)

        # with a split
        dss = ds1.split(targetSize=5, zmws=True)
        self.assertEqual(len(dss), 10)
        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
        exp = [4, 4, 5, 5, 5, 5, 5, 5, 5, 5]
        obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
        self.assertListEqual(exp, obs)
Code example #43
File: barcode.py Project: tkerelska/pbreports
def _labels_reads_iterator(reads, barcodes, subreads=True):
    with openDataSet(reads) as ds:
        movies = set()
        apply(movies.update, [rr.movieNames for rr in ds.resourceReaders()])
        if len(movies) != 1:  # FIXME
            raise NotImplementedError("Multiple-movie datasets are not " +
                                      "supported by this application.")
        assert ds.isIndexed
        zmws_by_barcode = defaultdict(set)
        reads_by_zmw = defaultdict(list)
        for rr in ds.resourceReaders():
            for i, (b, z) in enumerate(zip(rr.pbi.bcForward,
                                           rr.pbi.holeNumber)):
                zmws_by_barcode[b].add(z)
                reads_by_zmw[z].append((rr, i))
        with openDataFile(barcodes) as bc:
            for i_bc, barcode in enumerate(bc):
                zmws = sorted(list(zmws_by_barcode[i_bc]))
                for zmw in zmws:
                    for rr, i_read in reads_by_zmw[zmw]:
                        # FIXME(nechols)(2016-03-15) this will not work on CCS
                        qlen = rr.pbi.qEnd[i_read] - rr.pbi.qStart[i_read]
                        yield barcode.id, ["n"] * qlen
Code example #44
 def test_large_split_zmws(self):
     N_RECORDS = 959539
     test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                  "2372215/0007/Analysis_Results/m150404_101626_42"
                  "267_c100807920800000001823174110291514_s1_p0.al"
                  "l.subreadset.xml")
     ds1 = openDataFile(test_file)
     self.assertEqual(len(ds1), N_RECORDS)
     dss = ds1.split(chunks=1, zmws=True)
     self.assertEqual(len(dss), 1)
     self.assertEqual(sum([len(ds_) for ds_ in dss]),
                      N_RECORDS)
     dss = ds1.split(chunks=12, zmws=True)
     self.assertEqual(len(dss), 12)
     self.assertEqual(sum([len(ds_) for ds_ in dss]),
                      N_RECORDS)
     self.assertEqual(
         dss[0].zmwRanges,
         [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
           7, 14009)])
     self.assertEqual(
         dss[-1].zmwRanges,
         [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
           149881, 163475)])
     ranges = sorted([c.zmwRanges[0][1:] for c in dss])
     interspans = []
     last = None
     for rg in ranges:
         if not last is None:
             interspans.append((last, rg[0]))
             self.assertFalse(last == rg[0])
         last = rg[1]
     for rg in interspans:
         self.assertEqual(len(np.nonzero(np.logical_and(
             ds1.index.holeNumber < rg[1],
             ds1.index.holeNumber > rg[0]))[0]), 0)
Code example #45
 def test_file_factory_fofn(self):
     mystery = openDataFile(data.getFofn())
     assert type(mystery) == AlignmentSet
Code example #46
def filter_reads(input_bam,
                 output_bam,
                 whitelist=None,
                 blacklist=None,
                 percentage=None,
                 count=None,
                 seed=None,
                 ignore_metadata=False,
                 relative=None,
                 anonymize=False,
                 use_barcodes=False,
                 sample_scraps=False):
    if output_bam is None:
        log.error("Must specify output file")
        return 1
    output_bam = op.abspath(output_bam)
    if not op.isdir(op.dirname(output_bam)):
        log.error("Output path '{d}' does not exist.".format(
                  d=op.dirname(output_bam)))
        return 1
    n_specified = 4 - [whitelist, blacklist, percentage, count].count(None)
    if n_specified != 1:
        log.error("You must choose one and only one of the following "+
                  "options: --whitelist, --blacklist, --count, --percentage")
        return 1
    if seed is not None:
        random.seed(seed)
    if whitelist is None and blacklist is None:
        if not 0 < percentage < 100 and not count > 0:
            log.error("No reads selected for output.")
            return 1
    output_ds = None
    if output_bam.endswith(".xml"):
        if not input_bam.endswith(".xml"):
            print "DataSet output only supported for DataSet inputs."
            return 1
        ds_type = output_bam.split(".")[-2]
        ext2 = {
            "subreadset": "subreads",
            "alignmentset": "subreads",
            "consensusreadset": "ccs",
            "consensusalignmentset": "ccs"
        }
        if not ds_type in ext2:
            raise ValueError("Invalid dataset type 't'".format(t=ds_type))
        output_ds = output_bam
        output_bam = ".".join(output_ds.split(".")[:-2] +
                              [ext2[ds_type], "bam"])
    if output_bam == input_bam:
        log.error("Input and output files must not be the same path")
        return 1
    elif not output_bam.endswith(".bam"):
        log.error("Output file name must end in either '.bam' or '.xml'")
        return 1
    n_file_reads = 0
    have_zmws = set()
    scraps_bam = barcode_set = None
    with openDataFile(input_bam) as ds_in:
        if not isinstance(ds_in, ReadSet):
            raise TypeError("{t} is not an allowed dataset type".format(
                            t=type(ds_in).__name__))
        # TODO(nechols)(2016-03-11): refactor this to enable propagation of
        # filtered scraps
        if not ds_in.isIndexed:
            log.error("Input BAM must have accompanying .pbi index")
            return 1
        for ext_res in ds_in.externalResources:
            if ext_res.barcodes is not None:
                assert barcode_set is None or barcode_set == ext_res.barcodes
                barcode_set = barcode_set
        f1 = ds_in.resourceReaders()[0]
        if percentage is not None or count is not None:
            bam_readers = list(ds_in.resourceReaders())
            if sample_scraps:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                        bam_readers.append(scraps_in)
            whitelist = _create_whitelist(bam_readers, percentage, count)
        # convert these to Python sets
        _whitelist = _process_zmw_list(whitelist)
        _blacklist = _process_zmw_list(blacklist)
        scraps_in = None
        if output_ds is not None and output_ds.endswith(".subreadset.xml"):
            for ext_res in ds_in.externalResources:
                if ext_res.scraps is not None:
                    if use_barcodes:
                        log.warn("Scraps BAM is present but lacks "+
                                 "barcodes - will not be propagated "+
                                 "to output SubreadSet")
                    else:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                    break
        with AlignmentFile(output_bam, 'wb',
                           template=f1.peer) as bam_out:
            for bam_in in ds_in.resourceReaders():
                n_records, have_zmws_ =_process_bam_whitelist(
                    bam_in, bam_out,
                    whitelist=_whitelist,
                    blacklist=_blacklist,
                    use_barcodes=use_barcodes,
                    anonymize=anonymize)
                n_file_reads += n_records
                have_zmws.update(have_zmws_)
        if scraps_in is not None:
            scraps_bam = re.sub("subreads.bam$", "scraps.bam", output_bam)
            with AlignmentFile(scraps_bam, 'wb',
                               template=scraps_in.peer) as scraps_out:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        scraps_in_ = IndexedBamReader(ext_res.scraps)
                        n_records, have_zmws_ =_process_bam_whitelist(
                            scraps_in_, scraps_out, _whitelist, _blacklist,
                            use_barcodes=use_barcodes,
                            anonymize=anonymize)
                        have_zmws.update(have_zmws_)
    if n_file_reads == 0:
        log.error("No reads written")
        return 1
    log.info("{n} records from {z} ZMWs written".format(
        n=n_file_reads, z=len(have_zmws)))
    def _run_pbindex(bam_file):
        try:
            rc = subprocess.call(["pbindex", bam_file])
        except OSError as e:
            if e.errno == 2:
                log.warn("pbindex not present, will not create .pbi file")
            else:
                raise
    _run_pbindex(output_bam)
    if output_ds is not None:
        with openDataSet(input_bam) as ds_in:
            ds_out = ds_in.__class__(output_bam)
            if scraps_bam is not None:
                _run_pbindex(scraps_bam)
                ds_out.externalResources[0].scraps = scraps_bam
                # XXX it doesn't pick up the .pbi file - sort of annoying
                # but since the pbcore API doesn't provide a read for the
                # scraps automatically anyway, the impact is minimal
            if barcode_set is not None:
                ds_out.externalResources[0].barcodes = barcode_set
            if not ignore_metadata:
                ds_out.metadata = ds_in.metadata
                ds_out.updateCounts()
            if relative:
                ds_out.makePathsRelative(op.dirname(output_ds))
            ds_out.write(output_ds)
            log.info("wrote {t} XML to {x}".format(
                     t=ds_out.__class__.__name__, x=output_ds))
    return 0
Code example #47
File: rainbow.py Project: lpp1985/lpp_Script
 def _openAlignments():
     if in_fn.endswith(".cmp.h5"):
         return CmpH5Reader(in_fn)
     else:
         return openDataFile(in_fn)
Code example #48
def filter_reads(input_bam,
                 output_bam,
                 whitelist=None,
                 blacklist=None,
                 percentage=None,
                 count=None,
                 seed=None,
                 ignore_metadata=False,
                 relative=None,
                 anonymize=False,
                 use_barcodes=False,
                 sample_scraps=False,
                 keep_original_uuid=False,
                 use_subreads=False,
                 min_adapters=None):
    _validate_settings(output_bam, whitelist, blacklist, percentage, count,
                       min_adapters)
    output_bam = op.abspath(output_bam)
    if seed is not None:
        random.seed(seed)
    output_ds = base_name = None
    if output_bam.endswith(".xml"):
        if not input_bam.endswith(".xml"):
            raise UserError(
                "DataSet output only supported for DataSet inputs.")
        ds_type = output_bam.split(".")[-2]
        ext2 = OrderedDict([("subreadset", "subreads"),
                            ("alignmentset", "subreads"),
                            ("consensusreadset", "ccs"),
                            ("consensusalignmentset", "ccs"),
                            ("transcriptset", "transcripts"),
                            ("transcriptalignmentset", "transcripts")])
        if not ds_type in ext2:
            raise ValueError(
                "Invalid output file extension '{t}.xml'; valid extensions are:\n{e}"
                .format(t=ds_type,
                        e="\n".join(["  %s.xml" % e for e in ext2.keys()])))
        output_ds = output_bam
        base_name = ".".join(output_ds.split(".")[:-2])
        output_bam = base_name + "." + ".".join([ext2[ds_type], "bam"])
    if output_bam == input_bam:
        raise UserError("Input and output files must not be the same path")
    elif not output_bam.endswith(".bam"):
        raise UserError("Output file name must end in either '.bam' or '.xml'")
    n_file_reads = 0
    have_zmws = set()
    scraps_bam = barcode_set = sts_xml = None
    with openDataFile(input_bam) as ds_in:
        if not isinstance(ds_in, ReadSet):
            raise UserError("{t} is not an allowed dataset type".format(
                t=type(ds_in).__name__))
        # TODO(nechols)(2016-03-11): refactor this to enable propagation of
        # filtered scraps
        if not ds_in.isIndexed:
            raise UserError("Input BAM must have accompanying .pbi index")
        for ext_res in ds_in.externalResources:
            if ext_res.barcodes is not None:
                assert barcode_set is None or barcode_set == ext_res.barcodes
                barcode_set = barcode_set
            if ext_res.sts is not None:
                if sts_xml is None:
                    sts_xml = ext_res.sts
                else:
                    log.warning("Multiple sts.xml files, will not propagate")
        f1 = ds_in.resourceReaders()[0]
        if percentage is not None or count is not None or min_adapters is not None:
            bam_readers = list(ds_in.resourceReaders())
            if sample_scraps:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                        bam_readers.append(scraps_in)
            whitelist = _create_whitelist(bam_readers, percentage, count,
                                          min_adapters)
        # convert these to Python sets
        if use_subreads:
            _whitelist = _process_subread_list(whitelist)
            _blacklist = _process_subread_list(blacklist)
        else:
            _whitelist = _process_zmw_list(whitelist)
            _blacklist = _process_zmw_list(blacklist)
        scraps_in = None
        if output_ds is not None and output_ds.endswith(".subreadset.xml"):
            for ext_res in ds_in.externalResources:
                if ext_res.scraps is not None:
                    if use_barcodes:
                        log.warning("Scraps BAM is present but lacks " +
                                    "barcodes - will not be propagated " +
                                    "to output SubreadSet")
                    else:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                    break
        with AlignmentFile(output_bam, 'wb', template=f1.peer) as bam_out:
            for bam_in in ds_in.resourceReaders():
                n_records, have_zmws_ = _process_bam_whitelist(
                    bam_in,
                    bam_out,
                    whitelist=_whitelist,
                    blacklist=_blacklist,
                    use_barcodes=use_barcodes,
                    anonymize=anonymize,
                    use_subreads=use_subreads,
                    qid2mov=ds_in.qid2mov)
                n_file_reads += n_records
                have_zmws.update(have_zmws_)
        if scraps_in is not None:
            scraps_bam = re.sub("subreads.bam$", "scraps.bam", output_bam)
            with AlignmentFile(scraps_bam, 'wb',
                               template=scraps_in.peer) as scraps_out:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        scraps_in_ = IndexedBamReader(ext_res.scraps)
                        n_records, have_zmws_ = _process_bam_whitelist(
                            scraps_in_,
                            scraps_out,
                            _whitelist,
                            _blacklist,
                            use_barcodes=use_barcodes,
                            anonymize=anonymize,
                            use_subreads=use_subreads)
                        have_zmws.update(have_zmws_)
    if n_file_reads == 0:
        log.warn("No reads written")
    else:
        log.info("{n} records from {z} ZMWs written".format(n=n_file_reads,
                                                            z=len(have_zmws)))

    def _run_pbindex(bam_file):
        try:
            rc = subprocess.call(["pbindex", bam_file])
        except OSError as e:
            if e.errno == 2:
                log.warning("pbindex not present, will not create .pbi file")
            else:
                raise

    _run_pbindex(output_bam)
    if output_ds is not None:
        with openDataSet(input_bam) as ds_in:
            ds_out = ds_in.__class__(output_bam)
            if scraps_bam is not None:
                _run_pbindex(scraps_bam)
                ds_out.externalResources[0].scraps = scraps_bam
                # XXX it doesn't pick up the .pbi file - sort of annoying
                # but since the pbcore API doesn't provide a read for the
                # scraps automatically anyway, the impact is minimal
            if barcode_set is not None:
                ds_out.externalResources[0].barcodes = barcode_set
            if sts_xml is not None:
                sts_xml_out = base_name + ".sts.xml"
                log.info("Copying {s} to {d}".format(s=sts_xml, d=sts_xml_out))
                shutil.copyfile(sts_xml, sts_xml_out)
                ds_out.externalResources[0].sts = sts_xml_out
            if not ignore_metadata:
                ds_out.metadata = ds_in.metadata
                ds_out.updateCounts()
            ds_out.name = ds_in.name + " (bamsieve)"
            ds_out.tags = ds_in.tags
            if relative:
                ds_out.makePathsRelative(op.dirname(output_ds))
            if keep_original_uuid:
                log.warning("Keeping input UUID {u}".format(u=ds_in.uuid))
                ds_out.objMetadata["UniqueId"] = ds_in.uuid
            ds_out.write(output_ds)
            log.info("wrote {t} XML to {x}".format(t=ds_out.__class__.__name__,
                                                   x=output_ds))
    return 0
Code example #49
 def test_file_factory_css(self):
     fname = ("/pbi/dept/secondary/siv/testdata/ccs-unittest/"
              "tiny/little.ccs.bam")
     myster = openDataFile(fname)
     assert type(myster) == ConsensusReadSet
Code example #50
File: mapping_stats.py Project: natechols/pbreports
    def to_report(self, output_dir, report_id=Constants.R_ID):
        """
        This needs to be cleaned up. Keeping the old interface for testing purposes.
        """
        started_at = time.time()

        log.info("Found {n} movies.".format(n=len(self.movies)))

        log.info("Working from {n} alignment file{s}: {f}".format(
            n=len(self.alignment_file_list),
            s='s' if len(self.alignment_file_list) > 1 else '',
            f=self.alignment_file_list))

        # make this a dict {attribute_key_name:Aggreggator} so it's easy to
        # access the instances after they've been computed.
        # there's duplicated keys in the attributes?
        # number_of_aligned_reads/mapped_reads_n
        _total_aggregators = self._get_total_aggregators()
        null_filter = lambda r: True
        total_model = StatisticsModel(
            _total_aggregators.values(), filter_func=null_filter)

        # need to create specific instances for a given movie. This is used to
        # create the mapping reports stats table
        movie_models = {}

        def _my_filter(movie_name1, movie_name2):
            return movie_name1 == movie_name2

        for movie in self.movies:
            ags = [k() for k in self.COLUMN_AGGREGATOR_CLASSES]
            # Note this WILL NOT work because of how scope works in python
            # filter_by_movie_func = lambda m_name: movie.name == m_name
            _my_filter_func = functools.partial(_my_filter, movie)
            model = StatisticsModel(ags, filter_func=_my_filter_func)
            movie_models[movie] = model

        # The statistic models that will be run
        all_models = [total_model] + movie_models.values()
        log.debug(all_models)

        # Run all the analysis. Now the aggregators can be accessed

        analyze_movies(self.movies, self.alignment_file_list, all_models)

        # temp structure used to create the report table. The order is
        # important

        # add total values
        _to_a = lambda k: _total_aggregators[k].attribute
        _row = [_to_a(n) for n in self.COLUMN_ATTR]
        _row.insert(0, 'All Movies')
        movie_datum = [_row]

        # Add each individual movie stats
        for movie_name_, model_ in movie_models.iteritems():
            _row = [movie_name_]
            for a in model_.aggregators:
                _row.append(a.attribute)
            movie_datum.append(_row)
        log.info(movie_datum)

        # create the Report table

        table = self._to_table(movie_datum)

        for movie_name, model in movie_models.iteritems():
            log.info("Movie name {n}".format(n=movie_name))
            for a in model.aggregators:
                log.info(movie_name + " " + repr(a))

        log.info("")
        log.info("Total models")
        for a in total_model.aggregators:
            log.info(a)

        attributes = get_attributes(_total_aggregators)

        log.info("Attributes from streaming mapping Report.")
        for a in attributes:
            log.info(a)

        plot_config_views = self._get_plot_view_configs()
        plot_groups = []

        ds = openDataFile(self.alignment_file)
        ds.updateCounts()
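        # updateCounts() refreshes the dataset's record count from its
        # resources, so the len(ds) check below reflects the actual number
        # of alignments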
        if len(ds) > 0:
            # keeping the ids independent requires a bit of dictionary madness
            # {report_id:HistogramAggregator}
            id_to_aggregators = {k: _total_aggregators[v]
                                 for k, v in self.HISTOGRAM_IDS.iteritems()}
            plot_groups = to_plot_groups(plot_config_views, output_dir,
                                         id_to_aggregators)
            rb_pg = PlotGroup(Constants.PG_RAINBOW)
            rb_png = "mapped_concordance_vs_read_length.png"
            make_rainbow_plot(self.alignment_file, rb_png)
            rb_plt = Plot(Constants.P_RAINBOW, rb_png,
                          caption=get_plot_caption(spec, Constants.PG_RAINBOW,
                                                   Constants.P_RAINBOW))
            rb_pg.add_plot(rb_plt)
            plot_groups.append(rb_pg)
        self.add_more_plots(plot_groups, output_dir)

        tables = [table]
        report = Report(report_id,
                        attributes=attributes,
                        plotgroups=plot_groups,
                        tables=tables,
                        dataset_uuids=self.dataset_uuids)

        log.debug(report)

        run_time = time.time() - started_at
        log.info("Completed running in {s:.2f} sec.".format(s=run_time))
        return report
Code example #51
File: testdata_utils.py Project: lpp1985/lpp_Script
def extract_small_dataset_sample(file_name, n_reads=0, n_zmws=0,
                                 output_file=None,
                                 randomize=False):  # FIXME
    """
    From the input dataset, extract the first N reads or ZMWs from each .bam
    file and write to an identically named .bam in the current directory.
    Used to generate micro-datasets for very fast testing of pbsmrtpipe.
    """
    assert ([n_reads, n_zmws].count(0) == 1)
    if n_reads == 0:
        n_reads = sys.maxint
    dataset_type = None
    xml_files = []
    if output_file is None:
        output_file = op.basename(file_name)
    if op.abspath(output_file) == op.abspath(file_name):
        output_file = op.splitext(output_file)[0] + "_tiny.xml"
    with openDataFile(file_name) as ds:
        dataset_type = type(ds)
        for bam in ds.resourceReaders():
            logging.info("processing %s" % op.basename(bam.filename))
            bam_output = op.splitext(output_file)[0] + ".bam"
            with pysam.AlignmentFile(bam_output, "wb",
                                     template=bam.peer) as out:
                zmws = set([])
                n_file_reads = n_file_zmws = 0
                if randomize:  # FIXME this should probably be the default
                    if n_zmws > 0:
                        zmw_dict = defaultdict(list)
                        for i_read, zmw in enumerate(bam.holeNumber):
                            zmw_dict[zmw].append(i_read)
                        have_zmws = set()
                        have_reads = set()
                        zmws = zmw_dict.keys()
                        while True:
                            i_zmw = random.randint(0, len(zmws) - 1)
                            if not zmws[i_zmw] in have_zmws:
                                for i_read in zmw_dict[zmws[i_zmw]]:
                                    assert not i_read in have_reads
                                    out.write(bam[i_read].peer)
                                    have_reads.add(i_read)
                                    n_file_reads += 1
                                have_zmws.add(zmws[i_zmw])
                                n_file_zmws += 1
                            if n_file_zmws == n_zmws:
                                break
                    else:
                        have_reads = set()
                        while True:
                            i_read = random.randint(0, len(bam) - 1)
                            if not i_read in have_reads:
                                out.write(bam[i_read].peer)
                                have_reads.add(i_read)
                                n_file_reads += 1
                            if n_file_reads == n_reads:
                                break
                else:
                    for read in bam:
                        if n_zmws:
                            zmws.add(read.HoleNumber)
                        else:
                            n_file_reads += 1
                        if len(zmws) > n_zmws or n_file_reads > n_reads:
                            break
                        else:
                            out.write(read.peer)
                            n_file_reads += 1
                logging.info("Wrote %d reads to file %s" % (n_file_reads,
                                                            bam_output))
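            # build both a .bai (samtools index) and a .pbi (pbindex) so the
            # trimmed BAM can be used from pysam as well as pbcore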
            subprocess.call(["samtools", "index", bam_output])
            subprocess.call(["pbindex", bam_output])
            ds_out = op.splitext(bam_output)[0] + ".xml"
            ds_new = dataset_type(bam_output)
            ds_new.write(ds_out)
            xml_files.append(ds_out)
    ds_new = dataset_type(*xml_files)
    ds_new.write(output_file)
    logging.info("%s saved as %s" % (dataset_type.__name__, output_file))
    return output_file
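
A minimal usage sketch, assuming a hypothetical input path: take the first 50
ZMWs from each BAM in a SubreadSet and write a micro-dataset into the current
directory.

# hypothetical input; the call returns the path of the dataset XML it wrote
tiny_xml = extract_small_dataset_sample("/path/to/movie.subreadset.xml",
                                        n_zmws=50)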
Code example #52
File: mapping_stats.py Project: lpp1985/lpp_Script
    def to_report(self, output_dir, report_id=Constants.R_ID):
        """
        This needs to be cleaned up. Keeping the old interface for testing purposes.
        """
        started_at = time.time()

        log.info("Found {n} movies.".format(n=len(self.movies)))

        log.info("Working from {n} alignment file{s}: {f}".format(
            n=len(self.alignment_file_list),
            s='s' if len(self.alignment_file_list) > 1 else '',
            f=self.alignment_file_list))

        # make this a dict {attribute_key_name: Aggregator} so it's easy to
        # access the instances after they've been computed.
        # there are duplicated keys in the attributes?
        # number_of_aligned_reads/mapped_reads_n
        _total_aggregators = self._get_total_aggregators()
        null_filter = lambda r: True
        total_model = StatisticsModel(
            _total_aggregators.values(), filter_func=null_filter)

        # need to create specific instances for a given movie. This is used to
        # create the mapping reports stats table
        movie_models = {}

        def _my_filter(movie_name1, movie_name2):
            return movie_name1 == movie_name2

        for movie in self.movies:
            ags = [k() for k in self.COLUMN_AGGREGATOR_CLASSES]
            # Note this WILL NOT work because of how scope works in python
            # filter_by_movie_func = lambda m_name: movie.name == m_name
            _my_filter_func = functools.partial(_my_filter, movie)
            model = StatisticsModel(ags, filter_func=_my_filter_func)
            movie_models[movie] = model

        # The statistic models that will be run
        all_models = [total_model] + movie_models.values()
        log.debug(all_models)

        # Run all the analysis. Now the aggregators can be accessed

        analyze_movies(self.movies, self.alignment_file_list, all_models)

        # temp structure used to create the report table. The order is
        # important

        # add total values
        _to_a = lambda k: _total_aggregators[k].attribute
        _row = [_to_a(n) for n in self.COLUMN_ATTR]
        _row.insert(0, 'All Movies')
        movie_datum = [_row]

        # Add each individual movie stats
        for movie_name_, model_ in movie_models.iteritems():
            _row = [movie_name_]
            for a in model_.aggregators:
                _row.append(a.attribute)
            movie_datum.append(_row)
        log.info(movie_datum)

        # create the Report table

        table = self._to_table(movie_datum)

        for movie_name, model in movie_models.iteritems():
            log.info("Movie name {n}".format(n=movie_name))
            for a in model.aggregators:
                log.info(movie_name + " " + repr(a))

        log.info("")
        log.info("Total models")
        for a in total_model.aggregators:
            log.info(a)

        attributes = get_attributes(_total_aggregators)
        self.add_more_attributes(attributes)

        log.info("Attributes from streaming mapping Report.")
        for a in attributes:
            log.info(a)

        plot_config_views = self._get_plot_view_configs()
        plot_groups = []

        ds = openDataFile(self.alignment_file)
        ds.updateCounts()
        if len(ds) > 0:
            # keeping the ids independent requires a bit of dictionary madness
            # {report_id:HistogramAggregator}
            id_to_aggregators = {k: _total_aggregators[v]
                                 for k, v in self.HISTOGRAM_IDS.iteritems()}
            plot_groups = to_plot_groups(plot_config_views, output_dir,
                                         id_to_aggregators)
            rb_pg = PlotGroup(Constants.PG_RAINBOW)
            rb_png = "mapped_concordance_vs_read_length.png"
            make_rainbow_plot(self.alignment_file, op.join(output_dir, rb_png))
            rb_plt = Plot(Constants.P_RAINBOW, rb_png)
            rb_pg.add_plot(rb_plt)
            plot_groups.append(rb_pg)
        self.add_more_plots(plot_groups, output_dir)

        tables = [table]
        report = Report(report_id,
                        attributes=attributes,
                        plotgroups=plot_groups,
                        tables=tables,
                        dataset_uuids=self.dataset_uuids)

        log.debug(report)

        run_time = time.time() - started_at
        log.info("Completed running in {s:.2f} sec.".format(s=run_time))
        return report
Code example #53
 def test_file_factory_css(self):
     fname = ("/pbi/dept/secondary/siv/testdata/ccs-unittest/"
              "tiny/little.ccs.bam")
     mystery = openDataFile(fname)
     self.assertEqual(type(mystery), ConsensusReadSet)
Code example #54
 def test_file_factory_fofn(self):
     mystery = openDataFile(data.getFofn())
     self.assertEqual(type(mystery), AlignmentSet)
Code example #55
File: testdata_utils.py Project: lpp1985/lpp_Script
def extract_small_dataset_sample(file_name,
                                 n_reads=0,
                                 n_zmws=0,
                                 output_file=None,
                                 randomize=False):  # FIXME
    """
    From the input dataset, extract the first N reads or ZMWs from each .bam
    file and write to an identically named .bam in the current directory.
    Used to generate micro-datasets for very fast testing of pbsmrtpipe.
    """
    assert ([n_reads, n_zmws].count(0) == 1)
    if n_reads == 0:
        n_reads = sys.maxint
    dataset_type = None
    xml_files = []
    if output_file is None:
        output_file = op.basename(file_name)
    if op.abspath(output_file) == op.abspath(file_name):
        output_file = op.splitext(output_file)[0] + "_tiny.xml"
    with openDataFile(file_name) as ds:
        dataset_type = type(ds)
        for bam in ds.resourceReaders():
            logging.info("processing %s" % op.basename(bam.filename))
            bam_output = op.splitext(output_file)[0] + ".bam"
            with pysam.AlignmentFile(bam_output, "wb",
                                     template=bam.peer) as out:
                zmws = set([])
                n_file_reads = n_file_zmws = 0
                if randomize:  # FIXME this should probably be the default
                    if n_zmws > 0:
                        zmw_dict = defaultdict(list)
                        for i_read, zmw in enumerate(bam.holeNumber):
                            zmw_dict[zmw].append(i_read)
                        have_zmws = set()
                        have_reads = set()
                        zmws = zmw_dict.keys()
                        while True:
                            i_zmw = random.randint(0, len(zmws) - 1)
                            if not zmws[i_zmw] in have_zmws:
                                for i_read in zmw_dict[zmws[i_zmw]]:
                                    assert not i_read in have_reads
                                    out.write(bam[i_read].peer)
                                    have_reads.add(i_read)
                                    n_file_reads += 1
                                have_zmws.add(zmws[i_zmw])
                                n_file_zmws += 1
                            if n_file_zmws == n_zmws:
                                break
                    else:
                        have_reads = set()
                        while True:
                            i_read = random.randint(0, len(bam) - 1)
                            if not i_read in have_reads:
                                out.write(bam[i_read].peer)
                                have_reads.add(i_read)
                                n_file_reads += 1
                            if n_file_reads == n_reads:
                                break
                else:
                    for read in bam:
                        if n_zmws:
                            zmws.add(read.HoleNumber)
                        else:
                            n_file_reads += 1
                        if len(zmws) > n_zmws or n_file_reads > n_reads:
                            break
                        else:
                            out.write(read.peer)
                            n_file_reads += 1
                logging.info("Wrote %d reads to file %s" %
                             (n_file_reads, bam_output))
            subprocess.call(["samtools", "index", bam_output])
            subprocess.call(["pbindex", bam_output])
            ds_out = op.splitext(bam_output)[0] + ".xml"
            ds_new = dataset_type(bam_output)
            ds_new.write(ds_out)
            xml_files.append(ds_out)
    ds_new = dataset_type(*xml_files)
    ds_new.write(output_file)
    logging.info("%s saved as %s" % (dataset_type.__name__, output_file))
    return output_file
Code example #56
 def test_file_factory_css(self):
     fname = "/mnt/secondary-siv/testdata/ccs/tiny/little.ccs.bam"
     mystery = openDataFile(fname)
     self.assertEqual(type(mystery), ConsensusReadSet)
Code example #57
File: bamSieve.py Project: mpkocher/pbcoretools
def filter_reads(input_bam,
                 output_bam,
                 whitelist=None,
                 blacklist=None,
                 percentage=None,
                 count=None,
                 seed=None,
                 ignore_metadata=False,
                 anonymize=False):
    if output_bam is None:
        log.error("Must specify output file")
        return 1
    n_specified = 4 - [whitelist, blacklist, percentage, count].count(None)
    if n_specified != 1:
        log.error("You must choose one and only one of the following "+
                  "options: --whitelist, --blacklist, --count, --percentage")
        return 1
    if seed is not None:
        random.seed(seed)
    if whitelist is None and blacklist is None:
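        # exactly one of percentage/count is set here (the n_specified check
        # above enforces it); under Python 2 a None operand compares as less
        # than any number, so the unset half of this test is always True and
        # the range check effectively applies only to the supplied value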
        if not 0 < percentage < 100 and not count > 0:
            log.error("No reads selected for output.")
            return 1
    output_ds = None
    if output_bam.endswith(".xml"):
        if not input_bam.endswith(".xml"):
            print "DataSet output only supported for DataSet inputs."
            return 1
        output_ds = output_bam
        output_bam = op.splitext(output_ds)[0] + ".bam"
    if output_bam == input_bam:
        log.error("Input and output files must not be the same path")
        return 1
    elif not output_bam.endswith(".bam"):
        log.error("Output file name must end in either '.bam' or '.xml'")
        return 1
    n_file_reads = 0
    have_zmws = set()
    with openDataFile(input_bam) as ds_in:
        # TODO(nechols)(2016-03-11): refactor this to enable propagation of
        # filtered scraps
        if not ds_in.isIndexed:
            log.error("Input BAM must have accompanying .pbi index")
            return 1
        f1 = ds_in.resourceReaders()[0]
        if percentage is not None or count is not None:
            with AlignmentFile(output_bam, 'wb', template=f1.peer) as bam_out:
                zmw_dict = defaultdict(list)
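                # group read indices by (movie, ZMW, file index); hole
                # numbers alone are not unique across movies or input files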
                for i_file, bam in enumerate(ds_in.resourceReaders()):
                    for i_read, zmw in enumerate(bam.holeNumber):
                        movie = bam.readGroupInfo(bam.qId[i_read]).MovieName
                        zmw_dict[(movie, zmw, i_file)].append(i_read)
                zmws = zmw_dict.keys()
                n_zmws_start = len(zmws)
                n_zmws_out = count
                if percentage is not None:
                    n_zmws_out = int(n_zmws_start * percentage / 100.0)
                log.info("Will random select {n} / {d} ZMWs".format(
                    n=n_zmws_out, d=n_zmws_start))
                have_reads = set()
                while True:
                    i_zmw = random.randint(0, len(zmws) - 1)
                    if not zmws[i_zmw] in have_zmws:
                        movie, zmw, i_file = zmws[i_zmw]
                        bam = ds_in.resourceReaders()[i_file]
                        for i_read in zmw_dict[zmws[i_zmw]]:
                            assert not (i_file, i_read) in have_reads
                            if anonymize:
                                _anonymize_sequence(bam[i_read].peer)
                            bam_out.write(bam[i_read].peer)
                            have_reads.add((i_file, i_read))
                            n_file_reads += 1
                        have_zmws.add(zmws[i_zmw])
                    if len(have_zmws) == n_zmws_out:
                        break
        else:
            # convert these to Python sets
            _whitelist = _process_zmw_list(whitelist)
            _blacklist = _process_zmw_list(blacklist)
            have_zmws = set()
            n_reads = 0
            with AlignmentFile(output_bam, 'wb', template=f1.peer) as bam_out:
                for f in ds_in.resourceReaders():
                    for i_zmw, zmw in enumerate(f.holeNumber):
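                        # whitelist and blacklist are mutually exclusive
                        # (enforced earlier), so only one side of this
                        # condition is ever active in a given run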
                        if ((len(_whitelist) > 0 and zmw in _whitelist) or
                                (len(_blacklist) > 0 and not zmw in _blacklist)):
                            rec = f[i_zmw]
                            if anonymize:
                                _anonymize_sequence(rec.peer)
                            bam_out.write(rec.peer)
                            have_zmws.add(zmw)
                            n_file_reads += 1
    if n_file_reads == 0:
        log.error("No reads written")
        return 1
    log.info("{n} records from {z} ZMWs written".format(
        n=n_file_reads, z=len(have_zmws)))
    try:
        rc = subprocess.call(["pbindex", output_bam])
    except OSError as e:
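        # errno 2 (ENOENT) here means the pbindex executable is not on PATH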
        if e.errno == 2:
            log.warn("pbindex not present, will not create .pbi file")
        else:
            raise
    if output_ds is not None:
        with openDataSet(input_bam) as ds_in:
            ds_out = ds_in.__class__(output_bam)
            if not ignore_metadata:
                ds_out.metadata = ds_in.metadata
                ds_out.updateCounts()
            ds_out.write(output_ds)
            log.info("wrote {t} XML to {x}".format(
                t=ds_in.__class__.__name__, x=output_ds))
    return 0
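
A small usage sketch, assuming hypothetical file names: randomly keep roughly
10% of the ZMWs from an indexed SubreadSet, with a fixed seed for
reproducibility, writing both the subsampled BAM and a dataset XML.

# hypothetical paths; the input dataset must carry .pbi indices
rc = filter_reads(input_bam="movie.subreadset.xml",
                  output_bam="movie_10pct.subreadset.xml",
                  percentage=10,
                  seed=12345)
assert rc == 0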