def resolved_tool_contract_runner(rtc):
    log.info("Starting {f} version {v} report generation".format(
        f=__file__, v=__version__))
    dataset_uuids = [
        openDataFile(rtc.task.input_files[0]).uuid,
        openDataFile(rtc.task.input_files[1]).uuid
    ]
    report = run_to_report_bam(
        reads=rtc.task.input_files[0],
        barcodes=rtc.task.input_files[1],
        subreads=True,
        dataset_uuids=dataset_uuids)
    log.info(pformat(report.to_dict()))
    report.write_json(rtc.task.output_files[0])
    return 0
def test_incorrect_len_getitem(self):
    types = [AlignmentSet(data.getXml(8)),
             ReferenceSet(data.getXml(9)),
             SubreadSet(data.getXml(10)),
             HdfSubreadSet(data.getXml(19))]
    fn = tempfile.NamedTemporaryFile(suffix=".xml").name
    for ds in types:
        explen = -2
        with openDataFile(ds.toExternalFiles()[0]) as mystery:
            # try to avoid crashes...
            explen = len(mystery)
            mystery.numRecords = 1000000000
            mystery.write(fn)
        with openDataFile(fn) as mystery:
            self.assertEqual(len(list(mystery)), explen)
def test_incorrect_len_getitem(self):
    types = [
        AlignmentSet(data.getXml(7)),
        ReferenceSet(data.getXml(8)),
        SubreadSet(data.getXml(9))
    ]
    fn = tempfile.NamedTemporaryFile(suffix=".xml").name
    for ds in types:
        explen = -2
        with openDataFile(ds.toExternalFiles()[0]) as mystery:
            # try to avoid crashes...
            explen = len(mystery)
            mystery.numRecords = 1000000000
            mystery.write(fn)
        with openDataFile(fn) as mystery:
            assert len(list(mystery)) == explen
def test_whitelist(self):
    ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
    WHITELIST = set([24962, 32901, 30983])

    def _run_with_whitelist(wl):
        rc = bamSieve.filter_reads(
            input_bam=SUBREADS3,
            output_bam=ofn,
            whitelist=wl)
        self.assertEqual(rc, 0)
        with openDataFile(ofn, strict=False) as bam_out:
            have_zmws = set([rec.HoleNumber for rec in bam_out])
            self.assertEqual(have_zmws, WHITELIST)

    _run_with_whitelist(WHITELIST)
    _run_with_whitelist(",".join([str(x) for x in list(WHITELIST)]))
    tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
    with open(tmp_wl, "w") as wl_out:
        wl_out.write("\n".join([str(x) for x in list(WHITELIST)]))
    _run_with_whitelist(tmp_wl)
    # now with a BAM file as whitelist
    rc = bamSieve.filter_reads(
        input_bam=SUBREADS3,
        output_bam=ofn,
        whitelist=SUBREADS4)
    with openDataFile(ofn, strict=False) as bam_out:
        self.assertEqual(117, len([rec for rec in bam_out]))
def _read_in_indexed_alignmentset(in_fn, reference=None):
    """
    Extract data from the .pbi files in an AlignmentSet using numpy array
    operations.
    """
    lengths, percent_accs, map_qvs = [], [], []
    with openDataFile(in_fn) as ds:
        for bam in ds.resourceReaders():
            if len(bam) == 0:
                continue
            identities = bam.identity
            ref_name_to_id = {r.Name: r.ID for r in bam.referenceInfoTable}
            sel = np.full(len(identities), True, dtype=bool)
            bam_lengths = bam.pbi.aEnd - bam.pbi.aStart
            if reference is not None:
                ref_id = None
                # FIXME there must be a cleaner way to do this...
                for ref_info in bam.referenceInfoTable:
                    if ref_info.Name == reference:
                        ref_id = ref_info.ID
                        break
                sel = bam.pbi.tId == ref_id
            lengths.extend(bam_lengths[sel])
            percent_accs.extend(identities[sel])
            map_qvs.extend(bam.pbi.mapQV[sel])
    data = np.array([lengths, percent_accs, map_qvs])
    data = data.transpose()
    return data
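# Hedged usage sketch (not part of the original source; the input path below
# is hypothetical). The function above returns an N x 3 array whose columns
# are aligned length, identity, and mapQV, one row per aligned record:
#
#   data = _read_in_indexed_alignmentset("mapped.alignmentset.xml")
#   lengths, identities, map_qvs = data[:, 0], data[:, 1], data[:, 2]
#   print("mean aligned length: {:.1f}".format(lengths.mean()))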
def test_split_zmws_targetsize(self):
    N_RECORDS = 117
    N_ZMWS = 48
    test_file = upstreamdata.getUnalignedBam()
    ds1 = openDataFile(test_file)
    assert len([r for r in ds1]) == N_RECORDS
    assert len(ds1) == N_RECORDS
    assert len(set(ds1.index.holeNumber)) == N_ZMWS
    # with no split
    dss = list(ds1.split(targetSize=1000, zmws=True))
    assert len(dss) == 1
    assert sum([len([r for r in ds_]) for ds_ in dss]) == N_RECORDS
    assert sum([len(ds_) for ds_ in dss]) == N_RECORDS
    exp = [48]
    obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
    assert exp == obs
    # with a split
    dss = list(ds1.split(targetSize=25, zmws=True))
    assert len(dss) == 2
    assert sum([len([r for r in ds_]) for ds_ in dss]) == N_RECORDS
    assert sum([len(ds_) for ds_ in dss]) == N_RECORDS
    exp = [24, 24]
    obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
    assert exp == obs
    # with a split
    dss = list(ds1.split(targetSize=5, zmws=True))
    assert len(dss) == 10
    assert sum([len([r for r in ds_]) for ds_ in dss]) == N_RECORDS
    assert sum([len(ds_) for ds_ in dss]) == N_RECORDS
    exp = [4, 4, 5, 5, 5, 5, 5, 5, 5, 5]
    obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
    assert exp == obs
def createXml(args):
    if args.dsType is None:
        dset = openDataFile(*args.infile, strict=args.strict,
                            skipCounts=args.skipCounts,
                            generateIndices=args.generateIndices)
    else:
        dsTypes = DataSet.castableTypes()
        dset = dsTypes[args.dsType](*args.infile, strict=args.strict,
                                    skipCounts=args.skipCounts,
                                    generateIndices=args.generateIndices)
        if args.generateIndices:
            # we generated the indices with the last open, lets capture them
            # with this one:
            dset = dsTypes[args.dsType](*args.infile, strict=args.strict,
                                        skipCounts=args.skipCounts)
    if args.dsName != '':
        dset.name = args.dsName
    log.debug("Dataset created")
    dset.write(args.outfile, validate=args.novalidate, modPaths=True,
               relPaths=args.relative)
    log.debug("Dataset written")
    return 0
def test_large_split_zmws(self): N_RECORDS = 959539 test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/" "2372215/0007/Analysis_Results/m150404_101626_42" "267_c100807920800000001823174110291514_s1_p0.al" "l.subreadset.xml") ds1 = openDataFile(test_file) assert len(ds1) == N_RECORDS dss = list(ds1.split(chunks=1, zmws=True)) assert len(dss) == 1 assert sum([len(ds_) for ds_ in dss]) == N_RECORDS dss = list(ds1.split(chunks=12, zmws=True)) assert len(dss) == 12 assert sum([len(ds_) for ds_ in dss]) == N_RECORDS assert dss[0].zmwRanges == [ ('m150404_101626_42267_c100807920800000001823174110291514_s1_p0', 7, 14009) ] assert dss[-1].zmwRanges == [ ('m150404_101626_42267_c100807920800000001823174110291514_s1_p0', 149881, 163475) ] ranges = sorted([c.zmwRanges[0][1:] for c in dss]) interspans = [] last = None for rg in ranges: if not last is None: interspans.append((last, rg[0])) assert not last == rg[0] last = rg[1] for rg in interspans: assert len( np.nonzero( np.logical_and(ds1.index.holeNumber < rg[1], ds1.index.holeNumber > rg[0]))[0]) == 0
def test_anonymize(self): ofn1 = tempfile.NamedTemporaryFile(suffix=".bam").name ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name rc = bamsieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn1, whitelist=set([24962])) assert rc == 0 rc = bamsieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn2, whitelist=set([24962]), anonymize=True) assert rc == 0 with openDataFile(ofn1) as bam1: with openDataFile(ofn2) as bam2: for rec1, rec2 in zip(bam1, bam2): assert rec1.qName == rec2.qName assert rec1.peer.seq != rec2.peer.seq
def test_anonymize(self): ofn1 = tempfile.NamedTemporaryFile(suffix=".bam").name ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name rc = bamSieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn1, whitelist=set([24962])) self.assertEqual(rc, 0) rc = bamSieve.filter_reads(input_bam=SUBREADS3, output_bam=ofn2, whitelist=set([24962]), anonymize=True) self.assertEqual(rc, 0) with openDataFile(ofn1) as bam1: with openDataFile(ofn2) as bam2: for rec1, rec2 in zip(bam1, bam2): self.assertEqual(rec1.qName, rec2.qName) self.assertNotEqual(rec1.peer.seq, rec2.peer.seq)
def _run_with_blacklist(bl):
    rc = bamSieve.filter_reads(
        input_bam=SUBREADS2,
        output_bam=ofn,
        blacklist=bl)
    self.assertEqual(rc, 0)
    with openDataFile(ofn, strict=False) as bam_out:
        have_zmws = set([rec.HoleNumber for rec in bam_out])
        self.assertEqual(have_zmws, set([9]))
def _run_with_whitelist(wl):
    rc = bamSieve.filter_reads(
        input_bam=SUBREADS3,
        output_bam=ofn,
        whitelist=wl)
    self.assertEqual(rc, 0)
    with openDataFile(ofn, strict=False) as bam_out:
        have_zmws = set([rec.HoleNumber for rec in bam_out])
        self.assertEqual(have_zmws, WHITELIST)
def test_file_factory(self):
    # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
    types = [AlignmentSet(data.getXml(8)),
             ReferenceSet(data.getXml(9)),
             SubreadSet(data.getXml(10)),
             #ConsensusAlignmentSet(data.getXml(20)),
             HdfSubreadSet(data.getXml(19))]
    for ds in types:
        mystery = openDataFile(ds.toExternalFiles()[0])
        self.assertEqual(type(mystery), type(ds))
def test_anonymize(self): ofn1 = tempfile.NamedTemporaryFile(suffix=".bam").name ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name rc = bamSieve.filter_reads( input_bam=SUBREADS3, output_bam=ofn1, whitelist=set([24962])) self.assertEqual(rc, 0) rc = bamSieve.filter_reads( input_bam=SUBREADS3, output_bam=ofn2, whitelist=set([24962]), anonymize=True) self.assertEqual(rc, 0) with openDataFile(ofn1, strict=False) as bam1: with openDataFile(ofn2, strict=False) as bam2: for rec1, rec2 in zip(bam1, bam2): self.assertEqual(rec1.qName, rec2.qName) self.assertNotEqual(rec1.peer.seq, rec2.peer.seq)
def test_trust_counts(self):
    import pbtestdata
    f1 = pbtestdata.get_file("aligned-xml")
    f2 = pbtestdata.get_file("aligned-ds-2")
    ds = openDataFile(f1, f2, trustCounts=True)
    assert ds.numRecords == 133
    assert len(ds) == 133
    assert ds.totalLength == 274217
    assert ds._index is None
    assert len(ds._openReaders) == 0
def test_file_factory(self):
    # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
    types = [
        AlignmentSet(data.getXml(7)),
        ReferenceSet(data.getXml(8)),
        SubreadSet(data.getXml(9))
    ]
    for ds in types:
        mystery = openDataFile(ds.toExternalFiles()[0])
        assert type(mystery) == type(ds)
def test_count(self):
    ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
    rc = bamSieve.filter_reads(
        input_bam=SUBREADS3,
        output_bam=ofn,
        count=1,
        seed=12345)
    self.assertEqual(rc, 0)
    with openDataFile(ofn, strict=False) as bam_out:
        zmws = set([rec.HoleNumber for rec in bam_out])
        self.assertEqual(len(zmws), 1)
def test_file_factory(self):
    # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
    types = [
        AlignmentSet(data.getXml(8)),
        ReferenceSet(data.getXml(9)),
        SubreadSet(data.getXml(10)),
        #ConsensusAlignmentSet(data.getXml(20)),
        HdfSubreadSet(data.getXml(19))
    ]
    for ds in types:
        mystery = openDataFile(ds.toExternalFiles()[0])
        self.assertEqual(type(mystery), type(ds))
def show_zmws(input_file):
    zmws = []
    with openDataFile(input_file) as ds_in:
        is_indexed = ds_in.isIndexed
        if not is_indexed:
            log.warning("Unindexed file(s), this may be very slow")
        for rr in ds_in.resourceReaders():
            if is_indexed:
                zmws.extend(list([int(x) for x in rr.holeNumber]))
            else:
                zmws.extend([int(rec.HoleNumber) for rec in rr])
    print("\n".join([str(x) for x in sorted(list(set(zmws)))]))
def test_generate_indices(self):
    import pbtestdata
    tmp_bam = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
    tmp_pbi = tmp_bam + ".pbi"
    tmp_bai = tmp_bam + ".bai"
    shutil.copyfile(pbtestdata.get_file("subreads-bam"), tmp_bam)
    ds = openDataFile(tmp_bam, strict=False, generateIndices=True)
    assert ds.externalResources[0].pbi == tmp_pbi
    assert ds.externalResources[0].bai == tmp_bai
    assert os.path.isfile(tmp_pbi)
    assert os.path.isfile(tmp_bai)
    assert len(ds) == 117
def createXml(args):
    if os.path.exists(args.outfile) and not args.force:
        raise IOError("Output file {} already exists. Use --force to "
                      "clobber".format(args.outfile))
    if args.dsType is None:
        dset = openDataFile(*args.infile, strict=args.strict,
                            skipCounts=args.skipCounts,
                            trustCounts=args.trustCounts,
                            generateIndices=args.generateIndices,
                            referenceFastaFname=args.reference_fasta_fname)
    else:
        dsTypes = DataSet.castableTypes()
        dset = dsTypes[args.dsType](
            *args.infile,
            strict=args.strict,
            skipCounts=args.skipCounts,
            trustCounts=args.trustCounts,
            generateIndices=args.generateIndices,
            referenceFastaFname=args.reference_fasta_fname)
    if args.dsName != '':
        dset.name = args.dsName
    if args.metadata:
        dset.loadMetadata(args.metadata)
    if args.well_sample_name or args.bio_sample_name:
        if args.metadata:
            log.warning(
                "Setting the WellSample or BioSample name will overwrite "
                "fields pulled from %s", args.metadata)
        n_new_collections = add_mock_collection_metadata(dset)
        if n_new_collections > 0:
            log.warning(
                "Created new CollectionMetadata from blank template for %d movies",
                n_new_collections)
        if args.well_sample_name:
            force_set_all_well_sample_names(dset, args.well_sample_name)
        if args.bio_sample_name:
            force_set_all_bio_sample_names(dset, args.bio_sample_name)
    log.debug("Dataset created")
    if isinstance(dset, ContigSet):
        if args.organism:
            dset.metadata.organism = args.organism
        if args.ploidy:
            dset.metadata.ploidy = args.ploidy
    dset.newUuid()
    if args.no_sub_datasets:
        dset.subdatasets = []
    if args.unique_collections:
        uniqueify_collections(dset.metadata)
    dset.write(args.outfile, validate=args.novalidate, relPaths=args.relative)
    log.debug("Dataset written")
    return 0
def _get_subread_length_histogram_bin_width(self):
    BIN_SIZES = [100, 200, 500]
    subread_length_max = 0
    with openDataFile(self.alignment_file) as ds:
        for rr in ds.resourceReaders():
            if len(rr) == 0:
                continue
            subread_length_max = max(subread_length_max,
                                     (rr.pbi.aEnd - rr.pbi.aStart).max())
    for bin_width in BIN_SIZES:
        if (subread_length_max / float(bin_width)) < 100:
            return bin_width
    return BIN_SIZES[-1]
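# Hedged worked example (numbers hypothetical): with a maximum aligned
# subread length of 12000, a 100 bp bin width gives 120 bins (>= 100,
# rejected), while 200 bp gives 60 bins (< 100), so the method above would
# return 200; if even 500 bp yields 100 or more bins, it falls back to 500.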
def _iter_bam_files(input_file): if input_file.endswith(".xml"): with openDataFile(input_file) as ds_in: if not ds_in.isIndexed: log.warning("Unindexed file(s), this may be very slow") for rr in ds_in.resourceReaders(): yield rr else: if op.exists(input_file + ".pbi"): with IndexedBamReader(input_file) as bam_in: yield bam_in else: with BamReader(input_file) as bam_in: yield bam_in
def __init__(self, file_name):
    self.file_name = file_name
    self._is_fasta = False
    self.ext = op.splitext(file_name)[1].upper()
    if self.ext in [".FA", ".FASTA"]:
        self._dataset = FastaReader(file_name)
        self._is_fasta = True
    elif self.ext == ".BAM":
        self._dataset = openDataFile(file_name)
    else:
        # either contigset.xml or consensusreadset.xml
        assert self.ext == ".XML"
        self._dataset = openDataSet(file_name)
        if isinstance(self._dataset, ContigSet):
            self._is_fasta = True
def _get_subreads_from_dataset(subread_list):
    with openDataFile(subread_list) as ds_in:
        if ds_in.isIndexed:
            qid = ds_in.index.qId
            mname = [ds_in.qid2mov[q] for q in qid]
            zmws = ds_in.index.holeNumber
            start = ds_in.index.qStart
            stop = ds_in.index.qEnd
            return set(
                [_make_qname(*x) for x in zip(mname, zmws, start, stop)])
        else:
            subreads = set()
            for record in ds_in:
                subreads.add(record.qName)
            return subreads
def run_args(args):
    ds = openDataFile(args.dataset)
    get_full_bam = args.load_snr or args.load_numpasses
    is_barcoded = ds.isBarcoded
    headers = list(HEADERS)
    if is_barcoded:
        headers += HEADERS_BC
    if args.load_snr:
        headers += HEADERS_SNR
    if args.load_numpasses:
        headers += HEADERS_NPASSES
    rows = []
    for rr in ds.resourceReaders():
        identity = rr.pbi.identity
        for i, holeNumber in enumerate(rr.pbi.holeNumber):
            reference = rr.referenceInfo(rr.pbi.tId[i])[2]
            aLen = rr.pbi.aEnd[i] - rr.pbi.aStart[i]
            if aLen <= 0 or identity[i] < 0:
                log.warning(
                    "ZMW %s has negative-length alignment or negative "
                    "computed identity, skipping", holeNumber)
                continue
            rc = "FALSE"
            if rr.pbi.isReverseStrand[i]:
                rc = "TRUE"
            row = [
                rr.filename, holeNumber, rr.pbi.qStart[i], rr.pbi.qEnd[i],
                rr.pbi.readQual[i], rr.pbi.virtualFileOffset[i],
                rr.pbi.contextFlag[i], reference, rr.pbi.tStart[i],
                rr.pbi.tEnd[i], rr.pbi.aStart[i], rr.pbi.aEnd[i], rc,
                rr.pbi.nM[i], rr.pbi.nMM[i], rr.pbi.mapQV[i],
                rr.pbi.nIns[i], rr.pbi.nDel[i]
            ]
            if is_barcoded:
                row.extend([
                    rr.pbi.bcForward[i], rr.pbi.bcReverse[i],
                    rr.pbi.bcQual[i]
                ])
            if get_full_bam:
                rec = rr[i]
                if args.load_snr:
                    snr = rec.peer.get_tag("sn")
                    row.extend(snr)
                if args.load_numpasses:
                    row.append(rec.peer.get_tag("np"))
            rows.append(row)
    _write_csv(rows, args.csv_out, headers=headers)
    log.info("Wrote %s", args.csv_out)
    return 0
def _iter_bam_files(input_file):
    def __read_bam(fn):
        if op.exists(fn + ".pbi"):
            with IndexedBamReader(fn) as bam_in:
                return bam_in
        else:
            with BamReader(fn) as bam_in:
                return bam_in
    if input_file.endswith(".xml"):
        with openDataFile(input_file) as ds_in:
            if not ds_in.isIndexed:
                log.warning("Unindexed file(s), this may be very slow")
            for er in ds_in.externalResources:
                for bam in [er.bam, er.scraps]:
                    if bam is not None:
                        yield __read_bam(bam)
    else:
        yield __read_bam(input_file)
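# Hedged usage sketch (not part of the original source; the path below is
# hypothetical). The generator above yields one reader per BAM resource
# (including scraps), whether the input is a dataset XML or a bare .bam:
#
#   for reader in _iter_bam_files("movie.subreadset.xml"):
#       print(reader.filename)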
def test_integration(self):
    args = ["bamSieve", "--help"]
    with tempfile.TemporaryFile() as stdout:
        with tempfile.TemporaryFile() as stderr:
            rc = subprocess.call(args, stdout=stdout, stderr=stderr)
            self.assertEqual(rc, 0)
    ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
    args = [
        "bamSieve", "--log-level", "ERROR",
        "--whitelist", "8,233",
        SUBREADS2, ofn
    ]
    rc = subprocess.call(args)
    self.assertEqual(rc, 0)
    with openDataFile(ofn, strict=False) as bam_out:
        have_zmws = set([rec.HoleNumber for rec in bam_out])
        self.assertEqual(have_zmws, set([8]))
def __init__(self, *args): if len(args) == 1: args = get_files_from_file_or_fofn(args[0]) self._dataset = openDataFile(*args) # Implementation notes: find all the bam files, and group # them together by movieName self._header = BamHeader(ignore_pg=True) for bam in self._dataset.resourceReaders(): if not isinstance(bam, IndexedBamReader): raise ValueError("%s in %s must have pbi index generated", bam.filename, str(self._dataset)) self._header.add(bam.peer.header) for rg in bam.peer.header["RG"]: #readGroupTable: if rg['PL'] != "PACBIO": raise IOError("Input BAM file %s for %s must be PacBio BAM.", bam.filename, self.__class__.__name__) for rg in bam.readGroupTable: assert rg.ReadType in ["CCS", "SUBREAD"]
def _process_zmw_list(zmw_list): zmws = set() if zmw_list is None: return zmws elif isinstance(zmw_list, set): return zmw_list elif isinstance(zmw_list, (list, tuple)): return set(zmw_list) elif op.isfile(zmw_list): base, ext = op.splitext(zmw_list) if ext in [".bam", ".xml"]: with openDataFile(zmw_list) as ds_zmw: zmws.update(ds_zmw.index.holeNumber) else: with open(zmw_list) as f: lines = f.read().splitlines() zmws.update(set([int(x) for x in lines])) else: zmws.update(set([int(x) for x in zmw_list.split(",")])) return zmws
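# Hedged usage sketch (not part of the original source; file names are
# hypothetical). _process_zmw_list above accepts a set, a list/tuple, a
# comma-separated string, a text file of hole numbers, or a BAM/XML dataset:
#
#   _process_zmw_list(None)                  # -> set()
#   _process_zmw_list("8,233")               # -> {8, 233}
#   _process_zmw_list([24962, 32901])        # -> {24962, 32901}
#   _process_zmw_list("whitelist.txt")       # one hole number per line
#   _process_zmw_list("movie.subreads.bam")  # hole numbers from the .pbi index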
def run_args(args):
    sample_name = None
    if not args.single_sample and not args.all_samples:
        bam = openDataFile(args.samples_file)
        sample_name = bam.readGroupTable[0].SampleName
        log.info("Sample name is {}".format(sample_name))
    elif args.all_samples:
        sample_name = "All Samples"
    files = []
    for file_id, file_type, label in FILE_IDS_AND_NAMES:
        file_path = getattr(args, file_id)
        if file_path is None:
            log.info("Skipping {}".format(file_id))
            continue
        assert file_path is not None and op.exists(file_path)
        if sample_name:
            label += " ({})".format(sample_name)
        files.append(to_datastore_file(file_path, file_id, file_type, label))
    DataStore(files).write_json(args.datastore)
    return 0
def test_split_zmws_targetsize(self):
    N_RECORDS = 117
    N_ZMWS = 48
    test_file = upstreamdata.getUnalignedBam()
    ds1 = openDataFile(test_file)
    self.assertEqual(len([r for r in ds1]), N_RECORDS)
    self.assertEqual(len(ds1), N_RECORDS)
    self.assertEqual(len(set(ds1.index.holeNumber)), N_ZMWS)
    # with no split
    dss = ds1.split(targetSize=1000, zmws=True)
    self.assertEqual(len(dss), 1)
    self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                     N_RECORDS)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
    exp = [48]
    obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
    self.assertListEqual(exp, obs)
    # with a split
    dss = ds1.split(targetSize=25, zmws=True)
    self.assertEqual(len(dss), 2)
    self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                     N_RECORDS)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
    exp = [24, 24]
    obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
    self.assertListEqual(exp, obs)
    # with a split
    dss = ds1.split(targetSize=5, zmws=True)
    self.assertEqual(len(dss), 10)
    self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                     N_RECORDS)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
    exp = [4, 4, 5, 5, 5, 5, 5, 5, 5, 5]
    obs = sorted([len(set(ds.index.holeNumber)) for ds in dss])
    self.assertListEqual(exp, obs)
def _process_zmw_list(zmw_list): zmws = set() if zmw_list is None: return zmws elif isinstance(zmw_list, set): return zmw_list elif isinstance(zmw_list, (list, tuple)): return set(zmw_list) elif op.isfile(zmw_list): base, ext = op.splitext(zmw_list) if ext in [".bam", ".xml"]: with openDataFile(zmw_list) as ds_zmw: for f in ds_zmw.resourceReaders(): zmws.update(set(list(f.holeNumber))) else: with open(zmw_list) as f: lines = f.read().splitlines() zmws.update(set([int(x) for x in lines])) else: zmws.update(set([int(x) for x in zmw_list.split(",")])) return zmws
def test_split_zmws(self):
    N_RECORDS = 117
    test_file = upstreamdata.getUnalignedBam()
    ds1 = openDataFile(test_file)
    self.assertEqual(len([r for r in ds1]), N_RECORDS)
    self.assertEqual(len(ds1), N_RECORDS)
    dss = ds1.split(chunks=1, zmws=True)
    self.assertEqual(len(dss), 1)
    self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                     N_RECORDS)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
    # We have a lower limit on the number of zmws, now
    dss = ds1.split(chunks=12, zmws=True)
    self.assertEqual(len(dss), 2)
    self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
                     N_RECORDS)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
    self.assertEqual(
        dss[0].zmwRanges,
        [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
          1650, 32328)])
    self.assertEqual(
        dss[-1].zmwRanges,
        [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
          32560, 54396)])
    ranges = sorted([c.zmwRanges[0][1:] for c in dss])
    interspans = []
    last = None
    for rg in ranges:
        if not last is None:
            interspans.append((last, rg[0]))
            self.assertFalse(last == rg[0])
        last = rg[1]
    for rg in interspans:
        self.assertEqual(
            len(np.nonzero(np.logical_and(ds1.index.holeNumber < rg[1],
                                          ds1.index.holeNumber > rg[0]))[0]),
            0)
def _labels_reads_iterator(reads, barcodes, subreads=True):
    with openDataSet(reads) as ds:
        movies = set()
        for rr in ds.resourceReaders():
            movies.update(rr.movieNames)
        if len(movies) != 1:  # FIXME
            raise NotImplementedError("Multiple-movie datasets are not " +
                                      "supported by this application.")
        assert ds.isIndexed
        zmws_by_barcode = defaultdict(set)
        reads_by_zmw = defaultdict(list)
        for rr in ds.resourceReaders():
            for i, (b, z) in enumerate(zip(rr.pbi.bcForward,
                                           rr.pbi.holeNumber)):
                zmws_by_barcode[b].add(z)
                reads_by_zmw[z].append((rr, i))
        with openDataFile(barcodes) as bc:
            for i_bc, barcode in enumerate(bc):
                zmws = sorted(list(zmws_by_barcode[i_bc]))
                for zmw in zmws:
                    for rr, i_read in reads_by_zmw[zmw]:
                        # FIXME(nechols)(2016-03-15) this will not work on CCS
                        qlen = rr.pbi.qEnd[i_read] - rr.pbi.qStart[i_read]
                        yield barcode.id, ["n"] * qlen
def test_large_split_zmws(self): N_RECORDS = 959539 test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/" "2372215/0007/Analysis_Results/m150404_101626_42" "267_c100807920800000001823174110291514_s1_p0.al" "l.subreadset.xml") ds1 = openDataFile(test_file) self.assertEqual(len(ds1), N_RECORDS) dss = ds1.split(chunks=1, zmws=True) self.assertEqual(len(dss), 1) self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS) dss = ds1.split(chunks=12, zmws=True) self.assertEqual(len(dss), 12) self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS) self.assertEqual( dss[0].zmwRanges, [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0', 7, 14009)]) self.assertEqual( dss[-1].zmwRanges, [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0', 149881, 163475)]) ranges = sorted([c.zmwRanges[0][1:] for c in dss]) interspans = [] last = None for rg in ranges: if not last is None: interspans.append((last, rg[0])) self.assertFalse(last == rg[0]) last = rg[1] for rg in interspans: self.assertEqual(len(np.nonzero(np.logical_and( ds1.index.holeNumber < rg[1], ds1.index.holeNumber > rg[0]))[0]), 0)
def test_file_factory_fofn(self):
    mystery = openDataFile(data.getFofn())
    assert type(mystery) == AlignmentSet
def filter_reads(input_bam, output_bam, whitelist=None, blacklist=None,
                 percentage=None, count=None, seed=None,
                 ignore_metadata=False, relative=None, anonymize=False,
                 use_barcodes=False, sample_scraps=False):
    if output_bam is None:
        log.error("Must specify output file")
        return 1
    output_bam = op.abspath(output_bam)
    if not op.isdir(op.dirname(output_bam)):
        log.error("Output path '{d}' does not exist.".format(
            d=op.dirname(output_bam)))
        return 1
    n_specified = 4 - [whitelist, blacklist, percentage, count].count(None)
    if n_specified != 1:
        log.error("You must choose one and only one of the following " +
                  "options: --whitelist, --blacklist, --count, --percentage")
        return 1
    if seed is not None:
        random.seed(seed)
    if whitelist is None and blacklist is None:
        if not 0 < percentage < 100 and not count > 0:
            log.error("No reads selected for output.")
            return 1
    output_ds = None
    if output_bam.endswith(".xml"):
        if not input_bam.endswith(".xml"):
            print("DataSet output only supported for DataSet inputs.")
            return 1
        ds_type = output_bam.split(".")[-2]
        ext2 = {
            "subreadset": "subreads",
            "alignmentset": "subreads",
            "consensusreadset": "ccs",
            "consensusalignmentset": "ccs"
        }
        if not ds_type in ext2:
            raise ValueError("Invalid dataset type '{t}'".format(t=ds_type))
        output_ds = output_bam
        output_bam = ".".join(output_ds.split(".")[:-2] +
                              [ext2[ds_type], "bam"])
    if output_bam == input_bam:
        log.error("Input and output files must not be the same path")
        return 1
    elif not output_bam.endswith(".bam"):
        log.error("Output file name must end in either '.bam' or '.xml'")
        return 1
    n_file_reads = 0
    have_zmws = set()
    scraps_bam = barcode_set = None
    with openDataFile(input_bam) as ds_in:
        if not isinstance(ds_in, ReadSet):
            raise TypeError("{t} is not an allowed dataset type".format(
                t=type(ds_in).__name__))
        # TODO(nechols)(2016-03-11): refactor this to enable propagation of
        # filtered scraps
        if not ds_in.isIndexed:
            log.error("Input BAM must have accompanying .pbi index")
            return 1
        for ext_res in ds_in.externalResources:
            if ext_res.barcodes is not None:
                assert barcode_set is None or barcode_set == ext_res.barcodes
                barcode_set = barcode_set
        f1 = ds_in.resourceReaders()[0]
        if percentage is not None or count is not None:
            bam_readers = list(ds_in.resourceReaders())
            if sample_scraps:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                        bam_readers.append(scraps_in)
            whitelist = _create_whitelist(bam_readers, percentage, count)
        # convert these to Python sets
        _whitelist = _process_zmw_list(whitelist)
        _blacklist = _process_zmw_list(blacklist)
        scraps_in = None
        if output_ds is not None and output_ds.endswith(".subreadset.xml"):
            for ext_res in ds_in.externalResources:
                if ext_res.scraps is not None:
                    if use_barcodes:
                        log.warn("Scraps BAM is present but lacks " +
                                 "barcodes - will not be propagated " +
                                 "to output SubreadSet")
                    else:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                    break
        with AlignmentFile(output_bam, 'wb', template=f1.peer) as bam_out:
            for bam_in in ds_in.resourceReaders():
                n_records, have_zmws_ = _process_bam_whitelist(
                    bam_in, bam_out,
                    whitelist=_whitelist,
                    blacklist=_blacklist,
                    use_barcodes=use_barcodes,
                    anonymize=anonymize)
                n_file_reads += n_records
                have_zmws.update(have_zmws_)
        if scraps_in is not None:
            scraps_bam = re.sub("subreads.bam$", "scraps.bam", output_bam)
            with AlignmentFile(scraps_bam, 'wb',
                               template=scraps_in.peer) as scraps_out:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        scraps_in_ = IndexedBamReader(ext_res.scraps)
                        n_records, have_zmws_ = _process_bam_whitelist(
                            scraps_in_, scraps_out,
                            _whitelist, _blacklist,
                            use_barcodes=use_barcodes,
                            anonymize=anonymize)
                        have_zmws.update(have_zmws_)
    if n_file_reads == 0:
        log.error("No reads written")
        return 1
    log.info("{n} records from {z} ZMWs written".format(
        n=n_file_reads, z=len(have_zmws)))

    def _run_pbindex(bam_file):
        try:
            rc = subprocess.call(["pbindex", bam_file])
        except OSError as e:
            if e.errno == 2:
                log.warn("pbindex not present, will not create .pbi file")
            else:
                raise

    _run_pbindex(output_bam)
    if output_ds is not None:
        with openDataSet(input_bam) as ds_in:
            ds_out = ds_in.__class__(output_bam)
            if scraps_bam is not None:
                _run_pbindex(scraps_bam)
                ds_out.externalResources[0].scraps = scraps_bam
                # XXX it doesn't pick up the .pbi file - sort of annoying
                # but since the pbcore API doesn't provide a read for the
                # scraps automatically anyway, the impact is minimal
            if barcode_set is not None:
                ds_out.externalResources[0].barcodes = barcode_set
            if not ignore_metadata:
                ds_out.metadata = ds_in.metadata
            ds_out.updateCounts()
            if relative:
                ds_out.makePathsRelative(op.dirname(output_ds))
            ds_out.write(output_ds)
            log.info("wrote {t} XML to {x}".format(
                t=ds_out.__class__.__name__, x=output_ds))
    return 0
def _openAlignments():
    if in_fn.endswith(".cmp.h5"):
        return CmpH5Reader(in_fn)
    else:
        return openDataFile(in_fn)
def filter_reads(input_bam, output_bam, whitelist=None, blacklist=None,
                 percentage=None, count=None, seed=None,
                 ignore_metadata=False, relative=None, anonymize=False,
                 use_barcodes=False, sample_scraps=False,
                 keep_original_uuid=False, use_subreads=False,
                 min_adapters=None):
    _validate_settings(output_bam, whitelist, blacklist, percentage, count,
                       min_adapters)
    output_bam = op.abspath(output_bam)
    if seed is not None:
        random.seed(seed)
    output_ds = base_name = None
    if output_bam.endswith(".xml"):
        if not input_bam.endswith(".xml"):
            raise UserError(
                "DataSet output only supported for DataSet inputs.")
        ds_type = output_bam.split(".")[-2]
        ext2 = OrderedDict([("subreadset", "subreads"),
                            ("alignmentset", "subreads"),
                            ("consensusreadset", "ccs"),
                            ("consensusalignmentset", "ccs"),
                            ("transcriptset", "transcripts"),
                            ("transcriptalignmentset", "transcripts")])
        if not ds_type in ext2:
            raise ValueError(
                "Invalid output file extension '{t}.xml'; valid extensions are:\n{e}"
                .format(t=ds_type,
                        e="\n".join([" %s.xml" % e for e in ext2.keys()])))
        output_ds = output_bam
        base_name = ".".join(output_ds.split(".")[:-2])
        output_bam = base_name + "." + ".".join([ext2[ds_type], "bam"])
    if output_bam == input_bam:
        raise UserError("Input and output files must not be the same path")
    elif not output_bam.endswith(".bam"):
        raise UserError("Output file name must end in either '.bam' or '.xml'")
    n_file_reads = 0
    have_zmws = set()
    scraps_bam = barcode_set = sts_xml = None
    with openDataFile(input_bam) as ds_in:
        if not isinstance(ds_in, ReadSet):
            raise UserError("{t} is not an allowed dataset type".format(
                t=type(ds_in).__name__))
        # TODO(nechols)(2016-03-11): refactor this to enable propagation of
        # filtered scraps
        if not ds_in.isIndexed:
            raise UserError("Input BAM must have accompanying .pbi index")
        for ext_res in ds_in.externalResources:
            if ext_res.barcodes is not None:
                assert barcode_set is None or barcode_set == ext_res.barcodes
                barcode_set = barcode_set
            if ext_res.sts is not None:
                if sts_xml is None:
                    sts_xml = ext_res.sts
                else:
                    log.warning("Multiple sts.xml files, will not propagate")
        f1 = ds_in.resourceReaders()[0]
        if percentage is not None or count is not None or min_adapters is not None:
            bam_readers = list(ds_in.resourceReaders())
            if sample_scraps:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                        bam_readers.append(scraps_in)
            whitelist = _create_whitelist(bam_readers, percentage, count,
                                          min_adapters)
        # convert these to Python sets
        if use_subreads:
            _whitelist = _process_subread_list(whitelist)
            _blacklist = _process_subread_list(blacklist)
        else:
            _whitelist = _process_zmw_list(whitelist)
            _blacklist = _process_zmw_list(blacklist)
        scraps_in = None
        if output_ds is not None and output_ds.endswith(".subreadset.xml"):
            for ext_res in ds_in.externalResources:
                if ext_res.scraps is not None:
                    if use_barcodes:
                        log.warning("Scraps BAM is present but lacks " +
                                    "barcodes - will not be propagated " +
                                    "to output SubreadSet")
                    else:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                    break
        with AlignmentFile(output_bam, 'wb', template=f1.peer) as bam_out:
            for bam_in in ds_in.resourceReaders():
                n_records, have_zmws_ = _process_bam_whitelist(
                    bam_in, bam_out,
                    whitelist=_whitelist,
                    blacklist=_blacklist,
                    use_barcodes=use_barcodes,
                    anonymize=anonymize,
                    use_subreads=use_subreads,
                    qid2mov=ds_in.qid2mov)
                n_file_reads += n_records
                have_zmws.update(have_zmws_)
        if scraps_in is not None:
            scraps_bam = re.sub("subreads.bam$", "scraps.bam", output_bam)
            with AlignmentFile(scraps_bam, 'wb',
                               template=scraps_in.peer) as scraps_out:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        scraps_in_ = IndexedBamReader(ext_res.scraps)
                        n_records, have_zmws_ = _process_bam_whitelist(
                            scraps_in_, scraps_out,
                            _whitelist, _blacklist,
                            use_barcodes=use_barcodes,
                            anonymize=anonymize,
                            use_subreads=use_subreads)
                        have_zmws.update(have_zmws_)
    if n_file_reads == 0:
        log.warn("No reads written")
    else:
        log.info("{n} records from {z} ZMWs written".format(
            n=n_file_reads, z=len(have_zmws)))

    def _run_pbindex(bam_file):
        try:
            rc = subprocess.call(["pbindex", bam_file])
        except OSError as e:
            if e.errno == 2:
                log.warning("pbindex not present, will not create .pbi file")
            else:
                raise

    _run_pbindex(output_bam)
    if output_ds is not None:
        with openDataSet(input_bam) as ds_in:
            ds_out = ds_in.__class__(output_bam)
            if scraps_bam is not None:
                _run_pbindex(scraps_bam)
                ds_out.externalResources[0].scraps = scraps_bam
                # XXX it doesn't pick up the .pbi file - sort of annoying
                # but since the pbcore API doesn't provide a read for the
                # scraps automatically anyway, the impact is minimal
            if barcode_set is not None:
                ds_out.externalResources[0].barcodes = barcode_set
            if sts_xml is not None:
                sts_xml_out = base_name + ".sts.xml"
                log.info("Copying {s} to {d}".format(s=sts_xml,
                                                     d=sts_xml_out))
                shutil.copyfile(sts_xml, sts_xml_out)
                ds_out.externalResources[0].sts = sts_xml_out
            if not ignore_metadata:
                ds_out.metadata = ds_in.metadata
            ds_out.updateCounts()
            ds_out.name = ds_in.name + " (bamsieve)"
            ds_out.tags = ds_in.tags
            if relative:
                ds_out.makePathsRelative(op.dirname(output_ds))
            if keep_original_uuid:
                log.warning("Keeping input UUID {u}".format(u=ds_in.uuid))
                ds_out.objMetadata["UniqueId"] = ds_in.uuid
            ds_out.write(output_ds)
            log.info("wrote {t} XML to {x}".format(
                t=ds_out.__class__.__name__, x=output_ds))
    return 0
def test_file_factory_css(self): fname = ("/pbi/dept/secondary/siv/testdata/ccs-unittest/" "tiny/little.ccs.bam") myster = openDataFile(fname) assert type(myster) == ConsensusReadSet
def to_report(self, output_dir, report_id=Constants.R_ID):
    """
    This needs to be cleaned up. Keeping the old interface for testing
    purposes.
    """
    started_at = time.time()
    log.info("Found {n} movies.".format(n=len(self.movies)))
    log.info("Working from {n} alignment file{s}: {f}".format(
        n=len(self.alignment_file_list),
        s='s' if len(self.alignment_file_list) > 1 else '',
        f=self.alignment_file_list))
    # make this a dict {attribute_key_name:Aggreggator} so it's easy to
    # access the instances after they've been computed.
    # there's duplicated keys in the attributes?
    # number_of_aligned_reads/mapped_reads_n
    _total_aggregators = self._get_total_aggregators()
    null_filter = lambda r: True
    total_model = StatisticsModel(
        list(_total_aggregators.values()), filter_func=null_filter)
    # need to create specific instances for a given movie. This is used to
    # create the mapping reports stats table
    movie_models = {}

    def _my_filter(movie_name1, movie_name2):
        return movie_name1 == movie_name2

    for movie in self.movies:
        ags = [k() for k in self.COLUMN_AGGREGATOR_CLASSES]
        # Note this WILL NOT work because of how scope works in python
        # filter_by_movie_func = lambda m_name: movie.name == m_name
        _my_filter_func = functools.partial(_my_filter, movie)
        model = StatisticsModel(ags, filter_func=_my_filter_func)
        movie_models[movie] = model
    # The statistic models that will be run
    all_models = [total_model] + list(movie_models.values())
    log.debug(all_models)
    # Run all the analysis. Now the aggregators can be accessed
    analyze_movies(self.movies, self.alignment_file_list, all_models)
    # temp structure used to create the report table. The order is
    # important
    # add total values
    _to_a = lambda k: _total_aggregators[k].attribute
    _row = [_to_a(n) for n in self.COLUMN_ATTR]
    _row.insert(0, 'All Movies')
    movie_datum = [_row]
    # Add each individual movie stats
    for movie_name_, model_ in movie_models.items():
        _row = [movie_name_]
        for a in model_.aggregators:
            _row.append(a.attribute)
        movie_datum.append(_row)
    log.info(movie_datum)
    # create the Report table
    table = self._to_table(movie_datum)
    for movie_name, model in movie_models.items():
        log.info("Movie name {n}".format(n=movie_name))
        for a in model.aggregators:
            log.info(movie_name + " " + repr(a))
    log.info("")
    log.info("Total models")
    for a in total_model.aggregators:
        log.info(a)
    attributes = get_attributes(_total_aggregators)
    log.info("Attributes from streaming mapping Report.")
    for a in attributes:
        log.info(a)
    plot_config_views = self._get_plot_view_configs()
    plot_groups = []
    ds = openDataFile(self.alignment_file)
    ds.updateCounts()
    if len(ds) > 0:
        # keeping the ids independent requires a bit of dictionary madness
        # {report_id:HistogramAggregator}
        id_to_aggregators = {k: _total_aggregators[v]
                             for k, v in self.HISTOGRAM_IDS.items()}
        plot_groups = to_plot_groups(plot_config_views, output_dir,
                                     id_to_aggregators)
        rb_pg = PlotGroup(Constants.PG_RAINBOW)
        rb_png = "mapped_concordance_vs_read_length.png"
        make_rainbow_plot(self.alignment_file, rb_png)
        rb_plt = Plot(Constants.P_RAINBOW, rb_png,
                      caption=get_plot_caption(spec, Constants.PG_RAINBOW,
                                               Constants.P_RAINBOW))
        rb_pg.add_plot(rb_plt)
        plot_groups.append(rb_pg)
    self.add_more_plots(plot_groups, output_dir)
    tables = [table]
    report = Report(report_id,
                    attributes=attributes,
                    plotgroups=plot_groups,
                    tables=tables,
                    dataset_uuids=self.dataset_uuids)
    log.debug(report)
    run_time = time.time() - started_at
    log.info("Completed running in {s:.2f} sec.".format(s=run_time))
    return report
def extract_small_dataset_sample(file_name, n_reads=0, n_zmws=0,
                                 output_file=None, randomize=False):  # FIXME
    """
    From the input dataset, extract the first N reads or ZMWs from each .bam
    file and write to an identically named .bam in the current directory.
    Used to generate micro-datasets for very fast testing of pbsmrtpipe.
    """
    assert ([n_reads, n_zmws].count(0) == 1)
    if n_reads == 0:
        n_reads = sys.maxsize
    dataset_type = None
    xml_files = []
    if output_file is None:
        output_file = op.basename(file_name)
        if op.abspath(output_file) == op.abspath(file_name):
            output_file = op.splitext(output_file)[0] + "_tiny.xml"
    with openDataFile(file_name) as ds:
        dataset_type = type(ds)
        for bam in ds.resourceReaders():
            logging.info("processing %s" % op.basename(bam.filename))
            bam_output = op.splitext(output_file)[0] + ".bam"
            with pysam.AlignmentFile(bam_output, "wb",
                                     template=bam.peer) as out:
                zmws = set([])
                n_file_reads = n_file_zmws = 0
                if randomize:  # FIXME this should probably be the default
                    if n_zmws > 0:
                        zmw_dict = defaultdict(list)
                        for i_read, zmw in enumerate(bam.holeNumber):
                            zmw_dict[zmw].append(i_read)
                        have_zmws = set()
                        have_reads = set()
                        zmws = list(zmw_dict.keys())
                        while True:
                            i_zmw = random.randint(0, len(zmws) - 1)
                            if not zmws[i_zmw] in have_zmws:
                                for i_read in zmw_dict[zmws[i_zmw]]:
                                    assert not i_read in have_reads
                                    out.write(bam[i_read].peer)
                                    have_reads.add(i_read)
                                    n_file_reads += 1
                                have_zmws.add(zmws[i_zmw])
                                n_file_zmws += 1
                                if n_file_zmws == n_zmws:
                                    break
                    else:
                        have_reads = set()
                        while True:
                            i_read = random.randint(0, len(bam) - 1)
                            if not i_read in have_reads:
                                out.write(bam[i_read].peer)
                                have_reads.add(i_read)
                                n_file_reads += 1
                                if n_file_reads == n_reads:
                                    break
                else:
                    for read in bam:
                        if n_zmws:
                            zmws.add(read.HoleNumber)
                        else:
                            n_file_reads += 1
                        if len(zmws) > n_zmws or n_file_reads > n_reads:
                            break
                        else:
                            out.write(read.peer)
                            n_file_reads += 1
            logging.info("Wrote %d reads to file %s" % (n_file_reads,
                                                        bam_output))
            subprocess.call(["samtools", "index", bam_output])
            subprocess.call(["pbindex", bam_output])
            ds_out = op.splitext(bam_output)[0] + ".xml"
            ds_new = dataset_type(bam_output)
            ds_new.write(ds_out)
            xml_files.append(ds_out)
    ds_new = dataset_type(*xml_files)
    ds_new.write(output_file)
    logging.info("%s saved as %s" % (dataset_type.__name__, output_file))
    return output_file
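# Hedged usage sketch (not part of the original source; the input path is
# hypothetical). Exactly one of n_reads / n_zmws must be non-zero:
#
#   tiny_xml = extract_small_dataset_sample("movie.subreadset.xml",
#                                           n_reads=100)
#   tiny_xml = extract_small_dataset_sample("movie.subreadset.xml",
#                                           n_zmws=10, randomize=True)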
def to_report(self, output_dir, report_id=Constants.R_ID):
    """
    This needs to be cleaned up. Keeping the old interface for testing
    purposes.
    """
    started_at = time.time()
    log.info("Found {n} movies.".format(n=len(self.movies)))
    log.info("Working from {n} alignment file{s}: {f}".format(
        n=len(self.alignment_file_list),
        s='s' if len(self.alignment_file_list) > 1 else '',
        f=self.alignment_file_list))
    # make this a dict {attribute_key_name:Aggreggator} so it's easy to
    # access the instances after they've been computed.
    # there's duplicated keys in the attributes?
    # number_of_aligned_reads/mapped_reads_n
    _total_aggregators = self._get_total_aggregators()
    null_filter = lambda r: True
    total_model = StatisticsModel(
        list(_total_aggregators.values()), filter_func=null_filter)
    # need to create specific instances for a given movie. This is used to
    # create the mapping reports stats table
    movie_models = {}

    def _my_filter(movie_name1, movie_name2):
        return movie_name1 == movie_name2

    for movie in self.movies:
        ags = [k() for k in self.COLUMN_AGGREGATOR_CLASSES]
        # Note this WILL NOT work because of how scope works in python
        # filter_by_movie_func = lambda m_name: movie.name == m_name
        _my_filter_func = functools.partial(_my_filter, movie)
        model = StatisticsModel(ags, filter_func=_my_filter_func)
        movie_models[movie] = model
    # The statistic models that will be run
    all_models = [total_model] + list(movie_models.values())
    log.debug(all_models)
    # Run all the analysis. Now the aggregators can be accessed
    analyze_movies(self.movies, self.alignment_file_list, all_models)
    # temp structure used to create the report table. The order is
    # important
    # add total values
    _to_a = lambda k: _total_aggregators[k].attribute
    _row = [_to_a(n) for n in self.COLUMN_ATTR]
    _row.insert(0, 'All Movies')
    movie_datum = [_row]
    # Add each individual movie stats
    for movie_name_, model_ in movie_models.items():
        _row = [movie_name_]
        for a in model_.aggregators:
            _row.append(a.attribute)
        movie_datum.append(_row)
    log.info(movie_datum)
    # create the Report table
    table = self._to_table(movie_datum)
    for movie_name, model in movie_models.items():
        log.info("Movie name {n}".format(n=movie_name))
        for a in model.aggregators:
            log.info(movie_name + " " + repr(a))
    log.info("")
    log.info("Total models")
    for a in total_model.aggregators:
        log.info(a)
    attributes = get_attributes(_total_aggregators)
    self.add_more_attributes(attributes)
    log.info("Attributes from streaming mapping Report.")
    for a in attributes:
        log.info(a)
    plot_config_views = self._get_plot_view_configs()
    plot_groups = []
    ds = openDataFile(self.alignment_file)
    ds.updateCounts()
    if len(ds) > 0:
        # keeping the ids independent requires a bit of dictionary madness
        # {report_id:HistogramAggregator}
        id_to_aggregators = {k: _total_aggregators[v]
                             for k, v in self.HISTOGRAM_IDS.items()}
        plot_groups = to_plot_groups(plot_config_views, output_dir,
                                     id_to_aggregators)
        rb_pg = PlotGroup(Constants.PG_RAINBOW)
        rb_png = "mapped_concordance_vs_read_length.png"
        make_rainbow_plot(self.alignment_file, op.join(output_dir, rb_png))
        rb_plt = Plot(Constants.P_RAINBOW, rb_png)
        rb_pg.add_plot(rb_plt)
        plot_groups.append(rb_pg)
    self.add_more_plots(plot_groups, output_dir)
    tables = [table]
    report = Report(report_id,
                    attributes=attributes,
                    plotgroups=plot_groups,
                    tables=tables,
                    dataset_uuids=self.dataset_uuids)
    log.debug(report)
    run_time = time.time() - started_at
    log.info("Completed running in {s:.2f} sec.".format(s=run_time))
    return report
def test_file_factory_css(self): fname = ("/pbi/dept/secondary/siv/testdata/ccs-unittest/" "tiny/little.ccs.bam") myster = openDataFile(fname) self.assertEqual(type(myster), ConsensusReadSet)
def test_file_factory_fofn(self):
    mystery = openDataFile(data.getFofn())
    self.assertEqual(type(mystery), AlignmentSet)
def test_file_factory_css(self): fname = "/mnt/secondary-siv/testdata/ccs/tiny/little.ccs.bam" myster = openDataFile(fname) self.assertEqual(type(myster), ConsensusReadSet)
def filter_reads(input_bam, output_bam, whitelist=None, blacklist=None,
                 percentage=None, count=None, seed=None,
                 ignore_metadata=False, anonymize=False):
    if output_bam is None:
        log.error("Must specify output file")
        return 1
    n_specified = 4 - [whitelist, blacklist, percentage, count].count(None)
    if n_specified != 1:
        log.error("You must choose one and only one of the following " +
                  "options: --whitelist, --blacklist, --count, --percentage")
        return 1
    if seed is not None:
        random.seed(seed)
    if whitelist is None and blacklist is None:
        if not 0 < percentage < 100 and not count > 0:
            log.error("No reads selected for output.")
            return 1
    output_ds = None
    if output_bam.endswith(".xml"):
        if not input_bam.endswith(".xml"):
            print("DataSet output only supported for DataSet inputs.")
            return 1
        output_ds = output_bam
        output_bam = op.splitext(output_ds)[0] + ".bam"
    if output_bam == input_bam:
        log.error("Input and output files must not be the same path")
        return 1
    elif not output_bam.endswith(".bam"):
        log.error("Output file name must end in either '.bam' or '.xml'")
        return 1
    n_file_reads = 0
    have_zmws = set()
    with openDataFile(input_bam) as ds_in:
        # TODO(nechols)(2016-03-11): refactor this to enable propagation of
        # filtered scraps
        if not ds_in.isIndexed:
            log.error("Input BAM must have accompanying .pbi index")
            return 1
        f1 = ds_in.resourceReaders()[0]
        if percentage is not None or count is not None:
            with AlignmentFile(output_bam, 'wb', template=f1.peer) as bam_out:
                zmw_dict = defaultdict(list)
                for i_file, bam in enumerate(ds_in.resourceReaders()):
                    for i_read, zmw in enumerate(bam.holeNumber):
                        movie = bam.readGroupInfo(bam.qId[i_read]).MovieName
                        zmw_dict[(movie, zmw, i_file)].append(i_read)
                zmws = list(zmw_dict.keys())
                n_zmws_start = len(zmws)
                n_zmws_out = count
                if percentage is not None:
                    n_zmws_out = int(n_zmws_start * percentage / 100.0)
                log.info("Will random select {n} / {d} ZMWs".format(
                    n=n_zmws_out, d=n_zmws_start))
                have_reads = set()
                while True:
                    i_zmw = random.randint(0, len(zmws) - 1)
                    if not zmws[i_zmw] in have_zmws:
                        movie, zmw, i_file = zmws[i_zmw]
                        bam = ds_in.resourceReaders()[i_file]
                        for i_read in zmw_dict[zmws[i_zmw]]:
                            assert not (i_file, i_read) in have_reads
                            if anonymize:
                                _anonymize_sequence(bam[i_read].peer)
                            bam_out.write(bam[i_read].peer)
                            have_reads.add((i_file, i_read))
                            n_file_reads += 1
                        have_zmws.add(zmws[i_zmw])
                        if len(have_zmws) == n_zmws_out:
                            break
        else:
            # convert these to Python sets
            _whitelist = _process_zmw_list(whitelist)
            _blacklist = _process_zmw_list(blacklist)
            have_zmws = set()
            n_reads = 0
            with AlignmentFile(output_bam, 'wb', template=f1.peer) as bam_out:
                for f in ds_in.resourceReaders():
                    for i_zmw, zmw in enumerate(f.holeNumber):
                        if ((len(_whitelist) > 0 and zmw in _whitelist) or
                                (len(_blacklist) > 0 and
                                 not zmw in _blacklist)):
                            rec = f[i_zmw]
                            if anonymize:
                                _anonymize_sequence(rec.peer)
                            bam_out.write(rec.peer)
                            have_zmws.add(zmw)
                            n_file_reads += 1
    if n_file_reads == 0:
        log.error("No reads written")
        return 1
    log.info("{n} records from {z} ZMWs written".format(
        n=n_file_reads, z=len(have_zmws)))
    try:
        rc = subprocess.call(["pbindex", output_bam])
    except OSError as e:
        if e.errno == 2:
            log.warn("pbindex not present, will not create .pbi file")
        else:
            raise
    if output_ds is not None:
        with openDataSet(input_bam) as ds_in:
            ds_out = ds_in.__class__(output_bam)
            if not ignore_metadata:
                ds_out.metadata = ds_in.metadata
            ds_out.updateCounts()
            ds_out.write(output_ds)
            log.info("wrote {t} XML to {x}".format(
                t=ds_in.__class__.__name__, x=output_ds))
    return 0