def test_ccsread_build(self):
    """Build a ConsensusReadSet from test XML 2 twice and check its types.

    Both the dataset class name and the class name of its ``_metadata``
    attribute are verified for each instance.
    """
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which was removed from unittest in Python 3.12.
    ds1 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
    self.assertEqual(type(ds1).__name__, 'ConsensusReadSet')
    self.assertEqual(type(ds1._metadata).__name__, 'SubreadSetMetadata')
    ds2 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
    self.assertEqual(type(ds2).__name__, 'ConsensusReadSet')
    self.assertEqual(type(ds2._metadata).__name__, 'SubreadSetMetadata')
def test_ccsread_build(self):
    """Construct the ConsensusReadSet for test XML 2 twice, checking types.

    Each instance must report ``ConsensusReadSet`` as its class name and
    ``SubreadSetMetadata`` as the class name of its ``_metadata``.
    """
    expected_dataset_type = 'ConsensusReadSet'
    expected_metadata_type = 'SubreadSetMetadata'
    for _ in range(2):
        dataset = ConsensusReadSet(data.getXml(2),
                                   strict=False,
                                   skipMissing=True)
        assert type(dataset).__name__ == expected_dataset_type
        assert type(dataset._metadata).__name__ == expected_metadata_type
def test_ccsset_from_bam(self):
    """Write a ConsensusReadSet built from the CCS BAM, without and with
    validation, to a temporary XML file."""
    # DONE bug 28698
    ds1 = ConsensusReadSet(upstreamData.getCCSBAM(), strict=False)
    # Keep the NamedTemporaryFile object alive while its name is in use.
    # Taking ``.name`` from a discarded object lets the file be deleted
    # immediately, leaving an unreserved path and an uncleaned output file.
    with tempfile.NamedTemporaryFile(
            suffix=".consensusreadset.xml") as tmp_xml:
        fn = tmp_xml.name
        log.debug(fn)
        ds1.write(fn, validate=False)
        ds1.write(fn)
def test_integration(self):
    """Run ``pbcoretools.tasks.make_trimmed_dataset`` end to end.

    A barcoded CCS dataset is written twice (as the "filtered" input and as
    the "lima out" datastore entry), the task is run as a subprocess, and
    the resulting ``trimmed.consensusreadset.xml`` is checked for records,
    name and tags.
    """
    ccs_barcoded = pbtestdata.get_file("ccs-barcoded")
    # Hold the temporary files open for the whole test so their names stay
    # reserved and they are removed afterwards. Taking ``.name`` from a
    # discarded NamedTemporaryFile lets the file be deleted right away.
    with tempfile.NamedTemporaryFile(
            suffix=".datastore.json") as datastore_tmp, \
            tempfile.NamedTemporaryFile(
                suffix=".consensusreadset.xml") as lima_out_tmp, \
            tempfile.NamedTemporaryFile(
                suffix=".consensusreadset.xml") as ccs_in_tmp:
        datastore = datastore_tmp.name
        lima_out = lima_out_tmp.name
        ccs_in = ccs_in_tmp.name
        with ConsensusReadSet(ccs_barcoded) as ccs_tmp:
            ccs_tmp.name = "My Data (filtered)"
            ccs_tmp.tags = "ccs,filtered"
            ccs_tmp.write(ccs_in)
            ccs_tmp.name = "lima out"
            ccs_tmp.write(lima_out)
        ds = DataStore([
            DataStoreFile(uuid.uuid4(), "lima",
                          FileTypes.DS_CCS.file_type_id, lima_out)
        ])
        ds.write_json(datastore)
        args = [
            "python3", "-m", "pbcoretools.tasks.make_trimmed_dataset",
            datastore, ccs_in
        ]
        self._check_call(args)
        # This test reads the task output from the current working directory.
        with ConsensusReadSet("trimmed.consensusreadset.xml",
                              trustCounts=True) as ccs_out:
            assert ccs_out.numRecords > 0
            assert ccs_out.name == "My Data (trimmed)"
            assert ccs_out.tags == "ccs"
def resolved_tool_contract_runner(rtc):
    """Given resolved tool contract, run every pickled partial chunk task.

    Input file 0 is the chunk-task pickle, input file 1 a sentinel file
    (read but not otherwise used here) and input file 2 the CCS dataset.
    If any BAM in the CCS dataset lacks the expected QV fields, the CCS
    file is not passed on to the tasks (``ccs_file=None``).  One "DONE"
    line per task is written to output file 0.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all(isinstance(task, PartialChunkTask) for task in p)
    dummy_sentinel_file = rtc.task.input_files[1]
    ccs_file = rtc.task.input_files[2]
    nproc = rtc.task.nproc
    tmp_dir = rtc.task.tmpdir_resources[0].path \
        if len(rtc.task.tmpdir_resources) > 0 else None

    log.info("Looking for QVs in CCS input...")
    required_qvs = {'SubstitutionQV', 'InsertionQV', 'DeletionQV'}
    with ConsensusReadSet(ccs_file) as ds:
        for bam in ds.resourceReaders():
            qvs = bam.pulseFeaturesAvailable()
            if qvs != required_qvs:
                # log.warning replaces the deprecated log.warn alias.
                log.warning(
                    "Missing QV fields from %s, will use default probabilities",
                    bam.filename)
                ccs_file = None
                break

    with open(rtc.task.output_files[0], 'w') as writer:
        for task in p:
            log.info("Running ice_partial on cluster bin %s, nfl chunk %s/%s",
                     str(task.cluster_bin_index),
                     str(task.nfl_index), str(task.n_nfl_chunks))
            task_runner(task=task, ccs_file=ccs_file, nproc=nproc,
                        tmp_dir=tmp_dir)
            writer.write(
                "ice_partial of cluster bin %s, nfl chunk %s/%s in %s is DONE: %s\n" %
                (task.cluster_bin_index, task.nfl_index, task.n_nfl_chunks,
                 task.cluster_out_dir, task.nfl_pickle))
def resolved_tool_contract_runner(rtc):
    """Run ``IcePartialOne`` from a resolved tool contract.

    Input files 0 and 1 are the input and reference FASTA files, input
    file 2 is the CCS dataset and output file 0 is the output pickle.  The
    CCS dataset is dropped (``ccs_fofn=None``) when any of its BAM files
    lacks the expected QV fields.  Returns whatever ``IcePartialOne.run()``
    returns.
    """
    ccs_set = rtc.task.input_files[2]
    # FIXME we have to ignore the new CCS output for now because it doesn't
    # contain the necessary QV fields; however, since the old behavior appears
    # to be to use this always (independent of --use_finer_qv), it will still
    # accommodate the older CCS files we use for testing
    log.info("Looking for QVs in CCS input...")
    required_qvs = set(['SubstitutionQV', 'InsertionQV', 'DeletionQV'])
    with ConsensusReadSet(ccs_set) as ds:
        for bam in ds.resourceReaders():
            qvs = bam.pulseFeaturesAvailable()
            if qvs != required_qvs:
                # log.warning replaces the deprecated log.warn alias;
                # arguments are passed lazily to the logger.
                log.warning(
                    "Missing QV fields from %s, will use default probabilities",
                    bam.filename)
                ccs_set = None
                break
    tmp_dir = rtc.task.tmpdir_resources[0].path \
        if len(rtc.task.tmpdir_resources) > 0 else None
    # The original Python 2 ``print`` statements are a syntax error on
    # Python 3; report the temporary directory through the logger instead.
    log.info("my tmp_dir is %s", tmp_dir)
    return IcePartialOne(input_fasta=rtc.task.input_files[0],
                         ref_fasta=rtc.task.input_files[1],
                         out_pickle=rtc.task.output_files[0],
                         ccs_fofn=ccs_set,
                         blasr_nproc=rtc.task.nproc,
                         tmp_dir=tmp_dir).run()
def update_consensus_reads(ccs_in, subreads_in, ccs_out, use_run_design_uuid=False):
    """Rewrite a ConsensusReadSet with a name, samples and UUID derived
    from its parent SubreadSet.

    :param ccs_in: path of the input ConsensusReadSet
    :param subreads_in: path of the parent SubreadSet
    :param ccs_out: path the updated ConsensusReadSet is written to
    :param use_run_design_uuid: if true, reuse the ConsensusReadSetRef UUID
        found in the collection metadata, provided exactly one distinct
        UUID is present; otherwise a new UUID is generated
    :return: 0
    """
    # Both datasets are opened as context managers so that the SubreadSet is
    # closed as well; previously it was opened and never released.
    with SubreadSet(subreads_in, skipCounts=True) as ds_subreads, \
            ConsensusReadSet(ccs_in) as ds:
        ds.name = ds_subreads.name + " (CCS)"
        run_design_uuid = None
        if use_run_design_uuid:
            uuids = {
                collection.consensusReadSetRef.uuid
                for collection in ds.metadata.collections
                if collection.consensusReadSetRef is not None
            }
            if len(uuids) == 1:
                run_design_uuid = next(iter(uuids))
            elif not uuids:
                log.warning("No pre-defined ConsensusReadSetRef UUID found")
            else:
                log.warning("Multiple ConsensusReadSetRef UUIDs found")
        for collection in ds.metadata.collections:
            if len(collection.wellSample.bioSamples) == 0:
                # NOTE(review): the emptiness check reads
                # collection.wellSample.bioSamples but samples are appended
                # to collection.bioSamples - confirm this is intended.
                for collection2 in ds_subreads.metadata.collections:
                    for bio_sample in collection2.wellSample.bioSamples:
                        collection.bioSamples.append(bio_sample)
        ds.updateCounts()
        if run_design_uuid is not None:
            ds.uuid = run_design_uuid
        else:
            ds.newUuid()
        sanitize_dataset_tags(ds, remove_hidden=True)
        ds.write(ccs_out)
    return 0
def test_ccs_barcodes_table_asymmetric(self):
    """Check the barcode-name column of the second report table for the
    asymmetric-barcodes CCS dataset."""
    dataset_path = op.join(ROOT_DATA_DIR, "ccs", "asym_barcodes",
                           "ccs.consensusreadset.xml")
    report = to_report(ConsensusReadSet(dataset_path), tempfile.mkdtemp())
    expected_names = ['F5--R5', 'F8--R8', 'F20--R20', 'F29--R29', 'F30--R30']
    name_column = report.tables[1].columns[0]
    self.assertEqual(name_column.values, expected_names)
def test_ccs_mulitple_movies_single_bam(self):
    """
    Smoke test: building the report must not raise when one BAM file holds
    reads from several movies.
    """
    dataset = ConsensusReadSet(self.CCS_BAM)
    output_dir = tempfile.mkdtemp()
    to_report(dataset, output_dir)
def test_ccs_bam_np_is_at_least_npasses(self):
    """
    Verify that every read written by the ``pbccs.tasks.ccs`` task carries
    an ``np`` tag no smaller than the ``min_passes`` task option.  BAM files
    with more than NRECORDS_MAX_ITER records are not iterated; the test is
    skipped if nothing could be checked.
    """
    n_checked = 0
    n_skipped = 0
    for contract in self.resolved_tool_contracts:
        if contract.task.task_id != "pbccs.tasks.ccs":
            continue
        required = contract.task.options["pbccs.task_options.min_passes"]
        with ConsensusReadSet(contract.task.output_files[0]) as ccs:
            for reader in ccs.resourceReaders():
                if len(reader) > NRECORDS_MAX_ITER:
                    n_skipped += 1
                    continue
                for record in reader:
                    n_passes = record.peer.opt("np")
                    message = "{r} has np {n} < {e}".format(
                        r=record.qName, n=n_passes, e=required)
                    self.assertTrue(n_passes >= required, message)
                n_checked += 1
    if n_checked != 0:
        return
    if n_skipped == 0:
        raise unittest.SkipTest("No CCS BAM files found")
    raise unittest.SkipTest("File size over limit - 'np' not checked")
def run_report(input_file, report_json, output_dir):
    """Build the report for a ConsensusReadSet and write it as JSON.

    The report dictionary is also logged at INFO level.  Returns 0.
    """
    script_name = os.path.basename(__file__)
    log.info("Running {f} v{v}.".format(f=script_name, v=__version__))
    dataset = ConsensusReadSet(input_file)
    report = to_report(dataset, output_dir)
    report_as_dict = report.to_dict()
    log.info(pformat(report_as_dict))
    report.write_json(report_json)
    return 0
def add_bash5(self, filename):
    """Add a bas.h5/ccs.h5/ccs.bam to cacher.

    The movie name is derived from the file's base name by stripping the
    recognised suffix, and ``self.bas_files`` is updated for that movie.
    Raises NotImplementedError for un-numbered ``subreads.bam`` files and
    IOError for any unrecognised file name.
    """
    basename = os.path.basename(filename)

    def _register_chunked(suffix_len, **wrapper_kwargs):
        # Numbered chunk files: strip the suffix and register the movie
        # only if it is not already known.
        movie_name = basename[:-suffix_len]
        if movie_name not in self.bas_files:
            self.bas_files[movie_name] = smrt_wrapper(
                filename[:-suffix_len], **wrapper_kwargs)

    def _register_single(suffix_len):
        # Single-file movies: every lookup resolves to this file.
        self.bas_files[basename[:-suffix_len]] = defaultdict(
            lambda: filename)

    def _has_numbered_suffix(stem):
        return any(filename.endswith('.%d.%s' % (index, stem))
                   for index in (1, 2, 3))

    if filename.endswith('.bax.h5'):
        _register_chunked(9, suffix='.bax.h5')
    elif _has_numbered_suffix('ccs.h5'):
        _register_chunked(9)
    elif filename.endswith('.ccs.h5'):
        # a single .ccs.h5 (post 150k runs), treat the same as .bas.h5
        _register_single(7)
    elif _has_numbered_suffix('subreads.bam'):
        _register_chunked(15)
    elif filename.endswith('subreads.bam'):
        raise NotImplementedError(
            "%s add_bash5 *.subreads.bam not implemented." %
            (self.__class__.__name__))
    elif _has_numbered_suffix('ccs.bam'):
        _register_chunked(10)
    elif filename.endswith('.bas.h5'):
        _register_single(7)
    elif filename.endswith(".consensusreadset.xml"):
        dataset = ConsensusReadSet(filename)
        for reader in dataset.resourceReaders():
            for read_group in reader.readGroupTable:
                self.bas_files[read_group.MovieName] = dataset_wrapper(
                    filename)
    else:
        raise IOError("Unsupported file format: %s" % filename)
def test_ccs_barcodes_table(self):
    """Check the contents of the second report table for the barcoded CCS
    test dataset: the first four columns exactly, the fifth to 4 places."""
    dataset_path = pbtestdata.get_file("ccs-barcoded")
    dataset = ConsensusReadSet(dataset_path)
    report = to_report(dataset, tempfile.mkdtemp())
    expected_leading_columns = [
        ["lbc1--lbc1", "lbc3--lbc3"],
        [1, 1],
        [1958, 1954],
        [1958, 1954],
    ]
    leading_columns = [
        column.values for column in report.tables[1].columns[0:4]
    ]
    self.assertEqual(leading_columns, expected_leading_columns)
    self.assertAlmostEqual(report.tables[1].columns[4].values[0],
                           0.9724, places=4)
    self.assertAlmostEqual(report.tables[1].columns[4].values[1],
                           0.9926, places=4)
def test_get_bio_sample_name(self):
    """Check ``get_bio_sample_name`` on single, merged and CCS datasets.

    The original comparisons were bare expressions whose results were
    discarded, so nothing was verified; each is now asserted.  Expected
    values are taken verbatim from the original comparisons.
    """
    filename = pbtestdata.get_file("subreads-sequel")
    ds1 = SubreadSet(filename)
    assert get_bio_sample_name(ds1) == "Narwhale"
    filename = pbtestdata.get_file("subreads-biosample-2")
    ds2 = SubreadSet(filename)
    assert get_bio_sample_name(ds2) == "UnnamedSample"
    ds3 = ds1 + ds2
    assert get_bio_sample_name(ds3) == "Multiple"
    filename = pbtestdata.get_file("rsii-ccs-multi-cell")
    ds4 = ConsensusReadSet(filename)
    assert get_bio_sample_name(ds4) == "Multiple"
    filename = pbtestdata.get_file("ccs-sequel")
    ds4 = ConsensusReadSet(filename)
    assert get_bio_sample_name(ds4) == "NarwhalCcs"
def add_bash5(self, filename):
    """Add a bas.h5/ccs.h5/ccs.bam to cacher.

    Dispatches on the file-name suffix, derives the movie name from the
    base name, and stores the matching wrapper in ``self.bas_files``.
    Un-numbered ``subreads.bam`` raises NotImplementedError; an unknown
    suffix raises IOError.
    """
    basename = os.path.basename(filename)
    numbered_ccs_h5 = ('.1.ccs.h5', '.2.ccs.h5', '.3.ccs.h5')
    numbered_subreads_bam = ('.1.subreads.bam', '.2.subreads.bam',
                             '.3.subreads.bam')
    numbered_ccs_bam = ('.1.ccs.bam', '.2.ccs.bam', '.3.ccs.bam')

    if filename.endswith('.bax.h5'):
        movie = basename[:-9]
        if movie not in self.bas_files:
            self.bas_files[movie] = smrt_wrapper(filename[:-9],
                                                 suffix='.bax.h5')
    elif filename.endswith(numbered_ccs_h5):
        movie = basename[:-9]
        if movie not in self.bas_files:
            self.bas_files[movie] = smrt_wrapper(filename[:-9])
    elif filename.endswith('.ccs.h5'):
        # a single .ccs.h5 (post 150k runs), treat the same as .bas.h5
        self.bas_files[basename[:-7]] = defaultdict(lambda: filename)
    elif filename.endswith(numbered_subreads_bam):
        movie = basename[:-15]
        if movie not in self.bas_files:
            self.bas_files[movie] = smrt_wrapper(filename[:-15])
    elif filename.endswith('subreads.bam'):
        class_name = self.__class__.__name__
        raise NotImplementedError(
            "%s add_bash5 *.subreads.bam not implemented." % (class_name))
    elif filename.endswith(numbered_ccs_bam):
        movie = basename[:-10]
        if movie not in self.bas_files:
            self.bas_files[movie] = smrt_wrapper(filename[:-10])
    elif filename.endswith('.bas.h5'):
        self.bas_files[basename[:-7]] = defaultdict(lambda: filename)
    elif filename.endswith(".consensusreadset.xml"):
        ccs_dataset = ConsensusReadSet(filename)
        for resource_reader in ccs_dataset.resourceReaders():
            for group in resource_reader.readGroupTable:
                self.bas_files[group.MovieName] = dataset_wrapper(filename)
    else:
        raise IOError("Unsupported file format: %s" % filename)
def test_read_ccs_multiple_movies_one_bam(self):
    """
    Looking up each read of the multi-movie dataset by qName in a
    BamCollection must return a record whose readName equals that qName,
    even though the .bam file contains multiple read groups.
    """
    xml_path = op.join(self.ioDataDir,
                       "ccs_multi_movie.consensusreadset.xml")
    collection = BamCollection(xml_path)
    with ConsensusReadSet(xml_path) as dataset:
        for record in dataset:
            looked_up = collection[record.qName]
            self.assertEqual(looked_up.readName, record.qName)
def test_qname_css(self):
    """Filtering a CCS BAM by the qNames of its first four reads must leave
    exactly four reads, both when iterating and via ``len``."""
    bam_path = ('/pbi/dept/secondary/siv/testdata/ccs-unittest/'
                'tiny/little.ccs.bam')
    reads = ConsensusReadSet(bam_path)
    self.assertEqual(len(reads), 14)
    n_selected = 4
    selected_names = [record.qName for record in reads[:n_selected]]
    reads.filters.addRequirement(qname=[('=', selected_names)])
    n_iterated = sum(1 for _ in reads)
    self.assertEqual(n_selected, n_iterated)
    self.assertEqual(n_selected, len(reads))
def run_args(args):
    """Combine the datastore's files into ``trimmed.consensusreadset.xml``.

    The output dataset takes the input dataset's name with " (filtered)"
    removed and " (trimmed)" appended, has its tags sanitized and its
    sub-datasets cleared.  Returns 0.
    """
    datastore_path = os.path.realpath(args.datastore)
    dstore = DataStore.load_from_json(datastore_path)
    ds_in = ConsensusReadSet(args.ccs_in, trustCounts=True)
    member_paths = [ds_file.path for ds_file in dstore.files.values()]
    ds_out = ConsensusReadSet(*member_paths, trustCounts=True)
    sanitize_dataset_tags(ds_out, remove_hidden=True)
    base_name = ds_in.name.replace(" (filtered)", "")
    ds_out.name = base_name + " (trimmed)"
    ds_out.subdatasets = []
    ds_out.write("trimmed.consensusreadset.xml")
    return 0
def run_dev_ccs_report(rtc):
    """Write a small JSON report with the record count and total length of
    the ConsensusReadSet given as the first input file.  Returns 0."""
    from pbcore.io import ConsensusReadSet
    input_path = rtc.task.input_files[0]
    with ConsensusReadSet(input_path) as dataset:
        dataset.updateCounts()
        n_records = Attribute("number_of_records", value=dataset.numRecords)
        total_length = Attribute("total_length", value=dataset.totalLength)
        report = Report("ccs_report",
                        title="ConsensusReadSet XML Report",
                        attributes=[n_records, total_length])
        report.write_json(rtc.task.output_files[0])
    return 0
def read_isoseq3_refine_flnc(flnc_ccs):
    """
    Build a dictionary from CCS read name (``<movie>/<zmw>/ccs``) to sample
    name, using the read group table and index of the full-length
    non-chimeric (FLNC) CCS reads produced by 'isoseq3 refine'.
    """
    dataset = ConsensusReadSet(flnc_ccs, strict=True)
    samples_by_read = {}
    for read_group in dataset.readGroupTable:
        in_group = dataset.index.qId == read_group.ID
        for hole_number in dataset.index.holeNumber[in_group]:
            read_name = "{m}/{z}/ccs".format(m=read_group.MovieName,
                                             z=hole_number)
            # each read name is expected to occur only once
            assert read_name not in samples_by_read, read_name
            samples_by_read[read_name] = read_group.SampleName
    return samples_by_read
def _run_args(args):
    """Consolidate a ConsensusReadSet into ``reads.bam`` and write the
    final dataset XML plus a one-line FOFN.

    Optional ZMW JSON and CCS processing report files are attached to the
    first external resource.  The dataset's original UUID is restored
    before writing.  Returns 0.
    """
    dataset = ConsensusReadSet(args.ccsxml, strict=True)
    original_uuid = dataset.uuid
    dataset.consolidate("reads.bam", useTmp=False)
    consolidated = dataset.externalResources[0]
    if args.zmws_json:
        consolidated._setSubResByMetaType("PacBio.FileTypes.json",
                                          args.zmws_json)
    if args.report_ccs_processing:
        consolidated._setSubResByMetaType("PacBio.FileTypes.JsonReport",
                                          args.report_ccs_processing)
    dataset.uuid = original_uuid
    dataset.write("final.consensusreadset.xml")
    with open("reads.fofn", "wt") as fofn_out:
        bam_path = os.path.abspath("reads.bam")
        fofn_out.write(bam_path)
    return 0
def test_validity_ccs_accuracy(self):
    """
    Assert that the mean read quality over all resources of the final CCS
    dataset exceeds a threshold: ``min_ccs_mean_accuracy`` from the expected
    values if present, otherwise MIN_CCS_MEAN_ACCURACY.
    """
    with ConsensusReadSet(self.final_ccs_file) as dataset:
        quality_total = 0
        read_count = 0
        for reader in dataset.resourceReaders():
            quality_total += reader.readQual.sum()
            read_count += len(reader)
        # XXX see BamAlignment.readScore docstring for explanation
        mean_quality = quality_total / read_count
        threshold = MIN_CCS_MEAN_ACCURACY
        if "min_ccs_mean_accuracy" in self.expected_values:
            threshold = self.expected_values["min_ccs_mean_accuracy"]
        self.assertGreater(mean_quality, threshold)
def run_args(args):
    """Export a ConsensusReadSet according to ``args.mode`` and write the
    resulting datastore JSON.

    "fasta" and "fastq" export FASTX files; "consolidate" writes a
    consolidated BAM only when the dataset has no BAM file name.  Files are
    placed next to ``args.datastore_out``.  Returns 0.
    """
    datastore_out = op.abspath(args.datastore_out)
    base_dir = op.dirname(datastore_out)
    exported = []
    with ConsensusReadSet(args.dataset_file, strict=True) as dataset:
        bam_file_name, file_prefix = get_prefix_and_bam_file_name(
            dataset, is_barcoded=False)
        if args.mode in ("fasta", "fastq"):
            if args.mode == "fasta":
                file_type = FileTypes.FASTA
                file_ids = Constants.FASTA_FILE_IDS
            else:
                file_type = FileTypes.FASTQ
                file_ids = Constants.FASTQ_FILE_IDS
            fastx_files = to_fastx_files(
                file_type, dataset, args.dataset_file, file_ids,
                base_dir, file_prefix, args.min_rq, no_zip=args.no_zip)
            exported.extend(fastx_files)
        elif args.mode == "consolidate" and bam_file_name is None:
            exported.append(
                consolidate_bam(base_dir, file_prefix, dataset,
                                min_rq=args.min_rq))
    DataStore(exported).write_json(datastore_out)
    return 0
def setUpClass(cls):
    """Prepare class fixtures for the CCS accuracy tests.

    Copies the reference FASTA named in ``test_values["ccs"]`` into a new
    temporary run directory and indexes it, then opens the first
    non-chunked CCS dataset found in the datastore.

    Raises unittest.SkipTest when no ``ccscheck_out`` CSV is configured.
    """
    super(TestAccuracy, cls).setUpClass()
    ref_fasta = cls.test_values["ccs"].get("reference", None)
    cls.ref_csv = cls.test_values["ccs"].get("ccscheck_out", None)
    if cls.ref_csv is None:
        # The exception class is unittest.SkipTest; the original
        # ``unittest.skipTest`` does not exist and raised AttributeError.
        raise unittest.SkipTest("No CSV file defined")
    cls.run_dir = tempfile.mkdtemp()
    tmp_ref_fasta = op.join(cls.run_dir, op.basename(ref_fasta))
    shutil.copyfile(ref_fasta, tmp_ref_fasta)
    cls.ref_fasta = tmp_ref_fasta
    pysam.faidx(tmp_ref_fasta)
    cls.final_ccs_file = None
    # .values() works on Python 2 and 3 (.iteritems() is Python 2 only);
    # the file ids themselves are not needed.
    for file_info in cls.datastore.get_file_dict().values():
        if file_info.is_chunked:
            continue
        if file_info.file_type_id == FileTypes.DS_CCS.file_type_id:
            cls.final_ccs_file = file_info.path
            break
    cls.ccs_ds = ConsensusReadSet(cls.final_ccs_file)
def test_ccs_barcoding_propagation(self):
    """
    For a barcoded input, check that the output ConsensusReadSet is
    barcoded and that each of its external resources references the
    expected BarcodeSet.  When expected barcodes are configured, also check
    that forward and reverse barcodes agree and match the expected list.
    Skipped when the SubreadSet was not barcoded.
    """
    if not self.is_barcoded:
        raise unittest.SkipTest("SubreadSet was not barcoded, skipping")
    with ConsensusReadSet(self.final_ccs_file) as ccs:
        self.assertTrue(ccs.isBarcoded)
        for resource in ccs.externalResources:
            self.assertEqual(self.barcode_set, resource.barcodes)
        if "barcodes" in self.expected_values:
            observed = set()
            for reader in ccs.resourceReaders():
                forward = reader.pbi.bcForward
                same_barcode = forward == reader.pbi.bcReverse
                self.assertTrue(same_barcode.all())
                observed.update(set(list(reader.pbi.bcForward)))
            self.assertEqual(sorted(list(observed)),
                             self.expected_values["barcodes"])
def run_ccs_bam_fastq_exports(ccs_dataset_file, base_dir, is_barcoded=False,
                              min_rq=Constants.HIFI_RQ, no_zip=False):
    """
    Export a ConsensusReadSet as BAM (only when it has no BAM file name),
    FASTA and FASTQ files under ``base_dir`` and return the resulting
    datastore files.  A demultiplexed dataset is assumed to hold a single
    BAM file that is already imported in SMRT Link.

    The exports run serially, so this function is no longer used by this
    specific task; the barcoded version that runs in parallel uses it.
    """
    exported = []
    with ConsensusReadSet(ccs_dataset_file, strict=True) as dataset:
        bam_file_name, file_prefix = get_prefix_and_bam_file_name(
            dataset, is_barcoded)
        if bam_file_name is None:
            consolidated = consolidate_bam(base_dir, file_prefix, dataset,
                                           min_rq)
            exported.append(consolidated)
        fasta_file_ids = [Constants.FASTA_ID, Constants.FASTA2_ID]
        fastq_file_ids = [Constants.FASTQ_ID, Constants.FASTQ2_ID]
        for file_type, file_ids in ((FileTypes.FASTA, fasta_file_ids),
                                    (FileTypes.FASTQ, fastq_file_ids)):
            exported.extend(
                to_fastx_files(file_type, dataset, ccs_dataset_file,
                               file_ids, base_dir, file_prefix,
                               min_rq=min_rq, no_zip=no_zip))
    return exported
def test_consensus_read_set_ref(self):
    """The first collection of the "ccs-sequel" test dataset must carry the
    expected ConsensusReadSetRef UUID."""
    import pbtestdata
    dataset_path = pbtestdata.get_file("ccs-sequel")
    dataset = ConsensusReadSet(dataset_path, strict=True)
    first_collection = dataset.metadata.collections[0]
    ref_uuid = first_collection.consensusReadSetRef.uuid
    assert ref_uuid == "5416f525-d3c7-496b-ba8c-18d7ec1b4499"
def test_ccs_bam_index(self):
    """
    The final CCS dataset must pass its own ``assertIndexed`` check,
    i.e. the output is expected to include .pbi index file(s).
    """
    ccs_path = self.final_ccs_file
    with ConsensusReadSet(ccs_path) as ccs_dataset:
        ccs_dataset.assertIndexed()
def setUpClass(cls):
    """Write a ConsensusReadSet built from pbcore's CCS BAM test file to
    the first declared input file."""
    ConsensusReadSet(pbcore.data.getCCSBAM(),
                     strict=True).write(cls.INPUT_FILES[0])
def setUpClass(cls):
    """Run the parent class setup, then write a ConsensusReadSet built from
    pbcore's CCS BAM test file to the first declared input file."""
    super(TestToolContract, cls).setUpClass()
    ccs_bam = pbcore.data.getCCSBAM()
    dataset = ConsensusReadSet(ccs_bam, strict=True)
    input_xml = cls.INPUT_FILES[0]
    dataset.write(input_xml)
def run_after(self, rtc, output_dir):
    """Check that every ZMW hole number in the first resource of the output
    dataset lies strictly inside ``self.zmw_range``."""
    output_xml = rtc.task.output_files[0]
    with ConsensusReadSet(output_xml) as ds_out:
        first_reader = ds_out.resourceReaders()[0]
        hole_numbers = set(first_reader.holeNumber)
        logging.info("ZMWs = {z}".format(z=hole_numbers))
        for hole_number in hole_numbers:
            in_range = self.zmw_range[0] < hole_number < self.zmw_range[1]
            self.assertTrue(in_range)
def is_ccs_demultiplexed(input_file):
    """Return True if any external resource's .pbi header reports barcode
    information, False otherwise."""
    log.info("Checking {} is lima-demultiplexed or not.".format(input_file))
    # keep memory to an absolute minimum
    dataset = ConsensusReadSet(input_file, skipCounts=True)
    headers = []
    for resource in dataset.externalResources:
        headers.append(PbiHeaderOnly(resource.pbi))
    barcode_flags = [header.hasBarcodeInfo for header in headers]
    return any(barcode_flags)
def setUpData(cls):
    """Record the CCS BAM test file and write a ConsensusReadSet XML for it.

    Sets ``cls.bam_file_name`` to pbcore's CCS BAM test file and
    ``cls.xml_file_name`` to a freshly created temporary XML path, then
    writes the dataset there.
    """
    import os
    cls.bam_file_name = pbcore.data.getCCSBAM()
    # mkstemp creates the file and leaves it in place, so the stored path
    # stays reserved.  Taking ``.name`` from a discarded NamedTemporaryFile
    # lets the file be deleted at once, freeing the name while in use.
    handle, cls.xml_file_name = tempfile.mkstemp(
        suffix=".consensusreadset.xml")
    os.close(handle)
    ds = ConsensusReadSet(cls.bam_file_name)
    ds.write(cls.xml_file_name)