def test_split_hdfsubreadset(self):
    """Split an HdfSubreadSet with three external files into two chunks.

    The chunks are expected to hold two and one external files, in that
    order.
    """
    bax_paths = upstreamData.getBaxH5_v23()
    dataset = HdfSubreadSet(*bax_paths)
    self.assertEqual(len(dataset.toExternalFiles()), 3)

    chunks = dataset.split(chunks=2, ignoreSubDatasets=True)
    self.assertEqual(len(chunks), 2)

    first_chunk, second_chunk = chunks[0], chunks[1]
    self.assertEqual(len(first_chunk.toExternalFiles()), 2)
    self.assertEqual(len(second_chunk.toExternalFiles()), 1)
def _run_bax_to_bam(input_file_name, output_file_name):
    """
    Run ``bax2bam`` on a single input and check the output SubreadSet.

    :param input_file_name: path to the input; a path ending in ``.bax.h5``
        is first wrapped in a temporary HdfSubreadSet XML, any other path
        is passed to ``bax2bam --xml`` unchanged.
    :param output_file_name: path handed to ``--output-xml``; the ``-o``
        prefix is this path with its last two dot-separated parts removed.
    :return: the non-zero exit code of the command if it failed, else 0.
    """
    base_name = ".".join(output_file_name.split(".")[:-2])
    input_file_name_tmp = input_file_name
    # XXX bax2bam won't write an hdfsubreadset unless the input is XML too
    if input_file_name.endswith(".bax.h5"):
        # delete=False + close() keeps the path reserved on disk.  The
        # previous NamedTemporaryFile(...).name idiom depended on the
        # temporary object being finalised (and its file deleted) before
        # the name was reused, which is racy and could remove the XML
        # written below if finalisation is delayed.
        tmp_xml = tempfile.NamedTemporaryFile(
            suffix=".hdfsubreadset.xml", delete=False)
        tmp_xml.close()
        input_file_name_tmp = tmp_xml.name
        ds_tmp = HdfSubreadSet(input_file_name)
        ds_tmp.write(input_file_name_tmp)
    args = [
        "bax2bam",
        "--subread",
        "-o", base_name,
        "--output-xml", output_file_name,
        "--xml", input_file_name_tmp
    ]
    # Build the command line once so the logged and executed text agree.
    cmd = " ".join(args)
    log.info(cmd)
    result = run_cmd(cmd, stdout_fh=sys.stdout, stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    with SubreadSet(output_file_name) as ds:
        ds.assertIndexed()
    return 0
def test_len_h5(self):
    """Check length/count bookkeeping for HDF5-backed datasets."""

    def verify_counts(dataset, num_records, total_length):
        # Initial values, then overwrite both counters with -1, then
        # updateCounts() has to bring the expected values back.
        self.assertEqual(len(dataset), num_records)
        self.assertEqual(dataset._length, (num_records, total_length))
        self.assertEqual(dataset.totalLength, total_length)
        self.assertEqual(dataset.numRecords, num_records)
        dataset.totalLength = -1
        dataset.numRecords = -1
        self.assertEqual(dataset.totalLength, -1)
        self.assertEqual(dataset.numRecords, -1)
        dataset.updateCounts()
        self.assertEqual(dataset.totalLength, total_length)
        self.assertEqual(dataset.numRecords, num_records)

    # HdfSubreadSet
    # len means something else in bax/bas land. These numbers may actually
    # be correct...
    hdf_subreads = HdfSubreadSet(data.getXml(17), strict=True)
    verify_counts(hdf_subreads, 9, 128093)

    # AlignmentSet with cmp.h5
    cmp_h5_path = upstreamData.getBamAndCmpH5()[1]
    alignments = AlignmentSet(cmp_h5_path, strict=True)
    verify_counts(alignments, 112, 59970)
def test_split_hdfsubreadset(self):
    """Split a three-file HdfSubreadSet in two and check each part's size."""
    source = HdfSubreadSet(*upstreamdata.getBaxH5_v23())
    self.assertEqual(len(source.toExternalFiles()), 3)

    parts = source.split(chunks=2, ignoreSubDatasets=True)

    # Number of external files expected in each part, in order.
    expected_file_counts = (2, 1)
    self.assertEqual(len(parts), len(expected_file_counts))
    for part_index, expected in enumerate(expected_file_counts):
        self.assertEqual(
            len(parts[part_index].toExternalFiles()), expected)
def _run_bax_to_bam(input_file_name, output_file_name):
    """
    Run ``bax2bam`` on a single input and make sure the output is indexed.

    :param input_file_name: path to the input; a path ending in ``.bax.h5``
        is first wrapped in a temporary HdfSubreadSet XML, any other path
        is passed to ``bax2bam --xml`` unchanged.
    :param output_file_name: path handed to ``--output-xml``; the ``-o``
        prefix is this path with its last two dot-separated parts removed.
        After a successful run the XML is moved aside, indices are induced
        if the dataset reports it is not indexed, and the dataset is
        written back to this path.
    :return: the non-zero exit code of the command if it failed, else 0.
    """
    base_name = ".".join(output_file_name.split(".")[:-2])
    input_file_name_tmp = input_file_name
    # XXX bax2bam won't write an hdfsubreadset unless the input is XML too
    if input_file_name.endswith(".bax.h5"):
        # delete=False + close() keeps the path reserved on disk.  The
        # previous NamedTemporaryFile(...).name idiom depended on the
        # temporary object being finalised (and its file deleted) before
        # the name was reused, which is racy and could remove the XML
        # written below if finalisation is delayed.
        tmp_in = tempfile.NamedTemporaryFile(
            suffix=".hdfsubreadset.xml", delete=False)
        tmp_in.close()
        input_file_name_tmp = tmp_in.name
        ds_tmp = HdfSubreadSet(input_file_name)
        ds_tmp.write(input_file_name_tmp)
    args = [
        "bax2bam",
        "--subread",
        "-o", base_name,
        "--output-xml", output_file_name,
        "--xml", input_file_name_tmp
    ]
    # Build the command line once so the logged and executed text agree.
    cmd = " ".join(args)
    logging.info(cmd)
    result = run_cmd(cmd, stdout_fh=sys.stdout, stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    # Same reservation approach for the intermediate copy of the output
    # XML; shutil.move replaces the empty placeholder file.
    tmp_out = tempfile.NamedTemporaryFile(
        suffix=".subreadset.xml", delete=False)
    tmp_out.close()
    tmp = tmp_out.name
    shutil.move(output_file_name, tmp)
    # FIXME it would be better to leave this to bax2bam
    with SubreadSet(tmp) as ds:
        if not ds.isIndexed:
            ds.induceIndices()
        ds.write(output_file_name)
    return 0
def to_chunked_hdfsubreadset_files(hdfsubreadset_path, max_total_nchunks, chunk_key, dir_name, base_name, ext):
    """
    Split an HdfSubreadSet and yield one PipelineChunk per written piece.

    Each piece is written to ``<dir_name>/<base_name>_<i>.<ext>`` and the
    yielded chunk maps ``chunk_key`` to the absolute path of that file.

    :param hdfsubreadset_path: dataset to open (opened with ``strict=True``).
    :param max_total_nchunks: value passed as ``chunks`` to ``split``.
    :param chunk_key: keyword name under which the path is stored.
    :param dir_name: directory the piece files are written to.
    :param base_name: prefix of each chunk id / file name.
    :param ext: file extension, without the leading dot.
    """
    source = HdfSubreadSet(hdfsubreadset_path, strict=True)
    pieces = source.split(chunks=max_total_nchunks, ignoreSubDatasets=True)

    for index, piece in enumerate(pieces):
        chunk_id = base_name + '_' + str(index)
        piece_path = os.path.join(dir_name, chunk_id + '.' + ext)
        piece.write(piece_path)
        chunk_fields = {chunk_key: os.path.abspath(piece_path)}
        yield PipelineChunk(chunk_id, **chunk_fields)
def run_bax_to_bam(input_file_name, output_file_name):
    """
    Convert an HdfSubreadSet with ``_run_bax_to_bam``, per file if needed.

    When the resource readers report more than one movie name, every
    external file is converted separately into the output directory and
    the results are combined into one SubreadSet that takes its name,
    'Description' (if present) and metadata from the input dataset.
    Otherwise the input is converted in a single call.

    :return: 0 on success, or the first non-zero code returned by
        ``_run_bax_to_bam``.
    """
    with HdfSubreadSet(input_file_name) as ds_in:
        movie_names = {reader.movieName for reader in ds_in.resourceReaders()}

        # Zero or one movie: a single conversion handles the whole input.
        if len(movie_names) <= 1:
            return _run_bax_to_bam(input_file_name, output_file_name)

        out_dir = os.path.dirname(output_file_name)
        converted = []
        for bax_file in ds_in.toExternalFiles():
            # Drop the last two dot-separated parts of the file name.
            stem = ".".join(os.path.basename(bax_file).split(".")[:-2])
            per_file_xml = os.path.join(out_dir, stem + ".hdfsubreadset.xml")
            rc = _run_bax_to_bam(bax_file, per_file_xml)
            if rc != 0:
                log.error("bax2bam failed")
                return rc
            converted.append(per_file_xml)

        merged = SubreadSet(*converted)
        merged.name = ds_in.name
        if 'Description' in ds_in.objMetadata:
            merged.objMetadata['Description'] = ds_in.objMetadata['Description']
        merged.metadata.merge(ds_in.metadata)
        merged.write(output_file_name)
    return 0
def _get_bax2bam_inputs():
    """Little hackery to get the setup class Inputs and to avoid calls to
    setupclass if skiptest is used

    Nat: we want to test that this behaves properly when multiple movies are
    supplied as input, so we make an HdfSubreadSet on the fly from various
    bax files in testdata

    Returns a one-element list holding the path of the generated
    HdfSubreadSet XML, or a placeholder path when HAVE_DATA_AND_BAX2BAM is
    false.
    """
    if HAVE_DATA_AND_BAX2BAM:
        # delete=False + close() keeps the path reserved on disk instead of
        # relying on the temporary object being finalised (and its file
        # deleted) before the name is reused for the dataset XML.
        tmp_xml = tempfile.NamedTemporaryFile(
            suffix=".hdfsubreadset.xml", delete=False)
        tmp_xml.close()
        hdf_subread_xml = tmp_xml.name
        bax_files = (
            SIV_DATA_DIR + "/SA3-RS/lambda/2372215/0007_tiny/Analysis_Results/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.bax.h5",
            pbtestdata.get_file("rsii-bax-h5"))
        ds = HdfSubreadSet(*bax_files)
        ds.name = "lambda_rsii"
        # The two inputs have to come from two different movies.
        assert len({f.movieName for f in ds.resourceReaders()}) == 2
        ds.write(hdf_subread_xml)
        return [hdf_subread_xml]
    else:
        # Assume the test data isn't found and the test won't be run
        return ["/path/to/this-test-should-be-skipped.txt"]
def test_file_factory(self):
    """openDataFile on a dataset's first external file yields the same type."""
    # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
    # ConsensusAlignmentSet(data.getXml(20)) is currently left out.
    datasets = [
        AlignmentSet(data.getXml(8)),
        ReferenceSet(data.getXml(9)),
        SubreadSet(data.getXml(10)),
        HdfSubreadSet(data.getXml(19)),
    ]
    for original in datasets:
        first_resource = original.toExternalFiles()[0]
        reopened = openDataFile(first_resource)
        self.assertEqual(type(reopened), type(original))
def _get_bax2bam_inputs():
    """Little hackery to get the setup class Inputs and to avoid calls to
    setupclass if skiptest is used

    Nat: we want to test that this behaves properly when multiple movies are
    supplied as input, so we make an HdfSubreadSet on the fly from various
    bax files in testdata

    Returns a one-element list holding the path of the generated
    HdfSubreadSet XML, or a placeholder path when HAVE_DATA_AND_BAX2BAM is
    false.
    """
    if HAVE_DATA_AND_BAX2BAM:
        # delete=False + close() keeps the path reserved on disk instead of
        # relying on the temporary object being finalised (and its file
        # deleted) before the name is reused for the dataset XML.
        tmp_xml = tempfile.NamedTemporaryFile(
            suffix=".hdfsubreadset.xml", delete=False)
        tmp_xml.close()
        hdf_subread_xml = tmp_xml.name
        bax_files = (
            SIV_DATA_DIR + "/SA3-RS/lambda/2372215/0007_tiny/Analysis_Results/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.bax.h5",
            pbcore.data.getBaxH5_v23()[0])
        ds = HdfSubreadSet(*bax_files)
        ds.name = "lambda_rsii"
        # The two inputs have to come from two different movies.
        assert len({f.movieName for f in ds.resourceReaders()}) == 2
        ds.write(hdf_subread_xml)
        return [hdf_subread_xml]
    else:
        # Assume the test data isn't found and the test won't be run
        return ["/path/to/this-test-should-be-skipped.txt"]
def test_incorrect_len_getitem(self):
    """Iteration must ignore a bogus numRecords value stored in the XML.

    Each dataset is reopened from its first external file, written out
    with numRecords set to 1000000000, and read back; the number of
    records obtained by iterating must equal the length seen before the
    value was overwritten.
    """
    types = [
        AlignmentSet(data.getXml(8)),
        ReferenceSet(data.getXml(9)),
        SubreadSet(data.getXml(10)),
        HdfSubreadSet(data.getXml(19)),
    ]
    # delete=False + close() keeps the scratch path reserved on disk
    # instead of relying on the temporary object being finalised (and its
    # file deleted) before the name is reused by write() below.
    tmp_xml = tempfile.NamedTemporaryFile(suffix=".xml", delete=False)
    tmp_xml.close()
    fn = tmp_xml.name
    for ds in types:
        explen = -2
        with openDataFile(ds.toExternalFiles()[0]) as mystery:
            # try to avoid crashes...
            explen = len(mystery)
            mystery.numRecords = 1000000000
            mystery.write(fn)
        with openDataFile(fn) as mystery:
            self.assertEqual(len(list(mystery)), explen)
def test_len(self):
    """Check len(), totalLength and numRecords for several dataset types."""

    def verify_counts(dataset, num_records, total_length,
                      check_length_pair=True):
        # Initial values, then overwrite both counters with -1, then
        # updateCounts() has to bring the expected values back.
        self.assertEqual(len(dataset), num_records)
        if check_length_pair:
            self.assertEqual(dataset._length, (num_records, total_length))
        self.assertEqual(dataset.totalLength, total_length)
        self.assertEqual(dataset.numRecords, num_records)
        dataset.totalLength = -1
        dataset.numRecords = -1
        self.assertEqual(dataset.totalLength, -1)
        self.assertEqual(dataset.numRecords, -1)
        dataset.updateCounts()
        self.assertEqual(dataset.totalLength, total_length)
        self.assertEqual(dataset.numRecords, num_records)

    # AlignmentSet
    alignments = AlignmentSet(data.getXml(8), strict=True)
    verify_counts(alignments, 92, 123588)

    # AlignmentSet with filters
    filtered = AlignmentSet(data.getXml(15), strict=True)
    verify_counts(filtered, 40, 52023)

    # NO LONGER SUPPORTED AlignmentSet with cmp.h5
    # (the disabled check used AlignmentSet(upstreamData.getCmpH5(),
    # strict=True) and expected 84 records with a total length of 26103)

    # SubreadSet
    # TODO Turn this back on when pbi's are fixed for subreadsets
    # (the disabled check used SubreadSet(data.getXml(10), strict=True)
    # and expected 92 records with a total length of 123588)

    # HdfSubreadSet
    # len means something else in bax/bas land. These numbers may actually
    # be correct...
    hdf_subreads = HdfSubreadSet(data.getXml(17), strict=True)
    verify_counts(hdf_subreads, 9, 128093)

    # ReferenceSet (the _length pair is not checked for this type)
    references = ReferenceSet(data.getXml(9), strict=True)
    verify_counts(references, 59, 85774, check_length_pair=False)
def test_len(self):
    """Check len(), totalLength and numRecords for several dataset types."""

    def verify_counts(dataset, num_records, total_length,
                      check_length_pair=True):
        # Initial values, then overwrite both counters with -1, then
        # updateCounts() has to bring the expected values back.
        self.assertEqual(len(dataset), num_records)
        if check_length_pair:
            self.assertEqual(dataset._length, (num_records, total_length))
        self.assertEqual(dataset.totalLength, total_length)
        self.assertEqual(dataset.numRecords, num_records)
        dataset.totalLength = -1
        dataset.numRecords = -1
        self.assertEqual(dataset.totalLength, -1)
        self.assertEqual(dataset.numRecords, -1)
        dataset.updateCounts()
        self.assertEqual(dataset.totalLength, total_length)
        self.assertEqual(dataset.numRecords, num_records)

    # AlignmentSet
    alignments = AlignmentSet(data.getXml(8), strict=True)
    verify_counts(alignments, 92, 123588)
    # Iterating must agree with the stored counts.
    self.assertEqual(sum(1 for _ in alignments), 92)
    self.assertEqual(sum(len(record) for record in alignments), 123588)

    # AlignmentSet with filters
    filtered = AlignmentSet(data.getXml(15), strict=True)
    verify_counts(filtered, 40, 52023)

    # AlignmentSet with cmp.h5
    cmp_h5_path = upstreamData.getBamAndCmpH5()[1]
    cmp_alignments = AlignmentSet(cmp_h5_path, strict=True)
    verify_counts(cmp_alignments, 112, 59970)

    # SubreadSet
    subreads = SubreadSet(data.getXml(10), strict=True)
    verify_counts(subreads, 92, 124093)
    # Iterating must agree with the stored counts.
    self.assertEqual(sum(1 for _ in subreads), 92)
    self.assertEqual(sum(len(record) for record in subreads), 124093)

    # HdfSubreadSet
    # len means something else in bax/bas land. These numbers may actually
    # be correct...
    hdf_subreads = HdfSubreadSet(data.getXml(17), strict=True)
    verify_counts(hdf_subreads, 9, 128093)

    # ReferenceSet (the _length pair is not checked for this type)
    references = ReferenceSet(data.getXml(9), strict=True)
    verify_counts(references, 59, 85774, check_length_pair=False)