def test_contigset_build(self):
    ds1 = ContigSet(data.getXml(3), skipMissing=True)
    assert type(ds1).__name__ == 'ContigSet'
    assert type(ds1._metadata).__name__ == 'ContigSetMetadata'
    ds2 = ContigSet(data.getXml(3), skipMissing=True)
    assert type(ds2).__name__ == 'ContigSet'
    assert type(ds2._metadata).__name__ == 'ContigSetMetadata'
def test_copy(self):
    ds1 = DataSet(data.getXml())
    ds2 = ds1.copy()
    self.assertFalse(ds1 == ds2)
    self.assertFalse(ds1.uuid == ds2.uuid)
    self.assertFalse(ds1 is ds2)
    self.assertTrue(ds1.name == ds2.name)
    self.assertTrue(ds1.externalResources == ds2.externalResources)
    # The name and UniqueId are different:
    self.assertFalse(ds1.objMetadata == ds2.objMetadata)
    self.assertTrue(ds1.filters == ds2.filters)
    self.assertTrue(ds1.subdatasets == ds2.subdatasets)
    self.assertTrue(len(ds1.subdatasets) == 2)
    self.assertTrue(len(ds2.subdatasets) == 2)
    assert not reduce(lambda x, y: x or y,
                      [ds1d is ds2d
                       for ds1d in ds1.subdatasets
                       for ds2d in ds2.subdatasets])
    # TODO: once simulated files are indexable, turn on strict:
    ds1 = SubreadSet(data.getXml(no=10), strict=False)
    self.assertEquals(type(ds1.metadata).__name__, 'SubreadSetMetadata')
    ds2 = ds1.copy()
    self.assertEquals(type(ds2.metadata).__name__, 'SubreadSetMetadata')
    # Let's try casting
    ds1 = DataSet(data.getBam())
    self.assertEquals(type(ds1).__name__, 'DataSet')
    ds1 = ds1.copy(asType='SubreadSet')
    self.assertEquals(type(ds1).__name__, 'SubreadSet')
    # Let's do some illicit casting
    with self.assertRaises(TypeError):
        ds1 = ds1.copy(asType='ReferenceSet')
    # Let's try not having to cast
    ds1 = SubreadSet(data.getBam())
    self.assertEquals(type(ds1).__name__, 'SubreadSet')
def test_getitem(self):
    types = [AlignmentSet(data.getXml(8)),
             ReferenceSet(data.getXml(9)),
             SubreadSet(data.getXml(10)),
             ]
    for ds in types:
        self.assertTrue(ds[0])
def test_subreadset_metadata_element_name(self):
    # without touching the element:
    sset = SubreadSet(data.getXml(9))
    log.debug(data.getXml(9))
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml")
    log.debug(fn.name)
    sset.write(fn.name)
    f = ET.parse(fn.name)
    assert len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'SubreadSetMetadata')) == 0
    assert len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'DataSetMetadata')) == 1
    fn.close()

    # with touching the element:
    sset = SubreadSet(data.getXml(9))
    sset.metadata.description = 'foo'
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml")
    sset.write(fn.name, validate=False)
    f = ET.parse(fn.name)
    assert len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'SubreadSetMetadata')) == 0
    assert len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'DataSetMetadata')) == 1
    fn.close()
def test_contigset_build(self):
    ds1 = ContigSet(data.getXml(3), skipMissing=True)
    self.assertEquals(type(ds1).__name__, 'ContigSet')
    self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
    ds2 = ContigSet(data.getXml(3), skipMissing=True)
    self.assertEquals(type(ds2).__name__, 'ContigSet')
    self.assertEquals(type(ds2._metadata).__name__, 'ContigSetMetadata')
def test_alignment_reference(self):
    rs1 = ReferenceSet(data.getXml(9))
    fasta_res = rs1.externalResources[0]
    fasta_file = urlparse(fasta_res.resourceId).path

    ds1 = AlignmentSet(data.getXml(8), referenceFastaFname=rs1)
    aln_ref = None
    for aln in ds1:
        aln_ref = aln.reference()
        break
    self.assertTrue(aln_ref is not None)

    ds1 = AlignmentSet(data.getXml(8), referenceFastaFname=fasta_file)
    aln_ref = None
    for aln in ds1:
        aln_ref = aln.reference()
        break
    self.assertTrue(aln_ref is not None)

    ds1 = AlignmentSet(data.getXml(8))
    ds1.addReference(fasta_file)
    aln_ref = None
    for aln in ds1:
        aln_ref = aln.reference()
        break
    self.assertTrue(aln_ref is not None)
def test_split(self):
    ds1 = DataSet(data.getXml())
    self.assertTrue(ds1.numExternalResources > 1)
    dss = ds1.split()
    self.assertTrue(len(dss) == ds1.numExternalResources)
    dss = ds1.split(chunks=1)
    self.assertTrue(len(dss) == 1)
    dss = ds1.split(chunks=2, ignoreSubDatasets=True)
    self.assertTrue(len(dss) == 2)
    self.assertFalse(dss[0].uuid == dss[1].uuid)
    self.assertTrue(dss[0].name == dss[1].name)

    # Let's try merging and splitting on subdatasets
    ds1 = DataSet(data.getXml(8))
    self.assertEquals(ds1.totalLength, 123588)
    ds1tl = ds1.totalLength
    ds2 = DataSet(data.getXml(11))
    self.assertEquals(ds2.totalLength, 117086)
    ds2tl = ds2.totalLength
    dss = ds1 + ds2
    self.assertTrue(dss.totalLength == (ds1tl + ds2tl))
    ds1, ds2 = sorted(dss.split(2),
                      key=lambda x: x.totalLength,
                      reverse=True)
    self.assertTrue(ds1.totalLength == ds1tl)
    self.assertTrue(ds2.totalLength == ds2tl)
def test_ccsread_build(self):
    ds1 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
    self.assertEquals(type(ds1).__name__, 'ConsensusReadSet')
    self.assertEquals(type(ds1._metadata).__name__, 'SubreadSetMetadata')
    ds2 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
    self.assertEquals(type(ds2).__name__, 'ConsensusReadSet')
    self.assertEquals(type(ds2._metadata).__name__, 'SubreadSetMetadata')
def test_subreadset_split_metadata_element_name(self):
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.debug(fn)
    sset = SubreadSet(data.getXml(10), data.getXml(13))
    chunks = sset.split(chunks=5, zmws=False, ignoreSubDatasets=True)
    self.assertEqual(len(chunks), 2)
    chunks[0].write(fn)
def test_subreadset_metadata_element_name(self):
    # without touching the element:
    sset = SubreadSet(data.getXml(10))
    log.debug(data.getXml(10))
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.debug(fn)
    sset.write(fn)
    f = ET.parse(fn)
    self.assertEqual(len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'SubreadSetMetadata')), 0)
    self.assertEqual(len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'DataSetMetadata')), 1)

    # with touching the element:
    sset = SubreadSet(data.getXml(10))
    sset.metadata.description = 'foo'
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    sset.write(fn, validate=False)
    f = ET.parse(fn)
    self.assertEqual(len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'SubreadSetMetadata')), 0)
    self.assertEqual(len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'DataSetMetadata')), 1)
def test_ccsread_build(self):
    ds1 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
    assert type(ds1).__name__ == 'ConsensusReadSet'
    assert type(ds1._metadata).__name__ == 'SubreadSetMetadata'
    ds2 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
    assert type(ds2).__name__ == 'ConsensusReadSet'
    assert type(ds2._metadata).__name__ == 'SubreadSetMetadata'
def test_len(self):
    # AlignmentSet
    aln = AlignmentSet(data.getXml(8), strict=True)
    self.assertEqual(len(aln), 92)
    self.assertEqual(aln._length, (92, 123588))
    self.assertEqual(aln.totalLength, 123588)
    self.assertEqual(aln.numRecords, 92)
    aln.totalLength = -1
    aln.numRecords = -1
    self.assertEqual(aln.totalLength, -1)
    self.assertEqual(aln.numRecords, -1)
    aln.updateCounts()
    self.assertEqual(aln.totalLength, 123588)
    self.assertEqual(aln.numRecords, 92)
    self.assertEqual(sum(1 for _ in aln), 92)
    self.assertEqual(sum(len(rec) for rec in aln), 123588)

    # AlignmentSet with filters
    aln = AlignmentSet(data.getXml(15), strict=True)
    self.assertEqual(len(aln), 40)
    self.assertEqual(aln._length, (40, 52023))
    self.assertEqual(aln.totalLength, 52023)
    self.assertEqual(aln.numRecords, 40)
    aln.totalLength = -1
    aln.numRecords = -1
    self.assertEqual(aln.totalLength, -1)
    self.assertEqual(aln.numRecords, -1)
    aln.updateCounts()
    self.assertEqual(aln.totalLength, 52023)
    self.assertEqual(aln.numRecords, 40)

    # SubreadSet
    sset = SubreadSet(data.getXml(10), strict=True)
    self.assertEqual(len(sset), 92)
    self.assertEqual(sset._length, (92, 124093))
    self.assertEqual(sset.totalLength, 124093)
    self.assertEqual(sset.numRecords, 92)
    sset.totalLength = -1
    sset.numRecords = -1
    self.assertEqual(sset.totalLength, -1)
    self.assertEqual(sset.numRecords, -1)
    sset.updateCounts()
    self.assertEqual(sset.totalLength, 124093)
    self.assertEqual(sset.numRecords, 92)
    self.assertEqual(sum(1 for _ in sset), 92)
    self.assertEqual(sum(len(rec) for rec in sset), 124093)

    # ReferenceSet
    sset = ReferenceSet(data.getXml(9), strict=True)
    self.assertEqual(len(sset), 59)
    self.assertEqual(sset.totalLength, 85774)
    self.assertEqual(sset.numRecords, 59)
    sset.totalLength = -1
    sset.numRecords = -1
    self.assertEqual(sset.totalLength, -1)
    self.assertEqual(sset.numRecords, -1)
    sset.updateCounts()
    self.assertEqual(sset.totalLength, 85774)
    self.assertEqual(sset.numRecords, 59)
def test_len(self):
    # AlignmentSet
    aln = AlignmentSet(data.getXml(7), strict=True)
    assert len(aln) == 92
    assert aln._length == (92, 123588)
    assert aln.totalLength == 123588
    assert aln.numRecords == 92
    aln.totalLength = -1
    aln.numRecords = -1
    assert aln.totalLength == -1
    assert aln.numRecords == -1
    aln.updateCounts()
    assert aln.totalLength == 123588
    assert aln.numRecords == 92
    assert sum(1 for _ in aln) == 92
    assert sum(len(rec) for rec in aln) == 123588

    # AlignmentSet with filters
    aln = AlignmentSet(data.getXml(14), strict=True)
    assert len(aln) == 40
    assert aln._length == (40, 52023)
    assert aln.totalLength == 52023
    assert aln.numRecords == 40
    aln.totalLength = -1
    aln.numRecords = -1
    assert aln.totalLength == -1
    assert aln.numRecords == -1
    aln.updateCounts()
    assert aln.totalLength == 52023
    assert aln.numRecords == 40

    # SubreadSet
    sset = SubreadSet(data.getXml(9), strict=True)
    assert len(sset) == 92
    assert sset._length == (92, 124093)
    assert sset.totalLength == 124093
    assert sset.numRecords == 92
    sset.totalLength = -1
    sset.numRecords = -1
    assert sset.totalLength == -1
    assert sset.numRecords == -1
    sset.updateCounts()
    assert sset.totalLength == 124093
    assert sset.numRecords == 92
    assert sum(1 for _ in sset) == 92
    assert sum(len(rec) for rec in sset) == 124093

    # ReferenceSet
    sset = ReferenceSet(data.getXml(8), strict=True)
    assert len(sset) == 59
    assert sset.totalLength == 85774
    assert sset.numRecords == 59
    sset.totalLength = -1
    sset.numRecords = -1
    assert sset.totalLength == -1
    assert sset.numRecords == -1
    sset.updateCounts()
    assert sset.totalLength == 85774
    assert sset.numRecords == 59
def test_getitem(self):
    types = [
        AlignmentSet(data.getXml(7)),
        ReferenceSet(data.getXml(8)),
        SubreadSet(data.getXml(9)),
    ]
    for ds in types:
        assert ds[0]
def test_copyTo_cli_absolute_dir(self):
    # to a directory:
    # absolute:
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    fn = os.path.join(outdir, os.path.split(data.getXml(7))[1])
    cmd = "dataset copyto {i} {o}".format(i=data.getXml(7), o=outdir)
    self._run_cmd_with_output(cmd, fn)
    sset = AlignmentSet(fn, strict=True)
    assert not _is_relative(fn)
def test_contigset_build(self):
    ds1 = ContigSet(data.getXml(3))
    self.assertEquals(type(ds1).__name__, 'ContigSet')
    self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
    ds2 = ContigSet(data.getXml(3))
    self.assertEquals(type(ds2).__name__, 'ContigSet')
    self.assertEquals(type(ds2._metadata).__name__, 'ContigSetMetadata')
    for contigmd in ds2.metadata.contigs:
        self.assertEquals(type(contigmd).__name__, 'ContigMetadata')
def test_file_arg(self):
    fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
    log.debug(fn)
    sset = SubreadSet(data.getXml(9))
    assert len(sset) == 92
    size = 10
    qn = [r.qName for r in sset[:size]]
    with open(fn, 'w') as ofh:
        for q in qn:
            ofh.write(q)
            ofh.write('\n')
    good_qn = [('=', fn)]
    sset.filters.addRequirement(qname=good_qn)
    assert size == sum(1 for _ in sset)
    assert size == len(sset)
    og = set(qn)
    for r in sset:
        og.discard(r.qName)
    assert len(og) == 0

    fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
    log.debug(fn)
    sset = SubreadSet(data.getXml(9))
    assert len(sset) == 92
    size = 10
    qn = [r.qName for r in sset[:size]]
    with open(fn, 'w') as ofh:
        for q in qn:
            ofh.write(q)
            ofh.write('\n')
    good_qn = [('=', fn)]
    sset.filters.addRequirement(qname_file=good_qn)
    assert size == sum(1 for _ in sset)
    assert size == len(sset)
    og = set(qn)
    for r in sset:
        og.discard(r.qName)
    assert len(og) == 0

    fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
    log.debug(fn)
    sset = SubreadSet(data.getXml(9))
    assert len(sset) == 92
    size = 4
    hn = [r for r in sorted(list(set(sset.index.holeNumber)))[:size]]
    with open(fn, 'w') as ofh:
        for h in hn:
            ofh.write(str(h))
            ofh.write('\n')
    good_hn = [('=', fn)]
    sset.filters.addRequirement(zm=good_hn)
    assert size == len(set(sset.index.holeNumber))
    og = set(hn)
    for r in sset:
        og.discard(r.holeNumber)
    assert len(og) == 0
def test_create_cli(self):
    log.debug("Absolute")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    cmd = "dataset create --type AlignmentSet {o} {i1} {i2}".format(
        o=os.path.join(outdir, 'pbalchemysim.alignmentset.xml'),
        i1=data.getXml(7), i2=data.getXml(10))
    self._check_cmd(cmd)
    assert os.path.exists(
        os.path.join(outdir, os.path.basename(data.getXml(11))))
def test_create_cli_relative(self):
    log.debug("Relative")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    ofn = self._get_mock_alignment_set_out(outdir)
    cmd = ("dataset create --relative --type AlignmentSet "
           "{o} {i1} {i2}".format(o=ofn,
                                  i1=data.getXml(7),
                                  i2=data.getXml(10)))
    self._check_cmd(cmd)
    assert os.path.exists(ofn)
def test_create_cli_automatic_type(self):
    log.debug("No type specified")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    ofn = self._get_mock_alignment_set_out(outdir)
    cmd = "dataset create {o} {i1} {i2}".format(o=ofn,
                                                i1=data.getXml(7),
                                                i2=data.getXml(10))
    self._run_cmd_with_output(cmd, ofn)
    aset = AlignmentSet(ofn)
    shutil.rmtree(outdir)
def test_file_factory(self):
    # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
    types = [AlignmentSet(data.getXml(8)),
             ReferenceSet(data.getXml(9)),
             SubreadSet(data.getXml(10)),
             #ConsensusAlignmentSet(data.getXml(20)),
             HdfSubreadSet(data.getXml(19))]
    for ds in types:
        mystery = openDataFile(ds.toExternalFiles()[0])
        self.assertEqual(type(mystery), type(ds))
def test_file_factory(self):
    # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
    types = [
        AlignmentSet(data.getXml(7)),
        ReferenceSet(data.getXml(8)),
        SubreadSet(data.getXml(9))
    ]
    for ds in types:
        mystery = openDataFile(ds.toExternalFiles()[0])
        assert type(mystery) == type(ds)
def test_checkFilterMatch(self):
    # different resourceIds, compatible filters:
    ds1 = DataSet(data.getXml(no=8))
    ds2 = DataSet(data.getXml(no=11))
    #self.assertTrue(ds1._checkFilterMatch(ds2.filters))
    self.assertTrue(ds1.filters.testCompatibility(ds2.filters))

    # different resourceIds, incompatible filters:
    ds3 = DataSet(data.getXml(no=11))
    ds3.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    #self.assertFalse(ds1._checkFilterMatch(ds3.filters))
    self.assertFalse(ds1.filters.testCompatibility(ds3.filters))
def test_create_cli_generate_indices_2(self):
    log.debug("Generate existing indices no type specified")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    ofn = self._get_mock_alignment_set_out(outdir)
    cmd = ("dataset create "
           "--generateIndices {o} {i1} {i2}").format(o=ofn,
                                                     i1=data.getXml(7),
                                                     i2=data.getXml(10))
    self._run_cmd_with_output(cmd, ofn)
    aset = AlignmentSet(ofn, strict=True)
    shutil.rmtree(outdir)
def test_file_factory(self):
    # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
    types = [
        AlignmentSet(data.getXml(8)),
        ReferenceSet(data.getXml(9)),
        SubreadSet(data.getXml(10)),
        #ConsensusAlignmentSet(data.getXml(20)),
        HdfSubreadSet(data.getXml(19))
    ]
    for ds in types:
        mystery = openDataFile(ds.toExternalFiles()[0])
        self.assertEqual(type(mystery), type(ds))
def test_subreadset_consolidate(self):
    log.debug("Test through API")
    aln = SubreadSet(data.getXml(10), data.getXml(13))
    self.assertEqual(len(aln.toExternalFiles()), 2)
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outfn = os.path.join(outdir, 'merged.bam')
    aln.consolidate(outfn)
    self.assertTrue(os.path.exists(outfn))
    self.assertEqual(len(aln.toExternalFiles()), 1)
    nonCons = SubreadSet(data.getXml(10), data.getXml(13))
    self.assertEqual(len(nonCons.toExternalFiles()), 2)
    for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
        self.assertEqual(read1, read2)
    self.assertEqual(len(aln), len(nonCons))
def test_nested_external_resources(self):
    log.debug("Testing nested externalResources in AlignmentSets")
    aln = AlignmentSet(data.getXml(0), skipMissing=True)
    self.assertTrue(aln.externalResources[0].pbi)
    self.assertTrue(aln.externalResources[0].reference)
    self.assertEqual(
        aln.externalResources[0].externalResources[0].metaType,
        'PacBio.ReferenceFile.ReferenceFastaFile')
    self.assertEqual(aln.externalResources[0].scraps, None)

    log.debug("Testing nested externalResources in SubreadSets")
    subs = SubreadSet(data.getXml(5), skipMissing=True)
    self.assertTrue(subs.externalResources[0].scraps)
    self.assertEqual(
        subs.externalResources[0].externalResources[0].metaType,
        'PacBio.SubreadFile.ScrapsBamFile')
    self.assertEqual(subs.externalResources[0].reference, None)

    log.debug("Testing added nested externalResources to SubreadSet")
    subs = SubreadSet(data.getXml(10))
    self.assertFalse(subs.externalResources[0].scraps)
    subs.externalResources[0].scraps = 'fake.fasta'
    self.assertTrue(subs.externalResources[0].scraps)
    self.assertEqual(
        subs.externalResources[0].externalResources[0].metaType,
        'PacBio.SubreadFile.ScrapsBamFile')
    subs.externalResources[0].barcodes = 'bc.fasta'
    self.assertTrue(subs.externalResources[0].barcodes)
    self.assertEqual(
        subs.externalResources[0].externalResources[1].metaType,
        "PacBio.DataSet.BarcodeSet")
    subs.externalResources[0].adapters = 'foo.adapters.fasta'
    self.assertEqual(subs.externalResources[0].adapters,
                     'foo.adapters.fasta')
    self.assertEqual(
        subs.externalResources[0].externalResources[2].metaType,
        "PacBio.SubreadFile.AdapterFastaFile")

    log.debug("Testing adding nested externalResources to AlignmentSet "
              "manually")
    aln = AlignmentSet(data.getXml(8))
    self.assertTrue(aln.externalResources[0].bai)
    self.assertTrue(aln.externalResources[0].pbi)
    self.assertFalse(aln.externalResources[0].reference)
    aln.externalResources[0].reference = 'fake.fasta'
    self.assertTrue(aln.externalResources[0].reference)
    self.assertEqual(
        aln.externalResources[0].externalResources[0].metaType,
        'PacBio.ReferenceFile.ReferenceFastaFile')
def test_incorrect_len_getitem(self):
    types = [AlignmentSet(data.getXml(8)),
             ReferenceSet(data.getXml(9)),
             SubreadSet(data.getXml(10)),
             HdfSubreadSet(data.getXml(19))]
    fn = tempfile.NamedTemporaryFile(suffix=".xml").name
    for ds in types:
        explen = -2
        with openDataFile(ds.toExternalFiles()[0]) as mystery:
            # try to avoid crashes...
            explen = len(mystery)
            mystery.numRecords = 1000000000
            mystery.write(fn)
        with openDataFile(fn) as mystery:
            self.assertEqual(len(list(mystery)), explen)
def test_subread_build(self):
    ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
    ds2 = SubreadSet(data.getXml(no=5), skipMissing=True)
    assert type(ds1).__name__ == 'SubreadSet'
    assert ds1._metadata.__class__.__name__ == 'SubreadSetMetadata'
    assert type(ds1._metadata).__name__ == 'SubreadSetMetadata'
    assert type(ds1.metadata).__name__ == 'SubreadSetMetadata'
    assert len(ds1.metadata.collections) == 1
    assert len(ds2.metadata.collections) == 1
    ds3 = ds1 + ds2
    assert len(ds3.metadata.collections) == 2
    ds4 = SubreadSet(data.getSubreadSet(), skipMissing=True)
    assert type(ds4).__name__ == 'SubreadSet'
    assert type(ds4._metadata).__name__ == 'SubreadSetMetadata'
    assert len(ds4.metadata.collections) == 1
def test_filter_cli(self):
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outfn = os.path.join(outdir, "filtered8.xml")
    log.debug(outfn)
    cmd = "dataset filter {i} {o} {f}".format(
        i=data.getXml(7),
        o=outfn,
        f="rname=E.faecalis.1")
    self._run_cmd_with_output(cmd, outfn)
    aln = AlignmentSet(data.getXml(7))
    aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    aln.updateCounts()
    dset = AlignmentSet(outfn)
    assert str(aln.filters) == str(dset.filters)
    assert aln.totalLength == dset.totalLength
    assert aln.numRecords == dset.numRecords
def test_factory_function(self):
    bam = data.getBam()
    aln = data.getXml(8)
    ref = data.getXml(9)
    sub = data.getXml(10)
    inTypes = [bam, aln, ref, sub]
    expTypes = [DataSet, AlignmentSet, ReferenceSet, SubreadSet]
    for infn, exp in zip(inTypes, expTypes):
        # TODO enable this for all when simulated subread files can be
        # pbi'd
        if exp in [DataSet, ReferenceSet, AlignmentSet]:
            ds = openDataSet(infn, strict=True)
        else:
            ds = openDataSet(infn)
        self.assertEqual(type(ds), exp)
def test_loadmetadata_from_dataset_create_cli(self):
    fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
    fn2 = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
    log.debug(fn)

    aln = AlignmentSet(data.getXml(8))
    aln.metadata.collections = None
    aln.copyTo(fn)
    aln.close()
    del aln
    self.assertTrue(os.path.exists(fn))

    aln = AlignmentSet(fn)
    self.assertFalse(aln.metadata.collections)

    cmd = "dataset create --metadata {m} {o} {i}".format(
        o=fn2,
        i=fn,
        m=("/pbi/dept/secondary/siv/testdata/"
           "SA3-Sequel/lambda/roche_SAT/"
           "m54013_151205_032353.subreadset.xml"))
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0, m)

    aln = AlignmentSet(fn2)
    self.assertTrue(aln.metadata.collections)
def test_referenceset_contigs(self):
    names = [
        'A.baumannii.1', 'A.odontolyticus.1', 'B.cereus.1', 'B.cereus.2',
        'B.cereus.4', 'B.cereus.6', 'B.vulgatus.1', 'B.vulgatus.2',
        'B.vulgatus.3', 'B.vulgatus.4', 'B.vulgatus.5',
        'C.beijerinckii.1', 'C.beijerinckii.2', 'C.beijerinckii.3',
        'C.beijerinckii.4', 'C.beijerinckii.5', 'C.beijerinckii.6',
        'C.beijerinckii.7', 'C.beijerinckii.8', 'C.beijerinckii.9',
        'C.beijerinckii.10', 'C.beijerinckii.11', 'C.beijerinckii.12',
        'C.beijerinckii.13', 'C.beijerinckii.14', 'D.radiodurans.1',
        'D.radiodurans.2', 'E.faecalis.1', 'E.faecalis.2', 'E.coli.1',
        'E.coli.2', 'E.coli.4', 'E.coli.5', 'E.coli.6', 'E.coli.7',
        'H.pylori.1', 'L.gasseri.1', 'L.monocytogenes.1',
        'L.monocytogenes.2', 'L.monocytogenes.3', 'L.monocytogenes.5',
        'N.meningitidis.1', 'P.acnes.1', 'P.aeruginosa.1',
        'P.aeruginosa.2', 'R.sphaeroides.1', 'R.sphaeroides.3',
        'S.aureus.1', 'S.aureus.4', 'S.aureus.5', 'S.epidermidis.1',
        'S.epidermidis.2', 'S.epidermidis.3', 'S.epidermidis.4',
        'S.epidermidis.5', 'S.agalactiae.1', 'S.mutans.1', 'S.mutans.2',
        'S.pneumoniae.1']
    seqlens = [1458, 1462, 1472, 1473, 1472, 1472, 1449, 1449, 1449, 1449,
               1449, 1433, 1433, 1433, 1433, 1433, 1433, 1433, 1433, 1433,
               1433, 1433, 1433, 1433, 1433, 1423, 1423, 1482, 1482, 1463,
               1463, 1463, 1463, 1463, 1463, 1424, 1494, 1471, 1471, 1471,
               1471, 1462, 1446, 1457, 1457, 1386, 1388, 1473, 1473, 1473,
               1472, 1472, 1472, 1472, 1472, 1470, 1478, 1478, 1467]
    ds = ReferenceSet(data.getXml(9))
    log.debug([contig.id for contig in ds])
    for contig, name, seqlen in zip(ds.contigs, names, seqlens):
        self.assertEqual(contig.id, name)
        self.assertEqual(len(contig.sequence), seqlen)
    for name in names:
        self.assertTrue(ds.get_contig(name))
def test_len_h5(self):
    # HdfSubreadSet
    # len means something else in bax/bas land. These numbers may actually
    # be correct...
    sset = HdfSubreadSet(data.getXml(17), strict=True)
    self.assertEqual(len(sset), 9)
    self.assertEqual(sset._length, (9, 128093))
    self.assertEqual(sset.totalLength, 128093)
    self.assertEqual(sset.numRecords, 9)
    sset.totalLength = -1
    sset.numRecords = -1
    self.assertEqual(sset.totalLength, -1)
    self.assertEqual(sset.numRecords, -1)
    sset.updateCounts()
    self.assertEqual(sset.totalLength, 128093)
    self.assertEqual(sset.numRecords, 9)

    # AlignmentSet with cmp.h5
    aln = AlignmentSet(upstreamData.getBamAndCmpH5()[1], strict=True)
    self.assertEqual(len(aln), 112)
    self.assertEqual(aln._length, (112, 59970))
    self.assertEqual(aln.totalLength, 59970)
    self.assertEqual(aln.numRecords, 112)
    aln.totalLength = -1
    aln.numRecords = -1
    self.assertEqual(aln.totalLength, -1)
    self.assertEqual(aln.numRecords, -1)
    aln.updateCounts()
    self.assertEqual(aln.totalLength, 59970)
    self.assertEqual(aln.numRecords, 112)
def test_reads_in_reference(self):
    ds = DataSet(data.getBam())
    refNames = ds.refNames

    # See test_ref_names for why this is expected:
    rn = refNames[15]
    reads = ds.readsInReference(rn)
    self.assertEqual(len(list(reads)), 11)

    ds2 = DataSet(data.getBam(0))
    reads = ds2.readsInReference("E.faecalis.1")
    self.assertEqual(len(list(reads)), 20)

    reads = ds2.readsInReference("E.faecalis.2")
    self.assertEqual(len(list(reads)), 3)

    ds2 = DataSet(data.getXml(8))
    reads = ds2.readsInReference("E.faecalis.1")
    self.assertEqual(len(list(reads)), 20)

    ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])

    # Because of the filter!
    reads = ds2.readsInReference("E.faecalis.2")
    self.assertEqual(len(list(reads)), 0)
def test_subread_build(self):
    ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
    ds2 = SubreadSet(data.getXml(no=5), skipMissing=True)
    self.assertEquals(type(ds1).__name__, 'SubreadSet')
    self.assertEquals(ds1._metadata.__class__.__name__,
                      'SubreadSetMetadata')
    self.assertEquals(type(ds1._metadata).__name__, 'SubreadSetMetadata')
    self.assertEquals(type(ds1.metadata).__name__, 'SubreadSetMetadata')
    self.assertEquals(len(ds1.metadata.collections), 1)
    self.assertEquals(len(ds2.metadata.collections), 1)
    ds3 = ds1 + ds2
    self.assertEquals(len(ds3.metadata.collections), 2)
    ds4 = SubreadSet(data.getSubreadSet(), skipMissing=True)
    self.assertEquals(type(ds4).__name__, 'SubreadSet')
    self.assertEquals(type(ds4._metadata).__name__, 'SubreadSetMetadata')
    self.assertEquals(len(ds4.metadata.collections), 1)
def test_qname_filter_scaling(self):
    # unaligned bam
    bam0 = ("/pbi/dept/secondary/siv/testdata/"
            "SA3-DS/ecoli/2590956/0003/"
            "Analysis_Results/m140913_222218_42240_c10069"
            "9952400000001823139203261564_s1_p0.all.subreadset.xml")
    bam1 = ("/pbi/dept/secondary/siv/testdata/"
            "SA3-DS/ecoli/2590953/0001/"
            "Analysis_Results/m140913_005018_42139_c10071"
            "3652400000001823152404301534_s1_p0.all.subreadset.xml")
    sset = SubreadSet(bam0, bam1)
    self.assertEqual(len(sset), 178570)
    size = 10
    qn = [r.qName for r in sset[:size]]
    good_qn = [('=', name) for name in qn]
    sset.filters.addRequirement(qname=good_qn)
    self.assertEqual(size, sum(1 for _ in sset))
    self.assertEqual(size, len(sset))

    sset = SubreadSet(data.getXml(10))
    self.assertEqual(len(sset), 92)
    size = 10
    qn = [r.qName for r in sset[:size]]
    good_qn = [('=', name) for name in qn]
    sset.filters.addRequirement(qname=good_qn)
    self.assertEqual(size, sum(1 for _ in sset))
    self.assertEqual(size, len(sset))
def test_add_double_bound_filters(self):
    ds1 = AlignmentSet(data.getXml(8))
    ds1.filters.addRequirement(rq=[('>', '0.85'),
                                   ('<', '0.99')])
    self.assertEquals(str(ds1.filters), '( rq > 0.85 ) OR ( rq < 0.99 )')

    ds1 = AlignmentSet(data.getXml(8))
    self.assertEquals(str(ds1.filters), '')
    ds1.filters.addFilter(rq=[('>', '0.85'),
                              ('<', '0.99')])
    self.assertEquals(str(ds1.filters), '( rq > 0.85 AND rq < 0.99 )')

    ds1.filters.addFilter(length=[('>', '1000')])
    self.assertEquals(str(ds1.filters),
                      '( rq > 0.85 AND rq < 0.99 ) OR ( length > 1000 )')

    ds1.filters.removeFilter(0)
    self.assertEquals(str(ds1.filters), '( length > 1000 )')
def test_contigset_consolidate_int_names(self):
    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")

    inFas = os.path.join(outdir, 'infile.fasta')
    outFas1 = os.path.join(outdir, 'tempfile1.fasta')
    outFas2 = os.path.join(outdir, 'tempfile2.fasta')

    # copy fasta reference to hide fai and ensure FastaReader is used
    backticks('cp {i} {o}'.format(
        i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
        o=inFas))
    rs1 = ContigSet(inFas)

    double = 'B.cereus.1'
    exp_double = rs1.get_contig(double)

    # todo: modify the names first:
    with FastaWriter(outFas1) as writer:
        writer.writeRecord('5141', exp_double.sequence)
    with FastaWriter(outFas2) as writer:
        writer.writeRecord('5142', exp_double.sequence)

    exp_double_seqs = [exp_double.sequence, exp_double.sequence]
    exp_names = ['5141', '5142']

    obs_file = ContigSet(outFas1, outFas2)
    log.debug(obs_file.toExternalFiles())
    obs_file.consolidate()
    log.debug(obs_file.toExternalFiles())

    # open obs and compare to exp
    for name, seq in zip(exp_names, exp_double_seqs):
        self.assertEqual(obs_file.get_contig(name).sequence[:], seq)
def test_pbmerge_indexing(self):
    log.debug("Test through API")
    aln = AlignmentSet(data.getXml(12))
    self.assertEqual(len(aln.toExternalFiles()), 2)
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outfn = os.path.join(outdir, 'merged.bam')
    log.info(outfn)

    consolidateXml(aln, outfn, cleanup=False)
    self.assertTrue(os.path.exists(outfn))
    self.assertTrue(os.path.exists(outfn + '.pbi'))

    cons = AlignmentSet(outfn)
    self.assertEqual(len(aln), len(cons))
    orig_stats = os.stat(outfn + '.pbi')

    cons.externalResources[0].pbi = None
    self.assertEqual(None, cons.externalResources[0].pbi)
    # test is too quick, stat times might be within the same second
    time.sleep(1)
    cons.induceIndices()
    self.assertEqual(outfn + '.pbi', cons.externalResources[0].pbi)
    self.assertEqual(orig_stats, os.stat(cons.externalResources[0].pbi))

    cons.externalResources[0].pbi = None
    self.assertEqual(None, cons.externalResources[0].pbi)
    # test is too quick, stat times might be within the same second
    time.sleep(1)
    cons.induceIndices(force=True)
    self.assertNotEqual(orig_stats,
                        os.stat(cons.externalResources[0].pbi))
def test_de_novo(self):
    ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.info(ofn)
    ss = SubreadSet(data.getXml(10))
    col = CollectionMetadata()
    self.assertFalse(ss.metadata.collections)

    ss.metadata.collections.append(col)
    self.assertTrue(ss.metadata.collections)

    col.cellIndex = 1
    self.assertEqual(ss.metadata.collections[0].cellIndex, 1)

    col.instrumentName = "foo"
    self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")

    col.context = 'bar'
    self.assertEqual(ss.metadata.collections[0].context, "bar")

    ss.metadata.collections[0].runDetails.name = 'foo'
    self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)

    ss.metadata.collections[0].wellSample.name = 'bar'
    self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)

    ss.metadata.collections[0].wellSample.wellName = 'baz'
    self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)

    ss.metadata.collections[0].wellSample.concentration = 'baz'
    self.assertEqual('baz',
                     ss.metadata.collections[0].wellSample.concentration)

    ss.write(ofn, validate=False)
def test_newUuid_random_cli(self):
    fn_orig = data.getXml(8)
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    fn = os.path.join(outdir, 'fn.alignmentset.xml')
    fn2 = os.path.join(outdir, 'fn2.alignmentset.xml')
    with AlignmentSet(fn_orig) as aln:
        aln.copyTo(fn)
        shutil.copy(fn, fn2)

    pre_uuid = AlignmentSet(fn).uuid
    pre_uuid2 = AlignmentSet(fn2).uuid
    self.assertEqual(pre_uuid, pre_uuid2)

    cmd = "dataset newuuid --random {d}".format(d=fn)
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0)
    self.assertTrue(os.path.exists(fn))

    cmd = "dataset newuuid --random {d}".format(d=fn2)
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0)
    self.assertTrue(os.path.exists(fn2))

    post_uuid = AlignmentSet(fn).uuid
    post_uuid2 = AlignmentSet(fn2).uuid
    self.assertNotEqual(pre_uuid, post_uuid)
    self.assertNotEqual(pre_uuid2, post_uuid2)
    # RANDOM, THEREFORE THESE ARE NOT EQUAL:
    self.assertNotEqual(post_uuid, post_uuid2)
def test_split_by_contigs_presplit(self):
    # Consumes too much memory for Jenkins

    # Test to make sure the result of a split by contigs has an
    # appropriate number of records (make sure filters are appropriately
    # aggressive)
    ds2 = DataSet(data.getXml(15))
    bams = ds2.externalResources.resourceIds
    self.assertEqual(len(bams), 2)
    refwindows = ds2.refWindows
    self.assertEqual(refwindows, [(0, 0, 224992)])
    res1 = openIndexedAlignmentFile(bams[0][7:])
    res2 = openIndexedAlignmentFile(bams[1][7:])

    def count(iterable):
        count = 0
        for _ in iterable:
            count += 1
        return count

    self.assertEqual(count(res1.readsInRange(*refwindows[0])), 1409)
    self.assertEqual(count(res2.readsInRange(*refwindows[0])), 1375)
    self.assertEqual(count(ds2.readsInRange(*refwindows[0])), 2784)
    self.assertEqual(count(ds2.records), 2784)
    ds2.disableFilters()
    self.assertEqual(count(ds2.records), 53552)
    self.assertEqual(ds2.countRecords(), 53552)
def test_subset_filter(self):
    ds2 = AlignmentSet(data.getXml(7))
    assert len(ds2) == 92
    modvalue = 8

    # manually:
    hns = ds2.index.holeNumber
    assert np.count_nonzero(hns % modvalue == 0) == 26

    # dset filters:
    ds2.filters.addRequirement(zm=[('=', '0', modvalue)])
    assert len(ds2) == 26

    # written:
    filtstr = '( Uint32Cast(zm) % 8 = 0 )'
    assert str(ds2.filters) == filtstr
    filtxmlstr = ('<pbbase:Property Hash="Uint32Cast" Modulo="8" '
                  'Name="zm" Operator="=" Value="0"/>')
    fn = tempfile.NamedTemporaryFile(suffix="alignmentset.xml").name
    ds2.write(fn)
    with open(fn, 'r') as ifh:
        found = False
        for line in ifh:
            if filtxmlstr in line:
                found = True
    assert found
def test_filter(self):
    ds2 = DataSet(data.getXml(8))
    ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    self.assertEqual(len(list(ds2.records)), 20)
    ds2.disableFilters()
    self.assertEqual(len(list(ds2.records)), 92)
    ds2.enableFilters()
    self.assertEqual(len(list(ds2.records)), 20)
def test_contigset_consolidate(self):
    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")

    inFas = os.path.join(outdir, 'infile.fasta')
    outFas1 = os.path.join(outdir, 'tempfile1.fasta')
    outFas2 = os.path.join(outdir, 'tempfile2.fasta')

    # copy fasta reference to hide fai and ensure FastaReader is used
    backticks('cp {i} {o}'.format(
        i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
        o=inFas))
    rs1 = ContigSet(inFas)

    singletons = ['A.baumannii.1', 'A.odontolyticus.1']
    double = 'B.cereus.1'
    reader = rs1.resourceReaders()[0]
    exp_double = rs1.get_contig(double)
    exp_singles = [rs1.get_contig(name) for name in singletons]

    # todo: modify the names first:
    with FastaWriter(outFas1) as writer:
        writer.writeRecord(exp_singles[0])
        writer.writeRecord(exp_double.name + '_10_20', exp_double.sequence)
    with FastaWriter(outFas2) as writer:
        writer.writeRecord(exp_double.name + '_0_10',
                           exp_double.sequence + 'ATCGATCGATCG')
        writer.writeRecord(exp_singles[1])

    exp_double_seq = ''.join([exp_double.sequence,
                              'ATCGATCGATCG',
                              exp_double.sequence])
    exp_single_seqs = [rec.sequence for rec in exp_singles]

    acc_file = ContigSet(outFas1, outFas2)
    acc_file.induceIndices()
    log.debug(acc_file.toExternalFiles())
    self.assertEqual(len(acc_file), 4)
    self.assertEqual(len(list(acc_file)), 4)
    acc_file.consolidate()
    log.debug(acc_file.toExternalFiles())

    # open acc and compare to exp
    for name, seq in zip(singletons, exp_single_seqs):
        self.assertEqual(acc_file.get_contig(name).sequence[:], seq)
    self.assertEqual(acc_file.get_contig(double).sequence[:],
                     exp_double_seq)

    self.assertEqual(len(acc_file._openReaders), 1)
    self.assertEqual(len(acc_file.index), 3)
    self.assertEqual(len(acc_file._indexMap), 3)
    self.assertEqual(len(acc_file), 3)
    self.assertEqual(len(list(acc_file)), 3)

    # test merge:
    acc1 = ContigSet(outFas1)
    acc2 = ContigSet(outFas2)
    acc3 = acc1 + acc2
def test_contigset_len(self):
    ref = ReferenceSet(data.getXml(9))
    exp_n_contigs = len(ref)
    refs = ref.split(10)
    self.assertEqual(len(refs), 10)
    obs_n_contigs = 0
    for r in refs:
        obs_n_contigs += len(r)
    self.assertEqual(obs_n_contigs, exp_n_contigs)
def test_addFilters(self):
    ds1 = DataSet()
    filt = Filters()
    filt.addRequirement(rq=[('>', '0.85')])
    ds1.addFilters(filt)
    self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
    # Or added from a source XML
    ds2 = DataSet(data.getXml(16))
    self.assertTrue(str(ds2.filters).startswith(
        '( rname = E.faecalis'))
def test_merge(self):
    # xmls with different resourceIds: success
    ds1 = DataSet(data.getXml(no=8))
    ds2 = DataSet(data.getXml(no=11))
    ds3 = ds1 + ds2
    expected = ds1.numExternalResources + ds2.numExternalResources
    self.assertTrue(ds3.numExternalResources == expected)

    # xmls with different resourceIds but conflicting filters:
    # failure to merge
    ds2 = DataSet(data.getXml(no=11))
    ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    ds3 = ds1 + ds2
    self.assertEqual(ds3, None)

    # xmls with same resourceIds: ignores new inputs
    ds1 = DataSet(data.getXml(no=8))
    ds2 = DataSet(data.getXml(no=8))
    ds3 = ds1 + ds2
    expected = ds1.numExternalResources
    self.assertTrue(ds3.numExternalResources == expected)