def test_reads_in_reference(self):
    """Exercise readsInReference by name, across dataset flavors, and with filters."""
    ds = DataSet(data.getBam())
    # See test_ref_names for why this is expected:
    ref_name = ds.refNames[15]
    self.assertEqual(len(list(ds.readsInReference(ref_name))), 11)

    ds2 = DataSet(data.getBam(0))
    self.assertEqual(len(list(ds2.readsInReference("E.faecalis.1"))), 20)
    self.assertEqual(len(list(ds2.readsInReference("E.faecalis.2"))), 3)

    ds2 = DataSet(data.getXml(8))
    self.assertEqual(len(list(ds2.readsInReference("E.faecalis.1"))), 20)

    ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    # Because of the filter!
    self.assertEqual(len(list(ds2.readsInReference("E.faecalis.2"))), 0)
def test_copy(self):
    """copy() must deep-copy contents, mint a new UUID, and support casting.

    Fixes: deprecated assertEquals -> assertEqual; bare `assert` (stripped
    under -O) replaced with a unittest assertion; reduce-with-or -> any().
    """
    ds1 = DataSet(data.getXml())
    ds2 = ds1.copy()
    self.assertFalse(ds1 == ds2)
    self.assertFalse(ds1.uuid == ds2.uuid)
    self.assertFalse(ds1 is ds2)
    self.assertTrue(ds1.name == ds2.name)
    self.assertTrue(ds1.externalResources == ds2.externalResources)
    # The name and UniqueId are different:
    self.assertFalse(ds1.objMetadata == ds2.objMetadata)
    self.assertTrue(ds1.filters == ds2.filters)
    self.assertTrue(ds1.subdatasets == ds2.subdatasets)
    self.assertTrue(len(ds1.subdatasets) == 2)
    self.assertTrue(len(ds2.subdatasets) == 2)
    # No subdataset object may be shared (by identity) between the copy
    # and the original:
    self.assertFalse(any(ds1d is ds2d
                         for ds1d in ds1.subdatasets
                         for ds2d in ds2.subdatasets))
    # TODO: once simulated files are indexable, turn on strict:
    ds1 = SubreadSet(data.getXml(no=10), strict=False)
    self.assertEqual(type(ds1.metadata).__name__, 'SubreadSetMetadata')
    ds2 = ds1.copy()
    self.assertEqual(type(ds2.metadata).__name__, 'SubreadSetMetadata')
    # Lets try casting
    ds1 = DataSet(data.getBam())
    self.assertEqual(type(ds1).__name__, 'DataSet')
    ds1 = ds1.copy(asType='SubreadSet')
    self.assertEqual(type(ds1).__name__, 'SubreadSet')
    # Lets do some illicit casting
    with self.assertRaises(TypeError):
        ds1 = ds1.copy(asType='ReferenceSet')
    # Lets try not having to cast
    ds1 = SubreadSet(data.getBam())
    self.assertEqual(type(ds1).__name__, 'SubreadSet')
def test_refWindows(self):
    """refWindows should reflect split-generated and hand-built filters."""
    ds = AlignmentSet(data.getBam())
    dss = ds.split(chunks=2, contigs=True)
    self.assertEqual(len(dss), 2)
    log.debug(dss[0].filters)
    log.debug(dss[1].filters)
    needle = '( rname = E.faecalis.2 '
    self.assertTrue(needle in str(dss[0].filters) or
                    needle in str(dss[1].filters))

    ds = AlignmentSet(data.getBam())
    ds.filters.addRequirement(
        rname=[('=', 'E.faecalis.2'), ('=', 'E.faecalis.2')],
        tStart=[('<', '99'), ('<', '299')],
        tEnd=[('>', '0'), ('>', '100')])
    self.assertEqual(str(ds.filters),
                     '( rname = E.faecalis.2 AND tstart '
                     '< 99 AND tend > 0 ) OR ( rname = '
                     'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
    self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
                                     ('E.faecalis.2', 100, 299)])
def test_readGroupTable(self):
    """Merged readGroupTable should hold one row per input BAM."""
    aln = AlignmentSet(data.getBam(0), data.getBam(1), data.getBam(2))
    readers = aln.resourceReaders()
    # Each individual resource contributes exactly one read group:
    for idx in range(3):
        self.assertEqual(len(readers[idx].readGroupTable), 1)
    self.assertEqual(len(aln.readGroupTable), 3)
def test_referenceInfoTable(self):
    """referenceInfoTable should merge (and de-duplicate names) across resources."""
    aln = AlignmentSet(data.getBam(0), data.getBam(1), data.getBam(2))
    readers = aln.resourceReaders()
    self.assertEqual(len(readers[0].referenceInfoTable), 1)
    self.assertEqual(len(readers[1].referenceInfoTable), 59)
    self.assertEqual(len(readers[2].referenceInfoTable), 1)
    # Resources 0 and 2 refer to the same reference:
    self.assertEqual(readers[0].referenceInfoTable.Name,
                     readers[2].referenceInfoTable.Name)
    # 1 + 59 + 1 with one shared name -> 60 merged rows:
    self.assertEqual(len(aln.referenceInfoTable), 60)
def test_updateCounts(self):
    """updateCounts must agree with per-record tallies, with and without filters.

    Fixes: removed a dead unfiltered re-tally of expLen (its result was
    immediately overwritten); count() helper no longer shadows its own name.
    """
    log.info("Testing updateCounts without filters")
    aln = AlignmentSet(data.getBam(0))
    readers = aln.resourceReaders()
    expLen = 0
    for reader in readers:
        for record in reader:
            expLen += record.readLength
            # Sanity-check the .pbi index against the record itself:
            self.assertEqual(
                record.aStart, record.bam.pbi[record.rowNumber]['aStart'])
            self.assertEqual(
                record.aEnd, record.bam.pbi[record.rowNumber]['aEnd'])
    expNum = 0
    for reader in readers:
        expNum += len(reader)
    accLen = aln.metadata.totalLength
    accNum = aln.metadata.numRecords
    self.assertEqual(expLen, accLen)
    self.assertEqual(expNum, accNum)

    log.info("Testing whether filters are respected")
    aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    aln.updateCounts()
    accLen = aln.metadata.totalLength
    accNum = aln.metadata.numRecords

    def count(gen):
        # Consume a generator, returning the number of items yielded.
        return sum(1 for _ in gen)

    # Derive the expected filtered counts directly from the indexed BAM:
    bfile = openIndexedAlignmentFile(data.getBam(0))
    rWin = (bfile.referenceInfo('E.faecalis.1').ID,
            0,
            bfile.referenceInfo('E.faecalis.1').Length)
    expNum = count(bfile.readsInRange(*rWin))
    expLen = 0
    for read in bfile.readsInRange(*rWin):
        expLen += read.readLength
    self.assertEqual(expLen, accLen)
    self.assertEqual(expNum, accNum)
def test_updateCounts(self):
    """updateCounts must agree with per-record tallies, with and without filters.

    Fixes: removed a dead unfiltered re-tally of expLen (its result was
    immediately overwritten); count() helper no longer shadows its own name.
    """
    log.info("Testing updateCounts without filters")
    aln = AlignmentSet(data.getBam(0))
    readers = aln.resourceReaders()
    expLen = 0
    for reader in readers:
        for record in reader:
            expLen += record.readLength
            # Sanity-check the .pbi index against the record itself:
            self.assertEqual(record.aStart,
                             record.bam.pbi[record.rowNumber]['aStart'])
            self.assertEqual(record.aEnd,
                             record.bam.pbi[record.rowNumber]['aEnd'])
    expNum = 0
    for reader in readers:
        expNum += len(reader)
    accLen = aln.metadata.totalLength
    accNum = aln.metadata.numRecords
    self.assertEqual(expLen, accLen)
    self.assertEqual(expNum, accNum)

    log.info("Testing whether filters are respected")
    aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    aln.updateCounts()
    accLen = aln.metadata.totalLength
    accNum = aln.metadata.numRecords

    def count(gen):
        # Consume a generator, returning the number of items yielded.
        return sum(1 for _ in gen)

    # Derive the expected filtered counts directly from the indexed BAM:
    bfile = openIndexedAlignmentFile(data.getBam(0))
    rWin = (bfile.referenceInfo('E.faecalis.1').ID,
            0,
            bfile.referenceInfo('E.faecalis.1').Length)
    expNum = count(bfile.readsInRange(*rWin))
    expLen = 0
    for read in bfile.readsInRange(*rWin):
        expLen += read.readLength
    self.assertEqual(expLen, accLen)
    self.assertEqual(expNum, accNum)
def test_write(self):
    """Round-trip an AlignmentSet through XML on disk and compare."""
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outfile = os.path.join(outdir, 'tempfile.xml')
    ds1 = AlignmentSet(data.getBam())
    ds1.write(outfile)
    log.debug('Validated file: {f}'.format(f=outfile))
    validateFile(outfile)
    ds2 = AlignmentSet(outfile)
    self.assertTrue(ds1 == ds2)
    # Should fail when strict:
    # NOTE(review): nothing here passes strict=True or asserts a failure —
    # confirm the original intent of this tail.
    ds3 = AlignmentSet(data.getBam())
    ds3.write(outfile)
def test_loading_reference(self):
    """A reference FASTA is attachable at open time or via addReference."""
    log.info('Opening Reference')
    ref_fname = ReferenceSet(data.getRef()).toExternalFiles()[0]
    log.info('Done Opening Reference')
    log.info('Opening AlignmentSet')
    aln = AlignmentSet(data.getBam(), referenceFastaFname=ref_fname)
    log.info('Done Opening AlignmentSet')
    bfile = openIndexedAlignmentFile(data.getBam(),
                                     referenceFastaFname=ref_fname)
    self.assertTrue(bfile.isReferenceLoaded)
    for res in aln.resourceReaders():
        self.assertTrue(res.isReferenceLoaded)
    # Attach after the fact:
    aln2 = AlignmentSet(data.getBam())
    aln2.addReference(ref_fname)
    for res in aln2.resourceReaders():
        self.assertTrue(res.isReferenceLoaded)
def test_refLengths(self):
    """Spot-check refLengths, then the record-bearing subset after a split."""
    ds = DataSet(data.getBam(0))
    spot_checks = {'B.cereus.6': 1472,
                   'S.agalactiae.1': 1470,
                   'B.cereus.4': 1472}
    for name, length in spot_checks.items():
        self.assertEqual(ds.refLengths[name], length)
    # this is a hack to only emit refNames that actually have records
    # associated with them:
    dss = ds.split(contigs=True, chunks=1)[0]
    expected = {'B.vulgatus.4': 1449,
                'B.vulgatus.5': 1449,
                'C.beijerinckii.13': 1433,
                'C.beijerinckii.14': 1433,
                'C.beijerinckii.9': 1433,
                'E.coli.6': 1463,
                'E.faecalis.1': 1482,
                'E.faecalis.2': 1482,
                'R.sphaeroides.1': 1386,
                'S.epidermidis.2': 1472,
                'S.epidermidis.3': 1472,
                'S.epidermidis.4': 1472}
    self.assertEqual(dss.refLengths, expected)
def test_repr(self):
    """str(DataSet) should mention the type, a uuid, and the resource file."""
    rep = str(DataSet(data.getBam()))
    for token in ('DataSet', 'uuid:', 'pbalchemysim0.pbalign.bam'):
        self.assertTrue(re.search(token, rep))
def test_referenceInfo(self):
    """referenceInfo lookup by name should return the expected table row."""
    aln = AlignmentSet(data.getBam(0))
    reader = aln.resourceReaders()[0]
    self.assertEqual(len(reader.referenceInfoTable), 59)
    self.assertEqual(
        str(reader.referenceInfo('E.faecalis.1')),
        "(27, 27, 'E.faecalis.1', 'E.faecalis.1', 1482, "
        "'a1a59c267ac1341e5a12bce7a7d37bcb', 0L, 0L)")
def test_attributes(self):
    """Basic scalar and per-record attributes of an open AlignmentSet."""
    aln = AlignmentSet(data.getBam(0))
    self.assertEqual(aln.sequencingChemistry, ['unknown'])
    self.assertEqual(aln.isSorted, True)
    self.assertEqual(aln.isEmpty, False)
    self.assertEqual(aln.readType, 'standard')
    # One tStart/tEnd entry per record:
    num_records = aln.metadata.numRecords
    self.assertEqual(len(aln.tStart), num_records)
    self.assertEqual(len(aln.tEnd), num_records)
def test_refWindows(self):
    """Filter strings produced by splitting and by addRequirement."""
    ds = DataSet(data.getBam())
    dss = ds.split(chunks=2, contigs=True)
    self.assertEqual(len(dss), 2)
    log.debug(dss[0].filters)
    log.debug(dss[1].filters)
    needle = '( rname = E.faecalis.2 ) '
    self.assertTrue(needle in str(dss[0].filters) or
                    needle in str(dss[1].filters))

    ds = DataSet(data.getBam())
    ds.filters.addRequirement(
        rname=[('=', 'lambda_NEB3011'), ('=', 'lambda_NEB3011')],
        tStart=[('<', '0'), ('<', '100')],
        tEnd=[('>', '99'), ('>', '299')])
    self.assertEqual(
        str(ds.filters),
        '( rname = lambda_NEB3011 AND tstart '
        '< 0 AND tend > 99 ) OR ( rname = lambd'
        'a_NEB3011 AND tstart < 100 AND tend > 299 )')
def test_updateCounts_without_pbi(self):
    """Without a .pbi index, counts should stay at the -1 sentinel."""
    log.info("Testing updateCounts without pbi")
    data_fname = data.getBam(0)
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    tempout = os.path.join(outdir, os.path.basename(data_fname))
    # Copy only the BAM (not its index) into the scratch dir:
    backticks('cp {i} {o}'.format(i=data_fname, o=tempout))
    aln = AlignmentSet(tempout, strict=False)
    self.assertEqual(aln.totalLength, -1)
    self.assertEqual(aln.numRecords, -1)
def test_split_by_contigs_with_split(self):
    """Reference windows survive contig splitting at several chunk counts.

    Fix: manual found-flag scans replaced with assertIn (same pass/fail
    semantics, and the failure message names the missing window).
    """
    # test to make sure the refWindows work when chunks == # refs
    ds3 = DataSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = sorted(reduce(lambda x, y: x + y,
                               [ds.refWindows for ds in dss]))
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    random_few = [('C.beijerinckii.13', 0, 1433),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 1482)]
    for reference in random_few:
        self.assertIn(reference, refWindows)
    old_refWindows = refWindows

    dss = ds3.split(contigs=True, chunks=1)
    self.assertEqual(len(dss), 1)
    refWindows = sorted(reduce(lambda x, y: x + y,
                               [ds.refWindows for ds in dss]))
    self.assertEqual(refWindows, old_refWindows)

    dss = ds3.split(contigs=True, chunks=24)
    self.assertEqual(len(dss), 24)
    refWindows = sorted(reduce(lambda x, y: x + y,
                               [ds.refWindows for ds in dss]))
    random_few = [('E.faecalis.2', 0, 741),
                  ('E.faecalis.2', 741, 1482)]
    for ref in random_few:
        self.assertIn(ref, refWindows)

    dss = ds3.split(contigs=True, chunks=36)
    self.assertEqual(len(dss), 36)
    refWindows = sorted(reduce(lambda x, y: x + y,
                               [ds.refWindows for ds in dss]))
    random_few = [('E.faecalis.2', 0, 494),
                  ('E.faecalis.2', 494, 988),
                  ('E.faecalis.2', 988, 1482)]
    for ref in random_few:
        self.assertIn(ref, refWindows)
def test_refWindows(self):
    """refWindows should reflect split-generated and hand-built filters."""
    ds = AlignmentSet(data.getBam())
    halves = ds.split(chunks=2, contigs=True)
    self.assertEqual(len(halves), 2)
    log.debug(halves[0].filters)
    log.debug(halves[1].filters)
    needle = '( rname = E.faecalis.2 '
    self.assertTrue(needle in str(halves[0].filters) or
                    needle in str(halves[1].filters))

    ds = AlignmentSet(data.getBam())
    ds.filters.addRequirement(
        rname=[('=', 'E.faecalis.2'), ('=', 'E.faecalis.2')],
        tStart=[('<', '99'), ('<', '299')],
        tEnd=[('>', '0'), ('>', '100')])
    self.assertEqual(
        str(ds.filters),
        '( rname = E.faecalis.2 AND tstart '
        '< 99 AND tend > 0 ) OR ( rname = '
        'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
    self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
                                     ('E.faecalis.2', 100, 299)])
def test_reads_in_range(self):
    """readsInRange by reference name and by numeric ID must agree."""
    ds = DataSet(data.getBam())
    rn = ds.refNames[15]
    self.assertEqual(len(list(ds.readsInRange(rn, 10, 100))), 10)

    ds2 = DataSet(data.getBam(0))
    self.assertEqual(len(list(ds2.readsInRange("E.faecalis.1", 0, 1400))),
                     20)

    lengths = ds.refLengths
    for rname, rId in ds.refInfo('ID'):
        # Name <-> ID mapping must round-trip:
        self.assertEqual(rname, ds._idToRname(rId))
        rlen = lengths[rname]
        self.assertEqual(len(list(ds.readsInReference(rname))),
                         len(list(ds.readsInReference(rId))))
        self.assertEqual(len(list(ds.readsInRange(rname, 0, rlen))),
                         len(list(ds.readsInRange(rId, 0, rlen))))
def test_split_by_contigs_with_split(self):
    """Reference windows survive contig splitting at several chunk counts.

    Fix: manual found-flag scans replaced with assertIn (same pass/fail
    semantics, and the failure message names the missing window).
    """
    # test to make sure the refWindows work when chunks == # refs
    ds3 = AlignmentSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    random_few = [('C.beijerinckii.13', 0, 1433),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 1482)]
    for reference in random_few:
        self.assertIn(reference, refWindows)
    old_refWindows = refWindows

    dss = ds3.split(contigs=True, chunks=1)
    self.assertEqual(len(dss), 1)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    self.assertEqual(refWindows, old_refWindows)

    dss = ds3.split(contigs=True, chunks=24)
    self.assertEqual(len(dss), 24)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    random_few = [('E.faecalis.2', 0, 741),
                  ('E.faecalis.2', 741, 1482)]
    for ref in random_few:
        self.assertIn(ref, refWindows)

    dss = ds3.split(contigs=True, chunks=36)
    self.assertEqual(len(dss), 36)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    random_few = [('E.faecalis.2', 0, 494),
                  ('E.faecalis.2', 494, 988),
                  ('E.faecalis.2', 988, 1482)]
    for ref in random_few:
        self.assertIn(ref, refWindows)
def test_copy(self):
    """copy() must deep-copy contents, mint a new UUID, and support casting.

    Fixes: deprecated assertEquals -> assertEqual; bare `assert` (stripped
    under -O) replaced with a unittest assertion; reduce-with-or -> any().
    """
    ds1 = DataSet(data.getXml())
    ds2 = ds1.copy()
    self.assertFalse(ds1 == ds2)
    self.assertFalse(ds1.uuid == ds2.uuid)
    self.assertFalse(ds1 is ds2)
    self.assertTrue(ds1.name == ds2.name)
    self.assertTrue(ds1.externalResources == ds2.externalResources)
    # The name and UniqueId are different:
    self.assertFalse(ds1.objMetadata == ds2.objMetadata)
    self.assertTrue(ds1.filters == ds2.filters)
    self.assertTrue(ds1.subdatasets == ds2.subdatasets)
    self.assertTrue(len(ds1.subdatasets) == 2)
    self.assertTrue(len(ds2.subdatasets) == 2)
    # No subdataset object may be shared (by identity) between the copy
    # and the original:
    self.assertFalse(any(ds1d is ds2d
                         for ds1d in ds1.subdatasets
                         for ds2d in ds2.subdatasets))
    # TODO: once simulated files are indexable, turn on strict:
    ds1 = SubreadSet(data.getXml(no=10), strict=False)
    self.assertEqual(type(ds1.metadata).__name__, 'SubreadSetMetadata')
    ds2 = ds1.copy()
    self.assertEqual(type(ds2.metadata).__name__, 'SubreadSetMetadata')
    # Lets try casting
    ds1 = DataSet(data.getBam())
    self.assertEqual(type(ds1).__name__, 'DataSet')
    ds1 = ds1.copy(asType='SubreadSet')
    self.assertEqual(type(ds1).__name__, 'SubreadSet')
    # Lets do some illicit casting
    with self.assertRaises(TypeError):
        ds1 = ds1.copy(asType='ReferenceSet')
    # Lets try not having to cast
    ds1 = SubreadSet(data.getBam())
    self.assertEqual(type(ds1).__name__, 'SubreadSet')
def test_build(self):
    """DataSet construction from BAM/XML/FOFN inputs, casting, resources.

    Fix: deprecated assertEquals -> assertEqual.
    """
    # Progs like pbalign provide a .bam file:
    # e.g. d = DataSet("aligned.bam")
    # Something like the test files we have:
    inBam = data.getBam()
    self.assertTrue(inBam.endswith('.bam'))
    d = DataSet(inBam)
    # A UniqueId is generated, despite being a BAM input
    self.assertTrue(d.uuid != '')
    dOldUuid = d.uuid
    # They can write this BAM to an XML:
    # e.g. d.write("alignmentset.xml")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outXml = os.path.join(outdir, 'tempfile.xml')
    d.write(outXml)
    # And then recover the same XML (or a different one):
    # e.g. d = DataSet("alignmentset.xml")
    d = DataSet(outXml)
    # The UniqueId will be the same
    self.assertTrue(d.uuid == dOldUuid)
    # Inputs can be many and varied
    ds1 = DataSet(data.getXml(11), data.getBam())
    self.assertEqual(ds1.numExternalResources, 2)
    ds1 = DataSet(data.getFofn())
    self.assertEqual(ds1.numExternalResources, 2)
    # New! Use the correct constructor:
    self.assertEqual(type(SubreadSet(data.getSubreadSet())).__name__,
                     'SubreadSet')
    # Even with untyped inputs
    self.assertTrue(str(SubreadSet(data.getBam())).startswith(
        '<SubreadSet'))
    self.assertEqual(type(SubreadSet(data.getBam())).__name__,
                     'SubreadSet')
    self.assertEqual(type(DataSet(data.getBam())).__name__, 'DataSet')
    # You can also cast up and down, but casting between siblings
    # is limited (abuse at your own risk)
    self.assertEqual(
        type(DataSet(data.getBam()).copy(asType='SubreadSet')).__name__,
        'SubreadSet')
    self.assertEqual(
        type(SubreadSet(data.getBam()).copy(asType='DataSet')).__name__,
        'DataSet')
    # Add external Resources:
    ds = DataSet()
    ds.externalResources.addResources(["IdontExist.bam"])
    self.assertTrue(
        ds.externalResources[-1].resourceId == "IdontExist.bam")
    # Add an index file
    ds.externalResources[-1].addIndices(["IdontExist.bam.pbi"])
    self.assertTrue(
        ds.externalResources[-1].indices[0].resourceId ==
        "IdontExist.bam.pbi")
def test_referenceInfoTableMerging(self):
    """After merging, ref IDs stay dense and names union across resources."""
    log.info("Testing refIds, etc. after merging")
    ds = DataSet(data.getXml(17))
    also_lambda = ds.toExternalFiles()[0]
    aln = AlignmentSet(data.getBam(0), data.getBam(0), also_lambda)
    readers = aln.resourceReaders()
    ids = sorted([i for _, i in aln.refInfo('ID')])
    # Merged IDs must be a dense 0..n-1 sequence:
    self.assertEqual(range(len(ids)), ids)
    # Both short and full names must be the de-duplicated union:
    for column, accNames in (('Name', aln.refNames),
                             ('FullName', aln.fullRefNames)):
        expNames = reduce(np.append,
                          [reader.referenceInfoTable[column]
                           for reader in readers])
        expNames = np.unique(expNames)
        self.assertEqual(sorted(expNames), sorted(accNames))
def test_referenceInfoTableMerging(self):
    """After merging, ref IDs stay dense and names union across resources."""
    log.info("Testing refIds, etc. after merging")
    ds = DataSet(data.getXml(17))
    also_lambda = ds.toExternalFiles()[0]
    aln = AlignmentSet(data.getBam(0), data.getBam(0), also_lambda)
    readers = aln.resourceReaders()
    ids = sorted([i for _, i in aln.refInfo('ID')])
    # Merged IDs must be a dense 0..n-1 sequence:
    self.assertEqual(range(len(ids)), ids)
    # Both short and full names must be the de-duplicated union:
    for column, accNames in (('Name', aln.refNames),
                             ('FullName', aln.fullRefNames)):
        expNames = np.unique(
            reduce(np.append,
                   [reader.referenceInfoTable[column]
                    for reader in readers]))
        self.assertEqual(sorted(expNames), sorted(accNames))
def test_refWindows(self):
    """Filter strings produced by splitting and by addRequirement."""
    ds = DataSet(data.getBam())
    halves = ds.split(chunks=2, contigs=True)
    self.assertEqual(len(halves), 2)
    log.debug(halves[0].filters)
    log.debug(halves[1].filters)
    needle = '( rname = E.faecalis.2 ) '
    self.assertTrue(needle in str(halves[0].filters) or
                    needle in str(halves[1].filters))

    ds = DataSet(data.getBam())
    ds.filters.addRequirement(
        rname=[('=', 'lambda_NEB3011'), ('=', 'lambda_NEB3011')],
        tStart=[('<', '0'), ('<', '100')],
        tEnd=[('>', '99'), ('>', '299')])
    self.assertEqual(str(ds.filters),
                     '( rname = lambda_NEB3011 AND tstart '
                     '< 0 AND tend > 99 ) OR ( rname = lambd'
                     'a_NEB3011 AND tstart < 100 AND tend > 299 )')
def test_factory_function(self):
    """openDataSet should return the correctly-typed object for each input."""
    bam = data.getBam()
    aln = data.getXml(8)
    ref = data.getXml(9)
    sub = data.getXml(10)
    cases = zip([bam, aln, ref, sub],
                [DataSet, AlignmentSet, ReferenceSet, SubreadSet])
    for infn, exp in cases:
        # TODO enable this for all when simulated subread files can be
        # pbi'd
        if exp in [DataSet, ReferenceSet, AlignmentSet]:
            ds = openDataSet(infn, strict=True)
        else:
            ds = openDataSet(infn)
        self.assertEqual(type(ds), exp)
def test_build(self):
    """DataSet construction from BAM/XML/FOFN inputs, casting, resources.

    Fix: deprecated assertEquals -> assertEqual.
    """
    # Progs like pbalign provide a .bam file:
    # e.g. d = DataSet("aligned.bam")
    # Something like the test files we have:
    inBam = data.getBam()
    self.assertTrue(inBam.endswith('.bam'))
    d = DataSet(inBam)
    # A UniqueId is generated, despite being a BAM input
    self.assertTrue(d.uuid != '')
    dOldUuid = d.uuid
    # They can write this BAM to an XML:
    # e.g. d.write("alignmentset.xml")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outXml = os.path.join(outdir, 'tempfile.xml')
    d.write(outXml)
    # And then recover the same XML (or a different one):
    # e.g. d = DataSet("alignmentset.xml")
    d = DataSet(outXml)
    # The UniqueId will be the same
    self.assertTrue(d.uuid == dOldUuid)
    # Inputs can be many and varied
    ds1 = DataSet(data.getXml(11), data.getBam())
    self.assertEqual(ds1.numExternalResources, 2)
    ds1 = DataSet(data.getFofn())
    self.assertEqual(ds1.numExternalResources, 2)
    # New! Use the correct constructor:
    self.assertEqual(
        type(SubreadSet(data.getSubreadSet())).__name__, 'SubreadSet')
    # Even with untyped inputs
    self.assertTrue(
        str(SubreadSet(data.getBam())).startswith('<SubreadSet'))
    self.assertEqual(
        type(SubreadSet(data.getBam())).__name__, 'SubreadSet')
    self.assertEqual(type(DataSet(data.getBam())).__name__, 'DataSet')
    # You can also cast up and down, but casting between siblings
    # is limited (abuse at your own risk)
    self.assertEqual(
        type(DataSet(data.getBam()).copy(asType='SubreadSet')).__name__,
        'SubreadSet')
    self.assertEqual(
        type(SubreadSet(data.getBam()).copy(asType='DataSet')).__name__,
        'DataSet')
    # Add external Resources:
    ds = DataSet()
    ds.externalResources.addResources(["IdontExist.bam"])
    self.assertTrue(
        ds.externalResources[-1].resourceId == "IdontExist.bam")
    # Add an index file
    ds.externalResources[-1].addIndices(["IdontExist.bam.pbi"])
    self.assertTrue(ds.externalResources[-1].indices[0].resourceId ==
                    "IdontExist.bam.pbi")
def test_refLengths(self):
    """Spot-check refLengths, then the record-bearing subset after a split."""
    ds = DataSet(data.getBam(0))
    spot_checks = {'B.cereus.6': 1472,
                   'S.agalactiae.1': 1470,
                   'B.cereus.4': 1472}
    for name, length in spot_checks.items():
        self.assertEqual(ds.refLengths[name], length)
    # this is a hack to only emit refNames that actually have records
    # associated with them:
    dss = ds.split(contigs=True, chunks=1)[0]
    self.assertEqual(dss.refLengths,
                     {'B.vulgatus.4': 1449,
                      'B.vulgatus.5': 1449,
                      'C.beijerinckii.13': 1433,
                      'C.beijerinckii.14': 1433,
                      'C.beijerinckii.9': 1433,
                      'E.coli.6': 1463,
                      'E.faecalis.1': 1482,
                      'E.faecalis.2': 1482,
                      'R.sphaeroides.1': 1386,
                      'S.epidermidis.2': 1472,
                      'S.epidermidis.3': 1472,
                      'S.epidermidis.4': 1472})
def test_split_by_contigs_with_split_and_maxChunks(self):
    """Contig splitting with maxChunks, breakContigs, targetSize, byRecords.

    Fix: manual found-flag scans (and their log.debug on miss) replaced
    with assertIn — same pass/fail, and the failure names the window.
    """
    # test to make sure the refWindows work when chunks == # refs
    ds3 = AlignmentSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    self.assertEqual(refWindows, [('B.vulgatus.4', 0, 1449),
                                  ('B.vulgatus.5', 0, 1449),
                                  ('C.beijerinckii.13', 0, 1433),
                                  ('C.beijerinckii.14', 0, 1433),
                                  ('C.beijerinckii.9', 0, 1433),
                                  ('E.coli.6', 0, 1463),
                                  ('E.faecalis.1', 0, 1482),
                                  ('E.faecalis.2', 0, 1482),
                                  ('R.sphaeroides.1', 0, 1386),
                                  ('S.epidermidis.2', 0, 1472),
                                  ('S.epidermidis.3', 0, 1472),
                                  ('S.epidermidis.4', 0, 1472)])
    old_refWindows = refWindows
    random_few = [('C.beijerinckii.13', 0, 1433),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 1482)]

    dss = ds3.split(contigs=True, maxChunks=1)
    self.assertEqual(len(dss), 1)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    self.assertEqual(refWindows, old_refWindows)

    dss = ds3.split(contigs=True, maxChunks=24)
    # This isn't expected if num refs >= 100, as map check isn't made
    # for now (too expensive)
    # There are only 12 refs represented in this set, however...
    self.assertEqual(len(dss), 12)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with maxchunks but no breaking contigs
    dss = ds3.split(contigs=True, maxChunks=36)
    self.assertEqual(len(dss), 12)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with maxchunks and breaking contigs is allowed (triggers
    # targetsize, may result in fewer chunks)
    dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
    self.assertEqual(len(dss), 2)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with previous setup and smaller targetSize, resulting in more
    # chunks
    dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True,
                    targetSize=10)
    self.assertEqual(len(dss), 9)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with byRecords and fewer chunks than atoms
    dss = ds3.split(contigs=True, chunks=3, byRecords=True)
    self.assertEqual(len(dss), 3)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with byRecords and more chunks than atoms
    orf = random_few
    random_few = [('C.beijerinckii.13', 0, 747),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 742)]
    dss = ds3.split(contigs=True, chunks=16, byRecords=True)
    self.assertEqual(len(dss), 16)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with byRecords and updateCounts
    random_few = orf
    dss = ds3.split(contigs=True, chunks=3, byRecords=True,
                    updateCounts=True)
    self.assertEqual(len(dss), 3)
    sizes = sorted([dset.numRecords for dset in dss])
    self.assertListEqual(sizes, [30, 31, 31])
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with byRefLength and updateCounts
    random_few = orf
    dss = ds3.split(contigs=True, chunks=3, updateCounts=True)
    self.assertEqual(len(dss), 3)
    sizes = sorted([dset.numRecords for dset in dss])
    self.assertListEqual(sizes, [20, 24, 48])
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)
def test_stats_metadata(self):
    """Summary stats: loading, additive merging of distributions (aligned,
    shifted, and non-overlapping bins), and subdataset retention."""
    ds = DataSet(data.getBam())
    ds.loadStats(data.getStats())
    self.assertEqual(ds.metadata.summaryStats.prodDist.numBins, 4)
    self.assertEqual(ds.metadata.summaryStats.prodDist.bins,
                     [1576, 901, 399, 0])
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds3 = ds1 + ds2
    self.assertEqual(ds1.metadata.summaryStats.prodDist.bins,
                     [1576, 901, 399, 0])
    self.assertEqual(ds2.metadata.summaryStats.prodDist.bins,
                     [1576, 901, 399, 0])
    # The merged distribution is the elementwise sum:
    self.assertEqual(ds3.metadata.summaryStats.prodDist.bins,
                     [3152, 1802, 798, 0])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [0, 62, 39, 36, 29, 37, 19, 29, 37, 32, 32, 40, 45,
                      54, 73, 77, 97, 95, 49, 17, 2, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [0, 62, 39, 36, 29, 37, 19, 29, 37, 32, 32, 40, 45,
                      54, 73, 77, 97, 95, 49, 17, 2, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0])
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [0, 124, 78, 72, 58, 74, 38, 58, 74, 64, 64, 80, 90,
                      108, 146, 154, 194, 190, 98, 34, 4, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0, 0])
    # Lets check some manual values
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds1.metadata.summaryStats.readLenDist.bins = (
        [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    ds1.metadata.summaryStats.readLenDist.minBinValue = 10
    ds1.metadata.summaryStats.readLenDist.binWidth = 10
    ds2.metadata.summaryStats.readLenDist.bins = (
        [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    ds2.metadata.summaryStats.readLenDist.minBinValue = 20
    ds2.metadata.summaryStats.readLenDist.binWidth = 10
    ds3 = ds1 + ds2
    # ds2 is offset by one bin relative to ds1:
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [0, 10, 10, 9, 8, 7, 5, 3, 2, 1, 0, 1, 1])
    # now lets swap
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds1.metadata.summaryStats.readLenDist.bins = (
        [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    ds1.metadata.summaryStats.readLenDist.minBinValue = 20
    ds1.metadata.summaryStats.readLenDist.binWidth = 10
    ds2.metadata.summaryStats.readLenDist.bins = (
        [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    ds2.metadata.summaryStats.readLenDist.minBinValue = 10
    ds2.metadata.summaryStats.readLenDist.binWidth = 10
    ds3 = ds1 + ds2
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [0, 1, 11, 10, 9, 8, 7, 5, 3, 1, 0, 1, 1])
    # now lets do some non-overlapping
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds1.metadata.summaryStats.readLenDist.bins = (
        [1, 1, 1])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [1, 1, 1])
    ds1.metadata.summaryStats.readLenDist.minBinValue = 10
    ds1.metadata.summaryStats.readLenDist.binWidth = 10
    ds2.metadata.summaryStats.readLenDist.bins = (
        [2, 2, 2])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [2, 2, 2])
    ds2.metadata.summaryStats.readLenDist.minBinValue = 50
    ds2.metadata.summaryStats.readLenDist.binWidth = 10
    ds3 = ds1 + ds2
    # A zero-filled gap bin separates the two ranges:
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [1, 1, 1, 0, 2, 2, 2])
    # now lets test the subdataset metadata retention:
    ss = SubreadSet(data.getXml(10))
    ss.loadStats(data.getStats(0))
    ss.loadStats(data.getStats(1))
    self.assertEqual(153168.0,
                     ss.metadata.summaryStats.numSequencingZmws)
    self.assertEqual(
        2876.0,
        ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
    self.assertEqual(
        150292.0,
        ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
def test_resourceReaders(self):
    """Every resource reader should yield all 92 records."""
    ds = DataSet(data.getBam())
    for reader in ds.resourceReaders():
        self.assertEqual(sum(1 for _ in reader), 92)
def test_ref_names(self):
    """59 references; alphabetically first is A.baumannii.1."""
    refNames = DataSet(data.getBam()).refNames
    self.assertEqual(sorted(refNames)[0], 'A.baumannii.1')
    self.assertEqual(len(refNames), 59)
def test_nonempty_metatype(self):
    """AlignmentSet external resources carry the subread BAM metatype."""
    aln = AlignmentSet(data.getBam())
    for ext_res in aln.externalResources:
        self.assertEqual(ext_res.metaType,
                         "PacBio.SubreadFile.SubreadBamFile")
def test_split_by_contigs_with_split_and_maxChunks(self):
    """maxChunks / breakContigs splitting keeps the expected windows.

    Fix: manual found-flag scans (and their log.debug on miss) replaced
    with assertIn — same pass/fail, and the failure names the window.
    """
    # test to make sure the refWindows work when chunks == # refs
    ds3 = DataSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    random_few = [('C.beijerinckii.13', 0, 1433),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 1482)]
    for reference in random_few:
        self.assertIn(reference, refWindows)
    old_refWindows = refWindows

    dss = ds3.split(contigs=True, maxChunks=1)
    self.assertEqual(len(dss), 1)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    self.assertEqual(refWindows, old_refWindows)

    dss = ds3.split(contigs=True, maxChunks=24)
    # This isn't expected if num refs >= 100, as map check isn't made
    # for now (too expensive)
    # There are only 12 refs represented in this set, however...
    self.assertEqual(len(dss), 12)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)

    dss = ds3.split(contigs=True, maxChunks=36)
    self.assertEqual(len(dss), 12)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)

    dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
    self.assertEqual(len(dss), 2)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)
def test_empty_metatype(self):
    """A plain DataSet leaves its external-resource metatypes empty."""
    dset = DataSet(data.getBam())
    for resource in dset.externalResources:
        self.assertEqual(resource.metaType, "")
def test_alignmentset_split_by_contigs_with_maxChunks(self):
    """Split an AlignmentSet by contigs: maxChunks, breakContigs, targetSize,
    byRecords, and updateCounts variations.

    NOTE(review): renamed from test_split_by_contigs_with_split_and_maxChunks
    because three methods in this class shared that name; duplicates shadow
    each other and only the last definition was ever run.
    """

    def all_windows(dsets):
        # Flatten and sort the refWindows across every chunk.
        return sorted(reduce(lambda x, y: x + y,
                             [ds.refWindows for ds in dsets]))

    # test to make sure the refWindows work when chunks == # refs
    ds3 = AlignmentSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = all_windows(dss)
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    self.assertEqual(refWindows,
                     [('B.vulgatus.4', 0, 1449),
                      ('B.vulgatus.5', 0, 1449),
                      ('C.beijerinckii.13', 0, 1433),
                      ('C.beijerinckii.14', 0, 1433),
                      ('C.beijerinckii.9', 0, 1433),
                      ('E.coli.6', 0, 1463),
                      ('E.faecalis.1', 0, 1482),
                      ('E.faecalis.2', 0, 1482),
                      ('R.sphaeroides.1', 0, 1386),
                      ('S.epidermidis.2', 0, 1472),
                      ('S.epidermidis.3', 0, 1472),
                      ('S.epidermidis.4', 0, 1472)])
    old_refWindows = refWindows
    random_few = [('C.beijerinckii.13', 0, 1433),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 1482)]

    # maxChunks=1 collapses to one chunk with the same overall windows
    dss = ds3.split(contigs=True, maxChunks=1)
    self.assertEqual(len(dss), 1)
    self.assertEqual(all_windows(dss), old_refWindows)

    dss = ds3.split(contigs=True, maxChunks=24)
    # This isn't expected if num refs >= 100, as map check isn't made
    # for now (too expensive)
    # There are only 12 refs represented in this set, however...
    self.assertEqual(len(dss), 12)
    refWindows = all_windows(dss)
    # assertIn replaces the manual found-flag scan (and its log.debug on
    # miss): same check, clearer failure message
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with maxchunks but no breaking contigs
    dss = ds3.split(contigs=True, maxChunks=36)
    self.assertEqual(len(dss), 12)
    refWindows = all_windows(dss)
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with maxchunks and breaking contigs is allowed (triggers
    # targetsize, may result in fewer chunks)
    dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
    self.assertEqual(len(dss), 2)
    refWindows = all_windows(dss)
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with previous setup and smaller targetSize, resulting in more
    # chunks
    dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True,
                    targetSize=10)
    self.assertEqual(len(dss), 9)
    refWindows = all_windows(dss)
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with byRecords and fewer chunks than atoms
    dss = ds3.split(contigs=True, chunks=3, byRecords=True)
    self.assertEqual(len(dss), 3)
    refWindows = all_windows(dss)
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with byRecords and more chunks than atoms
    orf = random_few
    random_few = [('C.beijerinckii.13', 0, 747),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 742)]
    dss = ds3.split(contigs=True, chunks=16, byRecords=True)
    self.assertEqual(len(dss), 16)
    refWindows = all_windows(dss)
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with byRecords and updateCounts
    random_few = orf
    dss = ds3.split(contigs=True, chunks=3, byRecords=True,
                    updateCounts=True)
    self.assertEqual(len(dss), 3)
    sizes = sorted([dset.numRecords for dset in dss])
    self.assertListEqual(sizes, [30, 31, 31])
    refWindows = all_windows(dss)
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # test with byRefLength and updateCounts
    random_few = orf
    dss = ds3.split(contigs=True, chunks=3, updateCounts=True)
    self.assertEqual(len(dss), 3)
    sizes = sorted([dset.numRecords for dset in dss])
    self.assertListEqual(sizes, [20, 24, 48])
    refWindows = all_windows(dss)
    for ref in random_few:
        self.assertIn(ref, refWindows)
def test_split_by_contigs_with_split_and_maxChunks(self):
    """Split a DataSet by contigs under several maxChunks settings.

    NOTE(review): this is the last of three same-named definitions in the
    class and therefore the one that actually runs; the earlier duplicates
    are shadowed. The manual found-flag membership scans were replaced with
    assertIn, which performs the same check with a useful failure message.
    """
    # test to make sure the refWindows work when chunks == # refs
    ds3 = DataSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    random_few = [('C.beijerinckii.13', 0, 1433),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 1482)]
    for reference in random_few:
        self.assertIn(reference, refWindows)
    old_refWindows = refWindows

    # maxChunks=1 collapses to a single chunk with the same overall windows
    dss = ds3.split(contigs=True, maxChunks=1)
    self.assertEqual(len(dss), 1)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    self.assertEqual(refWindows, old_refWindows)

    dss = ds3.split(contigs=True, maxChunks=24)
    # This isn't expected if num refs >= 100, as map check isn't made
    # for now (too expensive)
    # There are only 12 refs represented in this set, however...
    self.assertEqual(len(dss), 12)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # maxChunks above the ref count without breakContigs still caps at one
    # chunk per ref
    dss = ds3.split(contigs=True, maxChunks=36)
    self.assertEqual(len(dss), 12)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)

    # breakContigs triggers targetSize-based chunking, which may yield
    # fewer chunks
    dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
    self.assertEqual(len(dss), 2)
    refWindows = sorted(
        reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
    for ref in random_few:
        self.assertIn(ref, refWindows)
def test_stats_metadata(self):
    """Summary stats load from a stats XML and merge correctly on dataset +."""
    ds = DataSet(data.getBam())
    ds.loadStats(data.getStats())
    self.assertEqual(ds.metadata.summaryStats.prodDist.numBins, 4)
    self.assertEqual(ds.metadata.summaryStats.prodDist.bins,
                     [1576, 901, 399, 0])
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds3 = ds1 + ds2
    # The inputs are unchanged by +; the merged distribution is the
    # element-wise sum of the two (each value below is exactly doubled):
    self.assertEqual(ds1.metadata.summaryStats.prodDist.bins,
                     [1576, 901, 399, 0])
    self.assertEqual(ds2.metadata.summaryStats.prodDist.bins,
                     [1576, 901, 399, 0])
    self.assertEqual(ds3.metadata.summaryStats.prodDist.bins,
                     [3152, 1802, 798, 0])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [0, 62, 39, 36, 29, 37, 19, 29, 37, 32, 32, 40, 45,
                      54, 73, 77, 97, 95, 49, 17, 2, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [0, 62, 39, 36, 29, 37, 19, 29, 37, 32, 32, 40, 45,
                      54, 73, 77, 97, 95, 49, 17, 2, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0])
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [0, 124, 78, 72, 58, 74, 38, 58, 74, 64, 64, 80, 90,
                      108, 146, 154, 194, 190, 98, 34, 4, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0, 0])
    # Lets check some manual values
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds1.metadata.summaryStats.readLenDist.bins = (
        [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    ds1.metadata.summaryStats.readLenDist.minBinValue = 10
    ds1.metadata.summaryStats.readLenDist.binWidth = 10
    ds2.metadata.summaryStats.readLenDist.bins = (
        [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    ds2.metadata.summaryStats.readLenDist.minBinValue = 20
    ds2.metadata.summaryStats.readLenDist.binWidth = 10
    ds3 = ds1 + ds2
    # ds2 starts one bin later (minBinValue 20 vs 10, binWidth 10), so its
    # bins are offset by one position before the element-wise sum:
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [0, 10, 10, 9, 8, 7, 5, 3, 2, 1, 0, 1, 1])
    # now lets swap
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds1.metadata.summaryStats.readLenDist.bins = (
        [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    ds1.metadata.summaryStats.readLenDist.minBinValue = 20
    ds1.metadata.summaryStats.readLenDist.binWidth = 10
    ds2.metadata.summaryStats.readLenDist.bins = (
        [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    ds2.metadata.summaryStats.readLenDist.minBinValue = 10
    ds2.metadata.summaryStats.readLenDist.binWidth = 10
    ds3 = ds1 + ds2
    # Same offset merge, mirrored: now ds1 is the shifted one.
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [0, 1, 11, 10, 9, 8, 7, 5, 3, 1, 0, 1, 1])
    # now lets do some non-overlapping
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds1.metadata.summaryStats.readLenDist.bins = ([1, 1, 1])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [1, 1, 1])
    ds1.metadata.summaryStats.readLenDist.minBinValue = 10
    ds1.metadata.summaryStats.readLenDist.binWidth = 10
    ds2.metadata.summaryStats.readLenDist.bins = ([2, 2, 2])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [2, 2, 2])
    ds2.metadata.summaryStats.readLenDist.minBinValue = 50
    ds2.metadata.summaryStats.readLenDist.binWidth = 10
    ds3 = ds1 + ds2
    # Ranges don't overlap (10-30 vs 50-70); the gap bin is zero-filled:
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [1, 1, 1, 0, 2, 2, 2])
    # now lets test the subdataset metadata retention:
    ss = SubreadSet(data.getXml(10))
    ss.loadStats(data.getStats(0))
    ss.loadStats(data.getStats(1))
    # Parent holds the merged total; each subdataset keeps its own stats
    # (2876 + 150292 == 153168):
    self.assertEqual(153168.0, ss.metadata.summaryStats.numSequencingZmws)
    self.assertEqual(
        2876.0, ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
    self.assertEqual(
        150292.0,
        ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)