def test_split(self):
    """Check chunk counts from DataSet.split and a merge/split round trip.

    The first half splits a multi-resource dataset with various ``chunks``
    settings; the second half merges two datasets and checks that splitting
    the merged set in two recovers the original total lengths.
    """
    ds1 = DataSet(data.getXml())
    self.assertGreater(ds1.numExternalResources, 1)
    # A bare split() is expected to give one chunk per external resource.
    dss = ds1.split()
    self.assertEqual(len(dss), ds1.numExternalResources)
    dss = ds1.split(chunks=1)
    self.assertEqual(len(dss), 1)
    dss = ds1.split(chunks=2, ignoreSubDatasets=True)
    self.assertEqual(len(dss), 2)
    # Chunks must get distinct uuids while keeping the same name.
    self.assertNotEqual(dss[0].uuid, dss[1].uuid)
    self.assertEqual(dss[0].name, dss[1].name)

    # Lets try merging and splitting on subdatasets
    ds1 = DataSet(data.getXml(8))
    self.assertEqual(ds1.totalLength, 123588)
    ds1tl = ds1.totalLength
    ds2 = DataSet(data.getXml(11))
    self.assertEqual(ds2.totalLength, 117086)
    ds2tl = ds2.totalLength
    dss = ds1 + ds2
    self.assertEqual(dss.totalLength, ds1tl + ds2tl)
    # Sort largest-first so the chunks line up with ds1 (larger) and ds2.
    ds1, ds2 = sorted(dss.split(2),
                      key=lambda x: x.totalLength,
                      reverse=True)
    self.assertEqual(ds1.totalLength, ds1tl)
    self.assertEqual(ds2.totalLength, ds2tl)
def test_split_by_contigs_with_split(self):
    """Check refWindows after contig splits with explicit chunk counts.

    Splits the test BAM dataset by contig with the default settings and
    with ``chunks`` of 1, 24 and 36, asserting the number of chunks and
    that specific (name, start, end) windows appear in the combined
    refWindows of the chunks.
    """
    def collect_windows(chunks):
        # Flatten the refWindows of every chunk into one sorted list.
        return sorted(window
                      for chunk in chunks
                      for window in chunk.refWindows)

    # test to make sure the refWindows work when chunks == # refs
    ds3 = DataSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = collect_windows(dss)
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    random_few = [('C.beijerinckii.13', 0, 1433),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 1482)]
    for reference in random_few:
        self.assertIn(reference, refWindows)
    old_refWindows = refWindows

    # A single chunk must carry the same windows as the 12-way split.
    dss = ds3.split(contigs=True, chunks=1)
    self.assertEqual(len(dss), 1)
    refWindows = collect_windows(dss)
    self.assertEqual(refWindows, old_refWindows)

    # 24 chunks: E.faecalis.2 is expected to be cut into two windows.
    dss = ds3.split(contigs=True, chunks=24)
    self.assertEqual(len(dss), 24)
    refWindows = collect_windows(dss)
    random_few = [('E.faecalis.2', 0, 741),
                  ('E.faecalis.2', 741, 1482)]
    for ref in random_few:
        if ref not in refWindows:
            log.debug(ref)
        self.assertIn(ref, refWindows)

    # 36 chunks: E.faecalis.2 is expected to be cut into three windows.
    dss = ds3.split(contigs=True, chunks=36)
    self.assertEqual(len(dss), 36)
    refWindows = collect_windows(dss)
    random_few = [('E.faecalis.2', 0, 494),
                  ('E.faecalis.2', 494, 988),
                  ('E.faecalis.2', 988, 1482)]
    for ref in random_few:
        self.assertIn(ref, refWindows)
def test_split_by_contigs_with_split(self):
    """Check refWindows after contig splits with explicit chunk counts.

    Splits the test BAM dataset by contig with the default settings and
    with ``chunks`` of 1, 24 and 36, asserting the number of chunks and
    that specific (name, start, end) windows appear in the combined
    refWindows of the chunks.
    """
    def collect_windows(chunks):
        # Flatten the refWindows of every chunk into one sorted list.
        return sorted(window
                      for chunk in chunks
                      for window in chunk.refWindows)

    # test to make sure the refWindows work when chunks == # refs
    ds3 = DataSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = collect_windows(dss)
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    random_few = [('C.beijerinckii.13', 0, 1433),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 1482)]
    for reference in random_few:
        self.assertIn(reference, refWindows)
    old_refWindows = refWindows

    # A single chunk must carry the same windows as the 12-way split.
    dss = ds3.split(contigs=True, chunks=1)
    self.assertEqual(len(dss), 1)
    refWindows = collect_windows(dss)
    self.assertEqual(refWindows, old_refWindows)

    # 24 chunks: E.faecalis.2 is expected to be cut into two windows.
    dss = ds3.split(contigs=True, chunks=24)
    self.assertEqual(len(dss), 24)
    refWindows = collect_windows(dss)
    random_few = [('E.faecalis.2', 0, 741),
                  ('E.faecalis.2', 741, 1482)]
    for ref in random_few:
        if ref not in refWindows:
            log.debug(ref)
        self.assertIn(ref, refWindows)

    # 36 chunks: E.faecalis.2 is expected to be cut into three windows.
    dss = ds3.split(contigs=True, chunks=36)
    self.assertEqual(len(dss), 36)
    refWindows = collect_windows(dss)
    random_few = [('E.faecalis.2', 0, 494),
                  ('E.faecalis.2', 494, 988),
                  ('E.faecalis.2', 988, 1482)]
    for ref in random_few:
        self.assertIn(ref, refWindows)
def test_refLengths(self):
    """Check refLengths on the whole BAM dataset and on a contig chunk."""
    ds = DataSet(data.getBam(0))

    # Spot-check a few reference lengths on the unsplit dataset.
    spot_checks = {
        'B.cereus.6': 1472,
        'S.agalactiae.1': 1470,
        'B.cereus.4': 1472,
    }
    for ref_name in spot_checks:
        self.assertEqual(ds.refLengths[ref_name], spot_checks[ref_name])

    # this is a hack to only emit refNames that actually have records
    # associated with them:
    single_chunk = ds.split(contigs=True, chunks=1)[0]
    expected_lengths = {
        'B.vulgatus.4': 1449,
        'B.vulgatus.5': 1449,
        'C.beijerinckii.13': 1433,
        'C.beijerinckii.14': 1433,
        'C.beijerinckii.9': 1433,
        'E.coli.6': 1463,
        'E.faecalis.1': 1482,
        'E.faecalis.2': 1482,
        'R.sphaeroides.1': 1386,
        'S.epidermidis.2': 1472,
        'S.epidermidis.3': 1472,
        'S.epidermidis.4': 1472,
    }
    self.assertEqual(single_chunk.refLengths, expected_lengths)
def test_reads_in_subdataset(self):
    """Split dataset 8 by contig; check chunk filter values and read counts.

    Each chunk's ``filters[0][0].value`` is compared (sorted) against the
    twelve expected reference names, then the number of reads yielded by
    ``readsInSubDatasets()`` is checked for the first two chunks.
    """
    ds = DataSet(data.getXml(8))
    #refs = ['E.faecalis.1', 'E.faecalis.2']
    #readRefs = ['E.faecalis.1'] * 2 + ['E.faecalis.2'] * 9
    #ds.filters.removeRequirement('rname')
    dss = ds.split(contigs=True)
    self.assertEqual(len(dss), 12)

    expected_names = [
        'B.vulgatus.4',
        'B.vulgatus.5',
        'C.beijerinckii.13',
        'C.beijerinckii.14',
        'C.beijerinckii.9',
        'E.coli.6',
        'E.faecalis.1',
        'E.faecalis.2',
        'R.sphaeroides.1',
        'S.epidermidis.2',
        'S.epidermidis.3',
        'S.epidermidis.4',
    ]
    chunk_names = sorted(chunk.filters[0][0].value for chunk in dss)
    self.assertEqual(expected_names, chunk_names)

    first_chunk_reads = list(dss[0].readsInSubDatasets())
    self.assertEqual(len(first_chunk_reads), 3)
    second_chunk_reads = list(dss[1].readsInSubDatasets())
    self.assertEqual(len(second_chunk_reads), 20)
def test_reads_in_subdataset(self):
    """Split dataset 8 by contig; check chunk filter values and read counts.

    Each chunk's ``filters[0][0].value`` is compared (sorted) against the
    twelve expected reference names, then the number of reads yielded by
    ``readsInSubDatasets()`` is checked for the first two chunks.
    """
    ds = DataSet(data.getXml(8))
    #refs = ['E.faecalis.1', 'E.faecalis.2']
    #readRefs = ['E.faecalis.1'] * 2 + ['E.faecalis.2'] * 9
    #ds.filters.removeRequirement('rname')
    dss = ds.split(contigs=True)
    self.assertEqual(len(dss), 12)

    expected_names = [
        'B.vulgatus.4',
        'B.vulgatus.5',
        'C.beijerinckii.13',
        'C.beijerinckii.14',
        'C.beijerinckii.9',
        'E.coli.6',
        'E.faecalis.1',
        'E.faecalis.2',
        'R.sphaeroides.1',
        'S.epidermidis.2',
        'S.epidermidis.3',
        'S.epidermidis.4',
    ]
    chunk_names = sorted(chunk.filters[0][0].value for chunk in dss)
    self.assertEqual(expected_names, chunk_names)

    first_chunk_reads = list(dss[0].readsInSubDatasets())
    self.assertEqual(len(first_chunk_reads), 3)
    second_chunk_reads = list(dss[1].readsInSubDatasets())
    self.assertEqual(len(second_chunk_reads), 20)
def splitXml(args):
    """Split the dataset in ``args.infile`` and write each chunk to a file.

    Attributes read from ``args``: ``infile``, ``strict``, ``outfiles``,
    ``chunks``, ``subdatasets``, ``contigs``, ``maxChunks``,
    ``breakContigs`` and ``outdir``.

    When ``args.outfiles`` is empty it is replaced with generated names of
    the form ``<infile minus last extension>.chunk<i>[contigs].xml``; if
    ``args.outdir`` is set, the generated names are relocated into that
    directory (basename only).
    """
    log.debug("Starting split")
    dataSet = DataSet(args.infile, strict=args.strict)
    # An explicit chunk count wins; otherwise ask for one chunk per
    # requested output file (tolerating outfiles being None).
    chunks = args.chunks or len(args.outfiles or [])
    dss = dataSet.split(chunks=chunks,
                        ignoreSubDatasets=(not args.subdatasets),
                        contigs=args.contigs,
                        maxChunks=args.maxChunks,
                        breakContigs=args.breakContigs)
    log.debug("Split into {i} chunks".format(i=len(dss)))

    infix = 'chunk{i}'
    if args.contigs:
        infix += 'contigs'

    if not args.outfiles:
        # Drop the last dot-separated component of the input name and
        # append "<infix>.xml" in its place, one name per chunk.
        stem = args.infile.split('.')[:-1]
        outfiles = ['.'.join(stem + [infix.format(i=chNum), 'xml'])
                    for chNum in range(len(dss))]
        if args.outdir:
            outfiles = [os.path.join(args.outdir, os.path.basename(outfn))
                        for outfn in outfiles]
        args.outfiles = outfiles

        # Only show the first few names to keep the log line short.
        end = '...' if len(dss) > 5 else ''
        log.debug("Emitting {f} {e}".format(
            f=', '.join(outfiles[:5]), e=end))

    log.debug("Finished splitting, now writing")
    # NOTE(review): zip() stops at the shorter sequence, so user-supplied
    # outfiles that do not match len(dss) silently drop chunks or names.
    for out_fn, dset in zip(args.outfiles, dss):
        dset.write(out_fn)
    log.debug("Done writing files")
def test_refWindows(self):
    """Check filter strings after a contig split and after addRequirement."""
    ds = DataSet(data.getBam())
    dss = ds.split(chunks=2, contigs=True)
    self.assertEqual(len(dss), 2)
    log.debug(dss[0].filters)
    log.debug(dss[1].filters)
    # One of the two chunks must carry the E.faecalis.2 rname filter.
    needle = '( rname = E.faecalis.2 ) '
    self.assertTrue(any(needle in str(chunk.filters) for chunk in dss))

    # Two requirements per field are expected to render as two
    # AND-groups joined by OR.
    ds = DataSet(data.getBam())
    ds.filters.addRequirement(
        rname=[('=', 'lambda_NEB3011'), ('=', 'lambda_NEB3011')],
        tStart=[('<', '0'), ('<', '100')],
        tEnd=[('>', '99'), ('>', '299')])
    expected = ('( rname = lambda_NEB3011 AND tstart < 0 AND tend > 99 ) OR '
                '( rname = lambda_NEB3011 AND tstart < 100 AND tend > 299 )')
    self.assertEqual(str(ds.filters), expected)
def splitXml(args):
    """Split the dataset in ``args.infile`` and write each chunk to a file.

    Attributes read from ``args``: ``infile``, ``strict``, ``outfiles``,
    ``chunks``, ``subdatasets``, ``contigs``, ``maxChunks``,
    ``breakContigs`` and ``outdir``.

    When ``args.outfiles`` is empty it is replaced with generated names of
    the form ``<infile minus last extension>.chunk<i>[contigs].xml``; if
    ``args.outdir`` is set, the generated names are relocated into that
    directory (basename only).
    """
    log.debug("Starting split")
    dataSet = DataSet(args.infile, strict=args.strict)
    # An explicit chunk count wins; otherwise ask for one chunk per
    # requested output file (tolerating outfiles being None).
    chunks = args.chunks or len(args.outfiles or [])
    dss = dataSet.split(chunks=chunks,
                        ignoreSubDatasets=(not args.subdatasets),
                        contigs=args.contigs,
                        maxChunks=args.maxChunks,
                        breakContigs=args.breakContigs)
    log.debug("Split into {i} chunks".format(i=len(dss)))

    infix = 'chunk{i}'
    if args.contigs:
        infix += 'contigs'

    if not args.outfiles:
        # Drop the last dot-separated component of the input name and
        # append "<infix>.xml" in its place, one name per chunk.
        stem = args.infile.split('.')[:-1]
        outfiles = ['.'.join(stem + [infix.format(i=chNum), 'xml'])
                    for chNum in range(len(dss))]
        if args.outdir:
            outfiles = [os.path.join(args.outdir, os.path.basename(outfn))
                        for outfn in outfiles]
        args.outfiles = outfiles

        # Only show the first few names to keep the log line short.
        end = '...' if len(dss) > 5 else ''
        log.debug("Emitting {f} {e}".format(
            f=', '.join(outfiles[:5]), e=end))

    log.debug("Finished splitting, now writing")
    # NOTE(review): zip() stops at the shorter sequence, so user-supplied
    # outfiles that do not match len(dss) silently drop chunks or names.
    for out_fn, dset in zip(args.outfiles, dss):
        dset.write(out_fn)
    log.debug("Done writing files")
def test_refLengths(self):
    """Check refLengths on the whole BAM dataset and on a contig chunk."""
    ds = DataSet(data.getBam(0))

    # Spot-check a few reference lengths on the unsplit dataset.
    spot_checks = {
        'B.cereus.6': 1472,
        'S.agalactiae.1': 1470,
        'B.cereus.4': 1472,
    }
    for ref_name in spot_checks:
        self.assertEqual(ds.refLengths[ref_name], spot_checks[ref_name])

    # this is a hack to only emit refNames that actually have records
    # associated with them:
    single_chunk = ds.split(contigs=True, chunks=1)[0]
    expected_lengths = {
        'B.vulgatus.4': 1449,
        'B.vulgatus.5': 1449,
        'C.beijerinckii.13': 1433,
        'C.beijerinckii.14': 1433,
        'C.beijerinckii.9': 1433,
        'E.coli.6': 1463,
        'E.faecalis.1': 1482,
        'E.faecalis.2': 1482,
        'R.sphaeroides.1': 1386,
        'S.epidermidis.2': 1472,
        'S.epidermidis.3': 1472,
        'S.epidermidis.4': 1472,
    }
    self.assertEqual(single_chunk.refLengths, expected_lengths)
def test_refWindows(self):
    """Check filter strings after a contig split and after addRequirement."""
    ds = DataSet(data.getBam())
    dss = ds.split(chunks=2, contigs=True)
    self.assertEqual(len(dss), 2)
    log.debug(dss[0].filters)
    log.debug(dss[1].filters)
    # One of the two chunks must carry the E.faecalis.2 rname filter.
    needle = '( rname = E.faecalis.2 ) '
    self.assertTrue(any(needle in str(chunk.filters) for chunk in dss))

    # Two requirements per field are expected to render as two
    # AND-groups joined by OR.
    ds = DataSet(data.getBam())
    ds.filters.addRequirement(
        rname=[('=', 'lambda_NEB3011'), ('=', 'lambda_NEB3011')],
        tStart=[('<', '0'), ('<', '100')],
        tEnd=[('>', '99'), ('>', '299')])
    expected = ('( rname = lambda_NEB3011 AND tstart < 0 AND tend > 99 ) OR '
                '( rname = lambda_NEB3011 AND tstart < 100 AND tend > 299 )')
    self.assertEqual(str(ds.filters), expected)
def test_reads_in_contig(self):
    """Check per-reference read counts across contig-split chunks.

    Dataset 8 is split by contig three times: unfiltered, with a
    ``length > 100`` filter and with a ``length > 1000`` filter. For each
    split, the reads in E.faecalis.1 and E.faecalis.2 must show up in
    exactly one chunk each, with the expected totals.
    """
    refs = ('E.faecalis.1', 'E.faecalis.2')

    def tally(chunks):
        # Map each reference to (number of chunks with reads, total reads);
        # references with no reads in any chunk are left out.
        found = {}
        for chunk in chunks:
            for ref in refs:
                count = len(list(chunk.readsInReference(ref)))
                if count:
                    times, total = found.get(ref, (0, 0))
                    found[ref] = (times + 1, total + count)
        return found

    log.info("Testing reads in contigs")
    ds = DataSet(data.getXml(8))
    dss = ds.split(contigs=True)
    self.assertEqual(len(dss), 12)
    self.assertEqual(tally(dss),
                     {'E.faecalis.1': (1, 20), 'E.faecalis.2': (1, 3)})

    # A permissive length filter must not change the counts.
    ds = DataSet(data.getXml(8))
    filt = Filters()
    filt.addRequirement(length=[('>', '100')])
    ds.addFilters(filt)
    dss = ds.split(contigs=True)
    self.assertEqual(len(dss), 12)
    self.assertEqual(tally(dss),
                     {'E.faecalis.1': (1, 20), 'E.faecalis.2': (1, 3)})

    # A stricter length filter leaves 9 chunks and fewer E.faecalis.2 reads.
    ds = DataSet(data.getXml(8))
    filt = Filters()
    filt.addRequirement(length=[('>', '1000')])
    ds.addFilters(filt)
    dss = ds.split(contigs=True)
    self.assertEqual(len(dss), 9)
    self.assertEqual(tally(dss),
                     {'E.faecalis.1': (1, 20), 'E.faecalis.2': (1, 1)})
def test_split_by_contigs_with_split_and_maxChunks(self):
    """Check refWindows after contig splits bounded by ``maxChunks``.

    Splits the test BAM dataset by contig with the default settings, with
    ``maxChunks`` of 1, 24 and 36, and with ``maxChunks=36`` plus
    ``breakContigs=True``, asserting the chunk count each time and that
    the same whole-contig windows appear in the combined refWindows.
    """
    def collect_windows(chunks):
        # Flatten the refWindows of every chunk into one sorted list.
        return sorted(window
                      for chunk in chunks
                      for window in chunk.refWindows)

    # test to make sure the refWindows work when chunks == # refs
    ds3 = DataSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = collect_windows(dss)
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    random_few = [('C.beijerinckii.13', 0, 1433),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 1482)]
    for reference in random_few:
        self.assertIn(reference, refWindows)
    old_refWindows = refWindows

    dss = ds3.split(contigs=True, maxChunks=1)
    self.assertEqual(len(dss), 1)
    refWindows = collect_windows(dss)
    self.assertEqual(refWindows, old_refWindows)

    dss = ds3.split(contigs=True, maxChunks=24)
    # This isn't expected if num refs >= 100, as map check isn't made
    # for now (too expensive)
    # There are only 12 refs represented in this set, however...
    self.assertEqual(len(dss), 12)
    refWindows = collect_windows(dss)
    for ref in random_few:
        if ref not in refWindows:
            log.debug(ref)
        self.assertIn(ref, refWindows)

    dss = ds3.split(contigs=True, maxChunks=36)
    self.assertEqual(len(dss), 12)
    refWindows = collect_windows(dss)
    for ref in random_few:
        self.assertIn(ref, refWindows)

    dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
    self.assertEqual(len(dss), 2)
    refWindows = collect_windows(dss)
    for ref in random_few:
        self.assertIn(ref, refWindows)
def test_split_by_contigs_with_split_and_maxChunks(self):
    """Check refWindows after contig splits bounded by ``maxChunks``.

    Splits the test BAM dataset by contig with the default settings, with
    ``maxChunks`` of 1, 24 and 36, and with ``maxChunks=36`` plus
    ``breakContigs=True``, asserting the chunk count each time and that
    the same whole-contig windows appear in the combined refWindows.
    """
    def collect_windows(chunks):
        # Flatten the refWindows of every chunk into one sorted list.
        return sorted(window
                      for chunk in chunks
                      for window in chunk.refWindows)

    # test to make sure the refWindows work when chunks == # refs
    ds3 = DataSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = collect_windows(dss)
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    random_few = [('C.beijerinckii.13', 0, 1433),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 1482)]
    for reference in random_few:
        self.assertIn(reference, refWindows)
    old_refWindows = refWindows

    dss = ds3.split(contigs=True, maxChunks=1)
    self.assertEqual(len(dss), 1)
    refWindows = collect_windows(dss)
    self.assertEqual(refWindows, old_refWindows)

    dss = ds3.split(contigs=True, maxChunks=24)
    # This isn't expected if num refs >= 100, as map check isn't made
    # for now (too expensive)
    # There are only 12 refs represented in this set, however...
    self.assertEqual(len(dss), 12)
    refWindows = collect_windows(dss)
    for ref in random_few:
        if ref not in refWindows:
            log.debug(ref)
        self.assertIn(ref, refWindows)

    dss = ds3.split(contigs=True, maxChunks=36)
    self.assertEqual(len(dss), 12)
    refWindows = collect_windows(dss)
    for ref in random_few:
        self.assertIn(ref, refWindows)

    dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
    self.assertEqual(len(dss), 2)
    refWindows = collect_windows(dss)
    for ref in random_few:
        self.assertIn(ref, refWindows)