def test_split_by_contigs_with_split(self):
    # test to make sure the refWindows work when chunks == # refs, and
    # when an explicit chunk count (1, 24, 36) is requested.

    def merged_windows(chunks):
        # Sorted, flattened list of the refWindows of every chunk.
        return sorted(
            window for chunk in chunks for window in chunk.refWindows)

    def assert_windows_present(expected, windows):
        # assertIn names the missing window in the failure message, so no
        # separate debug logging is needed.
        for window in expected:
            self.assertIn(window, windows)

    ds3 = AlignmentSet(data.getBam())

    # Default contig split: the test data yields 12 chunks.
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = merged_windows(dss)
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    assert_windows_present(
        [('C.beijerinckii.13', 0, 1433),
         ('B.vulgatus.4', 0, 1449),
         ('E.faecalis.1', 0, 1482)],
        refWindows)
    old_refWindows = refWindows

    # A single chunk must carry exactly the same windows as the 12 chunks.
    dss = ds3.split(contigs=True, chunks=1)
    self.assertEqual(len(dss), 1)
    self.assertEqual(merged_windows(dss), old_refWindows)

    # 24 chunks: E.faecalis.2 is expected as two windows meeting at 741.
    dss = ds3.split(contigs=True, chunks=24)
    self.assertEqual(len(dss), 24)
    assert_windows_present(
        [('E.faecalis.2', 0, 741),
         ('E.faecalis.2', 741, 1482)],
        merged_windows(dss))

    # 36 chunks: E.faecalis.2 is expected as three consecutive windows.
    dss = ds3.split(contigs=True, chunks=36)
    self.assertEqual(len(dss), 36)
    assert_windows_present(
        [('E.faecalis.2', 0, 494),
         ('E.faecalis.2', 494, 988),
         ('E.faecalis.2', 988, 1482)],
        merged_windows(dss))
def test_split_by_contigs_with_split(self):
    # test to make sure the refWindows work when chunks == # refs, and
    # when an explicit chunk count (1, 24, 36) is requested.

    def merged_windows(chunks):
        # Sorted, flattened list of the refWindows of every chunk.
        return sorted(
            window for chunk in chunks for window in chunk.refWindows)

    def assert_windows_present(expected, windows):
        # assertIn names the missing window in the failure message, so no
        # separate debug logging is needed.
        for window in expected:
            self.assertIn(window, windows)

    ds3 = AlignmentSet(data.getBam())

    # Default contig split: the test data yields 12 chunks.
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = merged_windows(dss)
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    assert_windows_present(
        [('C.beijerinckii.13', 0, 1433),
         ('B.vulgatus.4', 0, 1449),
         ('E.faecalis.1', 0, 1482)],
        refWindows)
    old_refWindows = refWindows

    # A single chunk must carry exactly the same windows as the 12 chunks.
    dss = ds3.split(contigs=True, chunks=1)
    self.assertEqual(len(dss), 1)
    self.assertEqual(merged_windows(dss), old_refWindows)

    # 24 chunks: E.faecalis.2 is expected as two windows meeting at 741.
    dss = ds3.split(contigs=True, chunks=24)
    self.assertEqual(len(dss), 24)
    assert_windows_present(
        [('E.faecalis.2', 0, 741),
         ('E.faecalis.2', 741, 1482)],
        merged_windows(dss))

    # 36 chunks: E.faecalis.2 is expected as three consecutive windows.
    dss = ds3.split(contigs=True, chunks=36)
    self.assertEqual(len(dss), 36)
    assert_windows_present(
        [('E.faecalis.2', 0, 494),
         ('E.faecalis.2', 494, 988),
         ('E.faecalis.2', 988, 1482)],
        merged_windows(dss))
def test_refWindows(self):
    # Part 1: a two-way contig split must produce a chunk whose filter
    # string contains a clause opening on E.faecalis.2.
    aln_set = AlignmentSet(data.getBam())
    halves = aln_set.split(chunks=2, contigs=True)
    self.assertEqual(len(halves), 2)
    for idx in (0, 1):
        log.debug(halves[idx].filters)
    clause_start = '( rname = E.faecalis.2 '
    self.assertTrue(
        any(clause_start in str(halves[idx].filters) for idx in (0, 1)))

    # Part 2: requirements added as parallel lists become two OR'd filter
    # clauses, and refWindows reports one window per clause.
    aln_set = AlignmentSet(data.getBam())
    contig = 'E.faecalis.2'
    aln_set.filters.addRequirement(
        rname=[('=', contig), ('=', contig)],
        tStart=[('<', '99'), ('<', '299')],
        tEnd=[('>', '0'), ('>', '100')])
    expected_filter = ('( rname = E.faecalis.2 AND tstart '
                       '< 99 AND tend > 0 ) OR ( rname = '
                       'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
    self.assertEqual(str(aln_set.filters), expected_filter)
    expected_windows = [('E.faecalis.2', 0, 99),
                        ('E.faecalis.2', 100, 299)]
    self.assertEqual(aln_set.refWindows, expected_windows)
def to_chunked_alignmentset_files(alignmentset_path, reference_path,
                                  max_total_nchunks, chunk_key, dir_name,
                                  base_name, ext, by_zmw):
    """Split an AlignmentSet into chunk files, yielding a PipelineChunk each.

    This is a generator: nothing is read or written until it is iterated.

    :param alignmentset_path: path passed to ``AlignmentSet(..., strict=True)``
    :param reference_path: path passed to ``ReferenceSet(..., strict=True)``
        and stored in every chunk under ``'$chunk.reference_id'``
    :param max_total_nchunks: forwarded to ``split`` as ``maxChunks``
    :param chunk_key: key under which each chunk file's absolute path is
        stored in the PipelineChunk
    :param dir_name: directory the chunk files are written into
    :param base_name: chunk ids are ``<base_name>_<index>``
    :param ext: chunk files are named ``<chunk id>.<ext>``
    :param by_zmw: if true, split with ``zmws=True``; otherwise split with
        ``contigs=True, breakContigs=True``
    """
    alignments = AlignmentSet(alignmentset_path, strict=True)
    if by_zmw:
        split_options = dict(zmws=True, maxChunks=max_total_nchunks)
    else:
        split_options = dict(contigs=True, maxChunks=max_total_nchunks,
                             breakContigs=True)
    pieces = alignments.split(**split_options)

    # sanity checking: the ReferenceSet is constructed but not used further
    ReferenceSet(reference_path, strict=True)

    for index, piece in enumerate(pieces):
        chunk_id = '_'.join((base_name, str(index)))
        chunk_path = os.path.join(dir_name, '.'.join((chunk_id, ext)))
        _add_chunked_tag_if_missing(piece)
        piece.write(chunk_path)
        chunk_fields = {}
        chunk_fields[chunk_key] = os.path.abspath(chunk_path)
        chunk_fields['$chunk.reference_id'] = reference_path
        yield PipelineChunk(chunk_id, **chunk_fields)
def to_chunked_alignmentset_files(alignmentset_path, reference_path,
                                  max_total_nchunks, chunk_key, dir_name,
                                  base_name, ext):
    """Split an AlignmentSet by contig into chunk files, yielding PipelineChunks.

    This is a generator: nothing is read or written until it is iterated.

    :param alignmentset_path: path passed to ``AlignmentSet(..., strict=True)``
    :param reference_path: path passed to ``ReferenceSet(..., strict=True)``
        and stored in every chunk under ``'$chunk.reference_id'``
    :param max_total_nchunks: forwarded to ``split`` as ``maxChunks``
    :param chunk_key: key under which each chunk file's absolute path is
        stored in the PipelineChunk
    :param dir_name: directory the chunk files are written into
    :param base_name: chunk ids are ``<base_name>_<index>``
    :param ext: chunk files are named ``<chunk id>.<ext>``
    """
    alignments = AlignmentSet(alignmentset_path, strict=True)
    pieces = alignments.split(contigs=True, maxChunks=max_total_nchunks,
                              breakContigs=True)

    # sanity checking: the ReferenceSet is constructed but not used further
    ReferenceSet(reference_path, strict=True)

    for index, piece in enumerate(pieces):
        chunk_id = '_'.join((base_name, str(index)))
        chunk_path = os.path.join(dir_name, '.'.join((chunk_id, ext)))
        piece.write(chunk_path)
        chunk_fields = {}
        chunk_fields[chunk_key] = os.path.abspath(chunk_path)
        chunk_fields['$chunk.reference_id'] = reference_path
        yield PipelineChunk(chunk_id, **chunk_fields)
def test_refWindows(self):
    # Part 1: a two-way contig split must produce a chunk whose filter
    # string contains a clause opening on E.faecalis.2.
    aln_set = AlignmentSet(data.getBam())
    halves = aln_set.split(chunks=2, contigs=True)
    self.assertEqual(len(halves), 2)
    for idx in (0, 1):
        log.debug(halves[idx].filters)
    clause_start = '( rname = E.faecalis.2 '
    self.assertTrue(
        any(clause_start in str(halves[idx].filters) for idx in (0, 1)))

    # Part 2: requirements added as parallel lists become two OR'd filter
    # clauses, and refWindows reports one window per clause.
    aln_set = AlignmentSet(data.getBam())
    contig = 'E.faecalis.2'
    aln_set.filters.addRequirement(
        rname=[('=', contig), ('=', contig)],
        tStart=[('<', '99'), ('<', '299')],
        tEnd=[('>', '0'), ('>', '100')])
    expected_filter = ('( rname = E.faecalis.2 AND tstart '
                       '< 99 AND tend > 0 ) OR ( rname = '
                       'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
    self.assertEqual(str(aln_set.filters), expected_filter)
    expected_windows = [('E.faecalis.2', 0, 99),
                        ('E.faecalis.2', 100, 299)]
    self.assertEqual(aln_set.refWindows, expected_windows)
def run(alignmentset, referenceset, fofn, max_nchunks):
    """Scatter an AlignmentSet into chunk XML files and list them in a FOFN.

    The alignment set is split by contig (``breakContigs=True``) into at
    most ``max_nchunks`` chunks. Each chunk is written to
    ``chunk_alignmentset_<i>.alignmentset.xml`` relative to the current
    working directory, and the chunk file names are written to ``fofn``,
    one per line.

    :param alignmentset: path passed to ``AlignmentSet(..., strict=True)``
    :param referenceset: path passed to ``ReferenceSet(..., strict=True)``;
        used only for sanity checking
    :param fofn: path of the file-of-file-names to write
    :param max_nchunks: forwarded to ``split`` as ``maxChunks``
    """
    # Usage hint:
    # 'python -m pbcoretools.tasks.scatter_alignments_reference alignment_ds ds_reference json_out'
    alignments = AlignmentSet(alignmentset, strict=True)
    chunks = alignments.split(contigs=True, maxChunks=max_nchunks,
                              breakContigs=True)

    # referenceset is used only for sanity checking.
    ReferenceSet(referenceset, strict=True)

    chunk_fns = []
    for i, chunk in enumerate(chunks):
        # File name only, i.e. relative to CWD.
        chunk_fn = 'chunk_alignmentset_{}.alignmentset.xml'.format(i)
        # Remove any existing file of the same name before writing.
        if os.path.exists(chunk_fn):
            os.unlink(chunk_fn)
        chunk.write(chunk_fn, relPaths=True)
        chunk_fns.append(chunk_fn)

    with open(fofn, 'w') as ofs:
        ofs.writelines('{}\n'.format(fn) for fn in chunk_fns)
    log.info('Wrote {} chunks into "{}"'.format(len(chunks), fofn))
def test_split_by_contigs_with_split_and_maxChunks(self):
    # test to make sure the refWindows work when chunks == # refs, and
    # under the maxChunks / breakContigs / targetSize / byRecords /
    # updateCounts split options.

    def merged_windows(chunks):
        # Sorted, flattened list of the refWindows of every chunk.
        return sorted(
            window for chunk in chunks for window in chunk.refWindows)

    def assert_windows_present(expected, chunks):
        # assertIn names the missing window in the failure message, so no
        # separate debug logging is needed.
        windows = merged_windows(chunks)
        for window in expected:
            self.assertIn(window, windows)

    ds3 = AlignmentSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = merged_windows(dss)
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    self.assertEqual(refWindows,
                     [('B.vulgatus.4', 0, 1449),
                      ('B.vulgatus.5', 0, 1449),
                      ('C.beijerinckii.13', 0, 1433),
                      ('C.beijerinckii.14', 0, 1433),
                      ('C.beijerinckii.9', 0, 1433),
                      ('E.coli.6', 0, 1463),
                      ('E.faecalis.1', 0, 1482),
                      ('E.faecalis.2', 0, 1482),
                      ('R.sphaeroides.1', 0, 1386),
                      ('S.epidermidis.2', 0, 1472),
                      ('S.epidermidis.3', 0, 1472),
                      ('S.epidermidis.4', 0, 1472)])
    old_refWindows = refWindows

    # Windows expected in most of the scenarios below.
    whole_contig_windows = [('C.beijerinckii.13', 0, 1433),
                            ('B.vulgatus.4', 0, 1449),
                            ('E.faecalis.1', 0, 1482)]
    # Windows expected only in the 16-chunk byRecords scenario.
    byrecords_16_windows = [('C.beijerinckii.13', 0, 747),
                            ('B.vulgatus.4', 0, 1449),
                            ('E.faecalis.1', 0, 742)]

    dss = ds3.split(contigs=True, maxChunks=1)
    self.assertEqual(len(dss), 1)
    self.assertEqual(merged_windows(dss), old_refWindows)

    dss = ds3.split(contigs=True, maxChunks=24)
    # This isn't expected if num refs >= 100, as map check isn't made
    # for now (too expensive)
    # There are only 12 refs represented in this set, however...
    self.assertEqual(len(dss), 12)
    assert_windows_present(whole_contig_windows, dss)

    # test with maxchunks but no breaking contigs
    dss = ds3.split(contigs=True, maxChunks=36)
    self.assertEqual(len(dss), 12)
    assert_windows_present(whole_contig_windows, dss)

    # test with maxchunks and breaking contigs is allowed (triggers
    # targetsize, may result in fewer chunks)
    dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
    self.assertEqual(len(dss), 2)
    assert_windows_present(whole_contig_windows, dss)

    # test with previous setup and smaller targetSize, resulting in more
    # chunks
    dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True,
                    targetSize=10)
    self.assertEqual(len(dss), 9)
    assert_windows_present(whole_contig_windows, dss)

    # test with byRecords and fewer chunks than atoms
    dss = ds3.split(contigs=True, chunks=3, byRecords=True)
    self.assertEqual(len(dss), 3)
    assert_windows_present(whole_contig_windows, dss)

    # test with byRecords and more chunks than atoms
    dss = ds3.split(contigs=True, chunks=16, byRecords=True)
    self.assertEqual(len(dss), 16)
    assert_windows_present(byrecords_16_windows, dss)

    # test with byRecords and updateCounts
    dss = ds3.split(contigs=True, chunks=3, byRecords=True,
                    updateCounts=True)
    self.assertEqual(len(dss), 3)
    sizes = sorted([dset.numRecords for dset in dss])
    self.assertListEqual(sizes, [30, 31, 31])
    assert_windows_present(whole_contig_windows, dss)

    # test with byRefLength and updateCounts
    dss = ds3.split(contigs=True, chunks=3, updateCounts=True)
    self.assertEqual(len(dss), 3)
    sizes = sorted([dset.numRecords for dset in dss])
    self.assertListEqual(sizes, [20, 24, 48])
    assert_windows_present(whole_contig_windows, dss)
def test_split_by_contigs_with_split_and_maxChunks(self):
    # test to make sure the refWindows work when chunks == # refs, and
    # under the maxChunks / breakContigs / targetSize / byRecords /
    # updateCounts split options.

    def merged_windows(chunks):
        # Sorted, flattened list of the refWindows of every chunk.
        return sorted(
            window for chunk in chunks for window in chunk.refWindows)

    def assert_windows_present(expected, chunks):
        # assertIn names the missing window in the failure message, so no
        # separate debug logging is needed.
        windows = merged_windows(chunks)
        for window in expected:
            self.assertIn(window, windows)

    ds3 = AlignmentSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = merged_windows(dss)
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    self.assertEqual(refWindows,
                     [('B.vulgatus.4', 0, 1449),
                      ('B.vulgatus.5', 0, 1449),
                      ('C.beijerinckii.13', 0, 1433),
                      ('C.beijerinckii.14', 0, 1433),
                      ('C.beijerinckii.9', 0, 1433),
                      ('E.coli.6', 0, 1463),
                      ('E.faecalis.1', 0, 1482),
                      ('E.faecalis.2', 0, 1482),
                      ('R.sphaeroides.1', 0, 1386),
                      ('S.epidermidis.2', 0, 1472),
                      ('S.epidermidis.3', 0, 1472),
                      ('S.epidermidis.4', 0, 1472)])
    old_refWindows = refWindows

    # Windows expected in most of the scenarios below.
    whole_contig_windows = [('C.beijerinckii.13', 0, 1433),
                            ('B.vulgatus.4', 0, 1449),
                            ('E.faecalis.1', 0, 1482)]
    # Windows expected only in the 16-chunk byRecords scenario.
    byrecords_16_windows = [('C.beijerinckii.13', 0, 747),
                            ('B.vulgatus.4', 0, 1449),
                            ('E.faecalis.1', 0, 742)]

    dss = ds3.split(contigs=True, maxChunks=1)
    self.assertEqual(len(dss), 1)
    self.assertEqual(merged_windows(dss), old_refWindows)

    dss = ds3.split(contigs=True, maxChunks=24)
    # This isn't expected if num refs >= 100, as map check isn't made
    # for now (too expensive)
    # There are only 12 refs represented in this set, however...
    self.assertEqual(len(dss), 12)
    assert_windows_present(whole_contig_windows, dss)

    # test with maxchunks but no breaking contigs
    dss = ds3.split(contigs=True, maxChunks=36)
    self.assertEqual(len(dss), 12)
    assert_windows_present(whole_contig_windows, dss)

    # test with maxchunks and breaking contigs is allowed (triggers
    # targetsize, may result in fewer chunks)
    dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
    self.assertEqual(len(dss), 2)
    assert_windows_present(whole_contig_windows, dss)

    # test with previous setup and smaller targetSize, resulting in more
    # chunks
    dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True,
                    targetSize=10)
    self.assertEqual(len(dss), 9)
    assert_windows_present(whole_contig_windows, dss)

    # test with byRecords and fewer chunks than atoms
    dss = ds3.split(contigs=True, chunks=3, byRecords=True)
    self.assertEqual(len(dss), 3)
    assert_windows_present(whole_contig_windows, dss)

    # test with byRecords and more chunks than atoms
    dss = ds3.split(contigs=True, chunks=16, byRecords=True)
    self.assertEqual(len(dss), 16)
    assert_windows_present(byrecords_16_windows, dss)

    # test with byRecords and updateCounts
    dss = ds3.split(contigs=True, chunks=3, byRecords=True,
                    updateCounts=True)
    self.assertEqual(len(dss), 3)
    sizes = sorted([dset.numRecords for dset in dss])
    self.assertListEqual(sizes, [30, 31, 31])
    assert_windows_present(whole_contig_windows, dss)

    # test with byRefLength and updateCounts
    dss = ds3.split(contigs=True, chunks=3, updateCounts=True)
    self.assertEqual(len(dss), 3)
    sizes = sorted([dset.numRecords for dset in dss])
    self.assertListEqual(sizes, [20, 24, 48])
    assert_windows_present(whole_contig_windows, dss)