def test_copy(self):
    """A copy must match in content but differ in identity and UUID, and
    ``copy(asType=...)`` must support legal casts and reject illegal ones."""
    ds1 = DataSet(data.getXml())
    ds2 = ds1.copy()
    self.assertFalse(ds1 == ds2)
    self.assertFalse(ds1.uuid == ds2.uuid)
    self.assertFalse(ds1 is ds2)
    self.assertTrue(ds1.name == ds2.name)
    self.assertTrue(ds1.externalResources == ds2.externalResources)
    # The name and UniqueId are different:
    self.assertFalse(ds1.objMetadata == ds2.objMetadata)
    self.assertTrue(ds1.filters == ds2.filters)
    self.assertTrue(ds1.subdatasets == ds2.subdatasets)
    self.assertTrue(len(ds1.subdatasets) == 2)
    self.assertTrue(len(ds2.subdatasets) == 2)
    # No subdataset of the copy may be the same object as one of the
    # original's (any() replaces the old reduce-with-or idiom):
    assert not any(ds1d is ds2d
                   for ds1d in ds1.subdatasets
                   for ds2d in ds2.subdatasets)
    # TODO: once simulated files are indexable, turn on strict:
    ds1 = SubreadSet(data.getXml(no=10), strict=False)
    self.assertEqual(type(ds1.metadata).__name__, 'SubreadSetMetadata')
    ds2 = ds1.copy()
    self.assertEqual(type(ds2.metadata).__name__, 'SubreadSetMetadata')
    # Lets try casting
    ds1 = DataSet(data.getBam())
    self.assertEqual(type(ds1).__name__, 'DataSet')
    ds1 = ds1.copy(asType='SubreadSet')
    self.assertEqual(type(ds1).__name__, 'SubreadSet')
    # Lets do some illicit casting
    with self.assertRaises(TypeError):
        ds1 = ds1.copy(asType='ReferenceSet')
    # Lets try not having to cast
    ds1 = SubreadSet(data.getBam())
    self.assertEqual(type(ds1).__name__, 'SubreadSet')
def test_split(self):
    """Splitting honors resource count and chunk requests, and a merged
    dataset splits back into pieces with the original total lengths."""
    ds1 = DataSet(data.getXml())
    self.assertTrue(ds1.numExternalResources > 1)
    dss = ds1.split()
    self.assertTrue(len(dss) == ds1.numExternalResources)
    dss = ds1.split(chunks=1)
    self.assertTrue(len(dss) == 1)
    dss = ds1.split(chunks=2, ignoreSubDatasets=True)
    self.assertTrue(len(dss) == 2)
    self.assertFalse(dss[0].uuid == dss[1].uuid)
    self.assertTrue(dss[0].name == dss[1].name)
    # Lets try merging and splitting on subdatasets
    ds1 = DataSet(data.getXml(8))
    self.assertEqual(ds1.totalLength, 123588)
    ds1tl = ds1.totalLength
    ds2 = DataSet(data.getXml(11))
    self.assertEqual(ds2.totalLength, 117086)
    ds2tl = ds2.totalLength
    dss = ds1 + ds2
    self.assertTrue(dss.totalLength == (ds1tl + ds2tl))
    # Largest-first ordering makes the per-piece comparison deterministic:
    ds1, ds2 = sorted(dss.split(2),
                      key=lambda x: x.totalLength,
                      reverse=True)
    self.assertTrue(ds1.totalLength == ds1tl)
    self.assertTrue(ds2.totalLength == ds2tl)
def test_reads_in_reference(self):
    """readsInReference yields per-reference counts and respects filters."""
    ds = DataSet(data.getBam())
    # See test_ref_names for why index 15 is expected:
    target = ds.refNames[15]
    self.assertEqual(len(list(ds.readsInReference(target))), 11)
    ds2 = DataSet(data.getBam(0))
    for refname, expected in (("E.faecalis.1", 20), ("E.faecalis.2", 3)):
        self.assertEqual(len(list(ds2.readsInReference(refname))), expected)
    ds2 = DataSet(data.getXml(8))
    self.assertEqual(len(list(ds2.readsInReference("E.faecalis.1"))), 20)
    ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    # Because of the filter, the other reference now yields nothing:
    self.assertEqual(len(list(ds2.readsInReference("E.faecalis.2"))), 0)
def test_addFilters(self):
    """Filters can be attached programmatically or loaded from an XML."""
    fresh = DataSet()
    requirement = Filters()
    requirement.addRequirement(rq=[('>', '0.85')])
    fresh.addFilters(requirement)
    assert str(fresh.filters) == '( rq > 0.85 )'
    # Or added from a source XML
    from_xml = DataSet(data.getXml(15))
    assert str(from_xml.filters).startswith('( rname = E.faecalis')
def test_addFilters(self):
    """Filters can be attached programmatically or loaded from an XML."""
    ds1 = DataSet()
    filt = Filters()
    filt.addRequirement(rq=[('>', '0.85')])
    ds1.addFilters(filt)
    self.assertEqual(str(ds1.filters), '( rq > 0.85 )')
    # Or added from a source XML
    ds2 = DataSet(data.getXml(16))
    self.assertTrue(str(ds2.filters).startswith('( rname = E.faecalis'))
def test_setFilters(self):
    """Assigning one dataset's filters to another copies the requirement."""
    ds1 = DataSet()
    filt = Filters()
    filt.addRequirement(rq=[('>', '0.85')])
    ds1.addFilters(filt)
    self.assertEqual(str(ds1.filters), '( rq > 0.85 )')
    # Or added from a source XML
    ds2 = DataSet()
    ds2.filters = ds1.filters
    self.assertEqual(str(ds2.filters), '( rq > 0.85 )')
def test_checkFilterMatch(self):
    """Filter compatibility: identical filter sets are compatible, an
    extra rname requirement on one side makes them incompatible."""
    # different resourceIds, compatible filters:
    base = DataSet(data.getXml(no=8))
    compatible = DataSet(data.getXml(no=11))
    self.assertTrue(base.filters.testCompatibility(compatible.filters))
    # different resourceIds, incompatible filters:
    incompatible = DataSet(data.getXml(no=11))
    incompatible.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    self.assertFalse(base.filters.testCompatibility(incompatible.filters))
def test_addMetadata(self):
    """Metadata attributes can be added, reassigned, and updated in place."""
    ds = DataSet()
    ds.addMetadata(None, Name='LongReadsRock')
    self.assertEqual(ds._metadata.getV(container='attrib', tag='Name'),
                     'LongReadsRock')
    ds2 = DataSet(data.getXml(no=8))
    self.assertEqual(ds2._metadata.totalLength, 123588)
    ds2._metadata.totalLength = 100000
    self.assertEqual(ds2._metadata.totalLength, 100000)
    # In-place arithmetic must round-trip through the metadata wrapper:
    ds2._metadata.totalLength += 100000
    self.assertEqual(ds2._metadata.totalLength, 200000)
def test_toExternalFiles(self):
    """toExternalFiles keeps bogus paths verbatim (strict=False) and
    returns existing absolute paths for real datasets."""
    bogusDS = DataSet("bam1.bam", "bam2.bam", strict=False)
    self.assertEqual(['bam1.bam', 'bam2.bam'],
                     bogusDS.externalResources.resourceIds)
    self.assertEqual(
        DataSet("bam1.bam", "bam2.bam", strict=False).toExternalFiles(),
        ['bam1.bam', 'bam2.bam'])
    realDS = DataSet(data.getXml(8))
    files = realDS.toExternalFiles()
    self.assertEqual(len(files), 1)
    self.assertTrue(os.path.exists(files[0]))
    self.assertTrue(os.path.isabs(files[0]))
def test_toFofn(self):
    """toFofn lists resource paths, absolute by default and relative on
    request."""
    self.assertEqual(
        DataSet("bam1.bam", "bam2.bam", strict=False).toFofn(),
        ['bam1.bam', 'bam2.bam'])
    realDS = DataSet(data.getXml(8))
    files = realDS.toFofn()
    self.assertEqual(len(files), 1)
    self.assertTrue(os.path.exists(files[0]))
    self.assertTrue(os.path.isabs(files[0]))
    # relative=True must still point at the same existing file:
    files = realDS.toFofn(relative=True)
    self.assertEqual(len(files), 1)
    self.assertTrue(os.path.exists(files[0]))
    self.assertFalse(os.path.isabs(files[0]))
def test_addExternalResources(self):
    """addExternalResources deduplicates on resourceId and preserves the
    ExternalResource wrapper type on iteration and indexing."""
    ds = DataSet()
    er1 = ExternalResource()
    er1.resourceId = "test1.bam"
    er2 = ExternalResource()
    er2.resourceId = "test2.bam"
    er3 = ExternalResource()
    er3.resourceId = "test1.bam"
    ds.addExternalResources([er1], updateCount=False)
    self.assertEqual(ds.numExternalResources, 1)
    # different resourceId: succeeds
    ds.addExternalResources([er2], updateCount=False)
    self.assertEqual(ds.numExternalResources, 2)
    # same resourceId: fails
    ds.addExternalResources([er3], updateCount=False)
    self.assertEqual(ds.numExternalResources, 2)
    for extRef in ds.externalResources:
        self.assertEqual(type(extRef).__name__, "ExternalResource")
    extRef = ds.externalResources[0]
    self.assertEqual(type(extRef).__name__, "ExternalResource")
    self.assertEqual(extRef.resourceId, 'test1.bam')
    extRef = ds.externalResources[1]
    self.assertEqual(type(extRef).__name__, "ExternalResource")
    self.assertEqual(extRef.resourceId, 'test2.bam')
def test_refLengths(self):
    """Reference lengths are correct on the raw set and after splitting."""
    ds = DataSet(data.getBam(0))
    spot_checks = {'B.cereus.6': 1472,
                   'S.agalactiae.1': 1470,
                   'B.cereus.4': 1472}
    for name, length in spot_checks.items():
        self.assertEqual(ds.refLengths[name], length)
    # this is a hack to only emit refNames that actually have records
    # associated with them:
    dss = ds.split(contigs=True, chunks=1)[0]
    expected = {'B.vulgatus.4': 1449,
                'B.vulgatus.5': 1449,
                'C.beijerinckii.13': 1433,
                'C.beijerinckii.14': 1433,
                'C.beijerinckii.9': 1433,
                'E.coli.6': 1463,
                'E.faecalis.1': 1482,
                'E.faecalis.2': 1482,
                'R.sphaeroides.1': 1386,
                'S.epidermidis.2': 1472,
                'S.epidermidis.3': 1472,
                'S.epidermidis.4': 1472}
    self.assertEqual(dss.refLengths, expected)
def loadStatsXml(args):
    """Attach a pipeline-stats file to a dataset and write it back out.

    Writes to ``args.outfile`` when given, otherwise overwrites the input
    file in place.
    """
    dset = DataSet(args.infile, strict=args.strict)
    dset.loadStats(args.statsfile)
    destination = args.outfile if args.outfile else args.infile
    dset.write(destination, validate=False)
def test_split_by_contigs_presplit(self):
    # Consumes too much memory for Jenkins
    # A contig-presplit dataset must carry filters aggressive enough that
    # the combined record count equals the sum of the per-resource counts.
    ds2 = DataSet(data.getXml(14))
    bams = ds2.externalResources.resourceIds
    assert len(bams) == 2
    refwindows = ds2.refWindows
    assert refwindows == [(0, 0, 224992)]
    # [7:] strips the "file://" scheme prefix before opening directly:
    res1 = openIndexedAlignmentFile(bams[0][7:])
    res2 = openIndexedAlignmentFile(bams[1][7:])

    def count(iterable):
        """Exhaust an iterable and report how many items it produced."""
        total = 0
        for _ in iterable:
            total += 1
        return total

    window = refwindows[0]
    assert count(res1.readsInRange(*window)) == 1409
    assert count(res2.readsInRange(*window)) == 1375
    assert count(ds2.readsInRange(*window)) == 2784
    assert count(ds2.records) == 2784
    ds2.disableFilters()
    assert count(ds2.records) == 53552
    assert ds2.countRecords() == 53552
def test_split_by_contigs_presplit(self):
    # Consumes too much memory for Jenkins
    # A contig-presplit dataset must carry filters aggressive enough that
    # the combined record count equals the sum of the per-resource counts.
    ds2 = DataSet(data.getXml(15))
    bams = ds2.externalResources.resourceIds
    self.assertEqual(len(bams), 2)
    refwindows = ds2.refWindows
    self.assertEqual(refwindows, [(0, 0, 224992)])
    # [7:] strips the "file://" scheme prefix before opening directly:
    res1 = openIndexedAlignmentFile(bams[0][7:])
    res2 = openIndexedAlignmentFile(bams[1][7:])

    def count(iterable):
        """Exhaust an iterable and report how many items it produced."""
        total = 0
        for _ in iterable:
            total += 1
        return total

    window = refwindows[0]
    self.assertEqual(count(res1.readsInRange(*window)), 1409)
    self.assertEqual(count(res2.readsInRange(*window)), 1375)
    self.assertEqual(count(ds2.readsInRange(*window)), 2784)
    self.assertEqual(count(ds2.records), 2784)
    ds2.disableFilters()
    self.assertEqual(count(ds2.records), 53552)
    self.assertEqual(ds2.countRecords(), 53552)
def test_filter(self):
    """Record counts respond to disabling and re-enabling an rname filter."""
    dset = DataSet(data.getXml(8))
    dset.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    self.assertEqual(len(list(dset.records)), 20)
    dset.disableFilters()
    self.assertEqual(len(list(dset.records)), 92)
    dset.enableFilters()
    self.assertEqual(len(list(dset.records)), 20)
def test_refWindows(self):
    """Filters produced by a contig split, and manually-added compound
    requirements, render to the expected string forms."""
    ds = DataSet(data.getBam())
    halves = ds.split(chunks=2, contigs=True)
    self.assertEqual(len(halves), 2)
    log.debug(halves[0].filters)
    log.debug(halves[1].filters)
    # One of the two chunks must have picked up this reference:
    target = '( rname = E.faecalis.2 ) '
    self.assertTrue(target in str(halves[0].filters) or
                    target in str(halves[1].filters))
    ds = DataSet(data.getBam())
    ds.filters.addRequirement(
        rname=[('=', 'lambda_NEB3011'), ('=', 'lambda_NEB3011')],
        tStart=[('<', '0'), ('<', '100')],
        tEnd=[('>', '99'), ('>', '299')])
    expected = ('( rname = lambda_NEB3011 AND tstart '
                '< 0 AND tend > 99 ) OR ( rname = lambd'
                'a_NEB3011 AND tstart < 100 AND tend > 299 )')
    self.assertEqual(str(ds.filters), expected)
def to_report(stats_xml):
    """Main point of entry

    Builds a loading report (one table row per dataset, plus a "Combined"
    row when there are subdatasets with summary stats).

    :type stats_xml: str
    :rtype: Report
    :raises IOError: when the dataset lacks a productivity distribution
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    dset = DataSet(stats_xml)
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
    if not dset.metadata.summaryStats.prodDist:
        raise IOError("Pipeline Summary Stats (sts.xml) not found or missing "
                      "key distributions")
    dsets = [dset]
    for subdset in dset.subdatasets:
        if subdset.metadata.summaryStats:
            dsets.append(subdset)
    col_ids = [
        Constants.C_CONTEXT, Constants.C_ZMWS, Constants.C_PROD_0,
        Constants.C_PROD_1, Constants.C_PROD_2
    ]
    col_values = [[], [], [], [], []]
    for dset in dsets:
        # The first row of a multi-dataset report aggregates everything:
        if len(dsets) > 1 and len(col_values[0]) == 0:
            movie_name = "Combined"
        else:
            try:
                collection = list(dset.metadata.collections)[0]
                movie_name = collection.context
            except AttributeError:
                movie_name = "NA"
        productive_zmws = int(dset.metadata.summaryStats.numSequencingZmws)
        empty, productive, other, _ = dset.metadata.summaryStats.prodDist.bins
        prod0 = np.round(100.0 * empty / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod1 = np.round(100.0 * productive / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod2 = np.round(100.0 * other / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        this_row = [movie_name, productive_zmws, prod0, prod1, prod2]
        # FIX: the old `map(lambda (x, y): x.append(y), zip(...))` used a
        # Python-2-only tuple-unpacking lambda (a SyntaxError on Python 3)
        # and abused map() for side effects; a plain loop is correct on both.
        for column, value in zip(col_values, this_row):
            column.append(value)
    columns = [
        Column(cid, values=vals) for cid, vals in zip(col_ids, col_values)
    ]
    tables = [Table(Constants.T_LOADING, columns=columns)]
    report = Report(meta_rpt.id,
                    title=meta_rpt.title,
                    tables=tables,
                    attributes=None,
                    plotgroups=None)
    return meta_rpt.apply_view(report)
def test_merge(self):
    """Dataset addition merges distinct resources, refuses conflicting
    filters, and ignores duplicate resourceIds."""
    # xmls with different resourceIds: success
    ds1 = DataSet(data.getXml(no=8))
    ds2 = DataSet(data.getXml(no=11))
    merged = ds1 + ds2
    combined = ds1.numExternalResources + ds2.numExternalResources
    self.assertTrue(merged.numExternalResources == combined)
    # xmls with different resourceIds but conflicting filters:
    # failure to merge
    ds2 = DataSet(data.getXml(no=11))
    ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    self.assertEqual(ds1 + ds2, None)
    # xmls with same resourceIds: ignores new inputs
    ds1 = DataSet(data.getXml(no=8))
    ds2 = DataSet(data.getXml(no=8))
    merged = ds1 + ds2
    self.assertTrue(
        merged.numExternalResources == ds1.numExternalResources)
def test_reads_in_range(self):
    """readsInRange agrees whether keyed by reference name or numeric ID."""
    ds = DataSet(data.getBam())
    target = ds.refNames[15]
    self.assertEqual(len(list(ds.readsInRange(target, 10, 100))), 10)
    ds2 = DataSet(data.getBam(0))
    self.assertEqual(
        len(list(ds2.readsInRange("E.faecalis.1", 0, 1400))), 20)
    lengths = ds.refLengths
    for rname, rId in ds.refInfo('ID'):
        # The numeric ID must round-trip back to the same name:
        resolved = ds._idToRname(rId)
        self.assertEqual(rname, resolved)
        rlen = lengths[resolved]
        self.assertEqual(len(list(ds.readsInReference(resolved))),
                         len(list(ds.readsInReference(rId))))
        self.assertEqual(len(list(ds.readsInRange(resolved, 0, rlen))),
                         len(list(ds.readsInRange(rId, 0, rlen))))
def test_split_by_contigs_with_split(self):
    # test to make sure the refWindows work when chunks == # refs
    ds3 = DataSet(data.getBam())

    def windows_of(datasets):
        """Collect and sort the refWindows across a list of datasets."""
        return sorted(reduce(lambda x, y: x + y,
                             [ds.refWindows for ds in datasets]))

    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = windows_of(dss)
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    for expected in [('C.beijerinckii.13', 0, 1433),
                     ('B.vulgatus.4', 0, 1449),
                     ('E.faecalis.1', 0, 1482)]:
        self.assertTrue(expected in refWindows)

    old_refWindows = refWindows
    dss = ds3.split(contigs=True, chunks=1)
    self.assertEqual(len(dss), 1)
    self.assertEqual(windows_of(dss), old_refWindows)

    # More chunks than refs: windows get subdivided.
    dss = ds3.split(contigs=True, chunks=24)
    self.assertEqual(len(dss), 24)
    refWindows = windows_of(dss)
    for expected in [('E.faecalis.2', 0, 741),
                     ('E.faecalis.2', 741, 1482)]:
        if expected not in refWindows:
            log.debug(expected)
        self.assertTrue(expected in refWindows)

    dss = ds3.split(contigs=True, chunks=36)
    self.assertEqual(len(dss), 36)
    refWindows = windows_of(dss)
    for expected in [('E.faecalis.2', 0, 494),
                     ('E.faecalis.2', 494, 988),
                     ('E.faecalis.2', 988, 1482)]:
        self.assertTrue(expected in refWindows)
def test_staggered_reads_in_range(self):
    """readsInRange over a merged dataset interleaves both inputs' reads
    in tStart order."""
    ds = DataSet(data.getXml(8))
    rn = 'B.vulgatus.5'
    reads = list(ds.readsInRange(rn, 0, 10000))
    ds2 = DataSet(data.getXml(11))
    reads2 = list(ds2.readsInRange(rn, 0, 10000))
    dsBoth = DataSet(data.getXml(8), data.getXml(11))
    readsBoth = list(dsBoth.readsInRange(rn, 0, 10000))
    self.assertEqual(len(reads), 2)
    self.assertEqual(len(reads2), 5)
    self.assertEqual(len(readsBoth), 7)
    read_starts = (0, 1053)
    for read, start in zip(reads, read_starts):
        self.assertEqual(read.tStart, start)
    read2_starts = (0, 0, 3, 3, 4)
    for read, start in zip(reads2, read2_starts):
        self.assertEqual(read.tStart, start)
    # The merged result is the sorted interleaving of the two inputs:
    readboth_starts = (0, 0, 0, 3, 3, 4, 1053)
    for read, start in zip(readsBoth, readboth_starts):
        self.assertEqual(read.tStart, start)
def test_reads_in_subdataset(self):
    """A contig split yields one subdataset per reference, each carrying
    an rname filter and the expected read count."""
    ds = DataSet(data.getXml(8))
    dss = ds.split(contigs=True)
    self.assertEqual(len(dss), 12)
    expected_refs = ['B.vulgatus.4', 'B.vulgatus.5', 'C.beijerinckii.13',
                     'C.beijerinckii.14', 'C.beijerinckii.9', 'E.coli.6',
                     'E.faecalis.1', 'E.faecalis.2', 'R.sphaeroides.1',
                     'S.epidermidis.2', 'S.epidermidis.3',
                     'S.epidermidis.4']
    observed_refs = sorted([sub.filters[0][0].value for sub in dss])
    self.assertEqual(expected_refs, observed_refs)
    self.assertEqual(len(list(dss[0].readsInSubDatasets())), 3)
    self.assertEqual(len(list(dss[1].readsInSubDatasets())), 20)
def test_build(self):
    """End-to-end construction: BAM in, XML round-trip, typed constructors,
    casting between DataSet subtypes, and manual resource addition."""
    # Progs like pbalign provide a .bam file:
    # e.g. d = DataSet("aligned.bam")
    # Something like the test files we have:
    inBam = data.getBam()
    self.assertTrue(inBam.endswith('.bam'))
    d = DataSet(inBam)
    # A UniqueId is generated, despite being a BAM input
    self.assertTrue(d.uuid != '')
    dOldUuid = d.uuid
    # They can write this BAM to an XML:
    # e.g. d.write("alignmentset.xml")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outXml = os.path.join(outdir, 'tempfile.xml')
    d.write(outXml)
    # And then recover the same XML (or a different one):
    # e.g. d = DataSet("alignmentset.xml")
    d = DataSet(outXml)
    # The UniqueId will be the same
    self.assertTrue(d.uuid == dOldUuid)
    # Inputs can be many and varied
    ds1 = DataSet(data.getXml(11), data.getBam())
    self.assertEqual(ds1.numExternalResources, 2)
    ds1 = DataSet(data.getFofn())
    self.assertEqual(ds1.numExternalResources, 2)
    # New! Use the correct constructor:
    self.assertEqual(
        type(SubreadSet(data.getSubreadSet())).__name__, 'SubreadSet')
    # Even with untyped inputs
    self.assertTrue(
        str(SubreadSet(data.getBam())).startswith('<SubreadSet'))
    self.assertEqual(type(SubreadSet(data.getBam())).__name__, 'SubreadSet')
    self.assertEqual(type(DataSet(data.getBam())).__name__, 'DataSet')
    # You can also cast up and down, but casting between siblings
    # is limited (abuse at your own risk)
    self.assertEqual(
        type(DataSet(data.getBam()).copy(asType='SubreadSet')).__name__,
        'SubreadSet')
    self.assertEqual(
        type(SubreadSet(data.getBam()).copy(asType='DataSet')).__name__,
        'DataSet')
    # Add external Resources:
    ds = DataSet()
    ds.externalResources.addResources(["IdontExist.bam"])
    self.assertTrue(
        ds.externalResources[-1].resourceId == "IdontExist.bam")
    # Add an index file
    ds.externalResources[-1].addIndices(["IdontExist.bam.pbi"])
    self.assertTrue(ds.externalResources[-1].indices[0].resourceId ==
                    "IdontExist.bam.pbi")
def splitXml(args):
    """CLI entry point: split a dataset into chunks and write one XML per
    chunk.

    Output names come from ``args.outfiles`` when given; otherwise they are
    derived from the input filename (optionally relocated to ``args.outdir``).
    """
    log.debug("Starting split")
    dataSet = DataSet(args.infile, strict=args.strict)
    chunks = len(args.outfiles)
    if args.chunks:
        chunks = args.chunks
    dss = dataSet.split(chunks=chunks,
                        ignoreSubDatasets=(not args.subdatasets),
                        contigs=args.contigs,
                        maxChunks=args.maxChunks,
                        breakContigs=args.breakContigs)
    log.debug("Split into {i} chunks".format(i=len(dss)))
    infix = 'chunk{i}'
    if args.contigs:
        infix += 'contigs'
    if not args.outfiles:
        # Both branches of the old if/else built this identical list, so
        # compute it once and only rebase onto outdir when one was given:
        args.outfiles = ['.'.join(args.infile.split('.')[:-1] +
                                  [infix.format(i=chNum), 'xml'])
                         for chNum in range(len(dss))]
        if args.outdir:
            args.outfiles = [os.path.join(args.outdir,
                                          os.path.basename(outfn))
                             for outfn in args.outfiles]
    num = len(dss)
    end = ''
    if num > 5:
        num = 5
        end = '...'
    log.debug("Emitting {f} {e}".format(f=', '.join(args.outfiles[:num]),
                                        e=end))
    log.debug("Finished splitting, now writing")
    for out_fn, dset in zip(args.outfiles, dss):
        dset.write(out_fn)
    log.debug("Done writing files")
def filterXml(args):
    """CLI entry point: parse ``param<op>value`` filter strings and attach
    them to a dataset XML. Currently disabled (exits immediately).
    """
    log.error("Adding filters via CLI is temporarily out of order")
    exit(1)
    if args.infile.endswith('xml'):
        dataSet = DataSet(args.infile, strict=args.strict)
        # BUG FIX: this was initialized as a list, but it is keyed by
        # parameter name below — indexing a list with a string raises
        # TypeError. It must be a dict.
        filters = {}
        # Two-character operators first so e.g. '<=' is not split on '<':
        separators = ['<=', '>=', '!=', '==', '>', '<', '=']
        for filt in args.filters:
            for sep in separators:
                if sep in filt:
                    param, condition = filt.split(sep)
                    # Keep the operator with the condition value:
                    condition = sep + condition
                    filters[param] = condition
                    break
        dataSet.addFilters([filters])
        log.info("{i} filters added".format(i=len(filters)))
        dataSet.write(args.outfile)
    else:
        raise IOError("No files found/found to be compatible")
def test_checkInputFile(self):
    """Test checkInputFile()."""
    fastaFN = path.join(self.rootDir, "data/ecoli.fasta")
    plsFN = self.dataDir + \
        "m121215_065521_richard_c100425710150000001823055001121371_s1_p0.pls.h5"
    self.assertTrue(filecmp.cmp(fastaFN, checkInputFile(fastaFN)))
    self.assertTrue(filecmp.cmp(plsFN, checkInputFile(plsFN)))
    fofnFN = path.join(self.rootDir, "data/ecoli_lp.fofn")
    self.assertTrue(filecmp.cmp(fofnFN, checkInputFile(fofnFN)))
    # A dataset XML is accepted and resolves to its member BAM files:
    xmlFN = path.join(self.rootDir, "data/subreads_dataset1.xml")
    resolved = checkInputFile(xmlFN)
    self.assertTrue(resolved.endswith('.xml'))
    members = DataSet(resolved).toExternalFiles()
    expected_suffixes = (
        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.subreads.bam",
        "m130406_011850_42141_c100513442550000001823074308221310_s1_p0.1.subreads.bam",
    )
    for member, suffix in zip(members, expected_suffixes):
        self.assertTrue(member.endswith(suffix))
def _get_dataset_uuid_or_create_uuid(path):
    """
    Extract the uuid from the DataSet or assign a new UUID

    :param path: Path to file
    :rtype: str
    :return: uuid string
    """
    try:
        ds = DataSet(path)
        ds_id = ds.uuid
        # make sure it's a valid uuid
        _ = uuid.UUID(ds_id)
    except ValueError as e:
        log.error("DataSet {p} uuid is malformed. {e}".format(e=e, p=path))
        # BUG FIX: this previously assigned the function object
        # ``uuid.uuid4`` (no call); it must be invoked to yield a fresh
        # UUID, matching the branch below.
        ds_id = uuid.uuid4()
    except Exception:
        # not a DataSet file
        ds_id = uuid.uuid4()
    return ds_id
def test_referenceInfoTableMerging(self):
    """Merged AlignmentSets renumber reference IDs contiguously from zero
    and union reference names across all resources."""
    log.info("Testing refIds, etc. after merging")
    ds = DataSet(data.getXml(17))
    also_lambda = ds.toExternalFiles()[0]
    aln = AlignmentSet(data.getBam(0), data.getBam(0), also_lambda)
    readers = aln.resourceReaders()

    ids = sorted([i for _, i in aln.refInfo('ID')])
    # FIX: compare against a concrete list so the assertion also holds on
    # Python 3, where range() is a lazy sequence that never equals a list.
    self.assertEqual(list(range(len(ids))), ids)

    accNames = aln.refNames
    expNames = reduce(
        np.append,
        [reader.referenceInfoTable['Name'] for reader in readers])
    expNames = np.unique(expNames)
    self.assertEqual(sorted(expNames), sorted(accNames))

    accNames = aln.fullRefNames
    expNames = reduce(
        np.append,
        [reader.referenceInfoTable['FullName'] for reader in readers])
    expNames = np.unique(expNames)
    self.assertEqual(sorted(expNames), sorted(accNames))
def mergeXml(args):
    """CLI entry point: merge several dataset XMLs into one output file."""
    datasets = [DataSet(infn, strict=args.strict) for infn in args.infiles]
    merged = reduce(lambda left, right: left + right, datasets)
    merged.write(args.outfile)