Example #1
0
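The examples below are excerpts of test methods from a unittest.TestCase and are not self-contained. A minimal scaffold they roughly assume is sketched here for orientation only; the exact pbcore module paths and helper locations are assumptions and may differ between versions:

import logging
import os
import re
import tempfile
import unittest
from functools import reduce  # several tests call reduce() directly; a builtin on Python 2

import numpy as np

# Assumed imports from the pbcore library; the exact module layout may vary.
from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet,
                       openDataSet, openIndexedAlignmentFile)
import pbcore.data.datasets as data  # bundled test inputs (getBam, getXml, getFofn, ...)

log = logging.getLogger(__name__)

# Each excerpt below is a method of a unittest.TestCase subclass. A few tests
# also rely on helpers such as backticks and validateFile, assumed to come
# from pbcore utility modules.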
    def test_reads_in_reference(self):
        ds = DataSet(data.getBam())
        refNames = ds.refNames

        # See test_ref_names for why this is expected:
        rn = refNames[15]
        reads = ds.readsInReference(rn)
        self.assertEqual(len(list(reads)), 11)

        ds2 = DataSet(data.getBam(0))
        reads = ds2.readsInReference("E.faecalis.1")
        self.assertEqual(len(list(reads)), 20)

        reads = ds2.readsInReference("E.faecalis.2")
        self.assertEqual(len(list(reads)), 3)

        ds2 = DataSet(data.getXml(8))
        reads = ds2.readsInReference("E.faecalis.1")
        self.assertEqual(len(list(reads)), 20)

        ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])

        # Because of the filter!
        reads = ds2.readsInReference("E.faecalis.2")
        self.assertEqual(len(list(reads)), 0)
Example #2
0
 def test_copy(self):
     ds1 = DataSet(data.getXml())
     ds2 = ds1.copy()
     self.assertFalse(ds1 == ds2)
     self.assertFalse(ds1.uuid == ds2.uuid)
     self.assertFalse(ds1 is ds2)
     self.assertTrue(ds1.name == ds2.name)
     self.assertTrue(ds1.externalResources == ds2.externalResources)
     # The name and UniqueId are different:
     self.assertFalse(ds1.objMetadata == ds2.objMetadata)
     self.assertTrue(ds1.filters == ds2.filters)
     self.assertTrue(ds1.subdatasets == ds2.subdatasets)
     self.assertTrue(len(ds1.subdatasets) == 2)
     self.assertTrue(len(ds2.subdatasets) == 2)
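     # No subdataset object should be shared between the original and the copy: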
     assert not reduce(lambda x, y: x or y, [
         ds1d is ds2d for ds1d in ds1.subdatasets
         for ds2d in ds2.subdatasets
     ])
     # TODO: once simulated files are indexable, turn on strict:
     ds1 = SubreadSet(data.getXml(no=10), strict=False)
     self.assertEquals(type(ds1.metadata).__name__, 'SubreadSetMetadata')
     ds2 = ds1.copy()
     self.assertEquals(type(ds2.metadata).__name__, 'SubreadSetMetadata')
     # Let's try casting
     ds1 = DataSet(data.getBam())
     self.assertEquals(type(ds1).__name__, 'DataSet')
     ds1 = ds1.copy(asType='SubreadSet')
     self.assertEquals(type(ds1).__name__, 'SubreadSet')
     # Let's do some illicit casting
     with self.assertRaises(TypeError):
         ds1 = ds1.copy(asType='ReferenceSet')
     # Let's try not having to cast
     ds1 = SubreadSet(data.getBam())
     self.assertEquals(type(ds1).__name__, 'SubreadSet')
Example #3
0
 def test_refWindows(self):
     ds = AlignmentSet(data.getBam())
     dss = ds.split(chunks=2, contigs=True)
     self.assertEqual(len(dss), 2)
     log.debug(dss[0].filters)
     log.debug(dss[1].filters)
     self.assertTrue(
         '( rname = E.faecalis.2 '
         in str(dss[0].filters)
         or
         '( rname = E.faecalis.2 '
         in str(dss[1].filters))
     ds = AlignmentSet(data.getBam())
     ds.filters.addRequirement(rname=[('=', 'E.faecalis.2'),
                                      ('=', 'E.faecalis.2')],
                               tStart=[('<', '99'),
                                       ('<', '299')],
                               tEnd=[('>', '0'),
                                     ('>', '100')])
     self.assertEqual(str(ds.filters),
                      '( rname = E.faecalis.2 AND tstart '
                      '< 99 AND tend > 0 ) OR ( rname = '
                      'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
     self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
                                      ('E.faecalis.2', 100, 299)])
Example #4
0
    def test_reads_in_reference(self):
        ds = DataSet(data.getBam())
        refNames = ds.refNames

        # See test_ref_names for why this is expected:
        rn = refNames[15]
        reads = ds.readsInReference(rn)
        self.assertEqual(len(list(reads)), 11)

        ds2 = DataSet(data.getBam(0))
        reads = ds2.readsInReference("E.faecalis.1")
        self.assertEqual(len(list(reads)), 20)

        reads = ds2.readsInReference("E.faecalis.2")
        self.assertEqual(len(list(reads)), 3)

        ds2 = DataSet(data.getXml(8))
        reads = ds2.readsInReference("E.faecalis.1")
        self.assertEqual(len(list(reads)), 20)

        ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])

        # Because of the filter!
        reads = ds2.readsInReference("E.faecalis.2")
        self.assertEqual(len(list(reads)), 0)
Example #5
0
    def test_readGroupTable(self):
        aln = AlignmentSet(data.getBam(0), data.getBam(1), data.getBam(2))
        readers = aln.resourceReaders()

        self.assertEqual(len(readers[0].readGroupTable), 1)
        self.assertEqual(len(readers[1].readGroupTable), 1)
        self.assertEqual(len(readers[2].readGroupTable), 1)
        self.assertEqual(len(aln.readGroupTable), 3)
Example #6
0
    def test_readGroupTable(self):
        aln = AlignmentSet(data.getBam(0), data.getBam(1), data.getBam(2))
        readers = aln.resourceReaders()

        self.assertEqual(len(readers[0].readGroupTable), 1)
        self.assertEqual(len(readers[1].readGroupTable), 1)
        self.assertEqual(len(readers[2].readGroupTable), 1)
        self.assertEqual(len(aln.readGroupTable), 3)
Example #7
0
    def test_referenceInfoTable(self):
        aln = AlignmentSet(data.getBam(0), data.getBam(1), data.getBam(2))
        readers = aln.resourceReaders()

        self.assertEqual(len(readers[0].referenceInfoTable), 1)
        self.assertEqual(len(readers[1].referenceInfoTable), 59)
        self.assertEqual(len(readers[2].referenceInfoTable), 1)
        self.assertEqual(readers[0].referenceInfoTable.Name,
                         readers[2].referenceInfoTable.Name)
        self.assertEqual(len(aln.referenceInfoTable), 60)
Example #8
0
    def test_referenceInfoTable(self):
        aln = AlignmentSet(data.getBam(0), data.getBam(1), data.getBam(2))
        readers = aln.resourceReaders()

        self.assertEqual(len(readers[0].referenceInfoTable), 1)
        self.assertEqual(len(readers[1].referenceInfoTable), 59)
        self.assertEqual(len(readers[2].referenceInfoTable), 1)
        self.assertEqual(readers[0].referenceInfoTable.Name,
                         readers[2].referenceInfoTable.Name)
        self.assertEqual(len(aln.referenceInfoTable), 60)
Example #9
0
    def test_updateCounts(self):
        log.info("Testing updateCounts without filters")
        aln = AlignmentSet(data.getBam(0))
        readers = aln.resourceReaders()

        expLen = 0
        for reader in readers:
            for record in reader:
                expLen += record.readLength
                self.assertEqual(
                    record.aStart, record.bam.pbi[record.rowNumber]['aStart'])
                self.assertEqual(
                    record.aEnd, record.bam.pbi[record.rowNumber]['aEnd'])

        expNum = 0
        for reader in readers:
            expNum += len(reader)

        accLen = aln.metadata.totalLength
        accNum = aln.metadata.numRecords

        self.assertEqual(expLen, accLen)
        self.assertEqual(expNum, accNum)

        log.info("Testing whether filters are respected")
        aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
        aln.updateCounts()
        accLen = aln.metadata.totalLength
        accNum = aln.metadata.numRecords

        def count(gen):
            count = 0
            for _ in gen:
                count += 1
            return count

        expLen = 0
        for reader in readers:
            for record in reader:
                expLen += record.readLength

        bfile = openIndexedAlignmentFile(data.getBam(0))
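        # rWin covers the full E.faecalis.1 reference, so readsInRange over it
        # should match the counts of the rname-filtered dataset: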
        rWin = (bfile.referenceInfo('E.faecalis.1').ID,
                0,
                bfile.referenceInfo('E.faecalis.1').Length)
        reads = bfile.readsInRange(*rWin)
        expNum = count(reads)
        expLen = 0
        reads = bfile.readsInRange(*rWin)
        for read in reads:
            expLen += read.readLength

        self.assertEqual(expLen, accLen)
        self.assertEqual(expNum, accNum)
Example #10
0
    def test_updateCounts(self):
        log.info("Testing updateCounts without filters")
        aln = AlignmentSet(data.getBam(0))
        readers = aln.resourceReaders()

        expLen = 0
        for reader in readers:
            for record in reader:
                expLen += record.readLength
                self.assertEqual(record.aStart,
                                 record.bam.pbi[record.rowNumber]['aStart'])
                self.assertEqual(record.aEnd,
                                 record.bam.pbi[record.rowNumber]['aEnd'])

        expNum = 0
        for reader in readers:
            expNum += len(reader)

        accLen = aln.metadata.totalLength
        accNum = aln.metadata.numRecords

        self.assertEqual(expLen, accLen)
        self.assertEqual(expNum, accNum)

        log.info("Testing whether filters are respected")
        aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
        aln.updateCounts()
        accLen = aln.metadata.totalLength
        accNum = aln.metadata.numRecords

        def count(gen):
            count = 0
            for _ in gen:
                count += 1
            return count

        expLen = 0
        for reader in readers:
            for record in reader:
                expLen += record.readLength

        bfile = openIndexedAlignmentFile(data.getBam(0))
        rWin = (bfile.referenceInfo('E.faecalis.1').ID, 0,
                bfile.referenceInfo('E.faecalis.1').Length)
        reads = bfile.readsInRange(*rWin)
        expNum = count(reads)
        expLen = 0
        reads = bfile.readsInRange(*rWin)
        for read in reads:
            expLen += read.readLength

        self.assertEqual(expLen, accLen)
        self.assertEqual(expNum, accNum)
Example #11
0
    def test_write(self):
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfile = os.path.join(outdir, 'tempfile.xml')
        ds1 = AlignmentSet(data.getBam())
        ds1.write(outfile)
        log.debug('Validated file: {f}'.format(f=outfile))
        validateFile(outfile)
        ds2 = AlignmentSet(outfile)
        self.assertTrue(ds1 == ds2)

        # Should fail when strict:
        ds3 = AlignmentSet(data.getBam())
        ds3.write(outfile)
Example #12
0
    def test_write(self):
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfile = os.path.join(outdir, 'tempfile.xml')
        ds1 = AlignmentSet(data.getBam())
        ds1.write(outfile)
        log.debug('Validated file: {f}'.format(f=outfile))
        validateFile(outfile)
        ds2 = AlignmentSet(outfile)
        self.assertTrue(ds1 == ds2)

        # Should fail when strict:
        ds3 = AlignmentSet(data.getBam())
        ds3.write(outfile)
Example #13
0
    def test_loading_reference(self):
        log.info('Opening Reference')
        r = ReferenceSet(data.getRef()).toExternalFiles()[0]
        log.info('Done Opening Reference')
        log.info('Opening AlignmentSet')
        d = AlignmentSet(data.getBam(), referenceFastaFname=r)
        log.info('Done Opening AlignmentSet')
        bfile = openIndexedAlignmentFile(data.getBam(), referenceFastaFname=r)
        self.assertTrue(bfile.isReferenceLoaded)
        for res in d.resourceReaders():
            self.assertTrue(res.isReferenceLoaded)

        aln = AlignmentSet(data.getBam())
        aln.addReference(r)
        for res in aln.resourceReaders():
            self.assertTrue(res.isReferenceLoaded)
Example #14
0
    def test_refLengths(self):
        ds = DataSet(data.getBam(0))
        random_few = {
            'B.cereus.6': 1472,
            'S.agalactiae.1': 1470,
            'B.cereus.4': 1472
        }
        for key, value in random_few.items():
            self.assertEqual(ds.refLengths[key], value)

        # this is a hack to only emit refNames that actually have records
        # associated with them:
        dss = ds.split(contigs=True, chunks=1)[0]
        self.assertEqual(
            dss.refLengths, {
                'B.vulgatus.4': 1449,
                'B.vulgatus.5': 1449,
                'C.beijerinckii.13': 1433,
                'C.beijerinckii.14': 1433,
                'C.beijerinckii.9': 1433,
                'E.coli.6': 1463,
                'E.faecalis.1': 1482,
                'E.faecalis.2': 1482,
                'R.sphaeroides.1': 1386,
                'S.epidermidis.2': 1472,
                'S.epidermidis.3': 1472,
                'S.epidermidis.4': 1472
            })
Example #15
0
 def test_repr(self):
     ds = DataSet(data.getBam())
     rep = str(ds)
     self.assertTrue(re.search('DataSet', rep))
     self.assertTrue(re.search('uuid:',
                               rep))
     self.assertTrue(re.search('pbalchemysim0.pbalign.bam', rep))
Example #16
0
 def test_referenceInfo(self):
     aln = AlignmentSet(data.getBam(0))
     readers = aln.resourceReaders()
     self.assertEqual(len(readers[0].referenceInfoTable), 59)
     self.assertEqual(
         str(readers[0].referenceInfo('E.faecalis.1')),
         "(27, 27, 'E.faecalis.1', 'E.faecalis.1', 1482, "
         "'a1a59c267ac1341e5a12bce7a7d37bcb', 0L, 0L)")
Example #17
0
 def test_attributes(self):
     aln = AlignmentSet(data.getBam(0))
     self.assertEqual(aln.sequencingChemistry, ['unknown'])
     self.assertEqual(aln.isSorted, True)
     self.assertEqual(aln.isEmpty, False)
     self.assertEqual(aln.readType, 'standard')
     self.assertEqual(len(aln.tStart), aln.metadata.numRecords)
     self.assertEqual(len(aln.tEnd), aln.metadata.numRecords)
Example #18
0
    def test_loading_reference(self):
        log.info('Opening Reference')
        r = ReferenceSet(data.getRef()).toExternalFiles()[0]
        log.info('Done Opening Reference')
        log.info('Opening AlignmentSet')
        d = AlignmentSet(data.getBam(), referenceFastaFname=r)
        log.info('Done Opening AlignmentSet')
        bfile = openIndexedAlignmentFile(data.getBam(),
                                         referenceFastaFname=r)
        self.assertTrue(bfile.isReferenceLoaded)
        for res in d.resourceReaders():
            self.assertTrue(res.isReferenceLoaded)

        aln = AlignmentSet(data.getBam())
        aln.addReference(r)
        for res in aln.resourceReaders():
            self.assertTrue(res.isReferenceLoaded)
Example #19
0
 def test_attributes(self):
     aln = AlignmentSet(data.getBam(0))
     self.assertEqual(aln.sequencingChemistry, ['unknown'])
     self.assertEqual(aln.isSorted, True)
     self.assertEqual(aln.isEmpty, False)
     self.assertEqual(aln.readType, 'standard')
     self.assertEqual(len(aln.tStart), aln.metadata.numRecords)
     self.assertEqual(len(aln.tEnd), aln.metadata.numRecords)
Example #20
0
 def test_referenceInfo(self):
     aln = AlignmentSet(data.getBam(0))
     readers = aln.resourceReaders()
     self.assertEqual(len(readers[0].referenceInfoTable), 59)
     self.assertEqual(
         str(readers[0].referenceInfo('E.faecalis.1')),
         "(27, 27, 'E.faecalis.1', 'E.faecalis.1', 1482, "
         "'a1a59c267ac1341e5a12bce7a7d37bcb', 0L, 0L)")
Example #21
0
 def test_refWindows(self):
     ds = DataSet(data.getBam())
     dss = ds.split(chunks=2, contigs=True)
     self.assertEqual(len(dss), 2)
     log.debug(dss[0].filters)
     log.debug(dss[1].filters)
     self.assertTrue('( rname = E.faecalis.2 ) ' in str(dss[0].filters)
                     or '( rname = E.faecalis.2 ) ' in str(dss[1].filters))
     ds = DataSet(data.getBam())
     ds.filters.addRequirement(rname=[('=', 'lambda_NEB3011'),
                                      ('=', 'lambda_NEB3011')],
                               tStart=[('<', '0'), ('<', '100')],
                               tEnd=[('>', '99'), ('>', '299')])
     self.assertEqual(
         str(ds.filters), '( rname = lambda_NEB3011 AND tstart '
         '< 0 AND tend > 99 ) OR ( rname = lambd'
         'a_NEB3011 AND tstart < 100 AND tend > 299 )')
Example #22
0
 def test_updateCounts_without_pbi(self):
     log.info("Testing updateCounts without pbi")
     data_fname = data.getBam(0)
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     tempout = os.path.join(outdir, os.path.basename(data_fname))
     backticks('cp {i} {o}'.format(i=data_fname, o=tempout))
     aln = AlignmentSet(tempout, strict=False)
     self.assertEqual(aln.totalLength, -1)
     self.assertEqual(aln.numRecords, -1)
Example #23
0
 def test_updateCounts_without_pbi(self):
     log.info("Testing updateCounts without pbi")
     data_fname = data.getBam(0)
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     tempout = os.path.join(outdir, os.path.basename(data_fname))
     backticks('cp {i} {o}'.format(i=data_fname, o=tempout))
     aln = AlignmentSet(tempout, strict=False)
     self.assertEqual(aln.totalLength, -1)
     self.assertEqual(aln.numRecords, -1)
Example #24
0
    def test_split_by_contigs_with_split(self):
        # test to make sure the refWindows work when chunks == # refs
        ds3 = DataSet(data.getBam())
        dss = ds3.split(contigs=True)
        self.assertEqual(len(dss), 12)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        # not all references have something mapped to them, refWindows doesn't
        # care...
        self.assertNotEqual(refWindows, sorted(ds3.refWindows))
        random_few = [('C.beijerinckii.13', 0, 1433),
                      ('B.vulgatus.4', 0, 1449),
                      ('E.faecalis.1', 0, 1482)]
        for reference in random_few:
            found = False
            for ref in refWindows:
                if ref == reference:
                    found = True
            self.assertTrue(found)
        old_refWindows = refWindows

        dss = ds3.split(contigs=True, chunks=1)
        self.assertEqual(len(dss), 1)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        self.assertEqual(refWindows, old_refWindows)

        dss = ds3.split(contigs=True, chunks=24)
        self.assertEqual(len(dss), 24)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))

        random_few = [('E.faecalis.2', 0, 741),
                      ('E.faecalis.2', 741, 1482)]
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            if not found:
                log.debug(ref)
            self.assertTrue(found)

        dss = ds3.split(contigs=True, chunks=36)
        self.assertEqual(len(dss), 36)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        random_few = [('E.faecalis.2', 0, 494),
                      ('E.faecalis.2', 494, 988),
                      ('E.faecalis.2', 988, 1482)]
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)
Example #25
0
 def test_refWindows(self):
     ds = AlignmentSet(data.getBam())
     dss = ds.split(chunks=2, contigs=True)
     self.assertEqual(len(dss), 2)
     log.debug(dss[0].filters)
     log.debug(dss[1].filters)
     self.assertTrue('( rname = E.faecalis.2 ' in str(dss[0].filters)
                     or '( rname = E.faecalis.2 ' in str(dss[1].filters))
     ds = AlignmentSet(data.getBam())
     ds.filters.addRequirement(rname=[('=', 'E.faecalis.2'),
                                      ('=', 'E.faecalis.2')],
                               tStart=[('<', '99'), ('<', '299')],
                               tEnd=[('>', '0'), ('>', '100')])
     self.assertEqual(
         str(ds.filters), '( rname = E.faecalis.2 AND tstart '
         '< 99 AND tend > 0 ) OR ( rname = '
         'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
     self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
                                      ('E.faecalis.2', 100, 299)])
Example #26
0
    def test_reads_in_range(self):
        ds = DataSet(data.getBam())
        refNames = ds.refNames

        rn = refNames[15]
        reads = ds.readsInRange(rn, 10, 100)
        self.assertEqual(len(list(reads)), 10)

        ds2 = DataSet(data.getBam(0))
        reads = ds2.readsInRange("E.faecalis.1", 0, 1400)
        self.assertEqual(len(list(reads)), 20)

        lengths = ds.refLengths
        for rname, rId in ds.refInfo('ID'):
            rn = ds._idToRname(rId)
            self.assertEqual(rname, rn)
            rlen = lengths[rn]
            self.assertEqual(len(list(ds.readsInReference(rn))),
                             len(list(ds.readsInReference(rId))))
            self.assertEqual(len(list(ds.readsInRange(rn, 0, rlen))),
                             len(list(ds.readsInRange(rId, 0, rlen))))
Example #27
0
    def test_split_by_contigs_with_split(self):
        # test to make sure the refWindows work when chunks == # refs
        ds3 = AlignmentSet(data.getBam())
        dss = ds3.split(contigs=True)
        self.assertEqual(len(dss), 12)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        # not all references have something mapped to them, refWindows doesn't
        # care...
        self.assertNotEqual(refWindows, sorted(ds3.refWindows))
        random_few = [('C.beijerinckii.13', 0, 1433),
                      ('B.vulgatus.4', 0, 1449), ('E.faecalis.1', 0, 1482)]
        for reference in random_few:
            found = False
            for ref in refWindows:
                if ref == reference:
                    found = True
            self.assertTrue(found)
        old_refWindows = refWindows

        dss = ds3.split(contigs=True, chunks=1)
        self.assertEqual(len(dss), 1)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        self.assertEqual(refWindows, old_refWindows)

        dss = ds3.split(contigs=True, chunks=24)
        self.assertEqual(len(dss), 24)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))

        random_few = [('E.faecalis.2', 0, 741), ('E.faecalis.2', 741, 1482)]
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            if not found:
                log.debug(ref)
            self.assertTrue(found)

        dss = ds3.split(contigs=True, chunks=36)
        self.assertEqual(len(dss), 36)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        random_few = [('E.faecalis.2', 0, 494), ('E.faecalis.2', 494, 988),
                      ('E.faecalis.2', 988, 1482)]
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)
Example #28
0
    def test_reads_in_range(self):
        ds = DataSet(data.getBam())
        refNames = ds.refNames

        rn = refNames[15]
        reads = ds.readsInRange(rn, 10, 100)
        self.assertEqual(len(list(reads)), 10)

        ds2 = DataSet(data.getBam(0))
        reads = ds2.readsInRange("E.faecalis.1", 0, 1400)
        self.assertEqual(len(list(reads)), 20)

        lengths = ds.refLengths
        for rname, rId in ds.refInfo('ID'):
            rn = ds._idToRname(rId)
            self.assertEqual(rname, rn)
            rlen = lengths[rn]
            self.assertEqual(len(list(ds.readsInReference(rn))),
                             len(list(ds.readsInReference(rId))))
            self.assertEqual(len(list(ds.readsInRange(rn, 0, rlen))),
                             len(list(ds.readsInRange(rId, 0, rlen))))
Example #29
0
 def test_copy(self):
     ds1 = DataSet(data.getXml())
     ds2 = ds1.copy()
     self.assertFalse(ds1 == ds2)
     self.assertFalse(ds1.uuid == ds2.uuid)
     self.assertFalse(ds1 is ds2)
     self.assertTrue(ds1.name == ds2.name)
     self.assertTrue(ds1.externalResources == ds2.externalResources)
     # The name and UniqueId are different:
     self.assertFalse(ds1.objMetadata == ds2.objMetadata)
     self.assertTrue(ds1.filters == ds2.filters)
     self.assertTrue(ds1.subdatasets == ds2.subdatasets)
     self.assertTrue(len(ds1.subdatasets) == 2)
     self.assertTrue(len(ds2.subdatasets) == 2)
     assert not reduce(lambda x, y: x or y,
                       [ds1d is ds2d for ds1d in
                        ds1.subdatasets for ds2d in
                        ds2.subdatasets])
     # TODO: once simulated files are indexable, turn on strict:
     ds1 = SubreadSet(data.getXml(no=10), strict=False)
     self.assertEquals(type(ds1.metadata).__name__,
                       'SubreadSetMetadata')
     ds2 = ds1.copy()
     self.assertEquals(type(ds2.metadata).__name__,
                       'SubreadSetMetadata')
     # Let's try casting
     ds1 = DataSet(data.getBam())
     self.assertEquals(type(ds1).__name__,
                       'DataSet')
     ds1 = ds1.copy(asType='SubreadSet')
     self.assertEquals(type(ds1).__name__,
                       'SubreadSet')
     # Let's do some illicit casting
     with self.assertRaises(TypeError):
         ds1 = ds1.copy(asType='ReferenceSet')
     # Let's try not having to cast
     ds1 = SubreadSet(data.getBam())
     self.assertEquals(type(ds1).__name__, 'SubreadSet')
Example #30
0
 def test_build(self):
     # Programs like pbalign provide a .bam file:
     # e.g. d = DataSet("aligned.bam")
     # Something like the test files we have:
     inBam = data.getBam()
     self.assertTrue(inBam.endswith('.bam'))
     d = DataSet(inBam)
     # A UniqueId is generated, despite being a BAM input
     self.assertTrue(d.uuid != '')
     dOldUuid = d.uuid
     # They can write this BAM to an XML:
     # e.g. d.write("alignmentset.xml")
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outXml = os.path.join(outdir, 'tempfile.xml')
     d.write(outXml)
     # And then recover the same XML (or a different one):
     # e.g. d = DataSet("alignmentset.xml")
     d = DataSet(outXml)
     # The UniqueId will be the same
     self.assertTrue(d.uuid == dOldUuid)
     # Inputs can be many and varied
     ds1 = DataSet(data.getXml(11), data.getBam())
     self.assertEquals(ds1.numExternalResources, 2)
     ds1 = DataSet(data.getFofn())
     self.assertEquals(ds1.numExternalResources, 2)
     # New! Use the correct constructor:
     self.assertEquals(type(SubreadSet(data.getSubreadSet())).__name__,
                       'SubreadSet')
     # Even with untyped inputs
     self.assertTrue(str(SubreadSet(data.getBam())).startswith(
         '<SubreadSet'))
     self.assertEquals(type(SubreadSet(data.getBam())).__name__,
                       'SubreadSet')
     self.assertEquals(type(DataSet(data.getBam())).__name__,
                       'DataSet')
     # You can also cast up and down, but casting between siblings
     # is limited (abuse at your own risk)
     self.assertEquals(
         type(DataSet(data.getBam()).copy(asType='SubreadSet')).__name__,
         'SubreadSet')
     self.assertEquals(
         type(SubreadSet(data.getBam()).copy(asType='DataSet')).__name__,
         'DataSet')
     # Add external Resources:
     ds = DataSet()
     ds.externalResources.addResources(["IdontExist.bam"])
     self.assertTrue(
         ds.externalResources[-1].resourceId == "IdontExist.bam")
     # Add an index file
     ds.externalResources[-1].addIndices(["IdontExist.bam.pbi"])
     self.assertTrue(
         ds.externalResources[-1].indices[0].resourceId ==
         "IdontExist.bam.pbi")
Example #31
0
    def test_referenceInfoTableMerging(self):
        log.info("Testing refIds, etc. after merging")
        ds = DataSet(data.getXml(17))
        also_lambda = ds.toExternalFiles()[0]
        aln = AlignmentSet(data.getBam(0), data.getBam(0), also_lambda)
        readers = aln.resourceReaders()

        ids = sorted([i for _, i in aln.refInfo('ID')])
        self.assertEqual(range(len(ids)), ids)

        accNames = aln.refNames
        expNames = reduce(np.append,
                          [reader.referenceInfoTable['Name']
                           for reader in readers])
        expNames = np.unique(expNames)
        self.assertEqual(sorted(expNames), sorted(accNames))

        accNames = aln.fullRefNames
        expNames = reduce(np.append,
                          [reader.referenceInfoTable['FullName']
                           for reader in readers])
        expNames = np.unique(expNames)
        self.assertEqual(sorted(expNames), sorted(accNames))
Example #32
0
    def test_referenceInfoTableMerging(self):
        log.info("Testing refIds, etc. after merging")
        ds = DataSet(data.getXml(17))
        also_lambda = ds.toExternalFiles()[0]
        aln = AlignmentSet(data.getBam(0), data.getBam(0), also_lambda)
        readers = aln.resourceReaders()

        ids = sorted([i for _, i in aln.refInfo('ID')])
        self.assertEqual(range(len(ids)), ids)

        accNames = aln.refNames
        expNames = reduce(
            np.append,
            [reader.referenceInfoTable['Name'] for reader in readers])
        expNames = np.unique(expNames)
        self.assertEqual(sorted(expNames), sorted(accNames))

        accNames = aln.fullRefNames
        expNames = reduce(
            np.append,
            [reader.referenceInfoTable['FullName'] for reader in readers])
        expNames = np.unique(expNames)
        self.assertEqual(sorted(expNames), sorted(accNames))
Example #33
0
 def test_refWindows(self):
     ds = DataSet(data.getBam())
     dss = ds.split(chunks=2, contigs=True)
     self.assertEqual(len(dss), 2)
     log.debug(dss[0].filters)
     log.debug(dss[1].filters)
     self.assertTrue(
         '( rname = E.faecalis.2 ) '
         in str(dss[0].filters)
         or
         '( rname = E.faecalis.2 ) '
         in str(dss[1].filters))
     ds = DataSet(data.getBam())
     ds.filters.addRequirement(rname=[('=', 'lambda_NEB3011'),
                                      ('=', 'lambda_NEB3011')],
                               tStart=[('<', '0'),
                                       ('<', '100')],
                               tEnd=[('>', '99'),
                                     ('>', '299')])
     self.assertEqual(str(ds.filters),
                      '( rname = lambda_NEB3011 AND tstart '
                      '< 0 AND tend > 99 ) OR ( rname = lambd'
                      'a_NEB3011 AND tstart < 100 AND tend > 299 )')
Example #34
0
 def test_factory_function(self):
     bam = data.getBam()
     aln = data.getXml(8)
     ref = data.getXml(9)
     sub = data.getXml(10)
     inTypes = [bam, aln, ref, sub]
     expTypes = [DataSet, AlignmentSet, ReferenceSet, SubreadSet]
     for infn, exp in zip(inTypes, expTypes):
         # TODO enable this for all when simulated subread files can be
         # pbi'd
         if exp in [DataSet, ReferenceSet, AlignmentSet]:
             ds = openDataSet(infn, strict=True)
         else:
             ds = openDataSet(infn)
         self.assertEqual(type(ds), exp)
Example #35
0
 def test_factory_function(self):
     bam = data.getBam()
     aln = data.getXml(8)
     ref = data.getXml(9)
     sub = data.getXml(10)
     inTypes = [bam, aln, ref, sub]
     expTypes = [DataSet, AlignmentSet, ReferenceSet, SubreadSet]
     for infn, exp in zip(inTypes, expTypes):
         # TODO enable this for all when simulated subread files can be
         # pbi'd
         if exp in [DataSet, ReferenceSet, AlignmentSet]:
             ds = openDataSet(infn, strict=True)
         else:
             ds = openDataSet(infn)
         self.assertEqual(type(ds), exp)
Example #36
0
 def test_build(self):
     # Programs like pbalign provide a .bam file:
     # e.g. d = DataSet("aligned.bam")
     # Something like the test files we have:
     inBam = data.getBam()
     self.assertTrue(inBam.endswith('.bam'))
     d = DataSet(inBam)
     # A UniqueId is generated, despite being a BAM input
     self.assertTrue(d.uuid != '')
     dOldUuid = d.uuid
     # They can write this BAM to an XML:
     # e.g. d.write("alignmentset.xml")
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outXml = os.path.join(outdir, 'tempfile.xml')
     d.write(outXml)
     # And then recover the same XML (or a different one):
     # e.g. d = DataSet("alignmentset.xml")
     d = DataSet(outXml)
     # The UniqueId will be the same
     self.assertTrue(d.uuid == dOldUuid)
     # Inputs can be many and varied
     ds1 = DataSet(data.getXml(11), data.getBam())
     self.assertEquals(ds1.numExternalResources, 2)
     ds1 = DataSet(data.getFofn())
     self.assertEquals(ds1.numExternalResources, 2)
     # New! Use the correct constructor:
     self.assertEquals(
         type(SubreadSet(data.getSubreadSet())).__name__, 'SubreadSet')
     # Even with untyped inputs
     self.assertTrue(
         str(SubreadSet(data.getBam())).startswith('<SubreadSet'))
     self.assertEquals(
         type(SubreadSet(data.getBam())).__name__, 'SubreadSet')
     self.assertEquals(type(DataSet(data.getBam())).__name__, 'DataSet')
     # You can also cast up and down, but casting between siblings
     # is limited (abuse at your own risk)
     self.assertEquals(
         type(DataSet(data.getBam()).copy(asType='SubreadSet')).__name__,
         'SubreadSet')
     self.assertEquals(
         type(SubreadSet(data.getBam()).copy(asType='DataSet')).__name__,
         'DataSet')
     # Add external Resources:
     ds = DataSet()
     ds.externalResources.addResources(["IdontExist.bam"])
     self.assertTrue(
         ds.externalResources[-1].resourceId == "IdontExist.bam")
     # Add an index file
     ds.externalResources[-1].addIndices(["IdontExist.bam.pbi"])
     self.assertTrue(ds.externalResources[-1].indices[0].resourceId ==
                     "IdontExist.bam.pbi")
Example #37
0
    def test_refLengths(self):
        ds = DataSet(data.getBam(0))
        random_few = {'B.cereus.6': 1472, 'S.agalactiae.1': 1470,
                      'B.cereus.4': 1472}
        for key, value in random_few.items():
            self.assertEqual(ds.refLengths[key], value)

        # this is a hack to only emit refNames that actually have records
        # associated with them:
        dss = ds.split(contigs=True, chunks=1)[0]
        self.assertEqual(dss.refLengths, {'B.vulgatus.4': 1449,
                                          'B.vulgatus.5': 1449,
                                          'C.beijerinckii.13': 1433,
                                          'C.beijerinckii.14': 1433,
                                          'C.beijerinckii.9': 1433,
                                          'E.coli.6': 1463,
                                          'E.faecalis.1': 1482,
                                          'E.faecalis.2': 1482,
                                          'R.sphaeroides.1': 1386,
                                          'S.epidermidis.2': 1472,
                                          'S.epidermidis.3': 1472,
                                          'S.epidermidis.4': 1472
                                         })
Example #38
0
    def test_split_by_contigs_with_split_and_maxChunks(self):
        # test to make sure the refWindows work when chunks == # refs
        ds3 = AlignmentSet(data.getBam())
        dss = ds3.split(contigs=True)
        self.assertEqual(len(dss), 12)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        # not all references have something mapped to them, refWindows doesn't
        # care...
        self.assertNotEqual(refWindows, sorted(ds3.refWindows))
        self.assertEqual(refWindows, [('B.vulgatus.4', 0, 1449),
                                      ('B.vulgatus.5', 0, 1449),
                                      ('C.beijerinckii.13', 0, 1433),
                                      ('C.beijerinckii.14', 0, 1433),
                                      ('C.beijerinckii.9', 0, 1433),
                                      ('E.coli.6', 0, 1463),
                                      ('E.faecalis.1', 0, 1482),
                                      ('E.faecalis.2', 0, 1482),
                                      ('R.sphaeroides.1', 0, 1386),
                                      ('S.epidermidis.2', 0, 1472),
                                      ('S.epidermidis.3', 0, 1472),
                                      ('S.epidermidis.4', 0, 1472)])
        old_refWindows = refWindows
        random_few = [('C.beijerinckii.13', 0, 1433),
                      ('B.vulgatus.4', 0, 1449), ('E.faecalis.1', 0, 1482)]

        dss = ds3.split(contigs=True, maxChunks=1)
        self.assertEqual(len(dss), 1)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        self.assertEqual(refWindows, old_refWindows)

        dss = ds3.split(contigs=True, maxChunks=24)
        # This isn't expected if num refs >= 100, as the map check is skipped
        # for now (too expensive)
        # There are only 12 refs represented in this set, however...
        self.assertEqual(len(dss), 12)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))

        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            if not found:
                log.debug(ref)
            self.assertTrue(found)

        # test with maxchunks but no breaking contigs
        dss = ds3.split(contigs=True, maxChunks=36)
        self.assertEqual(len(dss), 12)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        # test with maxchunks and breaking contigs is allowed (triggers
        # targetsize, may result in fewer chunks)
        dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
        self.assertEqual(len(dss), 2)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        # test with previous setup and smaller targetSize, resulting in more
        # chunks
        dss = ds3.split(contigs=True,
                        maxChunks=36,
                        breakContigs=True,
                        targetSize=10)
        self.assertEqual(len(dss), 9)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        # test with byRecords and fewer chunks than atoms
        dss = ds3.split(contigs=True, chunks=3, byRecords=True)
        self.assertEqual(len(dss), 3)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        # test with byRecords and more chunks than atoms
        orf = random_few
        random_few = [('C.beijerinckii.13', 0, 747), ('B.vulgatus.4', 0, 1449),
                      ('E.faecalis.1', 0, 742)]
        dss = ds3.split(contigs=True, chunks=16, byRecords=True)
        self.assertEqual(len(dss), 16)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        # test with byRecords and updateCounts
        random_few = orf
        dss = ds3.split(contigs=True,
                        chunks=3,
                        byRecords=True,
                        updateCounts=True)
        self.assertEqual(len(dss), 3)
        sizes = sorted([dset.numRecords for dset in dss])
        self.assertListEqual(sizes, [30, 31, 31])
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        # test with byRefLength and updateCounts
        random_few = orf
        dss = ds3.split(contigs=True, chunks=3, updateCounts=True)
        self.assertEqual(len(dss), 3)
        sizes = sorted([dset.numRecords for dset in dss])
        self.assertListEqual(sizes, [20, 24, 48])
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)
Example #39
0
 def test_repr(self):
     ds = DataSet(data.getBam())
     rep = str(ds)
     self.assertTrue(re.search('DataSet', rep))
     self.assertTrue(re.search('uuid:', rep))
     self.assertTrue(re.search('pbalchemysim0.pbalign.bam', rep))
Example #40
0
    def test_stats_metadata(self):
        ds = DataSet(data.getBam())
        ds.loadStats(data.getStats())
        self.assertEqual(ds.metadata.summaryStats.prodDist.numBins, 4)
        self.assertEqual(ds.metadata.summaryStats.prodDist.bins,
                         [1576, 901, 399, 0])
        ds1 = DataSet(data.getXml(8))
        ds1.loadStats(data.getStats())
        ds2 = DataSet(data.getXml(11))
        ds2.loadStats(data.getStats())
        ds3 = ds1 + ds2
        self.assertEqual(ds1.metadata.summaryStats.prodDist.bins,
                         [1576, 901, 399, 0])
        self.assertEqual(ds2.metadata.summaryStats.prodDist.bins,
                         [1576, 901, 399, 0])
        self.assertEqual(ds3.metadata.summaryStats.prodDist.bins,
                         [3152, 1802, 798, 0])
        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                         [0, 62, 39, 36, 29, 37, 19, 29, 37, 32, 32, 40, 45,
                          54, 73, 77, 97, 95, 49, 17, 2, 0, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, 0])
        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                         [0, 62, 39, 36, 29, 37, 19, 29, 37, 32, 32, 40, 45,
                          54, 73, 77, 97, 95, 49, 17, 2, 0, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, 0])
        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                         [0, 124, 78, 72, 58, 74, 38, 58, 74, 64, 64, 80, 90,
                          108, 146, 154, 194, 190, 98, 34, 4, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, 0, 0])
        # Let's check some manual values
        ds1 = DataSet(data.getXml(8))
        ds1.loadStats(data.getStats())
        ds2 = DataSet(data.getXml(11))
        ds2.loadStats(data.getStats())
        ds1.metadata.summaryStats.readLenDist.bins = (
            [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                         [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
        ds1.metadata.summaryStats.readLenDist.minBinValue = 10
        ds1.metadata.summaryStats.readLenDist.binWidth = 10
        ds2.metadata.summaryStats.readLenDist.bins = (
            [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                         [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
        ds2.metadata.summaryStats.readLenDist.minBinValue = 20
        ds2.metadata.summaryStats.readLenDist.binWidth = 10
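        # ds2's minBinValue is one binWidth greater, so its bins merge into
        # ds1's at an offset of one position: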
        ds3 = ds1 + ds2
        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                         [0, 10, 10, 9, 8, 7, 5, 3, 2, 1, 0, 1, 1])
        # now let's swap
        ds1 = DataSet(data.getXml(8))
        ds1.loadStats(data.getStats())
        ds2 = DataSet(data.getXml(11))
        ds2.loadStats(data.getStats())
        ds1.metadata.summaryStats.readLenDist.bins = (
            [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                         [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
        ds1.metadata.summaryStats.readLenDist.minBinValue = 20
        ds1.metadata.summaryStats.readLenDist.binWidth = 10
        ds2.metadata.summaryStats.readLenDist.bins = (
            [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                         [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
        ds2.metadata.summaryStats.readLenDist.minBinValue = 10
        ds2.metadata.summaryStats.readLenDist.binWidth = 10
        ds3 = ds1 + ds2
        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                         [0, 1, 11, 10, 9, 8, 7, 5, 3, 1, 0, 1, 1])

        # now let's do some non-overlapping
        ds1 = DataSet(data.getXml(8))
        ds1.loadStats(data.getStats())
        ds2 = DataSet(data.getXml(11))
        ds2.loadStats(data.getStats())
        ds1.metadata.summaryStats.readLenDist.bins = (
            [1, 1, 1])
        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                         [1, 1, 1])
        ds1.metadata.summaryStats.readLenDist.minBinValue = 10
        ds1.metadata.summaryStats.readLenDist.binWidth = 10
        ds2.metadata.summaryStats.readLenDist.bins = (
            [2, 2, 2])
        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                         [2, 2, 2])
        ds2.metadata.summaryStats.readLenDist.minBinValue = 50
        ds2.metadata.summaryStats.readLenDist.binWidth = 10
        ds3 = ds1 + ds2
        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                         [1, 1, 1, 0, 2, 2, 2])

        # now let's test the subdataset metadata retention:
        ss = SubreadSet(data.getXml(10))
        ss.loadStats(data.getStats(0))
        ss.loadStats(data.getStats(1))
        self.assertEqual(153168.0, ss.metadata.summaryStats.numSequencingZmws)
        self.assertEqual(
            2876.0, ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
        self.assertEqual(
            150292.0, ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
Example #41
0
 def test_resourceReaders(self):
     ds = DataSet(data.getBam())
     for seqFile in ds.resourceReaders():
         self.assertEqual(len([row for row in seqFile]), 92)
Example #42
0
 def test_ref_names(self):
     ds = DataSet(data.getBam())
     refNames = ds.refNames
     self.assertEqual(sorted(refNames)[0], 'A.baumannii.1')
     self.assertEqual(len(refNames), 59)
Example #43
0
 def test_resourceReaders(self):
     ds = DataSet(data.getBam())
     for seqFile in ds.resourceReaders():
         self.assertEqual(len([row for row in seqFile]), 92)
Example #44
0
 def test_nonempty_metatype(self):
     inBam = data.getBam()
     d = AlignmentSet(inBam)
     for extRes in d.externalResources:
         self.assertEqual(extRes.metaType,
                          "PacBio.SubreadFile.SubreadBamFile")
Example #45
0
    def test_split_by_contigs_with_split_and_maxChunks(self):
        # test to make sure the refWindows work when chunks == # refs
        ds3 = DataSet(data.getBam())
        dss = ds3.split(contigs=True)
        self.assertEqual(len(dss), 12)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        # not all references have something mapped to them, refWindows doesn't
        # care...
        self.assertNotEqual(refWindows, sorted(ds3.refWindows))
        random_few = [('C.beijerinckii.13', 0, 1433),
                      ('B.vulgatus.4', 0, 1449), ('E.faecalis.1', 0, 1482)]
        for reference in random_few:
            found = False
            for ref in refWindows:
                if ref == reference:
                    found = True
            self.assertTrue(found)
        old_refWindows = refWindows

        dss = ds3.split(contigs=True, maxChunks=1)
        self.assertEqual(len(dss), 1)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        self.assertEqual(refWindows, old_refWindows)

        dss = ds3.split(contigs=True, maxChunks=24)
        # This isn't expected if num refs >= 100, as the map check is skipped
        # for now (too expensive)
        # There are only 12 refs represented in this set, however...
        self.assertEqual(len(dss), 12)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))

        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            if not found:
                log.debug(ref)
            self.assertTrue(found)

        dss = ds3.split(contigs=True, maxChunks=36)
        self.assertEqual(len(dss), 12)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
        self.assertEqual(len(dss), 2)
        refWindows = sorted(
            reduce(lambda x, y: x + y, [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)
Example #46
0
 def test_empty_metatype(self):
     inBam = data.getBam()
     d = DataSet(inBam)
     for extRes in d.externalResources:
         self.assertEqual(extRes.metaType, "")
Example #47
0
    def test_split_by_contigs_with_split_and_maxChunks(self):
        # test to make sure the refWindows work when chunks == # refs
        ds3 = AlignmentSet(data.getBam())
        dss = ds3.split(contigs=True)
        self.assertEqual(len(dss), 12)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        # not all references have something mapped to them, refWindows doesn't
        # care...
        self.assertNotEqual(refWindows, sorted(ds3.refWindows))
        self.assertEqual(refWindows,
            [('B.vulgatus.4', 0, 1449), ('B.vulgatus.5', 0, 1449),
             ('C.beijerinckii.13', 0, 1433), ('C.beijerinckii.14', 0, 1433),
             ('C.beijerinckii.9', 0, 1433), ('E.coli.6', 0, 1463),
             ('E.faecalis.1', 0, 1482), ('E.faecalis.2', 0, 1482),
             ('R.sphaeroides.1', 0, 1386), ('S.epidermidis.2', 0, 1472),
             ('S.epidermidis.3', 0, 1472), ('S.epidermidis.4', 0, 1472)])
        old_refWindows = refWindows
        random_few = [('C.beijerinckii.13', 0, 1433),
                      ('B.vulgatus.4', 0, 1449),
                      ('E.faecalis.1', 0, 1482)]

        dss = ds3.split(contigs=True, maxChunks=1)
        self.assertEqual(len(dss), 1)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        self.assertEqual(refWindows, old_refWindows)

        dss = ds3.split(contigs=True, maxChunks=24)
        # This isn't expected if num refs >= 100, as the map check is skipped
        # for now (too expensive)
        # There are only 12 refs represented in this set, however...
        self.assertEqual(len(dss), 12)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))

        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            if not found:
                log.debug(ref)
            self.assertTrue(found)

        # test with maxchunks but no breaking contigs
        dss = ds3.split(contigs=True, maxChunks=36)
        self.assertEqual(len(dss), 12)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        # test with maxchunks and breaking contigs is allowed (triggers
        # targetsize, may result in fewer chunks)
        dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
        self.assertEqual(len(dss), 2)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        # test with previous setup and smaller targetSize, resulting in more
        # chunks
        dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True,
                        targetSize=10)
        self.assertEqual(len(dss), 9)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        # test with byRecords and fewer chunks than atoms
        dss = ds3.split(contigs=True, chunks=3, byRecords=True)
        self.assertEqual(len(dss), 3)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        # test with byRecords and more chunks than atoms
        orf = random_few
        random_few = [('C.beijerinckii.13', 0, 747),
                      ('B.vulgatus.4', 0, 1449),
                      ('E.faecalis.1', 0, 742)]
        dss = ds3.split(contigs=True, chunks=16, byRecords=True)
        self.assertEqual(len(dss), 16)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        # test with byRecords and updateCounts
        random_few = orf
        dss = ds3.split(contigs=True, chunks=3, byRecords=True,
                        updateCounts=True)
        self.assertEqual(len(dss), 3)
        sizes = sorted([dset.numRecords for dset in dss])
        self.assertListEqual(sizes, [30, 31, 31])
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        # test with byRefLength and updateCounts
        random_few = orf
        dss = ds3.split(contigs=True, chunks=3, updateCounts=True)
        self.assertEqual(len(dss), 3)
        sizes = sorted([dset.numRecords for dset in dss])
        self.assertListEqual(sizes, [20, 24, 48])
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)
Example #48
0
 def test_empty_metatype(self):
     inBam = data.getBam()
     d = DataSet(inBam)
     for extRes in d.externalResources:
         self.assertEqual(extRes.metaType, "")
Example #49
0
    def test_split_by_contigs_with_split_and_maxChunks(self):
        # test to make sure the refWindows work when chunks == # refs
        ds3 = DataSet(data.getBam())
        dss = ds3.split(contigs=True)
        self.assertEqual(len(dss), 12)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        # not all references have something mapped to them; refWindows doesn't
        # care...
        self.assertNotEqual(refWindows, sorted(ds3.refWindows))
        random_few = [('C.beijerinckii.13', 0, 1433),
                      ('B.vulgatus.4', 0, 1449),
                      ('E.faecalis.1', 0, 1482)]
        for reference in random_few:
            found = False
            for ref in refWindows:
                if ref == reference:
                    found = True
            self.assertTrue(found)
        old_refWindows = refWindows

        dss = ds3.split(contigs=True, maxChunks=1)
        self.assertEqual(len(dss), 1)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        self.assertEqual(refWindows, old_refWindows)
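        # maxChunks=1 collapses everything into a single chunk, but the union
        # of refWindows is unchanged.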

        dss = ds3.split(contigs=True, maxChunks=24)
        # This wouldn't be expected if num refs >= 100, as the mapping check
        # isn't made in that case for now (too expensive).
        # There are only 12 refs represented in this set, however...
        self.assertEqual(len(dss), 12)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))

        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            if not found:
                log.debug(ref)
            self.assertTrue(found)

        dss = ds3.split(contigs=True, maxChunks=36)
        self.assertEqual(len(dss), 12)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)

        dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
        self.assertEqual(len(dss), 2)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        for ref in random_few:
            found = False
            for window in refWindows:
                if ref == window:
                    found = True
            self.assertTrue(found)
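
A minimal sketch (path hypothetical) of consuming per-contig chunks the way this
test exercises them:

from pbcore.io import AlignmentSet

aset = AlignmentSet("/path/to/aligned.bam")           # hypothetical input
for chunk in aset.split(contigs=True, maxChunks=8):
    for refName, start, end in chunk.refWindows:
        # each window spans the mapped region of one reference in this chunk
        print(refName, start, end)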
Example #50
0
    def test_stats_metadata(self):
        ds = DataSet(data.getBam())
        ds.loadStats(data.getStats())
        self.assertEqual(ds.metadata.summaryStats.prodDist.numBins, 4)
        self.assertEqual(ds.metadata.summaryStats.prodDist.bins,
                         [1576, 901, 399, 0])
        ds1 = DataSet(data.getXml(8))
        ds1.loadStats(data.getStats())
        ds2 = DataSet(data.getXml(11))
        ds2.loadStats(data.getStats())
        ds3 = ds1 + ds2
        self.assertEqual(ds1.metadata.summaryStats.prodDist.bins,
                         [1576, 901, 399, 0])
        self.assertEqual(ds2.metadata.summaryStats.prodDist.bins,
                         [1576, 901, 399, 0])
        self.assertEqual(ds3.metadata.summaryStats.prodDist.bins,
                         [3152, 1802, 798, 0])
        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins, [
            0, 62, 39, 36, 29, 37, 19, 29, 37, 32, 32, 40, 45, 54, 73, 77, 97,
            95, 49, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ])
        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins, [
            0, 62, 39, 36, 29, 37, 19, 29, 37, 32, 32, 40, 45, 54, 73, 77, 97,
            95, 49, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ])
        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins, [
            0, 124, 78, 72, 58, 74, 38, 58, 74, 64, 64, 80, 90, 108, 146, 154,
            194, 190, 98, 34, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ])
        # Let's check some manual values
        ds1 = DataSet(data.getXml(8))
        ds1.loadStats(data.getStats())
        ds2 = DataSet(data.getXml(11))
        ds2.loadStats(data.getStats())
        ds1.metadata.summaryStats.readLenDist.bins = [
            0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1
        ]
        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                         [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
        ds1.metadata.summaryStats.readLenDist.minBinValue = 10
        ds1.metadata.summaryStats.readLenDist.binWidth = 10
        ds2.metadata.summaryStats.readLenDist.bins = [
            0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1
        ]
        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                         [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
        ds2.metadata.summaryStats.readLenDist.minBinValue = 20
        ds2.metadata.summaryStats.readLenDist.binWidth = 10
        ds3 = ds1 + ds2
        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                         [0, 10, 10, 9, 8, 7, 5, 3, 2, 1, 0, 1, 1])
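        # ds2's minBinValue (20) is one binWidth above ds1's (10), so ds2's
        # bins are added starting at index 1 of ds1's bins; the trailing 1
        # comes from ds2 alone.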
        # now let's swap the minBinValue offsets
        ds1 = DataSet(data.getXml(8))
        ds1.loadStats(data.getStats())
        ds2 = DataSet(data.getXml(11))
        ds2.loadStats(data.getStats())
        ds1.metadata.summaryStats.readLenDist.bins = [
            0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1
        ]
        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                         [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
        ds1.metadata.summaryStats.readLenDist.minBinValue = 20
        ds1.metadata.summaryStats.readLenDist.binWidth = 10
        ds2.metadata.summaryStats.readLenDist.bins = [
            0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1
        ]
        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                         [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
        ds2.metadata.summaryStats.readLenDist.minBinValue = 10
        ds2.metadata.summaryStats.readLenDist.binWidth = 10
        ds3 = ds1 + ds2
        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                         [0, 1, 11, 10, 9, 8, 7, 5, 3, 1, 0, 1, 1])
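        # Same merge as above with the offsets reversed; the merged
        # distribution now starts at ds2's minBinValue (10).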

        # now let's try some non-overlapping distributions
        ds1 = DataSet(data.getXml(8))
        ds1.loadStats(data.getStats())
        ds2 = DataSet(data.getXml(11))
        ds2.loadStats(data.getStats())
        ds1.metadata.summaryStats.readLenDist.bins = [1, 1, 1]
        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins, [1, 1, 1])
        ds1.metadata.summaryStats.readLenDist.minBinValue = 10
        ds1.metadata.summaryStats.readLenDist.binWidth = 10
        ds2.metadata.summaryStats.readLenDist.bins = [2, 2, 2]
        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins, [2, 2, 2])
        ds2.metadata.summaryStats.readLenDist.minBinValue = 50
        ds2.metadata.summaryStats.readLenDist.binWidth = 10
        ds3 = ds1 + ds2
        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                         [1, 1, 1, 0, 2, 2, 2])
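        # The distributions don't overlap: ds1 covers [10, 40) and ds2 covers
        # [50, 80), so a zero bin fills the [40, 50) gap.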

        # now let's test the subdataset metadata retention:
        ss = SubreadSet(data.getXml(10))
        ss.loadStats(data.getStats(0))
        ss.loadStats(data.getStats(1))
        self.assertEqual(153168.0, ss.metadata.summaryStats.numSequencingZmws)
        self.assertEqual(
            2876.0, ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
        self.assertEqual(
            150292.0,
            ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
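
A minimal sketch (paths hypothetical) of attaching sts.xml summary statistics to a
dataset, as exercised above:

from pbcore.io import SubreadSet

sset = SubreadSet("/path/to/subreadset.xml")   # hypothetical input
sset.loadStats("/path/to/movie.sts.xml")       # hypothetical stats file
print(sset.metadata.summaryStats.numSequencingZmws)
print(sset.metadata.summaryStats.readLenDist.bins)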
Example #51
0
 def test_nonempty_metatype(self):
     inBam = data.getBam()
     d = AlignmentSet(inBam)
     for extRes in d.externalResources:
         self.assertEqual(extRes.metaType,
                          "PacBio.SubreadFile.SubreadBamFile")
Example #52
0
 def test_ref_names(self):
     ds = DataSet(data.getBam())
     refNames = ds.refNames
     self.assertEqual(sorted(refNames)[0], 'A.baumannii.1')
     self.assertEqual(len(refNames), 59)
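
A minimal sketch (path hypothetical) of walking the reference names the way this
example inspects them:

from pbcore.io import DataSet

ds = DataSet("/path/to/aligned.bam")   # hypothetical input
for name in sorted(ds.refNames)[:5]:
    # readsInReference yields the records mapped to the named reference
    print(name, len(list(ds.readsInReference(name))))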