Example #1
0
class TestIndexedBam(_IndexedAlnFileReaderTests):
    """Tests for IndexedBamReader; inherits from _IndexedAlnFileReaderTests."""
    READER_CONSTRUCTOR = IndexedBamReader
    CONSTRUCTOR_ARGS = (data.getBamAndCmpH5()[0], data.getLambdaFasta())

    def test_empty_bam(self):
        """An IndexedBamReader on the empty BAM reports a length of zero."""
        fn = data.getEmptyBam()
        # Open the reader as a context manager, like the other tests in this
        # class, so it is closed even when the assertion fails.
        with IndexedBamReader(fn) as bam:
            EQ(len(bam), 0)

    def test_alignment_identity(self):
        """
        Check that the values of the 'identity' property are consistent
        between IndexedBamReader (numpy array) and BamAlignment (float)
        """
        fn = data.getBamAndCmpH5()[0]
        with IndexedBamReader(fn) as bam_in:
            i1 = bam_in.identity
            i2 = np.array([rec.identity for rec in bam_in])
            EQ((i2 == i1).all(), True)

    def test_alignment_identity_unindexed(self):
        """
        Check that the value of the 'identity' property is the same whether
        or not the .pbi index was used to calculate it.
        """
        fn1 = data.getBamAndCmpH5()[0]
        # Keep the NamedTemporaryFile object alive for the whole test.  Taking
        # only its ``.name`` lets the object be discarded (deleting the file)
        # right away; the copy would then recreate the path and nothing would
        # ever remove it.  Here the copy writes over the existing temp file
        # and leaving the with-block deletes it.
        with tempfile.NamedTemporaryFile(suffix=".bam") as tmp_bam:
            fn2 = tmp_bam.name
            shutil.copyfile(fn1, fn2)
            with IndexedBamReader(fn1) as bam_pbi:
                with BamReader(fn2) as bam_noindex:
                    i1 = np.array([rec.identity for rec in bam_pbi])
                    i2 = np.array([rec.identity for rec in bam_noindex])
                    EQ((i2 == i1).all(), True)

    def test_len_h5(self):
        """
        Check len(), _length, totalLength and numRecords for an HdfSubreadSet
        and for an AlignmentSet built from a cmp.h5 file, including that
        updateCounts() restores both counts after they are overwritten.
        """
        # NOTE(review): relies on self.assertEqual being available on this
        # class -- confirm the base class provides it.
        def check_counts(dset, num_records, total_length):
            # Counts as first reported by the dataset.
            self.assertEqual(len(dset), num_records)
            self.assertEqual(dset._length, (num_records, total_length))
            self.assertEqual(dset.totalLength, total_length)
            self.assertEqual(dset.numRecords, num_records)
            # Overwrite both counts and confirm the new values are reported.
            dset.totalLength = -1
            dset.numRecords = -1
            self.assertEqual(dset.totalLength, -1)
            self.assertEqual(dset.numRecords, -1)
            # updateCounts() must bring the original values back.
            dset.updateCounts()
            self.assertEqual(dset.totalLength, total_length)
            self.assertEqual(dset.numRecords, num_records)

        # HdfSubreadSet
        # len means something else in bax/bas land. These numbers may actually
        # be correct...
        sset = HdfSubreadSet(data.getXml(17), strict=True)
        check_counts(sset, 9, 128093)

        # AlignmentSet with cmp.h5
        aln = AlignmentSet(upstreamData.getBamAndCmpH5()[1], strict=True)
        check_counts(aln, 112, 59970)
class TestIndexedBam(_IndexedAlnFileReaderTests):
    """Tests for IndexedBamReader; inherits from _IndexedAlnFileReaderTests."""
    READER_CONSTRUCTOR = IndexedBamReader
    CONSTRUCTOR_ARGS = (data.getBamAndCmpH5()[0], data.getLambdaFasta())

    def test_empty_bam(self):
        """An IndexedBamReader on the empty BAM reports a length of zero."""
        fn = data.getEmptyBam()
        # Use the reader as a context manager so it is closed even when the
        # assertion fails, instead of leaving it open for the rest of the run.
        with IndexedBamReader(fn) as bam:
            EQ(len(bam), 0)
Example #4
0
class TestBasicBam(_BasicAlnFileReaderTests):
    """Tests for BamReader; inherits from _BasicAlnFileReaderTests."""
    READER_CONSTRUCTOR = BamReader
    CONSTRUCTOR_ARGS = (
        data.getBamAndCmpH5()[0],
        data.getLambdaFasta(),
    )

    def testSpecVersion(self):
        """The ``version`` attribute of ``self.f`` must equal ``3.0.1``."""
        expected_version = "3.0.1"
        EQ(expected_version, self.f.version)

    def testReadScore(self):
        """Compare ``self.fwdAln.readScore`` to 0.904 through EQISH."""
        # The trailing 3 is passed straight to EQISH -- presumably the number
        # of decimal places to compare; confirm against its definition.
        expected_score = 0.904
        EQISH(expected_score, self.fwdAln.readScore, 3)
Example #5
0
 def test_alignment_identity(self):
     """
     The 'identity' array exposed by IndexedBamReader must match, element
     for element, the 'identity' value of each record yielded by iterating
     the same reader.
     """
     bam_path = data.getBamAndCmpH5()[0]
     with IndexedBamReader(bam_path) as reader:
         from_reader = reader.identity
         from_records = np.array([aln.identity for aln in reader])
         same = (from_records == from_reader)
         EQ(same.all(), True)
 def test_alignment_identity(self):
     """
     The 'identity' array exposed by IndexedBamReader must match, element
     for element, the 'identity' value of each record yielded by iterating
     the same reader.
     """
     bam_path = data.getBamAndCmpH5()[0]
     with IndexedBamReader(bam_path) as reader:
         from_reader = reader.identity
         from_records = np.array([aln.identity for aln in reader])
         same = (from_records == from_reader)
         EQ(same.all(), True)
 def test_alignment_identity_unindexed(self):
     """
     Check that the value of the 'identity' property is the same whether
     or not the .pbi index was used to calculate it.
     """
     fn1 = data.getBamAndCmpH5()[0]
     # Keep the NamedTemporaryFile object alive for the whole test.  Taking
     # only its ``.name`` lets the object be discarded (deleting the file)
     # right away; the copy would then recreate the path and nothing would
     # ever remove it.  Here the copy writes over the existing temp file and
     # leaving the with-block deletes it.
     with tempfile.NamedTemporaryFile(suffix=".bam") as tmp_bam:
         fn2 = tmp_bam.name
         shutil.copyfile(fn1, fn2)
         with IndexedBamReader(fn1) as bam_pbi:
             with BamReader(fn2) as bam_noindex:
                 i1 = np.array([rec.identity for rec in bam_pbi])
                 i2 = np.array([rec.identity for rec in bam_noindex])
                 EQ((i2 == i1).all(), True)
Example #8
0
 def test_alignment_identity_unindexed(self):
     """
     Check that the value of the 'identity' property is the same whether
     or not the .pbi index was used to calculate it.
     """
     fn1 = data.getBamAndCmpH5()[0]
     # Keep the NamedTemporaryFile object alive for the whole test.  Taking
     # only its ``.name`` lets the object be discarded (deleting the file)
     # right away; the copy would then recreate the path and nothing would
     # ever remove it.  Here the copy writes over the existing temp file and
     # leaving the with-block deletes it.
     with tempfile.NamedTemporaryFile(suffix=".bam") as tmp_bam:
         fn2 = tmp_bam.name
         shutil.copyfile(fn1, fn2)
         with IndexedBamReader(fn1) as bam_pbi:
             with BamReader(fn2) as bam_noindex:
                 i1 = np.array([rec.identity for rec in bam_pbi])
                 i2 = np.array([rec.identity for rec in bam_noindex])
                 EQ((i2 == i1).all(), True)
class TestCmpH5(_IndexedAlnFileReaderTests):
    """
    Tests for CmpH5Reader; inherits from _IndexedAlnFileReaderTests.

    Only behaviors specific to CmpH5Reader belong here, and there should
    be few of them.
    """
    READER_CONSTRUCTOR = CmpH5Reader
    CONSTRUCTOR_ARGS = (data.getBamAndCmpH5()[1],)

    def testLazyChemistryResolution(self):
        """
        CmpH5Reader can open files whose chemistry information is missing;
        an exception is thrown only once that information is accessed.
        This behavior has to be kept for compatibility.
        """
        legacy_path = data.getCmpH5()

        # Constructing the reader must not raise.
        reader = CmpH5Reader(legacy_path)

        # Access on the reader itself ...
        with assert_raises(ChemistryLookupError):
            reader.sequencingChemistry

        # ... and on its first element.
        with assert_raises(ChemistryLookupError):
            reader[0].sequencingChemistry
Example #10
0
    def __init__(self):
        """
        Open the paired BAM / cmp.h5 files from D.getBamAndCmpH5() and store
        the readers, their full record lists, the first two records of each,
        and clipped copies of those records as attributes.
        """
        bam_path, cmp_path = D.getBamAndCmpH5()
        fasta_path = D.getLambdaFasta()

        # Three readers: BAM with the lambda FASTA, cmp.h5, and BAM alone.
        self.b = PacBioBamReader(bam_path, fasta_path)
        self.c = CmpH5Reader(cmp_path)
        self.bBasic = BamReader(bam_path)

        # The two formats do not sort identically in general: on a tStart
        # tie BAM puts + alignments before - alignments, whereas cmp.h5
        # breaks the tie on tEnd.  This file has no tStart ties, so the
        # same positions in both lists can be compared.
        self.bAlns = list(self.b)
        self.bFwd = self.bAlns[0]
        self.bRev = self.bAlns[1]

        self.cAlns = list(self.c)
        self.cFwd = self.cAlns[0]
        self.cRev = self.cAlns[1]

        # Clip the forward and reverse records of both readers to the same
        # windows.
        fwd_window = (10, 60)
        rev_window = (310, 360)
        self.cFwdClipped = self.cFwd.clippedTo(*fwd_window)
        self.bFwdClipped = self.bFwd.clippedTo(*fwd_window)
        self.cRevClipped = self.cRev.clippedTo(*rev_window)
        self.bRevClipped = self.bRev.clippedTo(*rev_window)
Example #11
0
 def testIncorrectReference(self):
     """
     Constructing a BamReader from the test BAM together with the FASTA
     returned by D.getTinyFasta() must raise an exception.
     """
     bamFname, _ = D.getBamAndCmpH5()
     incorrectFasta = D.getTinyFasta()
     with assert_raises(Exception):
         # Only the constructor call matters; the result was never used, so
         # it is no longer bound to a local name.
         BamReader(bamFname, incorrectFasta)
Example #12
0
 def test_alignmentset_index(self):
     """
     For an AlignmentSet built from the cmp.h5 file: readsInRange over
     [0, 1000) of the first reference yields 2 reads, while both iterating
     the set and its index give 112 entries.
     """
     cmp_h5_path = upstreamData.getBamAndCmpH5()[1]
     aln_set = AlignmentSet(cmp_h5_path, strict=True)
     first_ref = aln_set.refNames[0]
     in_range = aln_set.readsInRange(first_ref, 0, 1000)
     self.assertEqual(len(list(in_range)), 2)
     self.assertEqual(len(list(aln_set)), 112)
     self.assertEqual(len(aln_set.index), 112)
 def test_alignmentset_index(self):
     """
     For an AlignmentSet built from the cmp.h5 file: readsInRange over
     [0, 1000) of the first reference yields 2 reads, while both iterating
     the set and its index give 112 entries.
     """
     cmp_h5_path = upstreamData.getBamAndCmpH5()[1]
     aln_set = AlignmentSet(cmp_h5_path, strict=True)
     first_ref = aln_set.refNames[0]
     in_range = aln_set.readsInRange(first_ref, 0, 1000)
     self.assertEqual(len(list(in_range)), 2)
     self.assertEqual(len(list(aln_set)), 112)
     self.assertEqual(len(aln_set.index), 112)
    def test_len(self):
        """
        Check the record and length counts of several dataset types.

        Every dataset goes through the same sequence: assert the initial
        counts, overwrite totalLength and numRecords with -1, then call
        updateCounts() and assert the original values are back.
        """
        def check_counts(dset, num_records, total_length,
                         has_length_pair=True, iterate=False):
            # Counts as first reported by the dataset.
            self.assertEqual(len(dset), num_records)
            if has_length_pair:
                self.assertEqual(dset._length, (num_records, total_length))
            self.assertEqual(dset.totalLength, total_length)
            self.assertEqual(dset.numRecords, num_records)
            # Overwrite both counts and confirm the new values are reported.
            dset.totalLength = -1
            dset.numRecords = -1
            self.assertEqual(dset.totalLength, -1)
            self.assertEqual(dset.numRecords, -1)
            # updateCounts() must bring the original values back.
            dset.updateCounts()
            self.assertEqual(dset.totalLength, total_length)
            self.assertEqual(dset.numRecords, num_records)
            if iterate:
                # Cross-check the counts against the records themselves.
                self.assertEqual(sum(1 for _ in dset), num_records)
                self.assertEqual(sum(len(rec) for rec in dset), total_length)

        # AlignmentSet
        aln = AlignmentSet(data.getXml(8), strict=True)
        check_counts(aln, 92, 123588, iterate=True)

        # AlignmentSet with filters
        aln = AlignmentSet(data.getXml(15), strict=True)
        check_counts(aln, 40, 52023)

        # AlignmentSet with cmp.h5
        aln = AlignmentSet(upstreamData.getBamAndCmpH5()[1], strict=True)
        check_counts(aln, 112, 59970)

        # SubreadSet
        sset = SubreadSet(data.getXml(10), strict=True)
        check_counts(sset, 92, 124093, iterate=True)

        # HdfSubreadSet
        # len means something else in bax/bas land. These numbers may actually
        # be correct...
        sset = HdfSubreadSet(data.getXml(17), strict=True)
        check_counts(sset, 9, 128093)

        # ReferenceSet (its _length pair is not checked)
        sset = ReferenceSet(data.getXml(9), strict=True)
        check_counts(sset, 59, 85774, has_length_pair=False)
Example #15
0
 def test_cmp_alignmentset_filters(self):
     """
     An AlignmentSet built from the cmp.h5 file has length 112; after a
     ``length >= 1000`` requirement is added its length is 12.
     """
     cmp_h5_path = upstreamdata.getBamAndCmpH5()[1]
     aln_set = AlignmentSet(cmp_h5_path, strict=True)
     self.assertEqual(len(aln_set), 112)
     min_length = [('>=', 1000)]
     aln_set.filters.addRequirement(length=min_length)
     self.assertEqual(len(aln_set), 12)
Example #16
0
    def test_movie_filter(self):
        """
        Combine two inputs into one dataset, check the read-group and record
        counts, then add a ``movie`` requirement and assert the filtered
        length equals the length of the matching input opened on its own.

        The same pattern is applied to unaligned BAM (SubreadSet), aligned
        BAM (AlignmentSet) and cmp.h5 (AlignmentSet); the cmp.h5 case also
        removes the requirement again and re-checks the unfiltered length.
        """
        def unique_read_groups(dset):
            # Looked up afresh on every call, as the inline expressions were.
            return len(set(dset.readGroupTable['ID']))

        bam_movie = (
            'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')

        # unaligned bam
        subreads0 = ("/pbi/dept/secondary/siv/testdata/"
                     "SA3-DS/ecoli/2590956/0003/"
                     "Analysis_Results/m140913_222218_42240_c10069"
                     "9952400000001823139203261564_s1_p0.all.subreadset.xml")
        subreads1 = ("/pbi/dept/secondary/siv/testdata/"
                     "SA3-DS/ecoli/2590953/0001/"
                     "Analysis_Results/m140913_005018_42139_c10071"
                     "3652400000001823152404301534_s1_p0.all.subreadset.xml")
        dset = SubreadSet(subreads0, subreads1)
        self.assertEqual(unique_read_groups(dset),
                         len(dset.readGroupTable['ID']))
        self.assertEqual(unique_read_groups(dset), 2)
        self.assertEqual(unique_read_groups(dset),
                         len(set(dset.index.qId)))
        self.assertEqual(len(dset), 178570)
        dset.filters.addRequirement(movie=[('=', bam_movie)])
        self.assertEqual(len(SubreadSet(subreads1)), len(dset))

        # aligned bam
        # The first input used to be this alignmentset.xml:
        #   /pbi/dept/secondary/siv/testdata/SA3-DS/ecoli/2590956/0003/
        #   Alignment_Results/m140913_222218_42240_c1006999524000000018231
        #   39203261564_s1_p0.all.alignmentset.xml
        aligned0 = upstreamdata.getBamAndCmpH5()[0]
        aligned1 = ("/pbi/dept/secondary/siv/testdata/"
                    "SA3-DS/ecoli/2590953/0001/Alignment_Results/"
                    "m140913_005018_42139_c1007136524000000018231"
                    "52404301534_s1_p0.all.alignmentset.xml")
        dset = AlignmentSet(aligned0, aligned1)
        self.assertEqual(unique_read_groups(dset),
                         len(dset.readGroupTable['ID']))
        self.assertEqual(unique_read_groups(dset), 2)
        self.assertEqual(unique_read_groups(dset),
                         len(set(dset.index.qId)))
        self.assertEqual(len(dset), 103144)
        dset.filters.addRequirement(movie=[('=', bam_movie)])
        self.assertEqual(len(AlignmentSet(aligned1)), len(dset))

        # cmpH5
        cmp_total = 57147
        cmp_movie = (
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0')
        cmp_a = upstreamdata.getBamAndCmpH5()[1]
        cmp_b = ("/pbi/dept/secondary/siv/testdata/"
                 "genomic_consensus-unittest/bam_c4p6_tests/"
                 "ecoli_c4p6.cmp.h5")
        dset = AlignmentSet(cmp_a, cmp_b)
        self.assertEqual(unique_read_groups(dset),
                         len(dset.readGroupTable['ID']))
        self.assertEqual(unique_read_groups(dset),
                         len(set(dset.index.MovieID)))
        self.assertEqual(unique_read_groups(dset), 2)
        self.assertEqual(len(dset), cmp_total)
        dset.filters.addRequirement(movie=[('=', cmp_movie)])
        alone = len(AlignmentSet(cmp_a))
        self.assertEqual(alone, len(dset))

        # Dropping the requirement must give the unfiltered length again.
        dset.filters.removeRequirement('movie')
        self.assertEqual(len(dset), cmp_total)
class TestIndexedBam(_IndexedAlnFileReaderTests):
    """Tests for IndexedBamReader; inherits from _IndexedAlnFileReaderTests."""
    # Reader class and its positional arguments -- presumably consumed by
    # the base class to build the reader under test; confirm there.
    READER_CONSTRUCTOR = IndexedBamReader
    CONSTRUCTOR_ARGS = (
        data.getBamAndCmpH5()[0],
        data.getLambdaFasta(),
    )
class TestBasicBam(_BasicAlnFileReaderTests):
    """Tests for BamReader; inherits from _BasicAlnFileReaderTests."""
    READER_CONSTRUCTOR = BamReader
    CONSTRUCTOR_ARGS = (
        data.getBamAndCmpH5()[0],
        data.getLambdaFasta(),
    )

    def testSpecVersion(self):
        """The ``version`` attribute of ``self.f`` must equal ``3.0b7``."""
        expected_version = "3.0b7"
        EQ(expected_version, self.f.version)
 def test_cmp_alignmentset_filters(self):
     """
     An AlignmentSet built from the cmp.h5 file has length 112; after a
     ``length >= 1000`` requirement is added its length is 12.
     """
     cmp_h5_path = upstreamData.getBamAndCmpH5()[1]
     aln_set = AlignmentSet(cmp_h5_path, strict=True)
     self.assertEqual(len(aln_set), 112)
     min_length = [('>=', 1000)]
     aln_set.filters.addRequirement(length=min_length)
     self.assertEqual(len(aln_set), 12)
 def test_alignment_identity(self):
     """
     The 'identity' array exposed by IndexedBamReader must match, element
     for element, the 'identity' value of each record yielded by iterating
     the same reader.
     """
     bam_path = data.getBamAndCmpH5()[0]
     with IndexedBamReader(bam_path) as reader:
         from_reader = reader.identity
         from_records = np.array([aln.identity for aln in reader])
         same = (from_records == from_reader)
         EQ(same.all(), True)
    def test_movie_filter(self):
        """
        Combine two inputs into one dataset, check the read-group and record
        counts, then add a ``movie`` requirement and assert the filtered
        length equals the length of the matching input opened on its own.

        The same pattern is applied to unaligned BAM (SubreadSet), aligned
        BAM (AlignmentSet) and cmp.h5 (AlignmentSet); the cmp.h5 case also
        removes the requirement again and re-checks the unfiltered length.
        """
        def unique_read_groups(dset):
            # Looked up afresh on every call, as the inline expressions were.
            return len(set(dset.readGroupTable['ID']))

        bam_movie = (
            'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')

        # unaligned bam
        subreads0 = ("/pbi/dept/secondary/siv/testdata/"
                     "SA3-DS/ecoli/2590956/0003/"
                     "Analysis_Results/m140913_222218_42240_c10069"
                     "9952400000001823139203261564_s1_p0.all.subreadset.xml")
        subreads1 = ("/pbi/dept/secondary/siv/testdata/"
                     "SA3-DS/ecoli/2590953/0001/"
                     "Analysis_Results/m140913_005018_42139_c10071"
                     "3652400000001823152404301534_s1_p0.all.subreadset.xml")
        dset = SubreadSet(subreads0, subreads1)
        self.assertEqual(unique_read_groups(dset),
                         len(dset.readGroupTable['ID']))
        self.assertEqual(unique_read_groups(dset), 2)
        self.assertEqual(unique_read_groups(dset),
                         len(set(dset.index.qId)))
        self.assertEqual(len(dset), 178570)
        dset.filters.addRequirement(movie=[('=', bam_movie)])
        self.assertEqual(len(SubreadSet(subreads1)), len(dset))

        # aligned bam
        # The first input used to be this alignmentset.xml:
        #   /pbi/dept/secondary/siv/testdata/SA3-DS/ecoli/2590956/0003/
        #   Alignment_Results/m140913_222218_42240_c1006999524000000018231
        #   39203261564_s1_p0.all.alignmentset.xml
        aligned0 = upstreamdata.getBamAndCmpH5()[0]
        aligned1 = ("/pbi/dept/secondary/siv/testdata/"
                    "SA3-DS/ecoli/2590953/0001/Alignment_Results/"
                    "m140913_005018_42139_c1007136524000000018231"
                    "52404301534_s1_p0.all.alignmentset.xml")
        dset = AlignmentSet(aligned0, aligned1)
        self.assertEqual(unique_read_groups(dset),
                         len(dset.readGroupTable['ID']))
        self.assertEqual(unique_read_groups(dset), 2)
        self.assertEqual(unique_read_groups(dset),
                         len(set(dset.index.qId)))
        self.assertEqual(len(dset), 103144)
        dset.filters.addRequirement(movie=[('=', bam_movie)])
        self.assertEqual(len(AlignmentSet(aligned1)), len(dset))

        # cmpH5
        cmp_total = 57147
        cmp_movie = (
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0')
        cmp_a = upstreamdata.getBamAndCmpH5()[1]
        cmp_b = ("/pbi/dept/secondary/siv/testdata/"
                 "genomic_consensus-unittest/bam_c4p6_tests/"
                 "ecoli_c4p6.cmp.h5")
        dset = AlignmentSet(cmp_a, cmp_b)
        self.assertEqual(unique_read_groups(dset),
                         len(dset.readGroupTable['ID']))
        self.assertEqual(unique_read_groups(dset),
                         len(set(dset.index.MovieID)))
        self.assertEqual(unique_read_groups(dset), 2)
        self.assertEqual(len(dset), cmp_total)
        dset.filters.addRequirement(movie=[('=', cmp_movie)])
        alone = len(AlignmentSet(cmp_a))
        self.assertEqual(alone, len(dset))

        # Dropping the requirement must give the unfiltered length again.
        dset.filters.removeRequirement('movie')
        self.assertEqual(len(dset), cmp_total)
Example #22
0
    def test_len(self):
        """
        Check the record and length counts of several dataset types.

        Every dataset goes through the same sequence: assert the initial
        counts, overwrite totalLength and numRecords with -1, then call
        updateCounts() and assert the original values are back.
        """
        def check_counts(dset, num_records, total_length,
                         has_length_pair=True, iterate=False):
            # Counts as first reported by the dataset.
            self.assertEqual(len(dset), num_records)
            if has_length_pair:
                self.assertEqual(dset._length, (num_records, total_length))
            self.assertEqual(dset.totalLength, total_length)
            self.assertEqual(dset.numRecords, num_records)
            # Overwrite both counts and confirm the new values are reported.
            dset.totalLength = -1
            dset.numRecords = -1
            self.assertEqual(dset.totalLength, -1)
            self.assertEqual(dset.numRecords, -1)
            # updateCounts() must bring the original values back.
            dset.updateCounts()
            self.assertEqual(dset.totalLength, total_length)
            self.assertEqual(dset.numRecords, num_records)
            if iterate:
                # Cross-check the counts against the records themselves.
                self.assertEqual(sum(1 for _ in dset), num_records)
                self.assertEqual(sum(len(rec) for rec in dset), total_length)

        # AlignmentSet
        aln = AlignmentSet(data.getXml(8), strict=True)
        check_counts(aln, 92, 123588, iterate=True)

        # AlignmentSet with filters
        aln = AlignmentSet(data.getXml(15), strict=True)
        check_counts(aln, 40, 52023)

        # AlignmentSet with cmp.h5
        aln = AlignmentSet(upstreamData.getBamAndCmpH5()[1], strict=True)
        check_counts(aln, 112, 59970)

        # SubreadSet
        sset = SubreadSet(data.getXml(10), strict=True)
        check_counts(sset, 92, 124093, iterate=True)

        # HdfSubreadSet
        # len means something else in bax/bas land. These numbers may actually
        # be correct...
        sset = HdfSubreadSet(data.getXml(17), strict=True)
        check_counts(sset, 9, 128093)

        # ReferenceSet (its _length pair is not checked)
        sset = ReferenceSet(data.getXml(9), strict=True)
        check_counts(sset, 59, 85774, has_length_pair=False)