def test_readFasta(self):
     f = FastaReader(data.getFasta())
     entries = list(f)
     assert_equal(48, len(entries))
     assert_equal("ref000001|EGFR_Exon_2", entries[0].header)
     assert_equal("TTTCTTCCAGTTTGCCAAGGCACGAGTAACAAGCTCACGCAGTTGGGCACTTT"
                  "TGAAGATCATTTTCTCAGCCTCCAGAGGATGTTCAATAACTGTGAGGTGGTCC"
                  "TTGGGAATTTGGAAATTACCTATGTGCAGAGGAATTATGATCTTTCCTTCTTA"
                  "AAGGTTGGTGACTTTGATTTTCCT",
                  entries[0].sequence)
 def test_readFasta(self):
     f = FastaReader(data.getFasta())
     entries = list(f)
     assert_equal(48, len(entries))
     assert_equal("ref000001|EGFR_Exon_2", entries[0].name)
     assert_equal("TTTCTTCCAGTTTGCCAAGGCACGAGTAACAAGCTCACGCAGTTGGGCACTTT"
                  "TGAAGATCATTTTCTCAGCCTCCAGAGGATGTTCAATAACTGTGAGGTGGTCC"
                  "TTGGGAATTTGGAAATTACCTATGTGCAGAGGAATTATGATCTTTCCTTCTTA"
                  "AAGGTTGGTGACTTTGATTTTCCT",
                  entries[0].sequence)
     assert_equal("e3912e9ceacd6538ede8c1b2adda7423",
                  entries[0].md5)
 def test_merged_contigset(self):
     fn = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
     with ContigSet(upstreamData.getLambdaFasta(),
             upstreamData.getFasta()) as cset:
         self.assertEqual(len(list(cset)), 49)
         self.assertEqual(len(cset), 49)
         cset.consolidate()
         cset.write(fn)
         log.debug("Writing to {f}".format(f=fn))
         self.assertEqual(len(list(cset)), 49)
         self.assertEqual(len(cset), 49)
     with ContigSet(fn) as cset:
         self.assertEqual(len(list(cset)), 49)
         self.assertEqual(len(cset), 49)
 def setup(self):
     self.fastaPath = data.getFasta()
    def test_alignmentset_consolidate(self):
        log.debug("Test methods directly")
        aln = AlignmentSet(data.getXml(12))
        self.assertEqual(len(aln.toExternalFiles()), 2)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        consolidateBams(aln.toExternalFiles(), outfn, filterDset=aln,
                        useTmp=False)
        self.assertTrue(os.path.exists(outfn))
        consAln = AlignmentSet(outfn)
        self.assertEqual(len(consAln.toExternalFiles()), 1)
        for read1, read2 in zip(sorted(list(aln)), sorted(list(consAln))):
            self.assertEqual(read1, read2)
        self.assertEqual(len(aln), len(consAln))

        log.debug("Test methods directly in tmp")
        aln = AlignmentSet(data.getXml(12))
        self.assertEqual(len(aln.toExternalFiles()), 2)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        consolidateBams(aln.toExternalFiles(), outfn, filterDset=aln,
                        useTmp=True)
        self.assertTrue(os.path.exists(outfn))
        consAln = AlignmentSet(outfn)
        self.assertEqual(len(consAln.toExternalFiles()), 1)
        for read1, read2 in zip(sorted(list(aln)), sorted(list(consAln))):
            self.assertEqual(read1, read2)
        self.assertEqual(len(aln), len(consAln))

        log.debug("Test through API")
        aln = AlignmentSet(data.getXml(12))
        self.assertEqual(len(aln.toExternalFiles()), 2)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn)
        self.assertTrue(os.path.exists(outfn))
        self.assertEqual(len(aln.toExternalFiles()), 1)
        nonCons = AlignmentSet(data.getXml(12))
        self.assertEqual(len(nonCons.toExternalFiles()), 2)
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            self.assertEqual(read1, read2)
        self.assertEqual(len(aln), len(nonCons))

        # Test that it is a valid xml:
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        datafile = os.path.join(outdir, "apimerged.bam")
        xmlfile = os.path.join(outdir, "apimerged.xml")
        log.debug(xmlfile)
        aln.write(xmlfile)

        log.debug("Test with cheap filter")
        aln = AlignmentSet(data.getXml(12))
        self.assertEqual(len(list(aln)), 177)
        aln.filters.addRequirement(rname=[('=', 'B.vulgatus.5')])
        self.assertEqual(len(list(aln)), 7)
        self.assertEqual(len(aln.toExternalFiles()), 2)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn)
        self.assertTrue(os.path.exists(outfn))
        self.assertEqual(len(aln.toExternalFiles()), 1)
        nonCons = AlignmentSet(data.getXml(12))
        nonCons.filters.addRequirement(rname=[('=', 'B.vulgatus.5')])
        self.assertEqual(len(nonCons.toExternalFiles()), 2)
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            self.assertEqual(read1, read2)
        self.assertEqual(len(list(aln)), len(list(nonCons)))

        log.debug("Test with not refname filter")
        # This isn't trivial with bamtools
        """
        aln = AlignmentSet(data.getXml(12))
        self.assertEqual(len(list(aln)), 177)
        aln.filters.addRequirement(rname=[('!=', 'B.vulgatus.5')])
        self.assertEqual(len(list(aln)), 7)
        self.assertEqual(len(aln.toExternalFiles()), 2)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn)
        self.assertTrue(os.path.exists(outfn))
        self.assertEqual(len(aln.toExternalFiles()), 1)
        nonCons = AlignmentSet(data.getXml(12))
        nonCons.filters.addRequirement(rname=[('!=', 'B.vulgatus.5')])
        self.assertEqual(len(nonCons.toExternalFiles()), 2)
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            self.assertEqual(read1, read2)
        self.assertEqual(len(list(aln)), len(list(nonCons)))
        """

        log.debug("Test with expensive filter")
        aln = AlignmentSet(data.getXml(12))
        self.assertEqual(len(list(aln)), 177)
        aln.filters.addRequirement(accuracy=[('>', '.85')])
        self.assertEqual(len(list(aln)), 174)
        self.assertEqual(len(aln.toExternalFiles()), 2)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn)
        self.assertTrue(os.path.exists(outfn))
        self.assertEqual(len(aln.toExternalFiles()), 1)
        nonCons = AlignmentSet(data.getXml(12))
        nonCons.filters.addRequirement(accuracy=[('>', '.85')])
        self.assertEqual(len(nonCons.toExternalFiles()), 2)
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            self.assertEqual(read1, read2)
        self.assertEqual(len(list(aln)), len(list(nonCons)))

        log.debug("Test with one reference")
        aln = AlignmentSet(data.getXml(12))
        reference = upstreamData.getFasta()
        aln.externalResources[0].reference = reference
        nonCons = aln.copy()
        self.assertEqual(len(aln.toExternalFiles()), 2)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn)
        self.assertTrue(os.path.exists(outfn))
        self.assertEqual(len(aln.toExternalFiles()), 1)
        #nonCons = AlignmentSet(data.getXml(12))
        self.assertEqual(len(nonCons.toExternalFiles()), 2)
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            self.assertEqual(read1, read2)
        self.assertEqual(len(aln), len(nonCons))
        self.assertEqual(aln.externalResources[0].reference, reference)

        log.debug("Test with two references")
        aln = AlignmentSet(data.getXml(12))
        reference = upstreamData.getFasta()
        for extRes in aln.externalResources:
            extRes.reference = reference
        self.assertEqual(len(aln.toExternalFiles()), 2)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn)
        self.assertTrue(os.path.exists(outfn))
        self.assertEqual(len(aln.toExternalFiles()), 1)
        #nonCons = AlignmentSet(data.getXml(12))
        self.assertEqual(len(nonCons.toExternalFiles()), 2)
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            self.assertEqual(read1, read2)
        self.assertEqual(len(aln), len(nonCons))
        self.assertEqual(aln.externalResources[0].reference, reference)
Example #6
0
    def test_alignmentset_consolidate(self):
        log.debug("Test through API")
        aln = AlignmentSet(data.getXml(11))
        assert len(aln.toExternalFiles()) == 2
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn)
        assert os.path.exists(outfn)
        assert len(aln.toExternalFiles()) == 1
        nonCons = AlignmentSet(data.getXml(11))
        assert len(nonCons.toExternalFiles()) == 2
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            assert read1 == read2
        assert len(aln) == len(nonCons)

        # Test that it is a valid xml:
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        datafile = os.path.join(outdir, "apimerged.bam")
        xmlfile = os.path.join(outdir, "apimerged.xml")
        log.debug(xmlfile)
        aln.write(xmlfile)

        log.debug("Test with cheap filter")
        aln = AlignmentSet(data.getXml(11))
        assert len(list(aln)) == 177
        aln.filters.addRequirement(rname=[('=', 'B.vulgatus.5')])
        assert len(list(aln)) == 7
        assert len(aln.toExternalFiles()) == 2
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn)
        assert os.path.exists(outfn)
        assert len(aln.toExternalFiles()) == 1
        nonCons = AlignmentSet(data.getXml(11))
        nonCons.filters.addRequirement(rname=[('=', 'B.vulgatus.5')])
        assert len(nonCons.toExternalFiles()) == 2
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            assert read1 == read2
        assert len(list(aln)) == len(list(nonCons))

        log.debug("Test with not refname filter")
        # This isn't trivial with bamtools
        """
        aln = AlignmentSet(data.getXml(11))
        assert len(list(aln)) == 177
        aln.filters.addRequirement(rname=[('!=', 'B.vulgatus.5')])
        assert len(list(aln)) == 7
        assert len(aln.toExternalFiles()) == 2
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn)
        assert os.path.exists(outfn)
        assert len(aln.toExternalFiles()) == 1
        nonCons = AlignmentSet(data.getXml(11))
        nonCons.filters.addRequirement(rname=[('!=', 'B.vulgatus.5')])
        assert len(nonCons.toExternalFiles()) == 2
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            assert read1 == read2
        assert len(list(aln)) == len(list(nonCons))
        """

        log.debug("Test with expensive filter")
        aln = AlignmentSet(data.getXml(11))
        assert len(list(aln)) == 177
        aln.filters.addRequirement(accuracy=[('>', '.85')])
        assert len(list(aln)) == 174
        assert len(aln.toExternalFiles()) == 2
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn)
        assert os.path.exists(outfn)
        assert len(aln.toExternalFiles()) == 1
        nonCons = AlignmentSet(data.getXml(11))
        nonCons.filters.addRequirement(accuracy=[('>', '.85')])
        assert len(nonCons.toExternalFiles()) == 2
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            assert read1 == read2
        assert len(list(aln)) == len(list(nonCons))

        log.debug("Test with one reference")
        aln = AlignmentSet(data.getXml(11))
        reference = upstreamData.getFasta()
        aln.externalResources[0].reference = reference
        nonCons = aln.copy()
        assert len(aln.toExternalFiles()) == 2
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn)
        assert os.path.exists(outfn)
        assert len(aln.toExternalFiles()) == 1
        #nonCons = AlignmentSet(data.getXml(11))
        assert len(nonCons.toExternalFiles()) == 2
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            assert read1 == read2
        assert len(aln) == len(nonCons)
        assert aln.externalResources[0].reference == reference

        log.debug("Test with two references")
        aln = AlignmentSet(data.getXml(11))
        reference = upstreamData.getFasta()
        for extRes in aln.externalResources:
            extRes.reference = reference
        assert len(aln.toExternalFiles()) == 2
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn)
        assert os.path.exists(outfn)
        assert len(aln.toExternalFiles()) == 1
        #nonCons = AlignmentSet(data.getXml(11))
        assert len(nonCons.toExternalFiles()) == 2
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            assert read1 == read2
        assert len(aln) == len(nonCons)
        assert aln.externalResources[0].reference == reference
Example #7
0
class TestIndexedFastaReader:

    FASTAPATH = data.getFasta()

    def testIteration(self):
        ft = IndexedFastaReader(self.FASTAPATH)
        fr = FastaReader(self.FASTAPATH)
        ftContigs = list(ft)
        frContigs = list(fr)
        assert len(frContigs) == len(ftContigs)
        assert 48 == len(ftContigs)
        for ftC, frC in zip(ftContigs, frContigs):
            assert frC.header == ftC.header
            assert frC.sequence == ftC.sequence[:]

        # Unlike FastaReader, IndexedFastaReader iteration is repeatable.
        assert 48 == len(list(ft))

    def testAccessByName(self):
        ft = IndexedFastaReader(self.FASTAPATH)
        r000021 = ft["ref000021|EGFR_Exon_22\tMetadataTest"]
        assert "ref000021|EGFR_Exon_22\tMetadataTest" == r000021.header
        assert "ref000021|EGFR_Exon_22" == r000021.id
        assert "MetadataTest" == r000021.comment
        assert ("CACTGCCTCATCTCTCACCATCCCAAGGTGCCTATCAAGTGGATGGCATTGGAATCAATT"
                "TTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGTGAGTCATAATCCTGA"
                "TGCTAATGAGTTTGTACTGAGGCCAAGCTGG") == r000021.sequence[:]

    def testAccessById(self):
        ft = IndexedFastaReader(self.FASTAPATH)
        r000021 = ft["ref000021|EGFR_Exon_22"]
        assert "ref000021|EGFR_Exon_22\tMetadataTest" == r000021.header
        assert "ref000021|EGFR_Exon_22" == r000021.id
        assert "MetadataTest" == r000021.comment
        assert ("CACTGCCTCATCTCTCACCATCCCAAGGTGCCTATCAAGTGGATGGCATTGGAATCAATT"
                "TTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGTGAGTCATAATCCTGA"
                "TGCTAATGAGTTTGTACTGAGGCCAAGCTGG") == r000021.sequence[:]

    def testAccessByPosition(self):
        ft = IndexedFastaReader(self.FASTAPATH)
        r000001 = ft[0]
        assert "<IndexedFastaRecord: ref000001|EGFR_Exon_2>" == repr(r000001)
        firstTwo = ft[:2]
        assert [ft[0], ft[1]] == firstTwo
        lastTwo = ft[-2:]
        assert [ft[-2], ft[-1]] == lastTwo

    def testSlice(self):
        ft = IndexedFastaReader(self.FASTAPATH)
        r000021 = ft["ref000021|EGFR_Exon_22"]
        sequence = r000021.sequence
        assert "CACTGCCTCA" == sequence[0:10]
        assert "GCCAAGCTGG" == sequence[-10:]
        assert "G" == sequence[-1]
        assert "T" == sequence[-3]
        assert "C" == sequence[0]
        assert "A" == sequence[1]

    def test_dosLineEndingsFasta(self):
        fr = FastaReader(data.getDosFormattedFasta())
        frEntries = list(fr)

        ft = IndexedFastaReader(data.getDosFormattedFasta())
        ftEntries = list(ft)

        assert len(frEntries) == len(ftEntries)
        for (frE, ftE) in zip(frEntries, ftEntries):
            assert frE.header == ftE.header
            assert frE.sequence == ftE.sequence[:]

    def test_readWeirdFastaIndex(self):
        f = IndexedFastaReader(data.getWeird())
        entries = list(f)
        assert 1 == len(entries)
        assert "chr1" == entries[0].header
        assert "acgtacgtacgtact" == entries[0].sequence[:]
Example #8
0
 def _run_and_validate(args, file_name):
     self._run_cmd_with_output(cmd, ofn)
     aset = AlignmentSet(ofn, strict=True)
     for res in aset.externalResources:
         assert res.reference == otherdata.getFasta()