Example #1
    def test_save_as_bam(self):

        testFile = self.resourceFile("sorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        tmpPath = self.tmpFile() + ".bam"
        reads.saveAsSam(tmpPath, isSorted=True, asSingleFile=True)

        bamReads = ac.loadAlignments(tmpPath)

        self.assertEqual(bamReads._jvmRdd.jrdd().count(),
                         reads._jvmRdd.jrdd().count())
Example #2
    def test_save_as_bam(self):

        testFile = self.resourceFile("sorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        tmpPath = self.tmpFile() + ".bam"
        reads.saveAsSam(tmpPath,
                        isSorted=True,
                        asSingleFile=True)

        bamReads = ac.loadAlignments(tmpPath)

        self.assertEqual(bamReads._jvmRdd.jrdd().count(),
                          reads._jvmRdd.jrdd().count())
Example #3
    def test_cumulative_count_distribution(self):
        # load file
        ac = ADAMContext(self.ss)
        testFile = self.resourceFile("small.sam")
        # read alignments

        reads = ac.loadAlignments(testFile)

        # convert to coverage
        coverage = reads.toCoverage()

        qc = CoverageDistribution(self.ss, coverage)

        _, cd = qc.plotDistributions(testMode = True, cumulative = True, normalize = False)

        # first sample
        items = list(cd.popitem()[1])
        assert(len(items) == 1)
        assert(items.pop()[1] == 1500)

        _, cd = qc.plotDistributions(testMode = True, cumulative = False, normalize = False)

        # first sample
        items = list(cd.popitem()[1])
        assert(len(items) == 1)
        assert(items.pop()[1] == 1500)
Example #4
    def test_aggregatedCoverage(self):
        testFile = self.resourceFile("small.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        coverage = reads.toCoverage()
        collapsed = coverage.aggregatedCoverage(10)
        self.assertEquals(collapsed.toDF().count(), 166)
Example #5
    def test_collapse(self):
        testFile = self.resourceFile("sorted.sam")
        ac = ADAMContext(self.sc)

        reads = ac.loadAlignments(testFile)
        coverage = reads.toCoverage()
        collapsed = coverage.collapse()
        self.assertEquals(collapsed.toDF().count(), coverage.toDF().count())
Example #6
    def test_collapse(self):
        testFile = self.resourceFile("sorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        coverage = reads.toCoverage()
        collapsed = coverage.collapse()
        self.assertEquals(collapsed.toDF().count(), coverage.toDF().count())
Example #7
    def test_aggregatedCoverage(self):
        testFile = self.resourceFile("small.sam")
        ac = ADAMContext(self.sc)

        reads = ac.loadAlignments(testFile)
        coverage = reads.toCoverage()
        collapsed = coverage.aggregatedCoverage(10)
        self.assertEquals(collapsed.toDF().count(), 166)
Example #8
    def test_flatten(self):
        testFile = self.resourceFile("small.sam")
        ac = ADAMContext(self.sc)

        reads = ac.loadAlignments(testFile)
        coverage = reads.toCoverage()
        flattened = coverage.flatten()
        self.assertEquals(flattened.toDF().count(), 1500)
Example #9
    def test_flatten(self):
        testFile = self.resourceFile("small.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        coverage = reads.toCoverage()
        flattened = coverage.flatten()
        self.assertEquals(flattened.toDF().count(), 1500)
Example #10
    def test_load_alignments(self):
        
        testFile = self.resourceFile("small.sam")
        ac = ADAMContext(self.ss)
        
        reads = ac.loadAlignments(testFile)

        self.assertEqual(reads.toDF().count(), 20)
        self.assertEqual(reads._jvmRdd.jrdd().count(), 20)
Example #11
    def test_load_alignments(self):
        
        testFile = self.resourceFile("small.sam")
        ac = ADAMContext(self.ss)
        
        reads = ac.loadAlignments(testFile)

        self.assertEqual(reads.toDF().count(), 20)
        self.assertEqual(reads._jvmRdd.jrdd().count(), 20)
Example #12
    def test_count_kmers(self):

        testFile = self.resourceFile("small.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        kmers = reads.countKmers(6)

        self.assertEqual(kmers.count(), 1040)
Example #13
    def test_to_fragments(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)

        fragments = reads.toFragments()
        self.assertEqual(fragments.toDF().count(), 5)
Example #14
    def test_save_unordered_sam(self):

        testFile = self.resourceFile("unordered.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        tmpPath = self.tmpFile() + ".sam"
        reads.saveAsSam(tmpPath, asSingleFile=True)

        self.checkFiles(testFile, tmpPath)
Example #15
    def test_save(self):

        testFile = self.resourceFile("sorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        coverage = reads.toCoverage()
        tmpPath = self.tmpFile() + ".coverage.adam"
        coverage.save(tmpPath, asSingleFile=True, disableFastConcat=True)
        assert (os.listdir(tmpPath) != [])
Example #16
    def test_realignIndels_reads(self):

        readsPath = self.resourceFile("small.1.sam")

        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)
        realigned = reads.realignIndels()

        self.assertEqual(realigned.toDF().count(), 20)
Example #17
    def test_toFeatures(self):
        testFile = self.resourceFile("sorted.sam")
        ac = ADAMContext(self.sc)

        reads = ac.loadAlignments(testFile)
        coverage = reads.toCoverage()
        features = coverage.toFeatures()

        assert (isinstance(features, FeatureRDD))
        self.assertEquals(features.toDF().count(), coverage.toDF().count())
Example #18
    def test_toFeatures(self):
        testFile = self.resourceFile("sorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        coverage = reads.toCoverage()
        features = coverage.toFeatures()

        assert(isinstance(features, FeatureRDD))
        self.assertEquals(features.toDF().count(), coverage.toDF().count())
Example #19
    def test_transform(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)

        transformedReads = reads.transform(lambda x: x.filter(x.referenceName == "1"))

        self.assertEqual(transformedReads.toDF().count(), 1)
Example #20
    def test_save_sorted_sam(self):

        testFile = self.resourceFile("sorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        tmpPath = self.tmpFile() + ".sam"
        sortedReads = reads.sortByReferencePosition()
        sortedReads.saveAsSam(tmpPath, isSorted=True, asSingleFile=True)

        self.checkFiles(testFile, tmpPath)
Example #21
    def test_filterByOverlappingRegion(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)

        query = ReferenceRegion("chr2", 1, 400)

        filtered = reads.filterByOverlappingRegion(query)
        self.assertEqual(filtered.toDF().count(), 1)
Example #22
    def test_save_unordered_sam(self):

        testFile = self.resourceFile("unordered.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        tmpPath = self.tmpFile() + ".sam"
        reads.saveAsSam(tmpPath,
                        asSingleFile=True)

        self.checkFiles(testFile, tmpPath)
Example #23
    def test_save(self):

        testFile = self.resourceFile("sorted.sam")
        ac = ADAMContext(self.sc)

        reads = ac.loadAlignments(testFile)
        coverage = reads.toCoverage()
        tmpPath = self.tmpFile() + ".coverage.adam"
        coverage.save(tmpPath, asSingleFile=True, disableFastConcat=True)

        self.checkFiles(testFile, tmpPath)
Example #24
    def test_to_coverage(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)

        coverage = reads.toCoverage()
        self.assertEquals(coverage.toDF().count(), 42)

        coverage = reads.toCoverage(collapse = False)
        self.assertEquals(coverage.toDF().count(), 46)
Example #25
    def test_filterByOverlappingRegions(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)

        querys = [ReferenceRegion("chr2", 1L, 400L),
                    ReferenceRegion("3", 1L, 100L)]

        filtered = reads.filterByOverlappingRegions(querys)
        self.assertEquals(filtered.toDF().count(), 2)
Example #26
    def test_to_coverage(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)

        coverage = reads.toCoverage()
        self.assertEquals(coverage.toDF().count(), 42)

        coverage = reads.toCoverage(collapse=False)
        self.assertEquals(coverage.toDF().count(), 46)
Example #27
    def test_save(self):

        testFile = self.resourceFile("sorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        coverage = reads.toCoverage()
        tmpPath = self.tmpFile() + ".coverage.adam"
        coverage.save(tmpPath,
                      asSingleFile=True,
                      disableFastConcat=True)
        assert(os.listdir(tmpPath) != [])
Example #28
    def test_filterByOverlappingRegions(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.sc)

        reads = ac.loadAlignments(readsPath)

        querys = [ReferenceRegion("1", 20000000L, 27000000L),
                    ReferenceRegion("1", 230000000L,270000000L)]

        filtered = reads.filterByOverlappingRegion(querys)
        self.assertEquals(filtered.toDF().count(), 6)
Example #29
    def test_union(self):

        testFile1 = self.resourceFile("sorted.sam")
        testFile2 = self.resourceFile("unordered.sam")
        ac = ADAMContext(self.ss)

        reads1 = ac.loadAlignments(testFile1)
        reads2 = ac.loadAlignments(testFile2)

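        # the union of the sorted and unordered inputs should contain all 13 reads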
        unionReads = reads1.union([reads2])

        self.assertEqual(unionReads.toDF().count(), 13)
Example #30
    def test_alignment_distribution_no_elements(self):
        # load file
        ac = ADAMContext(self.ss)
        testFile = self.resourceFile("small.sam")
        # read alignments
        reads = ac.loadAlignments(testFile)

        qc = AlignmentDistribution(self.ss, reads, bin_size=1000000000)

        mDistribution = qc.plot(testMode=True, plotType="D")
        expectedM = Counter({('1', 0): 0})
        assert (mDistribution == expectedM)
Example #31
    def test_save_sorted_sam(self):

        testFile = self.resourceFile("sorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(testFile)
        tmpPath = self.tmpFile() + ".sam"
        sortedReads = reads.sortReadsByReferencePosition()
        sortedReads.saveAsSam(tmpPath,
                              isSorted=True,
                              asSingleFile=True)

        self.checkFiles(testFile, tmpPath)
Example #32
    def test_pipe_as_sam(self):

        reads12Path = self.resourceFile("reads12.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(reads12Path)

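        # pipe the reads through `tee /dev/null` as SAM text and read the output
        # back in as alignments; the round trip should preserve the read count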
        pipedRdd = reads.pipe(["tee", "/dev/null"],
                              "org.bdgenomics.adam.rdd.read.SAMInFormatter",
                              "org.bdgenomics.adam.rdd.read.AnySAMOutFormatter",
                              "org.bdgenomics.adam.api.java.AlignmentRecordsToAlignmentRecordsConverter")

        self.assertEqual(reads.toDF().count(), pipedRdd.toDF().count())
Example #33
    def test_realignIndels_known_indels(self):

        readsPath = self.resourceFile("small.1.sam")
        variantsPath = self.resourceFile("small.vcf")

        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)
        knownIndels = ac.loadVariants(variantsPath)

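        # realign the reads around the known indels loaded from the VCF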
        realigned = reads.realignIndelsFromKnownIndels(knownIndels)

        self.assertEqual(realigned.toDF().count(), 20)
Example #34
    def test_shuffle_right_outer_join_groupBy_left(self):

        readsPath = self.resourceFile("small.1.sam")
        targetsPath = self.resourceFile("small.1.bed")

        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)
        targets = ac.loadFeatures(targetsPath)

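        # shuffle-based right outer region join of the reads against the BED
        # targets, grouping the joined features by the left (reads) side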
        jRdd = reads.rightOuterShuffleRegionJoinAndGroupByLeft(targets)

        self.assertEqual(jRdd.toDF().count(), 21)
Example #35
    def test_shuffle_inner_join(self):

        readsPath = self.resourceFile("small.1.sam")
        targetsPath = self.resourceFile("small.1.bed")

        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)
        targets = ac.loadFeatures(targetsPath)

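        # an inner shuffle region join keeps only the reads that overlap a target feature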
        jRdd = reads.shuffleRegionJoin(targets)

        self.assertEqual(jRdd.toDF().count(), 5)
Example #36
    def test_broadcast_right_outer_join(self):

        readsPath = self.resourceFile("small.1.sam")
        targetsPath = self.resourceFile("small.1.bed")

        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)
        targets = ac.loadFeatures(targetsPath)

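        # broadcast-based right outer region join of the reads against the target features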
        jRdd = reads.rightOuterBroadcastRegionJoin(targets)

        self.assertEqual(jRdd.toDF().count(), 6)
Example #37
    def test_fail_on_invalid_sample(self):
        # load file
        ac = ADAMContext(self.ss)
        testFile = self.resourceFile("small.sam")
        # read alignments

        reads = ac.loadAlignments(testFile)

        # convert to coverage
        coverage = reads.toCoverage()

        with self.assertRaises(Exception):
            CoverageDistribution(self.ss, coverage, sample = 1.2)

        with self.assertRaises(Exception):
            CoverageDistribution(self.ss, coverage, sample = 0)
Example #38
    def test_caching(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)

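        # cache() should register the underlying RDD in the JVM's persistent RDD map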
        cachedReads = reads.cache()
        cached = self.sc._jsc.getPersistentRDDs()
        self.assertEquals(cached.isEmpty(), False)

        cachedReads.unpersist()
        cached = self.sc._jsc.getPersistentRDDs()
        self.assertEquals(cached.isEmpty(), True)
Example #39
    def test_to_coverage(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)

        coverage = reads.toCoverage()
        self.assertEqual(coverage.toDF().count(), 5)

        # 5 reads: contig 3 has 8 bp, chr2 (2 strands) has 20bp, contig 4 has 8bp, contig 1 has 10 bp
        # 8 + 20 + 8 + 10 = 46
        coverage = reads.toCoverage(collapse = False)
        self.assertEqual(coverage.toDF().count(), 46)
Example #40
    def test_transmute_to_coverage(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.sc)

        reads = ac.loadAlignments(readsPath)

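        # project the alignment DataFrame onto the coverage schema
        # (contigName, start, end, count) before transmuting to a CoverageRDD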
        readsAsCoverage = reads.transmute(
            lambda x: x.select(x.contigName, x.start, x.end,
                               x.mapq.cast(DoubleType()).alias("count")),
            CoverageRDD)

        assert (isinstance(readsAsCoverage, CoverageRDD))
        self.assertEquals(readsAsCoverage.toDF().count(), 5)
Example #41
    def test_persisting(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)

        persistedReads = reads.persist(StorageLevel.DISK_ONLY)
        cached = self.sc._jsc.getPersistentRDDs()
        self.assertEqual(cached.isEmpty(), False)

        persistedReads.unpersist()
        cached = self.sc._jsc.getPersistentRDDs()
        self.assertEqual(cached.isEmpty(), True)
Example #42
    def test_caching(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)

        cachedReads = reads.cache()
        cached = self.sc._jsc.getPersistentRDDs()
        self.assertEquals(cached.isEmpty(), False)

        cachedReads.unpersist()
        cached = self.sc._jsc.getPersistentRDDs()
        self.assertEquals(cached.isEmpty(), True)
Example #43
    def test_transmute_to_coverage(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.sc)

        reads = ac.loadAlignments(readsPath)

        readsAsCoverage = reads.transmute(lambda x: x.select(x.contigName,
                                                             x.start,
                                                             x.end,
                                                             x.mapq.cast(DoubleType()).alias("count")),
                                          CoverageRDD)

        assert(isinstance(readsAsCoverage, CoverageRDD))
        self.assertEquals(readsAsCoverage.toDF().count(), 5)
Example #44
    def test_transmute_to_coverage(self):

        readsPath = self.resourceFile("unsorted.sam")
        ac = ADAMContext(self.ss)

        reads = ac.loadAlignments(readsPath)

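        # project the alignment DataFrame onto the coverage schema, carrying the
        # read group sample id along as the optional sample identifier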
        readsAsCoverage = reads.transmute(lambda x: x.select(x.referenceName,
                                                             x.start,
                                                             x.end,
                                                             x.mappingQuality.cast(DoubleType()).alias("count"),
                                                             x.readGroupSampleId.alias("optSampleId")),
                                        CoverageDataset)

        assert(isinstance(readsAsCoverage, CoverageDataset))
        self.assertEquals(readsAsCoverage.toDF().count(), 5)
Example #45
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from bdgenomics.adam.adamContext import ADAMContext

from pyspark.context import SparkContext

sc = SparkContext('local')
ac = ADAMContext(sc)

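# smoke test: load the bundled small.sam and exit 0 only if all 20 reads load back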
reads = ac.loadAlignments("adam-core/src/test/resources/small.sam").toDF().count()

if reads == 20:
    exit(0)
else:
    exit(1)