def test_cumulative_count_distribution(self):
    """Check the coverage distribution of small.sam in both plotting modes.

    plotDistributions is run in test mode with cumulative=True and then with
    cumulative=False (normalize=False both times). For each run, the entry
    popped from the returned data must hold exactly one item whose second
    element is 1500.
    """
    # load file
    context = ADAMContext(self.ss)
    sam_path = self.resourceFile("small.sam")

    # read alignments and convert them to coverage
    coverage = context.loadAlignments(sam_path).toCoverage()

    distribution = CoverageDistribution(self.ss, coverage)

    for cumulative in (True, False):
        _, per_sample = distribution.plotDistributions(
            testMode=True, cumulative=cumulative, normalize=False
        )
        # first sample: popitem() yields a (key, values) pair; only values are checked
        entries = list(per_sample.popitem()[1])
        assert len(entries) == 1
        assert entries.pop()[1] == 1500
def __init__(self, sc, build='hg19'):
    """
    Set up the viz object for a GenomicRDD.

    :param sc: handle passed straight to ADAMContext (presumably a Spark
        context or session -- confirm against callers)
    :param build: genome build identifier stored on the instance;
        defaults to 'hg19'
    """
    adam_context = ADAMContext(sc)

    self.ac = adam_context
    self.build = build
    # fixed chromosome-name prefix kept on the instance
    self.chrPrefix = 'chr'
def test_collapse(self):
    """Collapsing the coverage of sorted.sam must not change the record count.

    Loads the alignments, converts them to coverage, collapses it, and
    compares the DataFrame row counts before and after collapsing.
    """
    testFile = self.resourceFile("sorted.sam")
    ac = ADAMContext(self.sc)

    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    collapsed = coverage.collapse()

    # assertEquals is a deprecated alias that was removed in Python 3.12
    self.assertEqual(collapsed.toDF().count(), coverage.toDF().count())
def test_flatten(self):
    """Flattening the coverage of small.sam must yield 1500 records."""
    testFile = self.resourceFile("small.sam")
    ac = ADAMContext(self.sc)

    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    flattened = coverage.flatten()

    # assertEquals is a deprecated alias that was removed in Python 3.12
    self.assertEqual(flattened.toDF().count(), 1500)
def test_aggregatedCoverage(self):
    """aggregatedCoverage(10) on the coverage of small.sam must yield 166 records."""
    testFile = self.resourceFile("small.sam")
    ac = ADAMContext(self.sc)

    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    # named for what it holds: the result of aggregatedCoverage, not collapse
    aggregated = coverage.aggregatedCoverage(10)

    # assertEquals is a deprecated alias that was removed in Python 3.12
    self.assertEqual(aggregated.toDF().count(), 166)
def test_load_coverage(self):
    """Loading sample_coverage.bed as coverage must produce 3 DataFrame rows."""
    bed_path = self.resourceFile("sample_coverage.bed")
    context = ADAMContext(self.sc)

    loaded = context.loadCoverage(bed_path)
    row_count = loaded.toDF().count()

    self.assertEqual(row_count, 3)
def test_load_genotypes(self):
    """Loading small.vcf as genotypes must give 18 records.

    The count is checked through the DataFrame and then through the
    wrapped JVM RDD.
    """
    vcf_path = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)

    genotypes = context.loadGenotypes(vcf_path)

    df_count = genotypes.toDF().count()
    self.assertEqual(df_count, 18)

    jvm_count = genotypes._jvmRdd.jrdd().count()
    self.assertEqual(jvm_count, 18)
def test_load_dna_sequences(self):
    """Loading HLA_DQB1_05_01_01_02.fa as DNA sequences must give 1 record.

    The count is checked through the DataFrame and then through the
    wrapped JVM dataset.
    """
    fasta_path = self.resourceFile("HLA_DQB1_05_01_01_02.fa")
    context = ADAMContext(self.ss)

    dna = context.loadDnaSequences(fasta_path)

    df_count = dna.toDF().count()
    self.assertEqual(df_count, 1)

    jvm_count = dna._jvmDataset.jrdd().count()
    self.assertEqual(jvm_count, 1)
def test_load_slices(self):
    """Loading HLA_DQB1_05_01_01_02.fa as slices (second argument 10000) must give 1 record.

    The count is checked through the DataFrame and then through the
    wrapped JVM dataset.
    """
    fasta_path = self.resourceFile("HLA_DQB1_05_01_01_02.fa")
    context = ADAMContext(self.ss)

    loaded = context.loadSlices(fasta_path, 10000)

    df_count = loaded.toDF().count()
    self.assertEqual(df_count, 1)

    jvm_count = loaded._jvmDataset.jrdd().count()
    self.assertEqual(jvm_count, 1)
def test_load_interval_list(self):
    """Loading the SeqCap exome interval_list as features must give 369 records.

    The count is checked through the DataFrame and then through the
    wrapped JVM RDD.
    """
    interval_path = self.resourceFile("SeqCap_EZ_Exome_v3.hg19.interval_list")
    context = ADAMContext(self.ss)

    features = context.loadFeatures(interval_path)

    df_count = features.toDF().count()
    self.assertEqual(df_count, 369)

    jvm_count = features._jvmRdd.jrdd().count()
    self.assertEqual(jvm_count, 369)
def test_to_fragments(self):
    """Converting the alignments of unsorted.sam to fragments must give 5 records."""
    sam_path = self.resourceFile("unsorted.sam")
    context = ADAMContext(self.ss)

    as_fragments = context.loadAlignments(sam_path).toFragments()
    fragment_count = as_fragments.toDF().count()

    self.assertEqual(fragment_count, 5)
def test_VariantsPerSampleDistribution(self):
    """Check the total of the variants-per-sample data for genodata.v3.test.vcf.

    Only the sum of the data returned by plotDistributions (test mode) is
    compared against the sum of the expected values.
    """
    context = ADAMContext(self.ss)
    vcf_path = self.resourceFile("genodata.v3.test.vcf")
    genotypes = context.loadGenotypes(vcf_path)

    distribution = VariantsPerSampleDistribution(self.ss, genotypes)
    _, data = distribution.plotDistributions(testMode=True)

    expected = [6, 8, 8, 1, 7, 8]
    observed_total = sum(data)
    expected_total = sum(expected)
    assert observed_total == expected_total
def test_load_bed(self):
    """Loading gencode.v7.annotation.trunc10.bed as features must give 10 records.

    The count is checked through the DataFrame and then through the
    wrapped JVM RDD.
    """
    bed_path = self.resourceFile("gencode.v7.annotation.trunc10.bed")
    context = ADAMContext(self.ss)

    features = context.loadFeatures(bed_path)

    df_count = features.toDF().count()
    self.assertEqual(df_count, 10)

    jvm_count = features._jvmRdd.jrdd().count()
    self.assertEqual(jvm_count, 10)
def test_transform(self):
    """Filtering the genotypes of random.vcf to contigName '1' must leave 9 records.

    The filter is applied through transform(), which receives a callable
    operating on the underlying DataFrame-like object.
    """
    testFile = self.resourceFile("random.vcf")
    ac = ADAMContext(self.sc)

    genotypes = ac.loadGenotypes(testFile)
    transformedGenotypes = genotypes.transform(lambda x: x.filter(x.contigName == '1'))

    # assertEquals is a deprecated alias that was removed in Python 3.12
    self.assertEqual(transformedGenotypes.toDF().count(), 9)
def test_load_gtf(self):
    """Loading Homo_sapiens.GRCh37.75.trun20.gtf as features must give 15 records.

    The count is checked through the DataFrame and then through the
    wrapped JVM RDD.
    """
    gtf_path = self.resourceFile("Homo_sapiens.GRCh37.75.trun20.gtf")
    context = ADAMContext(self.ss)

    features = context.loadFeatures(gtf_path)

    df_count = features.toDF().count()
    self.assertEqual(df_count, 15)

    jvm_count = features._jvmRdd.jrdd().count()
    self.assertEqual(jvm_count, 15)
def test_load_alignments(self):
    """Loading small.sam as alignments must give 20 records.

    The count is checked through the DataFrame and then through the
    wrapped JVM RDD.
    """
    sam_path = self.resourceFile("small.sam")
    context = ADAMContext(self.ss)

    alignments = context.loadAlignments(sam_path)

    df_count = alignments.toDF().count()
    self.assertEqual(df_count, 20)

    jvm_count = alignments._jvmRdd.jrdd().count()
    self.assertEqual(jvm_count, 20)
def test_load_variants(self):
    """Loading small.vcf as variants must give 6 records.

    The count is checked through the DataFrame and then through the
    wrapped JVM RDD.
    """
    vcf_path = self.resourceFile("small.vcf")
    context = ADAMContext(self.sc)

    variants = context.loadVariants(vcf_path)

    df_count = variants.toDF().count()
    self.assertEqual(df_count, 6)

    jvm_count = variants._jvmRdd.jrdd().count()
    self.assertEqual(jvm_count, 6)
def test_count_kmers(self):
    """countKmers(6) on the alignments of small.sam must report a count of 1040."""
    sam_path = self.resourceFile("small.sam")
    context = ADAMContext(self.ss)

    alignments = context.loadAlignments(sam_path)
    kmer_counts = alignments.countKmers(6)

    self.assertEqual(kmer_counts.count(), 1040)
def test_load_contig_fragments(self):
    """Loading HLA_DQB1_05_01_01_02.fa as contig fragments must give 1 record.

    The count is checked through the DataFrame and then through the
    wrapped JVM RDD.
    """
    fasta_path = self.resourceFile("HLA_DQB1_05_01_01_02.fa")
    context = ADAMContext(self.sc)

    contig_fragments = context.loadContigFragments(fasta_path)

    df_count = contig_fragments.toDF().count()
    self.assertEqual(df_count, 1)

    jvm_count = contig_fragments._jvmRdd.jrdd().count()
    self.assertEqual(jvm_count, 1)
def test_load_indexed_bam(self):
    """Loading two regions from indexed_bams/sorted.bam must give 2 records."""
    bam_path = self.resourceFile("indexed_bams/sorted.bam")
    context = ADAMContext(self.ss)

    # the regions handed to loadIndexedBam to restrict the load
    query_regions = [
        ReferenceRegion("chr2", 100, 101),
        ReferenceRegion("3", 10, 17),
    ]
    alignments = context.loadIndexedBam(bam_path, query_regions)

    self.assertEqual(alignments.toDF().count(), 2)
def test_GenotypeCallRatesDistribution(self):
    """Check the genotype call-rate data for genodata.v3.test.vcf.

    The data returned by plot (test mode, sample=1.0) is sorted, rounded to
    two decimals, and compared against the sorted expected values.
    """
    context = ADAMContext(self.ss)
    vcf_path = self.resourceFile("genodata.v3.test.vcf")
    genotypes = context.loadGenotypes(vcf_path)

    distribution = GenotypeCallRatesDistribution(self.ss, genotypes, sample=1.0)
    _, data = distribution.plot(testMode=True)

    expected = sorted([0.95, 0.88, 0.89, 0.94, 0.93, 0.90])
    # sort first so the comparison does not depend on the order of the data
    observed = [round(rate, 2) for rate in sorted(data)]
    assert expected == observed
def test_transform(self):
    """Filtering the features of gencode.v7.annotation.trunc10.bed to start < 12613 must leave 6 records.

    The filter is applied through transform(), which receives a callable
    operating on the underlying DataFrame-like object.
    """
    featurePath = self.resourceFile("gencode.v7.annotation.trunc10.bed")
    ac = ADAMContext(self.ss)

    features = ac.loadFeatures(featurePath)
    transformedFeatures = features.transform(lambda x: x.filter(x.start < 12613))

    # assertEquals is a deprecated alias that was removed in Python 3.12
    self.assertEqual(transformedFeatures.toDF().count(), 6)
def test_transform(self):
    """Filtering the alignments of unsorted.sam to referenceName "1" must leave 1 record."""
    sam_path = self.resourceFile("unsorted.sam")
    context = ADAMContext(self.ss)

    alignments = context.loadAlignments(sam_path)
    # transform() applies the callable to the underlying DataFrame-like object
    only_ref_1 = alignments.transform(lambda df: df.filter(df.referenceName == "1"))

    remaining = only_ref_1.toDF().count()
    self.assertEqual(remaining, 1)
def test_transform(self):
    """Filtering the variants of small.vcf to start < 19190 must leave 3 records.

    The filter is applied through transform(), which receives a callable
    operating on the underlying DataFrame-like object.
    """
    variantPath = self.resourceFile("small.vcf")
    ac = ADAMContext(self.ss)

    variants = ac.loadVariants(variantPath)
    transformedVariants = variants.transform(lambda x: x.filter(x.start < 19190))

    # assertEquals is a deprecated alias that was removed in Python 3.12
    self.assertEqual(transformedVariants.toDF().count(), 3)
def test_save_unordered_sam(self):
    """Round-trip unordered.sam through saveAsSam and compare with the input.

    The alignments are written as a single file to a temporary ".sam" path,
    and checkFiles is used to compare that output with the original resource.
    """
    source_path = self.resourceFile("unordered.sam")
    context = ADAMContext(self.ss)
    alignments = context.loadAlignments(source_path)

    output_path = self.tmpFile() + ".sam"
    alignments.saveAsSam(output_path, asSingleFile=True)

    self.checkFiles(source_path, output_path)
def test_HetHomRatioDistribution(self):
    """Check the het/hom ratio data for genodata.v3.test.vcf.

    The data returned by plot (test mode, sample=1.0) is sorted, rounded to
    two decimals, and compared against the sorted expected values.
    """
    context = ADAMContext(self.ss)
    vcf_path = self.resourceFile("genodata.v3.test.vcf")
    genotypes = context.loadGenotypes(vcf_path)

    distribution = HetHomRatioDistribution(self.ss, genotypes, sample=1.0)
    _, data = distribution.plot(testMode=True)

    expected = sorted([5.0, 0.6, 0.14, 0.17, 1.67])
    # sort first so the comparison does not depend on the order of the data
    observed = [round(ratio, 2) for ratio in sorted(data)]
    assert expected == observed
def test_load_narrowPeak(self):
    """Loading the truncated narrowPeak resource as features must give 10 records.

    The count is checked through the DataFrame and then through the
    wrapped JVM RDD.
    """
    peak_path = self.resourceFile("wgEncodeOpenChromDnaseGm19238Pk.trunc10.narrowPeak")
    context = ADAMContext(self.ss)

    features = context.loadFeatures(peak_path)

    df_count = features.toDF().count()
    self.assertEqual(df_count, 10)

    jvm_count = features._jvmRdd.jrdd().count()
    self.assertEqual(jvm_count, 10)
def test_realignIndels_reads(self):
    """realignIndels on the alignments of small.1.sam must return 20 records."""
    sam_path = self.resourceFile("small.1.sam")
    context = ADAMContext(self.ss)

    alignments = context.loadAlignments(sam_path)
    after_realignment = alignments.realignIndels()

    record_count = after_realignment.toDF().count()
    self.assertEqual(record_count, 20)
def test_toFeatures(self):
    """Converting the coverage of sorted.sam to features keeps type and count.

    The result of toFeatures() must be a FeatureRDD with the same number of
    DataFrame rows as the coverage it was built from.
    """
    testFile = self.resourceFile("sorted.sam")
    ac = ADAMContext(self.sc)

    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    features = coverage.toFeatures()

    # assertIsInstance is not stripped under -O and reports the actual type on failure
    self.assertIsInstance(features, FeatureRDD)
    # assertEquals is a deprecated alias that was removed in Python 3.12
    self.assertEqual(features.toDF().count(), coverage.toDF().count())
def test_save(self):
    """Saving the coverage of sorted.sam must leave a non-empty output directory.

    The coverage is saved to a temporary ".coverage.adam" path with
    asSingleFile=True and disableFastConcat=True; the test then lists that
    path and asserts the listing is not empty.
    """
    sam_path = self.resourceFile("sorted.sam")
    context = ADAMContext(self.ss)
    coverage = context.loadAlignments(sam_path).toCoverage()

    output_path = self.tmpFile() + ".coverage.adam"
    coverage.save(output_path, asSingleFile=True, disableFastConcat=True)

    written = os.listdir(output_path)
    assert written != []