def test_cumulative_count_distribution(self):
    # Computes coverage for small.sam and runs CoverageDistribution in test
    # mode twice (cumulative first, then non-cumulative; never normalized).
    # In both runs the popped sample must hold exactly one item whose element
    # [1] is 1500.
    context = ADAMContext(self.ss)
    sam_path = self.resourceFile("small.sam")

    # read alignments, then convert them to coverage
    alignments = context.loadAlignments(sam_path)
    sample_coverage = alignments.toCoverage()
    distribution = CoverageDistribution(self.ss, sample_coverage)

    for cumulative in (True, False):
        _, per_sample = distribution.plotDistributions(
            testMode=True, cumulative=cumulative, normalize=False)
        # first sample: only element [1] of the popped entry is inspected
        items = list(per_sample.popitem()[1])
        assert len(items) == 1
        assert items.pop()[1] == 1500
def test_collapse(self):
    # Collapsing the coverage computed from sorted.sam must leave the
    # toDF() count unchanged.
    testFile = self.resourceFile("sorted.sam")
    ac = ADAMContext(self.ss)

    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    collapsed = coverage.collapse()

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(collapsed.toDF().count(), coverage.toDF().count())
def test_aggregatedCoverage(self):
    # Coverage from small.sam, aggregated with an argument of 10, must
    # report a toDF() count of 166.
    testFile = self.resourceFile("small.sam")
    ac = ADAMContext(self.ss)

    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    collapsed = coverage.aggregatedCoverage(10)

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(collapsed.toDF().count(), 166)
def test_flatten(self):
    # Flattening the coverage computed from small.sam must report a
    # toDF() count of 1500.
    testFile = self.resourceFile("small.sam")
    ac = ADAMContext(self.ss)

    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    flattened = coverage.flatten()

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(flattened.toDF().count(), 1500)
def test_collapse(self):
    # Collapsing the coverage computed from sorted.sam must leave the
    # toDF() count unchanged. This variant builds the context from self.sc.
    testFile = self.resourceFile("sorted.sam")
    ac = ADAMContext(self.sc)

    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    collapsed = coverage.collapse()

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(collapsed.toDF().count(), coverage.toDF().count())
def test_aggregatedCoverage(self):
    # Coverage from small.sam, aggregated with an argument of 10, must
    # report a toDF() count of 166. This variant builds the context from
    # self.sc.
    testFile = self.resourceFile("small.sam")
    ac = ADAMContext(self.sc)

    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    collapsed = coverage.aggregatedCoverage(10)

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(collapsed.toDF().count(), 166)
def test_load_coverage(self):
    # Loading sample_coverage.bed through loadCoverage must give a toDF()
    # count of 3. The context is built from self.sc.
    bed_path = self.resourceFile("sample_coverage.bed")
    context = ADAMContext(self.sc)

    loaded = context.loadCoverage(bed_path)
    row_count = loaded.toDF().count()

    self.assertEqual(row_count, 3)
def test_flatten(self):
    # Flattening the coverage computed from small.sam must report a
    # toDF() count of 1500. This variant builds the context from self.sc.
    testFile = self.resourceFile("small.sam")
    ac = ADAMContext(self.sc)

    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    flattened = coverage.flatten()

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(flattened.toDF().count(), 1500)
def test_load_slices(self):
    # Loading HLA_DQB1_05_01_01_02.fa via loadSlices (second argument 10000)
    # must yield a count of 1, both through toDF() and through
    # _jvmDataset.jrdd().
    fasta_path = self.resourceFile("HLA_DQB1_05_01_01_02.fa")
    context = ADAMContext(self.ss)
    expected_count = 1

    loaded = context.loadSlices(fasta_path, 10000)

    self.assertEqual(loaded.toDF().count(), expected_count)
    self.assertEqual(loaded._jvmDataset.jrdd().count(), expected_count)
def test_load_coverage(self):
    # Loading sample_coverage.bed through loadCoverage must give a toDF()
    # count of 3.
    bed_path = self.resourceFile("sample_coverage.bed")
    context = ADAMContext(self.ss)

    loaded = context.loadCoverage(bed_path)
    row_count = loaded.toDF().count()

    self.assertEqual(row_count, 3)
def test_load_interval_list(self):
    # Loading the SeqCap interval_list resource via loadFeatures must yield
    # a count of 369, both through toDF() and through _jvmRdd.jrdd().
    interval_path = self.resourceFile("SeqCap_EZ_Exome_v3.hg19.interval_list")
    context = ADAMContext(self.ss)
    expected_count = 369

    features = context.loadFeatures(interval_path)

    self.assertEqual(features.toDF().count(), expected_count)
    self.assertEqual(features._jvmRdd.jrdd().count(), expected_count)
def test_load_bed(self):
    # Loading gencode.v7.annotation.trunc10.bed via loadFeatures must yield
    # a count of 10, both through toDF() and through _jvmRdd.jrdd().
    bed_path = self.resourceFile("gencode.v7.annotation.trunc10.bed")
    context = ADAMContext(self.ss)
    expected_count = 10

    features = context.loadFeatures(bed_path)

    self.assertEqual(features.toDF().count(), expected_count)
    self.assertEqual(features._jvmRdd.jrdd().count(), expected_count)
def test_load_gtf(self):
    # Loading Homo_sapiens.GRCh37.75.trun20.gtf via loadFeatures must yield
    # a count of 15, both through toDF() and through _jvmRdd.jrdd().
    gtf_path = self.resourceFile("Homo_sapiens.GRCh37.75.trun20.gtf")
    context = ADAMContext(self.ss)
    expected_count = 15

    features = context.loadFeatures(gtf_path)

    self.assertEqual(features.toDF().count(), expected_count)
    self.assertEqual(features._jvmRdd.jrdd().count(), expected_count)
def test_load_alignments(self):
    # Loading small.sam via loadAlignments must yield a count of 20, both
    # through toDF() and through _jvmRdd.jrdd().
    sam_path = self.resourceFile("small.sam")
    context = ADAMContext(self.ss)
    expected_count = 20

    alignments = context.loadAlignments(sam_path)

    self.assertEqual(alignments.toDF().count(), expected_count)
    self.assertEqual(alignments._jvmRdd.jrdd().count(), expected_count)
def test_load_dna_sequences(self):
    # Loading HLA_DQB1_05_01_01_02.fa via loadDnaSequences must yield a
    # count of 1, both through toDF() and through _jvmDataset.jrdd().
    fasta_path = self.resourceFile("HLA_DQB1_05_01_01_02.fa")
    context = ADAMContext(self.ss)
    expected_count = 1

    loaded = context.loadDnaSequences(fasta_path)

    self.assertEqual(loaded.toDF().count(), expected_count)
    self.assertEqual(loaded._jvmDataset.jrdd().count(), expected_count)
def test_VariantsPerSampleDistribution(self):
    # Runs VariantsPerSampleDistribution.plotDistributions in test mode on
    # the genotypes of genodata.v3.test.vcf. Only the totals are compared:
    # the sum of the returned data must equal the sum of the expected list.
    context = ADAMContext(self.ss)
    vcf_path = self.resourceFile("genodata.v3.test.vcf")

    loaded_genotypes = context.loadGenotypes(vcf_path)
    distribution = VariantsPerSampleDistribution(self.ss, loaded_genotypes)
    _, data = distribution.plotDistributions(testMode=True)

    expected = [6, 8, 8, 1, 7, 8]
    assert sum(data) == sum(expected)
def test_load_interval_list(self):
    # The SeqCap interval_list resource, read with loadFeatures, is expected
    # to report 369 through toDF() as well as through _jvmRdd.jrdd().
    resource = self.resourceFile("SeqCap_EZ_Exome_v3.hg19.interval_list")
    adam = ADAMContext(self.ss)

    loaded = adam.loadFeatures(resource)
    df_count = loaded.toDF().count()
    self.assertEqual(df_count, 369)

    rdd_count = loaded._jvmRdd.jrdd().count()
    self.assertEqual(rdd_count, 369)
def test_load_bed(self):
    # gencode.v7.annotation.trunc10.bed, read with loadFeatures, is expected
    # to report 10 through toDF() as well as through _jvmRdd.jrdd().
    resource = self.resourceFile("gencode.v7.annotation.trunc10.bed")
    adam = ADAMContext(self.ss)

    loaded = adam.loadFeatures(resource)
    df_count = loaded.toDF().count()
    self.assertEqual(df_count, 10)

    rdd_count = loaded._jvmRdd.jrdd().count()
    self.assertEqual(rdd_count, 10)
def test_transform(self):
    # Genotypes from random.vcf, transformed with a filter on
    # contigName == '1', must report a toDF() count of 9. The context is
    # built from self.sc.
    testFile = self.resourceFile("random.vcf")
    ac = ADAMContext(self.sc)
    genotypes = ac.loadGenotypes(testFile)

    # the lambda's argument is filtered on its own contigName column
    transformedGenotypes = genotypes.transform(lambda x: x.filter(x.contigName == '1'))

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(transformedGenotypes.toDF().count(), 9)
def test_count_kmers(self):
    # countKmers(6) on the alignments of small.sam must report a count()
    # of 1040.
    sam_path = self.resourceFile("small.sam")
    context = ADAMContext(self.ss)

    alignments = context.loadAlignments(sam_path)
    kmer_counts = alignments.countKmers(6)
    total = kmer_counts.count()

    self.assertEqual(total, 1040)
def test_load_alignments(self):
    # small.sam, read with loadAlignments, is expected to report 20 through
    # toDF() as well as through _jvmRdd.jrdd().
    resource = self.resourceFile("small.sam")
    adam = ADAMContext(self.ss)

    loaded = adam.loadAlignments(resource)
    df_count = loaded.toDF().count()
    self.assertEqual(df_count, 20)

    rdd_count = loaded._jvmRdd.jrdd().count()
    self.assertEqual(rdd_count, 20)
def test_load_genotypes(self):
    # Loading small.vcf via loadGenotypes must yield a count of 18, both
    # through toDF() and through _jvmRdd.jrdd().
    vcf_path = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)
    expected_count = 18

    loaded_genotypes = context.loadGenotypes(vcf_path)

    self.assertEqual(loaded_genotypes.toDF().count(), expected_count)
    self.assertEqual(loaded_genotypes._jvmRdd.jrdd().count(), expected_count)
def test_to_fragments(self):
    # Converting the alignments of unsorted.sam with toFragments must give
    # a toDF() count of 5.
    sam_path = self.resourceFile("unsorted.sam")
    context = ADAMContext(self.ss)

    alignments = context.loadAlignments(sam_path)
    as_fragments = alignments.toFragments()
    fragment_count = as_fragments.toDF().count()

    self.assertEqual(fragment_count, 5)
def test_load_contig_fragments(self):
    # Loading HLA_DQB1_05_01_01_02.fa via loadContigFragments must yield a
    # count of 1, both through toDF() and through _jvmRdd.jrdd(). The
    # context is built from self.sc.
    fasta_path = self.resourceFile("HLA_DQB1_05_01_01_02.fa")
    context = ADAMContext(self.sc)
    expected_count = 1

    contig_fragments = context.loadContigFragments(fasta_path)

    self.assertEqual(contig_fragments.toDF().count(), expected_count)
    self.assertEqual(contig_fragments._jvmRdd.jrdd().count(), expected_count)
def test_load_gtf(self):
    # Homo_sapiens.GRCh37.75.trun20.gtf, read with loadFeatures, is expected
    # to report 15 through toDF() as well as through _jvmRdd.jrdd().
    resource = self.resourceFile("Homo_sapiens.GRCh37.75.trun20.gtf")
    adam = ADAMContext(self.ss)

    loaded = adam.loadFeatures(resource)
    df_count = loaded.toDF().count()
    self.assertEqual(df_count, 15)

    rdd_count = loaded._jvmRdd.jrdd().count()
    self.assertEqual(rdd_count, 15)
def test_load_variants(self):
    # Loading small.vcf via loadVariants must yield a count of 6, both
    # through toDF() and through _jvmRdd.jrdd(). The context is built from
    # self.sc.
    vcf_path = self.resourceFile("small.vcf")
    context = ADAMContext(self.sc)
    expected_count = 6

    loaded_variants = context.loadVariants(vcf_path)

    self.assertEqual(loaded_variants.toDF().count(), expected_count)
    self.assertEqual(loaded_variants._jvmRdd.jrdd().count(), expected_count)
def test_transform(self):
    # Genotypes from random.vcf, transformed with a filter on
    # contigName == '1', must report a toDF() count of 9.
    testFile = self.resourceFile("random.vcf")
    ac = ADAMContext(self.ss)
    genotypes = ac.loadGenotypes(testFile)

    # the lambda's argument is filtered on its own contigName column
    transformedGenotypes = genotypes.transform(lambda x: x.filter(x.contigName == '1'))

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(transformedGenotypes.toDF().count(), 9)
def test_realignIndels_reads(self):
    # Running realignIndels (default arguments) on the alignments of
    # small.1.sam must give a toDF() count of 20.
    sam_path = self.resourceFile("small.1.sam")
    context = ADAMContext(self.ss)

    alignments = context.loadAlignments(sam_path)
    after_realign = alignments.realignIndels()
    realigned_count = after_realign.toDF().count()

    self.assertEqual(realigned_count, 20)
def test_transform(self):
    # Alignments from unsorted.sam, transformed with a filter on
    # referenceName == "1", must report a toDF() count of 1.
    sam_path = self.resourceFile("unsorted.sam")
    context = ADAMContext(self.ss)
    alignments = context.loadAlignments(sam_path)

    kept = alignments.transform(
        lambda frame: frame.filter(frame.referenceName == "1"))
    kept_count = kept.toDF().count()

    self.assertEqual(kept_count, 1)
def test_transform(self):
    # Features from gencode.v7.annotation.trunc10.bed, transformed with a
    # filter on start < 12613, must report a toDF() count of 6.
    bed_path = self.resourceFile("gencode.v7.annotation.trunc10.bed")
    context = ADAMContext(self.ss)
    loaded = context.loadFeatures(bed_path)

    kept = loaded.transform(
        lambda frame: frame.filter(frame.start < 12613))
    kept_count = kept.toDF().count()

    self.assertEqual(kept_count, 6)
def test_GenotypeCallRatesDistribution(self):
    # Plots GenotypeCallRatesDistribution (sample=1.0) in test mode for the
    # genotypes of genodata.v3.test.vcf. The returned values are sorted,
    # rounded to two decimals, and compared with the sorted expected list.
    context = ADAMContext(self.ss)
    vcf_path = self.resourceFile("genodata.v3.test.vcf")

    loaded_genotypes = context.loadGenotypes(vcf_path)
    distribution = GenotypeCallRatesDistribution(self.ss, loaded_genotypes, sample=1.0)
    _, data = distribution.plot(testMode=True)

    expected = sorted([0.95, 0.88, 0.89, 0.94, 0.93, 0.90])
    # sorting happens on the raw values; rounding is applied afterwards
    rounded = [round(value, 2) for value in sorted(data)]
    assert expected == rounded
def test_load_indexed_bam(self):
    # Querying indexed_bams/sorted.bam with two ReferenceRegions through
    # loadIndexedBam must give a toDF() count of 2.
    bam_path = self.resourceFile("indexed_bams/sorted.bam")
    context = ADAMContext(self.ss)

    first_region = ReferenceRegion("chr2", 100, 101)
    second_region = ReferenceRegion("3", 10, 17)
    regions = [first_region, second_region]

    matched = context.loadIndexedBam(bam_path, regions)
    self.assertEqual(matched.toDF().count(), 2)
def test_load_narrowPeak(self):
    # Loading the trunc10 narrowPeak resource via loadFeatures must yield a
    # count of 10, both through toDF() and through _jvmRdd.jrdd().
    peak_path = self.resourceFile("wgEncodeOpenChromDnaseGm19238Pk.trunc10.narrowPeak")
    context = ADAMContext(self.ss)
    expected_count = 10

    features = context.loadFeatures(peak_path)

    self.assertEqual(features.toDF().count(), expected_count)
    self.assertEqual(features._jvmRdd.jrdd().count(), expected_count)
def test_transform(self):
    # Variants from small.vcf, transformed with a filter on start < 19190,
    # must report a toDF() count of 3.
    vcf_path = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)
    loaded = context.loadVariants(vcf_path)

    kept = loaded.transform(
        lambda frame: frame.filter(frame.start < 19190))
    kept_count = kept.toDF().count()

    self.assertEqual(kept_count, 3)
def test_HetHomRatioDistribution(self):
    # Plots HetHomRatioDistribution (sample=1.0) in test mode for the
    # genotypes of genodata.v3.test.vcf. The returned values are sorted,
    # rounded to two decimals, and compared with the sorted expected list.
    context = ADAMContext(self.ss)
    vcf_path = self.resourceFile("genodata.v3.test.vcf")

    loaded_genotypes = context.loadGenotypes(vcf_path)
    distribution = HetHomRatioDistribution(self.ss, loaded_genotypes, sample=1.0)
    _, data = distribution.plot(testMode=True)

    expected = sorted([5.0, 0.6, 0.14, 0.17, 1.67])
    # sorting happens on the raw values; rounding is applied afterwards
    rounded = [round(value, 2) for value in sorted(data)]
    assert expected == rounded
def test_load_variants(self):
    # Loading small.vcf via loadVariants must yield a count of 6, both
    # through toDF() and through _jvmRdd.jrdd().
    vcf_path = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)
    expected_count = 6

    loaded_variants = context.loadVariants(vcf_path)

    self.assertEqual(loaded_variants.toDF().count(), expected_count)
    self.assertEqual(loaded_variants._jvmRdd.jrdd().count(), expected_count)
def test_toFeatures(self):
    # Coverage computed from sorted.sam, converted with toFeatures, must be
    # a FeatureRDD with the same toDF() count as the coverage itself. The
    # context is built from self.sc.
    testFile = self.resourceFile("sorted.sam")
    ac = ADAMContext(self.sc)

    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    features = coverage.toFeatures()

    # assertIsInstance instead of a bare assert: it is not stripped under
    # ``python -O`` and reports the offending type on failure.
    self.assertIsInstance(features, FeatureRDD)
    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(features.toDF().count(), coverage.toDF().count())
def test_load_contig_fragments(self):
    # Loading HLA_DQB1_05_01_01_02.fa via loadContigFragments must yield a
    # count of 1, both through toDF() and through _jvmRdd.jrdd().
    fasta_path = self.resourceFile("HLA_DQB1_05_01_01_02.fa")
    context = ADAMContext(self.ss)
    expected_count = 1

    contig_fragments = context.loadContigFragments(fasta_path)

    self.assertEqual(contig_fragments.toDF().count(), expected_count)
    self.assertEqual(contig_fragments._jvmRdd.jrdd().count(), expected_count)
def test_toFeatures(self):
    # Coverage computed from sorted.sam, converted with toFeatures, must be
    # a FeatureRDD with the same toDF() count as the coverage itself.
    testFile = self.resourceFile("sorted.sam")
    ac = ADAMContext(self.ss)

    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    features = coverage.toFeatures()

    # assertIsInstance instead of a bare assert: it is not stripped under
    # ``python -O`` and reports the offending type on failure.
    self.assertIsInstance(features, FeatureRDD)
    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(features.toDF().count(), coverage.toDF().count())
def test_save(self):
    # Saves the coverage computed from sorted.sam to a temporary
    # ".coverage.adam" path (asSingleFile=True, disableFastConcat=True) and
    # checks that listing that path returns something other than [].
    sam_path = self.resourceFile("sorted.sam")
    context = ADAMContext(self.ss)

    alignments = context.loadAlignments(sam_path)
    sample_coverage = alignments.toCoverage()

    out_path = self.tmpFile() + ".coverage.adam"
    sample_coverage.save(out_path, asSingleFile=True, disableFastConcat=True)

    written = os.listdir(out_path)
    assert written != []
def test_save_unordered_sam(self):
    # Loads unordered.sam, writes it back with saveAsSam (asSingleFile=True)
    # to a temporary ".sam" path, and hands the source and output paths to
    # self.checkFiles.
    source_path = self.resourceFile("unordered.sam")
    context = ADAMContext(self.ss)
    alignments = context.loadAlignments(source_path)

    out_path = self.tmpFile() + ".sam"
    alignments.saveAsSam(out_path, asSingleFile=True)

    self.checkFiles(source_path, out_path)
def test_transform(self):
    # Features from gencode.v7.annotation.trunc10.bed, transformed with a
    # filter on start < 12613, must report a toDF() count of 6.
    featurePath = self.resourceFile("gencode.v7.annotation.trunc10.bed")
    ac = ADAMContext(self.ss)
    features = ac.loadFeatures(featurePath)

    # the lambda's argument is filtered on its own start column
    transformedFeatures = features.transform(lambda x: x.filter(x.start < 12613))

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(transformedFeatures.toDF().count(), 6)
def test_transform(self):
    # Variants from small.vcf, transformed with a filter on start < 19190,
    # must report a toDF() count of 3.
    variantPath = self.resourceFile("small.vcf")
    ac = ADAMContext(self.ss)
    variants = ac.loadVariants(variantPath)

    # the lambda's argument is filtered on its own start column
    transformedVariants = variants.transform(lambda x: x.filter(x.start < 19190))

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(transformedVariants.toDF().count(), 3)
def test_load_indexed_bam(self):
    # Querying indexed_bams/sorted.bam with two ReferenceRegions through
    # loadIndexedBam must give a toDF() count of 2.
    bam_path = self.resourceFile("indexed_bams/sorted.bam")
    context = ADAMContext(self.ss)

    matched = context.loadIndexedBam(
        bam_path,
        [
            ReferenceRegion("chr2", 100, 101),
            ReferenceRegion("3", 10, 17),
        ],
    )
    matched_count = matched.toDF().count()

    self.assertEqual(matched_count, 2)
def test_load_narrowPeak(self):
    # The trunc10 narrowPeak resource, read with loadFeatures, is expected
    # to report 10 through toDF() as well as through _jvmRdd.jrdd().
    resource = self.resourceFile("wgEncodeOpenChromDnaseGm19238Pk.trunc10.narrowPeak")
    adam = ADAMContext(self.ss)

    loaded = adam.loadFeatures(resource)
    df_count = loaded.toDF().count()
    self.assertEqual(df_count, 10)

    rdd_count = loaded._jvmRdd.jrdd().count()
    self.assertEqual(rdd_count, 10)
def test_filterByOverlappingRegion(self):
    # Filtering the alignments of unsorted.sam by
    # ReferenceRegion("chr2", 1, 400) must give a toDF() count of 1.
    sam_path = self.resourceFile("unsorted.sam")
    context = ADAMContext(self.ss)
    alignments = context.loadAlignments(sam_path)

    region = ReferenceRegion("chr2", 1, 400)
    overlapping = alignments.filterByOverlappingRegion(region)
    overlap_count = overlapping.toDF().count()

    self.assertEqual(overlap_count, 1)
def test_save_unordered_sam(self):
    # unordered.sam is loaded and re-saved as a single SAM file at a
    # temporary path; self.checkFiles then receives the input path and the
    # output path.
    original = self.resourceFile("unordered.sam")
    adam = ADAMContext(self.ss)
    loaded = adam.loadAlignments(original)

    saved = self.tmpFile() + ".sam"
    loaded.saveAsSam(saved, asSingleFile=True)

    self.checkFiles(original, saved)
def test_vcf_sort_lex(self):
    # Genotypes from random.vcf are converted with toVariantContexts, sorted
    # with sortLexicographically and saved as a single VCF; the output is
    # passed to self.checkFiles together with sorted.lex.vcf from the
    # 'adam-cli' module.
    vcf_path = self.resourceFile("random.vcf")
    context = ADAMContext(self.ss)
    loaded_genotypes = context.loadGenotypes(vcf_path)

    out_path = self.tmpFile() + ".vcf"
    variant_contexts = loaded_genotypes.toVariantContexts()
    lex_sorted = variant_contexts.sortLexicographically()
    lex_sorted.saveAsVcf(out_path, asSingleFile=True)

    expected_path = self.resourceFile("sorted.lex.vcf", module='adam-cli')
    self.checkFiles(out_path, expected_path)
def test_save(self):
    # Coverage from sorted.sam is saved (asSingleFile=True,
    # disableFastConcat=True) under a temporary ".coverage.adam" path; the
    # directory listing of that path must differ from [].
    resource = self.resourceFile("sorted.sam")
    adam = ADAMContext(self.ss)

    loaded = adam.loadAlignments(resource)
    computed = loaded.toCoverage()

    destination = self.tmpFile() + ".coverage.adam"
    computed.save(destination, asSingleFile=True, disableFastConcat=True)

    listing = os.listdir(destination)
    assert listing != []
def test_vcf_sort(self):
    # Genotypes from random.vcf are converted with toVariantContextRDD,
    # sorted and saved as a single VCF; the output is passed to
    # self.checkFiles together with the sorted.vcf resource. The context is
    # built from self.sc.
    vcf_path = self.resourceFile("random.vcf")
    context = ADAMContext(self.sc)
    loaded_genotypes = context.loadGenotypes(vcf_path)

    out_path = self.tmpFile() + ".vcf"
    variant_contexts = loaded_genotypes.toVariantContextRDD()
    in_order = variant_contexts.sort()
    in_order.saveAsVcf(out_path, asSingleFile=True)

    expected_path = self.resourceFile("sorted.vcf")
    self.checkFiles(out_path, expected_path)
def test_filterByOverlappingRegions(self):
    # Filtering the alignments of unsorted.sam by two ReferenceRegions must
    # give a toDF() count of 2.
    readsPath = self.resourceFile("unsorted.sam")
    ac = ADAMContext(self.ss)
    reads = ac.loadAlignments(readsPath)

    # Plain int literals: the ``L`` long suffix is Python 2-only syntax and a
    # SyntaxError on Python 3.
    querys = [ReferenceRegion("chr2", 1, 400), ReferenceRegion("3", 1, 100)]
    filtered = reads.filterByOverlappingRegions(querys)

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(filtered.toDF().count(), 2)
def test_union(self):
    # The union of the alignments from sorted.sam with those from
    # unordered.sam must give a toDF() count of 13.
    sorted_path = self.resourceFile("sorted.sam")
    unordered_path = self.resourceFile("unordered.sam")
    context = ADAMContext(self.ss)

    sorted_reads = context.loadAlignments(sorted_path)
    unordered_reads = context.loadAlignments(unordered_path)

    # union() is given a list holding the other dataset
    combined = sorted_reads.union([unordered_reads])
    combined_count = combined.toDF().count()

    self.assertEqual(combined_count, 13)
def test_vcf_add_filter(self):
    # Adds a "BAD" filter header line to the variant contexts derived from
    # small.vcf, saves them as VCF to a temporary path, and asks
    # self.check_for_line_in_file to look for the matching ##FILTER line.
    vcf_path = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)
    loaded_genotypes = context.loadGenotypes(vcf_path)

    out_path = self.tmpFile() + ".vcf"
    variant_contexts = loaded_genotypes.toVariantContexts()
    with_filter = variant_contexts.addFilterHeaderLine("BAD", "Bad variant.")
    with_filter.saveAsVcf(out_path)

    header_line = '##FILTER=<ID=BAD,Description="Bad variant.">'
    self.check_for_line_in_file(out_path, header_line)
def test_to_coverage(self):
    # Coverage computed from unsorted.sam must report a toDF() count of 42
    # with default arguments and 46 when called with collapse=False.
    readsPath = self.resourceFile("unsorted.sam")
    ac = ADAMContext(self.ss)
    reads = ac.loadAlignments(readsPath)

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    coverage = reads.toCoverage()
    self.assertEqual(coverage.toDF().count(), 42)

    coverage = reads.toCoverage(collapse=False)
    self.assertEqual(coverage.toDF().count(), 46)
def test_to_variants(self):
    # Genotypes from small.vcf converted with toVariants must report a
    # toDF() count of 18 with default arguments and 6 with dedupe=True.
    testFile = self.resourceFile("small.vcf")
    ac = ADAMContext(self.ss)
    genotypes = ac.loadGenotypes(testFile)

    # assertEqual, not assertEquals: the alias is deprecated and was removed
    # from unittest in Python 3.12.
    variants = genotypes.toVariants()
    self.assertEqual(variants.toDF().count(), 18)

    variants = genotypes.toVariants(dedupe=True)
    self.assertEqual(variants.toDF().count(), 6)