def test_vcf_round_trip(self):
    """Save genotypes as VCF, reload the saved file, and compare record counts.

    The genotypes loaded from ``small.vcf`` are converted to variant
    contexts and written to a temporary ``.vcf`` path. The written output
    is then loaded again and its JVM RDD record count must match the count
    of the originally loaded genotypes.
    """
    testFile = self.resourceFile("small.vcf")
    ac = ADAMContext(self.ss)

    genotypes = ac.loadGenotypes(testFile)

    tmpPath = self.tmpFile() + ".vcf"
    genotypes.toVariantContexts().saveAsVcf(tmpPath)

    # Reload what was just written. Loading ``testFile`` again here would
    # compare the input with itself and never exercise the saved output.
    savedGenotypes = ac.loadGenotypes(tmpPath)

    # assertEqual: the assertEquals alias is deprecated and removed in
    # Python 3.12.
    self.assertEqual(genotypes._jvmRdd.jrdd().count(),
                     savedGenotypes._jvmRdd.jrdd().count())
def test_vcf_round_trip(self):
    """Save genotypes as VCF, reload the saved file, and compare record counts.

    The genotypes loaded from ``small.vcf`` (via a context built on
    ``self.sc``) are converted to variant contexts and written to a
    temporary ``.vcf`` path. The written output is then loaded again and
    its JVM RDD record count must match the original count.
    """
    testFile = self.resourceFile("small.vcf")
    ac = ADAMContext(self.sc)

    genotypes = ac.loadGenotypes(testFile)

    tmpPath = self.tmpFile() + ".vcf"
    genotypes.toVariantContexts().saveAsVcf(tmpPath)

    # Reload what was just written. Loading ``testFile`` again here would
    # compare the input with itself and never exercise the saved output.
    savedGenotypes = ac.loadGenotypes(tmpPath)

    # assertEqual: the assertEquals alias is deprecated and removed in
    # Python 3.12.
    self.assertEqual(genotypes._jvmRdd.jrdd().count(),
                     savedGenotypes._jvmRdd.jrdd().count())
def test_transform(self):
    """Filter genotypes through ``transform`` and check the surviving row count.

    Loads ``random.vcf``, keeps only rows whose ``contigName`` equals
    ``'1'`` via a lambda passed to ``transform``, and expects the resulting
    DataFrame to contain 9 rows.
    """
    testFile = self.resourceFile("random.vcf")
    ac = ADAMContext(self.ss)

    genotypes = ac.loadGenotypes(testFile)

    transformedGenotypes = genotypes.transform(
        lambda x: x.filter(x.contigName == '1'))

    # assertEqual: the assertEquals alias is deprecated and removed in
    # Python 3.12.
    self.assertEqual(transformedGenotypes.toDF().count(), 9)
def test_VariantsPerSampleDistribution(self):
    """Check the total of the variants-per-sample distribution data.

    Loads genotypes from ``genodata.v3.test.vcf``, calls
    ``plotDistributions(testMode=True)`` on a
    ``VariantsPerSampleDistribution`` and compares the sum of the second
    returned value with the sum of the expected per-sample counts.
    """
    context = ADAMContext(self.ss)
    vcf_path = self.resourceFile("genodata.v3.test.vcf")
    loaded = context.loadGenotypes(vcf_path)

    distribution = VariantsPerSampleDistribution(self.ss, loaded)
    _, counts = distribution.plotDistributions(testMode=True)

    expected_counts = [6, 8, 8, 1, 7, 8]
    # Only the totals are compared, not the individual entries.
    assert sum(counts) == sum(expected_counts)
def test_load_genotypes(self):
    """Load ``small.vcf`` as genotypes and verify it yields 18 records.

    The count is checked twice: once through the DataFrame view and once
    through the underlying JVM RDD.
    """
    vcf_path = self.resourceFile("small.vcf")
    context = ADAMContext(self.sc)

    loaded = context.loadGenotypes(vcf_path)

    df_count = loaded.toDF().count()
    self.assertEqual(df_count, 18)

    rdd_count = loaded._jvmRdd.jrdd().count()
    self.assertEqual(rdd_count, 18)
def test_transform(self):
    """Filter genotypes through ``transform`` and check the surviving row count.

    Loads ``random.vcf`` (via a context built on ``self.sc``), keeps only
    rows whose ``contigName`` equals ``'1'`` via a lambda passed to
    ``transform``, and expects the resulting DataFrame to contain 9 rows.
    """
    testFile = self.resourceFile("random.vcf")
    ac = ADAMContext(self.sc)

    genotypes = ac.loadGenotypes(testFile)

    transformedGenotypes = genotypes.transform(
        lambda x: x.filter(x.contigName == '1'))

    # assertEqual: the assertEquals alias is deprecated and removed in
    # Python 3.12.
    self.assertEqual(transformedGenotypes.toDF().count(), 9)
def test_load_genotypes(self):
    """Load ``small.vcf`` as genotypes and verify it yields 18 records.

    Both the DataFrame view and the underlying JVM RDD must report 18.
    """
    vcf_path = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)

    loaded = context.loadGenotypes(vcf_path)

    as_frame = loaded.toDF()
    self.assertEqual(as_frame.count(), 18)

    as_jrdd = loaded._jvmRdd.jrdd()
    self.assertEqual(as_jrdd.count(), 18)
def test_HetHomRatioDistribution(self):
    """Compare the het/hom ratio distribution data against expected values.

    Loads genotypes from ``genodata.v3.test.vcf``, calls
    ``plot(testMode=True)`` on a ``HetHomRatioDistribution`` built with
    ``sample=1.0``, and checks that the second returned value, sorted and
    rounded to two decimals, equals the sorted expected ratios.
    """
    context = ADAMContext(self.ss)
    vcf_path = self.resourceFile("genodata.v3.test.vcf")
    loaded = context.loadGenotypes(vcf_path)

    distribution = HetHomRatioDistribution(self.ss, loaded, sample=1.0)
    _, ratios = distribution.plot(testMode=True)

    expected_ratios = sorted([5.0, 0.6, 0.14, 0.17, 1.67])
    # Sort before rounding so the comparison is order-independent.
    rounded_ratios = [round(value, 2) for value in sorted(ratios)]
    assert expected_ratios == rounded_ratios
def test_GenotypeCallRatesDistribution(self):
    """Compare the genotype call-rate distribution data against expected values.

    Loads genotypes from ``genodata.v3.test.vcf``, calls
    ``plot(testMode=True)`` on a ``GenotypeCallRatesDistribution`` built
    with ``sample=1.0``, and checks that the second returned value, sorted
    and rounded to two decimals, equals the sorted expected rates.
    """
    context = ADAMContext(self.ss)
    vcf_path = self.resourceFile("genodata.v3.test.vcf")
    loaded = context.loadGenotypes(vcf_path)

    distribution = GenotypeCallRatesDistribution(self.ss, loaded, sample=1.0)
    _, rates = distribution.plot(testMode=True)

    expected_rates = sorted([0.95, 0.88, 0.89, 0.94, 0.93, 0.90])
    # Sort before rounding so the comparison is order-independent.
    rounded_rates = [round(value, 2) for value in sorted(rates)]
    assert expected_rates == rounded_rates
def test_vcf_add_filter(self):
    """Add a FILTER header line and confirm it appears in the saved VCF.

    Genotypes from ``small.vcf`` are converted to variant contexts, a
    ``BAD`` filter header line is added, and the result is saved to a
    temporary ``.vcf`` path that is then searched for the expected line.
    """
    source_vcf = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)

    loaded = context.loadGenotypes(source_vcf)

    out_path = self.tmpFile() + ".vcf"
    variant_contexts = loaded.toVariantContexts()
    with_filter = variant_contexts.addFilterHeaderLine("BAD", "Bad variant.")
    with_filter.saveAsVcf(out_path)

    expected_header = '##FILTER=<ID=BAD,Description="Bad variant.">'
    self.check_for_line_in_file(out_path, expected_header)
def test_vcf_sort(self):
    """Sort variant contexts and compare the saved VCF with a reference file.

    Genotypes from ``random.vcf`` are converted to variant contexts,
    sorted with ``sort()``, and saved as a single file; the output is
    compared with ``sorted.vcf`` from the ``adam-cli`` module resources.
    """
    source_vcf = self.resourceFile("random.vcf")
    context = ADAMContext(self.ss)

    loaded = context.loadGenotypes(source_vcf)

    out_path = self.tmpFile() + ".vcf"
    sorted_contexts = loaded.toVariantContexts().sort()
    sorted_contexts.saveAsVcf(out_path, asSingleFile=True)

    reference_vcf = self.resourceFile("sorted.vcf", module='adam-cli')
    self.checkFiles(out_path, reference_vcf)
def test_VariantsPerSampleDistributionSampling(self):
    """Check the sampled variants-per-sample total stays near the real total.

    Loads genotypes from ``genodata.v3.test.vcf``, calls
    ``plotDistributions(testMode=True)`` on a
    ``VariantsPerSampleDistribution`` built with ``sample=0.9``, and
    asserts the expected total lies strictly within ``dev`` of the summed
    second returned value.
    """
    context = ADAMContext(self.ss)
    vcf_path = self.resourceFile("genodata.v3.test.vcf")
    loaded = context.loadGenotypes(vcf_path)

    distribution = VariantsPerSampleDistribution(self.ss, loaded, sample=0.9)
    _, counts = distribution.plotDistributions(testMode=True)

    expected_counts = [6, 8, 8, 1, 7, 8]

    # estimated counts should be around real counts
    dev = 8
    expected_total = sum(expected_counts)
    assert sum(counts) - dev < expected_total < sum(counts) + dev
def test_vcf_add_filter(self):
    """Verify that an added FILTER header line is written to the saved VCF.

    Loads ``small.vcf`` as genotypes, converts to variant contexts, adds a
    filter header with ID ``BAD`` and description ``Bad variant.``, saves
    to a temporary ``.vcf`` path, and looks for the header line there.
    """
    input_path = self.resourceFile("small.vcf")
    adam_ctx = ADAMContext(self.ss)

    gts = adam_ctx.loadGenotypes(input_path)

    saved_path = self.tmpFile() + ".vcf"
    vcs = gts.toVariantContexts()
    vcs = vcs.addFilterHeaderLine("BAD", "Bad variant.")
    vcs.saveAsVcf(saved_path)

    self.check_for_line_in_file(
        saved_path,
        '##FILTER=<ID=BAD,Description="Bad variant.">')
def test_vcf_sort_lex(self):
    """Sort variant contexts lexicographically and compare with a reference.

    Genotypes from ``random.vcf`` are converted to variant contexts,
    sorted with ``sortLexicographically()``, and saved as a single file;
    the output is compared with ``sorted.lex.vcf`` from the ``adam-cli``
    module resources.
    """
    source_vcf = self.resourceFile("random.vcf")
    context = ADAMContext(self.ss)

    loaded = context.loadGenotypes(source_vcf)

    out_path = self.tmpFile() + ".vcf"
    lex_sorted = loaded.toVariantContexts().sortLexicographically()
    lex_sorted.saveAsVcf(out_path, asSingleFile=True)

    reference_vcf = self.resourceFile("sorted.lex.vcf", module='adam-cli')
    self.checkFiles(out_path, reference_vcf)
def test_vcf_sort_lex(self):
    """Sort variant contexts lexicographically and compare with a reference.

    Genotypes from ``random.vcf`` (via a context built on ``self.sc``) are
    converted to variant contexts, sorted with ``sortLexicographically()``,
    and saved as a single file; the output is compared with the
    ``sorted.lex.vcf`` resource.
    """
    source_vcf = self.resourceFile("random.vcf")
    context = ADAMContext(self.sc)

    loaded = context.loadGenotypes(source_vcf)

    out_path = self.tmpFile() + ".vcf"
    lex_sorted = loaded.toVariantContexts().sortLexicographically()
    lex_sorted.saveAsVcf(out_path, asSingleFile=True)

    reference_vcf = self.resourceFile("sorted.lex.vcf")
    self.checkFiles(out_path, reference_vcf)
def test_vcf_sort(self):
    """Sort variant contexts and compare the saved VCF with a reference file.

    Genotypes from ``random.vcf`` (via a context built on ``self.sc``) are
    converted with ``toVariantContextRDD()``, sorted with ``sort()``, and
    saved as a single file; the output is compared with the ``sorted.vcf``
    resource.
    """
    source_vcf = self.resourceFile("random.vcf")
    context = ADAMContext(self.sc)

    loaded = context.loadGenotypes(source_vcf)

    out_path = self.tmpFile() + ".vcf"
    sorted_contexts = loaded.toVariantContextRDD().sort()
    sorted_contexts.saveAsVcf(out_path, asSingleFile=True)

    reference_vcf = self.resourceFile("sorted.vcf")
    self.checkFiles(out_path, reference_vcf)
def test_to_variants(self):
    """Convert genotypes to variants with and without de-duplication.

    For ``small.vcf``, ``toVariants()`` is expected to give 18 rows and
    ``toVariants(dedupe=True)`` is expected to give 6 rows.
    """
    source_vcf = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)

    loaded = context.loadGenotypes(source_vcf)

    all_variants = loaded.toVariants()
    self.assertEqual(all_variants.toDF().count(), 18)

    unique_variants = loaded.toVariants(dedupe=True)
    self.assertEqual(unique_variants.toDF().count(), 6)
def test_vcf_add_info_scalar(self):
    """Add a scalar INFO header line and confirm it appears in the saved VCF.

    A scalar INFO header ``SC`` is added with the Python type ``bool``;
    the saved file is expected to contain a ``Number=0,Type=Flag`` INFO
    line for it.
    """
    source_vcf = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)

    loaded = context.loadGenotypes(source_vcf)

    out_path = self.tmpFile() + ".vcf"
    variant_contexts = loaded.toVariantContexts()
    with_header = variant_contexts.addScalarInfoHeaderLine("SC", "Scalar.", bool)
    with_header.saveAsVcf(out_path)

    expected_header = '##INFO=<ID=SC,Number=0,Type=Flag,Description="Scalar.">'
    self.check_for_line_in_file(out_path, expected_header)
def test_to_variants(self):
    """Convert genotypes to variants with and without de-duplication.

    For ``small.vcf``, ``toVariants()`` is expected to give 18 rows and
    ``toVariants(dedupe=True)`` is expected to give 6 rows.
    """
    testFile = self.resourceFile("small.vcf")
    ac = ADAMContext(self.ss)

    genotypes = ac.loadGenotypes(testFile)

    # assertEqual: the assertEquals alias is deprecated and removed in
    # Python 3.12.
    variants = genotypes.toVariants()
    self.assertEqual(variants.toDF().count(), 18)

    variants = genotypes.toVariants(dedupe=True)
    self.assertEqual(variants.toDF().count(), 6)
def test_vcf_add_info_all_array(self):
    """Add an all-allele array INFO header and confirm it is written out.

    An INFO header ``RA`` is added via
    ``addAllAlleleArrayInfoHeaderLine`` with the Python type ``float``; the
    saved file is expected to contain a ``Number=R,Type=Float`` INFO line.
    """
    source_vcf = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)

    loaded = context.loadGenotypes(source_vcf)

    out_path = self.tmpFile() + ".vcf"
    variant_contexts = loaded.toVariantContexts()
    with_header = variant_contexts.addAllAlleleArrayInfoHeaderLine(
        "RA", "Array with # alleles.", float)
    with_header.saveAsVcf(out_path)

    expected_header = '##INFO=<ID=RA,Number=R,Type=Float,Description="Array with # alleles.">'
    self.check_for_line_in_file(out_path, expected_header)
def test_vcf_add_info_scalar(self):
    """Verify that a scalar INFO header line is written to the saved VCF.

    Loads ``small.vcf``, adds a scalar INFO header with ID ``SC`` and type
    ``bool``, saves to a temporary ``.vcf`` path, and looks for the
    matching ``Type=Flag`` header line in the output.
    """
    input_path = self.resourceFile("small.vcf")
    adam_ctx = ADAMContext(self.ss)

    gts = adam_ctx.loadGenotypes(input_path)

    saved_path = self.tmpFile() + ".vcf"
    vcs = gts.toVariantContexts()
    vcs = vcs.addScalarInfoHeaderLine("SC", "Scalar.", bool)
    vcs.saveAsVcf(saved_path)

    self.check_for_line_in_file(
        saved_path,
        '##INFO=<ID=SC,Number=0,Type=Flag,Description="Scalar.">')
def test_vcf_add_format_alts_array(self):
    """Add an alternate-allele array FORMAT header and confirm it is written.

    A FORMAT header ``AA`` is added via
    ``addAlternateAlleleArrayFormatHeaderLine`` with ``chr`` as the type
    argument; the saved file is expected to contain a
    ``Number=A,Type=Character`` FORMAT line.
    """
    source_vcf = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)

    loaded = context.loadGenotypes(source_vcf)

    out_path = self.tmpFile() + ".vcf"
    variant_contexts = loaded.toVariantContexts()
    with_header = variant_contexts.addAlternateAlleleArrayFormatHeaderLine(
        "AA", "Array with # alts.", chr)
    with_header.saveAsVcf(out_path)

    expected_header = '##FORMAT=<ID=AA,Number=A,Type=Character,Description="Array with # alts.">'
    self.check_for_line_in_file(out_path, expected_header)
def test_vcf_add_format_genotype_array(self):
    """Add a genotype array FORMAT header and confirm it is written out.

    A FORMAT header ``GA`` is added via
    ``addGenotypeArrayFormatHeaderLine`` with the Python type ``float``;
    the saved file is expected to contain a ``Number=G,Type=Float`` FORMAT
    line.
    """
    source_vcf = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)

    loaded = context.loadGenotypes(source_vcf)

    out_path = self.tmpFile() + ".vcf"
    variant_contexts = loaded.toVariantContexts()
    with_header = variant_contexts.addGenotypeArrayFormatHeaderLine(
        "GA", "Array with # genotypes.", float)
    with_header.saveAsVcf(out_path)

    expected_header = '##FORMAT=<ID=GA,Number=G,Type=Float,Description="Array with # genotypes.">'
    self.check_for_line_in_file(out_path, expected_header)
def test_vcf_add_format_genotype_array(self):
    """Verify that a genotype array FORMAT header is written to the saved VCF.

    Loads ``small.vcf``, adds a genotype array FORMAT header with ID ``GA``
    and type ``float``, saves to a temporary ``.vcf`` path, and looks for
    the matching header line in the output.
    """
    input_path = self.resourceFile("small.vcf")
    adam_ctx = ADAMContext(self.ss)

    gts = adam_ctx.loadGenotypes(input_path)

    saved_path = self.tmpFile() + ".vcf"
    vcs = gts.toVariantContexts()
    vcs = vcs.addGenotypeArrayFormatHeaderLine(
        "GA", "Array with # genotypes.", float)
    vcs.saveAsVcf(saved_path)

    self.check_for_line_in_file(
        saved_path,
        '##FORMAT=<ID=GA,Number=G,Type=Float,Description="Array with # genotypes.">')
def test_vcf_add_format_alts_array(self):
    """Verify that an alternate-allele array FORMAT header is written out.

    Loads ``small.vcf``, adds an alternate-allele array FORMAT header with
    ID ``AA`` and ``chr`` as the type argument, saves to a temporary
    ``.vcf`` path, and looks for the matching header line in the output.
    """
    input_path = self.resourceFile("small.vcf")
    adam_ctx = ADAMContext(self.ss)

    gts = adam_ctx.loadGenotypes(input_path)

    saved_path = self.tmpFile() + ".vcf"
    vcs = gts.toVariantContexts()
    vcs = vcs.addAlternateAlleleArrayFormatHeaderLine(
        "AA", "Array with # alts.", chr)
    vcs.saveAsVcf(saved_path)

    self.check_for_line_in_file(
        saved_path,
        '##FORMAT=<ID=AA,Number=A,Type=Character,Description="Array with # alts.">')
def test_vcf_add_info_all_array(self):
    """Verify that an all-allele array INFO header is written to the saved VCF.

    Loads ``small.vcf``, adds an all-allele array INFO header with ID
    ``RA`` and type ``float``, saves to a temporary ``.vcf`` path, and
    looks for the matching header line in the output.
    """
    input_path = self.resourceFile("small.vcf")
    adam_ctx = ADAMContext(self.ss)

    gts = adam_ctx.loadGenotypes(input_path)

    saved_path = self.tmpFile() + ".vcf"
    vcs = gts.toVariantContexts()
    vcs = vcs.addAllAlleleArrayInfoHeaderLine(
        "RA", "Array with # alleles.", float)
    vcs.saveAsVcf(saved_path)

    self.check_for_line_in_file(
        saved_path,
        '##INFO=<ID=RA,Number=R,Type=Float,Description="Array with # alleles.">')
def test_vcf_add_format_scalar(self):
    """Add a scalar FORMAT header line and confirm it appears in the saved VCF.

    A scalar FORMAT header ``SC`` is added with the Python type ``str``;
    the saved file is expected to contain a ``Number=1,Type=String``
    FORMAT line for it.
    """
    source_vcf = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)

    loaded = context.loadGenotypes(source_vcf)

    out_path = self.tmpFile() + ".vcf"
    variant_contexts = loaded.toVariantContexts()
    with_header = variant_contexts.addScalarFormatHeaderLine("SC", "Scalar.", str)
    with_header.saveAsVcf(out_path)

    expected_header = '##FORMAT=<ID=SC,Number=1,Type=String,Description="Scalar.">'
    self.check_for_line_in_file(out_path, expected_header)
def test_vcf_add_format_scalar(self):
    """Verify that a scalar FORMAT header line is written to the saved VCF.

    Loads ``small.vcf``, adds a scalar FORMAT header with ID ``SC`` and
    type ``str``, saves to a temporary ``.vcf`` path, and looks for the
    matching ``Type=String`` header line in the output.
    """
    input_path = self.resourceFile("small.vcf")
    adam_ctx = ADAMContext(self.ss)

    gts = adam_ctx.loadGenotypes(input_path)

    saved_path = self.tmpFile() + ".vcf"
    vcs = gts.toVariantContexts()
    vcs = vcs.addScalarFormatHeaderLine("SC", "Scalar.", str)
    vcs.saveAsVcf(saved_path)

    wanted = '##FORMAT=<ID=SC,Number=1,Type=String,Description="Scalar.">'
    self.check_for_line_in_file(saved_path, wanted)
def test_vcf_add_format_array(self):
    """Add a fixed-size array FORMAT header and confirm it is written out.

    A FORMAT header ``FA4`` of size 4 is added via
    ``addFixedArrayFormatHeaderLine`` with the Python type ``int``; the
    saved file is expected to contain a ``Number=4,Type=Integer`` FORMAT
    line.
    """
    source_vcf = self.resourceFile("small.vcf")
    context = ADAMContext(self.ss)

    loaded = context.loadGenotypes(source_vcf)

    out_path = self.tmpFile() + ".vcf"
    variant_contexts = loaded.toVariantContexts()
    with_header = variant_contexts.addFixedArrayFormatHeaderLine(
        "FA4", 4, "Fixed array of 4 elements.", int)
    with_header.saveAsVcf(out_path)

    expected_header = '##FORMAT=<ID=FA4,Number=4,Type=Integer,Description="Fixed array of 4 elements.">'
    self.check_for_line_in_file(out_path, expected_header)
def test_vcf_add_format_array(self):
    """Verify that a fixed-size array FORMAT header is written to the saved VCF.

    Loads ``small.vcf``, adds a fixed array FORMAT header with ID ``FA4``,
    size 4 and type ``int``, saves to a temporary ``.vcf`` path, and looks
    for the matching header line in the output.
    """
    input_path = self.resourceFile("small.vcf")
    adam_ctx = ADAMContext(self.ss)

    gts = adam_ctx.loadGenotypes(input_path)

    saved_path = self.tmpFile() + ".vcf"
    vcs = gts.toVariantContexts()
    vcs = vcs.addFixedArrayFormatHeaderLine(
        "FA4", 4, "Fixed array of 4 elements.", int)
    vcs.saveAsVcf(saved_path)

    wanted = '##FORMAT=<ID=FA4,Number=4,Type=Integer,Description="Fixed array of 4 elements.">'
    self.check_for_line_in_file(saved_path, wanted)
def test_visualize_genotypes(self):
    """Check that ``GenotypeSummary.viewPileup`` returns something for a region.

    Loads genotypes from ``genodata.v3.test.vcf``, builds a
    ``GenotypeSummary`` from them, and requests a pileup view for
    ``chr22:21079600-21079700``. The test only asserts that the returned
    object is not ``None``.
    """
    # load file
    ac = ADAMContext(self.ss)
    testFile = self.resourceFile("genodata.v3.test.vcf")

    # read genotypes
    genotypes = ac.loadGenotypes(testFile)

    gs = GenotypeSummary(self.ss, ac, genotypes)

    contig = "chr22"
    start = 21079600
    end = 21079700

    x = gs.viewPileup(contig, start, end)
    # Identity check: ``!= None`` would dispatch to the returned object's
    # own comparison method instead of testing for the None singleton.
    assert x is not None