Exemple #1
0
    def test_vcf_round_trip(self):
        """Save genotypes as VCF, reload the saved copy, and compare record counts."""
        testFile = self.resourceFile("small.vcf")
        ac = ADAMContext(self.ss)

        genotypes = ac.loadGenotypes(testFile)

        tmpPath = self.tmpFile() + ".vcf"
        # asSingleFile=True mirrors the sort tests in this suite, so tmpPath can
        # be handed straight back to loadGenotypes.
        # NOTE(review): presumably the default writes sharded output; confirm.
        genotypes.toVariantContexts().saveAsVcf(tmpPath, asSingleFile=True)

        # Reload what was just written. Reloading testFile would compare the
        # input with itself and never exercise the saved output.
        savedGenotypes = ac.loadGenotypes(tmpPath)

        # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
        self.assertEqual(genotypes._jvmRdd.jrdd().count(),
                         savedGenotypes._jvmRdd.jrdd().count())
    def test_vcf_round_trip(self):
        """Save genotypes as VCF, reload the saved copy, and compare record counts."""
        testFile = self.resourceFile("small.vcf")
        ac = ADAMContext(self.sc)

        genotypes = ac.loadGenotypes(testFile)

        tmpPath = self.tmpFile() + ".vcf"
        # asSingleFile=True mirrors the sort tests in this suite, so tmpPath can
        # be handed straight back to loadGenotypes.
        # NOTE(review): presumably the default writes sharded output; confirm.
        genotypes.toVariantContexts().saveAsVcf(tmpPath, asSingleFile=True)

        # Reload what was just written. Reloading testFile would compare the
        # input with itself and never exercise the saved output.
        savedGenotypes = ac.loadGenotypes(tmpPath)

        # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
        self.assertEqual(genotypes._jvmRdd.jrdd().count(),
                         savedGenotypes._jvmRdd.jrdd().count())
Exemple #3
0
    def test_transform(self):
        """Filter genotypes to contigName '1' through transform() and check the row count."""
        testFile = self.resourceFile("random.vcf")
        ac = ADAMContext(self.ss)

        genotypes = ac.loadGenotypes(testFile)

        # The callback is applied to whatever transform() passes in; it must
        # support .filter and the contigName column accessor.
        transformedGenotypes = genotypes.transform(lambda x: x.filter(x.contigName == '1'))

        # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
        self.assertEqual(transformedGenotypes.toDF().count(), 9)
Exemple #4
0
    def test_VariantsPerSampleDistribution(self):
        """Check that the plotted per-sample counts sum to the known total."""
        adam_ctx = ADAMContext(self.ss)
        vcf_path = self.resourceFile("genodata.v3.test.vcf")
        loaded = adam_ctx.loadGenotypes(vcf_path)

        distribution = VariantsPerSampleDistribution(self.ss, loaded)
        # Only the second element of the returned pair is inspected.
        _plot, counts = distribution.plotDistributions(testMode=True)

        expected_counts = [6, 8, 8, 1, 7, 8]
        assert sum(counts) == sum(expected_counts)
Exemple #5
0
    def test_load_genotypes(self):
        """Load small.vcf and expect 18 records from both the DataFrame and the JVM RDD."""
        vcf_path = self.resourceFile("small.vcf")
        loaded = ADAMContext(self.sc).loadGenotypes(vcf_path)

        frame_rows = loaded.toDF().count()
        self.assertEqual(frame_rows, 18)

        rdd_rows = loaded._jvmRdd.jrdd().count()
        self.assertEqual(rdd_rows, 18)
    def test_transform(self):
        """Filter genotypes to contigName '1' through transform() and check the row count."""
        testFile = self.resourceFile("random.vcf")
        ac = ADAMContext(self.sc)

        genotypes = ac.loadGenotypes(testFile)

        # The callback is applied to whatever transform() passes in; it must
        # support .filter and the contigName column accessor.
        transformedGenotypes = genotypes.transform(lambda x: x.filter(x.contigName == '1'))

        # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
        self.assertEqual(transformedGenotypes.toDF().count(), 9)
    def test_load_genotypes(self):
        """Load small.vcf and expect 18 records from both the DataFrame and the JVM RDD."""
        vcf_path = self.resourceFile("small.vcf")
        adam_ctx = ADAMContext(self.ss)
        loaded = adam_ctx.loadGenotypes(vcf_path)

        # Count through the DataFrame view first, then through the JVM RDD.
        expected_rows = 18
        self.assertEqual(loaded.toDF().count(), expected_rows)
        self.assertEqual(loaded._jvmRdd.jrdd().count(), expected_rows)
Exemple #8
0
    def test_HetHomRatioDistribution(self):
        """Compare HetHomRatioDistribution plot data, sorted and rounded, to fixed values."""
        adam_ctx = ADAMContext(self.ss)
        vcf_path = self.resourceFile("genodata.v3.test.vcf")
        loaded = adam_ctx.loadGenotypes(vcf_path)

        distribution = HetHomRatioDistribution(self.ss, loaded, sample=1.0)
        _plot, ratios = distribution.plot(testMode=True)

        wanted = [5.0, 0.6, 0.14, 0.17, 1.67]
        wanted.sort()

        # Sort first, then round to two decimals, before comparing.
        rounded = [round(value, 2) for value in sorted(ratios)]
        assert wanted == rounded
Exemple #9
0
    def test_GenotypeCallRatesDistribution(self):
        """Compare GenotypeCallRatesDistribution plot data, sorted and rounded, to fixed values."""
        adam_ctx = ADAMContext(self.ss)
        vcf_path = self.resourceFile("genodata.v3.test.vcf")
        loaded = adam_ctx.loadGenotypes(vcf_path)

        distribution = GenotypeCallRatesDistribution(self.ss, loaded, sample=1.0)
        _plot, rates = distribution.plot(testMode=True)

        wanted = [0.95, 0.88, 0.89, 0.94, 0.93, 0.90]
        wanted.sort()

        # Sort first, then round to two decimals, before comparing.
        rounded = [round(value, 2) for value in sorted(rates)]
        assert wanted == rounded
    def test_vcf_add_filter(self):
        """Add a FILTER header line and verify it shows up in the saved VCF."""
        source_vcf = self.resourceFile("small.vcf")
        loaded = ADAMContext(self.ss).loadGenotypes(source_vcf)

        out_path = self.tmpFile() + ".vcf"
        contexts = loaded.toVariantContexts()
        with_filter = contexts.addFilterHeaderLine("BAD", "Bad variant.")
        with_filter.saveAsVcf(out_path)

        expected_line = '##FILTER=<ID=BAD,Description="Bad variant.">'
        self.check_for_line_in_file(out_path, expected_line)
    def test_vcf_sort(self):
        """Sort variant contexts, save with asSingleFile=True, and compare to sorted.vcf."""
        unsorted_vcf = self.resourceFile("random.vcf")
        loaded = ADAMContext(self.ss).loadGenotypes(unsorted_vcf)

        out_path = self.tmpFile() + ".vcf"
        ordered = loaded.toVariantContexts().sort()
        ordered.saveAsVcf(out_path, asSingleFile=True)

        # The reference file is looked up in the 'adam-cli' module resources.
        reference_vcf = self.resourceFile("sorted.vcf", module='adam-cli')
        self.checkFiles(out_path, reference_vcf)
Exemple #12
0
    def test_VariantsPerSampleDistributionSampling(self):
        """With sample=0.9, the summed counts must fall within 8 of the known total."""
        adam_ctx = ADAMContext(self.ss)
        vcf_path = self.resourceFile("genodata.v3.test.vcf")
        loaded = adam_ctx.loadGenotypes(vcf_path)

        distribution = VariantsPerSampleDistribution(self.ss, loaded, sample=0.9)
        _plot, counts = distribution.plotDistributions(testMode=True)

        wanted_total = sum([6, 8, 8, 1, 7, 8])
        observed_total = sum(counts)

        # Estimated counts should be close to the real counts; the bound is
        # exclusive on both sides.
        tolerance = 8
        assert observed_total - tolerance < wanted_total < observed_total + tolerance
Exemple #13
0
    def test_vcf_add_filter(self):
        """Write a VCF with an extra FILTER header line and look for that line in the file."""
        input_path = self.resourceFile("small.vcf")
        adam_ctx = ADAMContext(self.ss)
        loaded = adam_ctx.loadGenotypes(input_path)

        saved_path = self.tmpFile() + ".vcf"
        annotated = loaded.toVariantContexts().addFilterHeaderLine("BAD", "Bad variant.")
        annotated.saveAsVcf(saved_path)

        header = '##FILTER=<ID=BAD,Description="Bad variant.">'
        self.check_for_line_in_file(saved_path, header)
Exemple #14
0
    def test_vcf_sort_lex(self):
        """Sort lexicographically, save with asSingleFile=True, and compare to sorted.lex.vcf."""
        unsorted_vcf = self.resourceFile("random.vcf")
        loaded = ADAMContext(self.ss).loadGenotypes(unsorted_vcf)

        out_path = self.tmpFile() + ".vcf"
        ordered = loaded.toVariantContexts().sortLexicographically()
        ordered.saveAsVcf(out_path, asSingleFile=True)

        # The reference file is looked up in the 'adam-cli' module resources.
        reference_vcf = self.resourceFile("sorted.lex.vcf", module='adam-cli')
        self.checkFiles(out_path, reference_vcf)
    def test_vcf_sort_lex(self):
        """Sort lexicographically, save with asSingleFile=True, and compare to sorted.lex.vcf."""
        unsorted_vcf = self.resourceFile("random.vcf")
        adam_ctx = ADAMContext(self.sc)
        loaded = adam_ctx.loadGenotypes(unsorted_vcf)

        out_path = self.tmpFile() + ".vcf"
        contexts = loaded.toVariantContexts()
        contexts.sortLexicographically().saveAsVcf(out_path, asSingleFile=True)

        reference_vcf = self.resourceFile("sorted.lex.vcf")
        self.checkFiles(out_path, reference_vcf)
Exemple #16
0
    def test_vcf_sort(self):
        """Sort via toVariantContextRDD(), save with asSingleFile=True, compare to sorted.vcf."""
        unsorted_vcf = self.resourceFile("random.vcf")
        adam_ctx = ADAMContext(self.sc)
        loaded = adam_ctx.loadGenotypes(unsorted_vcf)

        out_path = self.tmpFile() + ".vcf"
        ordered = loaded.toVariantContextRDD().sort()
        ordered.saveAsVcf(out_path, asSingleFile=True)

        reference_vcf = self.resourceFile("sorted.vcf")
        self.checkFiles(out_path, reference_vcf)
Exemple #17
0
    def test_to_variants(self):
        """toVariants() gives 18 rows for small.vcf; with dedupe=True it gives 6."""
        vcf_path = self.resourceFile("small.vcf")
        loaded = ADAMContext(self.ss).loadGenotypes(vcf_path)

        all_rows = loaded.toVariants().toDF().count()
        self.assertEqual(all_rows, 18)

        deduped_rows = loaded.toVariants(dedupe=True).toDF().count()
        self.assertEqual(deduped_rows, 6)
Exemple #18
0
    def test_vcf_add_info_scalar(self):
        """Add a scalar INFO header line typed as bool and look for it in the saved VCF."""
        source_vcf = self.resourceFile("small.vcf")
        loaded = ADAMContext(self.ss).loadGenotypes(source_vcf)

        out_path = self.tmpFile() + ".vcf"
        contexts = loaded.toVariantContexts()
        with_header = contexts.addScalarInfoHeaderLine("SC", "Scalar.", bool)
        with_header.saveAsVcf(out_path)

        # The test expects the bool type to be written as Number=0,Type=Flag.
        expected_line = '##INFO=<ID=SC,Number=0,Type=Flag,Description="Scalar.">'
        self.check_for_line_in_file(out_path, expected_line)
Exemple #19
0
    def test_to_variants(self):
        """toVariants() gives 18 rows for small.vcf; with dedupe=True it gives 6."""
        testFile = self.resourceFile("small.vcf")
        ac = ADAMContext(self.ss)

        genotypes = ac.loadGenotypes(testFile)

        variants = genotypes.toVariants()

        # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
        self.assertEqual(variants.toDF().count(), 18)

        variants = genotypes.toVariants(dedupe=True)

        self.assertEqual(variants.toDF().count(), 6)
Exemple #20
0
    def test_vcf_add_info_all_array(self):
        """Add an all-allele array INFO header line (float) and look for it in the saved VCF."""
        source_vcf = self.resourceFile("small.vcf")
        loaded = ADAMContext(self.ss).loadGenotypes(source_vcf)

        out_path = self.tmpFile() + ".vcf"
        contexts = loaded.toVariantContexts()
        with_header = contexts.addAllAlleleArrayInfoHeaderLine(
            "RA", "Array with # alleles.", float)
        with_header.saveAsVcf(out_path)

        # The test expects Number=R and Type=Float in the written header.
        expected_line = '##INFO=<ID=RA,Number=R,Type=Float,Description="Array with # alleles.">'
        self.check_for_line_in_file(out_path, expected_line)
Exemple #21
0
    def test_vcf_add_info_scalar(self):
        """Write a VCF with a bool scalar INFO header line and check the file contains it."""
        input_path = self.resourceFile("small.vcf")
        adam_ctx = ADAMContext(self.ss)
        loaded = adam_ctx.loadGenotypes(input_path)

        saved_path = self.tmpFile() + ".vcf"
        annotated = loaded.toVariantContexts().addScalarInfoHeaderLine(
            "SC", "Scalar.", bool)
        annotated.saveAsVcf(saved_path)

        header = '##INFO=<ID=SC,Number=0,Type=Flag,Description="Scalar.">'
        self.check_for_line_in_file(saved_path, header)
Exemple #22
0
    def test_vcf_add_format_alts_array(self):
        """Add an alternate-allele array FORMAT header line (chr) and look for it in the VCF."""
        source_vcf = self.resourceFile("small.vcf")
        loaded = ADAMContext(self.ss).loadGenotypes(source_vcf)

        out_path = self.tmpFile() + ".vcf"
        contexts = loaded.toVariantContexts()
        with_header = contexts.addAlternateAlleleArrayFormatHeaderLine(
            "AA", "Array with # alts.", chr)
        with_header.saveAsVcf(out_path)

        # The test expects Number=A and Type=Character in the written header.
        expected_line = '##FORMAT=<ID=AA,Number=A,Type=Character,Description="Array with # alts.">'
        self.check_for_line_in_file(out_path, expected_line)
Exemple #23
0
    def test_vcf_add_format_genotype_array(self):
        """Add a genotype array FORMAT header line (float) and look for it in the saved VCF."""
        source_vcf = self.resourceFile("small.vcf")
        loaded = ADAMContext(self.ss).loadGenotypes(source_vcf)

        out_path = self.tmpFile() + ".vcf"
        contexts = loaded.toVariantContexts()
        with_header = contexts.addGenotypeArrayFormatHeaderLine(
            "GA", "Array with # genotypes.", float)
        with_header.saveAsVcf(out_path)

        # The test expects Number=G and Type=Float in the written header.
        expected_line = '##FORMAT=<ID=GA,Number=G,Type=Float,Description="Array with # genotypes.">'
        self.check_for_line_in_file(out_path, expected_line)
    def test_vcf_add_format_genotype_array(self):
        """Write a VCF with a float genotype-array FORMAT header and check the file has it."""
        input_path = self.resourceFile("small.vcf")
        adam_ctx = ADAMContext(self.ss)
        loaded = adam_ctx.loadGenotypes(input_path)

        saved_path = self.tmpFile() + ".vcf"
        annotated = loaded.toVariantContexts().addGenotypeArrayFormatHeaderLine(
            "GA", "Array with # genotypes.", float)
        annotated.saveAsVcf(saved_path)

        header = '##FORMAT=<ID=GA,Number=G,Type=Float,Description="Array with # genotypes.">'
        self.check_for_line_in_file(saved_path, header)
    def test_vcf_add_format_alts_array(self):
        """Write a VCF with a chr alternate-allele FORMAT header and check the file has it."""
        input_path = self.resourceFile("small.vcf")
        adam_ctx = ADAMContext(self.ss)
        loaded = adam_ctx.loadGenotypes(input_path)

        saved_path = self.tmpFile() + ".vcf"
        annotated = loaded.toVariantContexts().addAlternateAlleleArrayFormatHeaderLine(
            "AA", "Array with # alts.", chr)
        annotated.saveAsVcf(saved_path)

        header = '##FORMAT=<ID=AA,Number=A,Type=Character,Description="Array with # alts.">'
        self.check_for_line_in_file(saved_path, header)
    def test_vcf_add_info_all_array(self):
        """Write a VCF with a float all-allele INFO header and check the file has it."""
        input_path = self.resourceFile("small.vcf")
        adam_ctx = ADAMContext(self.ss)
        loaded = adam_ctx.loadGenotypes(input_path)

        saved_path = self.tmpFile() + ".vcf"
        annotated = loaded.toVariantContexts().addAllAlleleArrayInfoHeaderLine(
            "RA", "Array with # alleles.", float)
        annotated.saveAsVcf(saved_path)

        header = '##INFO=<ID=RA,Number=R,Type=Float,Description="Array with # alleles.">'
        self.check_for_line_in_file(saved_path, header)
Exemple #27
0
    def test_vcf_add_format_scalar(self):
        """Add a scalar FORMAT header line typed as str and look for it in the saved VCF."""
        source_vcf = self.resourceFile("small.vcf")
        loaded = ADAMContext(self.ss).loadGenotypes(source_vcf)

        out_path = self.tmpFile() + ".vcf"
        contexts = loaded.toVariantContexts()
        with_header = contexts.addScalarFormatHeaderLine("SC", "Scalar.", str)
        with_header.saveAsVcf(out_path)

        # The test expects the str type to be written as Number=1,Type=String.
        expected_line = '##FORMAT=<ID=SC,Number=1,Type=String,Description="Scalar.">'
        self.check_for_line_in_file(out_path, expected_line)
Exemple #28
0
    def test_vcf_add_format_scalar(self):
        """Write a VCF with a str scalar FORMAT header line and check the file contains it."""
        input_path = self.resourceFile("small.vcf")
        adam_ctx = ADAMContext(self.ss)
        loaded = adam_ctx.loadGenotypes(input_path)

        saved_path = self.tmpFile() + ".vcf"
        contexts = loaded.toVariantContexts()
        annotated = contexts.addScalarFormatHeaderLine("SC", "Scalar.", str)
        annotated.saveAsVcf(saved_path)

        header = '##FORMAT=<ID=SC,Number=1,Type=String,Description="Scalar.">'
        self.check_for_line_in_file(saved_path, header)
Exemple #29
0
    def test_vcf_add_format_array(self):
        """Add a fixed-size (4) int array FORMAT header line and look for it in the saved VCF."""
        source_vcf = self.resourceFile("small.vcf")
        loaded = ADAMContext(self.ss).loadGenotypes(source_vcf)

        out_path = self.tmpFile() + ".vcf"
        contexts = loaded.toVariantContexts()
        with_header = contexts.addFixedArrayFormatHeaderLine(
            "FA4", 4, "Fixed array of 4 elements.", int)
        with_header.saveAsVcf(out_path)

        # The test expects Number=4 and Type=Integer in the written header.
        expected_line = (
            '##FORMAT=<ID=FA4,Number=4,Type=Integer,Description="Fixed array of 4 elements.">'
        )
        self.check_for_line_in_file(out_path, expected_line)
Exemple #30
0
    def test_vcf_add_format_array(self):
        """Write a VCF with a 4-element int array FORMAT header and check the file has it."""
        input_path = self.resourceFile("small.vcf")
        adam_ctx = ADAMContext(self.ss)
        loaded = adam_ctx.loadGenotypes(input_path)

        saved_path = self.tmpFile() + ".vcf"
        contexts = loaded.toVariantContexts()
        annotated = contexts.addFixedArrayFormatHeaderLine(
            "FA4", 4, "Fixed array of 4 elements.", int)
        annotated.saveAsVcf(saved_path)

        header = '##FORMAT=<ID=FA4,Number=4,Type=Integer,Description="Fixed array of 4 elements.">'
        self.check_for_line_in_file(saved_path, header)
Exemple #31
0
    def test_visualize_genotypes(self):
        """Build a GenotypeSummary and check viewPileup returns a value for a chr22 window."""
        # load file
        ac = ADAMContext(self.ss)
        testFile = self.resourceFile("genodata.v3.test.vcf")

        # read genotypes
        genotypes = ac.loadGenotypes(testFile)

        gs = GenotypeSummary(self.ss, ac, genotypes)

        contig = "chr22"
        start = 21079600
        end = 21079700

        x = gs.viewPileup(contig, start, end)
        # Identity check: `!= None` would defer to the result's own __ne__,
        # which need not return a plain bool.
        assert x is not None