Esempio n. 1
0
    def test_vars_to_bed(self):
        """
        Six variants written with window=500 should come back as four BED regions.
        The expected values below show each region extending 500bp either side of its
        variant(s), with the 5000/5050 pair and the 2000/2010 pair each sharing one region.
        """
        variants = [
            MockVariant(chrom="1", start=1000, ref="A", alleles=("G",), samples=None),
            MockVariant(chrom="1", start=5000, ref="A", alleles=("G",), samples=None),
            MockVariant(chrom="1", start=5050, ref="T", alleles=("G",), samples=None),
            MockVariant(chrom="2", start=2000, ref="T", alleles=("G",), samples=None),
            MockVariant(chrom="2", start=2010, ref="T", alleles=("G",), samples=None),
            MockVariant(chrom="2", start=3000, ref="T", alleles=("G",), samples=None),
        ]
        expected = [
            ("1", 500, 1500),
            ("1", 4500, 5550),
            ("2", 1500, 2510),
            ("2", 2500, 3500),
        ]

        bed = util.vars_to_bed(variants, window=500)
        try:
            observed = [(r.chrom, r.start, r.stop) for r in util.read_regions(bed)]
            # A single list comparison checks the region count and every field, and
            # reports the differing element when it fails
            self.assertEqual(observed, expected)
        finally:
            # Remove the generated BED file even when an assertion fails
            os.remove(bed)
Esempio n. 2
0
 def collect_bam_stats(self, bam, bed, orig_vcf):
     """
     Gather bam-level statistics for every region listed in the bed file.
     :param bam: Alignment handed to bam_simulation.gen_bam_stats for each region
     :param bed: BED file whose regions are visited in turn
     :param orig_vcf: Original (input) variants, used to look up the variant matching each region
     :return: defaultdict mapping var_key of the matching original variant -> stats for that region
     """
     stats_by_var = defaultdict(dict)
     for bed_region in util.read_regions(bed):
         matched_input = util.find_matching_var(orig_vcf, bed_region)
         region_key = var_key(matched_input)
         stats_by_var[region_key] = bam_simulation.gen_bam_stats(bam, bed_region)
     return stats_by_var
Esempio n. 3
0
 def collect_var_quals(self, caller_vars, bed, orig_vcf):
     """
     Look up, per bed region and per caller, the quality of the called variant matching that region.
     The quality itself comes from find_qual, which is also given the result when nothing matched
     (the original notes say missing / ref calls end up with MISSING_QUAL).
     :param caller_vars: Dict of caller name -> VCF of that caller's calls
     :param bed: BED file whose regions are visited in turn
     :param orig_vcf: Original (input) variants, used to derive the key for each region
     :return: defaultdict laid out as [variant key][caller] -> quality
     """
     quals_by_var = defaultdict(dict)
     for bed_region in util.read_regions(bed):
         matched_input = util.find_matching_var(orig_vcf, bed_region)
         region_key = var_key(matched_input)
         for caller_name, caller_vcf in caller_vars.items():
             # The caller's VCF is reopened for every region, exactly as before
             with pysam.VariantFile(caller_vcf) as called:
                 called_match = util.find_matching_var(called, bed_region)
                 quals_by_var[region_key][caller_name] = find_qual(called_match)
     return quals_by_var
Esempio n. 4
0
    def test_read_bedfile(self):
        """
        read_regions on the bundled test BED file should yield exactly three regions
        with the chrom / start / stop values listed below, in this order.
        """
        bed_path = os.path.join(TestUtils.DATA_PATH, TestUtils.TEST_BED)
        expected = [
            ("1", 10, 20),
            ("1", 100, 200),
            ("10", 55, 77),
        ]
        observed = [(r.chrom, r.start, r.stop) for r in util.read_regions(bed_path)]
        # A single list comparison checks the region count and every field, and
        # reports the differing element when it fails
        self.assertEqual(observed, expected)
Esempio n. 5
0
    def compare_test_vcf(self, raw_orig_vcf, raw_test_vcf):
        """
        Compare the calls in a test VCF against the variants in an original VCF, one bed region
        per original variant set, using every configured normalizer / comparator combination.
        The results are passed to self.reporter. Nothing is simulated or aligned here, so the
        bam stats handed to the reporter are an empty dict for each variant key.
        :param raw_orig_vcf: Path to the VCF containing the original (input) variants
        :param raw_test_vcf: Path to the VCF containing the calls to evaluate
        :return: None
        :raises ValueError: If no original variant can be found for one of the bed regions
        """
        raw_orig_vcf = os.path.abspath(raw_orig_vcf)
        raw_test_vcf = os.path.abspath(raw_test_vcf)
        # Read all records up front and close the file handle deterministically
        with pysam.VariantFile(raw_orig_vcf) as orig_reader:
            orig_vars = list(orig_reader)
        tmp_dirname = util.strip_extensions(raw_test_vcf, ['gz','vcf']) + "-vcomp-" + util.randstr()

        with util.TempDir(dirname=tmp_dirname):
            orig_vcf = util.bgz_tabix(raw_orig_vcf, self.conf)
            test_vcf = util.remove_halfcalls(raw_test_vcf)
            test_vcf = util.bgz_tabix(test_vcf, self.conf)
            caller_name = util.strip_extensions(test_vcf, ['gz','vcf'])
            bed = util.vars_to_bed(orig_vars)
            var_results = defaultdict(dict)
            var_quals = self.collect_var_quals({caller_name: test_vcf}, bed, orig_vcf)
            bamstats = defaultdict(dict)

            # .items() rather than .iteritems() so this runs on both Python 2 and Python 3
            for normalizer_name, normalizer in self.normalizers.items():
                logging.info("--> Running normalizer " + normalizer_name)
                normed_orig_vcf   = normalizer(orig_vcf, self.conf)
                normed_caller_vcf = normalizer(test_vcf, self.conf)

                for comparator_name, comparator in self.comparators.items():
                    logging.info("--> Running comparator " + comparator_name + " (normalizer " + normalizer_name + ")")
                    all_results = comparator(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                    # split_results returns one entry per bed region, in bed order, so it can be
                    # zipped against a fresh pass over the same bed file
                    single_results = split_results(all_results, bed)
                    for region, result in zip(util.read_regions(bed), single_results):
                        match_vars = util.find_matching_var(orig_vcf, region)
                        if not match_vars:
                            raise ValueError('Unable to find original variant from region ' + str(region))
                        result = compare_single_var(result,
                                                    region,
                                                    normed_orig_vcf,
                                                    normed_caller_vcf,
                                                    comparator,
                                                    "/".join(str(i) for i in match_vars[0].samples[0]['GT']),
                                                    self.conf)
                        key = var_key(match_vars)
                        if caller_name not in var_results[key]:
                            var_results[key][caller_name] = defaultdict(dict)
                        var_results[key][caller_name][normalizer_name][comparator_name] = result
                        # No bam exists in this mode; record an empty stats entry per variant
                        bamstats[key] = {}

        # Iterate over all results and write to standard output. We do this here instead of within the loops above
        # because it keeps results organized by variant, which makes them easier to look at
        self.reporter.write_output(var_results, var_quals, bamstats)
Esempio n. 6
0
def _index_by_start(items, locus):
    """
    Bucket result entries by chromosome and order each bucket by start position.
    :param items: Sequence of result entries
    :param locus: Function returning, for one entry, the object carrying its .chrom and .start
    :return: Dict of chrom -> (sorted starts, entries); each entry is (start, original position, item)
    """
    buckets = {}
    for position, item in enumerate(items):
        var = locus(item)
        buckets.setdefault(var.chrom, []).append((var.start, position, item))

    index = {}
    for chrom, entries in buckets.items():
        # Sort on (start, position) only, so the result items themselves are never compared
        entries.sort(key=lambda entry: entry[:2])
        index[chrom] = ([entry[0] for entry in entries], entries)
    return index


def _items_in_region(index, region):
    """
    Return the indexed items on region.chrom with region.start <= start < region.stop,
    in the order they had in the sequence the index was built from.
    """
    import bisect  # function-scope import keeps this block self-contained

    starts, entries = index.get(region.chrom, ((), ()))
    first = bisect.bisect_left(starts, region.start)
    last = bisect.bisect_left(starts, region.stop)
    # The slice is ordered by start; re-sort by original position to keep the comparator's order
    hits = sorted(entries[first:last], key=lambda entry: entry[1])
    return [entry[2] for entry in hits]


def split_results(allresults, bed):
    """
    allresults is the result of a call to a comparator, so it's a
    tuple of (unmatched_orig (FN), matches, unmatched_caller (FP)). This function
    breaks the allresults into a list with separate
    entries for each region in the bed file.

    An entry belongs to a region when it is on the same chromosome and its start lies in
    [region.start, region.stop); a match is located by its first element. Each result list
    is indexed once by chromosome and start, so every region costs a binary search rather
    than a scan over all results. This assumes chromosomes are hashable and that starts
    are mutually comparable with the region bounds (e.g. str chroms and int positions).
    :param allresults: Tuple containing results from a single comparator call
    :param bed: BED file to split regions by
    :return: List of tuples containing same data as allresults, but organized by bed region
    """
    fn_index = _index_by_start(allresults[0], lambda var: var)
    match_index = _index_by_start(allresults[1], lambda match: match[0])
    fp_index = _index_by_start(allresults[2], lambda var: var)

    return [
        (
            _items_in_region(fn_index, region),
            _items_in_region(match_index, region),
            _items_in_region(fp_index, region),
        )
        for region in util.read_regions(bed)
    ]
Esempio n. 7
0
    def process_batch(self, vcf, batchname, gt_policy, ex_snp=None, keep_tmpdir=False, read_depth=250, reads=None):
        """
        Process the given batch of variants by creating a fake 'genome' with the variants, simulating reads from it,
         aligning the reads to make a bam file, then using different callers, variant normalizers, and variant
         comparison methods to generate results. The results are handed to self.reporter, whose output needs to
         be parsed by a separate utility to generate anything readable.
        Paths to the reference genome and other settings are taken from self.conf.
        :param vcf: .vcf file containing variants to simulate
        :param batchname: Prefix for the name of the temporary working directory (a random suffix is appended)
        :param gt_policy: Passed through to create_input_vcf; presumably selects the genotypes to simulate (TODO confirm)
        :param ex_snp: Passed through to create_input_vcf; presumably an extra SNP to add near each variant (TODO confirm)
        :param keep_tmpdir: If True the temporary directory is never deleted, otherwise the
                            TempDir.DELETE_NO_EXCEPTION policy applies
        :param read_depth: Passed to bam_simulation.gen_alt_fq when reads are simulated
        :param reads: Reads to align; when None they are generated from the variants with gen_alt_fq
        :return: None
        :raises ValueError: If no original variant can be found for one of the bed regions
        """
        # Read all records up front and close the file handle deterministically
        with pysam.VariantFile(vcf) as vcf_reader:
            raw_vars = list(vcf_reader)

        tmpdir_del_policy = util.TempDir.DELETE_NO_EXCEPTION
        if keep_tmpdir:
            tmpdir_del_policy = util.TempDir.NEVER_DELETE

        tmp_dirname = batchname + "-" + util.randstr()
        with util.TempDir(dirname=tmp_dirname, del_policy=tmpdir_del_policy):
            ref_path = self.conf.get('main', 'ref_genome')
            var_results = defaultdict(dict)

            orig_vcf, variant_sets = self.create_input_vcf(raw_vars, ex_snp, gt_policy)
            bed = util.vars_to_bed(variant_sets)
            if reads is None:
                reads = bam_simulation.gen_alt_fq(ref_path, variant_sets, read_depth)
            bam = bam_simulation.gen_alt_bam(ref_path, self.conf, reads)

            caller_variants = self.call_variants(bam, bed)
            bam_stats = self.collect_bam_stats(bam, bed, orig_vcf)
            var_quals = self.collect_var_quals(caller_variants, bed, orig_vcf)

            # .items() rather than .iteritems() so this runs on both Python 2 and Python 3
            for normalizer_name, normalizer in self.normalizers.items():
                logging.info("--> Running normalizer " + normalizer_name)
                normed_orig_vcf = normalizer(orig_vcf, self.conf)

                for caller in caller_variants:
                    normed_caller_vcf = normalizer(caller_variants[caller], self.conf)

                    for comparator_name, comparator in self.comparators.items():
                        logging.info("--> Running comparator " + comparator_name + " (normalizer " + normalizer_name + ")")
                        all_results = comparator(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                        # split_results returns one entry per bed region, in bed order, so it can be
                        # zipped against a fresh pass over the same bed file
                        single_results = split_results(all_results, bed)
                        for region, result in zip(util.read_regions(bed), single_results):
                            match_vars = util.find_matching_var(orig_vcf, region)
                            if not match_vars:
                                raise ValueError('Unable to find original variant from region ' + str(region))
                            result = compare_single_var(result,
                                                        region,
                                                        normed_orig_vcf,
                                                        normed_caller_vcf,
                                                        comparator,
                                                        "/".join(str(i) for i in match_vars[0].samples[0]['GT']),
                                                        self.conf)
                            key = var_key(match_vars)
                            if caller not in var_results[key]:
                                var_results[key][caller] = defaultdict(dict)
                            var_results[key][caller][normalizer_name][comparator_name] = result
            #Iterate over all results and write to standard output. We do this here instead of within the loops above
            #because it keeps results organized by variant, which makes them easier to look at
            self.reporter.write_output(var_results, var_quals, bam_stats)