Example #1
0
    def test_vars_to_bed(self):
        """
        Check the regions produced by util.vars_to_bed for a small set of mock variants.

        Six mock variants on chromosomes "1" and "2" are written to a BED file with
        window=500, and the file is read back with util.read_regions. The test
        expects exactly four regions, in this order:
          1:500-1500, 1:4500-5550, 2:1500-2510, 2:2500-3500
        The BED file is removed afterwards, even when an assertion fails.
        """
        variants = [
            MockVariant(chrom="1", start=1000, ref="A", alleles=("G",), samples=None),
            MockVariant(chrom="1", start=5000, ref="A", alleles=("G",), samples=None),
            MockVariant(chrom="1", start=5050, ref="T", alleles=("G",), samples=None),
            MockVariant(chrom="2", start=2000, ref="T", alleles=("G",), samples=None),
            MockVariant(chrom="2", start=2010, ref="T", alleles=("G",), samples=None),
            MockVariant(chrom="2", start=3000, ref="T", alleles=("G",), samples=None),
        ]
        # (chrom, start, stop) for each region, in the order it should be read back.
        expected = [
            ("1", 500, 1500),
            ("1", 4500, 5550),
            ("2", 1500, 2510),
            ("2", 2500, 3500),
        ]

        bed = util.vars_to_bed(variants, window=500)
        try:
            regions = list(util.read_regions(bed))
            # assertEqual reports both values on failure, unlike assertTrue(a == b).
            self.assertEqual(len(regions), len(expected))
            for region, (chrom, start, stop) in zip(regions, expected):
                self.assertEqual(region.chrom, chrom)
                self.assertEqual(region.start, start)
                self.assertEqual(region.stop, stop)
        finally:
            # Clean up the generated file whether or not the assertions passed.
            os.remove(bed)
    def compare_test_vcf(self, raw_orig_vcf, raw_test_vcf):
        """
        Compare an existing 'test' VCF against an 'original' VCF using every configured
         normalizer and comparator, then report the per-variant results.
        No reads are simulated and no variant callers are run here: the test VCF is used as
         the single set of calls, and an empty dict is reported as the BAM stats for each variant.
        :param raw_orig_vcf: Path to the VCF holding the original (expected) variants
        :param raw_test_vcf: Path to the VCF to evaluate against the original variants
        :return: None. Results are passed to self.reporter.write_output
        :raises ValueError: If no original variant can be found for one of the BED regions
        """
        # Resolve the paths before entering the temp dir context below
        # (presumably util.TempDir changes the working directory -- confirm).
        raw_orig_vcf = os.path.abspath(raw_orig_vcf)
        raw_test_vcf = os.path.abspath(raw_test_vcf)
        orig_vars    = list(pysam.VariantFile(raw_orig_vcf))
        tmp_dirname  = util.strip_extensions(raw_test_vcf, ['gz','vcf']) + "-vcomp-" + util.randstr()

        with util.TempDir(dirname=tmp_dirname):
            orig_vcf = util.bgz_tabix(raw_orig_vcf, self.conf)
            test_vcf = util.remove_halfcalls(raw_test_vcf)
            test_vcf = util.bgz_tabix(test_vcf, self.conf)
            # The test VCF's name (minus extensions) stands in for a caller name in the results
            caller_name = util.strip_extensions(test_vcf, ['gz','vcf'])
            bed = util.vars_to_bed(orig_vars)
            var_results = defaultdict(dict)
            var_quals = self.collect_var_quals({caller_name: test_vcf}, bed, orig_vcf)
            bamstats = defaultdict(dict)

            # .items() instead of .iteritems(): the latter does not exist on Python 3 dicts
            for normalizer_name, normalizer in self.normalizers.items():
                logging.info("--> Running normalizer %s", normalizer_name)
                normed_orig_vcf   = normalizer(orig_vcf, self.conf)
                normed_caller_vcf = normalizer(test_vcf, self.conf)

                for comparator_name, comparator in self.comparators.items():
                    logging.info("--> Running comparator %s (normalizer %s)", comparator_name, normalizer_name)
                    all_results = comparator(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                    single_results = split_results(all_results, bed)
                    # Regions and split results are paired positionally
                    for region, result in zip(util.read_regions(bed), single_results):
                        match_vars = util.find_matching_var(orig_vcf, region)
                        if not match_vars:
                            raise ValueError('Unable to find original variant from region ' + str(region))
                        # Genotype of the first sample of the first matching variant, e.g. "0/1"
                        genotype = "/".join(str(i) for i in match_vars[0].samples[0]['GT'])
                        result = compare_single_var(result,
                                                    region,
                                                    normed_orig_vcf,
                                                    normed_caller_vcf,
                                                    comparator,
                                                    genotype,
                                                    self.conf)
                        key = var_key(match_vars)
                        if caller_name not in var_results[key]:
                            var_results[key][caller_name] = defaultdict(dict)
                        var_results[key][caller_name][normalizer_name][comparator_name] = result
                        # No BAM is produced in this mode, so each variant gets empty stats
                        bamstats[key] = {}

        # Iterate over all results and write to standard output. We do this here instead of within the loops above
        # because it keeps results organized by variant, which makes them easier to look at
        self.reporter.write_output(var_results, var_quals, bamstats)
    def process_batch(self, vcf, batchname, gt_policy, ex_snp=None, keep_tmpdir=False, read_depth=250, reads=None):
        """
        Process the given batch of variants by creating a fake 'genome' with the variants, simulating reads from it,
         aligning the reads to make a bam file, then using different callers, variant normalizers, and variant
         comparison methods to generate results. The results are handed to self.reporter.write_output, and need to
         be parsed by a separate utility to generate anything readable.
        :param vcf: .vcf file containing variants to simulate
        :param batchname: Prefix for the name of the temporary working directory (a random suffix is appended)
        :param gt_policy: Passed through to self.create_input_vcf (presumably controls simulated genotypes -- confirm)
        :param ex_snp: Passed through to self.create_input_vcf; defaults to None
        :param keep_tmpdir: If true the temp dir is created with util.TempDir.NEVER_DELETE, otherwise with
         util.TempDir.DELETE_NO_EXCEPTION
        :param read_depth: Passed to bam_simulation.gen_alt_fq when reads have to be generated
        :param reads: Reads to hand to bam_simulation.gen_alt_bam. If None they are generated from the variants
        :return: None. Results are passed to self.reporter.write_output
        :raises ValueError: If no original variant can be found for one of the BED regions
        """
        raw_vars = list(pysam.VariantFile(vcf))

        tmpdir_del_policy = util.TempDir.DELETE_NO_EXCEPTION
        if keep_tmpdir:
            tmpdir_del_policy = util.TempDir.NEVER_DELETE

        tmp_dirname = batchname + "-" + util.randstr()
        with util.TempDir(dirname=tmp_dirname, del_policy=tmpdir_del_policy):
            ref_path = self.conf.get('main', 'ref_genome')
            var_results = defaultdict(dict)

            orig_vcf, variant_sets = self.create_input_vcf(raw_vars, ex_snp, gt_policy)
            bed = util.vars_to_bed(variant_sets)
            # Only generate reads when the caller did not supply any
            if reads is None:
                reads = bam_simulation.gen_alt_fq(ref_path, variant_sets, read_depth)
            bam = bam_simulation.gen_alt_bam(ref_path, self.conf, reads)

            caller_variants = self.call_variants(bam, bed)
            bam_stats = self.collect_bam_stats(bam, bed, orig_vcf)
            var_quals = self.collect_var_quals(caller_variants, bed, orig_vcf)

            # .items() instead of .iteritems(): the latter does not exist on Python 3 dicts
            for normalizer_name, normalizer in self.normalizers.items():
                logging.info("--> Running normalizer %s", normalizer_name)
                normed_orig_vcf = normalizer(orig_vcf, self.conf)

                for caller in caller_variants:
                    normed_caller_vcf = normalizer(caller_variants[caller], self.conf)

                    for comparator_name, comparator in self.comparators.items():
                        logging.info("--> Running comparator %s (normalizer %s)", comparator_name, normalizer_name)
                        all_results = comparator(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                        single_results = split_results(all_results, bed)
                        # Regions and split results are paired positionally
                        for region, result in zip(util.read_regions(bed), single_results):
                            match_vars = util.find_matching_var(orig_vcf, region)
                            if not match_vars:
                                raise ValueError('Unable to find original variant from region ' + str(region))
                            # Genotype of the first sample of the first matching variant, e.g. "0/1"
                            genotype = "/".join(str(i) for i in match_vars[0].samples[0]['GT'])
                            result = compare_single_var(result,
                                                        region,
                                                        normed_orig_vcf,
                                                        normed_caller_vcf,
                                                        comparator,
                                                        genotype,
                                                        self.conf)
                            key = var_key(match_vars)
                            if caller not in var_results[key]:
                                var_results[key][caller] = defaultdict(dict)
                            var_results[key][caller][normalizer_name][comparator_name] = result
            #Iterate over all results and write to standard output. We do this here instead of within the loops above
            #because it keeps results organized by variant, which makes them easier to look at
            self.reporter.write_output(var_results, var_quals, bam_stats)