def gen_reads(vcf, dest_vcf, dest_fq_prefix, ex_snp, gt_policy, read_depth, conf):
    """
    Generate fastqs for the given set of input variants. This code is fired when the user supplies the
    --generate-fqs arg, and closely mimics the fastq generation code in VariantProcessor

    :param vcf: Path to the input VCF file; it is batched via util.batch_variants and read with pysam.VariantFile
    :param dest_vcf: Destination filename for final VCF (may be gzipped)
    :param dest_fq_prefix: Destination prefix for fastq files
    :param ex_snp: Info for extra SNP addition
    :param gt_policy: Policy describing genotype (hets, homs, from file, etc.)
    :param read_depth: Read depth handed to bam_simulation.gen_alt_fq
    :param conf: Configuration object; conf.get('main', 'ref_genome') must yield the reference genome path
    :raises ValueError: If batching the input VCF produces more than one batch
    :return: None; the paths of the written VCF and fastqs are reported through logging
    """
    # Imported locally so this function does not depend on a module-level import being present.
    import functools

    #First, make sure there aren't variants that are too close to process independently...
    batches = util.batch_variants(vcf, max_batch_size=1e9)
    if len(list(batches)) > 1:
        raise ValueError('The VCF file ' + vcf + ' contains variants that are too close to include in a single set of fastqs, please ensure no two variants are within 2kb of each other')

    raw_vars = list(pysam.VariantFile(vcf))
    ref_path = conf.get('main', 'ref_genome')
    variant_sets = bp.create_variant_sets(raw_vars, ex_snp, gt_policy, pysam.FastaFile(ref_path))

    # Flatten the per-set variant lists into one list so a single VCF can be written
    allvars = [var for vset in variant_sets for var in vset['vars']]

    # sorted(..., cmp=...) only exists on Python 2; cmp_to_key gives the same (stable) ordering
    # on Python 2.7 and Python 3.
    variant_batch = sorted(allvars, key=functools.cmp_to_key(util.variant_comp))
    final_vcf = util.write_vcf(variant_batch, dest_vcf, conf)
    logging.info("Writing full VCF to " + final_vcf)

    reads = bam_simulation.gen_alt_fq(ref_path, variant_sets, read_depth, dest_prefix=dest_fq_prefix)
    logging.info("Writing fastqs to " + reads[0] + ", " + reads[1])
def process_batch(self, vcf, batchname, gt_policy, ex_snp=None, keep_tmpdir=False, read_depth=250, reads=None):
    """
    Process the given batch of variants by creating a fake 'genome' with the variants, simulating reads from it,
    aligning the reads to make a bam file, then using different callers, variant normalizers, and variant
    comparison methods to generate results. The results are handed to self.reporter.write_output.

    :param vcf: .vcf file containing variants to simulate
    :param batchname: Prefix for the temporary working directory name (a random suffix is appended)
    :param gt_policy: Genotype policy, forwarded to self.create_input_vcf
    :param ex_snp: Extra-SNP info, forwarded to self.create_input_vcf
    :param keep_tmpdir: If true the temp dir uses util.TempDir.NEVER_DELETE, otherwise DELETE_NO_EXCEPTION
    :param read_depth: Read depth forwarded to bam_simulation.gen_alt_fq when reads must be simulated
    :param reads: Pre-existing reads to align; when None, reads are simulated from the variant sets
    :raises ValueError: If no original variant can be found for one of the bed regions
    :return: None
    """
    raw_vars = list(pysam.VariantFile(vcf))

    if keep_tmpdir:
        tmpdir_del_policy = util.TempDir.NEVER_DELETE
    else:
        tmpdir_del_policy = util.TempDir.DELETE_NO_EXCEPTION

    tmp_dirname = batchname + "-" + util.randstr()
    with util.TempDir(dirname=tmp_dirname, del_policy=tmpdir_del_policy):
        ref_path = self.conf.get('main', 'ref_genome')
        # var_results[variant key][caller][normalizer name][comparator name] -> comparison result
        var_results = defaultdict(dict)

        orig_vcf, variant_sets = self.create_input_vcf(raw_vars, ex_snp, gt_policy)
        bed = util.vars_to_bed(variant_sets)

        # Only simulate reads when the caller did not supply any
        if reads is None:
            reads = bam_simulation.gen_alt_fq(ref_path, variant_sets, read_depth)
        bam = bam_simulation.gen_alt_bam(ref_path, self.conf, reads)

        caller_variants = self.call_variants(bam, bed)
        bam_stats = self.collect_bam_stats(bam, bed, orig_vcf)
        var_quals = self.collect_var_quals(caller_variants, bed, orig_vcf)

        # .items() instead of .iteritems() so the loops run on both Python 2 and Python 3
        for normalizer_name, normalizer in self.normalizers.items():
            logging.info("--> Running normalizer " + normalizer_name)
            normed_orig_vcf = normalizer(orig_vcf, self.conf)

            for caller in caller_variants:
                normed_caller_vcf = normalizer(caller_variants[caller], self.conf)

                for comparator_name, comparator in self.comparators.items():
                    logging.info("--> Running comparator " + comparator_name + " (normalizer " + normalizer_name + ")")
                    all_results = comparator(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                    single_results = split_results(all_results, bed)

                    # Pair each bed region with its per-region slice of the comparator output
                    for region, result in zip(util.read_regions(bed), single_results):
                        match_vars = util.find_matching_var(orig_vcf, region)
                        if not match_vars:
                            raise ValueError('Unable to find original variant from region ' + str(region))

                        # Genotype of the first matching variant's first sample, e.g. "0/1"
                        genotype = "/".join(str(i) for i in match_vars[0].samples[0]['GT'])
                        result = compare_single_var(result,
                                                    region,
                                                    normed_orig_vcf,
                                                    normed_caller_vcf,
                                                    comparator,
                                                    genotype,
                                                    self.conf)

                        key = var_key(match_vars)
                        if caller not in var_results[key]:
                            var_results[key][caller] = defaultdict(dict)
                        var_results[key][caller][normalizer_name][comparator_name] = result

        #Iterate over all results and write to standard output. We do this here instead of within the loops above
        #because it keeps results organized by variant, which makes them easier to look at
        # NOTE(review): assumed to run while the temp dir is still active - confirm against the original layout
        self.reporter.write_output(var_results, var_quals, bam_stats)