def gen_reads(vcf, dest_vcf, dest_fq_prefix, ex_snp, gt_policy, read_depth, conf):
    """
    Generate fastqs for the given set of input variants. This code is fired when the user
    supplies the --generate-fqs arg, and closely mimics the fastq generation code in
    VariantProcessor.
    :param vcf: Path to the VCF file containing the variants to simulate
    :param dest_vcf: Destination filename for final VCF (may be gzipped)
    :param dest_fq_prefix: Destination prefix for fastq files
    :param ex_snp: Info for extra SNP addition
    :param gt_policy: Policy describing genotype (hets, homs, from file, etc.)
    :param read_depth: Depth of simulated read coverage
    :param conf: Configuration object
    """
    # First, make sure there aren't variants that are too close to process independently...
    batches = util.batch_variants(vcf, max_batch_size=1e9)
    if len(list(batches)) > 1:
        raise ValueError('The VCF file ' + vcf + ' contains variants that are too close to '
                         'include in a single set of fastqs; please ensure no two variants '
                         'are within 2kb of each other')

    # 'variants' rather than 'vars', which shadows the builtin
    variants = list(pysam.VariantFile(vcf))
    variant_sets = bp.create_variant_sets(variants, ex_snp, gt_policy,
                                          pysam.FastaFile(conf.get('main', 'ref_genome')))
    allvars = []
    for vset in variant_sets:
        allvars.extend(vset['vars'])

    # cmp-style sort works under Python 2; under Python 3 wrap the comparator
    # with functools.cmp_to_key: sorted(allvars, key=functools.cmp_to_key(util.variant_comp))
    variant_batch = sorted(allvars, cmp=util.variant_comp)
    final_vcf = util.write_vcf(variant_batch, dest_vcf, conf)
    logging.info("Writing full VCF to " + final_vcf)

    reads = bam_simulation.gen_alt_fq(conf.get('main', 'ref_genome'), variant_sets,
                                      read_depth, dest_prefix=dest_fq_prefix)
    logging.info("Writing fastqs to " + reads[0] + ", " + reads[1])
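
# Usage sketch (hypothetical, not called anywhere in the module): generate a pair
# of fastqs for a small VCF. The [main] / ref_genome config option mirrors the
# conf.get('main', 'ref_genome') calls above; the file paths, 100x depth, and the
# None values for ex_snp / gt_policy (assumed to mean "no extra SNP" and "take
# genotypes from the file") are illustrative assumptions only.
def _example_gen_reads(conf):
    gen_reads('example.vcf', 'example-final.vcf.gz', 'example_reads',
              ex_snp=None, gt_policy=None, read_depth=100, conf=conf)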
def process_vcf(vcf, gt_default, conf, output, callers, fqs=None, snp_info=None,
                single_batch=False, keep_tmpdir=False, read_depth=250):
    """
    Perform analyses for each variant in the VCF file.
    :param vcf: Path to vcf file containing variants to process
    :param gt_default: Default genotype policy, passed through to process_batch
    :param conf: Configuration object
    :param output: Path of the JSON results file written by the JsonReporter
    :param callers: Names of variant callers to use; None or empty means all available callers
    :param fqs: Optional fastq files containing reads to use (passed to process_batch as reads)
    :param snp_info: Info for extra SNP addition
    :param single_batch: Assume all variants in VCF are part of one batch and process them all simultaneously
    :param keep_tmpdir: Preserve tmpdirs created (otherwise delete them, unless they are flagged)
    :param read_depth: Depth of simulated read coverage
    """
    variant_callers = core_callers.get_callers()
    #variant_callers.update(load_components(conf, 'callers', 'get_callers'))
    normalizers = core_norms.get_normalizers()
    #normalizers.update(load_components(conf, 'normalizers', 'get_normalizers'))
    comparators = core_comps.get_comparators()
    #comparators.update(load_components(conf, 'comparators', 'get_comparators'))

    # Restrict to the requested callers, if any were named
    if callers is not None and len(callers) > 0:
        callers_to_use = {}
        for caller in callers:
            if caller not in variant_callers:
                raise KeyError('No variant caller ' + caller + ' found in available callers')
            callers_to_use[caller] = variant_callers[caller]
        variant_callers = callers_to_use

    if fqs is not None:
        # Convert fastq paths to absolute paths so they stay valid inside tmpdirs
        fqs = [os.path.abspath(fq) for fq in fqs]

    processor = bp.VariantProcessor(variant_callers, normalizers, comparators,
                                    JsonReporter(output))

    logging.info("Processing variants in file " + vcf)
    if single_batch:
        logging.info("Processing all variants as one batch")
        processor.process_batch(vcf, vcf.replace(".vcf", "-tmpfiles"), conf, gt_default,
                                ex_snp=snp_info, keep_tmpdir=keep_tmpdir,
                                read_depth=read_depth, reads=fqs)
    else:
        batches = util.batch_variants(vcf, max_batch_size=1000, min_safe_dist=2000)
        for batchnum, batch_vcf in enumerate(batches):
            logging.info("Processing batch #" + str(batchnum + 1) + " of " + str(len(batches)))
            processor.process_batch(batch_vcf, vcf.replace(".vcf", "-tmpfiles"), conf,
                                    gt_default, ex_snp=snp_info, keep_tmpdir=keep_tmpdir,
                                    read_depth=read_depth, reads=fqs)
            os.remove(batch_vcf)
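
# Usage sketch (hypothetical, not called anywhere in the module): run the full
# batched analysis with every available caller. The output path and depth are
# illustrative; if a caller list is supplied its names must match keys returned
# by core_callers.get_callers(), and gt_default=None is assumed to mean "use the
# genotypes given in the VCF".
def _example_process_vcf(conf):
    process_vcf('example.vcf', gt_default=None, conf=conf,
                output='example-results.json', callers=None,
                read_depth=100, single_batch=False)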