Code Example #1
File: injectvar.py  Project: brendanofallon/varcomp
def gen_reads(vcf, dest_vcf, dest_fq_prefix, ex_snp, gt_policy, read_depth, conf):
    """
    Generate fastqs for the given set of input variants. This code is fired when the user supplies the --generate-fqs
    arg, and closely mimics the fastq generation code in VariantProcessor
    :param vars: List of variants
    :param dest_vcf: Destination filename for final VCF (may be gzipped)
    :param dest_fq_prefix: Destination prefix for fastq files
    :param ex_snp: Info for extra SNP addition
    :param gt_policy: Policy describing genotype (hets, homs, from file, etc.)
    :param read_depth:
    :param conf:
    """

    #First, make sure there aren't variants that are too close to process independently...
    batches = util.batch_variants(vcf, max_batch_size=1e9)
    if len(list(batches)) > 1:
        raise ValueError('The VCF file ' + vcf + ' contains variants that are too close to include in a single set of fastqs, please ensure no two variants are within 2kb of each other')
    vars = list(pysam.VariantFile(vcf))
    variant_sets = bp.create_variant_sets(vars, ex_snp, gt_policy, pysam.FastaFile(conf.get('main', 'ref_genome')))
    allvars = []
    for vset in variant_sets:
        allvars.extend(vset['vars'])
    # sorted() no longer accepts cmp= under Python 3; functools.cmp_to_key adapts the comparator
    variant_batch = sorted(allvars, key=functools.cmp_to_key(util.variant_comp))
    final_vcf = util.write_vcf(variant_batch, dest_vcf, conf)
    logging.info("Writing full VCF to " + final_vcf)
    reads = bam_simulation.gen_alt_fq(conf.get('main', 'ref_genome'), variant_sets, read_depth, dest_prefix=dest_fq_prefix)
    logging.info("Writing fastqs to " + reads[0] + ", " + reads[1])
Code Example #2
File: injectvar.py  Project: brendanofallon/varcomp
def process_vcf(vcf, gt_default, conf, output, callers, fqs=None, snp_info=None, single_batch=False, keep_tmpdir=False, read_depth=250):
    """
    Perform analyses for each variant in the VCF file.
    :param input_vcf: Path to vcf file containing variants to process
    :param single_batch: Assume all variants in VCF are part of one batch and process them all simultaneously
    :param keep_tmpdir: Preserve tmpdirs created (otherwise delete them, unless they are flagged)
    :param conf: Configuration object
    """

    variant_callers = core_callers.get_callers()
    variant_callers.update(load_components(conf, 'callers', 'get_callers'))

    normalizers = core_norms.get_normalizers()
    normalizers.update(load_components(conf, 'normalizers', 'get_normalizers'))

    comparators = core_comps.get_comparators()
    comparators.update(load_components(conf, 'comparators', 'get_comparators'))

    if callers is not None and len(callers) > 0:
        callers_to_use = {}
        for caller in callers:
            if caller not in variant_callers:
                raise KeyError('No variant caller ' + caller + ' found in callers')
            callers_to_use[caller] = variant_callers[caller]
        variant_callers = callers_to_use

    if fqs is not None:
        fqs = [os.path.abspath(fq) for fq in fqs]

    processor = bp.VariantProcessor(variant_callers, normalizers, comparators, JsonReporter(output), conf)
    logging.info("Processing variants in file " + vcf)
    if single_batch:
        logging.info("Processing all variants as one batch")
        tmp_dir = '{}-tmp'.format(util.strip_extensions(os.path.basename(vcf), ['vcf','gz']))
        processor.process_batch(vcf, tmp_dir, gt_default, ex_snp=snp_info, keep_tmpdir=keep_tmpdir, read_depth=read_depth, reads=fqs)
    else:
        # batch_variants may yield batches lazily (note the list() wrapping in gen_reads above),
        # so materialize it here to allow len() inside the loop
        batches = list(util.batch_variants(vcf, max_batch_size=1000, min_safe_dist=2000))
        for batchnum, batch_vcf in enumerate(batches, 1):
            logging.info('Processing batch #{} of {}'.format(batchnum, len(batches)))
            tmp_dir = '{}-batch{:03d}-tmp'.format(util.strip_extensions(os.path.basename(vcf), ['vcf','gz']), batchnum)
            processor.process_batch(batch_vcf, tmp_dir, gt_default, ex_snp=snp_info, keep_tmpdir=keep_tmpdir, read_depth=read_depth, reads=fqs)
            os.remove(batch_vcf)
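
For completeness, a minimal driver for process_vcf might look like the sketch below. The file names and the caller name are assumptions; any caller passed in must match a key returned by get_callers(), as enforced above.

# Hypothetical invocation of process_vcf; file names and caller list are illustrative only
from configparser import ConfigParser

conf = ConfigParser()
conf.read('varcomp.conf')

process_vcf(
    vcf='test_variants.vcf',
    gt_default=None,            # assumed default genotype policy
    conf=conf,
    output='results.json',      # passed to JsonReporter
    callers=['mycaller'],       # assumed name; must exist in the callers registry
    single_batch=False,         # let util.batch_variants split the input
    keep_tmpdir=False,
    read_depth=250,
)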