def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Drive a variant caller over the prepared regions and annotate its output.

    Picks a default output name when none is given, restricts calling to
    ``region`` via ``subset_variant_regions`` and then either writes an empty
    VCF (target regions resolved to a path that is not a file) or invokes
    ``call_fn(align_bams, ref_file, items, target_regions, tx_out_file)``
    inside a file transaction. Outputs ending in ".gz" are passed through
    ``vcfutils.bgzip_and_index``.

    Returns the result of ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            # NOTE(review): "metdata" looks like a typo for "metadata" -- confirm
            # against the structure of config before relying on this default.
            out_file = "%s-paired-variants.vcf.gz" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        first_bam_name = os.path.basename(align_bams[0])
        logger.debug("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=first_bam_name))
        population_regions = bedutils.population_variant_regions(items)
        variant_regions = bedutils.merge_overlaps(population_regions, items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        # A string target that is not an existing file means nothing to call here.
        nothing_to_call = (variant_regions is not None
                           and isinstance(target_regions, basestring)
                           and not os.path.isfile(target_regions))
        if nothing_to_call:
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call SNPs and indels with FreeBayes, then annotate the calls.

    When ``out_file`` does not exist yet, every input BAM is indexed and the
    FreeBayes output is streamed through vcffilter (``QUAL > 5``),
    vcfallelicprimitives and vcfstreamsort into ``out_file``; the stream is
    bgzipped when the output name ends in "gz".

    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            for bam_file in align_bams:
                bam.index(bam_file, config)
            freebayes_bin = config_utils.get_program("freebayes", config)
            vcffilter_bin = config_utils.get_program("vcffilter", config)
            primitives_bin = config_utils.get_program("vcfallelicprimitives", config)
            sort_bin = config_utils.get_program("vcfstreamsort", config)
            bam_args = " ".join(["-b %s" % x for x in align_bams])
            caller_opts = " ".join(
                _freebayes_options_from_config(items, config, out_file, region))
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            pipeline = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                        "{vcffilter} -f 'QUAL > 5' -s | {vcfallelicprimitives} | {vcfstreamsort} "
                        "{compress_cmd} > {tx_out_file}")
            command = pipeline.format(
                freebayes=freebayes_bin, ref_file=ref_file, input_bams=bam_args,
                opts=caller_opts, vcffilter=vcffilter_bin,
                vcfallelicprimitives=primitives_bin, vcfstreamsort=sort_bin,
                compress_cmd=compress_cmd, tx_out_file=tx_out_file)
            do.run(command, "Genotyping with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files["dbsnp"],
                                           ref_file, config)
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call SNPs and indels with FreeBayes, then annotate the calls.

    Input BAMs are indexed, FreeBayes is run with the configured options plus
    ``--min-repeat-entropy 1 --experimental-gls``, and the output is streamed
    through vcffilter (``QUAL > 5``), vcfallelicprimitives and vcfstreamsort.
    The stream is bgzipped when ``out_file`` ends in "gz". Nothing is re-run
    when ``out_file`` already exists.

    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            for bam_file in align_bams:
                bam.index(bam_file, config)
            freebayes_bin = config_utils.get_program("freebayes", config)
            vcffilter_bin = config_utils.get_program("vcffilter", config)
            primitives_bin = config_utils.get_program("vcfallelicprimitives", config)
            sort_bin = config_utils.get_program("vcfstreamsort", config)
            bam_args = " ".join(["-b %s" % x for x in align_bams])
            caller_opts = " ".join(
                _freebayes_options_from_config(items, config, out_file, region))
            # Recommended options from 1000 genomes low-complexity evaluation
            # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
            caller_opts += " --min-repeat-entropy 1 --experimental-gls"
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            pipeline = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                        "{vcffilter} -f 'QUAL > 5' -s | {vcfallelicprimitives} | {vcfstreamsort} "
                        "{compress_cmd} > {tx_out_file}")
            command = pipeline.format(
                freebayes=freebayes_bin, ref_file=ref_file, input_bams=bam_args,
                opts=caller_opts, vcffilter=vcffilter_bin,
                vcfallelicprimitives=primitives_bin, vcfstreamsort=sort_bin,
                compress_cmd=compress_cmd, tx_out_file=tx_out_file)
            do.run(command, "Genotyping with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files["dbsnp"],
                                           ref_file, config)
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.

    When the pair has no normal BAM, calling is delegated to
    ``_run_scalpel_caller`` and its result is returned directly. Otherwise
    ``scalpel-discovery --somatic`` is run into a ``-scalpel-work`` directory
    derived from ``out_file`` (skipped when ``main/somatic.db.dir`` already
    exists there). The somatic and common indel VCFs are then passed through
    the commands from ``get_scalpel_bcftools_filter_expression``, concatenated
    with vcfcat, sorted and written to ``out_file``.

    Returns the result of ``annotation.annotate_nongatk_vcf`` on ``out_file``
    (or the single-sample caller's result when there is no normal BAM).
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # No normal sample: fall back to single-sample calling.
                # NOTE(review): this returns from inside the open transaction on
                # the same out_file -- confirm that nesting is intended.
                ann_file = _run_scalpel_caller(align_bams, items, ref_file, assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            # Only run discovery when no database from an earlier run is present.
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            # Hard-coded switch: the export branch below is currently never taken.
            use_defaults = True
            if use_defaults:
                # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
                # to swap precision for sensitivity
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # The <( ... ) process substitutions feed both filtered VCFs to vcfcat.
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config)
    return ann_file
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call variants on a tumor / normal pair with FreeBayes and annotate them.

    FreeBayes runs with ``--pooled-discrete --pvar 0.7 --genotype-qualities``
    on the tumor and normal BAMs and its output is piped through
    ``vcfsamplediff -s VT`` with the normal sample named first.

    Raises ValueError when the pair has no normal BAM.
    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            pair = get_paired_bams(align_bams, items)
            if not pair.normal_bam:
                raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")
            samplediff_bin = config_utils.get_program("vcfsamplediff", config)
            freebayes_bin = config_utils.get_program("freebayes", config)
            base_opts = " ".join(
                _freebayes_options_from_config(items, config, out_file, region))
            caller_opts = base_opts + " -f {}".format(ref_file)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            pipeline = ("{freebayes} --pooled-discrete --pvar 0.7"
                        " --genotype-qualities {opts} {paired.tumor_bam}"
                        " {paired.normal_bam} | {vcfsamplediff} -s VT"
                        " {paired.normal_name} {paired.tumor_name}"
                        " - {compress_cmd} > {tx_out_file}")
            bam.index(pair.tumor_bam, config)
            bam.index(pair.normal_bam, config)
            command = pipeline.format(
                freebayes=freebayes_bin, opts=caller_opts, paired=pair,
                vcfsamplediff=samplediff_bin, compress_cmd=compress_cmd,
                tx_out_file=tx_out_file)
            do.run(command, "Genotyping paired variants with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files["dbsnp"], ref_file,
                                           config)
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.

    FreeBayes output is filtered with vcffilter (``QUAL > 1``) and piped
    through vcfsamplediff with the normal sample named first. Unless the
    configured options already set a minimum alternate fraction, one is added
    from the tumor's ``algorithm.min_allele_fraction`` setting (a percentage,
    default 10).

    Raises:
        ValueError: if the pair has no normal BAM.

    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError(
                    "Require both tumor and normal BAM files for FreeBayes cancer calling"
                )
            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(
                _freebayes_options_from_config(items, config, out_file, region))
            opts += " -f {}".format(ref_file)
            # Match whole option tokens rather than substrings: opts also holds
            # file paths (reference, targets), and a path that merely contains
            # "-F" must not suppress the default below.
            has_min_af = any(
                tok.startswith(("-F", "--min-alternate-fraction"))
                for tok in opts.split())
            if not has_min_af:
                # add minimum reportable allele frequency
                # FreeBayes defaults to 20%, but use 10% by default for the
                # tumor case
                min_af = float(
                    utils.get_in(paired.tumor_config,
                                 ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one

            # NOTE: -s in vcfsamplediff (strict checking: i.e., require no
            # reads in the germline to call somatic) is not used as it is
            # too stringent
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl = (
                "{freebayes} --pooled-discrete --genotype-qualities "
                "{opts} {paired.tumor_bam} {paired.normal_bam} "
                "| {vcffilter} -f 'QUAL > 1' -s "
                "| {vcfsamplediff} VT {paired.normal_name} {paired.tumor_name} - "
                "{compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()),
                   "Genotyping paired variants with FreeBayes", {})
    fix_somatic_calls(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file,
                                               config)
    return ann_file
def _run_qsnp_paired(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call somatic mutations on a tumor / normal pair with qSNP.

    qSNP is launched from a temporary working directory using an ini file
    written by ``_create_input``; when ``region`` is given, the paired BAMs
    are first replaced by the result of ``_create_bam_region``. The output is
    then passed through ``_filter_vcf`` and ``bgzip_and_index``.

    Returns the result of ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        # Drop any ".gz" so the transaction targets an uncompressed name;
        # compression happens after filtering.
        out_file = out_file.replace(".gz", "")
        with file_transaction(config, out_file) as tx_out_file:
            with tx_tmpdir() as work_dir, utils.chdir(work_dir):
                pair = get_paired_bams(align_bams, items)
                qsnp_bin = config_utils.get_program("qsnp", config)
                qsnp_resources = config_utils.get_resources("qsnp", config)
                jvm_opts = qsnp_resources.get("jvm_opts", ["-Xms750m -Xmx4g"])
                mem_args = " ".join(jvm_opts)
                log_file = os.path.join(work_dir, "qsnp.log")
                ini_file = os.path.join(work_dir, "qsnp.ini")
                if region:
                    pair = _create_bam_region(pair, region, work_dir)
                _create_input(pair, tx_out_file, ref_file, assoc_files["dbsnp"], ini_file)
                command = "{qsnp} {mem} -i {qsnp_init} -log {qsnp_log}".format(
                    qsnp=qsnp_bin, mem=mem_args, qsnp_init=ini_file, qsnp_log=log_file)
                do.run(command, "Genotyping paired variants with Qsnp", {})
        out_file = _filter_vcf(out_file)
        out_file = bgzip_and_index(out_file, config)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"), ref_file,
                                           config)
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call SNPs and indels with FreeBayes, clean the output and annotate it.

    The FreeBayes stream is filtered with vcffilter (``QUAL > 5``), split with
    vcfallelicprimitives and sorted with vcfstreamsort. After a fresh run the
    output is passed to ``clean_vcf_output`` with ``_clean_freebayes_output``.

    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            for bam_file in align_bams:
                bam.index(bam_file, config)
            freebayes_bin = config_utils.get_program("freebayes", config)
            vcffilter_bin = config_utils.get_program("vcffilter", config)
            primitives_bin = config_utils.get_program("vcfallelicprimitives", config)
            sort_bin = config_utils.get_program("vcfstreamsort", config)
            bam_args = " ".join(["-b %s" % x for x in align_bams])
            caller_opts = " ".join(
                _freebayes_options_from_config(items, config["algorithm"], out_file, region))
            pipeline = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                        "{vcffilter} -f 'QUAL > 5' -s | {vcfallelicprimitives} | {vcfstreamsort} > {tx_out_file}")
            command = pipeline.format(
                freebayes=freebayes_bin, ref_file=ref_file, input_bams=bam_args,
                opts=caller_opts, vcffilter=vcffilter_bin,
                vcfallelicprimitives=primitives_bin, vcfstreamsort=sort_bin,
                tx_out_file=tx_out_file)
            do.run(command, "Genotyping with FreeBayes", {})
        clean_vcf_output(out_file, _clean_freebayes_output, "nodups")
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files["dbsnp"],
                                           ref_file, config)
def shared_variantcall(call_fn, name, align_bams, ref_file, config,
                       assoc_files, region=None, out_file=None):
    """Index the inputs, run a variant caller on the region and annotate the result.

    Every BAM is indexed through the Broad runner's ``picard_index``. An empty
    VCF is written when the target regions resolve to a path that is not a
    file, or when any BAM has no aligned reads in ``region``; otherwise
    ``call_fn(align_bams, ref_file, config, target_regions, tx_out_file)`` is
    invoked inside a file transaction.

    Returns the result of ``annotation.annotate_nongatk_vcf``.
    """
    runner = broad.runner_from_config(config)
    for bam_file in align_bams:
        runner.run_fn("picard_index", bam_file)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        first_bam_name = os.path.basename(align_bams[0])
        logger.info("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=first_bam_name))
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        missing_targets = (variant_regions is not None
                           and isinstance(target_regions, basestring)
                           and not os.path.isfile(target_regions))
        if missing_targets or any(not realign.has_aligned_reads(x, region)
                                  for x in align_bams):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, config, target_regions, tx_out_file)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.dbsnp,
                                           ref_file, config)
def run_freebayes(align_bams, items, ref_file, assoc_files,
                  region=None, out_file=None):
    """Call SNPs and indels with FreeBayes and annotate the result.

    Builds an argument list (``-v`` output, reference, ``--use-mapping-quality``,
    ``--pvar 0.7``, one ``-b`` per BAM plus configured options), indexing each
    BAM through the Broad runner along the way. After a fresh run the output
    is passed to ``_clean_freebayes_output``.

    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            args = [config_utils.get_program("freebayes", config)]
            args.extend(["-v", tx_out_file, "-f", ref_file,
                         "--use-mapping-quality", "--pvar", "0.7"])
            for bam_file in align_bams:
                runner.run_fn("picard_index", bam_file)
                args.extend(["-b", bam_file])
            args += _freebayes_options_from_config(config["algorithm"], out_file, region)
            do.run(args, "Genotyping with FreeBayes", {})
        _clean_freebayes_output(out_file)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files["dbsnp"],
                                           ref_file, config)
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Index the inputs, run a variant caller on the region and annotate the result.

    When the output is missing, every BAM is indexed and the configured
    ``variant_regions`` are subset to ``region``. An empty VCF is written when
    the targets resolve to a path that is not a file, or when any BAM has no
    aligned reads in ``region``; otherwise
    ``call_fn(align_bams, ref_file, items, target_regions, tx_out_file)`` is
    invoked inside a file transaction.

    Returns the result of ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            # NOTE(review): "metdata" looks like a typo for "metadata" -- confirm
            # against the structure of config before relying on this default.
            out_file = "%s-paired-variants.vcf" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        first_bam_name = os.path.basename(align_bams[0])
        logger.info("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=first_bam_name))
        for bam_file in align_bams:
            bam.index(bam_file, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        missing_targets = (variant_regions is not None
                           and isinstance(target_regions, basestring)
                           and not os.path.isfile(target_regions))
        if missing_targets or any(not realign.has_aligned_reads(x, region)
                                  for x in align_bams):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files["dbsnp"],
                                           ref_file, config)
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call SNPs and indels with FreeBayes, then annotate the calls.

    Input BAMs are indexed, FreeBayes is run with the configured options plus
    ``--min-repeat-entropy 1 --experimental-gls``, and the output is streamed
    through vcffilter (``QUAL > 5``), the command from
    ``vcfutils.fix_ambiguous_cl``, vcfallelicprimitives and vcfstreamsort.
    The stream is bgzipped when ``out_file`` ends in "gz".

    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for bam_file in align_bams:
                bam.index(bam_file, config)
            freebayes_bin = config_utils.get_program("freebayes", config)
            vcffilter_bin = config_utils.get_program("vcffilter", config)
            primitives_bin = config_utils.get_program("vcfallelicprimitives", config)
            sort_bin = config_utils.get_program("vcfstreamsort", config)
            bam_args = " ".join(["-b %s" % x for x in align_bams])
            caller_opts = " ".join(
                _freebayes_options_from_config(items, config, out_file, region))
            # Recommended options from 1000 genomes low-complexity evaluation
            # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
            caller_opts += " --min-repeat-entropy 1 --experimental-gls"
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            ambig_fix_cmd = vcfutils.fix_ambiguous_cl()
            pipeline = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                        "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} "
                        "{compress_cmd} > {tx_out_file}")
            command = pipeline.format(
                freebayes=freebayes_bin, ref_file=ref_file, input_bams=bam_args,
                opts=caller_opts, vcffilter=vcffilter_bin, fix_ambig=ambig_fix_cmd,
                vcfallelicprimitives=primitives_bin, vcfstreamsort=sort_bin,
                compress_cmd=compress_cmd, tx_out_file=tx_out_file)
            do.run(command, "Genotyping with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def postcall_annotate(in_file, bam_file, ref_file, vrn_files, config):
    """Annotate FreeBayes calls ahead of filtering.

    Thin wrapper that forwards to ``annotation.annotate_nongatk_vcf`` using
    the dbSNP file from ``vrn_files`` and returns its result.
    """
    return annotation.annotate_nongatk_vcf(in_file, bam_file, vrn_files.dbsnp,
                                           ref_file, config)
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Drive a variant caller over the prepared regions and annotate its output.

    Picks a default output name when none is given, restricts calling to
    ``region`` via ``subset_variant_regions`` and then either writes an empty
    VCF (target regions resolved to a path that is not a file) or invokes
    ``call_fn(align_bams, ref_file, items, target_regions, tx_out_file)``
    inside a file transaction. Outputs ending in ".gz" are passed through
    ``vcfutils.bgzip_and_index``.

    Returns the result of ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            # NOTE(review): "metdata" looks like a typo for "metadata" -- confirm
            # against the structure of config before relying on this default.
            out_file = "%s-paired-variants.vcf.gz" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        first_bam_name = os.path.basename(align_bams[0])
        logger.debug("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=first_bam_name))
        population_regions = bedutils.population_variant_regions(items)
        variant_regions = bedutils.merge_overlaps(population_regions, items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file,
                                                items=items)
        # A string target that is not an existing file means nothing to call here.
        nothing_to_call = (variant_regions is not None
                           and isinstance(target_regions, basestring)
                           and not os.path.isfile(target_regions))
        if nothing_to_call:
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call SNPs and indels on a tumor/normal pair with FreeBayes and annotate them.

    When the option helper reports no target regions an empty VCF is written
    for the pair's sample names; otherwise FreeBayes output is piped through
    bcftools filtering, somatic calling, normalisation and sorting steps.
    A normal BAM is required (checked with an assertion).

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299

    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            pair = get_paired_bams(align_bams, items)
            assert pair.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"
            freebayes_bin = config_utils.get_program("freebayes", config)
            opt_list, no_target_regions = _freebayes_options_from_config(
                items, config, out_file, region)
            if no_target_regions:
                sample_names = [x for x in [pair.tumor_name, pair.normal_name] if x]
                vcfutils.write_empty_vcf(tx_out_file, config, samples=sample_names)
            else:
                caller_opts = " ".join(opt_list)
                caller_opts += " --min-repeat-entropy 1"
                caller_opts += " --no-partial-observations"
                caller_opts = _add_somatic_opts(caller_opts, pair)
                if out_file.endswith("gz"):
                    compress_cmd = "| bgzip -c"
                else:
                    compress_cmd = ""
                ambig_fix_cmd = vcfutils.fix_ambiguous_cl()
                fmt_clean_cmd = _clean_freebayes_fmt_cl()
                # "py" executable located next to the running Python interpreter
                py_exe = os.path.join(os.path.dirname(sys.executable), "py")
                pipeline = (
                    "{freebayes} -f {ref_file} {opts} "
                    "{paired.tumor_bam} {paired.normal_bam} "
                    """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                    "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                    "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | "
                    "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                    "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                    "vt normalize -n -r {ref_file} -q - | vcfuniqalleles "
                    "{compress_cmd} > {tx_out_file}")
                command = pipeline.format(
                    freebayes=freebayes_bin, ref_file=ref_file, opts=caller_opts,
                    paired=pair, py_cl=py_exe, fix_ambig=ambig_fix_cmd,
                    clean_fmt_cmd=fmt_clean_cmd, compress_cmd=compress_cmd,
                    tx_out_file=tx_out_file)
                do.run(command, "Genotyping paired variants with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"), ref_file,
                                           config)
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None, somatic=None):
    """Call SNPs and indels with FreeBayes, then annotate the calls.

    When the option helper reports no target regions an empty VCF is written
    for the samples in ``items``. Otherwise FreeBayes output is filtered,
    decomposed into allelic primitives, normalised, sorted and de-duplicated.
    A truthy ``somatic`` is passed to ``_add_somatic_opts`` to extend the
    FreeBayes options.

    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            freebayes_bin = config_utils.get_program("freebayes", config)
            bam_args = " ".join(["-b %s" % x for x in align_bams])
            opt_list, no_target_regions = _freebayes_options_from_config(
                items, config, out_file, region)
            if no_target_regions:
                sample_names = [dd.get_sample_name(d) for d in items]
                vcfutils.write_empty_vcf(tx_out_file, config, samples=sample_names)
            else:
                caller_opts = " ".join(opt_list)
                # Recommended options from 1000 genomes low-complexity evaluation
                # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
                caller_opts += " --min-repeat-entropy 1"
                # Remove partial observations, which cause a preference for heterozygote calls
                # https://github.com/ekg/freebayes/issues/234#issuecomment-205331765
                caller_opts += " --no-partial-observations"
                if somatic:
                    caller_opts = _add_somatic_opts(caller_opts, somatic)
                if out_file.endswith("gz"):
                    compress_cmd = "| bgzip -c"
                else:
                    compress_cmd = ""
                # For multi-sample outputs, ensure consistent order
                if len(items) > 1:
                    sample_arg = "-s" + ",".join([dd.get_sample_name(d) for d in items])
                else:
                    sample_arg = ""
                ambig_fix_cmd = vcfutils.fix_ambiguous_cl()
                # "py" executable located next to the running Python interpreter
                py_exe = os.path.join(os.path.dirname(sys.executable), "py")
                pipeline = (
                    "{freebayes} -f {ref_file} {opts} {input_bams} "
                    """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                    "| {fix_ambig} | bcftools view {samples} -a - | "
                    "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                    "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                    "vt normalize -n -r {ref_file} -q - | vcfuniqalleles | vt uniq - 2> /dev/null "
                    "{compress_cmd} > {tx_out_file}")
                command = pipeline.format(
                    freebayes=freebayes_bin, ref_file=ref_file, opts=caller_opts,
                    input_bams=bam_args, fix_ambig=ambig_fix_cmd, samples=sample_arg,
                    py_cl=py_exe, compress_cmd=compress_cmd, tx_out_file=tx_out_file)
                do.run(command, "Genotyping with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def postcall_annotate(in_file, ref_file, vrn_files, config):
    """Annotate FreeBayes calls ahead of filtering.

    Thin wrapper that forwards to ``annotation.annotate_nongatk_vcf`` using
    the dbSNP file from ``vrn_files`` and returns its result. No filtering is
    applied here.
    """
    return annotation.annotate_nongatk_vcf(in_file, vrn_files.dbsnp,
                                           ref_file, config)
def finalize_genotyper(call_file, bam_file, ref_file, config):
    """Annotate (for non-GATK callers) and filter a set of variant calls.

    The caller is read from ``config["algorithm"]["variantcaller"]``
    (default "gatk"). Calls from the listed non-GATK callers are first run
    through ``annotation.annotate_nongatk_vcf``.

    Returns the result of ``variant_filtration``.
    """
    vrn_files = configured_vrn_files(config, ref_file)
    caller = config["algorithm"].get("variantcaller", "gatk")
    needs_annotation = caller in ["freebayes", "cortex", "samtools",
                                  "gatk-haplotype", "varscan"]
    if needs_annotation:
        call_file = annotation.annotate_nongatk_vcf(call_file, bam_file,
                                                    vrn_files.dbsnp,
                                                    ref_file, config)
    return variant_filtration(call_file, ref_file, vrn_files, config)
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call indels on a single sample with Scalpel and annotate them.

    ``scalpel-discovery --single`` writes into a transactional work directory
    that is then moved next to ``out_file``. Its ``variants.indel.vcf`` is
    filtered, has the sample column renamed with sed, and is decomposed and
    sorted into ``out_file``.

    Raises ValueError when more than one BAM is supplied.
    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                raise ValueError("Scalpel does not currently support batch calling!")
            bam_args = " ".join(["%s" % x for x in align_bams])
            work_dir = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_work_dir = "%s-scalpel-work" % utils.splitext_plus(tx_out_file)[0]
            if os.path.exists(work_dir):
                utils.remove_safe(work_dir)
            scalpel_opts = " ".join(
                _scalpel_options_from_config(items, config, out_file, region, work_dir))
            scalpel_opts += " --dir %s" % tx_work_dir
            min_cov = "3"  # minimum coverage
            scalpel_opts += " --mincov %s" % min_cov
            exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            discovery = ("{perl_exports} && "
                         "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            do.run(discovery.format(perl_exports=exports, opts=scalpel_opts,
                                    ref_file=ref_file, input_bams=bam_args),
                   "Genotyping with Scalpel", {})
            shutil.move(tx_work_dir, work_dir)
            # parse produced variant file further
            raw_indels = bgzip_and_index(os.path.join(work_dir, "variants.indel.vcf"), config)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            chi2_filter_cmd = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name = items[0]["name"][1]
            ambig_fix_cmd = vcfutils.fix_ambiguous_cl()
            finalize = (
                "{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                "{compress_cmd} > {tx_out_file}")
            do.run(finalize.format(bcftools_cmd_chi2=chi2_filter_cmd,
                                   scalpel_tmp_file=raw_indels,
                                   sample_name_str=sample_name,
                                   fix_ambig=ambig_fix_cmd,
                                   compress_cmd=compress_cmd,
                                   tx_out_file=tx_out_file),
                   "Finalising Scalpel variants", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    Each BAM is called separately, paired positionally with its entry in
    ``items``. With a single BAM the pipeline writes straight to the
    transactional output; with several, each sample goes to its own
    ``<prefix><sample>.temp.vcf[.gz]`` file and the per-sample files are
    combined with ``vcfutils.merge_variant_files``.

    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                vardict = config_utils.get_program("vardict", config)
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
                vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # min_allele_fraction is configured as a percentage (default 10 -> 0.1)
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                sample = item["name"][1]
                # Placeholders are filled from local variables via locals() below.
                cmd = ("{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    # Batch calling: write this sample to its own temporary VCF.
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.

    Without a normal BAM the work is delegated to ``_run_freebayes_caller``
    and its result is returned. Otherwise FreeBayes output is filtered with
    vcffilter (``QUAL > 5``), decomposed, sorted and piped through
    vcfsamplediff with the normal sample named first. Unless the configured
    options already set a minimum alternate fraction, one is added from the
    tumor's ``algorithm.min_allele_fraction`` setting (a percentage,
    default 10).

    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # Tumor-only input: fall back to the standard caller instead of failing.
                return _run_freebayes_caller(align_bams, items, ref_file,
                                             assoc_files, region, out_file)
            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            # Match whole option tokens rather than substrings: opts can hold
            # file paths, and a path that merely contains "-F" must not
            # suppress the default below.
            has_min_af = any(
                tok.startswith(("-F", "--min-alternate-fraction"))
                for tok in opts.split())
            if not has_min_af:
                # add minimum reportable allele frequency
                # FreeBayes defaults to 20%, but use 10% by default for the
                # tumor case
                min_af = float(utils.get_in(paired.tumor_config,
                                            ("algorithm", "min_allele_fraction"),
                                            10)) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            opts += " --min-repeat-entropy 1 --experimental-gls"
            # Recommended settings for cancer calling
            # https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
            opts += " --pooled-discrete --genotype-qualities --report-genotype-likelihood-max"
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one

            # NOTE: -s in vcfsamplediff (strict checking: i.e., require no
            # reads in the germline to call somatic) is not used as it is
            # too stringent
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl = ("{freebayes} -f {ref_file} {opts} "
                  "{paired.tumor_bam} {paired.normal_bam} "
                  "| {vcffilter} -f 'QUAL > 5' -s "
                  "| {vcfallelicprimitives} | {vcfstreamsort} "
                  "| {vcfsamplediff} VT {paired.normal_name} {paired.tumor_name} - "
                  "{compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    fix_somatic_calls(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run the germline workflow and return a bgzipped, indexed VCF.

    When ``out_file`` is missing, the workflow is configured and run inside
    a transactional ``<out_file base>-work`` directory, and the resulting
    VCF (gVCF when ``joint.want_gvcf`` says so) is annotated into
    ``out_file``.
    """
    data = items[0]
    if not utils.file_exists(out_file):
        work_dir = "%s-work" % utils.splitext_plus(out_file)[0]
        with file_transaction(data, work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file,
                                                region, out_file, tx_work_dir)
            _run_workflow(data, workflow_file, tx_work_dir)
        # Pick the result file matching the requested output type.
        if joint.want_gvcf(items):
            vcf_name = "genome.vcf.gz"
        else:
            vcf_name = "variants.vcf.gz"
        raw_file = os.path.join(work_dir, "results", "variants", vcf_name)
        out_file = annotation.annotate_nongatk_vcf(raw_file, align_bams,
                                                   assoc_files.get("dbsnp"),
                                                   ref_file, data, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call indels on a single sample with Scalpel.

    Scalpel writes into the transaction directory; its raw VCF is then
    filtered with bcftools, renamed to the real sample, decomposed and
    sorted into ``out_file``.

    Raises ValueError when more than one BAM is supplied.
    Returns the file produced by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            # Names below feed the command templates through locals().
            scalpel = config_utils.get_program("scalpel", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            if len(align_bams) > 1:
                raise ValueError("Scalpel does not currently support batch calling!")
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = os.path.dirname(tx_out_file)
            min_cov = "3"  # minimum coverage
            opts = " ".join(_scalpel_options_from_config(items, config, out_file,
                                                         region, tmp_path))
            opts += " --dir %s" % tmp_path
            opts += " --mincov %s" % min_cov
            # First pass: run Scalpel into the temporary folder.
            cmd = "{scalpel} --single {opts} --ref {ref_file} --bam {input_bams} "
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            # Second pass: post-process the variant file Scalpel produced.
            raw_vcf = os.path.join(tmp_path, "variants." + min_cov + "x.indel.vcf")
            scalpel_tmp_file = bgzip_and_index(raw_vcf, config)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name_str = items[0]["name"][1]
            cl2 = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | sed 's/sample_name/{sample_name_str}/g' | "
                   "{vcfallelicprimitives} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call variants on a tumor/normal pair with VarDict.

    Batches without a normal BAM are handed to ``_run_vardict_caller``.
    Returns the file produced by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                return _run_vardict_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            # Names below feed the command template through locals().
            vardict = config_utils.get_program("vardict", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            strandbias = "testsomatic.R"
            var2vcf = "var2vcf_somatic.pl"
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            af_percent = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
            freq = float(af_percent) / 100.0
            opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            if coverage_interval == "regional":
                var2vcf_opts = " -v 50 "
            else:
                var2vcf_opts = ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| {strandbias} "
                   "| {var2vcf} -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                   "| {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call SNPs and indels with VarDict, one run per input BAM.

    With a single BAM the pipeline writes straight into ``out_file``. With
    several, each sample gets its own temporary VCF and the results are
    merged with ``vcfutils.merge_variant_files``.

    Returns the file produced by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            # per-sample VCFs, needed when batch calling is required
            sample_vcf_names = []
            for bamfile, item in itertools.izip(align_bams, items):
                # Names below feed the command template through locals().
                vardict = config_utils.get_program("vardict", config)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
                if out_file.endswith("gz"):
                    compress_cmd = "| bgzip -c"
                else:
                    compress_cmd = ""
                af_percent = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
                freq = float(af_percent) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                if coverage_interval == "regional":
                    var2vcf_opts = " -v 50 "
                else:
                    var2vcf_opts = ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                sample = item["name"][1]
                cmd = ("{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + sample
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    if out_file.endswith("gz"):
                        tmp_out += ".gz"
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based
                # end-inclusive coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file,
                                             ref_file=ref_file,
                                             config=config,
                                             region=bamprep.region_to_gatk(region))
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call variants on a tumor/normal pair with FreeBayes.

    Batches without a normal BAM are handed to ``_run_freebayes_caller``.
    FreeBayes output is filtered on QUAL, passed through
    ``bcbio.variation.freebayes.call_somatic``, decomposed and normalized.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299

    Returns the file produced by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # Tumor-only input is delegated instead of raising.
                return _run_freebayes_caller(align_bams, items, ref_file,
                                             assoc_files, region, out_file)
            # Names below feed the command template through locals().
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            if not ("--min-alternate-fraction" in opts or "-F" in opts):
                # Add a minimum reportable allele frequency. FreeBayes
                # defaults to 20%, but 10% is used by default for tumors.
                af_percent = utils.get_in(paired.tumor_config,
                                          ("algorithm", "min_allele_fraction"), 10)
                min_af = float(af_percent) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            opts += " --min-repeat-entropy 1 --experimental-gls"
            # Recommended settings for cancer calling
            opts += (" --pooled-discrete --pooled-continuous --genotype-qualities "
                     "--report-genotype-likelihood-max --allele-balance-priors-off")
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            cl = ("{freebayes} -f {ref_file} {opts} "
                  "{paired.tumor_bam} {paired.normal_bam} "
                  "| vcffilter -f 'QUAL > 5' -s "
                  "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                  "| {fix_ambig} | vcfallelicprimitives --keep-info --keep-geno "
                  "| vt normalize -q -r {ref_file} - "
                  "{compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def finalize_genotyper(call_file, bam_file, ref_file, config):
    """Annotate (for non-GATK callers) and filter a set of variant calls.

    The caller is read from ``config["algorithm"]["variantcaller"]`` and
    defaults to "gatk". Returns the result of ``variant_filtration``.
    """
    # Callers whose output is annotated before filtering.
    annotate_callers = ("freebayes", "cortex", "samtools",
                        "gatk-haplotype", "varscan")
    vrn_files = configured_vrn_files(config, ref_file)
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if variantcaller in annotate_callers:
        call_file = annotation.annotate_nongatk_vcf(call_file, bam_file,
                                                    vrn_files.dbsnp,
                                                    ref_file, config)
    return variant_filtration(call_file, ref_file, vrn_files, config)
def call_variations(data, args):
    """Run the BisSNP steps for one sample and return ``data`` unchanged.

    Work happens in ``bissnp/<sample>``: covariate counting, base quality
    recalibration, VCF calling, VCF correction and annotation of the SNP
    calls. The annotated file path is not attached to ``data``.
    """
    safe_makedir("bissnp")
    sample = data['name']
    final_bam = data['final_bam']
    config = data['config']
    workdir = op.abspath(safe_makedir(op.join("bissnp", sample)))
    counts_file = _count_covars(final_bam, sample, workdir, args.snp,
                                args.reference, config)
    recal_bam = _recal_BQ_score(final_bam, sample, workdir, counts_file,
                                args.reference, config)
    cpg, snp = _call_vcf(recal_bam, sample, workdir, args.reference, config)
    sort_snp = _correct_vcf(snp)
    snp = annotation.annotate_nongatk_vcf(sort_snp, [final_bam], args.snp,
                                          args.reference, config)
    return data
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call indels on a tumor/normal pair with Scalpel.

    Batches without a normal BAM are handed to ``_run_scalpel_caller``.
    The somatic and common indel files Scalpel writes are filtered with
    bcftools, concatenated, renamed to the tumor sample and sorted.

    Returns the file produced by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                return _run_scalpel_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            # Names below feed the command templates through locals().
            scalpel = config_utils.get_program("scalpel", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            tmp_path = os.path.dirname(tx_out_file)
            min_cov = "3"  # minimum coverage
            opts = " ".join(_scalpel_options_from_config(items, config, out_file,
                                                         region, tmp_path))
            opts += " --ref {}".format(ref_file)
            opts += " --dir %s" % tmp_path
            opts += " --mincov %s" % min_cov
            cl = "{scalpel} --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}"
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # somatic
            somatic_vcf = os.path.join(tmp_path, "main/somatic." + min_cov + "x.indel.vcf")
            scalpel_tmp_file = bgzip_and_index(somatic_vcf, config)
            # common
            common_vcf = os.path.join(tmp_path, "main/common." + min_cov + "x.indel.vcf")
            scalpel_tmp_file_common = bgzip_and_index(common_vcf, config)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " sed 's/sample_name/{paired.tumor_name}/g' | "
                   "{vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call variants on a tumor/normal pair with VarDict.

    Batches without a normal BAM are handed to ``_run_vardict_caller``.
    Unless any item lists "vardict_somatic_filter" under ``tools_off``,
    the output is piped through ``bcbio.variation.freebayes.call_somatic``.

    Returns the file produced by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                return _run_vardict_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            # Names below feed the command template through locals().
            vardict = config_utils.get_program("vardict", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            strandbias = "testsomatic.R"
            var2vcf = "var2vcf_paired.pl"
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            af_percent = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
            freq = float(af_percent) / 100.0
            # merge bed file regions as amplicon VarDict is only supported
            # in single sample mode
            opts = " ".join(_vardict_options_from_config(items, config, out_file,
                                                         region, do_merge=True))
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            if coverage_interval == "regional":
                var2vcf_opts = " -v 50 "
            else:
                var2vcf_opts = ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            filter_disabled = any(
                "vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                for data in items)
            if filter_disabled:
                somatic_filter = ""
            else:
                somatic_filter = ("| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" %
                                  os.path.join(os.path.dirname(sys.executable), "py"))
            cmd = ("{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| {strandbias} "
                   "| {var2vcf} -M -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                   "{somatic_filter} | {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call variants on a tumor/normal pair with FreeBayes.

    FreeBayes output is piped through vcfsamplediff (strict mode) and the
    finished VCF is cleaned with ``clean_vcf_output`` before annotation.

    :param align_bams: (list) BAM files for the batch
    :param items: (list) sample dictionaries; ``items[0]["config"]`` is used
    :param ref_file: (str) reference genome passed to FreeBayes with ``-f``
    :param assoc_files: (dict) associated files; ``dbsnp`` is optional
    :param region: region forwarded to ``_freebayes_options_from_config``
    :param out_file: (str) output VCF; defaults to
        ``<first BAM base>-paired-variants.vcf``
    :returns: the file produced by ``annotation.annotate_nongatk_vcf``
    """
    config = items[0]["config"]
    if out_file is None:
        # A leftover "-variants.vcf" default used to run first, which made
        # this paired default unreachable and let paired calls share the
        # single-sample default file name.
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    paired = get_paired_bams(align_bams, items)
    # vcfsamplediff and freebayes feed the command template via locals().
    vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config["algorithm"],
                                                           out_file, region))
            opts += " -f {}".format(ref_file)
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            cl = ("{freebayes} --pooled-discrete --pvar 0.7"
                  " --genotype-qualities {opts} {paired.tumor_bam}"
                  " {paired.normal_bam} | {vcfsamplediff} -s VT"
                  " {paired.normal_sample_name} {paired.tumor_sample_name}"
                  " - > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            cl = cl.format(**locals())
            do.run(cl, "Genotyping paired variants with FreeBayes", {})
        clean_vcf_output(out_file, _clean_freebayes_output, "nodups")
    # .get() keeps genomes without a configured dbSNP file from raising
    # KeyError, matching the other callers in this file.
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
def _run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                             region=None, out_file=None):
    """Detect indels with pindel in tumor/[normal] analysis.

    Only attempts to detect small insertion/deletions and not larger
    structural events.

    :param align_bams: (list) bam files
    :param items: (list) sample information from yaml
    :param ref_file: (str) genome in fasta format
    :param assoc_files: (dict) files for annotation
    :param region: (str or tuple) region to analyze
    :param out_file: (str) final vcf file
    :returns: (str) final vcf file
    """
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if out_file is None:
        out_file = "%s-indels.vcf" % os.path.splitext(align_bams[0])[0]
    # The tumor always goes in; the normal only when the batch has one.
    paired_bam = [paired.tumor_bam]
    paired_name = [paired.tumor_name]
    if paired.normal_bam:
        paired_bam += [paired.normal_bam]
        paired_name += [paired.normal_name]
    if not utils.file_exists(out_file):
        with tx_tmpdir(config) as tmp_path:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            # Names below feed the command template through locals().
            pindel = config_utils.get_program("pindel", config)
            root_pindel = os.path.join(tmp_path, "pindelroot")
            opts = _pindel_options(items, config, out_file, region, tmp_path)
            tmp_input = _create_tmp_input(paired_bam, paired_name, tmp_path, config)
            cmd = ("{pindel} -f {ref_file} -i {tmp_input} -o {root_pindel} "
                   "{opts} --report_inversions false --report_duplications false "
                   "--report_long_insertions false --report_breakpoints false "
                   "--report_interchromosomal_events false "
                   "--max_range_index 2")
            do.run(cmd.format(**locals()), "Genotyping with pindel", {})
            out_file = _create_vcf(root_pindel, out_file, ref_file, items, paired)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call variants on a tumor/normal pair with FreeBayes.

    A normal BAM is required (asserted). When the option builder reports
    no target regions, an empty VCF is written instead of running
    FreeBayes.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299

    Returns the file produced by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"

            freebayes = config_utils.get_program("freebayes", config)
            opts, no_target_regions = _freebayes_options_from_config(items, config,
                                                                     out_file, region)
            if no_target_regions:
                # Nothing to call: emit a header-only VCF for the named samples.
                sample_names = [x for x in [paired.tumor_name, paired.normal_name] if x]
                vcfutils.write_empty_vcf(tx_out_file, config, samples=sample_names)
            else:
                opts = " ".join(opts)
                opts += " --min-repeat-entropy 1"
                opts += " --no-partial-observations"
                opts = _add_somatic_opts(opts, paired)
                if out_file.endswith("gz"):
                    compress_cmd = "| bgzip -c"
                else:
                    compress_cmd = ""
                # Names below feed the command template through locals().
                fix_ambig = vcfutils.fix_ambiguous_cl()
                clean_fmt_cmd = _clean_freebayes_fmt_cl()
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                cl = ("{freebayes} -f {ref_file} {opts} "
                      "{paired.tumor_bam} {paired.normal_bam} "
                      """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                      "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                      "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | "
                      "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                      "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                      "vt normalize -n -r {ref_file} -q - | vcfuniqalleles "
                      "{compress_cmd} > {tx_out_file}")
                do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    FreeBayes writes the VCF itself (``-v``); the finished file is cleaned
    with ``clean_vcf_output`` and then annotated.

    :param align_bams: (list) BAM files, each indexed and passed with ``-b``
    :param items: (list) sample dictionaries; ``items[0]["config"]`` is used
    :param ref_file: (str) reference genome passed with ``-f``
    :param assoc_files: (dict) associated files; ``dbsnp`` is optional
    :param region: region forwarded to ``_freebayes_options_from_config``
    :param out_file: (str) output VCF; defaults to
        ``<first BAM base>-variants.vcf``
    :returns: the file produced by ``annotation.annotate_nongatk_vcf``
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cl = [config_utils.get_program("freebayes", config),
                  "-v", tx_out_file, "-f", ref_file, "--pvar", "0.7"]
            for align_bam in align_bams:
                bam.index(align_bam, config)
                cl += ["-b", align_bam]
            cl += _freebayes_options_from_config(items, config["algorithm"],
                                                 out_file, region)
            do.run(cl, "Genotyping with FreeBayes", {})
        clean_vcf_output(out_file, _clean_freebayes_output, "nodups")
    # .get() keeps genomes without a configured dbSNP file from raising
    # KeyError, matching the other callers in this file.
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None, somatic=None):
    """Detect SNPs and indels with FreeBayes.

    Post-filters very low quality variants, which can cause issues feeding
    into GATK, and breaks variants into individual allelic primitives for
    analysis and evaluation. When ``somatic`` is truthy it is passed to
    ``_add_somatic_opts`` to extend the FreeBayes options.

    Returns the file produced by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            # Names below feed the command template through locals().
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            # Recommended options from 1000 genomes low-complexity evaluation
            # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
            opts += " --min-repeat-entropy 1"
            if somatic:
                opts = _add_somatic_opts(opts, somatic)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            cmd = ("{freebayes} -f {ref_file} {opts} {input_bams} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | "
                   "bcftools view -a - 2> /dev/null | "
                   "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                   "vcfallelicprimitives --keep-geno | vcffixup - | vcfstreamsort | "
                   "vt normalize -n -r {ref_file} -q - 2> /dev/null | vcfuniqalleles "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call variants on a tumor/normal pair with FreeBayes.

    FreeBayes output is filtered on QUAL and passed through vcfsamplediff.

    :param align_bams: (list) BAM files for the batch
    :param items: (list) sample dictionaries; ``items[0]["config"]`` is used
    :param ref_file: (str) reference genome passed to FreeBayes with ``-f``
    :param assoc_files: (dict) associated files; ``dbsnp`` is optional
    :param region: region forwarded to ``_freebayes_options_from_config``
    :param out_file: (str) output VCF; defaults to
        ``<first BAM base>-paired-variants.vcf.gz``
    :returns: the file produced by ``annotation.annotate_nongatk_vcf``
    :raises ValueError: when the batch has no normal BAM
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")

            # Names below feed the command template through locals().
            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            opts += " -f {}".format(ref_file)
            if "--min-alternate-fraction" not in opts and "-F" not in opts:
                # add minimum reportable allele frequency, for which
                # FreeBayes defaults to 20
                min_af = float(utils.get_in(paired.tumor_config,
                                            ("algorithm", "min_allele_fraction"), 20)) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            # NOTE: -s in vcfsamplediff (strict checking: i.e., require no
            # reads in the germline to call somatic) is not used as it is
            # too stringent
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl = ("{freebayes} --pooled-discrete --genotype-qualities "
                  "{opts} {paired.tumor_bam} {paired.normal_bam} "
                  "| {vcffilter} -f 'QUAL > 1' -s "
                  "| {vcfsamplediff} VT {paired.normal_name} {paired.tumor_name} - "
                  "{compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    # .get() keeps genomes without a configured dbSNP file from raising
    # KeyError, matching the other callers in this file.
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None, somatic=None):
    """Detect SNPs and indels with FreeBayes.

    Post-filters very low quality variants, which can cause issues feeding
    into GATK, and breaks variants into individual allelic primitives for
    analysis and evaluation. When the option builder reports no target
    regions, an empty VCF is written instead of running FreeBayes.

    Returns the file produced by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            # Names below feed the command template through locals().
            freebayes = config_utils.get_program("freebayes", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts, no_target_regions = _freebayes_options_from_config(items, config,
                                                                     out_file, region)
            if no_target_regions:
                # Nothing to call: emit a header-only VCF for every sample.
                sample_names = [dd.get_sample_name(d) for d in items]
                vcfutils.write_empty_vcf(tx_out_file, config, samples=sample_names)
            else:
                opts = " ".join(opts)
                # Recommended options from 1000 genomes low-complexity evaluation
                # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
                opts += " --min-repeat-entropy 1"
                # Remove partial observations, which cause a preference for
                # heterozygote calls
                # https://github.com/ekg/freebayes/issues/234#issuecomment-205331765
                opts += " --no-partial-observations"
                if somatic:
                    opts = _add_somatic_opts(opts, somatic)
                if out_file.endswith("gz"):
                    compress_cmd = "| bgzip -c"
                else:
                    compress_cmd = ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                clean_fmt_cmd = _clean_freebayes_fmt_cl()
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                cmd = ("{freebayes} -f {ref_file} {opts} {input_bams} "
                       """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                       "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | "
                       "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                       "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                       "vt normalize -n -r {ref_file} -q - | vcfuniqalleles "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call indels on a single sample with scalpel-discovery.

    Scalpel runs in a work directory next to the transactional output; the
    directory is then moved next to ``out_file`` and its indel VCF is
    filtered, renamed to the real sample, decomposed and sorted.

    Raises ValueError when more than one BAM is supplied.
    Returns the file produced by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                raise ValueError("Scalpel does not currently support batch calling!")
            # Names below feed the command templates through locals().
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_tmp_path = "%s-scalpel-work" % utils.splitext_plus(tx_out_file)[0]
            # Clear any previous final work directory before starting.
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            min_cov = "3"  # minimum coverage
            opts = " ".join(_scalpel_options_from_config(items, config, out_file,
                                                         region, tmp_path))
            opts += " --dir %s" % tx_tmp_path
            opts += " --mincov %s" % min_cov
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            cmd = ("{perl_exports} && "
                   "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            shutil.move(tx_tmp_path, tmp_path)
            # parse produced variant file further
            raw_vcf = os.path.join(tmp_path, "variants.indel.vcf")
            scalpel_tmp_file = bgzip_and_index(raw_vcf, config)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name_str = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                   r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                   "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call variants on a tumor/normal pair with VarDict.

    Batches without a normal BAM are handed to ``_run_vardict_caller``.
    Returns the file produced by ``annotation.annotate_nongatk_vcf``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                return _run_vardict_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            # Names below feed the command template through locals().
            vardict = config_utils.get_program("vardict", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            strandbias = "testsomatic.R"
            var2vcf = "var2vcf_somatic.pl"
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            af_percent = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
            freq = float(af_percent) / 100.0
            opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            if coverage_interval == "regional":
                var2vcf_opts = " -v 50 "
            else:
                var2vcf_opts = ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| {strandbias} "
                   "| {var2vcf} -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                   "| {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                             region=None, out_file=None):
    """Detect indels with pindel in tumor/[normal] analysis.

    Only attempts to detect small insertion/deletions and not larger
    structural events.

    :param align_bams: (list) bam files
    :param items: (list) sample information from yaml
    :param ref_file: (str) genome in fasta format
    :param assoc_files: (dict) files for annotation
    :param region: (str or tuple) region to analyze
    :param out_file: (str) final vcf file
    :returns: (str) final vcf file
    """
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if out_file is None:
        out_file = "%s-indels.vcf" % os.path.splitext(align_bams[0])[0]
    # The tumor always goes in; the normal only when the batch has one.
    paired_bam = [paired.tumor_bam]
    paired_name = [paired.tumor_name]
    if paired.normal_bam:
        paired_bam += [paired.normal_bam]
        paired_name += [paired.normal_name]
    if not utils.file_exists(out_file):
        with tx_tmpdir(config) as tmp_path:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            # Names below feed the command template through locals().
            pindel = config_utils.get_program("pindel", config)
            root_pindel = os.path.join(tmp_path, "pindelroot")
            opts = _pindel_options(items, config, out_file, region, tmp_path)
            tmp_input = _create_tmp_input(paired_bam, paired_name, tmp_path, config)
            cmd = ("{pindel} -f {ref_file} -i {tmp_input} -o {root_pindel} "
                   "{opts} --report_inversions false --report_duplications false "
                   "--report_long_insertions false --report_breakpoints false "
                   "--report_interchromosomal_events false "
                   "--max_range_index 2")
            do.run(cmd.format(**locals()), "Genotyping with pindel", {})
            out_file = _create_vcf(root_pindel, out_file, ref_file, items, paired)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples. Scalpel runs in
    ``--somatic`` mode inside the transaction directory; its somatic
    calls are then concatenated with its common calls (whose ``PASS``
    is rewritten to ``REJECT``), the ``sample_name`` placeholder is
    replaced by the tumor name, and the records are sorted and
    optionally bgzipped. When the batch has no normal BAM the call is
    delegated to ``_run_scalpel_caller``.

    :param align_bams: aligned BAM files; the first one names the default output
    :param items: sample dictionaries; ``items[0]["config"]`` is the configuration
    :param ref_file: reference genome passed to Scalpel with ``--ref``
    :param assoc_files: mapping whose optional ``"dbsnp"`` entry is used for annotation
    :param region: region forwarded to ``_scalpel_options_from_config``
    :param out_file: output VCF path; derived from the first BAM when ``None``
    :returns: the value returned by ``annotation.annotate_nongatk_vcf``
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                # Without a normal sample, hand over to the single sample caller.
                return _run_scalpel_caller(align_bams, items, ref_file,
                                           assoc_files, region, out_file)
            # Names below are placeholders of the command templates, which
            # are filled from locals().
            scalpel = config_utils.get_program("scalpel", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            tmp_path = os.path.dirname(tx_out_file)
            opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
            opts += " --ref {}".format(ref_file)
            opts += " --dir %s" % tmp_path
            min_cov = "5"  # minimum coverage (default 5)
            opts += " --mincov %s" % min_cov
            cl = ("{scalpel} --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # Scalpel's result file names embed the coverage threshold.
            scalpel_tmp_file = os.path.join(tmp_path, "main/somatic." + min_cov + "x.indel.vcf")
            scalpel_tmp_file_common = os.path.join(tmp_path, "main/common." + min_cov + "x.indel.vcf")
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl2 = ("cat {scalpel_tmp_file} <(grep -vE '^#' {scalpel_tmp_file_common} | "
                   "sed 's/PASS/REJECT/g') | sed 's/sample_name/{paired.tumor_name}/g' | "
                   "{vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    # dbsnp is optional: look it up with .get() so a missing entry does not
    # raise KeyError here while the single sample fallback tolerates it.
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    Single sample mode. Scalpel runs with ``--single`` inside the
    transaction directory; its output is bgzipped and indexed, filtered
    with the "chi2" bcftools expression, the ``sample_name`` placeholder
    is replaced by ``items[0]["name"][1]``, and the records are split
    into allelic primitives, sorted and optionally bgzipped.

    :param align_bams: aligned BAM files; exactly one is supported
    :param items: sample dictionaries; ``items[0]["config"]`` is the configuration
    :param ref_file: reference genome passed to Scalpel with ``--ref``
    :param assoc_files: mapping whose optional ``"dbsnp"`` entry is used for annotation
    :param region: region forwarded to ``_scalpel_options_from_config``
    :param out_file: output VCF path; derived from the first BAM when ``None``
    :returns: the value returned by ``annotation.annotate_nongatk_vcf``
    :raises ValueError: when calling is needed and more than one BAM is given
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        # Reject unsupported input before indexing BAMs or opening a
        # transaction, so a bad call does no work first.
        if len(align_bams) > 1:
            raise ValueError("Scalpel does not currently support batch calling!")
        with file_transaction(config, out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            # Names below are placeholders of the command templates, which
            # are filled from locals().
            scalpel = config_utils.get_program("scalpel", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = os.path.dirname(tx_out_file)
            opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
            opts += " --dir %s" % tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            cmd = ("{scalpel} --single {opts} --ref {ref_file} --bam {input_bams} ")
            # first run into temp folder
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(os.path.join(tmp_path, "variants." + min_cov + "x.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name_str = items[0]["name"][1]
            cl2 = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | sed 's/sample_name/{sample_name_str}/g' | "
                   "{vcfallelicprimitives} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
def run_freebayes(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Call SNPs and indels with FreeBayes and annotate the result.

    FreeBayes writes straight to the transactional output via ``-v``;
    every input BAM is indexed through the Broad runner's
    ``picard_index`` function and added with ``-b``.

    :param align_bams: aligned BAM files; the first one names the default output
    :param items: sample dictionaries; ``items[0]["config"]`` is the configuration
    :param ref_file: reference genome passed to FreeBayes with ``-f``
    :param assoc_files: object whose ``dbsnp`` attribute is used for annotation
    :param region: region forwarded to ``_freebayes_options_from_config``
    :param out_file: output VCF path; derived from the first BAM when ``None``
    :returns: the value returned by ``annotation.annotate_nongatk_vcf``
    """
    config = items[0]["config"]
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        bam_base = os.path.splitext(align_bams[0])[0]
        out_file = "%s-variants.vcf" % bam_base
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            freebayes = config_utils.get_program("freebayes", config)
            cl = [freebayes, "-v", tx_out_file, "-f", ref_file,
                  "--use-mapping-quality", "--pvar", "0.7"]
            for bam_file in align_bams:
                broad_runner.run_fn("picard_index", bam_file)
                cl.append("-b")
                cl.append(bam_file)
            cl.extend(_freebayes_options_from_config(config["algorithm"],
                                                     out_file, region))
            do.run(cl, "Genotyping with FreeBayes", {})
        # Post-process the freshly written output file.
        _clean_freebayes_output(out_file)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.dbsnp, ref_file, config)
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None, somatic=None):
    """Call SNPs and indels with FreeBayes and annotate the result.

    The FreeBayes output is piped through a chain of clean-up steps:
    records with ``QUAL`` of 5 or less are dropped (very low quality
    variants can cause issues feeding into GATK), ambiguous bases are
    fixed, and variants are broken into individual allelic primitives,
    sorted, normalized and de-duplicated for analysis and evaluation.

    :param align_bams: aligned BAM files; the first one names the default output
    :param items: sample dictionaries; ``items[0]["config"]`` is the configuration
    :param ref_file: reference genome passed to FreeBayes and ``vt normalize``
    :param assoc_files: mapping whose optional ``"dbsnp"`` entry is used for annotation
    :param region: region forwarded to ``_freebayes_options_from_config``
    :param out_file: output VCF path; derived from the first BAM when ``None``
    :param somatic: when truthy, passed to ``_add_somatic_opts`` to extend the options
    :returns: the value returned by ``annotation.annotate_nongatk_vcf``
    """
    config = items[0]["config"]
    if out_file is None:
        bam_base = os.path.splitext(align_bams[0])[0]
        out_file = "%s-variants.vcf.gz" % bam_base
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            # Names below are placeholders of the command template, which
            # is filled from locals().
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            input_bams = " ".join("-b %s" % bam_file for bam_file in align_bams)
            base_opts = _freebayes_options_from_config(items, config, out_file, region)
            # Recommended options from 1000 genomes low-complexity evaluation
            # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
            opts = " ".join(base_opts) + " --min-repeat-entropy 1"
            if somatic:
                opts = _add_somatic_opts(opts, somatic)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # The "py" helper is looked up next to the running interpreter.
            bin_dir = os.path.dirname(sys.executable)
            py_cl = os.path.join(bin_dir, "py")
            cmd = ("{freebayes} -f {ref_file} {opts} {input_bams} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | "
                   "bcftools view -a - 2> /dev/null | "
                   "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                   "vcfallelicprimitives --keep-geno | vcffixup - | vcfstreamsort | "
                   "vt normalize -n -r {ref_file} -q - 2> /dev/null | vcfuniqalleles "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples. FreeBayes output is
    piped through ``vcfsamplediff`` and the resulting file is cleaned
    with ``clean_vcf_output`` before annotation.

    :param align_bams: aligned BAM files; the first one names the default output
    :param items: sample dictionaries; ``items[0]["config"]`` is the configuration
    :param ref_file: reference genome passed to FreeBayes with ``-f``
    :param assoc_files: mapping whose ``"dbsnp"`` entry is used for annotation
    :param region: region forwarded to ``_freebayes_options_from_config``
    :param out_file: output VCF path; derived from the first BAM when ``None``
    :returns: the value returned by ``annotation.annotate_nongatk_vcf``
    :raises ValueError: when calling is needed and the batch has no normal BAM
    """
    # Bind config up front: the annotation step at the end needs it even
    # when out_file already exists and the calling branch is skipped.
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")
            # Names below are placeholders of the command template, which
            # is filled from locals().
            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(
                _freebayes_options_from_config(items, config["algorithm"], out_file, region))
            opts += " -f {}".format(ref_file)
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            cl = ("{freebayes} --pooled-discrete --pvar 0.7"
                  " --genotype-qualities {opts} {paired.tumor_bam}"
                  " {paired.normal_bam} | {vcfsamplediff} -s VT"
                  " {paired.normal_name} {paired.tumor_name}"
                  " - > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
        # Post-process the freshly written output file.
        clean_vcf_output(out_file, _clean_freebayes_output, "nodups")
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file,
                                               config)
    return ann_file