def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Pipes the command prepared by ``samtools.prep_mpileup`` through VarScan
    ``mpileup2cns`` and writes the VCF to ``out_file``. The configuration is
    read from the first entry of ``items``.

    Raises IOError when the detected VarScan version is below 2.3.5.
    """
    config = items[0]["config"]
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numeric components: comparing the raw strings would rank
    # "v2.3.10" below "v2.3.5" and reject a newer release.
    version_parts = tuple(int(part) for part in version.lstrip("v").split(".")
                          if part.isdigit())
    if version_parts < (2, 3, 5):
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    cmd = ("{mpileup} | {remove_zerocoverage} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           " --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    cmd = cmd.format(mpileup=mpileup, remove_zerocoverage=remove_zerocoverage,
                     jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                     sample_list=sample_list, out_file=out_file)
    do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Pipes the command prepared by ``samtools.prep_mpileup`` through VarScan
    ``mpileup2cns`` and writes the VCF to ``out_file``.

    Raises IOError when the detected VarScan version is below 2.3.5.
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numeric components: comparing the raw strings would rank
    # "v2.3.10" below "v2.3.5" and reject a newer release.
    version_parts = tuple(int(part) for part in version.lstrip("v").split(".")
                          if part.isdigit())
    if version_parts < (2, 3, 5):
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # NOTE(review): the fallback lists -Xmx twice; the first entry was
    # possibly meant to be -Xms750m -- confirm before changing.
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = ("{mpileup} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           " --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    cmd = cmd.format(mpileup=mpileup, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                     sample_list=sample_list, out_file=out_file)
    do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Builds a shell pipeline from the ``samtools.prep_mpileup`` command and
    VarScan ``mpileup2cns``, redirecting the VCF into ``out_file``.

    Raises IOError when the detected VarScan version is below 2.3.5.
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # A lexicographic string comparison would treat "v2.3.10" as older than
    # "v2.3.5", so compare the numeric components instead.
    numeric_version = tuple(int(piece)
                            for piece in version.lstrip("v").split(".")
                            if piece.isdigit())
    if numeric_version < (2, 3, 5):
        raise IOError(
            "Please install version 2.3.5 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # NOTE(review): the fallback lists -Xmx twice; the first entry was
    # possibly meant to be -Xms750m -- confirm before changing.
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = (
        "{mpileup} "
        "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
        " --vcf-sample-list {sample_list} --output-vcf --variants "
        "> {out_file}")
    cmd = cmd.format(mpileup=mpileup, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                     sample_list=sample_list, out_file=out_file)
    do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _create_pileup(bam_file, data, out_base, background):
    """Create pileup calls in the regions of interest for hg19 -> GRCh37 chromosome mapping.

    Writes ``<out_base>-mpileup.txt`` unless it already exists and returns its
    path. A "chr"-prefixed copy of the background BED shipped next to the
    verifybamid2 executable is used as the target regions, and the "chr"
    prefix is stripped again from the pileup output with sed.
    """
    out_file = "%s-mpileup.txt" % out_base
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        tool_dir = os.path.dirname(os.path.realpath(utils.which("verifybamid2")))
        bed_name = "%s.%s.%s.vcf.gz.dat.bed" % (background["dataset"],
                                                background["nvars"],
                                                background["build"])
        background_bed = os.path.normpath(os.path.join(tool_dir, "resource", bed_name))
        local_name = "%s.%s-hg19.bed" % (background["dataset"], background["nvars"])
        local_bed = os.path.join(os.path.dirname(out_base), local_name)
        if not utils.file_exists(local_bed):
            # Prefix every line of the background BED with "chr"
            with file_transaction(data, local_bed) as tx_local_bed:
                with open(background_bed) as in_handle, open(tx_local_bed, "w") as out_handle:
                    out_handle.writelines("chr%s" % line for line in in_handle)
        mpileup_cl = samtools.prep_mpileup([bam_file], dd.get_ref_file(data), data["config"],
                                           want_bcf=False, target_regions=local_bed)
        cl = "{mpileup_cl} | sed 's/^chr//' > {tx_out_file}"
        do.run(cl.format(mpileup_cl=mpileup_cl, tx_out_file=tx_out_file),
               "Create pileup from BAM input")
    return out_file
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    The mpileup output is first written to a temporary file so an empty
    pileup can be detected; in that case an empty VCF is written instead of
    running VarScan. When the requested ``out_file`` ends in ".gz" the
    uncompressed VCF is bgzipped and indexed at the end.

    Raises IOError when the detected VarScan version is below 2.3.6.
    """
    config = items[0]["config"]
    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numeric components: comparing the raw strings would rank
    # "v2.3.10" below "v2.3.6" and reject a newer release.
    version_parts = tuple(int(part) for part in version.lstrip("v").split(".")
                          if part.isdigit())
    if version_parts < (2, 3, 6):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(mpileup=mpileup, remove_zerocoverage=remove_zerocoverage,
                          mpfile_tx=mpfile_tx),
               "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = (
                "cat {mpfile} "
                "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                " --vcf-sample-list {sample_list} --output-vcf --variants "
                "| {fix_ambig} | vcfuniqalleles > {out_file}")
            do.run(cmd.format(mpfile=mpfile, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                              sample_list=sample_list, fix_ambig=fix_ambig,
                              out_file=out_file),
                   "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Runs, through the shell, the ``samtools.prep_mpileup`` command piped into
    VarScan ``mpileup2cns`` with the VCF redirected to ``out_file``. A
    non-zero exit status raises ``subprocess.CalledProcessError``.
    """
    max_read_depth = "1000"
    jar_dir = config_utils.get_program("varscan", config, "dir")
    varscan_jar = config_utils.get_jar("VarScan", jar_dir)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    pipeline = ("{mpileup} "
                "| java -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                " --output-vcf --variants "
                "> {out_file}")
    shell_cmd = pipeline.format(mpileup=mpileup, varscan_jar=varscan_jar,
                                out_file=out_file)
    subprocess.check_call(shell_cmd, shell=True)
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Writes the filtered mpileup to a temporary file first; an empty pileup
    yields an empty VCF without invoking VarScan. A requested ".gz" output is
    produced by bgzipping and indexing the plain VCF at the end.

    Raises IOError when the detected VarScan version is below 2.3.6.
    """
    config = items[0]["config"]
    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # A lexicographic string comparison would treat "v2.3.10" as older than
    # "v2.3.6", so compare the numeric components instead.
    numeric_version = tuple(int(piece)
                            for piece in version.lstrip("v").split(".")
                            if piece.isdigit())
    if numeric_version < (2, 3, 6):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(mpileup=mpileup, remove_zerocoverage=remove_zerocoverage,
                          mpfile_tx=mpfile_tx),
               "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("cat {mpfile} "
                   "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                   " --vcf-sample-list {sample_list} --output-vcf --variants "
                   "| {fix_ambig} | vcfuniqalleles > {out_file}")
            do.run(cmd.format(mpfile=mpfile, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                              sample_list=sample_list, fix_ambig=fix_ambig,
                              out_file=out_file),
                   "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Runs a single shell pipeline: mpileup, zero-coverage filtering, VarScan
    ``mpileup2cns`` and several VCF clean-up steps, writing a plain VCF. The
    result is bgzipped and indexed when the requested ``out_file`` ends in
    ".gz". Configuration comes from the first entry of ``items``.
    """
    config = items[0]["config"]
    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    max_read_depth = "1000"
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
    # we use ifne from moreutils to ensure we process only on files with input, skipping otherwise
    # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html
    pipeline = (
        "{export} {mpileup} | {remove_zerocoverage} | "
        "ifne varscan {jvm_opts} mpileup2cns {opts} "
        "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants | "
        """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """
        "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | "
        "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}"
    )
    with tx_tmpdir(items[0]) as tmp_dir:
        jvm_opts = _get_jvm_opts(config, tmp_dir)
        opts = " ".join(_varscan_options_from_config(config))
        # min_allele_fraction is read as a percentage (fallback 10) and
        # divided by 100 for --min-var-freq
        af_percent = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
        min_af = float(af_percent) / 100.0
        fix_ambig_ref = vcfutils.fix_ambiguous_cl()
        fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
        py_cl = os.path.join(os.path.dirname(sys.executable), "py")
        export = utils.local_path_export()
        shell_cmd = pipeline.format(
            export=export, mpileup=mpileup, remove_zerocoverage=remove_zerocoverage,
            jvm_opts=jvm_opts, opts=opts, sample_list=sample_list, min_af=min_af,
            py_cl=py_cl, ref_file=ref_file, fix_ambig_ref=fix_ambig_ref,
            fix_ambig_alt=fix_ambig_alt, out_file=out_file)
        do.run(shell_cmd, "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Invokes the callable returned by ``samtools.prep_mpileup`` and passes its
    result to a baked ``java -jar <VarScan> mpileup2cns`` command whose
    output is written to ``out_file``.
    """
    max_read_depth = 1000
    jar_dir = config_utils.get_program("varscan", config, "dir")
    varscan_jar = config_utils.get_jar("VarScan", jar_dir)
    varscan_args = ["-jar", varscan_jar, "mpileup2cns",
                    "--min-coverage", "5",
                    "--p-value", "0.98",
                    "--output-vcf", "--variants"]
    with open(out_file, "w") as out_handle:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth,
                                        target_regions, want_bcf=False)
        varscan = sh.Command("java").bake(*varscan_args, _out=out_handle)
        varscan(mpileup())
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Runs, through the shell, the ``samtools.prep_mpileup`` command piped into
    VarScan ``mpileup2cns`` with the VCF redirected to ``out_file``. JVM
    options come from the "varscan" resources, with a built-in fallback.
    """
    max_read_depth = "1000"
    jar_dir = config_utils.get_program("varscan", config, "dir")
    varscan_jar = config_utils.get_jar("VarScan", jar_dir)
    resources = config_utils.get_resources("varscan", config)
    default_jvm_opts = ["-Xmx750m", "-Xmx2g"]
    jvm_opts = " ".join(resources.get("jvm_opts", default_jvm_opts))
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    pipeline = ("{mpileup} "
                "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                " --output-vcf --variants "
                "> {out_file}")
    shell_cmd = pipeline.format(mpileup=mpileup, jvm_opts=jvm_opts,
                                varscan_jar=varscan_jar, out_file=out_file)
    subprocess.check_call(shell_cmd, shell=True)
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Builds one shell pipeline (mpileup, zero-coverage removal, VarScan
    ``mpileup2cns``, header and ambiguity fixes, unique alleles) that writes
    an uncompressed VCF, then compresses and indexes it if the caller asked
    for a ".gz" file.
    """
    first_item = items[0]
    config = first_item["config"]
    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    max_read_depth = "1000"
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
    # we use ifne from moreutils to ensure we process only on files with input, skipping otherwise
    # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html
    with tx_tmpdir(first_item) as tmp_dir:
        fields = {"mpileup": mpileup,
                  "remove_zerocoverage": remove_zerocoverage,
                  "sample_list": sample_list,
                  "ref_file": ref_file,
                  "out_file": out_file}
        fields["jvm_opts"] = _get_jvm_opts(config, tmp_dir)
        fields["opts"] = " ".join(_varscan_options_from_config(config))
        fields["min_af"] = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
        fields["fix_ambig_ref"] = vcfutils.fix_ambiguous_cl()
        fields["fix_ambig_alt"] = vcfutils.fix_ambiguous_cl(5)
        fields["py_cl"] = os.path.join(os.path.dirname(sys.executable), "py")
        fields["export"] = utils.local_path_export()
        cmd = ("{export} {mpileup} | {remove_zerocoverage} | "
               "ifne varscan {jvm_opts} mpileup2cns {opts} "
               "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants | "
               """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """
               "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | "
               "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}")
        do.run(cmd.format(**fields), "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Opens ``out_file`` for writing and streams into it the output of a baked
    ``java -jar <VarScan> mpileup2cns`` command, which is called with the
    result of invoking the ``samtools.prep_mpileup`` callable.
    """
    max_read_depth = 1000
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    java = sh.Command("java")
    with open(out_file, "w") as out_handle:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth,
                                        target_regions, want_bcf=False)
        bake_kwargs = {"_out": out_handle}
        varscan = java.bake("-jar", varscan_jar, "mpileup2cns",
                            "--min-coverage", "5", "--p-value", "0.98",
                            "--output-vcf", "--variants", **bake_kwargs)
        varscan(mpileup())
def _get_input_args(bam_file, data, out_base):
    """Retrieve input args, depending on genome build.

    VerifyBamID2 only handles GRCh37 (1, 2, 3) not hg19, so need to generate
    a pileup for hg19 and fix chromosome naming.

    Returns ``["--PileupFile", <pileup>]`` for hg19 and
    ``["--BamFile", bam_file]`` for every other build.
    """
    if dd.get_genome_build(data) not in ["hg19"]:
        return ["--BamFile", bam_file]
    out_file = "%s-mpileup.txt" % out_base
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            ref_file = dd.get_ref_file(data)
            config = data["config"]
            autosomal_bed = _get_autosomal_bed(data, tx_out_file)
            mpileup_cl = samtools.prep_mpileup([bam_file], ref_file, config,
                                               want_bcf=False,
                                               target_regions=autosomal_bed)
            # sed strips the leading "chr" from each pileup line
            cl = "{mpileup_cl} | sed 's/^chr//' > {tx_out_file}"
            do.run(cl.format(mpileup_cl=mpileup_cl, tx_out_file=tx_out_file),
                   "Create pileup from BAM input")
    return ["--PileupFile", out_file]
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Pipes the command prepared by ``samtools.prep_mpileup`` through a
    zero-coverage filter and VarScan ``mpileup2cns``, writing the VCF to
    ``out_file``. The configuration is read from the first entry of ``items``.

    Raises IOError when the detected VarScan version is below 2.3.6.
    """
    config = items[0]["config"]
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numeric components: comparing the raw strings would rank
    # "v2.3.10" below "v2.3.6" and reject a newer release.
    version_parts = tuple(int(part) for part in version.lstrip("v").split(".")
                          if part.isdigit())
    if version_parts < (2, 3, 6):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    cmd = (
        "{mpileup} | {remove_zerocoverage} "
        "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
        " --vcf-sample-list {sample_list} --output-vcf --variants "
        "> {out_file}")
    cmd = cmd.format(mpileup=mpileup, remove_zerocoverage=remove_zerocoverage,
                     jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                     sample_list=sample_list, out_file=out_file)
    do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _create_pileup(bam_file, data, out_base, background):
    """Create pileup calls in the regions of interest for hg19 -> GRCh37 chromosome mapping.

    Returns the path ``<out_base>-mpileup.txt``, creating the file when it is
    missing. The background BED found in the "resource" directory beside the
    verifybamid2 executable is copied with a "chr" prefix on every line and
    used as target regions; sed removes the prefix from the pileup again.
    """
    out_file = "%s-mpileup.txt" % out_base
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        verifybamid_dir = os.path.dirname(os.path.realpath(utils.which("verifybamid2")))
        background_bed = os.path.normpath(os.path.join(
            verifybamid_dir, "resource",
            "%s.%s.%s.vcf.gz.dat.bed" % (background["dataset"], background["nvars"],
                                         background["build"])))
        local_bed = os.path.join(
            os.path.dirname(out_base),
            "%s.%s-hg19.bed" % (background["dataset"], background["nvars"]))
        if not utils.file_exists(local_bed):
            with file_transaction(data, local_bed) as tx_local_bed:
                with open(background_bed) as in_handle, open(tx_local_bed, "w") as out_handle:
                    for bed_line in in_handle:
                        out_handle.write("chr%s" % bed_line)
        mpileup_cl = samtools.prep_mpileup([bam_file], dd.get_ref_file(data), data["config"],
                                           want_bcf=False, target_regions=local_bed)
        template = "{mpileup_cl} | sed 's/^chr//' > {tx_out_file}"
        do.run(template.format(mpileup_cl=mpileup_cl, tx_out_file=tx_out_file),
               "Create pileup from BAM input")
    return out_file
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Streams normal and tumor mpileups into ``varscan somatic``, post-processes
    the SNP and indel VCFs it produces, and combines them into ``out_file``.
    Nothing is done when ``out_file`` already exists.

    Raises ValueError when the pairing has no normal BAM.
    """
    max_read_depth = "1000"
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)

    if utils.file_exists(out_file):
        return
    assert out_file.endswith(".vcf.gz"), "Expect bgzipped output to VarScan"
    normal_mpileup_cl = samtools.prep_mpileup([paired.normal_bam], ref_file,
                                              config, max_read_depth,
                                              target_regions=target_regions,
                                              want_bcf=False)
    tumor_mpileup_cl = samtools.prep_mpileup([paired.tumor_bam], ref_file,
                                             config, max_read_depth,
                                             target_regions=target_regions,
                                             want_bcf=False)
    base = utils.splitext_plus(out_file)[0]
    indel_file = base + "-indel.vcf"
    snp_file = base + "-snp.vcf"
    with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            remove_zerocoverage = r"ifne grep -v -P '\t0\t\t$'"
            varscan_cmd = ("varscan {jvm_opts} somatic "
                           " <({normal_mpileup_cl} | {remove_zerocoverage}) "
                           "<({tumor_mpileup_cl} | {remove_zerocoverage}) "
                           "--output-snp {tx_snp} --output-indel {tx_indel} "
                           " --output-vcf --min-coverage 5 --p-value 0.98 "
                           "--strand-filter 1 ")
            cmd_fields = {"jvm_opts": jvm_opts,
                          "normal_mpileup_cl": normal_mpileup_cl,
                          "tumor_mpileup_cl": tumor_mpileup_cl,
                          "remove_zerocoverage": remove_zerocoverage,
                          "tx_snp": tx_snp,
                          "tx_indel": tx_indel}
            # add minimum AF
            if "--min-var-freq" not in varscan_cmd:
                af_percent = utils.get_in(paired.tumor_config,
                                          ("algorithm", "min_allele_fraction"), 10)
                cmd_fields["min_af"] = float(af_percent) / 100.0
                varscan_cmd += "--min-var-freq {min_af} "
            do.run(varscan_cmd.format(**cmd_fields), "Varscan", None, None)

    to_combine = []
    for fname in [snp_file, indel_file]:
        if not utils.file_exists(fname):
            continue
        fix_file = "%s-fix.vcf.gz" % (utils.splitext_plus(fname)[0])
        with file_transaction(config, fix_file) as tx_fix_file:
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            cmd = ("cat {fname} | "
                   "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x,"
                   """ "{normal_name}", "{tumor_name}")' | """
                   "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles | "
                   """bcftools filter -m + -s REJECT -e "SS != '.' && SS != '2'" 2> /dev/null | """
                   "{py_cl} -x 'bcbio.variation.varscan.spv_freq_filter(x, 1)' | "
                   "bgzip -c > {tx_fix_file}")
            do.run(cmd.format(fname=fname, py_cl=py_cl,
                              normal_name=paired.normal_name,
                              tumor_name=paired.tumor_name,
                              fix_ambig_ref=fix_ambig_ref,
                              fix_ambig_alt=fix_ambig_alt,
                              tx_fix_file=tx_fix_file),
                   "Varscan paired fix")
        to_combine.append(fix_file)

    if to_combine:
        out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                         region=target_regions)
    else:
        out_file = write_empty_vcf(out_file, config)
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    if out_file.endswith(".gz"):
        out_file = bgzip_and_index(out_file, config)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes normal and tumor mpileups to temporary files, runs
    ``VarScan somatic`` on them, fixes the SNP and indel VCFs it produces and
    combines them into the final output. Nothing is done when ``out_file``
    already exists.

    Raises IOError when the detected VarScan version is below 2.3.6 and
    ValueError when the pairing has no normal BAM.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("Require both tumor and normal BAM files for VarScan cancer calling")

    if not file_exists(out_file):
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        base, ext = utils.splitext_plus(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file, max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                       mpfile_tx=mpfile_tx)
                do.run(cmd, "samtools mpileup", None, [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(orig_out_file, config)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)

        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            with utils.curdir_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                               " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                               " --output-vcf --min-coverage 5 --p-value 0.98 "
                               "--strand-filter 1 ")
                # add minimum AF
                if "--min-var-freq" not in varscan_cmd:
                    min_af = float(utils.get_in(paired.tumor_config,
                                                ("algorithm", "min_allele_fraction"),
                                                10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                do.run(varscan_cmd.format(jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                                          normal_tmp_mpileup=normal_tmp_mpileup,
                                          tumor_tmp_mpileup=tumor_tmp_mpileup,
                                          base=base, min_af=min_af),
                       "Varscan", None, None)

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        #
        # Check the filesystem directly: do.file_exists is also handed to
        # do.run above as a deferred check, so its return value is presumably
        # a checker object that is always truthy -- TODO confirm.
        to_combine = []
        for vcf_file in [snp_file, indel_file]:
            if os.path.exists(vcf_file):
                to_combine.append(vcf_file)
                _fix_varscan_vcf(vcf_file, paired.normal_name, paired.tumor_name)

        if not to_combine:
            write_empty_vcf(orig_out_file, config)
            return

        # Only combine the files that were actually produced
        out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                         region=target_regions)

        # Remove cleanup files
        for extra_file in cleanup_files:
            for ext in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + ext):
                    os.remove(extra_file + ext)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)

        if orig_out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)

        _add_reject_flag(out_file, config)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes normal and tumor mpileups to temporary files, runs
    ``VarScan somatic`` on them, fixes the SNP and indel VCFs it produces and
    combines them into ``out_file``. Nothing is done when ``out_file``
    already exists.

    Raises IOError when the detected VarScan version is below 2.3.6 and
    ValueError when the pairing has no normal BAM.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError(
            "Require both tumor and normal BAM files for VarScan cancer calling"
        )

    if not file_exists(out_file):
        base, ext = os.path.splitext(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file, max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                       mpfile_tx=mpfile_tx)
                do.run(cmd, "samtools mpileup", None, [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(out_file)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        jvm_opts = _get_varscan_opts(config)
        varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98 "
                       "--strand-filter 1 ")

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)

        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            varscan_cmd = varscan_cmd.format(jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                                             normal_tmp_mpileup=normal_tmp_mpileup,
                                             tumor_tmp_mpileup=tumor_tmp_mpileup,
                                             base=base)
            do.run(varscan_cmd, "Varscan", None, None)

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        #
        # Check the filesystem directly: do.file_exists is also handed to
        # do.run above as a deferred check, so its return value is presumably
        # a checker object that is always truthy -- TODO confirm.
        to_combine = []
        for vcf_file in [snp_file, indel_file]:
            if os.path.exists(vcf_file):
                to_combine.append(vcf_file)
                _fix_varscan_vcf(vcf_file, paired.normal_name, paired.tumor_name)

        if not to_combine:
            write_empty_vcf(out_file)
            return

        # Only combine the files that were actually produced
        out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                         region=target_regions)

        # Remove cleanup files; skip any VarScan output that was never written
        for extra_file in cleanup_files:
            if os.path.exists(extra_file):
                os.remove(extra_file)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes normal and tumor mpileups to temporary files, runs
    ``VarScan somatic`` on them, fixes the SNP and indel VCFs it produces and
    combines them into ``out_file``. Nothing is done when ``out_file``
    already exists.

    Raises IOError when the detected VarScan version is below 2.3.6.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    tumor_bam, tumor_name, normal_bam, normal_name = get_paired_bams(
        align_bams, items)

    if not file_exists(out_file):
        base, ext = os.path.splitext(out_file)
        cleanup_files = []
        for fname, mpext in [(normal_bam, "normal"), (tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file, max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                       mpfile_tx=mpfile_tx)
                do.run(cmd, "samtools mpileup", None, [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(out_file)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        jvm_opts = _get_varscan_opts(config)
        varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98")

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)

        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            varscan_cmd = varscan_cmd.format(jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                                             normal_tmp_mpileup=normal_tmp_mpileup,
                                             tumor_tmp_mpileup=tumor_tmp_mpileup,
                                             base=base)
            do.run(varscan_cmd, "Varscan", None, None)

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        #
        # Check the filesystem directly: do.file_exists is also handed to
        # do.run above as a deferred check, so its return value is presumably
        # a checker object that is always truthy -- TODO confirm.
        to_combine = []
        for vcf_file in [snp_file, indel_file]:
            if os.path.exists(vcf_file):
                to_combine.append(vcf_file)
                _fix_varscan_vcf(vcf_file, normal_name, tumor_name)

        if not to_combine:
            write_empty_vcf(out_file)
            return

        # Only combine the files that were actually produced
        out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                         region=target_regions)

        # Remove cleanup files; skip any VarScan output that was never written
        for extra_file in cleanup_files:
            if os.path.exists(extra_file):
                os.remove(extra_file)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes normal and tumor mpileups to temporary files, runs
    ``VarScan somatic`` on them, passes its SNP and indel output through
    ``vcfuniqalleles`` and the ambiguity fix, corrects the VCFs and combines
    them into the final output. Nothing is done when ``out_file`` already
    exists.

    Raises IOError when the detected VarScan version is below 2.3.6 and
    ValueError when the pairing has no normal BAM.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)

    if not file_exists(out_file):
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        base, ext = utils.splitext_plus(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(config, mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file, config, max_read_depth,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                       mpfile_tx=mpfile_tx)
                do.run(cmd, "samtools mpileup", None, [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(orig_out_file, config)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)

        with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
            with tx_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                fix_ambig = vcfutils.fix_ambiguous_cl()
                # VarScan writes to "-orig" prefixes; the cleaned result goes
                # to the transactional output names below
                tx_snp_in = "%s-orig" % os.path.splitext(tx_snp)[0]
                tx_indel_in = "%s-orig" % os.path.splitext(tx_indel)[0]
                varscan_cmd = (
                    "java {jvm_opts} -jar {varscan_jar} somatic"
                    " {normal_tmp_mpileup} {tumor_tmp_mpileup} "
                    "--output-snp {tx_snp_in} --output-indel {tx_indel_in} "
                    " --output-vcf --min-coverage 5 --p-value 0.98 "
                    "--strand-filter 1 ")
                # add minimum AF
                if "--min-var-freq" not in varscan_cmd:
                    min_af = float(
                        utils.get_in(paired.tumor_config,
                                     ("algorithm", "min_allele_fraction"),
                                     10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                do.run(varscan_cmd.format(jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                                          normal_tmp_mpileup=normal_tmp_mpileup,
                                          tumor_tmp_mpileup=tumor_tmp_mpileup,
                                          tx_snp_in=tx_snp_in, tx_indel_in=tx_indel_in,
                                          min_af=min_af),
                       "Varscan", None, None)
                for orig_fname, fname in [(tx_snp_in, tx_snp),
                                          (tx_indel_in, tx_indel)]:
                    cmd = "vcfuniqalleles {orig_fname}.vcf | {fix_ambig} > {fname}"
                    do.run(cmd.format(orig_fname=orig_fname, fix_ambig=fix_ambig,
                                      fname=fname),
                           "Varscan paired fix")

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        #
        # Check the filesystem directly: do.file_exists is also handed to
        # do.run above as a deferred check, so its return value is presumably
        # a checker object that is always truthy -- TODO confirm.
        to_combine = []
        for vcf_file in [snp_file, indel_file]:
            if os.path.exists(vcf_file):
                to_combine.append(vcf_file)
                _fix_varscan_vcf(vcf_file, paired.normal_name, paired.tumor_name,
                                 config)

        if not to_combine:
            write_empty_vcf(orig_out_file, config)
            return

        # Only combine the files that were actually produced
        out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                         region=target_regions)

        # Remove cleanup files
        for extra_file in cleanup_files:
            for ext in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + ext):
                    os.remove(extra_file + ext)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)

        if orig_out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)

        _add_reject_flag(out_file, config)