def get_version(config):
    """Derive a program version string from the name of its installed jar.

    Uses ``program_name`` and ``jar_name`` from the enclosing scope. The jar's
    base name has the jar name, ".jar" and "-standalone" removed, followed by
    one leading "-" or "." separator.

    Returns an empty string when the program directory is not configured, or
    when nothing remains after stripping (a warning is logged in that case).
    """
    try:
        pdir = config_utils.get_program(program_name, config, "dir")
    # not configured
    except ValueError:
        return ""
    # Resolve the jar once and reuse the path for the warning message.
    jar_path = config_utils.get_jar(jar_name, pdir)
    jar = os.path.basename(jar_path)
    for to_remove in [jar_name, ".jar", "-standalone"]:
        jar = jar.replace(to_remove, "")
    if jar.startswith(("-", ".")):
        jar = jar[1:]
    if not jar:
        # logger.warn is a deprecated alias of logger.warning
        logger.warning("Unable to determine version for program '{}' from jar file {}".format(
            program_name, jar_path))
    return jar
def illumina_qual_bin(in_file, ref_file, out_dir, config):
    """Bin the quality scores of an existing BAM file with CRAM (Illumina 8-bin).

    Quality scores are binned following the Illumina scheme:

    http://www.illumina.com/Documents/products/whitepapers/whitepaper_datacompression.pdf

    The original BAM header is re-applied to the output to drop the extra
    run groups CRAM adds during conversion. Returns the output file path.
    """
    index_file = ref_file + ".fai"
    assert os.path.exists(index_file), "Could not find FASTA reference index: %s" % index_file
    base, ext = os.path.splitext(os.path.basename(in_file))
    out_file = os.path.join(out_dir, "%s-qualbin%s" % (base, ext))
    cram_dir = config_utils.get_program("cram", config, "dir")
    cram_jar = config_utils.get_jar("cramtools", cram_dir)
    samtools = config_utils.get_program("samtools", config)
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        orig_header = "%s-header.sam" % os.path.splitext(out_file)[0]
        header_template = "{samtools} view -H -o {orig_header} {in_file}"
        binning_template = ("java -jar {cram_jar} cram --input-bam-file {in_file} "
                            " --reference-fasta-file {ref_file} --preserve-read-names "
                            " --capture-all-tags --lossy-quality-score-spec '*8' "
                            "| java -jar {cram_jar} bam --output-bam-format "
                            " --reference-fasta-file {ref_file} "
                            "| {samtools} reheader {orig_header} - "
                            "> {tx_out_file}")
        logger.info("Quality binning with CRAM")
        # Save the input header first so it can be re-applied after conversion.
        subprocess.check_call(
            header_template.format(samtools=samtools, orig_header=orig_header,
                                   in_file=in_file),
            shell=True)
        subprocess.check_call(
            binning_template.format(cram_jar=cram_jar, in_file=in_file,
                                    ref_file=ref_file, samtools=samtools,
                                    orig_header=orig_header,
                                    tx_out_file=tx_out_file),
            shell=True)
    return out_file
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Pipes samtools mpileup output into VarScan ``mpileup2cns`` and writes VCF
    to ``out_file``. The temporary sample list is removed afterwards and an
    empty ``out_file`` is replaced by a correctly formatted empty VCF.

    Raises IOError when the detected VarScan version is older than 2.3.5
    (including when no version can be determined).
    """
    import re  # local import: only needed to parse the version string

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numeric components: plain string comparison mis-orders
    # releases such as 2.3.10 versus 2.3.5.
    if tuple(int(x) for x in re.findall(r"\d+", version)) < (2, 3, 5):
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # Default is initial heap (-Xms) plus maximum heap (-Xmx); the previous
    # default passed -Xmx twice.
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = ("{mpileup} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           " --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    cmd = cmd.format(mpileup=mpileup, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                     sample_list=sample_list, out_file=out_file)
    do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def run(data):
    """Run Oncofuse fusion detection on the sample's fusion junction file.

    Returns the path to ``oncofuse_out.txt`` in the input directory, or None
    when the genome build is neither GRCh37 nor hg19 or the input file does
    not exist. An existing output file is returned without re-running.
    """
    #cmd line: java -Xmx1G -jar Oncofuse.jar input_file input_type tissue_type output_file
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == "GRCh37":  # assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ["star"]:
            input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ["tophat", "tophat2"]:
            input_file = _fix_tophat_junction_output(input_file)
    elif "hg19" not in genome_build:
        return None
    #handle cases when fusion file doesn't exist
    if not file_exists(input_file):
        return None
    out_file = os.path.join(input_dir, "oncofuse_out.txt")
    if file_exists(out_file):
        return out_file
    oncofuse_jar = config_utils.get_jar("Oncofuse",
                                        config_utils.get_program("oncofuse", config, "dir"))
    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    cl = ["java"]
    cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
    with file_transaction(data, out_file) as tx_out_file:
        cl += ["-jar", oncofuse_jar, input_file, input_type, tissue_type, tx_out_file]
        cmd = " ".join(cl)
        try:
            do.run(cmd, "oncofuse fusion detection", data)
        # Best effort: a failed run is recorded in the output file rather than
        # aborting. Catch Exception, not everything, so that KeyboardInterrupt
        # and SystemExit still propagate.
        except Exception:
            do.run("touch %s && echo '# failed' >> %s" % (tx_out_file, tx_out_file),
                   "oncofuse failed", data)
    return out_file
def summary(samples, config):
    """Summarize coverage for the samples across regions of interest.

    Runs bcbio.coverage ``multicompare`` unless its output already exists and
    attaches the summary file to every sample under ``coverage``. Returns the
    samples, each wrapped in its own single-item list. Without a configured
    bcbio.coverage jar the samples are returned unchanged.
    """
    try:
        coverage_dir = config_utils.get_program("bcbio_coverage", config, "dir")
        bc_jar = config_utils.get_jar("bcbio.coverage", coverage_dir)
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    # Work on a copy so the caller's configuration is left untouched.
    config = copy.deepcopy(config)
    config["algorithm"]["memory_adjust"] = {
        "direction": "increase",
        "magnitude": config["algorithm"].get("num_cores", 1),
    }
    default_opts = ["-Xms750m", "-Xmx2g"]
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", default_opts), config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            java_args = ["-Djava.io.tmpdir=%s" % tmp_dir, "-Djava.awt.headless=true"]
            num_cores = str(config["algorithm"].get("num_cores", 1))
            tool_args = ["-jar", bc_jar, "multicompare", config_file, tx_out_file,
                         "-c", num_cores]
            cmd = ["java"] + jvm_opts + java_args + tool_args
            do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for sample in samples:
        sample["coverage"] = {"summary": out_file}
        out.append([sample])
    return out
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Pipes samtools mpileup output, with zero coverage lines filtered out, into
    VarScan ``mpileup2cns`` and writes VCF to ``out_file``. The temporary
    sample list is removed afterwards and an empty ``out_file`` is replaced by
    a correctly formatted empty VCF. Configuration is read from ``items[0]``.

    Raises IOError when the detected VarScan version is older than 2.3.5
    (including when no version can be determined).
    """
    import re  # local import: only needed to parse the version string

    config = items[0]["config"]
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numeric components: plain string comparison mis-orders
    # releases such as 2.3.10 versus 2.3.5.
    if tuple(int(x) for x in re.findall(r"\d+", version)) < (2, 3, 5):
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    cmd = ("{mpileup} | {remove_zerocoverage} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           " --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    cmd = cmd.format(mpileup=mpileup, remove_zerocoverage=remove_zerocoverage,
                     jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                     sample_list=sample_list, out_file=out_file)
    do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def sort_by_ref(vcf_file, data):
    """Sort a VCF file by genome reference and position using bcbio.variation.

    Returns the path of the sorted file, named after the input with a
    ``-prep`` suffix before the extension. Nothing is run when it exists.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "%s-prep%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    config = data["config"]
    bv_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", bv_dir)
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    ref_base = tz.get_in(["reference", "fasta", "base"], data)
    sort_args = ["-jar", bv_jar, "variant-utils", "sort-vcf", vcf_file, ref_base, "--sortpos"]
    do.run(["java"] + jvm_opts + sort_args, "Sort VCF by reference")
    return out_file
def bcbio_variation_comparison(config_file, base_dir, data):
    """Compare variant calls with the bcbio.variation toolkit.

    Runs ``variant-compare`` on the supplied configuration file, using a
    ``tmp`` directory under ``base_dir`` for the default Broad JVM options.
    """
    config = data["config"]
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", bv_dir)
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    compare_args = ["-jar", bv_jar, "variant-compare", config_file]
    cmd = ["java"] + jvm_opts + broad.get_default_jvm_opts(tmp_dir) + compare_args
    do.run(cmd, "Comparing variant calls using bcbio.variation", data)
def get_version(config):
    """Get a program version by running its jar through the command line check.

    Uses ``pname``, ``jar_name`` and ``kwargs`` from the enclosing scope;
    ``kwargs`` is updated in place with the java command and arguments.
    Returns an empty string when the program directory is not configured.
    """
    try:
        pdir = config_utils.get_program(pname, config, "dir")
    except ValueError:
        return ""
    jar_path = config_utils.get_jar(jar_name, pdir)
    java_args = "-Xms128m -Xmx256m -jar %s" % jar_path
    kwargs["cmd"] = "java"
    kwargs["args"] = java_args
    return _get_cl_version(kwargs, config)
def trim_adapters(fastq_files, dirs, config):
    """Trim adapter sequences from one or two FASTQ files with AlienTrimmer.

    The adapters to trim are written one per line to a temporary file that is
    passed to AlienTrimmer and removed once the run has finished. Falls back
    to ``trim_read_through`` when the AlienTrimmer directory cannot be
    looked up.

    Returns a list with the trimmed output file(s). Existing outputs are
    returned without re-running.
    """
    import os  # local import: only needed to remove the adapter temp file

    QUALITY_CUTOFF = 5
    to_trim = _get_sequences_to_trim(config, ALIENTRIMMER_ADAPTERS)
    resources = config_utils.get_resources("AlienTrimmer", config)
    try:
        jarpath = config_utils.get_program("AlienTrimmer", config, "dir")
    # fall back on Cutadapt if AlienTrimmer is not installed
    # XXX: remove after it has been live for a while
    # Catch Exception, not everything, so KeyboardInterrupt/SystemExit propagate.
    except Exception:
        return trim_read_through(fastq_files, dirs, config)
    jarfile = config_utils.get_jar("AlienTrimmer", jarpath)
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]))
    base_cmd = ("java -jar {jvm_opts} {jarfile} -k 10 -l 20 ")
    fastq1 = fastq_files[0]
    supplied_quality_format = _get_quality_format(config)
    out_files = _get_read_through_trimmed_outfiles(fastq_files, dirs)
    fastq1_out = out_files[0]
    if supplied_quality_format == "illumina":
        quality_flag = QUALITY_FLAGS[QUALITY_CUTOFF][0]
    else:
        quality_flag = QUALITY_FLAGS[QUALITY_CUTOFF][1]
    quality_flag = '-q ' + quality_flag
    single_end = len(fastq_files) == 1
    if single_end:
        if file_exists(fastq1_out):
            return [fastq1_out]
        base_cmd += ("-i {fastq1} -o {tx_fastq1_out} -c {temp_file} "
                     "{quality_flag}")
        message = "Trimming %s from %s with AlienTrimmer." % (to_trim, fastq1)
    else:
        fastq2 = fastq_files[1]
        fastq2_out = out_files[1]
        if all(map(file_exists, [fastq1_out, fastq2_out])):
            return [fastq1_out, fastq2_out]
        base_cmd += ("-if {fastq1} -ir {fastq2} -of {tx_fastq1_out} "
                     "-or {tx_fastq2_out} -c {temp_file} {quality_flag}")
        message = ("Trimming %s from %s and %s with AlienTrimmer."
                   % (to_trim, fastq1, fastq2))
    # Text mode so the str adapter lines can be written on Python 2 and 3.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp:
        temp_file = temp.name
        for adapter in to_trim:
            temp.write(adapter + "\n")
    try:
        if single_end:
            with file_transaction(fastq1_out) as tx_fastq1_out:
                cmd = base_cmd.format(jvm_opts=jvm_opts, jarfile=jarfile, fastq1=fastq1,
                                      tx_fastq1_out=tx_fastq1_out, temp_file=temp_file,
                                      quality_flag=quality_flag)
                do.run(cmd, message)
            return [fastq1_out]
        with file_transaction([fastq1_out, fastq2_out]) as tx_out_files:
            tx_fastq1_out = tx_out_files[0]
            tx_fastq2_out = tx_out_files[1]
            cmd = base_cmd.format(jvm_opts=jvm_opts, jarfile=jarfile, fastq1=fastq1,
                                  fastq2=fastq2, tx_fastq1_out=tx_fastq1_out,
                                  tx_fastq2_out=tx_fastq2_out, temp_file=temp_file,
                                  quality_flag=quality_flag)
            do.run(cmd, message)
        return [fastq1_out, fastq2_out]
    finally:
        # The adapter file was created with delete=False; remove it ourselves.
        os.remove(temp_file)
def rnaseqc_runner_from_config(config):
    """
    build an RNASeQCRunner for Broad's RNA-SeQC tool from a bcbio-nextgen
    config dict: the RNA-SeQC jar, the bwa program and the JVM options

    """
    jvm_opts = config_utils.get_resources("rnaseqc", config).get(
        "jvm_opts", ["-Xms750m", "-Xmx2g"])
    bwa_path = config_utils.get_program("bwa", config)
    rnaseqc_path = config_utils.get_jar(
        "RNA-SeQC", config_utils.get_program("rnaseqc", config, "dir"))
    return RNASeQCRunner(rnaseqc_path, bwa_path, jvm_opts)
def bcbio_variation_comparison(config_file, base_dir, data):
    """Compare variant calls with the bcbio.variation toolkit.

    Runs ``variant-compare`` on the supplied configuration file, using a
    ``tmp`` directory under ``base_dir`` for the default Broad JVM options.
    """
    config = data["config"]
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", bv_dir)
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    compare_args = ["-jar", bv_jar, "variant-compare", config_file]
    cmd = ["java"] + jvm_opts + broad.get_default_jvm_opts(tmp_dir) + compare_args
    do.run(cmd, "Comparing variant calls using bcbio.variation", data)
def get_version(config):
    """Derive a program version string from the name of its installed jar.

    Uses ``program_name`` and ``jar_name`` from the enclosing scope. Returns
    an empty string when the program directory is not configured.
    """
    try:
        pdir = config_utils.get_program(program_name, config, "dir")
    # not configured
    except ValueError:
        return ""
    version = os.path.basename(config_utils.get_jar(jar_name, pdir))
    # Strip everything that is not part of the version itself.
    for token in (jar_name, ".jar", "-standalone"):
        version = version.replace(token, "")
    # Drop a single leading separator left over from the jar name.
    return version[1:] if version.startswith(("-", ".")) else version
def bcbio_variation_comparison(config_file, base_dir, data):
    """Compare variant calls with the bcbio.variation toolkit.

    Logs the ``variant-compare`` command line and runs it as a subprocess,
    pointing the JVM temporary directory at ``tmp`` under ``base_dir``.
    """
    config = data["config"]
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", bv_dir)
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    compare_args = ["-jar", bv_jar, "variant-compare", config_file]
    cmd = ["java"] + jvm_opts + java_args + compare_args
    log_cmd("Comparing variant calls using bcbio.variation", data, " ".join(cmd))
    subprocess.check_call(cmd)
def get_cmd(cmd_name, datadir, config):
    """Build the base snpEff command line for ``cmd_name``.

    Uses the snpEff program from the configuration when it can be found and
    otherwise runs the snpEff jar with java, using the config file that sits
    next to the jar.
    """
    resources = config_utils.get_resources("snpeff", config)
    memory = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"]))
    try:
        snpeff = config_utils.get_program("snpEff", config)
    except config_utils.CmdNotFound:
        snpeff_dir = config_utils.get_program("snpeff", config, "dir")
        snpeff_jar = config_utils.get_jar("snpEff", snpeff_dir)
        config_file = "%s.config" % os.path.splitext(snpeff_jar)[0]
        jar_template = "java {memory} -jar {snpeff_jar} {cmd_name} -c {config_file} -dataDir {datadir}"
        return jar_template.format(memory=memory, snpeff_jar=snpeff_jar, cmd_name=cmd_name,
                                   config_file=config_file, datadir=datadir)
    cl_template = "{snpeff} {memory} {cmd_name} -dataDir {datadir}"
    return cl_template.format(snpeff=snpeff, memory=memory, cmd_name=cmd_name,
                              datadir=datadir)
def _bcbio_variation_ensemble(vrn_files, out_file, ref_file, config_file, base_dir, config):
    """Run bcbio.variation ``variant-ensemble`` over the input variant files.

    The command is executed from within ``base_dir``, with the JVM temporary
    directory set to ``tmp`` underneath it.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", bv_dir)
    resources = config_utils.get_resources("bcbio_variation", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    ensemble_args = ["-jar", bv_jar, "variant-ensemble", config_file, ref_file, out_file]
    cmd = ["java"] + jvm_opts + java_args + ensemble_args + vrn_files
    message = "Ensemble calling: %s" % os.path.basename(base_dir)
    with utils.chdir(base_dir):
        do.run(cmd, message)
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Call SNPs and indels with VarScan from a samtools mpileup stream.

    The mpileup command is piped into VarScan ``mpileup2cns`` through the
    shell and the VCF output is redirected to ``out_file``.
    """
    max_read_depth = "1000"
    varscan_dir = config_utils.get_program("varscan", config, "dir")
    varscan_jar = config_utils.get_jar("VarScan", varscan_dir)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    template = ("{mpileup} "
                "| java -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                " --output-vcf --variants "
                "> {out_file}")
    full_cmd = template.format(mpileup=mpileup, varscan_jar=varscan_jar, out_file=out_file)
    subprocess.check_call(full_cmd, shell=True)
def sort_by_ref(vcf_file, data):
    """Sort a VCF file by genome reference and position using bcbio.variation.

    Returns the path of the sorted file, named after the input with a
    ``-prep`` suffix before the extension. Nothing is run when it exists.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "%s-prep%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    config = data["config"]
    bv_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", bv_dir)
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    sort_args = ["-jar", bv_jar, "variant-utils", "sort-vcf", vcf_file,
                 dd.get_ref_file(data), "--sortpos"]
    do.run(["java"] + jvm_opts + sort_args, "Sort VCF by reference")
    return out_file
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Writes a filtered mpileup to a temporary file first so an empty pileup can
    be detected; in that case an empty VCF is written instead of running
    VarScan. Otherwise VarScan ``mpileup2cns`` output is passed through the
    ambiguous base fix and ``vcfuniqalleles`` into an uncompressed VCF, which
    is then cleaned. When ``out_file`` ends in ".gz" the result is bgzipped
    and indexed. Configuration is read from ``items[0]``.

    Raises IOError when the detected VarScan version is older than 2.3.6
    (including when no version can be determined).
    """
    import re  # local import: only needed to parse the version string

    config = items[0]["config"]
    orig_out_file = out_file
    # Work on the uncompressed name; compression happens at the very end.
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numeric components: plain string comparison mis-orders
    # releases such as 2.3.10 versus 2.3.6.
    if tuple(int(x) for x in re.findall(r"\d+", version)) < (2, 3, 6):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(mpileup=mpileup, remove_zerocoverage=remove_zerocoverage,
                          mpfile_tx=mpfile_tx),
               "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("cat {mpfile} "
                   "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                   " --vcf-sample-list {sample_list} --output-vcf --variants "
                   "| {fix_ambig} | vcfuniqalleles > {out_file}")
            do.run(cmd.format(mpfile=mpfile, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                              sample_list=sample_list, fix_ambig=fix_ambig,
                              out_file=out_file),
                   "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)
    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Call SNPs and indels with VarScan, fed by a samtools mpileup command.

    VarScan ``mpileup2cns`` is run through ``sh`` with its output written to
    ``out_file``.
    """
    max_read_depth = 1000
    varscan_dir = config_utils.get_program("varscan", config, "dir")
    varscan_jar = config_utils.get_jar("VarScan", varscan_dir)
    varscan_args = ["-jar", varscan_jar, "mpileup2cns", "--min-coverage", "5",
                    "--p-value", "0.98", "--output-vcf", "--variants"]
    with open(out_file, "w") as out_handle:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth,
                                        target_regions, want_bcf=False)
        varscan = sh.Command("java").bake(*varscan_args, _out=out_handle)
        varscan(mpileup())
def get_cmd(cmd_name, datadir, config):
    """Build the base snpEff command line for ``cmd_name``.

    Uses the snpEff program from the configuration when it can be found and
    otherwise runs the snpEff jar with java, using the config file that sits
    next to the jar.
    """
    resources = config_utils.get_resources("snpeff", config)
    memory = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"]))
    try:
        snpeff = config_utils.get_program("snpEff", config)
    except config_utils.CmdNotFound:
        snpeff_dir = config_utils.get_program("snpeff", config, "dir")
        snpeff_jar = config_utils.get_jar("snpEff", snpeff_dir)
        config_file = "%s.config" % os.path.splitext(snpeff_jar)[0]
        jar_template = "java {memory} -jar {snpeff_jar} {cmd_name} -c {config_file} -dataDir {datadir}"
        return jar_template.format(memory=memory, snpeff_jar=snpeff_jar, cmd_name=cmd_name,
                                   config_file=config_file, datadir=datadir)
    cl_template = "{snpeff} {memory} {cmd_name} -dataDir {datadir}"
    return cl_template.format(snpeff=snpeff, memory=memory, cmd_name=cmd_name,
                              datadir=datadir)
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Writes a filtered mpileup to a temporary file first so an empty pileup can
    be detected; in that case an empty VCF is written instead of running
    VarScan. Non-empty VarScan output is cleaned afterwards. Configuration is
    read from ``items[0]``.

    Raises IOError when the detected VarScan version is older than 2.3.6
    (including when no version can be determined).
    """
    import re  # local import: only needed to parse the version string

    config = items[0]["config"]
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numeric components: plain string comparison mis-orders
    # releases such as 2.3.10 versus 2.3.6.
    if tuple(int(x) for x in re.findall(r"\d+", version)) < (2, 3, 6):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(mpfile) as mpfile_tx:
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(mpileup=mpileup, remove_zerocoverage=remove_zerocoverage,
                          mpfile_tx=mpfile_tx),
               "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        cmd = ("cat {mpfile} "
               "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
               " --vcf-sample-list {sample_list} --output-vcf --variants "
               "> {out_file}")
        do.run(cmd.format(mpfile=mpfile, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                          sample_list=sample_list, out_file=out_file),
               "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line)
def bcbio_variation_comparison(config_file, base_dir, data):
    """Run a variant comparison using the bcbio.variation toolkit, given an input configuration.

    Builds the java ``variant-compare`` command, with the JVM temporary
    directory set to ``tmp`` under ``base_dir``, and runs it once via
    ``do.run``.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_jar = config_utils.get_jar(
        "bcbio.variation",
        config_utils.get_program("bcbio_variation", data["config"], "dir"))
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = ["java"] + jvm_opts + java_args + [
        "-jar", bv_jar, "variant-compare", config_file
    ]
    # do.run executes the command; the extra subprocess.check_call that
    # followed it ran the whole comparison a second time.
    do.run(cmd, "Comparing variant calls using bcbio.variation", data)
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Pipes the samtools mpileup command into VarScan ``mpileup2cns`` through
    the shell and redirects the VCF output to ``out_file``. JVM options come
    from the "varscan" resources, with a built-in default.
    """
    max_read_depth = "1000"
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # Default is initial heap (-Xms) plus maximum heap (-Xmx); the previous
    # default passed -Xmx twice.
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]))
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = ("{mpileup} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           " --output-vcf --variants "
           "> {out_file}")
    subprocess.check_call(
        cmd.format(mpileup=mpileup, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                   out_file=out_file),
        shell=True)
def _get_jar(self, command, alts=None):
    """Retrieve the jar for running the specified command.

    Looks in the GATK and Picard directories, plus a sibling ``gatk``
    directory of each, trying ``command`` first and then every name in
    ``alts``. Returns the first jar found.

    Raises ValueError when no jar is found for any of the names.
    """
    dirs = []
    for bdir in [self._gatk_dir, self._picard_ref]:
        dirs.extend([bdir,
                     os.path.join(bdir, os.pardir, "gatk")])
    if alts is None:
        alts = []
    for check_cmd in [command] + alts:
        for dir_check in dirs:
            try:
                # Look up the candidate name, not always ``command``;
                # otherwise the alternatives are never tried.
                return config_utils.get_jar(check_cmd, dir_check)
            except ValueError:
                # Not in this directory under this name; keep searching.
                pass
    raise ValueError("Could not find jar %s in %s:%s" % (command, self._picard_ref, self._gatk_dir))
def _get_jar(self, command, alts=None):
    """Retrieve the jar for running the specified command.

    Looks in the GATK and Picard directories, plus a sibling ``gatk``
    directory of each, trying ``command`` first and then every name in
    ``alts``. Returns the first jar found, or None when nothing matches.

    A ValueError whose message mentions "multiple" is re-raised instead of
    being treated as "not found".
    """
    dirs = []
    for bdir in [self._gatk_dir, self._picard_ref]:
        dirs.extend([bdir,
                     os.path.join(bdir, os.pardir, "gatk")])
    if alts is None:
        alts = []
    for check_cmd in [command] + alts:
        for dir_check in dirs:
            try:
                # Look up the candidate name, not always ``command``;
                # otherwise the alternatives are never tried.
                return config_utils.get_jar(check_cmd, dir_check)
            # "except X as e" is valid on Python 2.6+ and Python 3.
            except ValueError as msg:
                # Substring test also matches a message starting with
                # "multiple", which find(...) > 0 missed.
                if "multiple" in str(msg):
                    raise
    return None
def _freebayes_custom(in_file, ref_file, config):
    """Filter FreeBayes calls with bcbio.variation's custom freebayes filter.

    Tuned to human NA12878 results. Returns None when the bcbio.variation
    version is below 0.1.1, otherwise the path of the ``-filter`` output file.
    """
    bv_ver = programs.get_version("bcbio.variation", config=config)
    if LooseVersion(bv_ver) < LooseVersion("0.1.1"):
        return None
    base, ext = os.path.splitext(in_file)
    out_file = "%s-filter%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
    bv_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", bv_dir)
    resources = config_utils.get_resources("bcbio_variation", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    filter_args = ["-jar", bv_jar, "variant-filter", "freebayes", in_file, ref_file]
    do.run(["java"] + jvm_opts + java_args + filter_args,
           "Custom FreeBayes filtering using bcbio.variation")
    return out_file
def _run_snpeff(snp_in, genome, se_interval, out_format, config):
    """Annotate variant effects in ``snp_in`` with the snpEff ``eff`` command.

    The output goes to ``<input base>-effects.vcf`` when ``out_format`` is
    "vcf" and to ``<input base>-effects.tsv`` otherwise. ``se_interval``,
    when set, is passed to snpEff as ``-filterInterval``. The command line is
    printed before it runs. Returns the output file path; an existing output
    is not regenerated.
    """
    snpeff_jar = config_utils.get_jar("snpEff",
                                      config_utils.get_program("snpEff", config, "dir"))
    config_file = "%s.config" % os.path.splitext(snpeff_jar)[0]
    resources = config_utils.get_resources("snpEff", config)
    ext = "vcf" if out_format == "vcf" else "tsv"
    out_file = "%s-effects.%s" % (os.path.splitext(snp_in)[0], ext)
    if not file_exists(out_file):
        cl = ["java"]
        cl += resources.get("jvm_opts", [])
        cl += ["-jar", snpeff_jar, "eff", "-c", config_file,
               "-1", "-i", "vcf", "-o", out_format, genome, snp_in]
        if se_interval:
            cl.extend(["-filterInterval", se_interval])
        # Parenthesized single argument prints identically on Python 2 and 3.
        print(" ".join(cl))
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                subprocess.check_call(cl, stdout=out_handle)
    return out_file
def _run_snpeff(snp_in, genome, se_interval, out_format, config):
    """Annotate variant effects in ``snp_in`` by running the snpEff jar.

    The output goes to ``<input base>-effects.vcf`` when ``out_format`` is
    "vcf" and to ``<input base>-effects.tsv`` otherwise. ``se_interval``,
    when set, is passed to snpEff as ``-filterInterval``. The command line is
    printed before it runs. Returns the output file path; an existing output
    is not regenerated.
    """
    snpeff_jar = config_utils.get_jar("snpEff",
                                      config_utils.get_program("snpEff", config, "dir"))
    config_file = "%s.config" % os.path.splitext(snpeff_jar)[0]
    resources = config_utils.get_resources("snpEff", config)
    ext = "vcf" if out_format == "vcf" else "tsv"
    out_file = "%s-effects.%s" % (os.path.splitext(snp_in)[0], ext)
    if not file_exists(out_file):
        cl = ["java"]
        cl += resources.get("jvm_opts", [])
        cl += ["-jar", snpeff_jar, "-c", config_file,
               "-1", "-i", "vcf", "-o", out_format, genome, snp_in]
        if se_interval:
            cl.extend(["-filterInterval", se_interval])
        # Parenthesized single argument prints identically on Python 2 and 3.
        print(" ".join(cl))
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                subprocess.check_call(cl, stdout=out_handle)
    return out_file
def run(data):
    """Run Oncofuse fusion detection and return the path to its output file.

    The output is ``oncofuse_out.txt`` in the input directory. Nothing is run
    when that file already exists.
    """
    #cmd line: java -Xmx1G -jar Oncofuse.jar input_file input_type tissue_type output_file
    config = data["config"]
    input_type, input_dir, input_file = _get_input_para(data)
    out_file = os.path.join(input_dir, 'oncofuse_out.txt')
    oncofuse_jar = config_utils.get_jar("Oncofuse",
                                        config_utils.get_program("oncofuse", config, "dir"))
    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = ["java"]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        with file_transaction(out_file) as tx_out_file:
            # Have Oncofuse write to the transactional file. Previously it
            # wrote straight to out_file while an empty tx_out_file was
            # created alongside it.
            cl += ["-jar", oncofuse_jar, input_file, input_type, tissue_type, tx_out_file]
            cmd = " ".join(cl)
            do.run(cmd, "oncofuse fusion detection", data)
    return out_file
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Call SNPs and indels with VarScan, fed by a samtools mpileup command.

    VarScan ``mpileup2cns`` is run through ``sh`` with its output written to
    ``out_file``.
    """
    max_read_depth = 1000
    varscan_dir = config_utils.get_program("varscan", config, "dir")
    varscan_jar = config_utils.get_jar("VarScan", varscan_dir)
    varscan_args = ["-jar", varscan_jar, "mpileup2cns", "--min-coverage", "5",
                    "--p-value", "0.98", "--output-vcf", "--variants"]
    with open(out_file, "w") as out_handle:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth,
                                        target_regions, want_bcf=False)
        varscan = sh.Command("java").bake(*varscan_args, _out=out_handle)
        varscan(mpileup())
def _run_bcbio_variation(config_file, base_dir, sample, data):
    """Run bcbio.variation ``variant-compare`` and link its ensemble outputs.

    When the ensemble VCF does not exist yet, the comparison is run and the
    first ``*-cfilter.vcf`` and ``*-multicombine.bed`` files from the sample's
    ``work/prep`` directory are symlinked into ``base_dir``. Returns a dict
    with the variant caller name, the VCF path and the BED path.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(sample))
    out_bed_file = os.path.join(base_dir, "{0}-callregions.bed".format(sample))
    if not utils.file_exists(out_vcf_file):
        bv_dir = config_utils.get_program("bcbio_variation", data["config"], "dir")
        bv_jar = config_utils.get_jar("bcbio.variation", bv_dir)
        java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
        compare_args = ["-jar", bv_jar, "variant-compare", config_file]
        subprocess.check_call(["java"] + java_args + compare_args)
        prep_dir = os.path.join(base_dir, sample, "work", "prep")
        base_vcf = glob.glob(os.path.join(prep_dir, "*-cfilter.vcf"))[0]
        base_bed = glob.glob(os.path.join(prep_dir, "*-multicombine.bed"))[0]
        os.symlink(base_vcf, out_vcf_file)
        os.symlink(base_bed, out_bed_file)
    result = {"variantcaller": "ensemble"}
    result["vrn_file"] = out_vcf_file
    result["bed_file"] = out_bed_file
    return result
def summary(samples, config):
    """Summarize coverage for the samples across regions of interest.

    Runs bcbio.coverage ``multicompare`` and attaches the summary file to
    every sample under ``coverage``. Returns the samples, each wrapped in its
    own single-item list. Without a configured bcbio.coverage jar the samples
    are returned unchanged.
    """
    try:
        coverage_dir = config_utils.get_program("bcbio_coverage", config, "dir")
        bc_jar = config_utils.get_jar("bcbio.coverage", coverage_dir)
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    num_cores = str(config["algorithm"]["num_cores"])
    tool_args = ["-jar", bc_jar, "multicompare", config_file, out_file, "-c", num_cores]
    cmd = ["java"] + jvm_opts + java_args + tool_args
    do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for sample in samples:
        sample["coverage"] = {"summary": out_file}
        out.append([sample])
    return out
def _freebayes_custom(in_file, ref_file, config):
    """Filter FreeBayes calls with bcbio.variation's custom freebayes filter.

    Tuned to human NA12878 results. Returns None when the bcbio.variation
    version is below 0.1.1, otherwise the path of the ``-filter`` output file.
    """
    bv_ver = programs.get_version("bcbio.variation", config=config)
    if LooseVersion(bv_ver) < LooseVersion("0.1.1"):
        return None
    base, ext = os.path.splitext(in_file)
    out_file = "%s-filter%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
    bv_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", bv_dir)
    resources = config_utils.get_resources("bcbio_variation", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    filter_args = ["-jar", bv_jar, "variant-filter", "freebayes", in_file, ref_file]
    do.run(["java"] + jvm_opts + java_args + filter_args,
           "Custom FreeBayes filtering using bcbio.variation")
    return out_file
def summary(samples, config):
    """Summarize coverage for the samples across regions of interest.

    Runs bcbio.coverage ``multicompare`` unless its output already exists and
    attaches the summary file to every sample under ``coverage``. Returns the
    samples, each wrapped in its own single-item list. Without a configured
    bcbio.coverage jar the samples are returned unchanged.
    """
    try:
        coverage_dir = config_utils.get_program("bcbio_coverage", config, "dir")
        bc_jar = config_utils.get_jar("bcbio.coverage", coverage_dir)
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    # Work on a copy so the caller's configuration is left untouched.
    config = copy.deepcopy(config)
    config["algorithm"]["memory_adjust"] = {
        "direction": "increase",
        "magnitude": config["algorithm"].get("num_cores", 1),
    }
    default_opts = ["-Xms750m", "-Xmx2g"]
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", default_opts), config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            java_args = ["-Djava.io.tmpdir=%s" % tmp_dir, "-Djava.awt.headless=true"]
            num_cores = str(config["algorithm"].get("num_cores", 1))
            tool_args = ["-jar", bc_jar, "multicompare", config_file, tx_out_file,
                         "-c", num_cores]
            cmd = ["java"] + jvm_opts + java_args + tool_args
            do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for sample in samples:
        sample["coverage"] = {"summary": out_file}
        out.append([sample])
    return out
def run(data):
    """Run Oncofuse fusion detection on the sample's fusion junction file.

    Returns the path to ``oncofuse_out.txt`` in the input directory, or None
    when the input file does not exist. An existing output file is returned
    without re-running. A failing Oncofuse run does not raise: the output
    path is still returned.
    """
    #cmd line: java -Xmx1G -jar Oncofuse.jar input_file input_type tissue_type output_file
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == 'GRCh37':  #assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ['star']:
            input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ['tophat', 'tophat2']:
            input_file = _fix_tophat_junction_output(input_file)
    #handle cases when fusion file doesn't exist
    if not file_exists(input_file):
        return None
    out_file = os.path.join(input_dir, 'oncofuse_out.txt')
    if file_exists(out_file):
        return out_file
    oncofuse_jar = config_utils.get_jar(
        "Oncofuse", config_utils.get_program("oncofuse", config, "dir"))
    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    cl = ["java"]
    cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
    cl += ["-jar", oncofuse_jar, input_file, input_type, tissue_type, out_file]
    cmd = " ".join(cl)
    # Opening for write creates (or truncates) out_file before the run, so
    # the returned path exists even when Oncofuse fails.
    with open(out_file, "w"):
        try:
            do.run(cmd, "oncofuse fusion detection", data)
        # Best effort: a failed run still returns out_file. Catch Exception,
        # not everything, so KeyboardInterrupt and SystemExit propagate.
        except Exception:
            pass
    return out_file
def run(data):
    """Run Oncofuse fusion detection for a single sample.

    Command line:
    java -Xmx1G -jar Oncofuse.jar input_file input_type tissue_type output_file

    For GRCh37 builds the STAR/TopHat junction output is first passed through
    the matching ``_fix_*_junction_output`` helper.

    Returns the path to ``oncofuse_out.txt`` inside the input directory, or
    None when the fusion input file does not exist. An already existing
    output file is returned without re-running Oncofuse.
    """
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == 'GRCh37':  # assume genome_build is hg19 otherwise
        aligner = config["algorithm"].get("aligner")
        if aligner in ['star']:
            input_file = _fix_star_junction_output(input_file)
        if aligner in ['tophat', 'tophat2']:
            input_file = _fix_tophat_junction_output(input_file)

    # handle cases when fusion file doesn't exist
    if not file_exists(input_file):
        return None

    out_file = os.path.join(input_dir, 'oncofuse_out.txt')
    if file_exists(out_file):
        return out_file

    oncofuse_jar = config_utils.get_jar(
        "Oncofuse", config_utils.get_program("oncofuse", config, "dir"))
    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)

    cl = ["java"]
    cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
    cl += ["-jar", oncofuse_jar, input_file, input_type, tissue_type, out_file]
    cmd = " ".join(cl)

    # The output file is created before Oncofuse runs, so the returned path
    # exists on disk even when the tool fails.
    with open(out_file, "w"):
        try:
            do.run(cmd, "oncofuse fusion detection", data)
        except Exception:
            # Best effort: a failed Oncofuse run still yields the output
            # path. Catching Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            return out_file
    return out_file
def _run_bcbio_variation(config_file, base_dir, sample, data):
    """Run bcbio.variation ``variant-compare`` and link its ensemble outputs.

    When ``<base_dir>/<sample>-ensemble.vcf`` is missing, the comparison is
    run with ``config_file`` and the first ``*-cfilter.vcf`` and
    ``*-multicombine.bed`` files found under ``<base_dir>/<sample>/work/prep``
    are symlinked to the ensemble VCF and call-regions BED paths.

    Returns a dictionary describing the ensemble calls with the keys
    ``variantcaller``, ``vrn_file`` and ``bed_file``.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    ensemble_vcf = os.path.join(base_dir, "{0}-ensemble.vcf".format(sample))
    ensemble_bed = os.path.join(base_dir, "{0}-callregions.bed".format(sample))
    result = {
        "variantcaller": "ensemble",
        "vrn_file": ensemble_vcf,
        "bed_file": ensemble_bed,
    }
    if utils.file_exists(ensemble_vcf):
        return result

    program_dir = config_utils.get_program("bcbio_variation", data["config"],
                                           "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", program_dir)
    cmd = ["java", "-Djava.io.tmpdir=%s" % tmp_dir,
           "-jar", bv_jar, "variant-compare", config_file]
    subprocess.check_call(cmd)

    # Outputs of the comparison land in the per-sample prep directory
    prep_dir = os.path.join(base_dir, sample, "work", "prep")
    prepped_vcf = glob.glob(os.path.join(prep_dir, "*-cfilter.vcf"))[0]
    prepped_bed = glob.glob(os.path.join(prep_dir, "*-multicombine.bed"))[0]
    os.symlink(prepped_vcf, ensemble_vcf)
    os.symlink(prepped_bed, ensemble_bed)
    return result
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Pipes a samtools mpileup of ``align_bams`` through a zero-coverage
    filter into VarScan ``mpileup2cns`` and writes the VCF to ``out_file``.
    If VarScan leaves ``out_file`` completely empty, a correctly formatted
    empty VCF is written in its place.

    Raises IOError when the VarScan version derived from the configured jar
    is missing or older than v2.3.5.
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare as versions rather than plain strings: lexicographically
    # "v2.3.10" sorts before "v2.3.5" and would be rejected by mistake.
    # An empty version (jar not found) is treated as too old.
    if not version or LooseVersion(version) < LooseVersion("v2.3.5"):
        raise IOError(
            "Please install version 2.3.5 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # Default to an initial (-Xms) and a maximum (-Xmx) heap size
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth,
                                    config, target_regions=target_regions,
                                    want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    cmd = (
        "{mpileup} | {remove_zerocoverage} "
        "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
        " --vcf-sample-list {sample_list} --output-vcf --variants "
        "> {out_file}")
    # Name every placeholder explicitly instead of relying on locals()
    cmd = cmd.format(mpileup=mpileup,
                     remove_zerocoverage=remove_zerocoverage,
                     jvm_opts=jvm_opts,
                     varscan_jar=varscan_jar,
                     sample_list=sample_list,
                     out_file=out_file)
    do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        vcfutils.write_empty_vcf(out_file)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Steps, performed only when ``out_file`` does not exist yet:

    1. Write one samtools mpileup file each for the normal and tumor BAM.
    2. Run VarScan ``somatic`` on both mpileups, producing SNP and indel
       VCFs, and post-process each with ``vcfuniqalleles`` plus the
       ambiguous-base fix command.
    3. Fix the VarScan VCFs, combine them into ``out_file``, remove the
       intermediate files, bgzip/index when a ``.gz`` output was requested
       and add the reject flag.

    An empty VCF is written to the requested output (and the function returns
    early) when an mpileup is empty or VarScan produced no SNP/indel file.

    Raises IOError when the VarScan jar is older than v2.3.6 and ValueError
    when no normal BAM is paired with the tumor. Nothing is returned.
    """
    max_read_depth = "1000"
    config = items[0]["config"]
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    # NOTE(review): no command template in this function references
    # {remove_zerocoverage}; it appears to be unused here.
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # Tumor/normal BAMs, sample names and tumor config are read from `paired`
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)
    if not file_exists(out_file):
        # Work on an uncompressed .vcf; compression happens at the very end
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        base, ext = utils.splitext_plus(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(config, mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file, config,
                                                max_read_depth,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                # Placeholders are filled from local variable names
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])
        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(orig_out_file, config)
            return
        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]
        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)
        with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
            with tx_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                fix_ambig = vcfutils.fix_ambiguous_cl()
                # VarScan writes to "<tx>-orig" prefixes; the fix-up loop
                # below reads "<prefix>.vcf" and writes the transactional
                # SNP/indel files.
                tx_snp_in = "%s-orig" % os.path.splitext(tx_snp)[0]
                tx_indel_in = "%s-orig" % os.path.splitext(tx_indel)[0]
                varscan_cmd = (
                    "java {jvm_opts} -jar {varscan_jar} somatic"
                    " {normal_tmp_mpileup} {tumor_tmp_mpileup} "
                    "--output-snp {tx_snp_in} --output-indel {tx_indel_in} "
                    " --output-vcf --min-coverage 5 --p-value 0.98 "
                    "--strand-filter 1 ")
                # add minimum AF
                # min_allele_fraction is read as a percentage (default 10)
                # and converted to a fraction for --min-var-freq
                if "--min-var-freq" not in varscan_cmd:
                    min_af = float(
                        utils.get_in(paired.tumor_config,
                                     ("algorithm", "min_allele_fraction"),
                                     10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                do.run(varscan_cmd.format(**locals()), "Varscan", None, None)
                for orig_fname, fname in [(tx_snp_in, tx_snp),
                                          (tx_indel_in, tx_indel)]:
                    cmd = "vcfuniqalleles {orig_fname}.vcf | {fix_ambig} > {fname}"
                    do.run(cmd.format(**locals()), "Varscan paired fix")
        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        to_combine = []
        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, paired.normal_name, paired.tumor_name,
                             config)
        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, paired.normal_name,
                             paired.tumor_name, config)
        if not to_combine:
            write_empty_vcf(orig_out_file, config)
            return
        # NOTE(review): both files are passed here even though `to_combine`
        # may hold only one of them - confirm this is intended.
        out_file = combine_variant_files([snp_file, indel_file], out_file,
                                         ref_file, config,
                                         region=target_regions)
        # Remove cleanup files
        # (including any bgzipped/tabix-indexed variants of each file)
        for extra_file in cleanup_files:
            for ext in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + ext):
                    os.remove(extra_file + ext)
        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
        if orig_out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)
        _add_reject_flag(out_file, config)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Steps, performed only when ``out_file`` does not exist yet:

    1. Write one samtools mpileup file each for the normal and tumor BAM.
    2. Run VarScan ``somatic`` on both mpileups with ``<base>`` (``out_file``
       without its extension) as output prefix.
    3. Fix the resulting ``<base>.snp.vcf`` / ``<base>.indel.vcf`` files,
       combine them into ``out_file`` and remove the intermediate files.

    An empty VCF is written to ``out_file`` (and the function returns early)
    when an mpileup is empty or VarScan produced no SNP/indel file.

    Raises IOError when the VarScan jar is older than v2.3.6.
    Nothing is returned.
    """
    max_read_depth = "1000"
    config = items[0]["config"]
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    # NOTE(review): no command template in this function references
    # {remove_zerocoverage}; it appears to be unused here.
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # get_paired_bams is unpacked as (tumor BAM, tumor name, normal BAM,
    # normal name)
    tumor_bam, tumor_name, normal_bam, normal_name = get_paired_bams(
        align_bams, items)
    if not file_exists(out_file):
        base, ext = os.path.splitext(out_file)
        cleanup_files = []
        for fname, mpext in [(normal_bam, "normal"), (tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file,
                                                max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                # Placeholders are filled from local variable names
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])
        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(out_file)
            return
        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]
        jvm_opts = _get_varscan_opts(config)
        varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98")
        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)
        to_combine = []
        # NOTE(review): tx_indel/tx_snp are never referenced; the command is
        # given {base} as its output prefix - confirm the transaction has the
        # intended effect.
        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            varscan_cmd = varscan_cmd.format(**locals())
            do.run(varscan_cmd, "Varscan".format(**locals()), None, None)
        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, normal_name, tumor_name)
        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, normal_name, tumor_name)
        if not to_combine:
            write_empty_vcf(out_file)
            return
        # NOTE(review): both files are passed here even though `to_combine`
        # may hold only one of them - confirm this is intended.
        out_file = combine_variant_files([snp_file, indel_file], out_file,
                                         ref_file, config,
                                         region=target_regions)
        # Remove cleanup files
        for extra_file in cleanup_files:
            os.remove(extra_file)
        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Steps, performed only when ``out_file`` does not exist yet:

    1. Write one samtools mpileup file each for the normal and tumor BAM.
    2. Run VarScan ``somatic`` (with ``--strand-filter 1``) on both mpileups
       using ``<base>`` (``out_file`` without its extension) as output prefix.
    3. Fix the resulting ``<base>.snp.vcf`` / ``<base>.indel.vcf`` files,
       combine them into ``out_file`` and remove the intermediate files.

    An empty VCF is written to ``out_file`` (and the function returns early)
    when an mpileup is empty or VarScan produced no SNP/indel file.

    Raises IOError when the VarScan jar is older than v2.3.6 and ValueError
    when no normal BAM is paired with the tumor. Nothing is returned.
    """
    max_read_depth = "1000"
    config = items[0]["config"]
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    # NOTE(review): no command template in this function references
    # {remove_zerocoverage}; it appears to be unused here.
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # Tumor/normal BAMs and sample names are read from `paired`
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError(
            "Require both tumor and normal BAM files for VarScan cancer calling"
        )
    if not file_exists(out_file):
        base, ext = os.path.splitext(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file,
                                                max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                # Placeholders are filled from local variable names
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])
        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(out_file)
            return
        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]
        jvm_opts = _get_varscan_opts(config)
        varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98 "
                       "--strand-filter 1 ")
        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)
        to_combine = []
        # NOTE(review): tx_indel/tx_snp are never referenced; the command is
        # given {base} as its output prefix - confirm the transaction has the
        # intended effect.
        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            varscan_cmd = varscan_cmd.format(**locals())
            do.run(varscan_cmd, "Varscan".format(**locals()), None, None)
        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, paired.normal_name, paired.tumor_name)
        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, paired.normal_name,
                             paired.tumor_name)
        if not to_combine:
            write_empty_vcf(out_file)
            return
        # NOTE(review): both files are passed here even though `to_combine`
        # may hold only one of them - confirm this is intended.
        out_file = combine_variant_files([snp_file, indel_file], out_file,
                                         ref_file, config,
                                         region=target_regions)
        # Remove cleanup files
        for extra_file in cleanup_files:
            os.remove(extra_file)
        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Steps, performed only when ``out_file`` does not exist yet:

    1. Write one samtools mpileup file each for the normal and tumor BAM.
    2. Run VarScan ``somatic`` on both mpileups using ``<base>`` (the
       uncompressed output path without its extension) as output prefix and
       a minimum variant frequency taken from the tumor configuration.
    3. Fix the resulting ``<base>.snp.vcf`` / ``<base>.indel.vcf`` files,
       combine them into ``out_file``, remove the intermediate files,
       bgzip/index when a ``.gz`` output was requested and add the reject
       flag.

    An empty VCF is written to the requested output (and the function returns
    early) when an mpileup is empty or VarScan produced no SNP/indel file.

    Raises IOError when the VarScan jar is older than v2.3.6 and ValueError
    when no normal BAM is paired with the tumor. Nothing is returned.
    """
    max_read_depth = "1000"
    config = items[0]["config"]
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    # NOTE(review): no command template in this function references
    # {remove_zerocoverage}; it appears to be unused here.
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # Tumor/normal BAMs, sample names and tumor config are read from `paired`
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("Require both tumor and normal BAM files for VarScan cancer calling")
    if not file_exists(out_file):
        # Work on an uncompressed .vcf; compression happens at the very end
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        base, ext = utils.splitext_plus(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file,
                                                max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                # Placeholders are filled from local variable names
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])
        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(orig_out_file, config)
            return
        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]
        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)
        # NOTE(review): tx_indel/tx_snp are never referenced; the command is
        # given {base} as its output prefix - confirm the transaction has the
        # intended effect.
        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            with utils.curdir_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                               " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                               " --output-vcf --min-coverage 5 --p-value 0.98 "
                               "--strand-filter 1 ")
                # add minimum AF
                # min_allele_fraction is read as a percentage (default 10)
                # and converted to a fraction for --min-var-freq
                if "--min-var-freq" not in varscan_cmd:
                    min_af = float(utils.get_in(paired.tumor_config, ("algorithm",
                                                                      "min_allele_fraction"),10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                do.run(varscan_cmd.format(**locals()), "Varscan", None, None)
        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        to_combine = []
        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, paired.normal_name, paired.tumor_name)
        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, paired.normal_name,
                             paired.tumor_name)
        if not to_combine:
            write_empty_vcf(orig_out_file, config)
            return
        # NOTE(review): both files are passed here even though `to_combine`
        # may hold only one of them - confirm this is intended.
        out_file = combine_variant_files([snp_file, indel_file], out_file,
                                         ref_file, config,
                                         region=target_regions)
        # Remove cleanup files
        # (including any bgzipped/tabix-indexed variants of each file)
        for extra_file in cleanup_files:
            for ext in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + ext):
                    os.remove(extra_file + ext)
        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
        if orig_out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)
        _add_reject_flag(out_file, config)