def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, library="non-strand-specific"):
    """Create the command line for a ``qualimap rnaseq`` run.

    Any library type other than ``non-strand-specific`` is logged and then
    replaced by ``non-strand-specific`` before the command is built.

    Args:
        data: sample dictionary; ``data["config"]`` supplies the qualimap
            program and its resources (``cores``, ``memory``).
        bam_file: alignment file passed to ``-bam``; also checked for paired
            reads to decide whether ``--paired`` is added.
        out_dir: directory passed to ``-outdir`` and used as ``java.io.tmpdir``.
        gtf_file: annotation passed to ``-gtf``. It is interpolated as-is, so
            the default ``None`` ends up as the literal text "None".
        library: requested library protocol for ``-p``.

    Returns:
        The full shell command as a single string.
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), num_cores)
    # Environment prefix: freetype fix, PATH export and Java memory/tmpdir options.
    # Built exactly once; an earlier throw-away assignment was removed.
    export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
        utils.java_freetype_fix(), utils.local_path_export(), max_mem, out_dir)
    if library != "non-strand-specific":
        logger.info(
            "Qualimap can get the orientation wrong for stranded reads, so we run it in unstranded mode. "
            "This gives comparable results to unstranded for RNA-seq data "
            "(see https://groups.google.com/forum/#!topic/qualimap/ZGo-k8LGmHQ) "
            "for a further explanation.")
        library = "non-strand-specific"
    paired = " --paired" if bam.is_paired(bam_file) else ""
    template = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
                "-a proportional -bam {bam_file} -p {library}{paired} "
                "-gtf {gtf_file}")
    return template.format(export=export, qualimap=qualimap, out_dir=out_dir,
                           bam_file=bam_file, library=library, paired=paired,
                           gtf_file=gtf_file)
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Produce prioritized, tab delimited output for the calls of one caller.

    When the simplified VCF is missing it is rebuilt: a prioritized VCF is
    obtained (linked from the input when a gene list is available, otherwise
    created with ``bcbio-prioritize known``) and then annotated with
    ``simple_sv_annotation.py``. The simplified VCF is sorted, bgzipped and
    indexed, optionally post-processed, and finally converted to TSV with vawk
    whenever the TSV is not up to date.

    Args:
        caller: caller name, used in file names and in the TSV rows.
        vcf_file: input calls, ending in ``.vcf`` or ``.vcf.gz``.
        prioritize_by: regions passed to ``bcbio-prioritize`` via ``-k`` and to
            the gene list lookup.
        post_prior_fn: optional callable ``(simple_vcf, work_dir, data)`` whose
            return value replaces the simplified VCF.
        work_dir: directory for all outputs.
        data: sample dictionary.

    Returns:
        Tuple of (TSV output file, simplified VCF).
    """
    sample = dd.get_sample_name(data)
    prefix = "%s-%s" % (sample, caller)
    out_file = os.path.join(work_dir, prefix + "-prioritize.tsv")
    simple_vcf = os.path.join(work_dir, prefix + "-simple.vcf.gz")

    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        priority_vcf = utils.splitext_plus(out_file)[0] + ".vcf.gz"
        if gene_list:
            # With a standard gene list the BED based prioritization is skipped
            # and the input calls are used directly.
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                plain_vcf = priority_vcf.replace(".vcf.gz", ".vcf")
                utils.symlink_plus(vcf_file, plain_vcf)
                vcfutils.bgzip_and_index(plain_vcf, data["config"], remove_orig=False)
        elif not utils.file_exists(priority_vcf):
            # Otherwise prioritize based on BED and proceed.
            with file_transaction(data, priority_vcf) as tx_out_file:
                resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                memory_adjust = {"direction": "increase",
                                 "maximum": "30000M",
                                 "magnitude": dd.get_cores(data)}
                jvm_opts = config_utils.adjust_opts(
                    resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]),
                    {"algorithm": {"memory_adjust": memory_adjust}})
                prioritize_cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                                  " -k {prioritize_by}")
                do.run(prioritize_cmd.format(export=utils.local_path_export(),
                                             jvm_opts=" ".join(jvm_opts),
                                             vcf_file=vcf_file,
                                             tx_out_file=tx_out_file,
                                             prioritize_by=prioritize_by),
                       "Prioritize: select in known regions of interest")

        # Supporting data files are looked up next to the annotation script.
        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            opts += " --gene_list %s" % (gene_list or os.path.join(data_dir, "az-cancer-panel.txt"))
            annotate_cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(annotate_cmd.format(opts=opts, priority_vcf=priority_vcf, tx_out_file=tx_out_file),
                   "Prioritize: simplified annotation output")

    sorted_vcf = vcfutils.sort_by_ref(simple_vcf, data)
    simple_vcf = vcfutils.bgzip_and_index(sorted_vcf, data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)

    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            # Keep records with FILTER PASS or "." whose genotype for this
            # sample is not 0/0, and print the selected fields.
            tsv_cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                       """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                       "print CALLER,SNAME,$1,$2,I$END,"
                       """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                       "I$LOF,I$SIMPLE_ANN,"
                       "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(tsv_cmd.format(export=utils.local_path_export(env_cmd="vawk"),
                                  simple_vcf=simple_vcf, sample=sample, caller=caller,
                                  tx_out_file=tx_out_file),
                   "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Produce prioritized, tab delimited output for the calls of one caller.

    When the simplified VCF is missing it is rebuilt: a prioritized VCF is
    obtained (linked from the input when a gene list is available, otherwise
    created with ``bcbio-prioritize known``) and then annotated with
    ``simple_sv_annotation.py``. The simplified VCF is sorted, bgzipped and
    indexed, optionally post-processed, and finally converted to TSV with vawk
    whenever the TSV is not up to date.

    Args:
        caller: caller name, used in file names and in the TSV rows.
        vcf_file: input calls, ending in ``.vcf`` or ``.vcf.gz``.
        prioritize_by: regions passed to ``bcbio-prioritize`` via ``-k`` and to
            the gene list lookup.
        post_prior_fn: optional callable ``(simple_vcf, work_dir, data)`` whose
            return value replaces the simplified VCF.
        work_dir: directory for all outputs.
        data: sample dictionary.

    Returns:
        Tuple of (TSV output file, simplified VCF).
    """
    sample = dd.get_sample_name(data)
    prefix = "%s-%s" % (sample, caller)
    out_file = os.path.join(work_dir, prefix + "-prioritize.tsv")
    simple_vcf = os.path.join(work_dir, prefix + "-simple.vcf.gz")

    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        priority_vcf = utils.splitext_plus(out_file)[0] + ".vcf.gz"
        if gene_list:
            # With a standard gene list the BED based prioritization is skipped
            # and the input calls are used directly.
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                plain_vcf = priority_vcf.replace(".vcf.gz", ".vcf")
                utils.symlink_plus(vcf_file, plain_vcf)
                vcfutils.bgzip_and_index(plain_vcf, data["config"], remove_orig=False)
        elif not utils.file_exists(priority_vcf):
            # Otherwise prioritize based on BED and proceed.
            with file_transaction(data, priority_vcf) as tx_out_file:
                resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                memory_adjust = {"direction": "increase",
                                 "maximum": "30000M",
                                 "magnitude": dd.get_cores(data)}
                jvm_opts = config_utils.adjust_opts(
                    resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]),
                    {"algorithm": {"memory_adjust": memory_adjust}})
                prioritize_cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                                  " -k {prioritize_by}")
                do.run(prioritize_cmd.format(export=utils.local_path_export(),
                                             jvm_opts=" ".join(jvm_opts),
                                             vcf_file=vcf_file,
                                             tx_out_file=tx_out_file,
                                             prioritize_by=prioritize_by),
                       "Prioritize: select in known regions of interest")

        # Supporting data files are looked up next to the annotation script.
        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            opts += " --gene_list %s" % (gene_list or os.path.join(data_dir, "az-cancer-panel.txt"))
            annotate_cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(annotate_cmd.format(opts=opts, priority_vcf=priority_vcf, tx_out_file=tx_out_file),
                   "Prioritize: simplified annotation output")

    sorted_vcf = vcfutils.sort_by_ref(simple_vcf, data)
    simple_vcf = vcfutils.bgzip_and_index(sorted_vcf, data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)

    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            # Keep records with FILTER PASS or "." whose genotype for this
            # sample is not 0/0, and print the selected fields.
            tsv_cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                       """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                       "print CALLER,SNAME,$1,$2,I$END,"
                       """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                       "I$LOF,I$SIMPLE_ANN,"
                       "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(tsv_cmd.format(export=utils.local_path_export(),
                                  simple_vcf=simple_vcf, sample=sample, caller=caller,
                                  tx_out_file=tx_out_file),
                   "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
def run(bam_file, data, out_dir):
    """Run ``qualimap bamqc`` on an alignment and parse the HTML report.

    Qualimap is only executed when neither the HTML report nor the PDF report
    is present in ``out_dir``. Unless ``qualimap_full`` is listed in the
    ``tools_on`` configuration, the BAM is first passed to ``bam.downsample``
    with a target of 1e7.

    Args:
        bam_file: alignment file to assess.
        data: sample dictionary with configuration.
        out_dir: directory receiving the qualimap output.

    Returns:
        The result of ``_parse_qualimap_metrics`` for the HTML report.
    """
    config = data["config"]
    resources = config_utils.get_resources("qualimap", config)
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"

    have_report = utils.file_exists(report_file) or utils.file_exists(os.path.join(out_dir, pdf_file))
    if not have_report:
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            # Fall back to the full BAM when downsampling returns nothing.
            bam_file = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) or bam_file
        if "PDF" in options:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = config["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", config)
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        fields = {"export": utils.local_path_export(),
                  "qualimap": qualimap,
                  "bam_file": bam_file,
                  "out_dir": out_dir,
                  "num_cores": num_cores,
                  "max_mem": max_mem,
                  "options": options}
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            fields["species"] = species
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            fields["bed6_regions"] = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**fields), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)
def run(bam_file, data, out_dir):
    """Run ``qualimap bamqc`` into a per-sample directory and collect its files.

    Results are written to ``out_dir/SAMPLE_NAME`` because MultiQC derives the
    sample name from that directory, e.g.
    ``SAMPLE_NAME/raw_data_qualimapReport/insert_size_histogram.txt``.
    Qualimap runs inside a file transaction and only when neither
    ``genome_results.txt`` nor the PDF report exists. Afterwards the
    ``bam file =`` line of ``genome_results.txt`` is rewritten to
    ``SAMPLE_NAME.bam`` and the results file is copied into ``out_dir``.

    Args:
        bam_file: alignment file to assess.
        data: sample dictionary with configuration.
        out_dir: base quality control directory.

    Returns:
        Dictionary with ``base`` (copied results file) and ``secondary`` (the
        output of ``_find_qualimap_secondary_files``).
    """
    sample_name = dd.get_sample_name(data)
    results_dir = os.path.join(out_dir, sample_name)
    config = data["config"]
    resources = config_utils.get_resources("qualimap", config)
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"

    have_results = utils.file_exists(results_file) or utils.file_exists(os.path.join(results_dir, pdf_file))
    if not have_results:
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            # Fall back to the full BAM when downsampling returns nothing.
            bam_file = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) or bam_file
        if "PDF" in options:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = config["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", config)
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            fields = {"export": export,
                      "qualimap": qualimap,
                      "bam_file": bam_file,
                      "tx_results_dir": tx_results_dir,
                      "num_cores": num_cores,
                      "options": options}
            cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                   "--skip-duplicated --skip-dup-mode 0 "
                   "-nt {num_cores} {options}")

            # Pick the species for the -gd option from the alias or the build name.
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species is not None:
                fields["species"] = species
                cmd += " -gd {species}"

            # Prefer coverage regions, otherwise use the merged variant regions.
            coverage = dd.get_coverage(data)
            if coverage not in [None, False, "None"]:
                regions = coverage
            else:
                regions = dd.get_variant_regions_merged(data)
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                fields["bed6_regions"] = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"

            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**fields), "Qualimap: %s" % sample_name, env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            rename_cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (sample_name, tx_results_file)
            do.run(rename_cmd, "Fix Name Qualimap for {}".format(sample_name))

    # The output folder has to be named after the sample (see docstring). To keep
    # the base QC file's name after upload it is copied into the root directory.
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    secondary = _find_qualimap_secondary_files(results_dir, base_results_file)
    return {"base": base_results_file, "secondary": secondary}
def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file, todo="square"):
    """Run a ``bcbio-variation-recall`` sub-command (squaring or merging) on a region.

    The unique, sorted variant files are written to ``<out_file base>-inputs.txt``;
    for ``todo == "square"`` the unique, sorted BAM files are appended and the
    joint caller name (without its ``-joint`` suffix) is passed via ``--caller``.

    Args:
        data: sample dictionary with reference and configuration.
        region: region converted with ``bamprep.region_to_gatk`` for ``-r``.
        bam_files: alignment files, only used when squaring.
        vrn_files: variant files to process.
        out_file: output file handed to the tool.
        todo: sub-command of ``bcbio-variation-recall`` to run.

    Returns:
        ``out_file``.
    """
    squaring = todo == "square"
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    # Adjust memory by cores but leave room for run program memory.
    memcores = int(math.ceil(float(cores) / 5.0))
    memory_adjust = {"direction": "increase", "magnitude": memcores}
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms250m", "-Xmx2g"]),
                                        {"algorithm": {"memory_adjust": memory_adjust}})

    # Write unique VCFs (and BAMs when squaring) to the input file.
    input_file = "%s-inputs.txt" % os.path.splitext(out_file)[0]
    file_groups = [vrn_files, bam_files] if squaring else [vrn_files]
    with open(input_file, "w") as out_handle:
        for group in file_groups:
            out_handle.write("\n".join(sorted(set(group))) + "\n")

    variantcaller = tz.get_in(("config", "algorithm", "jointcaller"), data).replace("-joint", "")
    gatk_region = bamprep.region_to_gatk(region)
    args = ["bcbio-variation-recall", todo] + jvm_opts + broad.get_default_jvm_opts()
    args += ["-c", cores, "-r", gatk_region]
    if squaring:
        args += ["--caller", variantcaller]
    args += [out_file, ref_file, input_file]
    cmd = "%s %s" % (utils.local_path_export(), " ".join(str(arg) for arg in args))
    do.run(cmd, "%s in region: %s" % (cmd, gatk_region))
    return out_file
def _run_ensemble_intersection(batch_id, vrn_files, callers, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.

    The ensemble VCF is only generated when it does not exist yet. Filtered
    calls are excluded with ``--nofiltered`` unless the
    ``ensemble -> use_filtered`` algorithm option is set.

    Args:
        batch_id: batch identifier, used for the output name and log message.
        vrn_files: list of variant files to intersect.
        callers: caller names passed (comma separated) to ``--names``.
        base_dir: directory for the ensemble output.
        edata: sample dictionary with configuration and reference.

    Returns:
        Dictionary with ``variantcaller`` ("ensemble"), ``vrn_file`` (the
        ensemble VCF) and ``bed_file`` (always ``None``).
    """
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [config_utils.get_program("bcbio-variation-recall", edata["config"]),
               "ensemble",
               "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
               "--numpass", str(num_pass),
               "--names", ",".join(callers)]
        # Remove filtered calls, do not try to rescue, unless configured
        if not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata):
            cmd += ["--nofiltered"]
        with file_transaction(edata, out_vcf_file) as tx_out_file:
            cmd += [tx_out_file, dd.get_ref_file(edata)] + vrn_files
            cmd = "%s %s" % (utils.local_path_export(), " ".join(str(x) for x in cmd))
            do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    # A copy of edata with vrn_file set used to be built here; it was never
    # used or returned, so the copy has been dropped.
    return {"variantcaller": "ensemble",
            "vrn_file": out_vcf_file,
            "bed_file": None}
def _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence, work_dir, items):
    """Run lumpy-sv, using speedseq pipeline.

    ``lumpyexpress`` is only run when the output VCF does not exist. Existing
    evidence files from ``previous_evidence`` are passed as ``sample:file``
    pairs via ``-d``, and the exclusion BED via ``-x`` when it exists.

    Args:
        full_bams: full alignment files (``-B``).
        sr_bams: split read alignment files (``-S``).
        disc_bams: discordant read alignment files (``-D``).
        previous_evidence: mapping of sample to a mapping of evidence type to
            evidence file.
        work_dir: directory for the output VCF.
        items: sample dictionaries; the first one names the output and
            supplies the configuration.

    Returns:
        Tuple of (output VCF passed through ``vcfutils.sort_by_ref``, exclusion
        BED file).
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    base_name = os.path.splitext(os.path.basename(items[0]["align_bam"]))[0]
    out_file = os.path.join(work_dir, "%s%s.vcf" % (base_name, ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with tx_tmpdir(items[0]) as tmpdir:
                have_exclude = sv_exclude_bed and utils.file_exists(sv_exclude_bed)
                exclude = "-x %s" % sv_exclude_bed if have_exclude else ""
                depths = []
                for sample, ev_files in previous_evidence.items():
                    # Only the evidence file is needed; its type is not used.
                    for ev_file in ev_files.values():
                        if utils.file_exists(ev_file):
                            depths.append("%s:%s" % (sample, ev_file))
                depth_arg = "-d %s" % ",".join(depths) if depths else ""
                # use our bcbio python for runs within lumpyexpress
                cmd = ("{exports}lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                       "{exclude} {depth_arg} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(exports=utils.local_path_export(),
                                  full_bams=",".join(full_bams),
                                  sr_bams=",".join(sr_bams),
                                  disc_bams=",".join(disc_bams),
                                  exclude=exclude,
                                  depth_arg=depth_arg,
                                  tmpdir=tmpdir,
                                  tx_out_file=tx_out_file),
                       "lumpyexpress", items[0])
    return vcfutils.sort_by_ref(out_file, items[0]), sv_exclude_bed
def run(bam_file, data, out_dir):
    """Run ``qualimap bamqc`` into a directory named after the sample.

    Results are written to ``out_dir/SAMPLE_NAME`` because MultiQC derives the
    sample name from that directory, e.g.
    ``SAMPLE_NAME/raw_data_qualimapReport/insert_size_histogram.txt``.
    Qualimap only runs when neither the HTML nor the PDF report exists. The
    BAM is exposed to qualimap through a ``SAMPLE_NAME.bam`` symlink in
    ``out_dir`` since MultiQC also picks the sample name from the BAM name.

    Args:
        bam_file: alignment file to assess.
        data: sample dictionary with configuration.
        out_dir: base quality control directory.

    Returns:
        An empty dictionary; metrics parsing is currently disabled here.
    """
    sample_name = dd.get_sample_name(data)
    results_dir = os.path.join(out_dir, sample_name)
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            # Fall back to the full BAM when downsampling returns nothing.
            bam_file = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) or bam_file
        if "PDF" in options:
            options = "%s -outfile %s" % (options, pdf_file)

        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)

        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir, sample_name + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)

        fields = {"export": utils.local_path_export(),
                  "qualimap": qualimap,
                  "fixed_bam_fname": fixed_bam_fname,
                  "results_dir": results_dir,
                  "num_cores": num_cores,
                  "max_mem": max_mem,
                  "options": options}
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
               "--skip-duplicated --skip-dup-mode 0 "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")

        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        else:
            # get_in needs a sequence of keys. A bare string is walked character
            # by character and always gives the default, so mouse builds were
            # never recognised.
            genome_build = tz.get_in(("genome_build",), data) or ""
            if genome_build.startswith(("mm", "GRCm")):
                species = "MOUSE"
        if species is not None:
            fields["species"] = species
            cmd += " -gd {species}"

        regions = (bedutils.merge_overlaps(dd.get_coverage(data), data)
                   or dd.get_variant_regions_merged(data))
        if regions:
            fields["bed6_regions"] = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**fields), "Qualimap: %s" % sample_name)

    # Parsing of report_file with _parse_qualimap_metrics is disabled; callers
    # receive an empty result.
    return dict()
def _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence, work_dir, items):
    """Run lumpy-sv, using speedseq pipeline.

    ``lumpyexpress`` is only run when the output VCF does not exist. Existing
    evidence files from ``previous_evidence`` are passed as ``sample:file``
    pairs via ``-d``, and the exclusion BED via ``-x`` when it exists.

    Args:
        full_bams: full alignment files (``-B``).
        sr_bams: split read alignment files (``-S``).
        disc_bams: discordant read alignment files (``-D``).
        previous_evidence: mapping of sample to a mapping of evidence type to
            evidence file.
        work_dir: directory for the output VCF.
        items: sample dictionaries; the first one names the output and
            supplies the configuration.

    Returns:
        Tuple of (output VCF passed through ``vcfutils.sort_by_ref``, exclusion
        BED file).
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    base_name = os.path.splitext(os.path.basename(items[0]["align_bam"]))[0]
    out_file = os.path.join(work_dir, "%s%s.vcf" % (base_name, ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with tx_tmpdir(items[0]) as tmpdir:
                have_exclude = sv_exclude_bed and utils.file_exists(sv_exclude_bed)
                exclude = "-x %s" % sv_exclude_bed if have_exclude else ""
                depths = []
                for sample, ev_files in previous_evidence.items():
                    # Only the evidence file is needed; its type is not used.
                    for ev_file in ev_files.values():
                        if utils.file_exists(ev_file):
                            depths.append("%s:%s" % (sample, ev_file))
                depth_arg = "-d %s" % ",".join(depths) if depths else ""
                # use our bcbio python for runs within lumpyexpress
                cmd = ("{exports}lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                       "{exclude} {depth_arg} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(exports=utils.local_path_export(),
                                  full_bams=",".join(full_bams),
                                  sr_bams=",".join(sr_bams),
                                  disc_bams=",".join(disc_bams),
                                  exclude=exclude,
                                  depth_arg=depth_arg,
                                  tmpdir=tmpdir,
                                  tx_out_file=tx_out_file),
                       "lumpyexpress", items[0])
    return vcfutils.sort_by_ref(out_file, items[0]), sv_exclude_bed
def fastq_size_output(fastq_file, tocheck):
    """Yield ``(count, size)`` pairs describing read lengths in a fastq file.

    A shell pipeline takes the first 8,000,000 lines of the (optionally
    gzipped) file, samples from them with ``seqtk sample -s42``, prints the
    length of every sequence line and tallies the lengths with
    ``sort | uniq -c``.

    Args:
        fastq_file: input file; resolved through ``objectstore.cl_input``.
        tocheck: value passed to ``seqtk sample`` as its last argument.

    Yields:
        Tuples of strings ``(count, size)``, one per output line.

    Raises:
        IOError: when the pipeline produces no output.
    """
    head_count = 8000000
    fastq_file = objectstore.cl_input(fastq_file)
    if fastq_file.endswith(".gz"):
        reader = "zcat {fastq_file}"
    else:
        reader = "cat {fastq_file}"
    template = (utils.local_path_export() + reader + " | head -n {head_count} | "
                "seqtk sample -s42 - {tocheck} | "
                "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")
    shell_cmd = template.format(fastq_file=fastq_file, head_count=head_count, tocheck=tocheck)

    def _default_sigpipe():
        """Avoid spurious 'cat: write error: Broken pipe' message due to head command.

        Work around from:
        https://bitbucket.org/brodie/cram/issues/16/broken-pipe-when-heading-certain-output
        """
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    raw_out = subprocess.check_output(shell_cmd, shell=True, executable="/bin/bash",
                                      preexec_fn=_default_sigpipe)
    count_out = raw_out.decode()
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" % shell_cmd)
    for entry in count_out.strip().split("\n"):
        count, size = entry.strip().split()
        yield count, size
def _run_gridss(inputs, background, work_dir):
    """Run the GRIDSS ``CallVariants`` step for a set of samples.

    The output is named after the batch of the first input, or its sample name
    when there is no batch. GRIDSS is skipped when the VCF or its ``.gz``
    version already exists.

    Args:
        inputs: sample dictionaries to call; the first one supplies
            configuration and naming.
        background: additional sample dictionaries supplied as further inputs.
        work_dir: directory for the output VCF.

    Returns:
        Result of ``vcfutils.bgzip_and_index`` on the output VCF.
    """
    lead = inputs[0]
    label = dd.get_batch(lead) or dd.get_sample_name(lead)
    out_file = os.path.join(work_dir, "%s-gridss.sv.vcf" % label)
    have_output = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
    if not have_output:
        with file_transaction(lead, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            htsjdk_opts = ["-Dsamjdk.create_index=true",
                           "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true",
                           "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(lead)
            resources = config_utils.get_resources("gridss", lead["config"])
            memory_adjust = {"direction": "increase", "magnitude": cores}
            jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"]),
                                                {"algorithm": {"memory_adjust": memory_adjust}})
            jvm_opts = _finalize_memory(jvm_opts)
            tx_ref_file = _setup_reference_files(lead, tx_dir)
            blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file)
            args = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"]
            # Temporary and working files stay inside the transaction directory.
            args += ["THREADS=%s" % cores,
                     "TMP_DIR=%s" % tx_dir,
                     "WORKING_DIR=%s" % tx_dir,
                     "OUTPUT=%s" % tx_out_file,
                     "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                     "REFERENCE_SEQUENCE=%s" % tx_ref_file,
                     "BLACKLIST=%s" % blacklist_bed]
            for sample_data in inputs + background:
                args.append("INPUT=%s" % dd.get_align_bam(sample_data))
                args.append("INPUT_LABEL=%s" % dd.get_sample_name(sample_data))
            do.run(utils.local_path_export() + " ".join(args), "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, lead["config"])
def run(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    FastQC is only executed when ``fastqc_report.html`` is missing from
    ``fastqc_out``. It runs in a temporary directory on the input file as
    given (no downsampling happens here) and expects the single HTML report
    ``<name>_fastqc.html`` that fastqc 0.11+ produces. The extracted
    ``fastqc_data.txt`` is rewritten so the sample name replaces the input
    file name before the outputs are moved into ``fastqc_out``.

    Args:
        bam_file: input file; treated as BAM when its name ends in "bam",
            otherwise as fastq.
        data: sample dictionary with configuration.
        fastqc_out: final output directory for the report files.

    Returns:
        The summary from ``FastQCParser.get_fastqc_summary``.

    Raises:
        ValueError: when FastQC did not produce the expected HTML report.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        frmt = "bam" if bam_file.endswith("bam") else "fastq"
        fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0]
        fastqc_clean_name = dd.get_sample_name(data)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-d", tx_tmp_dir,
                      "-t", str(num_cores), "--extract", "-o", tx_tmp_dir, "-f", frmt, bam_file]
                cl = "%s %s" % (utils.local_path_export(), " ".join([str(x) for x in cl]))
                do.run(cl, "FastQC: %s" % dd.get_sample_name(data))
                tx_fastqc_out = os.path.join(tx_tmp_dir, "%s_fastqc" % fastqc_name)
                tx_combo_file = os.path.join(tx_tmp_dir, "%s_fastqc.html" % fastqc_name)
                if not os.path.exists(sentry_file) and os.path.exists(tx_combo_file):
                    utils.safe_makedir(fastqc_out)
                    # Use sample name for reports instead of bam file name
                    raw_data_file = os.path.join(tx_fastqc_out, "fastqc_data.txt")
                    renamed_data_file = os.path.join(tx_fastqc_out, "_fastqc_data.txt")
                    with open(raw_data_file, 'r') as fastqc_bam_name, \
                            open(renamed_data_file, 'w') as fastqc_sample_name:
                        for line in fastqc_bam_name:
                            fastqc_sample_name.write(
                                line.replace(os.path.basename(bam_file), fastqc_clean_name))
                    shutil.move(renamed_data_file, os.path.join(fastqc_out, 'fastqc_data.txt'))
                    shutil.move(tx_combo_file, sentry_file)
                    if os.path.exists("%s.zip" % tx_fastqc_out):
                        shutil.move("%s.zip" % tx_fastqc_out,
                                    os.path.join(fastqc_out, "%s.zip" % fastqc_clean_name))
                elif not os.path.exists(sentry_file):
                    # os.listdir, not os.path.listdir: the latter does not exist
                    # and hid this error behind an AttributeError.
                    raise ValueError("FastQC failed to produce output HTML file: %s"
                                     % os.listdir(tx_tmp_dir))
    parser = FastQCParser(fastqc_out, dd.get_sample_name(data))
    stats = parser.get_fastqc_summary()
    parser.save_sections_into_file()
    return stats
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Evaluate calls against a truth set with ``rtg vcfeval``.

    The evaluation is skipped when ``<base_dir>/rtg/done`` exists; otherwise any
    previous ``rtg`` output directory is removed and vcfeval is run with at
    most 6 threads.

    Args:
        vrn_file: calls to evaluate.
        rm_file: truth set calls.
        rm_interval_file: truth set regions.
        base_dir: directory in which the ``rtg`` output directory is created.
        data: sample dictionary with configuration and the RTG reference.

    Returns:
        Dictionary with ``fp``, ``fn`` and ``tp`` paths. When
        ``tp-baseline.vcf.gz`` exists it is used for ``tp`` and ``tp.vcf.gz``
        is reported as ``tp-calls``.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(vrn_file, rm_file, rm_interval_file,
                                                          base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory_adjust = {"magnitude": threads, "direction": "increase"}
        memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]),
                                          {"algorithm": {"memory_adjust": memory_adjust}})
        xms_opts = [opt for opt in memory if opt.startswith("-Xms")]
        xmx_opts = [opt for opt in memory if opt.startswith("-Xmx")]
        jvm_stack = xms_opts[0] if xms_opts else "-Xms500m"
        jvm_mem = xmx_opts[0].replace("-Xmx", "") if xmx_opts else "3g"

        eval_args = ["rtg", "vcfeval", "--threads", str(threads),
                     "-b", rm_file, "--bed-regions", interval_bed,
                     "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        eval_args.append("--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file)))
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (
            utils.local_path_export(), jvm_stack, jvm_mem)
        do.run(mem_export + " && " + " ".join(eval_args),
               "Validate calls using rtg vcfeval", data)

    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    if not os.path.exists(tp_baseline):
        out["tp"] = tp_calls
        return out
    out["tp"] = tp_baseline
    out["tp-calls"] = tp_calls
    return out
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, library="non-strand-specific"):
    """Create the command line for a ``qualimap rnaseq`` run.

    Args:
        data: sample dictionary; ``data["config"]`` supplies the qualimap
            program and its resources (``cores``, ``memory``).
        bam_file: alignment file passed to ``-bam``; also checked for paired
            reads to decide whether ``--paired`` is added.
        out_dir: directory passed to ``-outdir`` and used as ``java.io.tmpdir``.
        gtf_file: annotation passed to ``-gtf``. It is interpolated as-is, so
            the default ``None`` ends up as the literal text "None".
        library: library protocol passed to ``-p``.

    Returns:
        The full shell command as a single string.
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), num_cores)
    # Environment prefix: freetype fix, PATH export and Java memory/tmpdir options.
    # Built exactly once; an earlier throw-away assignment was removed.
    export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
        utils.java_freetype_fix(), utils.local_path_export(), max_mem, out_dir)
    paired = " --paired" if bam.is_paired(bam_file) else ""
    template = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
                "-a proportional -bam {bam_file} -p {library}{paired} "
                "-gtf {gtf_file}")
    return template.format(export=export, qualimap=qualimap, out_dir=out_dir,
                           bam_file=bam_file, library=library, paired=paired,
                           gtf_file=gtf_file)
def get_cmd(cmd_name, datadir, config, out_file):
    """Build the base snpEff command line for a sub-command.

    Args:
        cmd_name: snpEff sub-command to place after the Java options.
        datadir: directory passed to ``-dataDir``.
        config: configuration used to look up the program and its ``jvm_opts``.
        out_file: output file; a ``tmp`` directory next to it is created and
            used as ``java.io.tmpdir``.

    Returns:
        The command prefix as a string.
    """
    jvm_opts = config_utils.get_resources("snpeff", config).get("jvm_opts", ["-Xms750m", "-Xmx5g"])
    snpeff = config_utils.get_program("snpEff", config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    template = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return template.format(export=utils.local_path_export(),
                           snpeff=snpeff,
                           memory=" ".join(jvm_opts),
                           java_args="-Djava.io.tmpdir=%s" % tmp_dir,
                           cmd_name=cmd_name,
                           datadir=datadir)
def _run_tool(cmd, use_container=True):
    """Run a shell command prefixed with the bcbio path export.

    ``use_container`` is forwarded as ``at_start`` to
    ``utils.local_path_export``. Per the original note, runs without
    containers place the bcbio path at the end to avoid overriding other
    bcbio installations.

    Args:
        cmd: command as a string, or a list/tuple of parts joined with spaces.
        use_container: whether the command runs with containers.

    Raises:
        subprocess.CalledProcessError: when the command exits non-zero.
    """
    shell_cmd = " ".join(map(str, cmd)) if isinstance(cmd, (list, tuple)) else cmd
    path_prefix = utils.local_path_export(at_start=use_container)
    subprocess.check_call(path_prefix + shell_cmd, shell=True)
def summary(*samples):
    """Summarize all quality metrics together with MultiQC.

    MultiQC runs in a temporary directory when ``multiqc_report.html`` is not
    yet present and at least one input file exists; the report and data
    directory are then moved into ``<work_dir>/qc/multiqc``. When the report
    exists, it and its existing secondary files are attached to the first
    grouped sample under ``summary -> multiqc``.

    Args:
        *samples: sample dictionaries, possibly nested; they are flattened.

    Returns:
        List with one single-item list per grouped sample.
    """
    samples = list(utils.flatten(samples))
    first = samples[0]
    work_dir = dd.get_work_dir(first)
    multiqc = config_utils.get_program("multiqc", first["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))

    if not utils.file_exists(out_file):
        with tx_tmpdir(first, work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    tmp_dir = dd.get_tmp_dir(first)
                    export_tmp = "export TMPDIR=%s &&" % tmp_dir if tmp_dir else ""
                    path_export = utils.local_path_export()
                    extra = config_utils.get_resources("multiqc", first["config"]).get("options", [])
                    template = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(template.format(path_export=path_export,
                                           export_tmp=export_tmp,
                                           multiqc=multiqc,
                                           input_list_file=input_list_file,
                                           other_opts=" ".join([str(x) for x in extra]),
                                           tx_out=tx_out),
                           "Run multiqc")
                    tx_report = os.path.join(tx_out, "multiqc_report.html")
                    if utils.file_exists(tx_report):
                        shutil.move(tx_report, out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)

    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        metrics_dir = os.path.join(out_dir, "report", "metrics")
        candidates = set()
        for data in samples:
            candidates.add(os.path.join(metrics_dir, dd.get_sample_name(data) + "_bcbio.txt"))
        candidates.add(os.path.join(metrics_dir, "target_info.yaml"))
        candidates.add(os.path.join(out_dir, "multiqc_config.yaml"))
        # Only files that are actually present are reported.
        data_files = [f for f in candidates if f and utils.file_exists(f)]

        lead = samples[0]
        lead.setdefault("summary", {})["multiqc"] = {"base": out_file, "secondary": data_files}
        secondary = lead["summary"]["multiqc"]["secondary"]

        multiqc_data_dir = os.path.join(out_dir, "multiqc_data")
        data_json = os.path.join(multiqc_data_dir, "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, multiqc_data_dir)
        if data_json_final:
            secondary.append(data_json_final)

        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            secondary.append(file_list_final)
    return [[data] for data in samples]
def get_cmd(cmd_name, datadir, config, out_file):
    """Build the base snpEff command line for a sub-command.

    Args:
        cmd_name: snpEff sub-command to place after the Java options.
        datadir: directory passed to ``-dataDir``.
        config: configuration used to look up the program and its ``jvm_opts``.
        out_file: output file; a ``tmp`` directory next to it is created and
            used as ``java.io.tmpdir``.

    Returns:
        The command prefix as a string.
    """
    jvm_opts = config_utils.get_resources("snpeff", config).get("jvm_opts", ["-Xms750m", "-Xmx5g"])
    snpeff = config_utils.get_program("snpEff", config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    template = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return template.format(export=utils.local_path_export(),
                           snpeff=snpeff,
                           memory=" ".join(jvm_opts),
                           java_args="-Djava.io.tmpdir=%s" % tmp_dir,
                           cmd_name=cmd_name,
                           datadir=datadir)
def summary(*samples):
    """Summarize all quality metrics together with MultiQC.

    Runs multiqc (when the report is missing) into a temporary directory and
    moves the report and data directory under ``<work_dir>/qc/multiqc``. If
    the report exists, the first grouped sample receives the report and the
    globbed secondary files under ``["summary"]["multiqc"]``.

    Returns a list holding one single-element list per grouped sample.
    """
    samples = list(utils.flatten(samples))
    first = samples[0]
    work_dir = dd.get_work_dir(first)
    multiqc = config_utils.get_program("multiqc", first["config"])
    if not multiqc:
        # Only logged: processing continues as in the original implementation.
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(first, work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    tmp_dir = dd.get_tmp_dir(first)
                    export_tmp = ("export TMPDIR=%s &&" % tmp_dir) if tmp_dir else ""
                    path_export = utils.local_path_export()
                    extra = config_utils.get_resources("multiqc", first["config"]).get("options", [])
                    other_opts = " ".join(str(x) for x in extra)
                    template = ("{path_export}{export_tmp} {multiqc} -f -l {input_list_file} "
                                "{other_opts} -o {tx_out}")
                    do.run(template.format(path_export=path_export, export_tmp=export_tmp,
                                           multiqc=multiqc, input_list_file=input_list_file,
                                           other_opts=other_opts, tx_out=tx_out),
                           "Run multiqc")
                    tx_report = os.path.join(tx_out, "multiqc_report.html")
                    if utils.file_exists(tx_report):
                        shutil.move(tx_report, out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    # Glob patterns (relative to out_dir) of files shipped with the report,
    # in the order they are collected.
    secondary_patterns = [
        ("multiqc_data", "*.txt"),
        ("report", "*", "*.bed"),
        ("report", "*", "*.txt"),
        ("report", "*", "*.tsv"),
        ("report", "*", "*.yaml"),
        ("report", "*.R*"),
        ("multiqc_config.yaml",),
    ]
    out = []
    for i, data in enumerate(_group_by_samplename(samples)):
        # Only the first grouped sample carries the MultiQC outputs.
        if i == 0 and utils.file_exists(out_file):
            data_files = []
            for parts in secondary_patterns:
                data_files += glob.glob(os.path.join(out_dir, *parts))
            data_files.append(file_list)
            if "summary" not in data:
                data["summary"] = {}
            data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
            file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
            if file_list_final:
                data["summary"]["multiqc"]["secondary"].append(file_list_final)
        out.append([data])
    return out
def _bcbio_variation_ensemble(vrn_files, out_file, ref_file, config_file, base_dir, data):
    """Run bcbio-variation ``variant-ensemble`` from within ``base_dir``.

    Each input variant file is first passed through
    ``_handle_somatic_ensemble``. JVM options come from the
    "bcbio_variation" resources (default ``-Xms750m -Xmx2g``) and Java's
    temporary directory is ``<base_dir>/tmp``.
    """
    vrn_files = [_handle_somatic_ensemble(v, data) for v in vrn_files]
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    tool_args = ["variant-ensemble", config_file, ref_file, out_file]
    parts = ["bcbio-variation"] + jvm_opts + java_args + tool_args + vrn_files
    with utils.chdir(base_dir):
        cmd = "%s %s" % (utils.local_path_export(), " ".join([str(p) for p in parts]))
        do.run(cmd, "Ensemble calling: %s" % os.path.basename(base_dir))
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data, validate_method):
    """Evaluate calls against a truth set with ``rtg vcfeval``.

    The evaluation is skipped when ``<base_dir>/rtg/done`` already exists;
    otherwise any stale output directory is removed and vcfeval is re-run.

    Returns a dict with "fp", "fn" and "tp" VCF paths. When a
    ``tp-baseline.vcf.gz`` file is present it is used for "tp" and the call
    based true positives are reported under "tp-calls".
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(vrn_file, rm_file, rm_interval_file,
                                                          base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        if isinstance(rtg_ref, dict) and "base" in rtg_ref:
            rtg_ref = os.path.dirname(rtg_ref["base"])
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)
        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]),
                                          {"algorithm": {"memory_adjust": {"magnitude": threads,
                                                                           "direction": "increase"}}})
        # First -Xms / -Xmx option wins; fall back to fixed defaults otherwise.
        xms_opts = [x for x in memory if x.startswith("-Xms")]
        xmx_opts = [x for x in memory if x.startswith("-Xmx")]
        jvm_stack = xms_opts[0] if xms_opts else "-Xms500m"
        jvm_mem = xmx_opts[0].replace("-Xmx", "") if xmx_opts else "3g"
        args = ["rtg", "vcfeval", "--threads", str(threads),
                "-b", rm_file, "--bed-regions", interval_bed,
                "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        if validate_method == "rtg-squash-ploidy":
            args.append("--squash-ploidy")
        rm_samples = vcfutils.get_samples(rm_file)
        if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples:
            args.append("--sample=%s" % dd.get_sample_name(data))
        args.append("--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file)))
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (utils.local_path_export(),
                                                                            jvm_stack, jvm_mem)
        cmd = mem_export + " && " + " ".join(args)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
def run(bam_file, data, out_dir):
    """Run qualimap bamqc to assess alignment quality metrics.

    Results are written to ``<out_dir>/<sample name>``. Nothing is run when
    either the HTML or the PDF report is already present there. Unless
    "qualimap_full" is listed in ``tools_on`` the BAM is downsampled first.

    Returns an empty dict (metrics parsing is currently disabled).
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
        bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir, dd.get_sample_name(data) + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)

        export = utils.local_path_export()
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
               "--skip-duplicated --skip-dup-mode 0 "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        # The key path must be a sequence of keys: a bare string would be
        # walked character by character and never find "genome_build".
        elif (tz.get_in(("genome_build",), data) or "").startswith(("mm", "GRCm")):
            species = "MOUSE"
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        bed6_regions = None
        regions = bedutils.merge_overlaps(dd.get_coverage(data), data) or dd.get_variant_regions_merged(data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(export=export, qualimap=qualimap, fixed_bam_fname=fixed_bam_fname,
                          results_dir=results_dir, num_cores=num_cores, max_mem=max_mem,
                          options=options, species=species, bed6_regions=bed6_regions),
               "Qualimap: %s" % dd.get_sample_name(data))

    # return _parse_qualimap_metrics(report_file, data)
    return dict()
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Pipes samtools mpileup output through VarScan ``mpileup2cns`` and the
    header/ambiguity fixing steps into an uncompressed VCF. If ``out_file``
    ends in ``.gz`` the result is bgzipped and indexed afterwards.
    """
    config = items[0]["config"]
    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    max_read_depth = "1000"
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
    # we use ifne from moreutils to ensure we process only on files with input, skipping otherwise
    # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html
    template = ("{export} {mpileup} | {remove_zerocoverage} | "
                "ifne varscan {jvm_opts} mpileup2cns {opts} "
                "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants | "
                """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """
                "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | "
                "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}")
    with tx_tmpdir(items[0]) as tmp_dir:
        jvm_opts = _get_jvm_opts(config, tmp_dir)
        opts = " ".join(_varscan_options_from_config(config))
        # min_allele_fraction is configured as a percentage
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
        fix_ambig_ref = vcfutils.fix_ambiguous_cl()
        fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
        py_cl = os.path.join(os.path.dirname(sys.executable), "py")
        export = utils.local_path_export()
        full_cmd = template.format(export=export, mpileup=mpileup,
                                   remove_zerocoverage=remove_zerocoverage,
                                   jvm_opts=jvm_opts, opts=opts, sample_list=sample_list,
                                   min_af=min_af, py_cl=py_cl, ref_file=ref_file,
                                   fix_ambig_ref=fix_ambig_ref, fix_ambig_alt=fix_ambig_alt,
                                   out_file=out_file)
        do.run(full_cmd, "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
def run(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with
    large files, unless the analysis type is "standard" or "smallrna-seq".
    FastQC runs in a temporary directory; the HTML report, a copy of
    ``fastqc_data.txt`` with the input file name replaced by the sample name
    and, when present, the zip archive are then moved into ``fastqc_out``.

    Returns the summary produced by ``FastQCParser``.
    Raises ValueError when FastQC produced no HTML report.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        if data.get("analysis", "").lower() not in ["standard", "smallrna-seq"]:
            ds_file = bam.downsample(bam_file, data, 1e7, work_dir=work_dir)
        else:
            ds_file = None
        if ds_file is not None:
            bam_file = ds_file
        frmt = "bam" if bam_file.endswith("bam") else "fastq"
        fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0]
        fastqc_clean_name = dd.get_sample_name(data)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                args = [config_utils.get_program("fastqc", data["config"]),
                        "-d", tx_tmp_dir,
                        "-t", str(num_cores), "--extract", "-o", tx_tmp_dir, "-f", frmt, bam_file]
                cl = "%s %s %s" % (utils.java_freetype_fix(), utils.local_path_export(),
                                   " ".join([str(x) for x in args]))
                do.run(cl, "FastQC: %s" % dd.get_sample_name(data))
                tx_fastqc_out = os.path.join(tx_tmp_dir, "%s_fastqc" % fastqc_name)
                tx_combo_file = os.path.join(tx_tmp_dir, "%s_fastqc.html" % fastqc_name)
                if not os.path.exists(sentry_file) and os.path.exists(tx_combo_file):
                    utils.safe_makedir(fastqc_out)
                    # Use sample name for reports instead of bam file name
                    raw_data = os.path.join(tx_fastqc_out, "fastqc_data.txt")
                    renamed_data = os.path.join(tx_fastqc_out, "_fastqc_data.txt")
                    input_name = os.path.basename(bam_file)
                    with open(raw_data, 'r') as in_handle, open(renamed_data, 'w') as out_handle:
                        for line in in_handle:
                            out_handle.write(line.replace(input_name, fastqc_clean_name))
                    shutil.move(renamed_data, os.path.join(fastqc_out, 'fastqc_data.txt'))
                    shutil.move(tx_combo_file, sentry_file)
                    tx_zip = "%s.zip" % tx_fastqc_out
                    if os.path.exists(tx_zip):
                        shutil.move(tx_zip, os.path.join(fastqc_out, "%s.zip" % fastqc_clean_name))
                elif not os.path.exists(sentry_file):
                    raise ValueError("FastQC failed to produce output HTML file: %s" % os.listdir(tx_tmp_dir))
    logger.info("Produced HTML report %s" % sentry_file)
    parser = FastQCParser(fastqc_out, dd.get_sample_name(data))
    stats = parser.get_fastqc_summary()
    parser.save_sections_into_file()
    return stats
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, data, max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED.

    An empty input file is copied to ``out_file`` unchanged. Otherwise the
    closest gene per interval is looked up with ``bedtools closest``, gene
    names further away than ``max_distance`` are replaced by ``.`` and the
    result is merged with ``bedtools merge``.
    """
    try:
        input_rec = next(iter(pybedtools.BedTool(in_file)))
    except StopIteration:  # empty file
        utils.copy_plus(in_file, out_file)
        return
    n_fields = len(input_rec.fields)
    work_dir = os.path.dirname(out_file)
    # keep everything after standard chrom/start/end, 1-based
    extra_fields = list(range(4, n_fields + 1))
    # keep the new gene annotation
    gene_index = n_fields + 4
    extra_fields.append(gene_index)
    columns = ",".join([str(x) for x in extra_fields])
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s || $NF < -%s) $%s = "."} {print}'""" %
                       (max_distance, max_distance, gene_index))
    # Result is not referenced by the command below; call kept as in the original.
    sort_cmd = bedutils.get_sort_cmd(work_dir)
    cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
    # Ensure gene transcripts match reference genome
    gene_base = utils.splitext_plus(os.path.basename(gene_file))[0]
    ready_gene_file = os.path.join(work_dir, "%s-genomeonly.bed" % (gene_base))
    ready_gene_file = bedutils.subset_to_genome(gene_file, ready_gene_file, data)
    exports = "export TMPDIR=%s && %s" % (work_dir, utils.local_path_export())
    bcbio_py = sys.executable
    gsort = config_utils.get_program("gsort", data)
    template = ("{exports}{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
                "{bcbio_py} -c 'from bcbio.variation import bedutils; bedutils.remove_bad()' | "
                "{gsort} - {fai_file} | "
                "bedtools closest -g {fai_file} "
                "-D ref -t first -a - -b <({gsort} {ready_gene_file} {fai_file}) | "
                "{distance_filter} | cut -f 1-{max_column} | "
                "bedtools merge -i - -c {columns} -o {ops} -delim ',' -d -10 > {out_file}")
    do.run(template.format(exports=exports, cat_cmd=cat_cmd, in_file=in_file, bcbio_py=bcbio_py,
                           gsort=gsort, fai_file=fai_file, ready_gene_file=ready_gene_file,
                           distance_filter=distance_filter, max_column=max_column,
                           columns=columns, ops=ops, out_file=out_file),
           "Annotate BED file with gene info")
def summary(*samples):
    """Summarize all quality metrics together with MultiQC.

    Runs multiqc (when the report is missing) into a temporary directory and
    moves the report and data directory under ``<work_dir>/qc/multiqc``. If
    the report exists, the first grouped sample receives the report and the
    globbed secondary files under ``["summary"]["multiqc"]``.

    Returns a list holding one single-element list per grouped sample.
    """
    samples = utils.unpack_worlds(samples)
    first = samples[0]
    work_dir = dd.get_work_dir(first)
    multiqc = config_utils.get_program("multiqc", first["config"])
    if not multiqc:
        # Only logged: processing continues as in the original implementation.
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(samples, out_dir, tx_out)
            in_files += _merge_metrics(samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    tmp_dir = dd.get_tmp_dir(samples[0])
                    export_tmp = ("export TMPDIR=%s &&" % tmp_dir) if tmp_dir else ""
                    path_export = utils.local_path_export()
                    template = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} -o {tx_out}"
                    do.run(template.format(path_export=path_export, export_tmp=export_tmp,
                                           multiqc=multiqc, input_list_file=input_list_file,
                                           tx_out=tx_out),
                           "Run multiqc")
                    tx_report = os.path.join(tx_out, "multiqc_report.html")
                    if utils.file_exists(tx_report):
                        shutil.move(tx_report, out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    # Glob patterns (relative to out_dir) of files shipped with the report,
    # in the order they are collected.
    secondary_patterns = [
        ("multiqc_data", "*.txt"),
        ("report", "*", "*.bed"),
        ("report", "*", "*.txt"),
        ("report", "*", "*.tsv"),
        ("report", "*", "*.yaml"),
        ("report", "*.R*"),
    ]
    out = []
    for i, data in enumerate(_group_by_samplename(samples)):
        # Only the first grouped sample carries the MultiQC outputs.
        if i == 0 and utils.file_exists(out_file):
            data_files = []
            for parts in secondary_patterns:
                data_files += glob.glob(os.path.join(out_dir, *parts))
            data_files.append(file_list)
            if "summary" not in data:
                data["summary"] = {}
            data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
            file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
            if file_list_final:
                data["summary"]["multiqc"]["secondary"].append(file_list_final)
        out.append([data])
    return out
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, single_end=None, library="non-strand-specific"):
    """Create the command line for ``qualimap rnaseq``.

    Memory is taken from the "qualimap" resources (default "2G") and adjusted
    by the configured number of cores. ``single_end`` is accepted but not
    used when building the command.

    Returns the command as a single string.
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), num_cores)
    export = "%s%s" % (utils.java_freetype_fix(), utils.local_path_export())
    template = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
                "-a proportional -bam {bam_file} -p {library} "
                "-gtf {gtf_file} --java-mem-size={max_mem}")
    return template.format(export=export, qualimap=qualimap, out_dir=out_dir,
                           bam_file=bam_file, library=library, gtf_file=gtf_file,
                           max_mem=max_mem)
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, single_end=None, library="non-strand-specific"):
    """Create the command line for ``qualimap rnaseq``.

    Memory is taken from the "qualimap" resources (default "2G") and adjusted
    by the configured number of cores. ``single_end`` is accepted but not
    used when building the command.

    Returns the command as a single string.
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), num_cores)
    export = utils.local_path_export()
    pieces = ["unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} ",
              "-a proportional -bam {bam_file} -p {library} ",
              "-gtf {gtf_file} --java-mem-size={max_mem}"]
    return "".join(pieces).format(export=export, qualimap=qualimap, out_dir=out_dir,
                                  bam_file=bam_file, library=library, gtf_file=gtf_file,
                                  max_mem=max_mem)
def _run_tool(cmd, use_container=True, work_dir=None, log_file=None):
    """Run with injection of bcbio path.

    Place at end for runs without containers to avoid overriding other
    bcbio installations.

    ``cmd`` may be a string or a list/tuple of arguments, which is joined
    with spaces. With ``log_file`` set, stdout and stderr are appended to it
    through ``tee``. When running in a container with a ``work_dir``,
    ``_chown_workdir`` is called afterwards even if the command failed.

    NOTE(review): with ``log_file`` the command ends in a pipe to ``tee``, so
    the status checked by ``check_call`` may be tee's rather than the
    tool's -- confirm whether that is intended.
    """
    if isinstance(cmd, (list, tuple)):
        cmd = " ".join(str(part) for part in cmd)
    path_prefix = utils.local_path_export(at_start=use_container)
    cmd = path_prefix + cmd
    if log_file:
        cmd = cmd + " 2>&1 | tee -a %s" % log_file
    needs_chown = use_container and work_dir
    try:
        subprocess.check_call(cmd, shell=True)
    finally:
        if needs_chown:
            _chown_workdir(work_dir)
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Build the base snpEff command line.

    JVM options come from the "snpeff" resources (default
    ``-Xms750m -Xmx3g``) and are increased by ``max(2, cores)``. Java's
    temporary directory is a ``tmp`` folder created next to ``out_file``.

    Returns the command as a single string.
    """
    config = data["config"]
    resources = config_utils.get_resources("snpeff", config)
    base_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3g"])
    # scale by cores, defaulting to 2x base usage to ensure we have enough memory
    # for single core runs to use with human genomes
    scaling = {"algorithm": {"memory_adjust": {"direction": "increase",
                                               "magnitude": max(2, dd.get_cores(data))}}}
    memory = " ".join(config_utils.adjust_opts(base_opts, scaling))
    snpeff = config_utils.get_program("snpEff", config)
    java_tmp = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    java_args = "-Djava.io.tmpdir=%s" % java_tmp
    export = utils.local_path_export()
    template = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return template.format(export=export, snpeff=snpeff, memory=memory,
                           java_args=java_args, cmd_name=cmd_name, datadir=datadir)
def _run_gridss(inputs, background, work_dir):
    """Run GRIDSS ``CallVariants`` over the input and background samples.

    The output is named after the batch (or the sample name when no batch is
    set) of the first input. GRIDSS is skipped when either the VCF or its
    ``.gz`` version already exists.

    Returns the result of ``vcfutils.bgzip_and_index`` for the output file.
    """
    lead = inputs[0]
    out_name = "%s-gridss.sv.vcf" % (dd.get_batch(lead) or dd.get_sample_name(lead))
    out_file = os.path.join(work_dir, out_name)
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(lead, out_file) as tx_out_file:
            htsjdk_opts = ["-Dsamjdk.create_index=true",
                           "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true",
                           "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(lead)
            resources = config_utils.get_resources("gridss", lead["config"])
            jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
            jvm_opts = config_utils.adjust_opts(
                jvm_opts, {"algorithm": {"memory_adjust": {"direction": "increase",
                                                           "magnitude": cores}}})
            jvm_opts = _finalize_memory(jvm_opts)
            tx_dir = os.path.dirname(tx_out_file)
            tx_ref_file = _setup_reference_files(lead, tx_dir)
            blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file)
            gridss_args = ["THREADS=%s" % cores,
                           "TMP_DIR=%s" % tx_dir,
                           "WORKING_DIR=%s" % tx_dir,
                           "OUTPUT=%s" % tx_out_file,
                           "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                           "REFERENCE_SEQUENCE=%s" % tx_ref_file,
                           "BLACKLIST=%s" % blacklist_bed]
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"] + gridss_args
            # One INPUT/INPUT_LABEL pair per sample, inputs before background.
            for data in inputs + background:
                cmd += ["INPUT=%s" % dd.get_align_bam(data),
                        "INPUT_LABEL=%s" % dd.get_sample_name(data)]
            exports = utils.local_path_export()
            cmd = exports + " ".join(cmd)
            do.run(cmd, "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, lead["config"])
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load a VCF into a GEMINI database.

    The database path defaults to the VCF path with a ``.db`` extension.
    Nothing is loaded when the database already exists. Load options depend
    on the detected GEMINI version, when one is available.

    Returns the database path, or None when the database is missing and the
    VCF has no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not utils.file_exists(gemini_db):
        if not vcfutils.vcf_has_variants(gemini_vcf):
            return None
        with file_transaction(data, gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini = config_utils.get_program("gemini", config)
            gemini_ver = None
            if "program_versions" in config.get("resources", {}):
                gemini_ver = programs.get_version("gemini", config=config)
            # Each collected option carries its own leading space.
            load_parts = []
            # Recent versions of gemini allow loading only passing variants
            if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                load_parts.append(" --passonly")
            # For small test files, skip gene table loading which takes a long time
            if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    load_parts.append(" --skip-gene-tables")
                if "/test_automated_output/" in gemini_vcf:
                    load_parts.append(" --test-mode")
            # Skip CADD or gerp-bp if neither are loaded
            if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
                gemini_dir = install.get_gemini_dir(data)
                for skip_cmd, check_file in [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]:
                    if not os.path.exists(os.path.join(gemini_dir, check_file)):
                        load_parts.append(" %s" % skip_cmd)
            # skip gerp-bp which slows down loading
            load_parts.append(" --skip-gerp-bp ")
            load_opts = "".join(load_parts)
            num_cores = config["algorithm"].get("num_cores", 1)
            tmpdir = os.path.dirname(tx_gemini_db)
            eanns = _get_effects_flag(data)
            # Apply custom resource specifications, allowing use of alternative annotation_dir
            resources = config_utils.get_resources("gemini", config)
            gemini_opts = " ".join([str(x) for x in resources["options"]]) if resources.get("options") else ""
            exports = utils.local_path_export()
            template = ("{exports} {gemini} {gemini_opts} load {load_opts} "
                        "-v {gemini_vcf} {eanns} --cores {num_cores} "
                        "--tempdir {tmpdir} {tx_gemini_db}")
            cmd = template.format(exports=exports, gemini=gemini, gemini_opts=gemini_opts,
                                  load_opts=load_opts, gemini_vcf=gemini_vcf, eanns=eanns,
                                  num_cores=num_cores, tmpdir=tmpdir, tx_gemini_db=tx_gemini_db)
            do.run(cmd, "Create gemini database for %s" % gemini_vcf, data)
            if ped_file:
                cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
                do.run(cmd, "Add PED file to gemini database", data)
    return gemini_db
def _call_cnv(items, work_dir, read_mapping_file, coverage_file, control_sample_names):
    """Run the Seq2C ``cov2lr.pl | lr2gene.pl`` pipeline to call CNVs.

    When control sample names are given they are passed to ``cov2lr.pl`` as a
    colon separated ``-c`` list and ``lr2gene.pl`` is run with ``-c``.
    Nothing is run when the combined calls file already exists.

    Returns the path ``<work_dir>/calls_combined.tsv``.
    """
    output_fpath = os.path.join(work_dir, "calls_combined.tsv")
    cov2lr = "cov2lr.pl"
    lr2gene = "lr2gene.pl"
    control_opt = ""
    lr2gene_opt = ""
    if control_sample_names:
        control_opt = "-c " + ":".join(control_sample_names)
        lr2gene_opt = "-c"
    if not utils.file_exists(output_fpath):
        with file_transaction(items[0], output_fpath) as tx_out_file:
            export = utils.local_path_export()
            # Write to the transactional file rather than the final path, so
            # the output goes through file_transaction instead of bypassing it.
            cmd = ("{export} {cov2lr} -a {control_opt} {read_mapping_file} {coverage_file} | " +
                   "{lr2gene} {lr2gene_opt} > {tx_out_file}")
            do.run(cmd.format(export=export, cov2lr=cov2lr, control_opt=control_opt,
                              read_mapping_file=read_mapping_file, coverage_file=coverage_file,
                              lr2gene=lr2gene, lr2gene_opt=lr2gene_opt, tx_out_file=tx_out_file),
                   "Seq2C CNV calling")
    return output_fpath
def _convert_fastq(srafn, outdir, single=False): "convert sra to fastq" cmd = "fastq-dump --split-files --gzip {srafn}" cmd = "%s %s" % (utils.local_path_export(), cmd) sraid = os.path.basename(utils.splitext_plus(srafn)[0]) if not srafn: return None if not single: out_file = [os.path.join(outdir, "%s_1.fastq.gz" % sraid), os.path.join(outdir, "%s_2.fastq.gz" % sraid)] if not utils.file_exists(out_file[0]): with utils.chdir(outdir): do.run(cmd.format(**locals()), "Covert to fastq %s" % sraid) if not utils.file_exists(out_file[0]): raise IOError("SRA %s didn't convert, something happened." % srafn) return [out for out in out_file if utils.file_exists(out)] else: raise ValueError("Not supported single-end sra samples for now.")
def create_gemini_db_orig(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Original GEMINI specific data loader, only works with hg19/GRCh37.

    The database path defaults to the VCF path with a ``.db`` extension.
    Nothing is loaded when the database already exists.

    Returns the database path, or None when the database is missing and the
    VCF has no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not utils.file_exists(gemini_db):
        if not vcfutils.vcf_has_variants(gemini_vcf):
            return None
        with file_transaction(data, gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini = config_utils.get_program("gemini", config)
            # Each collected option carries its own leading space.
            load_parts = []
            if "gemini_allvariants" not in dd.get_tools_on(data):
                load_parts.append(" --passonly")
            # For small test files, skip gene table loading which takes a long time
            if _is_small_vcf(gemini_vcf):
                load_parts.append(" --skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                load_parts.append(" --test-mode")
            # Skip CADD or gerp-bp if neither are loaded
            gemini_dir = install.get_gemini_dir(data)
            for skip_cmd, check_file in [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]:
                if not os.path.exists(os.path.join(gemini_dir, check_file)):
                    load_parts.append(" %s" % skip_cmd)
            # skip gerp-bp which slows down loading
            load_parts.append(" --skip-gerp-bp ")
            load_opts = "".join(load_parts)
            num_cores = config["algorithm"].get("num_cores", 1)
            tmpdir = os.path.dirname(tx_gemini_db)
            eanns = _get_effects_flag(data)
            # Apply custom resource specifications, allowing use of alternative annotation_dir
            resources = config_utils.get_resources("gemini", config)
            gemini_opts = " ".join([str(x) for x in resources["options"]]) if resources.get("options") else ""
            exports = utils.local_path_export()
            template = ("{exports} {gemini} {gemini_opts} load {load_opts} "
                        "-v {gemini_vcf} {eanns} --cores {num_cores} "
                        "--tempdir {tmpdir} {tx_gemini_db}")
            cmd = template.format(exports=exports, gemini=gemini, gemini_opts=gemini_opts,
                                  load_opts=load_opts, gemini_vcf=gemini_vcf, eanns=eanns,
                                  num_cores=num_cores, tmpdir=tmpdir, tx_gemini_db=tx_gemini_db)
            do.run(cmd, "Create gemini database for %s" % gemini_vcf, data)
            if ped_file:
                cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
                do.run(cmd, "Add PED file to gemini database", data)
    return gemini_db
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Streams samtools mpileup output into VarScan ``mpileup2cns`` followed by
    header and ambiguous-base fixes, writing an uncompressed VCF. The VCF is
    bgzipped and indexed when the requested ``out_file`` ends in ``.gz``.
    """
    config = items[0]["config"]
    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    max_read_depth = "1000"
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
    # we use ifne from moreutils to ensure we process only on files with input, skipping otherwise
    # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html
    with tx_tmpdir(items[0]) as tmp_dir:
        jvm_opts = _get_jvm_opts(config, tmp_dir)
        opts = " ".join(_varscan_options_from_config(config))
        # min_allele_fraction is configured as a percentage
        min_af_pct = utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)
        min_af = float(min_af_pct) / 100.0
        fix_ambig_ref = vcfutils.fix_ambiguous_cl()
        fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
        py_cl = os.path.join(os.path.dirname(sys.executable), "py")
        export = utils.local_path_export()
        stages = ["{export} {mpileup} | {remove_zerocoverage} | ",
                  "ifne varscan {jvm_opts} mpileup2cns {opts} ",
                  "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants | ",
                  """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """,
                  "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | ",
                  "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}"]
        values = {"export": export, "mpileup": mpileup,
                  "remove_zerocoverage": remove_zerocoverage, "jvm_opts": jvm_opts,
                  "opts": opts, "sample_list": sample_list, "min_af": min_af,
                  "py_cl": py_cl, "ref_file": ref_file, "fix_ambig_ref": fix_ambig_ref,
                  "fix_ambig_alt": fix_ambig_alt, "out_file": out_file}
        do.run("".join(stages).format(**values), "Varscan", None,
               [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
def _convert_fastq(srafn, outdir, single=False): "convert sra to fastq" cmd = "fastq-dump --split-files --gzip {srafn}" cmd = "%s %s" % (utils.local_path_export(), cmd) sraid = os.path.basename(utils.splitext_plus(srafn)[0]) if not srafn: return None if not single: out_file = [ os.path.join(outdir, "%s_1.fastq.gz" % sraid), os.path.join(outdir, "%s_2.fastq.gz" % sraid) ] if not utils.file_exists(out_file[0]): with utils.chdir(outdir): do.run(cmd.format(**locals()), "Covert to fastq %s" % sraid) if not utils.file_exists(out_file[0]): raise IOError("SRA %s didn't convert, something happened." % srafn) return [out for out in out_file if utils.file_exists(out)] else: raise ValueError("Not supported single-end sra samples for now.")
def fastq_size_output(fastq_file, tocheck):
    """Yield ``(count, size)`` pairs describing sampled read lengths.

    Takes the first 8,000,000 lines of the (optionally gzipped) fastq file,
    samples ``tocheck`` reads with ``seqtk sample -s42`` and tallies the
    sequence lengths with ``sort | uniq -c``. Both yielded values are the
    whitespace separated strings from that tally.

    Raises IOError when the pipeline produces no output.
    """
    head_count = 8000000
    fastq_file = objectstore.cl_input(fastq_file)
    reader = "zcat {fastq_file}" if fastq_file.endswith(".gz") else "cat {fastq_file}"
    template = (utils.local_path_export() + reader + " | head -n {head_count} | "
                "seqtk sample -s42 - {tocheck} | "
                "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")
    full_cmd = template.format(fastq_file=fastq_file, head_count=head_count, tocheck=tocheck)

    def fix_signal():
        """Avoid spurious 'cat: write error: Broken pipe' message due to head command.

        Work around from:
        https://bitbucket.org/brodie/cram/issues/16/broken-pipe-when-heading-certain-output
        """
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    raw_out = subprocess.check_output(full_cmd, shell=True,
                                      executable="/bin/bash", preexec_fn=fix_signal)
    count_out = raw_out.decode()
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" % full_cmd)
    for line in count_out.strip().split("\n"):
        count, size = line.strip().split()
        yield count, size
def _can_use_mem(fastq_file, data, read_min_size=None): """bwa-mem handle longer (> 70bp) reads with improved piping. Randomly samples 5000 reads from the first two million. Default to no piping if more than 75% of the sampled reads are small. If we've previously calculated minimum read sizes (from rtg SDF output) we can skip the formal check. """ min_size = 70 if read_min_size and read_min_size >= min_size: return True thresh = 0.75 head_count = 8000000 tocheck = 5000 fastq_file = objectstore.cl_input(fastq_file) gzip_cmd = "zcat {fastq_file}" if fastq_file.endswith( ".gz") else "cat {fastq_file}" cmd = (utils.local_path_export() + gzip_cmd + " | head -n {head_count} | " "seqtk sample -s42 - {tocheck} | " "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c") def fix_signal(): """Avoid spurious 'cat: write error: Broken pipe' message due to head command. Work around from: https://bitbucket.org/brodie/cram/issues/16/broken-pipe-when-heading-certain-output """ signal.signal(signal.SIGPIPE, signal.SIG_DFL) count_out = subprocess.check_output(cmd.format(**locals()), shell=True, executable="/bin/bash", preexec_fn=fix_signal) if not count_out.strip(): raise IOError("Failed to check fastq file sizes with: %s" % cmd.format(**locals())) shorter = 0 for count, size in (l.strip().split() for l in count_out.strip().split("\n")): if int(size) < min_size: shorter += int(count) return (float(shorter) / float(tocheck)) <= thresh
def _call_cnv(items, work_dir, read_mapping_file, coverage_file, control_sample_names):
    """Run the Seq2C ``cov2lr.pl | lr2gene.pl`` pipeline to call CNVs.

    Extra options for both scripts come from ``_get_seq2c_options``. When
    control sample names are given they are added to the ``cov2lr.pl``
    options as a colon separated ``-c`` list, and ``-c`` is added to the
    ``lr2gene.pl`` options if not already present. The pipeline runs from
    ``work_dir`` and is skipped when the combined calls file exists.

    Returns the path ``<work_dir>/calls_combined.tsv``.
    """
    output_fpath = os.path.join(work_dir, "calls_combined.tsv")
    cov2lr = "cov2lr.pl"
    lr2gene = "lr2gene.pl"
    cov2lr_opts, lr2gene_opts = _get_seq2c_options(items[0])
    if control_sample_names:
        cov2lr_opts += ["-c", ":".join(control_sample_names)]
        if "-c" not in lr2gene_opts:
            lr2gene_opts += ["-c"]
    cov2lr_opt = " ".join(cov2lr_opts)
    lr2gene_opt = " ".join(lr2gene_opts)
    if utils.file_exists(output_fpath):
        return output_fpath
    with file_transaction(items[0], output_fpath) as tx_out_file:
        with utils.chdir(work_dir):
            export = utils.local_path_export()
            template = ("{export} {cov2lr} -a {cov2lr_opt} {read_mapping_file} {coverage_file} | " +
                        "{lr2gene} {lr2gene_opt} > {tx_out_file}")
            do.run(template.format(export=export, cov2lr=cov2lr, cov2lr_opt=cov2lr_opt,
                                   read_mapping_file=read_mapping_file,
                                   coverage_file=coverage_file, lr2gene=lr2gene,
                                   lr2gene_opt=lr2gene_opt, tx_out_file=tx_out_file),
                   "Seq2C CNV calling")
    return output_fpath
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, data, max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED.

    Builds a single shell pipeline that strips track/browser/comment lines
    from ``in_file``, sorts with gsort against ``fai_file``, attaches a gene
    with ``bedtools closest`` and collapses rows with ``bedtools merge``,
    writing to ``out_file``. Gene names are replaced by '.' when the reported
    distance exceeds ``max_distance`` in either direction.

    An input with no records is copied to ``out_file`` unchanged.
    Returns None.
    """
    try:
        first_rec = next(iter(pybedtools.BedTool(in_file)))
    except StopIteration:  # empty file
        utils.copy_plus(in_file, out_file)
        return
    num_fields = len(first_rec.fields)
    out_dir = os.path.dirname(out_file)

    # column holding the new gene annotation
    gene_index = num_fields + 4
    # keep everything after standard chrom/start/end (1-based) plus the gene column
    extra_fields = list(range(4, num_fields + 1)) + [gene_index]
    columns = ",".join(str(x) for x in extra_fields)
    max_column = max(extra_fields) + 1
    ops = ",".join("distinct" for _ in extra_fields)

    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s || $NF < -%s) $%s = "."} {print}'""" %
                       (max_distance, max_distance, gene_index))
    # Not referenced by the command template below; the call itself is retained.
    sort_cmd = bedutils.get_sort_cmd(out_dir)
    if in_file.endswith(".gz"):
        cat_cmd = "zcat"
    else:
        cat_cmd = "cat"

    # Ensure gene transcripts match reference genome
    gene_base = utils.splitext_plus(os.path.basename(gene_file))[0]
    ready_gene_file = bedutils.subset_to_genome(
        gene_file, os.path.join(out_dir, "%s-genomeonly.bed" % (gene_base)), data)
    exports = "export TMPDIR=%s && %s" % (out_dir, utils.local_path_export())
    gsort = config_utils.get_program("gsort", data)

    template = ("{exports}{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
                "{bcbio_py} -c 'from bcbio.variation import bedutils; bedutils.remove_bad()' | "
                "{gsort} - {fai_file} | "
                "bedtools closest -g {fai_file} "
                "-D ref -t first -a - -b <({gsort} {ready_gene_file} {fai_file}) | "
                "{distance_filter} | cut -f 1-{max_column} | "
                "bedtools merge -i - -c {columns} -o {ops} -delim ',' -d -10 > {out_file}")
    annotate_cmd = template.format(exports=exports,
                                   cat_cmd=cat_cmd,
                                   in_file=in_file,
                                   bcbio_py=sys.executable,
                                   gsort=gsort,
                                   fai_file=fai_file,
                                   ready_gene_file=ready_gene_file,
                                   distance_filter=distance_filter,
                                   max_column=max_column,
                                   columns=columns,
                                   ops=ops,
                                   out_file=out_file)
    do.run(annotate_cmd, "Annotate BED file with gene info")
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Nothing is run when the final TSV already exists. Otherwise:

    1. ``bcbio-prioritize known`` is run with ``vcf_file`` as input and
       ``prioritize_by`` as the ``-k`` argument (skipped if its output exists).
    2. ``post_prior_fn``, when supplied, is applied to the prioritized VCF.
    3. ``simple_sv_annotation.py`` writes a simplified VCF (skipped if it
       exists), which is then sorted by reference, bgzipped and indexed.
    4. ``vawk`` converts the simplified VCF to the tab delimited output.

    Returns the path to ``<sample>-<caller>-prioritize.tsv`` in ``work_dir``.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if utils.file_exists(out_file):
        return out_file

    priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(priority_vcf):
        with file_transaction(data, priority_vcf) as tx_out_file:
            resources = config_utils.get_resources("bcbio_prioritize", data["config"])
            jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]))
            prior_tmpl = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                          " -k {prioritize_by}")
            prior_cmd = prior_tmpl.format(export=utils.local_path_export(),
                                          jvm_opts=jvm_opts,
                                          vcf_file=vcf_file,
                                          tx_out_file=tx_out_file,
                                          prioritize_by=prioritize_by)
            do.run(prior_cmd, "Prioritize: select in known regions of interest")
    if post_prior_fn:
        priority_vcf = post_prior_fn(priority_vcf, work_dir, data)

    simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
    if not utils.file_exists(simple_vcf):
        with file_transaction(data, simple_vcf) as tx_out_file:
            ann_opt = ""
            transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
            if transcript_file:
                transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                ann_opt = "--gene_bed %s" % transcript_file
            ann_tmpl = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            ann_cmd = ann_tmpl.format(ann_opt=ann_opt,
                                      priority_vcf=priority_vcf,
                                      tx_out_file=tx_out_file)
            do.run(ann_cmd, "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])

    with file_transaction(data, out_file) as tx_out_file:
        # Doubled braces are literal awk braces after str.format.
        # NOTE(review): the ternary literal below contains a line break inside
        # the string; it is kept exactly as found -- confirm this is intended.
        tsv_tmpl = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                    """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                    "print CALLER,SNAME,$1,$2,I$END,"
                    """I$SVTYPE=="BND" ? 
I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                    "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                    "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
        tsv_cmd = tsv_tmpl.format(simple_vcf=simple_vcf,
                                  sample=sample,
                                  caller=caller,
                                  tx_out_file=tx_out_file)
        do.run(tsv_cmd, "Prioritize: convert to tab delimited")
    return out_file
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Runs ``varscan somatic`` on normal and tumor mpileup streams, writing
    separate SNP and indel VCFs. Each VCF that exists is then passed through
    a fix-up pipeline (fix_varscan_output, ambiguous base fixes,
    vcfuniqalleles, a bcftools REJECT filter on SS, spv_freq_filter) and the
    results are combined into ``out_file``. An empty VCF is written when there
    is nothing to combine or the combined file has zero size.

    Nothing is run when ``out_file`` already exists. Returns None.

    Raises:
        ValueError: If the batch has no normal BAM.
    """
    max_read_depth = "1000"
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        raise ValueError(("Batch {} requires both tumor and normal BAM files for"
                          " VarScan cancer calling").format(affected_batch))

    if utils.file_exists(out_file):
        return
    assert out_file.endswith(".vcf.gz"), "Expect bgzipped output to VarScan"

    normal_mpileup_cl = samtools.prep_mpileup([paired.normal_bam], ref_file, config,
                                              max_read_depth,
                                              target_regions=target_regions,
                                              want_bcf=False)
    tumor_mpileup_cl = samtools.prep_mpileup([paired.tumor_bam], ref_file, config,
                                             max_read_depth,
                                             target_regions=target_regions,
                                             want_bcf=False)
    base, _ext = utils.splitext_plus(out_file)
    indel_file = base + "-indel.vcf"
    snp_file = base + "-snp.vcf"

    somatic_tmpl = ("{export} varscan {jvm_opts} somatic "
                    "<({normal_mpileup_cl} | {remove_zerocoverage}) "
                    "<({tumor_mpileup_cl} | {remove_zerocoverage}) "
                    "--output-snp {tx_snp} --output-indel {tx_indel} "
                    "--output-vcf {opts} "
                    "--min-var-freq {min_af} ")
    with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_jvm_opts(config, tmp_dir)
            opts = " ".join(_varscan_options_from_config(config))
            remove_zerocoverage = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
            export = utils.local_path_export()
            # add minimum AF: configured as a percentage, passed as a fraction
            min_af = float(utils.get_in(paired.tumor_config,
                                        ("algorithm", "min_allele_fraction"), 10)) / 100.0
            somatic_cmd = somatic_tmpl.format(export=export,
                                              jvm_opts=jvm_opts,
                                              normal_mpileup_cl=normal_mpileup_cl,
                                              tumor_mpileup_cl=tumor_mpileup_cl,
                                              remove_zerocoverage=remove_zerocoverage,
                                              tx_snp=tx_snp,
                                              tx_indel=tx_indel,
                                              opts=opts,
                                              min_af=min_af)
            do.run(somatic_cmd, "Varscan", None, None)

    fix_tmpl = ("cat {fname} | "
                "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x,"
                """ "{normal_name}", "{tumor_name}")' | """
                "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles | "
                """bcftools filter -m + -s REJECT -e "SS != '.' && SS != '2'" 2> /dev/null | """
                "{py_cl} -x 'bcbio.variation.varscan.spv_freq_filter(x, 1)' | "
                "bgzip -c > {tx_fix_file}")
    to_combine = []
    for fname in (snp_file, indel_file):
        if not utils.file_exists(fname):
            continue
        fix_file = "%s-fix.vcf.gz" % (utils.splitext_plus(fname)[0])
        with file_transaction(config, fix_file) as tx_fix_file:
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            fix_cmd = fix_tmpl.format(fname=fname,
                                      py_cl=py_cl,
                                      normal_name=paired.normal_name,
                                      tumor_name=paired.tumor_name,
                                      fix_ambig_ref=fix_ambig_ref,
                                      fix_ambig_alt=fix_ambig_alt,
                                      tx_fix_file=tx_fix_file)
            do.run(fix_cmd, "Varscan paired fix")
        to_combine.append(fix_file)

    if to_combine:
        out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                         region=target_regions)
    else:
        out_file = write_empty_vcf(out_file, config)
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    if out_file.endswith(".gz"):
        out_file = bgzip_and_index(out_file, config)
def summary(*samples):
    """Summarize all quality metrics together in a MultiQC report.

    Flattens the incoming samples, prepares inputs under ``qc/multiqc`` in
    the work directory and, when the report does not exist yet and at least
    one input file is present, runs multiqc into a transactional temporary
    directory before moving the report and its data directory into place.

    When the report exists, the report plus secondary files (per-sample
    metrics, target info, multiqc config and data, the saved data JSON and
    file list, and for CWL runs tarballs of ``inputs`` and ``report``) are
    recorded under ``samples[0]["summary"]["multiqc"]``.

    Returns:
        A list of single-item lists, one per sample after grouping by
        sample and batch.
    """
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        # Only logged: processing continues with whatever get_program returned.
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = cwlutils.unpack_tarballs([utils.deepish_copy(x) for x in samples], samples[0])
    work_samples = _summarize_inputs(work_samples, out_dir)
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    tmp_dir = dd.get_tmp_dir(samples[0])
                    export_tmp = ("export TMPDIR=%s && " % tmp_dir) if tmp_dir else ""
                    locale_export = utils.locale_export()
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join(str(x) for x in other_opts)
                    cmd = ("{path_export}{export_tmp}{locale_export} "
                           "{multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}")
                    do.run(cmd.format(path_export=path_export,
                                      export_tmp=export_tmp,
                                      locale_export=locale_export,
                                      multiqc=multiqc,
                                      input_list_file=input_list_file,
                                      other_opts=other_opts,
                                      tx_out=tx_out),
                           "Run multiqc")
                    tx_report = os.path.join(tx_out, "multiqc_report.html")
                    if utils.file_exists(tx_report):
                        shutil.move(tx_report, out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        metrics_dir = os.path.join(out_dir, "report", "metrics")
        data_files = set()
        for data in samples:
            data_files.add(os.path.join(metrics_dir, dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(metrics_dir, "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        data_files.update(glob.glob(os.path.join(out_dir, "multiqc_data", "*")))
        # Keep only files that are actually present
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}

        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json,
                                                   os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)

        # Prepare final file list and inputs for downstream usage
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)
            if any(cwlutils.is_cwl_run(d) for d in samples):
                for indir in ["inputs", "report"]:
                    tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir))
                    if not utils.file_exists(tarball):
                        with utils.chdir(out_dir):
                            cmd = ["tar", "-czvpf", tarball, indir]
                            do.run(cmd, "Compress multiqc inputs: %s" % indir)
                    samples[0]["summary"]["multiqc"]["secondary"].append(tarball)

    if any(cwlutils.is_cwl_run(d) for d in samples):
        samples = _add_versions(samples)

    return [[data] for data in samples]
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Runs ``varscan somatic`` on normal and tumor mpileup streams, writing
    separate SNP and indel VCFs. Each VCF that exists is then passed through
    a fix-up pipeline (fix_varscan_output, ambiguous base fixes,
    vcfuniqalleles, add_contig_to_header using ``ref_file``, a bcftools
    REJECT filter on SS) and the results are combined into ``out_file``. An
    empty VCF is written when there is nothing to combine or the combined
    file has zero size.

    Nothing is run when ``out_file`` already exists. Returns None.

    Raises:
        ValueError: If the batch has no normal BAM.
    """
    max_read_depth = "1000"
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        raise ValueError(("Batch {} requires both tumor and normal BAM files for"
                          " VarScan cancer calling").format(affected_batch))

    if utils.file_exists(out_file):
        return
    assert out_file.endswith(".vcf.gz"), "Expect bgzipped output to VarScan"

    normal_mpileup_cl = samtools.prep_mpileup([paired.normal_bam], ref_file, config,
                                              max_read_depth,
                                              target_regions=target_regions,
                                              want_bcf=False)
    tumor_mpileup_cl = samtools.prep_mpileup([paired.tumor_bam], ref_file, config,
                                             max_read_depth,
                                             target_regions=target_regions,
                                             want_bcf=False)
    base, _ext = utils.splitext_plus(out_file)
    indel_file = base + "-indel.vcf"
    snp_file = base + "-snp.vcf"

    somatic_tmpl = ("{export} varscan {jvm_opts} somatic "
                    "<({normal_mpileup_cl} | {remove_zerocoverage}) "
                    "<({tumor_mpileup_cl} | {remove_zerocoverage}) "
                    "--output-snp {tx_snp} --output-indel {tx_indel} "
                    "--output-vcf {opts} "
                    "--min-var-freq {min_af} ")
    with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_jvm_opts(config, tmp_dir)
            opts = " ".join(_varscan_options_from_config(config))
            remove_zerocoverage = r"{ ifne grep -v -P '\t0\t\t$' || true; }"
            export = utils.local_path_export()
            # add minimum AF: configured as a percentage, passed as a fraction
            min_af = float(utils.get_in(paired.tumor_config,
                                        ("algorithm", "min_allele_fraction"), 10)) / 100.0
            somatic_cmd = somatic_tmpl.format(export=export,
                                              jvm_opts=jvm_opts,
                                              normal_mpileup_cl=normal_mpileup_cl,
                                              tumor_mpileup_cl=tumor_mpileup_cl,
                                              remove_zerocoverage=remove_zerocoverage,
                                              tx_snp=tx_snp,
                                              tx_indel=tx_indel,
                                              opts=opts,
                                              min_af=min_af)
            do.run(somatic_cmd, "Varscan", None, None)

    fix_tmpl = ("cat {fname} | "
                "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x,"
                """ "{normal_name}", "{tumor_name}")' | """
                "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles | "
                """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """
                """bcftools filter -m + -s REJECT -e "SS != '.' && SS != '2'" 2> /dev/null | """
                "bgzip -c > {tx_fix_file}")
    to_combine = []
    for fname in (snp_file, indel_file):
        if not utils.file_exists(fname):
            continue
        fix_file = "%s-fix.vcf.gz" % (utils.splitext_plus(fname)[0])
        with file_transaction(config, fix_file) as tx_fix_file:
            fix_ambig_ref = vcfutils.fix_ambiguous_cl()
            fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            fix_cmd = fix_tmpl.format(fname=fname,
                                      py_cl=py_cl,
                                      normal_name=paired.normal_name,
                                      tumor_name=paired.tumor_name,
                                      fix_ambig_ref=fix_ambig_ref,
                                      fix_ambig_alt=fix_ambig_alt,
                                      ref_file=ref_file,
                                      tx_fix_file=tx_fix_file)
            do.run(fix_cmd, "Varscan paired fix")
        to_combine.append(fix_file)

    if to_combine:
        out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                         region=target_regions)
    else:
        out_file = write_empty_vcf(out_file, config)
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    if out_file.endswith(".gz"):
        out_file = bgzip_and_index(out_file, config)
def _run_tool(cmd):
    """Run a command through the shell, prefixed by ``utils.local_path_export()``.

    ``cmd`` may be a ready-made command string or a list/tuple of arguments;
    sequences are converted item by item with ``str`` and joined with spaces
    (no shell quoting is applied).

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    if isinstance(cmd, (list, tuple)):
        cmd = " ".join(map(str, cmd))
    full_cmd = utils.local_path_export() + cmd
    subprocess.check_call(full_cmd, shell=True)