def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file,
                                  todo="square"):
    """Run squaring or merging analysis using bcbio.variation.recall.

    A manifest named ``<out_file without extension>-inputs.txt`` is written
    with the unique, sorted variant files (followed by the unique, sorted BAM
    files when ``todo`` is "square"). ``bcbio-variation-recall <todo>`` is then
    run for ``region`` and ``out_file`` is returned.
    """
    squaring = todo == "square"
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    # adjust memory by cores but leave room for run program memory
    memory_scale = int(math.ceil(float(cores) / 5.0))
    memory_adjust = {"direction": "increase", "magnitude": memory_scale}
    jvm_opts = config_utils.adjust_opts(
        resources.get("jvm_opts", ["-Xms250m", "-Xmx2g"]),
        {"algorithm": {"memory_adjust": memory_adjust}})

    # Write unique VCFs and BAMs to input file, one path per line; BAMs are
    # only listed when squaring.
    input_file = "%s-inputs.txt" % os.path.splitext(out_file)[0]
    input_groups = [vrn_files, bam_files] if squaring else [vrn_files]
    with open(input_file, "w") as out_handle:
        for paths in input_groups:
            out_handle.write("\n".join(sorted(set(paths))) + "\n")

    jointcaller = tz.get_in(("config", "algorithm", "jointcaller"), data)
    variantcaller = jointcaller.replace("-joint", "")
    cmd_parts = ["bcbio-variation-recall", todo] + jvm_opts + broad.get_default_jvm_opts()
    cmd_parts += ["-c", cores, "-r", bamprep.region_to_gatk(region)]
    if squaring:
        cmd_parts += ["--caller", variantcaller]
    cmd_parts += [out_file, ref_file, input_file]
    bcbio_env = utils.get_bcbio_env()
    # The command is run as a single shell string; non-string arguments
    # (such as the core count) are converted on the way.
    cmd = " ".join(map(str, cmd_parts))
    description = "%s in region: %s" % (cmd, bamprep.region_to_gatk(region))
    do.run(cmd, description, env=bcbio_env)
    return out_file
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.

    Qualimap ``bamqc`` writes into ``<out_dir>/<sample name>`` unless
    ``genome_results.txt`` or the PDF report is already there. The
    ``genome_results.txt`` file is then copied into ``out_dir``.

    Returns a dict with "base" (the copied ``genome_results.txt``) and
    "secondary" (the result of ``_find_qualimap_secondary_files`` for the
    results directory).
    """
    sample_name = dd.get_sample_name(data)
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, sample_name)
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    have_results = (utils.file_exists(results_file)
                    or utils.file_exists(os.path.join(results_dir, pdf_file)))
    if not have_results:
        tools_on = tz.get_in(("config", "algorithm", "tools_on"), data, [])
        if "qualimap_full" in tools_on:
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            # Work on a downsampled BAM; keep the input BAM when downsampling
            # hands back nothing.
            bam_file = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) or bam_file
        if "PDF" in options:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            # Placeholders are filled in one go right before running, once
            # the optional species and region arguments are known.
            cmd_tmpl = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                        "--skip-duplicated --skip-dup-mode 0 "
                        "-nt {num_cores} {options}")

            genome_build = dd.get_genome_build(data)
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or genome_build.startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif genome_build.startswith(("mm", "GRCm")):
                species = "MOUSE"
            else:
                species = None
            if species is not None:
                cmd_tmpl += " -gd {species}"

            # Prefer the coverage regions; fall back to the merged variant
            # regions when coverage is unset.
            coverage = dd.get_coverage(data)
            if coverage in [None, False, "None"]:
                regions = dd.get_variant_regions_merged(data)
            else:
                regions = coverage
            bed6_regions = None
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd_tmpl += " -gff {bed6_regions}"

            bcbio_env = utils.get_bcbio_env()
            qualimap_cmd = cmd_tmpl.format(
                export=export, qualimap=qualimap, bam_file=bam_file,
                tx_results_dir=tx_results_dir, num_cores=num_cores, options=options,
                species=species, bed6_regions=bed6_regions)
            do.run(qualimap_cmd, "Qualimap: %s" % sample_name, env=bcbio_env)

            # Rewrite the reported BAM name to <sample name>.bam while the
            # results are still in the transaction directory.
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            sed_cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (sample_name, tx_results_file)
            do.run(sed_cmd, "Fix Name Qualimap for {}".format(sample_name))

    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    secondary_files = _find_qualimap_secondary_files(results_dir, base_results_file)
    return {"base": base_results_file, "secondary": secondary_files}
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Outputs are named ``<work dir>/align/<sample>/<sample>-coverage.*``.
    goleft is only re-run when the callable BED file is not up to date
    relative to ``bam_file``.

    Returns a tuple of: the depth BED path, the callable BED subset to the
    variant regions, the result of ``_extract_highdepth`` on that subset and
    the average coverage computed for the variant regions.
    """
    sample_name = dd.get_sample_name(data)
    params = {"window_size": 5000, "parallel_window_size": 1e5,
              "min": dd.get_coverage_depth_min(data), "high_multiplier": 20}
    align_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", sample_name))
    prefix = os.path.join(align_dir, "%s-coverage" % (sample_name))
    depth_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")

    if not utils.file_uptodate(callable_file, bam_file):
        cmd = ["goleft", "depth",
               "--q", "1",
               "--mincov", str(params["min"]),
               "--processes", str(dd.get_num_cores(data)),
               "--ordered"]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd.extend(["--maxmeandepth", str(int(max_depth))])
        with file_transaction(data, depth_file) as tx_depth_file:
            # goleft runs from inside the transaction directory and writes
            # every output under the transactional prefix.
            with utils.chdir(os.path.dirname(tx_depth_file)):
                tx_callable_file = tx_depth_file.replace(".depth.bed", ".callable.bed")
                tx_prefix = tx_depth_file.replace(".depth.bed", "")
                # Only the .fai index is created here; goleft is handed the
                # matching fasta name as its reference.
                bam_ref_file = "%s-bamref.fa" % utils.splitext_plus(bam_file)[0]
                bam.fai_from_bam(dd.get_ref_file(data), bam_file, bam_ref_file + ".fai", data)
                cmd.extend(["--reference", bam_ref_file])
                cmd.extend(["--prefix", tx_prefix, bam_file])
                bcbio_env = utils.get_bcbio_env()
                description = "Calculate coverage: %s" % sample_name
                do.run(cmd, description, env=bcbio_env)
                # The transaction only tracks the depth file, so the callable
                # file is moved into place by hand.
                shutil.move(tx_callable_file, callable_file)

    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    highdepth = _extract_highdepth(final_callable, data)
    return depth_file, final_callable, highdepth, variant_regions_avg_cov
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Outputs are named ``<work dir>/align/<sample>/<sample>-coverage.*``.
    goleft is only re-run when the depth BED file is not up to date relative
    to ``bam_file``. It is run over windows made from the merged variant
    regions or, when there are none, from the contigs returned by
    ``shared.get_noalt_contigs``.

    Returns a tuple of: the depth BED path, the callable BED path, the result
    of ``_extract_highdepth`` on the callable file and the average coverage
    computed for the variant regions.
    """
    sample_name = dd.get_sample_name(data)
    params = {"window_size": 5000, "parallel_window_size": 1e5,
              "min": dd.get_coverage_depth_min(data), "high_multiplier": 20}
    align_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", sample_name))
    prefix = os.path.join(align_dir, "%s-coverage" % (sample_name))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(
        data, bam_file, variant_regions, "variant_regions", file_prefix=prefix)

    if not utils.file_uptodate(out_file, bam_file):
        cmd = ["goleft", "depth",
               "--windowsize", str(params["window_size"]),
               "--q", "1",
               "--mincov", str(params["min"]),
               "--reference", dd.get_ref_file(data),
               "--processes", str(dd.get_num_cores(data)),
               "--stats", "--ordered"]

        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_window_file:
                regions_bed = variant_regions
                if not regions_bed:
                    # No variant regions: window over the whole contigs,
                    # listed in a BED file next to the transactional output.
                    regions_bed = "%s-genome.bed" % utils.splitext_plus(tx_window_file)[0]
                    with open(regions_bed, "w") as out_handle:
                        out_handle.writelines("%s\t%s\t%s\n" % (contig.name, 0, contig.size)
                                              for contig in shared.get_noalt_contigs(data))
                windows = pybedtools.BedTool().window_maker(
                    w=params["parallel_window_size"], b=pybedtools.BedTool(regions_bed))
                windows.saveas(tx_window_file)
        cmd.extend(["--bed", window_file])

        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd.extend(["--maxmeandepth", str(int(max_depth))])

        with file_transaction(data, out_file) as tx_out_file:
            # goleft runs from inside the transaction directory and writes
            # every output under the transactional prefix.
            with utils.chdir(os.path.dirname(tx_out_file)):
                tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
                tx_prefix = tx_out_file.replace(".depth.bed", "")
                cmd.extend(["--prefix", tx_prefix, bam_file])
                bcbio_env = utils.get_bcbio_env()
                description = "Calculate coverage: %s" % sample_name
                do.run(cmd, description, env=bcbio_env)
                # The transaction only tracks the depth file, so the callable
                # file is moved into place by hand.
                shutil.move(tx_callable_file, callable_file)

    highdepth = _extract_highdepth(callable_file, data)
    return out_file, callable_file, highdepth, variant_regions_avg_cov
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Outputs are named ``<work dir>/align/<sample>/<sample>-coverage.*``.
    goleft is only re-run when the depth BED file is not up to date relative
    to ``bam_file``. It is run over windows made from the merged variant
    regions or, when there are none, from the contigs returned by
    ``shared.get_noalt_contigs``.

    Returns a tuple of: the depth BED path, the callable BED path, the result
    of ``_extract_highdepth`` on the callable file and the average coverage
    computed for the variant regions.
    """
    sample_name = dd.get_sample_name(data)
    params = {"window_size": 5000, "parallel_window_size": 1e5,
              "min": dd.get_coverage_depth_min(data), "high_multiplier": 20}
    sample_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", sample_name))
    prefix = os.path.join(sample_dir, "%s-coverage" % (sample_name))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")

    if not utils.file_uptodate(out_file, bam_file):
        cmd = ["goleft", "depth",
               "--windowsize", str(params["window_size"]),
               "--q", "1",
               "--mincov", str(params["min"]),
               "--reference", dd.get_ref_file(data),
               "--processes", str(dd.get_num_cores(data)),
               "--stats", "--ordered"]

        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_window_file:
                to_window = variant_regions
                if not to_window:
                    # No variant regions: window over the whole contigs,
                    # listed in a BED file next to the transactional output.
                    to_window = "%s-genome.bed" % utils.splitext_plus(tx_window_file)[0]
                    with open(to_window, "w") as out_handle:
                        out_handle.writelines("%s\t%s\t%s\n" % (contig.name, 0, contig.size)
                                              for contig in shared.get_noalt_contigs(data))
                windows = pybedtools.BedTool().window_maker(
                    w=params["parallel_window_size"], b=pybedtools.BedTool(to_window))
                windows.saveas(tx_window_file)
        cmd.extend(["--bed", window_file])

        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd.extend(["--maxmeandepth", str(int(max_depth))])

        with file_transaction(data, out_file) as tx_out_file:
            # goleft runs from inside the transaction directory and writes
            # every output under the transactional prefix.
            with utils.chdir(os.path.dirname(tx_out_file)):
                tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
                tx_prefix = tx_out_file.replace(".depth.bed", "")
                cmd.extend(["--prefix", tx_prefix, bam_file])
                bcbio_env = utils.get_bcbio_env()
                description = "Calculate coverage: %s" % sample_name
                do.run(cmd, description, env=bcbio_env)
                # The transaction only tracks the depth file, so the callable
                # file is moved into place by hand.
                shutil.move(tx_callable_file, callable_file)

    highdepth = _extract_highdepth(callable_file, data)
    return out_file, callable_file, highdepth, variant_regions_avg_cov
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.

    Qualimap ``bamqc`` writes into ``<out_dir>/<sample name>`` unless
    ``genome_results.txt`` or the PDF report is already there.

    Returns an empty dict: parsing of the Qualimap report is currently
    disabled, so no metrics are handed back to the caller.
    """
    sample_name = dd.get_sample_name(data)
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, sample_name)
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(
            os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            # Work on a downsampled BAM; keep the input BAM when downsampling
            # hands back nothing.
            bam_file = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) or bam_file
        if "PDF" in options:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            export = utils.local_path_export()
            # Placeholders are filled in one go right before running, once
            # the optional species and region arguments are known.
            cmd_tmpl = (
                "unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                "--skip-duplicated --skip-dup-mode 0 "
                "-nt {num_cores} --java-mem-size={max_mem} {options}")

            # get_in needs a *sequence* of keys: a bare string is walked
            # character by character, never matches and always yields the
            # default, which silently disabled mouse detection.
            genome_build = tz.get_in(("genome_build",), data, "") or ""
            species = None
            if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
                species = "HUMAN"
            elif genome_build.startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species is not None:
                cmd_tmpl += " -gd {species}"

            regions = (bedutils.merge_overlaps(dd.get_coverage(data), data)
                       or dd.get_variant_regions_merged(data))
            bed6_regions = None
            if regions:
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd_tmpl += " -gff {bed6_regions}"

            bcbio_env = utils.get_bcbio_env()
            qualimap_cmd = cmd_tmpl.format(
                export=export, qualimap=qualimap, bam_file=bam_file,
                tx_results_dir=tx_results_dir, num_cores=num_cores, max_mem=max_mem,
                options=options, species=species, bed6_regions=bed6_regions)
            do.run(qualimap_cmd, "Qualimap: %s" % sample_name, env=bcbio_env)

            # Rewrite the reported BAM name to <sample name>.bam on the file
            # in the transaction directory: the final results directory is
            # only populated when the transaction completes, so it receives
            # the already corrected file.
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            sed_cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (sample_name, tx_results_file)
            do.run(sed_cmd, "Fix Name Qualimap for {}".format(sample_name))

    # NOTE: metrics used to be parsed from <results_dir>/qualimapReport.html
    # via _parse_qualimap_metrics; that step is disabled, hence the empty dict.
    return {}