def clean_inputs(data): """Clean BED input files to avoid overlapping segments that cause downstream issues. Per-merges inputs to avoid needing to call multiple times during later parallel steps. """ if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")): data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data) clean_vr = clean_file(dd.get_variant_regions(data), data) merged_vr = merge_overlaps(clean_vr, data) data["config"]["algorithm"]["variant_regions"] = clean_vr data["config"]["algorithm"]["variant_regions_merged"] = merged_vr if dd.get_coverage(data): if not utils.get_in(data, ("config", "algorithm", "coverage_orig")): data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data) clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True) merged_cov_bed = merge_overlaps(clean_cov_bed, data) data["config"]["algorithm"]["coverage"] = clean_cov_bed data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed if 'seq2c' in get_svcallers(data): seq2c_ready_bed = prep_seq2c_bed(data) if not seq2c_ready_bed: logger.warning("Can't run Seq2C without a svregions or variant_regions BED file") else: data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed return data
def run(bam_file, data, out_dir): """Run qualimap to assess alignment quality metrics. """ # Qualimap results should be saved to a directory named after sample. # MultiQC (for parsing additional data) picks the sample name after the dir as follows: # <sample name>/raw_data_qualimapReport/insert_size_histogram.txt results_dir = os.path.join(out_dir, dd.get_sample_name(data)) resources = config_utils.get_resources("qualimap", data["config"]) options = " ".join(resources.get("options", "")) results_file = os.path.join(results_dir, "genome_results.txt") report_file = os.path.join(results_dir, "qualimapReport.html") utils.safe_makedir(results_dir) pdf_file = "qualimapReport.pdf" if not utils.file_exists(results_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)): if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []): logger.info("Full qualimap analysis for %s may be slow." % bam_file) ds_bam = bam_file else: ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) bam_file = ds_bam if ds_bam else bam_file if options.find("PDF") > -1: options = "%s -outfile %s" % (options, pdf_file) num_cores = data["config"]["algorithm"].get("num_cores", 1) qualimap = config_utils.get_program("qualimap", data["config"]) max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores) with file_transaction(data, results_dir) as tx_results_dir: utils.safe_makedir(tx_results_dir) export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % ( utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir) cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} " "--skip-duplicated --skip-dup-mode 0 " "-nt {num_cores} {options}") species = None if (tz.get_in(("genome_resources", "aliases", "human"), data, "") or dd.get_genome_build(data).startswith(("hg", "GRCh"))): species = "HUMAN" elif dd.get_genome_build(data).startswith(("mm", "GRCm")): species = "MOUSE" if species in ["HUMAN", "MOUSE"]: cmd += " -gd {species}" regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [None, False, "None"] else dd.get_variant_regions_merged(data)) if regions: regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data) bed6_regions = _bed_to_bed6(regions, out_dir) cmd += " -gff {bed6_regions}" bcbio_env = utils.get_bcbio_env() do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env) tx_results_file = os.path.join(tx_results_dir, "genome_results.txt") cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file) do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data))) # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir): base_results_file = os.path.join(out_dir, os.path.basename(results_file)) shutil.copyfile(results_file, base_results_file) return {"base": base_results_file, "secondary": _find_qualimap_secondary_files(results_dir, base_results_file)}
def _run_coverage_qc(bam_file, data, out_dir): """Run coverage QC analysis""" out = dict() total_reads = sambamba.number_of_reads(data, bam_file) out['Total_reads'] = total_reads mapped = sambamba.number_of_mapped_reads(data, bam_file) out['Mapped_reads'] = mapped if total_reads: out['Mapped_reads_pct'] = 100.0 * mapped / total_reads if mapped: mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False) out['Mapped_unique_reads'] = mapped mapped_dups = mapped - mapped_unique out['Duplicates'] = mapped_dups out['Duplicates_pct'] = 100.0 * mapped_dups / mapped if dd.get_coverage(data): cov_bed_file = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True) merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data) target_name = "coverage" else: merged_bed_file = dd.get_variant_regions_merged(data) target_name = "variant_regions" ontarget = sambamba.number_mapped_reads_on_target( data, merged_bed_file, bam_file, keep_dups=False, target_name=target_name) if mapped_unique: out["Ontarget_unique_reads"] = ontarget out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data) ontarget_padded = sambamba.number_mapped_reads_on_target( data, padded_bed_file, bam_file, keep_dups=False, target_name=target_name + "_padded") out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique if total_reads: out['Usable_pct'] = 100.0 * ontarget / total_reads avg_coverage = get_average_coverage(data, bam_file, merged_bed_file, target_name) out['Avg_coverage'] = avg_coverage priority = cov.priority_coverage(data, out_dir) cov.priority_total_coverage(data, out_dir) region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir) # Re-enable with annotations from internally installed # problem region directory # if priority: # annotated = cov.decorate_problem_regions(priority, problem_regions) return out
def coverage(data): """ Calculate coverage at different completeness cutoff for region in coverage option. """ bed_file = dd.get_coverage(data) if not bed_file: return data work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage") with chdir(work_dir): in_bam = data['work_bam'] sample = dd.get_sample_name(data) logger.debug("doing coverage for %s" % sample) parse_file = os.path.join(sample + "_coverage.bed") parse_total_file = os.path.join(sample + "_cov_total.tsv") cores = dd.get_num_cores(data) if not file_exists(parse_file): with file_transaction(parse_file) as out_tx: cmd = ("sambamba depth region -F \"not unmapped\" -t {cores} -C 1000 -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 -T 80 -T 100 -L {bed_file} {in_bam} | sed 's/# chrom/chrom/' > {parse_file}") do.run(cmd.format(**locals()), "Run coverage for {}".format(sample)) parse_file = _add_high_covered_regions(parse_file, bed_file, sample) _calculate_percentiles(parse_file, sample) data['coverage'] = os.path.abspath(parse_file) return data
def coverage_region_detailed_stats(data, out_dir, extra_cutoffs=None): """ Calculate coverage at different completeness cutoff for region in coverage option. """ bed_file = dd.get_coverage(data) if not bed_file or not utils.file_exists(bed_file): return [] work_dir = safe_makedir(out_dir) cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True) cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000} with chdir(work_dir): in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) sample = dd.get_sample_name(data) logger.debug("doing coverage for %s" % sample) parse_file = os.path.join(sample + "_coverage.bed") if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam): pass else: with file_transaction(data, parse_file) as out_tx: depth_thresholds = sorted(list(cutoffs | extra_cutoffs)) cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed, depth_thresholds=depth_thresholds) cmdl += " | sed 's/# chrom/chrom/' > " + out_tx do.run(cmdl, "Run coverage regional analysis for {}".format(sample)) out_files = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data, cutoffs=cutoffs) return [os.path.abspath(x) for x in out_files]
def calculate(bam_file, data): """Calculate coverage in parallel using mosdepth. Removes duplicates and secondary reads from the counts: if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; """ params = {"min": dd.get_coverage_depth_min(data)} variant_regions = dd.get_variant_regions_merged(data) if not variant_regions: variant_regions = _create_genome_regions(data) # Back compatible with previous pre-mosdepth callable files callable_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))), "%s-coverage.callable.bed" % (dd.get_sample_name(data))) if not utils.file_uptodate(callable_file, bam_file): vr_quantize = ("0:1:%s:" % (params["min"]), ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"]) to_calculate = [("variant_regions", variant_regions, vr_quantize, None), ("sv_regions", regions.get_sv_bed(data), None, None), ("coverage", dd.get_coverage(data), None, DEPTH_THRESHOLDS)] depth_files = {} for target_name, region_bed, quantize, thresholds in to_calculate: if region_bed: cur_depth = {} depth_info = run_mosdepth(data, target_name, region_bed, quantize=quantize, thresholds=thresholds) for attr in ("dist", "regions", "thresholds"): val = getattr(depth_info, attr, None) if val: cur_depth[attr] = val depth_files[target_name] = cur_depth if target_name == "variant_regions": callable_file = depth_info.quantize else: depth_files = {} final_callable = _subset_to_variant_regions(callable_file, variant_regions, data) return final_callable, depth_files
def coverage(data): """ Calculate coverage at different completeness cutoff for region in coverage option. """ bed_file = dd.get_coverage(data) sambamba = config_utils.get_program("sambamba", data["config"]) work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage")) if not bed_file: return data cleaned_bed = os.path.join(work_dir, os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed") cleaned_bed = bed.decomment(bed_file, cleaned_bed) with chdir(work_dir): in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) sample = dd.get_sample_name(data) logger.debug("doing coverage for %s" % sample) parse_file = os.path.join(sample + "_coverage.bed") parse_total_file = os.path.join(sample + "_cov_total.tsv") cores = dd.get_num_cores(data) if not file_exists(parse_file): with tx_tmpdir(data, work_dir) as tmp_dir: with file_transaction(parse_file) as out_tx: cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} " "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 " "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# " "chrom/chrom/' > {out_tx}") do.run(cmd.format(**locals()) % "-C 1000", "Run coverage for {}".format(sample)) parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample) _calculate_percentiles(os.path.abspath(parse_file), sample) data['coverage'] = os.path.abspath(parse_file) return data
def coverage_region_detailed_stats(data, out_dir): """ Calculate coverage at different completeness cutoff for region in coverage option. """ bed_file = dd.get_coverage(data) if not bed_file: return None work_dir = safe_makedir(out_dir) cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True) with chdir(work_dir): in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) sample = dd.get_sample_name(data) logger.debug("doing coverage for %s" % sample) parse_total_file = os.path.join(sample + "_cov_total.tsv") parse_file = os.path.join(sample + "_coverage.bed") if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam): pass else: with file_transaction(parse_file) as out_tx: cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed, depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100], max_cov=1000) cmdl += " | sed 's/# chrom/chrom/' > " + out_tx do.run(cmdl, "Run coverage regional analysis for {}".format(sample)) parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample) parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample) return os.path.abspath(parse_file)
def variants(data, out_dir): """Variants QC metrics""" if not "variants" in data: return None work_dir = safe_makedir(out_dir) sample = dd.get_sample_name(data) bcfstats = _run_bcftools(data, work_dir) bed_file = dd.get_coverage(data) bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt") cg_file = os.path.join(sample + "_with-gc.vcf.gz") parse_file = os.path.join(sample + "_gc-depth-parse.tsv") qc_file = os.path.join(sample + "_bcbio_variants.txt") with chdir(work_dir): if not file_exists(bcf_out): with open(bcf_out, "w") as out_handle: yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False) if "vrn_file" not in data or not bed_file: return None in_vcf = data['vrn_file'] cleaned_bed = clean_file(bed_file, data) if file_exists(qc_file): return qc_file in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) ref_file = dd.get_ref_file(data) assert ref_file, "Need the reference genome fasta file." bed_file = dd.get_variant_regions(data) in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) num_cores = dd.get_num_cores(data) broad_runner = broad.runner_from_config_safe(data["config"]) if in_bam and broad_runner and broad_runner.has_gatk(): if not file_exists(parse_file): with file_transaction(cg_file) as tx_out: params = ["-T", "VariantAnnotator", "-R", ref_file, "-L", cleaned_bed, "-I", in_bam, "-A", "GCContent", "-A", "Coverage", "--variant", in_vcf, "--out", tx_out] broad_runner.run_gatk(params) cg_file = vcfutils.bgzip_and_index(cg_file, data["config"]) if not file_exists(parse_file): with file_transaction(parse_file) as out_tx: with open(out_tx, 'w') as out_handle: print >>out_handle, "CG\tdepth\tsample" cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R " "{bed_file} {cg_file} >> {out_tx}") do.run(cmd.format(**locals()), "Calculating GC content and depth for %s" % in_vcf) logger.debug('parsing coverage: %s' % sample) if not file_exists(qc_file): # This files will be copied to final _summary_variants(parse_file, qc_file) if file_exists(qc_file) and file_exists(parse_file): remove_plus(cg_file)
def variants(data): if "vrn_file" not in data: return data if not dd.get_coverage(data): return data in_vcf = data["vrn_file"] work_dir = os.path.join(dd.get_work_dir(data), "report", "variants") with chdir(work_dir): in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) ref_file = dd.get_ref_file(data) assert ref_file, "Need the reference genome fasta file." bed_file = dd.get_variant_regions(data) sample = dd.get_sample_name(data) in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) cg_file = os.path.join(sample + "_with-gc.vcf.gz") parse_file = os.path.join(sample + "_gc-depth-parse.tsv") num_cores = dd.get_num_cores(data) broad_runner = broad.runner_from_config_safe(data["config"]) if in_bam and broad_runner and broad_runner.has_gatk(): if not file_exists(cg_file): with file_transaction(cg_file) as tx_out: params = [ "-T", "VariantAnnotator", "-R", ref_file, "-L", bed_file, "-I", in_bam, "-A", "GCContent", "-A", "Coverage", "--variant", in_vcf, "--out", tx_out, ] broad_runner.run_gatk(params) cg_file = vcfutils.bgzip_and_index(cg_file, data["config"]) if not file_exists(parse_file): with file_transaction(parse_file) as out_tx: with open(out_tx, "w") as out_handle: print >> out_handle, "CG\tdepth\tsample" cmd = ( "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R " "{bed_file} {cg_file} >> {out_tx}" ) do.run(cmd.format(**locals()), "Calculating GC content and depth for %s" % in_vcf) logger.debug("parsing coverage: %s" % sample) return data
def run(bam_file, data, out_dir): """Run qualimap to assess alignment quality metrics. """ # Qualimap results should be saved to a directory named after sample. # MultiQC (for parsing additional data) picks the sample name after the dir as follows: # <sample name>/raw_data_qualimapReport/insert_size_histogram.txt results_dir = os.path.join(out_dir, dd.get_sample_name(data)) resources = config_utils.get_resources("qualimap", data["config"]) options = " ".join(resources.get("options", "")) report_file = os.path.join(results_dir, "qualimapReport.html") utils.safe_makedir(results_dir) pdf_file = "qualimapReport.pdf" if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)): if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []): logger.info("Full qualimap analysis for %s may be slow." % bam_file) ds_bam = bam_file else: ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) bam_file = ds_bam if ds_bam else bam_file if options.find("PDF") > -1: options = "%s -outfile %s" % (options, pdf_file) num_cores = data["config"]["algorithm"].get("num_cores", 1) qualimap = config_utils.get_program("qualimap", data["config"]) max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores) # Fixing the file name: MultiQC picks sample name from BAM file name. fixed_bam_fname = os.path.join(out_dir, dd.get_sample_name(data) + ".bam") if not os.path.islink(fixed_bam_fname): os.symlink(bam_file, fixed_bam_fname) export = utils.local_path_export() cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} " "--skip-duplicated --skip-dup-mode 0 " "-nt {num_cores} --java-mem-size={max_mem} {options}") species = None if tz.get_in(("genome_resources", "aliases", "human"), data, ""): species = "HUMAN" elif any(tz.get_in("genome_build", data, "").startswith(k) for k in ["mm", "GRCm"]): species = "MOUSE" if species in ["HUMAN", "MOUSE"]: cmd += " -gd {species}" regions = bedutils.merge_overlaps(dd.get_coverage(data), data) or dd.get_variant_regions_merged(data) if regions: bed6_regions = _bed_to_bed6(regions, out_dir) cmd += " -gff {bed6_regions}" do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data)) # return _parse_qualimap_metrics(report_file, data) return dict()
def _prep_real_counts(bam_file, data, samtools_stats): out = {} if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]: bed = dd.get_coverage_merged(data) target_name = "coverage" elif dd.get_coverage_interval(data) != "genome": bed = dd.get_variant_regions_merged(data) target_name = "variant_regions" else: bed = None target_name = "genome" dedupped = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), True) if bed: out["Preseq_genome_size"] = pybedtools.BedTool(bed).total_coverage() out["Preseq_read_count"] = readstats.number_of_mapped_reads( data, bam_file, keep_dups=True, bed_file=bed, target_name=target_name) ontrg_unique_depth = cov.get_average_coverage(target_name, bed, data, bam_file) if dedupped: out["Preseq_unique_count"] = readstats.number_of_mapped_reads( data, bam_file, keep_dups=False, bed_file=bed, target_name=target_name) # Counting average on-target alignment length, based on the equation: # avg depth ~~ num (unique) on-target alignments * avg on-target aln length / target size total_alignments = out.get("Preseq_unique_count") or out["Preseq_read_count"] out["Preseq_read_length"] = ontrg_unique_depth * out["Preseq_genome_size"] // total_alignments else: # WGS out["Preseq_genome_size"] = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])]) out["Preseq_read_count"] = int(samtools_stats["Total_reads"]) out["Preseq_read_length"] = int(samtools_stats["Average_read_length"]) if dedupped: out["Preseq_unique_count"] = out["Preseq_read_count"] - int(samtools_stats["Duplicates"]) return out
def _merge_target_information(samples, metrics_dir): out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml")) if utils.file_exists(out_file): return samples genomes = set(dd.get_genome_build(data) for data in samples) coverage_beds = set(dd.get_coverage(data) for data in samples) original_variant_regions = set(dd.get_variant_regions_orig(data) for data in samples) data = samples[0] info = {} # Reporting in MultiQC only if the genome is the same across all samples if len(genomes) == 1: info["genome_info"] = { "name": dd.get_genome_build(data), "size": sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])]), } # Reporting in MultiQC only if the target is the same across all samples vcr_orig = None if len(original_variant_regions) == 1 and list(original_variant_regions)[0] is not None: vcr_orig = list(original_variant_regions)[0] vcr_clean = bedutils.clean_file(vcr_orig, data) info["variants_regions_info"] = { "bed": vcr_orig, "size": sum(len(x) for x in pybedtools.BedTool(dd.get_variant_regions_merged(data))), "regions": pybedtools.BedTool(vcr_clean).count(), } gene_num = annotate.count_genes(vcr_clean, data) if gene_num is not None: info["variants_regions_info"]["genes"] = gene_num else: info["variants_regions_info"] = { "bed": "callable regions", } # Reporting in MultiQC only if the target is the same across samples if len(coverage_beds) == 1: cov_bed = list(coverage_beds)[0] if cov_bed not in [None, "None"]: if vcr_orig and vcr_orig == cov_bed: info["coverage_bed_info"] = info["variants_regions_info"] else: clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True) info["coverage_bed_info"] = { "bed": cov_bed, "size": pybedtools.BedTool(cov_bed).total_coverage(), "regions": pybedtools.BedTool(clean_bed).count(), } gene_num = annotate.count_genes(clean_bed, data) if gene_num is not None: info["coverage_bed_info"]["genes"] = gene_num else: info["coverage_bed_info"] = info["variants_regions_info"] coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"] for data in samples) if len(coverage_intervals) == 1: info["coverage_interval"] = list(coverage_intervals)[0] if info: with open(out_file, "w") as out_handle: yaml.safe_dump(info, out_handle) return samples
def _run_coverage_qc(bam_file, data, out_dir): """Run coverage QC analysis""" out = dict() samtools_stats_dir = os.path.join(out_dir, os.path.pardir, out_dir) from bcbio.qc import samtools samtools_stats = samtools.run(bam_file, data, samtools_stats_dir) if "Total_reads" not in samtools_stats: return out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"]) if not total_reads: return if "Mapped_reads_raw" not in samtools_stats or "Mapped_reads" not in samtools_stats: return out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"]) out["Mapped_reads_pct"] = 100.0 * mapped / total_reads if not mapped: return out if "Duplicates" in samtools_stats: out['Duplicates'] = dups = int(samtools_stats["Duplicates"]) out['Duplicates_pct'] = 100.0 * dups / int(samtools_stats["Mapped_reads_raw"]) else: dups = 0 if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]: cov_bed_file = bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True) merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data) target_name = "coverage" elif dd.get_coverage_interval(data) != "genome": merged_bed_file = dd.get_variant_regions_merged(data) target_name = "variant_regions" else: merged_bed_file = None target_name = "genome" # Whole genome runs do not need detailed on-target calculations, use total unique mapped if dd.get_coverage_interval(data) == "genome": mapped_unique = mapped - dups else: out['Mapped_unique_reads'] = mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False) if merged_bed_file: ontarget = sambamba.number_of_mapped_reads( data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name) if mapped_unique: out["Ontarget_unique_reads"] = ontarget out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique if dd.get_coverage_interval(data) != "genome": # Skip padded calculation for WGS even if the "coverage" file is specified # the padded statistic makes only sense for exomes and panels padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data) ontarget_padded = sambamba.number_of_mapped_reads( data, bam_file, keep_dups=False, bed_file=padded_bed_file, target_name=target_name + "_padded") out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique if total_reads: out['Usable_pct'] = 100.0 * ontarget / total_reads avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file, target_name) out['Avg_coverage'] = avg_depth region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir, extra_cutoffs=set([max(1, int(avg_depth * 0.8))])) return out
def run(bam_file, data, out_dir): """Run coverage QC analysis """ out = dict() if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]: merged_bed_file = dd.get_coverage_merged(data) target_name = "coverage" elif dd.get_coverage_interval(data) != "genome": merged_bed_file = dd.get_variant_regions_merged(data) target_name = "variant_regions" else: merged_bed_file = None target_name = "genome" avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file, target_name) out['Avg_coverage'] = avg_depth samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools') from bcbio.qc import samtools samtools_stats = samtools.run(bam_file, data, samtools_stats_dir) out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"]) out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"]) out["Mapped_paired_reads"] = int(samtools_stats["Mapped_paired_reads"]) out['Duplicates'] = dups = int(samtools_stats["Duplicates"]) if total_reads: out["Mapped_reads_pct"] = 100.0 * mapped / total_reads if mapped: out['Duplicates_pct'] = 100.0 * dups / mapped if dd.get_coverage_interval(data) == "genome": mapped_unique = mapped - dups else: mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False) out['Mapped_unique_reads'] = mapped_unique if merged_bed_file: ontarget = sambamba.number_of_mapped_reads( data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name) out["Ontarget_unique_reads"] = ontarget if mapped_unique: out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique if dd.get_coverage_interval(data) != "genome": # Skip padded calculation for WGS even if the "coverage" file is specified # the padded statistic makes only sense for exomes and panels padded_bed_file = bedutils.get_padded_bed_file(out_dir, merged_bed_file, 200, data) ontarget_padded = sambamba.number_of_mapped_reads( data, bam_file, keep_dups=False, bed_file=padded_bed_file, target_name=target_name + "_padded") out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique if total_reads: out['Usable_pct'] = 100.0 * ontarget / total_reads out_files = cov.coverage_region_detailed_stats(data, out_dir, extra_cutoffs=set([max(1, int(avg_depth * 0.8))])) for ext in ["coverage.bed", "summary.bed"]: out_files += [x for x in glob.glob(os.path.join(out_dir, "*%s" % ext)) if os.path.isfile(x)] indexcov_files = _goleft_indexcov(bam_file, data, out_dir) out_files += [x for x in indexcov_files if x and utils.file_exists(x)] out = {"metrics": out} if len(out_files) > 0: out["base"] = out_files[0] out["secondary"] = out_files[1:] return out
def run(bam_file, data, out_dir): """Run qualimap to assess alignment quality metrics. """ # Qualimap results should be saved to a directory named after sample. # MultiQC (for parsing additional data) picks the sample name after the dir as follows: # <sample name>/raw_data_qualimapReport/insert_size_histogram.txt results_dir = os.path.join(out_dir, dd.get_sample_name(data)) resources = config_utils.get_resources("qualimap", data["config"]) options = " ".join(resources.get("options", "")) results_file = os.path.join(results_dir, "genome_results.txt") report_file = os.path.join(results_dir, "qualimapReport.html") utils.safe_makedir(results_dir) pdf_file = "qualimapReport.pdf" if not utils.file_exists(results_file) and not utils.file_exists( os.path.join(results_dir, pdf_file)): if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []): logger.info("Full qualimap analysis for %s may be slow." % bam_file) ds_bam = bam_file else: ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) bam_file = ds_bam if ds_bam else bam_file if options.find("PDF") > -1: options = "%s -outfile %s" % (options, pdf_file) num_cores = data["config"]["algorithm"].get("num_cores", 1) qualimap = config_utils.get_program("qualimap", data["config"]) max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores) with file_transaction(data, results_dir) as tx_results_dir: utils.safe_makedir(tx_results_dir) export = utils.local_path_export() cmd = ( "unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} " "--skip-duplicated --skip-dup-mode 0 " "-nt {num_cores} --java-mem-size={max_mem} {options}") species = None if (tz.get_in(("genome_resources", "aliases", "human"), data, "") or dd.get_genome_build(data).startswith(("hg", "GRCh"))): species = "HUMAN" elif dd.get_genome_build(data).startswith(("mm", "GRCm")): species = "MOUSE" if species in ["HUMAN", "MOUSE"]: cmd += " -gd {species}" regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [ None, False, "None" ] else dd.get_variant_regions_merged(data)) if regions: regions = bedutils.merge_overlaps( bedutils.clean_file(regions, data), data) bed6_regions = _bed_to_bed6(regions, out_dir) cmd += " -gff {bed6_regions}" bcbio_env = utils.get_bcbio_env() do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env) tx_results_file = os.path.join(tx_results_dir, "genome_results.txt") cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % ( dd.get_sample_name(data), tx_results_file) do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data))) # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir): base_results_file = os.path.join(out_dir, os.path.basename(results_file)) shutil.copyfile(results_file, base_results_file) return { "base": base_results_file, "secondary": _find_qualimap_secondary_files(results_dir, base_results_file) }