def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.

    Groups reads by UMI with fgbio, calls per-molecule (or duplex) consensus
    reads, filters them, and converts the resulting BAM back to a gzipped
    fastq pair for re-alignment.  Returns (fastq1, fastq2, average coverage
    of the raw UMI BAM over variant regions).
    """
    align_bam = dd.get_work_bam(data)
    # umi_method is "paired" for duplex UMIs, otherwise single-strand grouping.
    umi_method, umi_tag = _check_umi_type(align_bam)
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    avg_coverage = coverage.get_average_coverage("rawumi", dd.get_variant_regions(data), data)
    # Only regenerate outputs when the aligned BAM is newer than the fastqs.
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
            # Improve speeds by avoiding compression read/write bottlenecks
            io_opts = "--async-io=true --compression=0"
            est_options = _estimate_fgbio_defaults(avg_coverage)
            group_opts, cons_opts, filter_opts = _get_fgbio_options(data, est_options, umi_method)
            # Duplex consensus calling requires paired (top/bottom strand) UMIs.
            cons_method = "CallDuplexConsensusReads" if umi_method == "paired" else "CallMolecularConsensusReads"
            tempfile = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            ref_file = dd.get_ref_file(data)
            # NOTE: cmd.format(**locals()) substitutes by local variable name,
            # so the names above must match the {placeholders} exactly.
            cmd = ("unset JAVA_HOME && "
                   "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                   "-i {align_bam} | "
                   "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=:none: "
                   "-i /dev/stdin -o /dev/stdout | "
                   "fgbio {jvm_opts} {io_opts} FilterConsensusReads {filter_opts} -r {ref_file} "
                   "-i /dev/stdin -o /dev/stdout | "
                   # tags=cD,cM,cE carries consensus depth/agreement tags into the fastq
                   "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out, avg_coverage
def _prep_real_counts(bam_file, data, samtools_stats):
    """Collect genome size, read counts and read length used as Preseq inputs.

    Pulls counts from the configured target BED regions when one is present,
    otherwise falls back to whole-genome numbers from the samtools stats.
    """
    # Pick the region set matching how this sample was configured.
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        bed, target_name = dd.get_coverage_merged(data), "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        bed = dd.get_variant_regions_merged(data) or dd.get_sample_callable(data)
        target_name = "variant_regions"
    else:
        bed, target_name = None, "genome"
    dedupped = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), True)
    out = {}
    if bed:
        out["Preseq_genome_size"] = pybedtools.BedTool(bed).total_coverage()
        out["Preseq_read_count"] = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=True, bed_file=bed, target_name=target_name)
        ontrg_unique_depth = cov.get_average_coverage(target_name, bed, data, bam_file)
        if dedupped:
            out["Preseq_unique_count"] = readstats.number_of_mapped_reads(
                data, bam_file, keep_dups=False, bed_file=bed, target_name=target_name)
        # Counting average on-target alignment length, based on the equation:
        # avg depth ~~ num (unique) on-target alignments * avg on-target aln length / target size
        total_alignments = out.get("Preseq_unique_count") or out["Preseq_read_count"]
        out["Preseq_read_length"] = ontrg_unique_depth * out["Preseq_genome_size"] // total_alignments
    else:
        # WGS: derive everything from contig sizes and samtools stats.
        out["Preseq_genome_size"] = sum(c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"]))
        out["Preseq_read_count"] = int(samtools_stats["Total_reads"])
        out["Preseq_read_length"] = int(samtools_stats["Average_read_length"])
        if dedupped:
            out["Preseq_unique_count"] = out["Preseq_read_count"] - int(samtools_stats["Duplicates"])
    return out
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis.

    Populates a metrics dict with total/mapped/unique/duplicate read counts,
    on- and off-target percentages against the coverage or variant-regions
    BED, and average coverage; also writes priority and per-region coverage
    reports into out_dir.
    """
    out = dict()
    total_reads = sambamba.number_of_reads(data, bam_file)
    out['Total_reads'] = total_reads
    mapped = sambamba.number_of_mapped_reads(data, bam_file)
    out['Mapped_reads'] = mapped
    if total_reads:
        out['Mapped_reads_pct'] = 100.0 * mapped / total_reads
    if mapped:
        mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
        # Bug fix: report the deduplicated count, not the raw mapped count.
        out['Mapped_unique_reads'] = mapped_unique
        mapped_dups = mapped - mapped_unique
        out['Duplicates'] = mapped_dups
        out['Duplicates_pct'] = 100.0 * mapped_dups / mapped
        # Choose the target BED: explicit coverage regions or merged variant regions.
        if dd.get_coverage(data):
            cov_bed_file = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
            merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
            target_name = "coverage"
        else:
            merged_bed_file = dd.get_variant_regions_merged(data)
            target_name = "variant_regions"
        ontarget = sambamba.number_mapped_reads_on_target(
            data, merged_bed_file, bam_file, keep_dups=False, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            # Padded statistics capture near-target reads (200bp flanks).
            padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
            ontarget_padded = sambamba.number_mapped_reads_on_target(
                data, padded_bed_file, bam_file, keep_dups=False, target_name=target_name + "_padded")
            out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads
        avg_coverage = get_average_coverage(data, bam_file, merged_bed_file, target_name)
        out['Avg_coverage'] = avg_coverage
    # priority is kept for the disabled problem-region annotation below.
    priority = cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    cov.coverage_region_detailed_stats(data, out_dir)
    # Re-enable with annotations from internally installed
    # problem region directory
    # if priority:
    #     annotated = cov.decorate_problem_regions(priority, problem_regions)
    return out
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis.

    Derives duplicate, off-target and usable-read rates from the precomputed
    off-target stats file, plus average coverage and per-region reports.
    """
    out = dict()
    # Target selection: explicit coverage BED, then merged variant regions,
    # then whole genome.
    if dd.get_coverage(data):
        bed_file = bedutils.merge_overlaps(dd.get_coverage(data), data)
        target_name = "coverage"
    elif dd.get_variant_regions_merged(data):
        bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        bed_file = None
        target_name = "wgs"
    bed_file = clean_file(bed_file, data, prefix="cov-", simple=True)
    offtarget_stats_file = calculate_offtarget_stats(bam_file, data, bed_file, target_name)
    if offtarget_stats_file and utils.file_exists(offtarget_stats_file):
        with open(offtarget_stats_file) as in_handle:
            stats = yaml.safe_load(in_handle)
        offtarget = stats.get('offtarget')
        mapped_unique = stats['mapped_unique']
        if offtarget and mapped_unique:
            out['offtarget_rate'] = 1.0 * offtarget / mapped_unique
        mapped = stats['mapped']
        if mapped:
            dup_count = mapped - mapped_unique
            out['Duplicates'] = dup_count
            out['Duplicates_pct'] = 1.0 * dup_count / mapped
        total_reads = stats['total_reads']
        if total_reads:
            out['usable_rate'] = 1.0 * (mapped_unique - offtarget) / total_reads
    out['avg_coverage'] = get_average_coverage(data, bam_file, bed_file, target_name)
    # priority is kept for the disabled problem-region annotation below.
    priority = cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    cov.coverage_region_detailed_stats(data, out_dir)
    # Re-enable with annotations from internally installed
    # problem region directory
    # if priority:
    #     annotated = cov.decorate_problem_regions(priority, problem_regions)
    return out
def _prep_real_counts(bam_file, data, samtools_stats):
    """Gather genome size, read counts and read length for Preseq.

    Uses the configured BED target when present; otherwise reports
    whole-genome values taken from the samtools stats metrics.
    """
    out = {}
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        bed = dd.get_coverage_merged(data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        bed = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        bed = None
        target_name = "genome"
    dedupped = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), True)
    if bed:
        out["Preseq_genome_size"] = pybedtools.BedTool(bed).total_coverage()
        out["Preseq_read_count"] = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=True, bed_file=bed, target_name=target_name)
        ontrg_unique_depth = cov.get_average_coverage(target_name, bed, data, bam_file)
        if dedupped:
            out["Preseq_unique_count"] = readstats.number_of_mapped_reads(
                data, bam_file, keep_dups=False, bed_file=bed, target_name=target_name)
        # Counting average on-target alignment length, based on the equation:
        # avg depth ~~ num (unique) on-target alignments * avg on-target aln length / target size
        total_alignments = out.get("Preseq_unique_count") or out["Preseq_read_count"]
        out["Preseq_read_length"] = ontrg_unique_depth * out["Preseq_genome_size"] // total_alignments
        return out
    # WGS path: whole-genome size from reference contigs plus samtools counts.
    contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
    out["Preseq_genome_size"] = sum([c.size for c in contigs])
    out["Preseq_read_count"] = int(samtools_stats["Total_reads"])
    out["Preseq_read_length"] = int(samtools_stats["Average_read_length"])
    if dedupped:
        out["Preseq_unique_count"] = out["Preseq_read_count"] - int(samtools_stats["Duplicates"])
    return out
def run(bam_file, data, out_dir):
    """Run coverage QC analysis

    Builds a metrics dict (read counts, on/off target percentages, average
    coverage) from samtools stats plus targeted read counting, and returns
    {"metrics": ..., "base": ..., "secondary": ...} with coverage/indexcov
    output files when available.
    """
    out = dict()
    out_dir = utils.safe_makedir(out_dir)
    # Target selection: explicit coverage BED, then merged variant regions,
    # then whole genome.
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        merged_bed_file = bedutils.clean_file(dd.get_coverage_merged(data), data, prefix="cov-", simple=True)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"
    avg_depth = cov.get_average_coverage(target_name, merged_bed_file, data)
    # Detailed per-region stats only make sense for an explicit coverage BED.
    if target_name == "coverage":
        out_files = cov.coverage_region_detailed_stats(target_name, merged_bed_file, data, out_dir)
    else:
        out_files = []
    out['Avg_coverage'] = avg_depth
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)["metrics"]
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_paired_reads"] = int(samtools_stats["Mapped_paired_reads"])
    out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
    if total_reads:
        out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if mapped:
        out['Duplicates_pct'] = 100.0 * dups / mapped
    # Whole genome runs skip the expensive dedup re-count and approximate
    # unique reads as mapped minus duplicates.
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        mapped_unique = readstats.number_of_mapped_reads(data, bam_file, keep_dups=False)
    out['Mapped_unique_reads'] = mapped_unique
    if merged_bed_file:
        ontarget = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name)
        out["Ontarget_unique_reads"] = ontarget
        if mapped_unique:
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified
                # the padded statistic makes only sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(out_dir, merged_bed_file, 200, data)
                ontarget_padded = readstats.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads
    indexcov_files = _goleft_indexcov(bam_file, data, out_dir)
    out_files += [x for x in indexcov_files if x and utils.file_exists(x)]
    # MultiQC-style layout: metrics dict plus primary/secondary output files.
    out = {"metrics": out}
    if len(out_files) > 0:
        out["base"] = out_files[0]
        out["secondary"] = out_files[1:]
    return out
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis.

    Counts total/mapped/unique/duplicate reads with sambamba, computes
    on- and off-target percentages for the coverage or variant-regions BED
    (including a 200bp-padded variant), records average coverage, and writes
    priority and per-region coverage reports into out_dir.
    """
    out = dict()
    total_reads = sambamba.number_of_reads(data, bam_file)
    out['Total_reads'] = total_reads
    mapped = sambamba.number_of_mapped_reads(data, bam_file)
    out['Mapped_reads'] = mapped
    if total_reads:
        out['Mapped_reads_pct'] = 100.0 * mapped / total_reads
    if mapped:
        mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
        # Bug fix: previously stored the raw mapped count here instead of
        # the deduplicated unique count.
        out['Mapped_unique_reads'] = mapped_unique
        mapped_dups = mapped - mapped_unique
        out['Duplicates'] = mapped_dups
        out['Duplicates_pct'] = 100.0 * mapped_dups / mapped
        # Select the target BED for on-target calculations.
        if dd.get_coverage(data):
            cov_bed_file = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
            merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
            target_name = "coverage"
        else:
            merged_bed_file = dd.get_variant_regions_merged(data)
            target_name = "variant_regions"
        ontarget = sambamba.number_mapped_reads_on_target(
            data, merged_bed_file, bam_file, keep_dups=False, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            # 200bp flanks capture reads just outside the target regions.
            padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
            ontarget_padded = sambamba.number_mapped_reads_on_target(
                data, padded_bed_file, bam_file, keep_dups=False, target_name=target_name + "_padded")
            out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads
        avg_coverage = get_average_coverage(data, bam_file, merged_bed_file, target_name)
        out['Avg_coverage'] = avg_coverage
    # priority is kept for the disabled problem-region annotation below.
    priority = cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    cov.coverage_region_detailed_stats(data, out_dir)
    # Re-enable with annotations from internally installed
    # problem region directory
    # if priority:
    #     annotated = cov.decorate_problem_regions(priority, problem_regions)
    return out
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis

    Builds a metrics dict from pre-computed samtools stats plus targeted
    sambamba counting; bails out early (returning None or a partial dict)
    when required samtools metrics are missing or zero.
    """
    out = dict()
    # NOTE(review): joining out_dir twice looks suspicious — sibling
    # implementations use 'samtools' as the final component; confirm intent.
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, out_dir)
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)
    # Early exits: these first two return None rather than the partial dict.
    if "Total_reads" not in samtools_stats:
        return
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    if not total_reads:
        return
    if "Mapped_reads_raw" not in samtools_stats or "Mapped_reads" not in samtools_stats:
        return
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if not mapped:
        return out
    if "Duplicates" in samtools_stats:
        out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
        # Duplicate percentage is relative to the raw mapped count.
        out['Duplicates_pct'] = 100.0 * dups / int(samtools_stats["Mapped_reads_raw"])
    else:
        dups = 0
    # Target selection: explicit coverage BED, then merged variant regions,
    # then whole genome.
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        cov_bed_file = bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"
    # Whole genome runs do not need detailed on-target calculations, use total unique mapped
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        out['Mapped_unique_reads'] = mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
    if merged_bed_file:
        ontarget = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified
                # the padded statistic makes only sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads
    avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file, target_name)
    out['Avg_coverage'] = avg_depth
    # Extra cutoff at ~80% of average depth for the per-region report.
    region_coverage_file = cov.coverage_region_detailed_stats(
        data, out_dir, extra_cutoffs=set([max(1, int(avg_depth * 0.8))]))
    return out
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis

    Derives read-count and on/off target metrics from samtools stats and
    sambamba targeted counting, with early exits when the required samtools
    metrics are absent or zero.
    """
    out = dict()
    # NOTE(review): the final path component repeats out_dir — sibling
    # implementations use 'samtools' here; verify this is intentional.
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, out_dir)
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)
    # Early exits: the first two return None, the third returns partial metrics.
    if "Total_reads" not in samtools_stats:
        return
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    if not total_reads:
        return
    if "Mapped_reads_raw" not in samtools_stats or "Mapped_reads" not in samtools_stats:
        return
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if not mapped:
        return out
    if "Duplicates" in samtools_stats:
        out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
        # Duplicate percentage uses the raw (pre-filter) mapped read count.
        out['Duplicates_pct'] = 100.0 * dups / int(
            samtools_stats["Mapped_reads_raw"])
    else:
        dups = 0
    # Target selection: coverage BED, then merged variant regions, then genome.
    if dd.get_coverage(data):
        cov_bed_file = bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"
    # Whole genome runs do not need detailed on-target calculations, use total unique mapped
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        out['Mapped_unique_reads'] = mapped_unique = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False)
    if merged_bed_file:
        ontarget = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified
                # the padded statistic makes only sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(
                    merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads
    avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file, target_name)
    out['Avg_coverage'] = avg_depth
    # Adds an extra report cutoff near 80% of the observed average depth.
    region_coverage_file = cov.coverage_region_detailed_stats(
        data, out_dir, extra_cutoffs=set([max(1, int(avg_depth * 0.8))]))
    return out
def run(bam_file, data, out_dir):
    """Run coverage QC analysis

    Assembles read-count and on/off target metrics from samtools stats plus
    sambamba counting, collects detailed coverage and goleft indexcov output
    files, and returns {"metrics": ..., "base": ..., "secondary": ...}.
    """
    out = dict()
    # Target selection: explicit coverage BED, then merged variant regions,
    # then whole genome.
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        merged_bed_file = dd.get_coverage_merged(data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"
    avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file, target_name)
    out['Avg_coverage'] = avg_depth
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_paired_reads"] = int(samtools_stats["Mapped_paired_reads"])
    out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
    if total_reads:
        out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if mapped:
        out['Duplicates_pct'] = 100.0 * dups / mapped
    # Whole genome: approximate unique reads as mapped minus duplicates
    # rather than re-counting with dedup filtering.
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
    out['Mapped_unique_reads'] = mapped_unique
    if merged_bed_file:
        ontarget = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name)
        out["Ontarget_unique_reads"] = ontarget
        if mapped_unique:
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified
                # the padded statistic makes only sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(out_dir, merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_of_mapped_reads(
                    data, bam_file, keep_dups=False,
                    bed_file=padded_bed_file, target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads
    # Detailed region report with an extra cutoff near 80% of average depth.
    out_files = cov.coverage_region_detailed_stats(data, out_dir,
                                                   extra_cutoffs=set([max(1, int(avg_depth * 0.8))]))
    # Pick up additional coverage/summary BED outputs written into out_dir.
    for ext in ["coverage.bed", "summary.bed"]:
        out_files += [x for x in glob.glob(os.path.join(out_dir, "*%s" % ext)) if os.path.isfile(x)]
    indexcov_files = _goleft_indexcov(bam_file, data, out_dir)
    out_files += [x for x in indexcov_files if x and utils.file_exists(x)]
    # MultiQC-style layout: metrics dict plus primary/secondary output files.
    out = {"metrics": out}
    if len(out_files) > 0:
        out["base"] = out_files[0]
        out["secondary"] = out_files[1:]
    return out