def population_variant_regions(items):
    """Pick the variant region BED file that represents a group of samples.

    - one sample: that sample's regions
    - tumor/normal pair: the tumor sample's regions
    - any other batch: the BED file with the largest total coverage

    For every sample the configured variant regions are preferred, falling
    back to the sample callable BED file. Returns None when no sample in a
    batch supplies a BED file.
    """
    import pybedtools

    def _regions(sample):
        # Variant regions win; callable regions are the fallback.
        return dd.get_variant_regions(sample) or dd.get_sample_callable(sample)

    if len(items) == 1:
        return _regions(items[0])
    paired = vcfutils.get_paired(items)
    if paired:
        return _regions(paired.tumor_data)
    candidates = [bed for bed in map(_regions, items) if bed]
    if not candidates:
        return None
    # Tuples compare on coverage first, then on the file path for ties.
    by_size = [(pybedtools.BedTool(bed).total_coverage(), bed)
               for bed in candidates]
    return max(by_size)[1]
def get_analysis_intervals(data, vrn_file, base_dir):
    """Find the regions to analyze for the current variant calling run.

    Sources are tried in priority order and the first available one wins:

    1. callable regions derived from ``vrn_file`` when the "gvcf" tool is on
    2. the ``ensemble_bed`` entry of ``data``
    3. the sample callable BED file
    4. callable regions computed from ``align_bam``, ``work_bam`` or
       ``work_bam_callable``
    5. the configured ``callable_regions``, then ``variant_regions``

    Returns None when none of these is present.
    """
    if vrn_file and "gvcf" in dd.get_tools_on(data):
        gvcf_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if gvcf_bed:
            return gvcf_bed
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    sample_callable = dd.get_sample_callable(data)
    if sample_callable:
        return sample_callable
    # NOTE(review): `callable` is presumably a project module imported at
    # file level (it shadows the builtin) -- confirm against the imports.
    for bam_key in ("align_bam", "work_bam", "work_bam_callable"):
        if data.get(bam_key):
            return callable.sample_callable_bed(
                data[bam_key], dd.get_ref_file(data), data)[0]
    for region_key in ("callable_regions", "variant_regions"):
        configured = tz.get_in(["config", "algorithm", region_key], data)
        if configured:
            return configured
    return None
def get_analysis_intervals(data, vrn_file, base_dir):
    """Find the regions to analyze for the current variant calling run.

    The first available source is returned, checked in this order: callable
    regions derived from a gVCF ``vrn_file``, the ``ensemble_bed`` entry, the
    sample callable BED, callable regions computed from a BAM file
    (``align_bam``, ``work_bam``, then ``work_bam_callable``), and finally
    the configured ``callable_regions`` or ``variant_regions``.

    Returns None when none of these is present.
    """
    from bcbio.bam import callable

    def _bed_from_bam(bam_file, sample):
        # sample_callable_bed returns a sequence; its first item is returned.
        return callable.sample_callable_bed(
            bam_file, dd.get_ref_file(sample), sample)[0]

    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        gvcf_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if gvcf_bed:
            return gvcf_bed
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    sample_callable = dd.get_sample_callable(data)
    if sample_callable:
        return sample_callable
    for bam_key in ("align_bam", "work_bam"):
        if data.get(bam_key):
            return _bed_from_bam(data[bam_key], data)
    if data.get("work_bam_callable"):
        # Work on a copy so the caller's data is not modified, and expose
        # the BAM under the standard "work_bam" key.
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return _bed_from_bam(data["work_bam"], data)
    for region_key in ("callable_regions", "variant_regions"):
        configured = tz.get_in(["config", "algorithm", region_key], data)
        if configured:
            return configured
    return None
def assign_interval(data):
    """Classify the sample's coverage type and store it in the configuration.

    Nothing is computed when a coverage interval is already set. Otherwise
    one of three labels is written to
    ``data["config"]["algorithm"]["coverage_interval"]``:

    - genome: covered fraction of the genome exceeds GENOME_COV_THRESH
    - regional: no variant regions, or off-target fraction above
      OFFTARGET_THRESH (capture with off-target reads)
    - amplicon: targeted coverage without substantial off-target reads

    Returns ``data``, which is modified in place.
    """
    if dd.get_coverage_interval(data):
        return data
    vrs = dd.get_variant_regions_merged(data)
    callable_file = dd.get_sample_callable(data)
    # Merged variant regions define the covered size when available.
    size_bed = vrs if vrs else callable_file
    callable_size = pybedtools.BedTool(size_bed).total_coverage()
    contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
    total_size = sum(c.size for c in contigs)
    genome_cov_pct = callable_size / float(total_size)
    offtarget_pct = 0.0
    if genome_cov_pct > GENOME_COV_THRESH:
        cov_interval = "genome"
    elif not vrs:
        cov_interval = "regional"
    else:
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        offtarget_pct = _count_offtarget(data, in_bam, vrs or callable_file,
                                         "variant_regions")
        cov_interval = ("regional" if offtarget_pct > OFFTARGET_THRESH
                        else "amplicon")
    logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                % (dd.get_sample_name(data), cov_interval,
                   genome_cov_pct * 100.0, offtarget_pct * 100.0))
    data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
def run(bam_file, data, out_dir):
    """Generate Picard metrics for a BAM file when the "picard" tool is on.

    Args:
        bam_file: path to the alignment file to summarize.
        data: sample dictionary; supplies reference, sample name, target
            regions and configuration.
        out_dir: directory for the ``.hs_metrics`` and ``.insert_metrics``
            outputs; created when missing.

    Returns:
        An empty dict when "picard" is not among the enabled tools, otherwise
        the path of the ``.hs_metrics`` file (returned whether or not the
        file was actually produced).
    """
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    # Only run when neither output exists yet.
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                # Run against a symlink in the output directory so outputs
                # are named after the local file name.
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                # The same BED file is passed twice.
                # NOTE(review): presumably as bait and target -- confirm
                # against PicardMetrics.report.
                gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                                   target_file, target_file, None,
                                   data["config"])
        # Strip the BAM-name remainder (base name without the sample name)
        # from whichever metrics files were produced.
        for metrics_file in (hsmetric_file, hsinsert_file):
            if utils.file_exists(metrics_file):
                do.run("sed -i 's/%s.bam//g' %s"
                       % (out_base.replace(sample, ""), metrics_file), "")
    return hsmetric_file
def filter_multimappers(align_file, data):
    """Write a copy of a bowtie2 alignment restricted to uniquely mapped reads.

    bowtie2 has no direct equivalent of bowtie's ``-m 1``; per the original
    author's note it sets the XS tag on reads that map to more than one
    place, so reads carrying XS are dropped. Unmapped and duplicate reads
    are removed too, and paired input is limited to proper pairs. This
    approach does not carry over to other aligners.

    The output is written next to the input as ``<base>.unique<ext>`` and
    indexed; an existing output is reused. Returns the output path.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    out_file = "%s.unique%s" % os.path.splitext(align_file)
    regions = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    if utils.file_exists(out_file):
        return out_file
    # Restrict to the target regions when a BED file is available.
    bed_cmd = '-L {0}'.format(regions) if regions else " "
    paired_filter = ("and paired and proper_pair"
                     if bam.is_paired(align_file) else "")
    filter_string = (
        '-F "[XS] == null and not unmapped {paired_filter} and not duplicate" '
    ).format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    template = " ".join(['{sambamba} view -h{type_flag}',
                         '--nthreads {num_cores}',
                         '-f bam {bed_cmd}',
                         '{filter_string}',
                         '{align_file}',
                         '> {tx_out_file}'])
    with file_transaction(out_file) as tx_out_file:
        message = "Removing multimapped reads from %s." % align_file
        do.run(template.format(sambamba=sambamba, type_flag=type_flag,
                               num_cores=num_cores, bed_cmd=bed_cmd,
                               filter_string=filter_string,
                               align_file=align_file,
                               tx_out_file=tx_out_file),
               message)
    bam.index(out_file, config)
    return out_file
def run(bam_file, data, out_dir):
    """Produce Picard metrics files for an alignment.

    Does nothing and returns ``{}`` unless "picard" is listed in the enabled
    tools. Otherwise writes ``<bam base>.hs_metrics`` and
    ``<bam base>.insert_metrics`` into ``out_dir`` (created on demand),
    skipping the Picard run when either file already exists.

    Returns the path of the ``.hs_metrics`` file; the path is returned even
    if the file was not generated.
    """
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    # Variant regions are preferred as targets; callable regions otherwise.
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    have_outputs = (utils.file_exists(hsmetric_file)
                    or utils.file_exists(hsinsert_file))
    if not have_outputs:
        with utils.chdir(out_dir), tx_tmpdir() as tmp_dir:
            # Picard is run on a local symlink inside the output directory.
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                               target_file, target_file, None, data["config"])
        # Remove the non-sample part of the BAM name from the reports.
        name_rest = out_base.replace(sample, "")
        if utils.file_exists(hsmetric_file):
            do.run("sed -i 's/%s.bam//g' %s" % (name_rest, hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run("sed -i 's/%s.bam//g' %s" % (name_rest, hsinsert_file), "")
    return hsmetric_file
def prep_recal(data):
    """Compute base quality recalibration tables ahead of BQSR.

    The configured recalibration setting selects the tool: ``True`` or
    "gatk" use GATK, "sentieon" uses sentieon. The result is stored under
    ``data["prep_recal"]``. GATK is skipped (with a log message) when no
    dbSNP file is configured.

    Raises NotImplementedError for any other truthy setting. Returns
    ``data``.
    """
    recal_type = dd.get_recalibrate(data)
    if recal_type in (True, "gatk"):
        logger.info("Prepare BQSR tables with GATK: %s "
                    % str(dd.get_sample_name(data)))
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"),
                               data)
        if not dbsnp_file:
            logger.info(
                "Skipping GATK BaseRecalibrator because no VCF file of known variants was found."
            )
            return data
        broad_runner = broad.runner_from_config(data["config"])
        align_bam = dd.get_align_bam(data)
        ref_file = dd.get_ref_file(data)
        platform = dd.get_platform(data)
        regions = dd.get_variant_regions(data) or dd.get_sample_callable(data)
        data["prep_recal"] = _gatk_base_recalibrator(
            broad_runner, align_bam, ref_file, platform, dbsnp_file, regions,
            data)
    elif recal_type == "sentieon":
        logger.info("Prepare BQSR tables with sentieon: %s "
                    % str(dd.get_sample_name(data)))
        data["prep_recal"] = sentieon.bqsr_table(data)
    elif recal_type:
        raise NotImplementedError("Unsupported recalibration type: %s"
                                  % (recal_type))
    return data
def filter_multimappers(align_file, data):
    """Keep only uniquely mapped reads from a BWA alignment.

    Follows the approach discussed at
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    and drops reads that are unmapped, duplicate, supplementary, or that
    carry an XA or SA tag. Paired input is limited to proper pairs.

    The result is written as ``<base>.unique<ext>`` beside the input and
    indexed; an existing output is reused. Returns the output path.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    stem, suffix = os.path.splitext(align_file)
    out_file = "".join([stem, ".unique", suffix])
    target_bed = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    if target_bed:
        bed_cmd = '-L {0}'.format(target_bed)
    else:
        bed_cmd = " "
    if utils.file_exists(out_file):
        return out_file
    if bam.is_paired(align_file):
        pair_clause = "and paired and proper_pair"
    else:
        pair_clause = ""
    filter_string = '-F "not unmapped {paired_filter} and not duplicate and [XA] == null and [SA] == null and not supplementary " '.format(
        paired_filter=pair_clause)
    fields = {
        "sambamba": config_utils.get_program("sambamba", config),
        "type_flag": type_flag,
        "num_cores": dd.get_num_cores(data),
        "bed_cmd": bed_cmd,
        "filter_string": filter_string,
        "align_file": align_file,
    }
    cmd = ('{sambamba} view -h{type_flag} '
           '--nthreads {num_cores} '
           '-f bam {bed_cmd} '
           '{filter_string} '
           '{align_file} '
           '> {tx_out_file}')
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(tx_out_file=tx_out_file, **fields),
               "Removing multimapped reads from %s." % align_file)
    bam.index(out_file, config)
    return out_file
def filter_multimappers(align_file, data):
    """Filter a BWA alignment down to uniquely mapped reads with sambamba.

    Based on
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    Reads are removed when unmapped, supplementary, or tagged with XA or SA;
    paired input additionally requires proper pairs. Duplicates are not
    filtered here.

    Output goes to ``<base>.unique<ext>`` next to the input and is indexed.
    If that file already exists it is returned untouched.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    prefix, extension = os.path.splitext(align_file)
    out_file = prefix + ".unique" + extension
    regions = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    if utils.file_exists(out_file):
        return out_file

    # Optional restriction to the sample's target regions.
    bed_cmd = " " if not regions else '-L {0}'.format(regions)
    paired_filter = ""
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    filter_template = '-F "not unmapped {paired_filter} and [XA] == null and [SA] == null and not supplementary " '
    filter_string = filter_template.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}').format(
                   sambamba=sambamba, type_flag=type_flag,
                   num_cores=num_cores, bed_cmd=bed_cmd,
                   filter_string=filter_string, align_file=align_file,
                   tx_out_file=tx_out_file)
        do.run(cmd, "Removing multimapped reads from %s." % align_file)
    bam.index(out_file, config)
    return out_file
def get_sv_bed(data, method=None, out_dir=None, include_gene_names=True):
    """Return a BED file of regions for SV and heterogeneity calling.

    ``method`` selects how the regions are obtained:

    - exons: the raw gene BED file of exon regions
    - transcripts: collapsed regions spanning the min and max of each
      transcript
    - transcriptsXXXX: as above, with a window of XXXX around transcripts
    - a path to an existing custom BED file, returned unchanged

    When ``method`` is None it defaults to the configured ``sv_regions``,
    then the variant regions, then the sample callable BED.

    Returns None when no method or no gene BED file is available. Raises
    ValueError for an unrecognized method.
    """
    if method is None:
        method = (tz.get_in(["config", "algorithm", "sv_regions"], data)
                  or dd.get_variant_regions(data)
                  or dd.get_sample_callable(data))
    gene_file = dd.get_gene_bed(data)
    if method and os.path.isfile(method):
        return method
    if not (gene_file and method):
        return None
    if method == "exons":
        return gene_file
    if not method.startswith("transcripts"):
        raise ValueError("Unexpected transcript retrieval method: %s" % method)
    # Whatever follows "transcripts" is the window size, e.g. "1e4".
    suffix = method.split("transcripts")[-1]
    window = int(float(suffix)) if suffix else 0
    return _collapse_transcripts(gene_file, window, data, out_dir,
                                 include_gene_names=include_gene_names)
def get_base_cnv_regions(data, work_dir, genome_default="transcripts1e4",
                         include_gene_names=True):
    """Choose the target regions used for CNV analysis.

    Uses the configured SV regions when there are any. Otherwise whole
    genome samples are limited to extended transcript regions
    (``genome_default``) to avoid long runtimes, and everything else falls
    back to the variant regions or the sample callable BED.

    Returns the selected regions after ``bedutils.clean_file``.
    """
    regions = get_sv_bed(data, include_gene_names=include_gene_names)
    # No configured BED or regions to use for SV calling
    if not regions:
        # Genome calls: subset to regions near genes as targets
        if dd.get_coverage_interval(data) == "genome":
            regions = get_sv_bed(data, genome_default, work_dir,
                                 include_gene_names=include_gene_names)
            if regions:
                regions = remove_exclude_regions(regions, regions, [data])
        # Last resort: the defined variant regions
        if not regions:
            regions = (dd.get_variant_regions(data)
                       or dd.get_sample_callable(data))
    return bedutils.clean_file(regions, data)
def filter_multimappers(align_file, data):
    """Remove multi-mapped reads from a bowtie2 alignment.

    bowtie2 offers nothing that matches bowtie's ``-m 1`` exactly. Per the
    original author's note it does set the XS tag for reads mapping in more
    than one place, so filtering on a missing XS tag keeps the unique ones.
    Unmapped reads are dropped and paired input is limited to proper pairs.
    Not applicable to other aligners.

    Writes ``<base>.unique<ext>`` beside the input, indexes it and returns
    its path. An already existing output is returned as is.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    root, ext = os.path.splitext(align_file)
    out_file = "{0}.unique{1}".format(root, ext)
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    if utils.file_exists(out_file):
        return out_file
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    paired_filter = ("and paired and proper_pair"
                     if bam.is_paired(align_file) else "")
    filter_string = '-F "[XS] == null and not unmapped {paired_filter}" '.format(
        paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    pieces = ['{sambamba} view -h{type_flag} ',
              '--nthreads {num_cores} ',
              '-f bam {bed_cmd} ',
              '{filter_string} ',
              '{align_file} ',
              '> {tx_out_file}']
    with file_transaction(out_file) as tx_out_file:
        command = "".join(pieces).format(
            sambamba=sambamba, type_flag=type_flag, num_cores=num_cores,
            bed_cmd=bed_cmd, filter_string=filter_string,
            align_file=align_file, tx_out_file=tx_out_file)
        do.run(command, "Removing multimapped reads from %s." % align_file)
    bam.index(out_file, config)
    return out_file
def _run_purecn_dx(out, paired):
    """Run PureCN's Dx.R on an rds result to get signatures and burdens.

    ``out`` is returned unchanged when it has no "rds" entry (no PureCN
    solution, hence no signatures). Otherwise Dx.R is run unless the
    mutation burden CSV is already up to date relative to the rds file, and
    the files it generated are moved next to the final output base.

    Returns the output dictionary, as updated by ``_get_purecn_dx_files``
    when the script was run.
    """
    # no solution - no signatures
    if "rds" not in out:
        return out
    rscript = utils.Rscript_cmd()
    purecndx_r = utils.R_package_script("PureCN", "extdata/Dx.R", env="base")
    simple_repeat_bed = dd.get_variation_resources(
        paired.tumor_data)["simple_repeat"]
    callable_bed = dd.get_sample_callable(paired.tumor_data)
    out_base = utils.splitext_plus(out["rds"])[0]
    mutation_burden_csv = out_base + "_mutation_burden.csv"
    if utils.file_uptodate(mutation_burden_csv, out["rds"]):
        return out
    # no signatures - so we generate them
    with file_transaction(paired.tumor_data, out_base) as tx_out_base:
        cmd = [rscript, purecndx_r,
               "--rds", out["rds"],
               "--callable", callable_bed,
               "--signatures",
               "--exclude", simple_repeat_bed,
               "--out", tx_out_base]
        do.run(cmd, "PureCN Dx mutational burden and signatures")
        out_base, out, all_files = _get_purecn_dx_files(paired, out,
                                                        require_exist=True)
        # if a file was not generated it would not go to the upload
        tx_dir = os.path.dirname(tx_out_base)
        final_dir = os.path.dirname(out_base)
        for fname in all_files:
            tx_path = os.path.join(tx_dir, fname)
            if os.path.exists(tx_path):
                shutil.move(tx_path, os.path.join(final_dir, fname))
    return out
def _get_variant_regions(data):
    """Return the sample's variant regions, or its callable BED as fallback.

    When ``merged`` is truthy the pre-merged variant regions are returned if
    available; otherwise the selected regions are merged on the fly.
    """
    # NOTE(review): `merged` is not defined in this block -- presumably it
    # comes from an enclosing or module scope; confirm.
    regions = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    if not merged:
        return regions
    premerged = dd.get_variant_regions_merged(data)
    return premerged if premerged else merge_overlaps(regions, data)
def _get_variant_regions(data):
    """Return the sample's variant regions, or its callable BED as fallback.

    Merging applies only when ``merged`` is truthy and the sample has
    variant regions; callable BED regions do not overlap, so they are
    returned as they are. Pre-merged regions are used when available,
    otherwise the regions are merged on the fly.
    """
    # NOTE(review): `merged` is not defined in this block -- presumably it
    # comes from an enclosing or module scope; confirm.
    variant_regions = dd.get_variant_regions(data)
    regions = variant_regions or dd.get_sample_callable(data)
    if not (merged and variant_regions):
        return regions
    premerged = dd.get_variant_regions_merged(data)
    if premerged:
        return premerged
    return merge_overlaps(regions, data)
def _maybe_add_callable(data, out):
    """Append callable and per-base depth BED entries to ``out``.

    Each file is added only when present for the sample. ``out`` is extended
    in place and also returned.
    """
    optional_files = (
        (dd.get_sample_callable(data), "bed", "callable"),
        (tz.get_in(["depth", "variant_regions", "per_base"], data),
         "bed.gz", "depth-per-base"),
    )
    for path, file_type, ext in optional_files:
        if path:
            out.append({"path": path, "type": file_type, "ext": ext})
    return out
def _prep_real_counts(bam_file, data, samtools_stats):
    """Collect the observed counts stored under the ``Preseq_*`` keys.

    Counts are restricted to a target BED file when one applies (the merged
    coverage regions, or the merged variant / callable regions for
    non-genome samples). Whole genome samples use the samtools statistics
    and the total contig size instead.

    Returns a dict with ``Preseq_genome_size``, ``Preseq_read_count``,
    ``Preseq_read_length`` and, when duplicates are marked,
    ``Preseq_unique_count``.
    """
    coverage = dd.get_coverage(data)
    if coverage and coverage not in ["None"]:
        bed, target_name = dd.get_coverage_merged(data), "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        bed = (dd.get_variant_regions_merged(data)
               or dd.get_sample_callable(data))
        target_name = "variant_regions"
    else:
        bed, target_name = None, "genome"
    dedupped = utils.get_in(data, ("config", "algorithm", "mark_duplicates"),
                            True)

    counts = {}
    if not bed:
        # WGS
        contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
        counts["Preseq_genome_size"] = sum(c.size for c in contigs)
        counts["Preseq_read_count"] = int(samtools_stats["Total_reads"])
        counts["Preseq_read_length"] = int(
            samtools_stats["Average_read_length"])
        if dedupped:
            counts["Preseq_unique_count"] = (
                counts["Preseq_read_count"]
                - int(samtools_stats["Duplicates"]))
        return counts

    counts["Preseq_genome_size"] = pybedtools.BedTool(bed).total_coverage()
    counts["Preseq_read_count"] = readstats.number_of_mapped_reads(
        data, bam_file, keep_dups=True, bed_file=bed, target_name=target_name)
    ontrg_unique_depth = cov.get_average_coverage(target_name, bed, data,
                                                  bam_file)
    if dedupped:
        counts["Preseq_unique_count"] = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=bed,
            target_name=target_name)
    # Average on-target alignment length, from the approximation:
    #   avg depth ~ num (unique) on-target alignments
    #               * avg on-target aln length / target size
    total_alignments = (counts.get("Preseq_unique_count")
                        or counts["Preseq_read_count"])
    counts["Preseq_read_length"] = (
        ontrg_unique_depth * counts["Preseq_genome_size"] // total_alignments)
    return counts
def _run_purecn_dx(out, paired):
    """Run PureCN_Dx.R to extract signatures and mutational burdens.

    The script is skipped when the mutation burden output is up to date
    relative to the rds file. Files it generates are moved from the
    transaction directory to the directory of the final output base.

    Returns the output dictionary from ``_get_purecn_dx_files``.
    """
    out_base, out, all_files = _get_purecn_dx_files(paired, out)
    if utils.file_uptodate(out["mutation_burden"], out["rds"]):
        return out
    with file_transaction(paired.tumor_data, out_base) as tx_out_base:
        callable_bed = dd.get_sample_callable(paired.tumor_data)
        cmd = ["PureCN_Dx.R",
               "--rds", out["rds"],
               "--callable", callable_bed,
               "--signatures",
               "--out", tx_out_base]
        do.run(cmd, "PureCN Dx mutational burden and signatures")
        tx_dir = os.path.dirname(tx_out_base)
        final_dir = os.path.dirname(out_base)
        # Only files that were actually produced are moved into place.
        for fname in all_files:
            produced = os.path.join(tx_dir, fname)
            if os.path.exists(produced):
                shutil.move(produced, os.path.join(final_dir, fname))
    return out
def assign_interval(data):
    """Classify the sample's coverage type and store it in the configuration.

    Skipped when a coverage interval is already set. Otherwise writes one of
    these labels to ``data["config"]["algorithm"]["coverage_interval"]``:

    - genome: more than 40% of the genome is covered
    - regional: targeted coverage with more than 5% off-target reads, like
      exome capture
    - amplicon: amplification based targeted coverage without substantial
      off-target reads

    The off-target fraction is read from the off-target stats YAML file
    ("offtarget" / "mapped_unique") and treated as 0 when unavailable.
    Returns ``data``, which is modified in place.
    """
    genome_cov_thresh = 0.40  # fraction of genome covered for whole genome analysis
    offtarget_thresh = 0.05  # fraction of offtarget reads required to be capture (not amplification) based
    if dd.get_coverage_interval(data):
        return data
    vrs = dd.get_variant_regions(data)
    callable_file = dd.get_sample_callable(data)
    seq_size = pybedtools.BedTool(vrs if vrs else callable_file).total_coverage()
    contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
    total_size = sum(c.size for c in contigs)
    genome_cov_pct = seq_size / float(total_size)
    offtarget_pct = 0.0
    if genome_cov_pct > genome_cov_thresh:
        cov_interval = "genome"
    else:
        offtarget_stats_file = dd.get_offtarget_stats(data)
        if offtarget_stats_file:
            with open(offtarget_stats_file) as in_handle:
                stats = yaml.safe_load(in_handle)
            if stats.get("offtarget") and stats["mapped_unique"]:
                offtarget_pct = (float(stats["offtarget"])
                                 / stats["mapped_unique"])
        cov_interval = ("regional" if offtarget_pct > offtarget_thresh
                        else "amplicon")
    logger.info(
        "%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
        % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0,
           offtarget_pct * 100.0))
    data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
def get_base_cnv_regions(data, work_dir, genome_default="transcripts1e4",
                         include_gene_names=True):
    """Retrieve the set of target regions for CNV analysis.

    Configured SV regions take priority. Without them, WGS samples are
    subset to extended transcript regions (``genome_default``) to avoid long
    runtimes; otherwise the variant regions or the sample callable BED are
    used. The chosen regions are passed through ``bedutils.clean_file``.
    """
    cov_interval = dd.get_coverage_interval(data)
    selected = get_sv_bed(data, include_gene_names=include_gene_names)
    if selected:
        return bedutils.clean_file(selected, data)
    # No configured BED or regions for SV calling.
    # For genome calls, subset to regions near genes as targets.
    if cov_interval == "genome":
        selected = get_sv_bed(data, genome_default, work_dir,
                              include_gene_names=include_gene_names)
        if selected:
            selected = remove_exclude_regions(selected, selected, [data])
    # Finally, default to the defined variant regions.
    if not selected:
        selected = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    return bedutils.clean_file(selected, data)
def _evaluate_vcf(calls, truth_vcf, work_dir, data):
    """Write a CSV of SV validation metrics for every caller in ``calls``.

    Each call is validated against ``truth_vcf`` within the sample callable
    regions; per-caller details go to a sub-directory of ``work_dir`` named
    after the caller. An existing CSV is reused.

    Returns the path to ``<sample>-sv-validate.csv`` in ``work_dir``.
    """
    out_file = os.path.join(work_dir,
                            "%s-sv-validate.csv" % dd.get_sample_name(data))
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(["sample", "caller", "vtype", "metric", "value"])
            for call in calls:
                caller = call["variantcaller"]
                detail_dir = utils.safe_makedir(os.path.join(work_dir, caller))
                writer.writerows(_validate_caller_vcf(
                    call["vrn_file"], truth_vcf, dd.get_sample_callable(data),
                    caller, detail_dir, data))
    return out_file
def assign_interval(data):
    """Identify the coverage type from genome coverage and off-target reads.

    Does nothing when a coverage interval is already set. Otherwise stores
    one of three categories under
    ``data["config"]["algorithm"]["coverage_interval"]``:

    - genome: more than 40% of the genome is covered
    - regional: targeted coverage with more than 10% off-target reads, like
      exome capture
    - amplicon: amplification based targeted coverage without substantial
      off-target reads

    The off-target fraction comes from the off-target stats YAML file
    ("offtarget" / "mapped") and is 0 when the file is missing or no reads
    are mapped. Returns ``data``, which is modified in place.
    """
    genome_cov_thresh = 0.40  # fraction of genome covered for whole genome analysis
    offtarget_thresh = 0.10  # fraction of offtarget reads required to be capture (not amplification) based
    if dd.get_coverage_interval(data):
        return data
    vrs = dd.get_variant_regions(data)
    callable_file = dd.get_sample_callable(data)
    size_bed = vrs if vrs else callable_file
    seq_size = pybedtools.BedTool(size_bed).total_coverage()
    total_size = sum(
        contig.size
        for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]))
    genome_cov_pct = seq_size / float(total_size)
    offtarget_pct = 0.0
    if genome_cov_pct > genome_cov_thresh:
        cov_interval = "genome"
    else:
        offtarget_stat_file = dd.get_offtarget_stats(data)
        if offtarget_stat_file:
            with open(offtarget_stat_file) as in_handle:
                stats = yaml.safe_load(in_handle)
            if float(stats["mapped"]) > 0:
                offtarget_pct = stats["offtarget"] / float(stats["mapped"])
        if offtarget_pct > offtarget_thresh:
            cov_interval = "regional"
        else:
            cov_interval = "amplicon"
    logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                % (dd.get_sample_name(data), cov_interval,
                   genome_cov_pct * 100.0, offtarget_pct * 100.0))
    data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
def prep_recal(data):
    """Calculate recalibration tables as the pre-BQSR step.

    Dispatches on the configured recalibration setting:

    - ``True`` or "gatk": GATK, skipped with a log message when no dbSNP
      file of known variants is configured
    - "sentieon": sentieon
    - any other truthy value: NotImplementedError

    The result is stored under ``data["prep_recal"]``. Returns ``data``.
    """
    method = dd.get_recalibrate(data)
    if not method:
        return data
    if method in [True, "gatk"]:
        logger.info("Prepare BQSR tables with GATK: %s " % str(dd.get_sample_name(data)))
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return data
        broad_runner = broad.runner_from_config(data["config"])
        data["prep_recal"] = _gatk_base_recalibrator(
            broad_runner,
            dd.get_align_bam(data),
            dd.get_ref_file(data),
            dd.get_platform(data),
            dbsnp_file,
            dd.get_variant_regions(data) or dd.get_sample_callable(data),
            data)
        return data
    if method == "sentieon":
        logger.info("Prepare BQSR tables with sentieon: %s " % str(dd.get_sample_name(data)))
        data["prep_recal"] = sentieon.bqsr_table(data)
        return data
    raise NotImplementedError("Unsupported recalibration type: %s" % (method))
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burdens from a PureCN rds file.

    Runs PureCN's ``extdata/Dx.R`` through the "r36" Rscript unless the
    mutation burden output is already up to date relative to the rds file.
    Files the script generates are moved from the transaction directory to
    the directory of the final output base.

    Returns the output dictionary from ``_get_purecn_dx_files``.
    """
    out_base, out, all_files = _get_purecn_dx_files(paired, out)
    rscript = utils.Rscript_cmd("r36")
    purecndx_r = utils.R_package_script("r36", "PureCN", "extdata/Dx.R")
    tumor = paired.tumor_data
    simple_repeat_bed = dd.get_variation_resources(tumor)["simple_repeat"]
    callable_bed = dd.get_sample_callable(tumor)
    if utils.file_uptodate(out["mutation_burden"], out["rds"]):
        return out
    with file_transaction(paired.tumor_data, out_base) as tx_out_base:
        cmd = [rscript, purecndx_r,
               "--rds", out["rds"],
               "--callable", callable_bed,
               "--signatures",
               "--exclude", simple_repeat_bed,
               "--out", tx_out_base]
        do.run(cmd, "PureCN Dx mutational burden and signatures")
        src_dir = os.path.dirname(tx_out_base)
        dest_dir = os.path.dirname(out_base)
        # Only files that were actually produced are moved into place.
        for fname in all_files:
            src = os.path.join(src_dir, fname)
            if os.path.exists(src):
                shutil.move(src, os.path.join(dest_dir, fname))
    return out
def run(bam_file, data, out_dir):
    """Run coverage QC analysis for an aligned sample.

    Collects average coverage, read counts from the samtools statistics and,
    when a target BED file applies, on/off-target percentages. The target is
    the merged coverage regions when a coverage file is configured, the
    merged variant / callable regions for non-genome samples, and nothing
    for whole genomes.

    Returns a dict with the metrics under "metrics". When output files were
    produced, the first is stored under "base" and the rest under
    "secondary".
    """
    def _pct(part, whole):
        return 100.0 * part / whole

    metrics = dict()
    out_dir = utils.safe_makedir(out_dir)
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        merged_bed_file = bedutils.clean_file(dd.get_coverage_merged(data),
                                              data, prefix="cov-", simple=True)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = (dd.get_variant_regions_merged(data)
                           or dd.get_sample_callable(data))
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"

    avg_depth = cov.get_average_coverage(target_name, merged_bed_file, data)
    # Detailed per-region statistics only exist for a coverage file.
    out_files = []
    if target_name == "coverage":
        out_files = cov.coverage_region_detailed_stats(
            target_name, merged_bed_file, data, out_dir)
    metrics['Avg_coverage'] = avg_depth

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data,
                                  samtools_stats_dir)["metrics"]

    total_reads = int(samtools_stats["Total_reads"])
    metrics["Total_reads"] = total_reads
    mapped = int(samtools_stats["Mapped_reads"])
    metrics["Mapped_reads"] = mapped
    metrics["Mapped_paired_reads"] = int(samtools_stats["Mapped_paired_reads"])
    dups = int(samtools_stats["Duplicates"])
    metrics['Duplicates'] = dups

    if total_reads:
        metrics["Mapped_reads_pct"] = _pct(mapped, total_reads)
    if mapped:
        metrics['Duplicates_pct'] = _pct(dups, mapped)

    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        mapped_unique = readstats.number_of_mapped_reads(data, bam_file,
                                                         keep_dups=False)
    metrics['Mapped_unique_reads'] = mapped_unique

    if merged_bed_file:
        ontarget = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=merged_bed_file,
            target_name=target_name)
        metrics["Ontarget_unique_reads"] = ontarget
        if mapped_unique:
            metrics["Ontarget_pct"] = _pct(ontarget, mapped_unique)
            metrics['Offtarget_pct'] = _pct(mapped_unique - ontarget,
                                            mapped_unique)
            if dd.get_coverage_interval(data) != "genome":
                # Skip the padded calculation for WGS even if the "coverage"
                # file is specified: the padded statistic only makes sense
                # for exomes and panels.
                padded_bed_file = bedutils.get_padded_bed_file(
                    out_dir, merged_bed_file, 200, data)
                ontarget_padded = readstats.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                metrics["Ontarget_padded_pct"] = _pct(ontarget_padded,
                                                      mapped_unique)
        if total_reads:
            metrics['Usable_pct'] = _pct(ontarget, total_reads)

    indexcov_files = _goleft_indexcov(bam_file, data, out_dir)
    out_files += [f for f in indexcov_files if f and utils.file_exists(f)]
    result = {"metrics": metrics}
    if len(out_files) > 0:
        result["base"] = out_files[0]
        result["secondary"] = out_files[1:]
    return result
def _evaluate_vcf(calls, truth_vcf, work_dir, data):
    """Write SV validation metrics for each caller to a CSV file.

    Calls without a "vrn_file" contribute no rows, although their detail
    directory is still created. Every other call is validated against
    ``truth_vcf`` within the sample callable regions. An existing CSV is
    reused.

    Returns the path to ``<sample>-sv-validate.csv`` in ``work_dir``.
    """
    csv_name = "%s-sv-validate.csv" % dd.get_sample_name(data)
    out_file = os.path.join(work_dir, csv_name)
    if utils.file_exists(out_file):
        return out_file
    header = ["sample", "caller", "vtype", "metric", "value"]
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(header)
            for call in calls:
                detail_dir = utils.safe_makedir(
                    os.path.join(work_dir, call["variantcaller"]))
                if not call.get("vrn_file"):
                    continue
                rows = _validate_caller_vcf(
                    call["vrn_file"], truth_vcf, dd.get_sample_callable(data),
                    call["variantcaller"], detail_dir, data)
                for row in rows:
                    writer.writerow(row)
    return out_file