def priority_total_coverage(data, out_dir):
    """Calculate coverage at 10x depth intervals in the priority regions.

    Returns {} when no usable svprioritize BED is available (missing file or
    a gene list rather than regions); otherwise the path to the coverage BED.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    # Bug fix: the original guard `not bed_file and not file_exists(bed_file)
    # or prioritize.is_gene_list(bed_file)` parses as `(A and B) or C` because
    # `and` binds tighter than `or`, so a None/missing BED file still reached
    # file_exists/is_gene_list and crashed. Each condition now short-circuits.
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    cleaned_bed = clean_file(bed_file, data, prefix="svprioritize-")
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    # Skip recomputation when the output is newer than both inputs.
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                 depth_thresholds=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    with file_transaction(out_file) as tx_out_file:
        message = "Calculating region coverage of {bed_file} in {in_bam}"
        do.run(cmdl + " -o " + tx_out_file, message.format(**locals()))
    logger.debug("Saved svprioritize coverage into " + out_file)
    return out_file
def coverage_region_detailed_stats(data, out_dir):
    """Calculate coverage at different completeness cutoffs
    for regions in the coverage option.

    Returns the absolute path of the per-region coverage BED, or None when
    no coverage BED is configured for this sample.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        # Removed the unused `parse_total_file` local (dead code).
        parse_file = os.path.join(sample + "_coverage.bed")
        # Only regenerate when the parse file is stale relative to its inputs.
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(parse_file) as out_tx:
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                                             max_cov=1000)
                # Normalize the sambamba header so downstream parsing sees "chrom".
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample)
    return os.path.abspath(parse_file)
def coverage_region_detailed_stats(data, out_dir, extra_cutoffs=None):
    """Calculate coverage at different completeness cutoffs
    for regions in the coverage option.

    extra_cutoffs -- optional set of additional depth thresholds merged into
    the default cutoffs before running sambamba.

    Returns a list of absolute paths to the percentile output files, or []
    when no usable coverage BED is configured.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file or not utils.file_exists(bed_file):
        return []
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(data, parse_file) as out_tx:
                # Bug fix: extra_cutoffs defaults to None, and the original
                # `cutoffs | extra_cutoffs` raised TypeError for that default.
                # Also dropped the redundant list() inside sorted().
                depth_thresholds = sorted(cutoffs | (extra_cutoffs or set()))
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=depth_thresholds)
                # Normalize the sambamba header so downstream parsing sees "chrom".
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        out_files = _calculate_percentiles(os.path.abspath(parse_file), sample,
                                           data=data, cutoffs=cutoffs)
    return [os.path.abspath(x) for x in out_files]
def coverage_region_detailed_stats(data, out_dir):
    """Calculate coverage at different completeness cutoffs
    for regions in the coverage option.

    Returns the absolute path of the per-region coverage BED, or None when
    no coverage BED is configured for this sample.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        # Removed a no-op leftover statement: `os.path.join(sample +
        # "_cov_total.tsv")` computed a path and discarded it (dead code).
        parse_file = os.path.join(sample + "_coverage.bed")
        # Only regenerate when the parse file is stale relative to its inputs.
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(data, parse_file) as out_tx:
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                                             max_cov=1000)
                # Normalize the sambamba header so downstream parsing sees "chrom".
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample, data=data)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data)
    return os.path.abspath(parse_file)
def regions_coverage(data, bed_file, bam_file, target_name):
    """Run sambamba depth over the BED regions for one BAM, caching the result."""
    sample_dir = os.path.join(dd.get_work_dir(data), "coverage", dd.get_sample_name(data))
    work_dir = utils.safe_makedir(sample_dir)
    out_file = os.path.join(work_dir, target_name + "_regions_depth.bed")
    is_current = (utils.file_uptodate(out_file, bam_file)
                  and utils.file_uptodate(out_file, bed_file))
    if not is_current:
        with file_transaction(data, out_file) as tx_out_file:
            base_cmd = sambamba.make_command(data, "depth region", bam_file, bed_file)
            message = "Calculating regions coverage of {target_name} in {bam_file}"
            do.run(base_cmd + " -o " + tx_out_file, message.format(**locals()))
    return out_file
def regions_coverage(data, bed_file, bam_file, target_name):
    """Run sambamba depth over the BED regions for one BAM, caching the result."""
    sample_dir = os.path.join(dd.get_work_dir(data), "coverage", dd.get_sample_name(data))
    work_dir = utils.safe_makedir(sample_dir)
    out_file = os.path.join(work_dir, target_name + "_regions_depth.bed")
    is_current = (utils.file_uptodate(out_file, bam_file)
                  and utils.file_uptodate(out_file, bed_file))
    if not is_current:
        with file_transaction(out_file) as tx_out_file:
            base_cmd = sambamba.make_command(data, "depth region", bam_file, bed_file)
            message = "Calculating regions coverage of {target_name} in {bam_file}"
            do.run(base_cmd + " -o " + tx_out_file, message.format(**locals()))
    return out_file
def regions_coverage(data, bed_file, bam_file, target_name, depth_thresholds=None):
    """Generate coverage over regions of interest using sambamba depth.

    sambamba can segfault with multiple threads so provides a single threaded
    backup implementation in case of failures.
    """
    sample_dir = os.path.join(dd.get_work_dir(data), "coverage", dd.get_sample_name(data))
    out_file = os.path.join(utils.safe_makedir(sample_dir), target_name + "_regions_depth.bed")
    is_current = (utils.file_uptodate(out_file, bam_file)
                  and utils.file_uptodate(out_file, bed_file))
    if not is_current:
        with file_transaction(data, out_file) as tx_out_file:
            try:
                # First attempt: default (multicore) sambamba invocation.
                cmdl = sambamba.make_command(data, "depth region", bam_file, bed_file,
                                             depth_thresholds=depth_thresholds)
                cmdl += " -o " + tx_out_file
                message = "Calculating regions coverage of {target_name} in {bam_file}"
                do.run(cmdl, message.format(**locals()))
            except subprocess.CalledProcessError:
                # Fallback: retry single threaded after a sambamba failure.
                cmdl = sambamba.make_command(data, "depth region", bam_file, bed_file,
                                             depth_thresholds=depth_thresholds,
                                             multicore=False)
                cmdl += " -o " + tx_out_file
                message = "Calculating regions coverage of {target_name} in {bam_file} -- single thread backup"
                do.run(cmdl, message.format(**locals()))
    return out_file
def priority_coverage(data, out_dir):
    """Per-base depth over svprioritize regions for one sample.

    Returns `data` unchanged when no usable region BED exists; otherwise the
    path to the per-base depth BED.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    unusable = (not bed_file or not file_exists(bed_file)
                or prioritize.is_gene_list(bed_file))
    if unusable:
        return data
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    up_to_date = (utils.file_uptodate(out_file, cleaned_bed)
                  and utils.file_uptodate(out_file, in_bam))
    if up_to_date:
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        base_cmd = sambamba.make_command(data, "depth base", in_bam, cleaned_bed)
        # Reshape sambamba base-depth output into BED-like columns, dropping
        # the header line.
        parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
        cmdl = base_cmd + " | {parse_cmd} > {tx_out_file}"
        message = "Calculating base coverage of {bed_file} in {in_bam}"
        do.run(cmdl.format(**locals()), message.format(**locals()))
    return out_file
def priority_coverage(data, out_dir):
    """Per-base depth over svprioritize regions for one sample.

    Returns `data` unchanged when no usable region BED exists; otherwise the
    path to the per-base depth BED.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    unusable = (not bed_file or not file_exists(bed_file)
                or prioritize.is_gene_list(bed_file))
    if unusable:
        return data
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    up_to_date = (utils.file_uptodate(out_file, cleaned_bed)
                  and utils.file_uptodate(out_file, in_bam))
    if up_to_date:
        return out_file
    with file_transaction(out_file) as tx_out_file:
        base_cmd = sambamba.make_command(data, "depth base", in_bam, cleaned_bed)
        # Reshape sambamba base-depth output into BED-like columns, dropping
        # the header line.
        parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
        cmdl = base_cmd + " | {parse_cmd} > {tx_out_file}"
        message = "Calculating base coverage of {bed_file} in {in_bam}"
        do.run(cmdl.format(**locals()), message.format(**locals()))
    return out_file
def priority_total_coverage(data, out_dir):
    """Calculate coverage at 10x depth intervals in the priority regions.

    Returns {} when no usable svprioritize BED is available (missing file or
    a gene list rather than regions); otherwise the path to the coverage BED.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    # Bug fix: the original guard `not bed_file and not file_exists(bed_file)
    # or prioritize.is_gene_list(bed_file)` parses as `(A and B) or C` because
    # `and` binds tighter than `or`, so a None/missing BED file still reached
    # file_exists/is_gene_list and crashed. Each condition now short-circuits.
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    cleaned_bed = clean_file(bed_file, data, prefix="svprioritize-")
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    # Skip recomputation when the output is newer than both inputs.
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                 depth_thresholds=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    with file_transaction(out_file) as tx_out_file:
        message = "Calculating region coverage of {bed_file} in {in_bam}"
        do.run(cmdl + " -o " + tx_out_file, message.format(**locals()))
    logger.debug("Saved svprioritize coverage into " + out_file)
    return out_file