def coverage(data):
    """Compute per-region read coverage for a sample's experimental BED regions.

    Reads the coverage BED from ``dd.get_coverage_experimental``, batches its
    regions into samtools/bedtools pipeline calls, accumulates summary stats in
    a ``cov_class`` object, and records the output path under
    ``data['coverage']``.

    :param data: bcbio-style sample dictionary; must carry ``work_bam``.
    :returns: the same ``data`` dict, with ``data['coverage']`` set when a
              coverage BED is configured (returned unchanged otherwise).
    """
    AVERAGE_REGION_STRING_LENGTH = 100
    bed_file = dd.get_coverage_experimental(data)
    if not bed_file:
        # No coverage BED configured for this sample: nothing to do.
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    # Floor division: under Python 3, "/" would produce a float and
    # robust_partition_all needs an integer batch size.
    batch_size = max_command_length() // AVERAGE_REGION_STRING_LENGTH
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        if not file_exists(parse_file):
            total_cov = cov_class(0, None, sample)
            with file_transaction(parse_file) as out_tx:
                # Write the header first, then close the handle so the helper
                # below can append rows to the same transactional path.
                with open(out_tx, 'w') as out_handle:
                    HEADER = ["#chrom", "start", "end", "region", "reads",
                              "strand", "size", "sample", "mean", "sd",
                              "cutoff10", "cutoff20", "cutoff4", "cutoff50"]
                    out_handle.write("\t".join(HEADER) + "\n")
                with tmpfile() as tx_tmp_file:
                    lcount = 0
                    for chunk in robust_partition_all(batch_size, region_bed):
                        coord_batch = []
                        line_batch = ""
                        for line in chunk:
                            lcount += 1
                            chrom = line.chrom
                            # Clamp negative starts (can appear in padded BEDs).
                            start = max(line.start, 0)
                            end = line.end
                            coords = "%s:%s-%s" % (chrom, start, end)
                            coord_batch.append(coords)
                            line_batch += str(line)
                        if not coord_batch:
                            continue
                        # Materialize this batch of regions as a temporary BED
                        # file for bedtools.
                        region_file = pybedtools.BedTool(
                            line_batch, from_string=True).saveas().fn
                        coord_string = " ".join(coord_batch)
                        cmd = ("samtools view -b {in_bam} {coord_string} | "
                               "bedtools coverage -a {region_file} -b - "
                               "-hist > {tx_tmp_file}")
                        _silence_run(cmd.format(**locals()))
                        total_cov = _get_exome_coverage_stats(
                            os.path.abspath(tx_tmp_file), sample, out_tx,
                            total_cov)
                    logger.debug("Processed %d regions." % lcount)
                    total_cov.write_coverage(parse_total_file)
    data['coverage'] = os.path.abspath(parse_file)
    return data
def priority_coverage(data):
    """Compute per-base depth over SV-prioritized regions for a sample.

    Batches regions from the sv-prioritize BED into samtools/bedtools/awk
    pipeline calls that append per-base depth rows to a transactional output
    file, then records the result under ``data['priority_coverage']``.

    :param data: bcbio-style sample dictionary.
    :returns: the same ``data`` dict, with ``data['priority_coverage']`` set
              when a prioritize BED exists (returned unchanged otherwise).
    """
    AVERAGE_REGION_STRING_LENGTH = 100
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        # No prioritization BED configured: nothing to do.
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    # Floor division: under Python 3, "/" would produce a float and
    # robust_partition_all needs an integer batch size.
    batch_size = max_command_length() // AVERAGE_REGION_STRING_LENGTH
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        # Already computed: reuse the existing output.
        data['priority_coverage'] = os.path.abspath(out_file)
        return data
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        logger.debug("Calculating priority coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        # Loop-invariant setup hoisted out of the batch loop: the awk program
        # and the tool paths do not change between batches.
        awk_string = r"""'BEGIN {OFS="\t"} {print $1,$2+$5,$2+$5,$4,$6"\t%s"}'""" % sample
        samtools = config_utils.get_program("samtools", data["config"])
        bedtools = config_utils.get_program("bedtools", data["config"])
        with file_transaction(out_file) as tx_out_file:
            lcount = 0
            for chunk in robust_partition_all(batch_size, region_bed):
                coord_batch = []
                line_batch = ""
                for line in chunk:
                    lcount += 1
                    chrom = line.chrom
                    # Clamp negative starts (can appear in padded BEDs).
                    start = max(line.start, 0)
                    end = line.end
                    coords = "%s:%s-%s" % (chrom, start, end)
                    coord_batch.append(coords)
                    line_batch += "%s\t%s\t%s\n" % (chrom, start, end)
                if not coord_batch:
                    continue
                # Materialize this batch of regions as a temporary BED file.
                region_file = pybedtools.BedTool(
                    line_batch, from_string=True).saveas().fn
                coord_string = " ".join(coord_batch)
                # Appends (>>) per-base depth rows for each batch into the
                # transactional output file.
                cmd = ("{samtools} view -b {in_bam} {coord_string} | "
                       "{bedtools} coverage -sorted -d -a {region_file} -b - | "
                       "awk {awk_string} >> {tx_out_file}")
                _silence_run(cmd.format(**locals()))
    data['priority_coverage'] = os.path.abspath(out_file)
    return data
# NOTE(review): this redefines priority_coverage — an essentially identical
# definition appears earlier in this file; at import time this later one wins.
# Consider deleting one of the two copies.
def priority_coverage(data):
    """Compute per-base depth over SV-prioritized regions for a sample.

    Batches regions from the sv-prioritize BED into samtools/bedtools/awk
    pipeline calls that append per-base depth rows to a transactional output
    file, then records the result under ``data['priority_coverage']``.

    :param data: bcbio-style sample dictionary.
    :returns: the same ``data`` dict, with ``data['priority_coverage']`` set
              when a prioritize BED exists (returned unchanged otherwise).
    """
    AVERAGE_REGION_STRING_LENGTH = 100
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        # No prioritization BED configured: nothing to do.
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    # Floor division: under Python 3, "/" would produce a float and
    # robust_partition_all needs an integer batch size.
    batch_size = max_command_length() // AVERAGE_REGION_STRING_LENGTH
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        # Already computed: reuse the existing output.
        data['priority_coverage'] = os.path.abspath(out_file)
        return data
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        logger.debug("Calculating priority coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        # Loop-invariant setup hoisted out of the batch loop: the awk program
        # and the tool paths do not change between batches.
        awk_string = r"""'BEGIN {OFS="\t"} {print $1,$2+$5,$2+$5,$4,$6"\t%s"}'""" % sample
        samtools = config_utils.get_program("samtools", data["config"])
        bedtools = config_utils.get_program("bedtools", data["config"])
        with file_transaction(out_file) as tx_out_file:
            lcount = 0
            for chunk in robust_partition_all(batch_size, region_bed):
                coord_batch = []
                line_batch = ""
                for line in chunk:
                    lcount += 1
                    chrom = line.chrom
                    # Clamp negative starts (can appear in padded BEDs).
                    start = max(line.start, 0)
                    end = line.end
                    coords = "%s:%s-%s" % (chrom, start, end)
                    coord_batch.append(coords)
                    line_batch += "%s\t%s\t%s\n" % (chrom, start, end)
                if not coord_batch:
                    continue
                # Materialize this batch of regions as a temporary BED file.
                region_file = pybedtools.BedTool(
                    line_batch, from_string=True).saveas().fn
                coord_string = " ".join(coord_batch)
                # Appends (>>) per-base depth rows for each batch into the
                # transactional output file.
                cmd = ("{samtools} view -b {in_bam} {coord_string} | "
                       "{bedtools} coverage -sorted -d -a {region_file} -b - | "
                       "awk {awk_string} >> {tx_out_file}")
                _silence_run(cmd.format(**locals()))
    data['priority_coverage'] = os.path.abspath(out_file)
    return data