def priority_coverage(data, out_dir):
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return data
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
        with file_transaction(out_file) as tx_out_file:
            parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
            cmd = ("{sambamba} depth base -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "{in_bam} | {parse_cmd} > {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    return out_file

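# Hedged illustration (not part of the pipeline): the awk/sed parse above turns
# `sambamba depth base` output -- assumed column layout REF POS COV A C G T DEL
# REFSKIP SAMPLE, with a header line dropped by sed -- into
# chrom/pos/pos/coverage/sample rows. A pure-Python equivalent of one row:
def _parse_depth_base_line(line):
    """Convert one sambamba 'depth base' data row into a BED-like record."""
    cols = line.rstrip("\n").split("\t")
    chrom, pos, cov, sample = cols[0], cols[1], cols[2], cols[9]
    return "\t".join([chrom, pos, pos, cov, sample])
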
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data

def _handle_multiple_svcallers(data, stage):
    """Retrieve configured structural variation caller, handling multiple.

    data is one sample
    """
    svs = get_svcallers(data)
    # special cases -- prioritization
    if stage == "ensemble" and dd.get_svprioritize(data):
        svs.append("prioritize")
    out = []
    for svcaller in svs:
        if svcaller in _get_callers([data], stage):
            base = copy.deepcopy(data)
            # clean SV callers present in multiple rounds and not this caller
            final_svs = []
            for sv in data.get("sv", []):
                if (stage == "ensemble" or sv["variantcaller"] == svcaller
                        or sv["variantcaller"] not in svs
                        or svcaller not in _get_callers([data], stage, special_cases=True)):
                    final_svs.append(sv)
            base["sv"] = final_svs
            base["config"]["algorithm"]["svcaller"] = svcaller
            base["config"]["algorithm"]["svcaller_orig"] = svs
            out.append(base)
    return out

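# A minimal, self-contained sketch of the fan-out pattern implemented above
# (hypothetical helper; the real function also filters callers by stage and
# prunes prior "sv" results): each configured caller gets an independent deep
# copy of the sample so downstream steps can run callers in parallel.
import copy

def _fan_out_by_caller(data, svcallers):
    out = []
    for svcaller in svcallers:
        base = copy.deepcopy(data)  # independent sample copy per caller
        base["config"]["algorithm"]["svcaller"] = svcaller
        base["config"]["algorithm"]["svcaller_orig"] = svcallers
        out.append(base)
    return out
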
def priority_total_coverage(data, out_dir):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    return out_file

def priority_total_coverage(data, out_dir):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    cleaned_bed = clean_file(bed_file, data, prefix="svprioritize-")
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                 depth_thresholds=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    with file_transaction(out_file) as tx_out_file:
        message = "Calculating region coverage of {bed_file} in {in_bam}"
        do.run(cmdl + " -o " + tx_out_file, message.format(**locals()))
    logger.debug("Saved svprioritize coverage into " + out_file)
    return out_file

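# For reference, a sketch of how the repeated depth thresholds used above can be
# expanded into sambamba flags (hypothetical helper; the real command line is
# assembled by sambamba.make_command, and the -T flag spelling is taken from the
# inline commands in the older variants):
def _depth_threshold_args(thresholds):
    """Expand [10, 20] into '-T 10 -T 20' for sambamba depth region."""
    return " ".join("-T %s" % t for t in thresholds)
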
def run(items):
    """Perform detection of structural variations with lumpy.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", "minimap2", False, None] for data in items):
        raise ValueError("Require bwa or minimap2 alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        full_bams.append(dd.get_align_bam(data))
        sr_bam, disc_bam = sshared.find_existing_split_discordants(data)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
    else:
        batches = [[x] for x in items]
    for batch_items in batches:
        for data in batch_items:
            gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(lumpy_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name, [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            if dd.get_svprioritize(data):
                effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            else:
                effects_vcf = None
            data["sv"].append({"variantcaller": "lumpy",
                               "vrn_file": effects_vcf or vcf_file,
                               "exclude_file": exclude_file})
        out.append(data)
    return out

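# Sketch of the batching rule applied above: tumor/normal pairs stay together so
# both samples are genotyped in one file, while unpaired samples run one per
# batch (paired is assumed to expose tumor_data/normal_data/normal_name, as
# returned by vcfutils.get_paired):
def _sv_batches(items, paired):
    if paired and paired.normal_name:
        return [[paired.tumor_data, paired.normal_data]]
    return [[x] for x in items]
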
def get_coords(data):
    for category, vtypes in [("LOH", {"LOSS", "HETEROZYGOSITY"}),
                             ("amplification", {"AMPLIFICATION"})]:
        out = tz.get_in([category, dd.get_genome_build(data)], _COORDS, {})
        priority_file = dd.get_svprioritize(data)
        if priority_file and os.path.basename(priority_file).find("civic") >= 0:
            for chrom, start, end, gene in _civic_regions(priority_file, vtypes,
                                                          dd.get_disease(data)):
                out[gene] = (chrom, start, end)
        yield category, out

def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        elif "lumpy-genotype" in dd.get_tools_off(data):
            gt_vcf = sample_vcf
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out

def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out

def _handle_multiple_svcallers(data, stage):
    """Retrieve configured structural variation caller, handling multiple.
    """
    svs = get_svcallers(data)
    # special cases -- prioritization
    if stage == "ensemble" and dd.get_svprioritize(data):
        svs.append("prioritize")
    out = []
    for svcaller in svs:
        if svcaller in _CALLERS[stage]:
            base = copy.deepcopy(data)
            base["config"]["algorithm"]["svcaller"] = svcaller
            base["config"]["algorithm"]["svcaller_orig"] = svs
            out.append(base)
    return out

def _add_scatter_plot(out, data):
    out_file = "%s-scatter.pdf" % os.path.splitext(out["cnr"])[0]
    priority_bed = dd.get_svprioritize(data)
    if not priority_bed:
        return None
    priority_bed = plot._prioritize_plot_regions(pybedtools.BedTool(priority_bed), data,
                                                 os.path.dirname(out_file))
    if utils.file_exists(out_file):
        return out_file
    cnr = _remove_haplotype_chroms(out["cnr"], data)
    cns = _remove_haplotype_chroms(out["cns"], data)
    with file_transaction(data, out_file) as tx_out_file:
        cmd = [_get_cmd(), "scatter", "-s", cns, "-o", tx_out_file, "-l", priority_bed, cnr]
        do.run(cmd, "CNVkit scatter plot")
    return out_file

def _add_scatter_plot(out, data):
    out_file = "%s-scatter.pdf" % os.path.splitext(out["cnr"])[0]
    priority_bed = dd.get_svprioritize(data)
    if not priority_bed:
        return None
    priority_bed = plot._prioritize_plot_regions(pybedtools.BedTool(priority_bed), data,
                                                 os.path.dirname(out_file))
    if utils.file_exists(out_file):
        return out_file
    cnr = _remove_haplotype_chroms(out["cnr"], data)
    cns = _remove_haplotype_chroms(out["cns"], data)
    with file_transaction(data, out_file) as tx_out_file:
        cmd = [_get_cmd(), "scatter", "-s", cns, "-o", tx_out_file, "-l", priority_bed, cnr]
        do.run(_prep_cmd(cmd, tx_out_file), "CNVkit scatter plot")
    return out_file

def priority_coverage(data):
    AVERAGE_REGION_STRING_LENGTH = 100
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    batch_size = max_command_length() // AVERAGE_REGION_STRING_LENGTH
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        data['priority_coverage'] = os.path.abspath(out_file)
        return data
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        logger.debug("Calculating priority coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        with file_transaction(out_file) as tx_out_file:
            lcount = 0
            for chunk in robust_partition_all(batch_size, region_bed):
                coord_batch = []
                line_batch = ""
                for line in chunk:
                    lcount += 1
                    chrom = line.chrom
                    start = max(line.start, 0)
                    end = line.end
                    coords = "%s:%s-%s" % (chrom, start, end)
                    coord_batch.append(coords)
                    line_batch += "%s\t%s\t%s\n" % (chrom, start, end)
                if not coord_batch:
                    continue
                region_file = pybedtools.BedTool(line_batch, from_string=True).saveas().fn
                coord_string = " ".join(coord_batch)
                awk_string = r"""'BEGIN {OFS="\t"} {print $1,$2+$5,$2+$5,$4,$6"\t%s"}'""" % sample
                samtools = config_utils.get_program("samtools", data["config"])
                bedtools = config_utils.get_program("bedtools", data["config"])
                cmd = ("{samtools} view -b {in_bam} {coord_string} | "
                       "{bedtools} coverage -sorted -d -a {region_file} -b - | "
                       "awk {awk_string} >> {tx_out_file}")
                _silence_run(cmd.format(**locals()))
    data['priority_coverage'] = os.path.abspath(out_file)
    return data

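# robust_partition_all is assumed to behave like toolz.partition_all, chunking
# the BED regions so each samtools invocation stays under the shell command
# length limit estimated above. A minimal stand-in for illustration:
import itertools

def _partition_all(n, iterable):
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk
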
def priority_coverage(data, out_dir):
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return data
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        cmdl = sambamba.make_command(data, "depth base", in_bam, cleaned_bed)
        parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
        cmdl += " | {parse_cmd} > {tx_out_file}"
        message = "Calculating base coverage of {bed_file} in {in_bam}"
        do.run(cmdl.format(**locals()), message.format(**locals()))
    return out_file

def priority_coverage(data, out_dir):
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return data
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        cmdl = sambamba.make_command(data, "depth base", in_bam, cleaned_bed)
        parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
        cmdl += " | {parse_cmd} > {tx_out_file}"
        message = "Calculating base coverage of {bed_file} in {in_bam}"
        do.run(cmdl.format(**locals()), message.format(**locals()))
    return out_file

def get_coords(data):
    """Retrieve coordinates of genes of interest for prioritization.

    Can read from CIViC input data or a supplied BED file of chrom, start, end
    and gene information.
    """
    for category, vtypes in [("LOH", {"LOSS", "HETEROZYGOSITY"}),
                             ("amplification", {"AMPLIFICATION"})]:
        out = tz.get_in([category, dd.get_genome_build(data)], _COORDS, {})
        priority_file = dd.get_svprioritize(data)
        if priority_file:
            if os.path.basename(priority_file).find("civic") >= 0:
                for chrom, start, end, gene in _civic_regions(priority_file, vtypes,
                                                              dd.get_disease(data)):
                    out[gene] = (chrom, start, end)
            elif os.path.basename(priority_file).find(".bed") >= 0:
                for line in utils.open_gzipsafe(priority_file):
                    parts = line.strip().split("\t")
                    if len(parts) >= 4:
                        chrom, start, end, gene = parts[:4]
                        out[gene] = (chrom, int(start), int(end))
        yield category, out

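# Stand-alone sketch of the BED branch above: collect gene -> (chrom, start, end)
# from a 4+ column BED file (plain open() here for brevity; the real code uses
# utils.open_gzipsafe so gzipped inputs also work):
def _bed_gene_coords(bed_path):
    out = {}
    with open(bed_path) as handle:
        for line in handle:
            parts = line.strip().split("\t")
            if len(parts) >= 4:
                chrom, start, end, gene = parts[:4]
                out[gene] = (chrom, int(start), int(end))
    return out
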
def _handle_multiple_svcallers(data, stage):
    """Retrieve configured structural variation caller, handling multiple.
    """
    svs = get_svcallers(data)
    # special cases -- prioritization
    if stage == "ensemble" and dd.get_svprioritize(data):
        svs.append("prioritize")
    out = []
    for svcaller in svs:
        if svcaller in _get_callers([data], stage):
            base = copy.deepcopy(data)
            # clean SV callers present in multiple rounds and not this caller
            final_svs = []
            for sv in data.get("sv", []):
                if (stage == "ensemble" or sv["variantcaller"] == svcaller
                        or sv["variantcaller"] not in svs):
                    final_svs.append(sv)
            base["sv"] = final_svs
            base["config"]["algorithm"]["svcaller"] = svcaller
            base["config"]["algorithm"]["svcaller_orig"] = svs
            out.append(base)
    return out

def priority_coverage(data, out_dir):
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
            cmd = ("{sambamba} depth base -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "{in_bam} | {parse_cmd} > {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    return out_file