def _get_target_access_files(cov_interval, data, work_dir):
    """Select the target and access BED files for a sample.

    Regions come from the first available source: the configured SV
    regions, then (only for "genome" coverage) the "transcripts1e4" regions
    passed through ``shared.remove_exclude_regions``, then the sample's
    variant regions. The chosen regions are merged into the target BED.

    Background on the analysis types:
    http://cnvkit.readthedocs.org/en/latest/nonhybrid.html

    cov_interval -- coverage interval type ("amplicon", "genome" or other)
    data -- sample data dictionary
    work_dir -- directory where derived BED files are written

    Returns a (target_bed, access_file) tuple. For "amplicon" and "genome"
    the target BED doubles as the access file; otherwise an access file is
    created from the reference genome.
    """
    chosen = regions.get_sv_bed(data)
    if not chosen and cov_interval == "genome":
        # No configured SV BED: for genome calls, subset to regions within 10kb of genes
        chosen = regions.get_sv_bed(data, "transcripts1e4", work_dir)
        if chosen:
            chosen = shared.remove_exclude_regions(chosen, chosen, [data])
    if not chosen:
        # Finally, default to the defined variant regions
        chosen = dd.get_variant_regions(data)

    target_bed = bedutils.merge_overlaps(chosen, data, out_dir=work_dir)
    if cov_interval in ("amplicon", "genome"):
        return target_bed, target_bed
    access_file = _create_access_file(dd.get_ref_file(data), _sv_workdir(data), data)
    return target_bed, access_file
def _get_target_access_files(cov_interval, data, work_dir):
    """Return the (target, access) BED file pair used for CNV calling.

    The target regions are taken, in order of preference, from the
    configured SV regions, from "transcripts1e4" regions with excluded
    regions removed (whole genome inputs only), or from the variant
    regions. They are then merged to remove overlaps.

    See http://cnvkit.readthedocs.org/en/latest/nonhybrid.html for how the
    analysis type determines targets, anti-targets and access files.

    cov_interval -- coverage interval type ("amplicon", "genome" or other)
    data -- sample data dictionary
    work_dir -- output directory for generated BED files
    """
    target_source = regions.get_sv_bed(data)
    is_genome = cov_interval == "genome"
    if not target_source and is_genome:
        # For genome calls without configured regions, subset to regions within 10kb of genes
        target_source = regions.get_sv_bed(data, "transcripts1e4", work_dir)
        if target_source:
            target_source = shared.remove_exclude_regions(target_source, target_source, [data])
    if not target_source:
        # Last resort: the defined variant regions
        target_source = dd.get_variant_regions(data)

    target_bed = bedutils.merge_overlaps(target_source, data, out_dir=work_dir)
    if is_genome or cov_interval == "amplicon":
        # Amplicon and genome analyses reuse the targets as the access file
        access_file = target_bed
    else:
        access_file = _create_access_file(dd.get_ref_file(data), _sv_workdir(data), data)
    return target_bed, access_file
def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to call multiple times during later parallel steps.

    The original variant_regions and coverage settings are preserved under
    ``variant_regions_orig`` / ``coverage_orig`` the first time this runs;
    the cleaned and merged files are stored next to them in
    ``data["config"]["algorithm"]``.

    data -- sample data dictionary; updated and returned
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data, prefix="cleaned-")
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr

    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data)
        clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed

    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning("Can't run Seq2C without a svregions or variant_regions BED file")
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    elif regions.get_sv_bed(data):
        # Keep the setter's return value so the cleaned SV regions are not
        # dropped if the setter returns an updated copy instead of mutating.
        # NOTE(review): assumes dd setters return the updated sample, as
        # other callers do with dd.set_variant_regions -- confirm.
        data = dd.set_sv_regions(data, clean_file(regions.get_sv_bed(data), data, prefix="svregions-"))
    return data
def _create_subset_file(in_file, work_dir, data):
    """Subset a VCF to the SV regions, writing the result as BCF.

    Regions are the configured SV regions, or the "transcripts1e4" regions
    when none are configured, matching what was used for CNV calling.

    in_file -- input VCF
    work_dir -- directory for the output BCF (and any generated region BED)
    data -- sample data dictionary

    Returns the path to the "-orig.bcf" output, which is only regenerated
    when it is not up to date relative to in_file.
    """
    base_name = utils.splitext_plus(os.path.basename(in_file))[0]
    out_file = os.path.join(work_dir, "%s-orig.bcf" % base_name)
    if utils.file_uptodate(out_file, in_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        region_bed = (regions.get_sv_bed(data)
                      or regions.get_sv_bed(data, "transcripts1e4", work_dir))
        cmd = "bcftools view -R {region_bed} -o {tx_out_file} -O b {in_file}"
        do.run(cmd.format(region_bed=region_bed, tx_out_file=tx_out_file, in_file=in_file),
               "Extract SV only regions for BubbleTree")
    return out_file
def calculate(bam_file, data):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    bam_file -- aligned BAM file to calculate coverage for
    data -- sample data dictionary

    Returns (final_callable, depth_files): the callable BED subset to the
    variant regions, and a dictionary keyed by target name
    ("variant_regions", "sv_regions", "coverage") holding the "dist",
    "regions" and "thresholds" outputs that mosdepth produced. depth_files
    is empty when an up to date callable file already exists.
    """
    min_depth = dd.get_coverage_depth_min(data)
    variant_regions = dd.get_variant_regions_merged(data) or _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    sample_name = dd.get_sample_name(data)
    align_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", sample_name))
    callable_file = os.path.join(align_dir, "%s-coverage.callable.bed" % sample_name)

    depth_files = {}
    if not utils.file_uptodate(callable_file, bam_file):
        vr_quantize = ("0:1:%s:" % min_depth, ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        targets = [("variant_regions", variant_regions, vr_quantize, None),
                   ("sv_regions", regions.get_sv_bed(data), None, None),
                   ("coverage", dd.get_coverage(data), None, DEPTH_THRESHOLDS)]
        for name, bed, quantize, thresholds in targets:
            if not bed:
                continue
            depth_info = run_mosdepth(data, name, bed, quantize=quantize, thresholds=thresholds)
            outputs = [(attr, getattr(depth_info, attr, None))
                       for attr in ("dist", "regions", "thresholds")]
            # Only record the outputs mosdepth actually generated for this target
            depth_files[name] = dict((attr, val) for attr, val in outputs if val)
            if name == "variant_regions":
                # The quantized variant region output becomes the callable file
                callable_file = depth_info.quantize
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
def add_genes(in_file, data, max_distance=10000):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    in_file -- BED file to annotate
    data -- sample data dictionary
    max_distance -- only keep annotations within this distance of event

    Returns the path to the "-annotated.bed" file, or in_file unchanged when
    no exon BED is available or in_file does not exist.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if not utils.file_uptodate(out_file, in_file):
            # next(iter(...)) works on both Python 2 and 3, unlike the .next() method
            input_rec = next(iter(pybedtools.BedTool(in_file)))
            # keep everything after standard chrom/start/end, 1-based
            # list(...) so the result supports append on Python 3 as well
            extra_fields = list(range(4, len(input_rec.fields) + 1))
            # keep the new gene annotation
            gene_index = len(input_rec.fields) + 4
            extra_fields.append(gene_index)
            columns = ",".join([str(x) for x in extra_fields])
            max_column = max(extra_fields) + 1
            ops = ",".join(["distinct"] * len(extra_fields))
            with file_transaction(data, out_file) as tx_out_file:
                # swap over gene name to '.' if beyond maximum distance
                # cut removes the last distance column which can cause issues
                # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
                distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                                   (max_distance, gene_index))
                cmd = ("sort -k1,1 -k2,2n {in_file} | "
                       "bedtools closest -d -t all -a - -b {gene_file} | "
                       "{distance_filter} | cut -f 1-{max_column} | "
                       "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {tx_out_file}")
                do.run(cmd.format(in_file=in_file, gene_file=gene_file,
                                  distance_filter=distance_filter, max_column=max_column,
                                  columns=columns, ops=ops, tx_out_file=tx_out_file),
                       "Annotate BED file with gene info")
        return out_file
    else:
        return in_file
def process_intervals(data):
    """Prepare a PureCN intervals file from the sample's target regions.

    Uses the configured SV regions, falling back to the cleaned variant
    regions, and runs PureCN's IntervalFile.R script on them.

    data -- sample data dictionary

    Returns None when no regions are available, otherwise the path of the
    ".txt" interval file next to the BED file. An existing interval file is
    reused. If the R script fails the failure is logged and the path is
    still returned, so the returned file may not exist.
    """
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("r36")
    interval_file_r = utils.R_package_script("r36", "PureCN", "extdata/IntervalFile.R")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    cmd = [rscript, interval_file_r, "--infile", bed_file,
           "--fasta", ref_file,
           "--outfile", ready_file,
           "--offtarget",
           "--genome", genome,
           "--export", optimized_bed,
           "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                         utils.get_R_exports(env="r36"),
                                                         " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError as msg:
        # Best effort: report why interval preparation failed but keep going
        logger.info("PureCN failed to prepare intervals: %s" % msg)
    else:
        # Only claim the file was saved when the command succeeded
        logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
def calculate(bam_file, data):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    bam_file -- aligned BAM file to calculate coverage for
    data -- sample data dictionary

    Returns (final_callable, depth_files): the callable BED subset to the
    variant regions, and a dictionary keyed by target name
    ("variant_regions", "sv_regions", "coverage") holding the "dist",
    "regions" and "thresholds" outputs that mosdepth produced. depth_files
    is empty when an up to date callable file already exists.
    """
    min_depth = dd.get_coverage_depth_min(data)
    variant_regions = dd.get_variant_regions_merged(data) or _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    sample_name = dd.get_sample_name(data)
    align_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", sample_name))
    callable_file = os.path.join(align_dir, "%s-coverage.callable.bed" % sample_name)

    depth_files = {}
    if not utils.file_uptodate(callable_file, bam_file):
        vr_quantize = ("0:1:%s:" % min_depth, ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        # SV and coverage regions are cleaned before being handed to mosdepth
        targets = [("variant_regions", variant_regions, vr_quantize, None),
                   ("sv_regions", bedutils.clean_file(regions.get_sv_bed(data), data), None, None),
                   ("coverage", bedutils.clean_file(dd.get_coverage(data), data), None, DEPTH_THRESHOLDS)]
        for name, bed, quantize, thresholds in targets:
            if not bed:
                continue
            depth_info = run_mosdepth(data, name, bed, quantize=quantize, thresholds=thresholds)
            outputs = [(attr, getattr(depth_info, attr, None))
                       for attr in ("dist", "regions", "thresholds")]
            # Only record the outputs mosdepth actually generated for this target
            depth_files[name] = dict((attr, val) for attr, val in outputs if val)
            if name == "variant_regions":
                # The quantized variant region output becomes the callable file
                callable_file = depth_info.quantize
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Runs three steps, each skipped when its output already exists:
    bcbio-prioritize against the known regions, simple_sv_annotation.py
    (with transcript regions when available), and a vawk conversion to TSV.

    caller -- name of the structural variant caller
    vcf_file -- caller VCF to prioritize
    prioritize_by -- known regions passed to bcbio-prioritize via -k
    post_prior_fn -- optional function(vcf, work_dir, data) applied to the
        prioritized VCF, returning the VCF to continue with
    work_dir -- output directory
    data -- sample data dictionary

    Returns the path to the "<sample>-<caller>-prioritize.tsv" file.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if utils.file_exists(out_file):
        return out_file

    # Step 1: restrict to known regions of interest
    priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(priority_vcf):
        with file_transaction(data, priority_vcf) as tx_priority:
            known_cmd = "bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}"
            do.run(known_cmd.format(vcf_file=vcf_file, tx_out_file=tx_priority,
                                    prioritize_by=prioritize_by),
                   "Prioritize: select in known regions of interest")
    if post_prior_fn:
        priority_vcf = post_prior_fn(priority_vcf, work_dir, data)

    # Step 2: simplified annotation, adding genes when transcripts are available
    simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
    if not utils.file_exists(simple_vcf):
        with file_transaction(data, simple_vcf) as tx_simple:
            transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
            ann_opt = ""
            if transcript_file:
                transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                ann_opt = "--gene_bed %s" % transcript_file
            ann_cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(ann_cmd.format(ann_opt=ann_opt, priority_vcf=priority_vcf,
                                  tx_out_file=tx_simple),
                   "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])

    # Step 3: flatten passing, non-reference calls into tab delimited rows
    with file_transaction(data, out_file) as tx_tsv:
        tsv_cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
        do.run(tsv_cmd.format(simple_vcf=simple_vcf, sample=sample, caller=caller,
                              tx_out_file=tx_tsv),
               "Prioritize: convert to tab delimited")
    return out_file
def _prep_bed(data, work_dir):
    """Select, clean and gene-annotate the BED file used as Seq2C input.

    Uses the configured SV regions when present, otherwise the variant
    regions. BED files with fewer than 4 columns are annotated with gene
    names; other files are used as they are. Records whose name is empty,
    "." or "-" are removed from the final file.

    data -- sample data dictionary
    work_dir -- not used by this function

    Returns the path to the "-seq2cclean.bed" file.
    Raises ValueError when a BED with fewer than 4 columns cannot be
    annotated with gene names.
    """
    sv_bed = regions.get_sv_bed(data)
    if sv_bed:
        bed_file = clean_file(sv_bed, data, prefix="svregions-")
    else:
        bed_file = clean_file(dd.get_variant_regions(data), data)

    col_num = bt.BedTool(bed_file).field_count()
    annotated_file = bed_file
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        # add_genes hands back its input untouched when it cannot annotate
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if utils.file_uptodate(ready_file, annotated_file):
        return ready_file

    bed = bt.BedTool(annotated_file)
    needs_trim = col_num > 4 and col_num != 8
    if needs_trim:
        # Reduce to the first four columns; 8 column inputs are left alone
        bed = bed.cut(range(4))
    bed = bed.filter(lambda rec: rec.name not in ["", ".", "-"])
    with file_transaction(data, ready_file) as tx_out_file:
        bed.saveas(tx_out_file)
    logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)
    return ready_file
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample, subset by the defined analysis regions.

    bam_file -- aligned BAM file for the sample
    ref_file -- not used by this function
    data -- sample data dictionary

    Returns a CovInfo namedtuple with fields (callable, raw_callable,
    depth_files): the "-callable_sample.bed" file next to the BAM, the
    callable BED from the coverage calculation, and its depth files.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    skip_alt = ("noalt_calling" in dd.get_tools_on(data)
                or "altcontigs" in dd.get_exclude_regions(data))

    def callable_chrom_filter(r):
        """Keep CALLABLE regions, potentially limiting to non-alt chromosomes.
        """
        if r.name != "CALLABLE":
            return False
        return not skip_alt or chromhacks.is_nonalt(r.chrom)

    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                kept = pybedtools.BedTool(callable_bed).filter(callable_chrom_filter)
                if not input_regions_bed:
                    kept.saveas(tx_out_file)
                elif not utils.file_uptodate(out_file, input_regions_bed):
                    # Restrict callable regions to the configured variant regions
                    input_regions = pybedtools.BedTool(input_regions_bed)
                    kept.intersect(input_regions, nonamecheck=True).saveas(tx_out_file)
                # NOTE(review): when variant regions exist and out_file is
                # already up to date against them, nothing is written inside
                # this transaction -- confirm that is intended.
    return CovInfo(out_file, callable_bed, depth_files)
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample, subset by the defined analysis regions.

    bam_file -- aligned BAM file for the sample
    ref_file -- not used by this function
    data -- sample data dictionary

    Returns a CovInfo namedtuple with fields (callable, raw_callable,
    depth_files): the "-callable_sample.bed" file next to the BAM, the
    callable BED from the coverage calculation, and its depth files.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    tools_on = dd.get_tools_on(data)
    noalt_calling = "noalt_calling" in tools_on or "altcontigs" in dd.get_exclude_regions(data)

    def callable_chrom_filter(r):
        """Keep CALLABLE regions, potentially limiting to non-alt chromosomes.
        """
        is_callable = r.name == "CALLABLE"
        return is_callable and (not noalt_calling or chromhacks.is_nonalt(r.chrom))

    bam_base = os.path.splitext(bam_file)[0]
    out_file = "%s-callable_sample.bed" % bam_base
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_only = pybedtools.BedTool(callable_bed).filter(callable_chrom_filter)
                if input_regions_bed:
                    # NOTE(review): if out_file is already up to date against
                    # the variant regions, nothing is written inside this
                    # transaction -- confirm that is intended.
                    if not utils.file_uptodate(out_file, input_regions_bed):
                        variant_bt = pybedtools.BedTool(input_regions_bed)
                        subset = callable_only.intersect(variant_bt, nonamecheck=True)
                        subset.saveas(tx_out_file)
                else:
                    callable_only.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
def add_genes(in_file, data, max_distance=10000):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    in_file -- BED file to annotate
    data -- sample data dictionary
    max_distance -- only keep annotations within this distance of event

    Returns the path to the "-annotated.bed" file, or in_file unchanged when
    no exon BED is available or in_file does not exist.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if not utils.file_uptodate(out_file, in_file):
            # next(iter(...)) works on both Python 2 and 3, unlike the .next() method
            input_rec = next(iter(pybedtools.BedTool(in_file)))
            # keep everything after standard chrom/start/end, 1-based
            # list(...) so the result supports append on Python 3 as well
            extra_fields = list(range(4, len(input_rec.fields) + 1))
            # keep the new gene annotation
            gene_index = len(input_rec.fields) + 4
            extra_fields.append(gene_index)
            columns = ",".join([str(x) for x in extra_fields])
            max_column = max(extra_fields) + 1
            ops = ",".join(["distinct"] * len(extra_fields))
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                # swap over gene name to '.' if beyond maximum distance
                # cut removes the last distance column which can cause issues
                # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
                distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                                   (max_distance, gene_index))
                sort_cmd = bedutils.get_sort_cmd()
                # The input, the gene BED and the genome file (-g) all go
                # through the same sort command before bedtools closest
                cmd = ("{sort_cmd} -k1,1 -k2,2n {in_file} | "
                       "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
                       "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
                       "{distance_filter} | cut -f 1-{max_column} | "
                       "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {tx_out_file}")
                do.run(cmd.format(sort_cmd=sort_cmd, in_file=in_file, fai_file=fai_file,
                                  gene_file=gene_file, distance_filter=distance_filter,
                                  max_column=max_column, columns=columns, ops=ops,
                                  tx_out_file=tx_out_file),
                       "Annotate BED file with gene info")
        return out_file
    else:
        return in_file
def _create_subset_file(in_file, work_dir, data):
    """Extract the SV regions of a VCF into a BCF file.

    The regions are the configured SV regions or, without those, the
    "transcripts1e4" regions, matching what was used for CNV calling.

    in_file -- input VCF
    work_dir -- directory for the output BCF (and any generated region BED)
    data -- sample data dictionary

    Returns the "-orig.bcf" path; the file is rebuilt only when it is not
    up to date relative to in_file.
    """
    vcf_stem = utils.splitext_plus(os.path.basename(in_file))[0]
    out_file = os.path.join(work_dir, "%s-orig.bcf" % vcf_stem)
    needs_build = not utils.file_uptodate(out_file, in_file)
    if needs_build:
        with file_transaction(data, out_file) as tx_out_file:
            region_bed = regions.get_sv_bed(data)
            if not region_bed:
                region_bed = regions.get_sv_bed(data, "transcripts1e4", work_dir)
            template = "bcftools view -R {region_bed} -o {tx_out_file} -O b {in_file}"
            subset_cmd = template.format(region_bed=region_bed, tx_out_file=tx_out_file,
                                         in_file=in_file)
            do.run(subset_cmd, "Extract SV only regions for BubbleTree")
    return out_file
def get_base_cnv_regions(data, work_dir):
    """Retrieve the set of target regions for CNV analysis.

    Preference order: the configured SV regions; for whole genome inputs,
    the "transcripts1e4" regions with excluded regions removed (subsetting
    WGS experiments to extended transcripts avoids long runtimes); and
    finally the defined variant regions.

    data -- sample data dictionary
    work_dir -- directory for generated transcript region files
    """
    cov_interval = dd.get_coverage_interval(data)
    configured = regions.get_sv_bed(data)
    if configured:
        return configured

    # No configured BED or regions to use for SV calling
    found = None
    if cov_interval == "genome":
        # For genome calls, subset to regions within 10kb of genes
        found = regions.get_sv_bed(data, "transcripts1e4", work_dir)
        if found:
            found = remove_exclude_regions(found, found, [data])
    if found:
        return found
    # Finally, default to the defined variant regions
    return dd.get_variant_regions(data)
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Runs three steps, each skipped when its output already exists:
    bcbio-prioritize against the known regions (with JVM options from the
    "bcbio_prioritize" resources), simple_sv_annotation.py (with transcript
    regions when available), and a vawk conversion to TSV.

    caller -- name of the structural variant caller
    vcf_file -- caller VCF to prioritize
    prioritize_by -- known regions passed to bcbio-prioritize via -k
    post_prior_fn -- optional function(vcf, work_dir, data) applied to the
        prioritized VCF, returning the VCF to continue with
    work_dir -- output directory
    data -- sample data dictionary

    Returns the path to the "<sample>-<caller>-prioritize.tsv" file.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if utils.file_exists(out_file):
        return out_file

    # Step 1: restrict to known regions of interest
    priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(priority_vcf):
        with file_transaction(data, priority_vcf) as tx_priority:
            resources = config_utils.get_resources("bcbio_prioritize", data["config"])
            # Default JVM memory settings apply when none are configured
            jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]))
            known_cmd = ("bcbio-prioritize {jvm_opts} known -i {vcf_file} "
                         "-o {tx_out_file} -k {prioritize_by}")
            do.run(known_cmd.format(jvm_opts=jvm_opts, vcf_file=vcf_file,
                                    tx_out_file=tx_priority, prioritize_by=prioritize_by),
                   "Prioritize: select in known regions of interest")
    if post_prior_fn:
        priority_vcf = post_prior_fn(priority_vcf, work_dir, data)

    # Step 2: simplified annotation, adding genes when transcripts are available
    simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
    if not utils.file_exists(simple_vcf):
        with file_transaction(data, simple_vcf) as tx_simple:
            transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
            ann_opt = ""
            if transcript_file:
                transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                ann_opt = "--gene_bed %s" % transcript_file
            ann_cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(ann_cmd.format(ann_opt=ann_opt, priority_vcf=priority_vcf,
                                  tx_out_file=tx_simple),
                   "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])

    # Step 3: flatten passing, non-reference calls into tab delimited rows
    with file_transaction(data, out_file) as tx_tsv:
        tsv_cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
        do.run(tsv_cmd.format(simple_vcf=simple_vcf, sample=sample, caller=caller,
                              tx_out_file=tx_tsv),
               "Prioritize: convert to tab delimited")
    return out_file
def subset_by_genes(in_file, data, out_dir, pad):
    """Subset a BED file of regions to those within pad of exon regions.

    in_file -- BED file to subset
    data -- sample data dictionary
    out_dir -- directory for the "-geneonly.bed" output
    pad -- value passed to bedtools slop as ``b`` to extend each exon region

    Returns the subset BED file, or in_file unchanged when no exon BED is
    available or in_file does not exist.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    fai_file = ref.fasta_idx(dd.get_ref_file(data))
    if not (gene_file and utils.file_exists(in_file)):
        return in_file

    in_stem = utils.splitext_plus(os.path.basename(in_file))[0]
    out_file = os.path.join(out_dir, "%s-geneonly.bed" % in_stem)
    if utils.file_uptodate(out_file, in_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        # Padded, merged exon regions are saved next to the final output
        want_region_file = "%s-targetregions%s" % utils.splitext_plus(out_file)
        padded_genes = pybedtools.BedTool(gene_file).slop(g=fai_file, b=pad).merge()
        padded_genes.saveas(want_region_file)
        overlapping = pybedtools.BedTool(in_file).intersect(b=want_region_file).sort()
        overlapping.saveas(tx_out_file)
    return out_file
def add_genes(in_file, data, max_distance=10000, work_dir=None):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    in_file -- BED file to annotate
    data -- sample data dictionary
    max_distance -- only keep annotations within this distance of event
    work_dir -- optional directory for the annotated file; by default it is
        written alongside in_file

    Returns the "-annotated.bed" path, or in_file unchanged when no exon
    BED is available or in_file does not exist.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if not (gene_file and utils.file_exists(in_file)):
        return in_file

    out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
    if work_dir:
        # Redirect the output into work_dir, keeping the derived file name
        out_file = os.path.join(work_dir, os.path.basename(out_file))
    if utils.file_uptodate(out_file, in_file):
        return out_file
    fai_file = ref.fasta_idx(dd.get_ref_file(data))
    with file_transaction(data, out_file) as tx_out_file:
        _add_genes_to_bed(in_file, gene_file, fai_file, tx_out_file, data, max_distance)
    return out_file
def _setup_variant_regions(data, out_dir):
    """Ensure we have variant regions for calling, using transcripts if not present.

    Writes a "-rnaseq_clean.bed" file under the work directory's "bedprep"
    folder, keeping only regions on contigs that are present in the
    reference and pass ``chromhacks.is_nonalt``, then sets that file as the
    sample's variant regions.

    data -- sample data dictionary
    out_dir -- directory for a generated transcript BED, when one is needed

    Returns the updated sample data.
    """
    vr_file = (dd.get_variant_regions(data)
               or regions.get_sv_bed(data, "transcripts", out_dir=out_dir))
    ref_contigs = set(c.name for c in ref.file_contigs(dd.get_ref_file(data)))
    bedprep_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep"))
    vr_stem = utils.splitext_plus(os.path.basename(vr_file))[0]
    out_file = os.path.join(bedprep_dir, "%s-rnaseq_clean.bed" % vr_stem)
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    for region in pybedtools.BedTool(vr_file):
                        chrom = region.chrom
                        if chrom in ref_contigs and chromhacks.is_nonalt(chrom):
                            out_handle.write(str(region))
    data = dd.set_variant_regions(data, out_file)
    return data
def precall(items):
    """Perform initial pre-calling steps -- coverage calculation by sample.

    Prepares the Seq2C BED file from the SV regions (or the variant regions
    when none are configured), calculates coverage for the sample's aligned
    BAM and records the result under the sample's "sv" list.

    items -- list containing exactly one sample; whole genome samples are
        rejected

    Returns a single-element list with the updated sample data.
    """
    items = [utils.to_single_data(x) for x in items]
    assert len(items) == 1, "Expect one item to Seq2C coverage calculation"
    data = utils.to_single_data(items)
    assert dd.get_coverage_interval(data) != "genome", "Seq2C only for amplicon and exome sequencing"

    work_dir = _sv_workdir(data)
    region_bed = regions.get_sv_bed(data) or dd.get_variant_regions(data)
    # NOTE(review): _prep_bed is called here with (data, bed_file, work_dir);
    # confirm this matches the signature of the _prep_bed in scope.
    seq2c_bed = _prep_bed(data, region_bed, work_dir)
    bam_file = dd.get_align_bam(data)
    sample_name = dd.get_sample_name(data)
    cov_file = _calculate_coverage(data, work_dir, seq2c_bed, bam_file, sample_name)

    if "sv" not in data:
        data["sv"] = []
    sv_entry = {"variantcaller": "seq2c", "coverage": cov_file}
    data["sv"].append(sv_entry)
    return [data]
def precall(items):
    """Perform initial pre-calling steps -- coverage calculation by sample.

    Builds the Seq2C-ready BED from the configured SV regions, or from the
    variant regions when no SV regions are set, then computes coverage over
    it for the sample's aligned BAM file.

    items -- list containing exactly one sample; whole genome samples are
        rejected

    Returns a single-element list holding the sample with a new entry
    appended to its "sv" list.
    """
    samples = [utils.to_single_data(x) for x in items]
    assert len(samples) == 1, "Expect one item to Seq2C coverage calculation"
    data = utils.to_single_data(samples)
    cov_interval = dd.get_coverage_interval(data)
    assert cov_interval != "genome", "Seq2C only for amplicon and exome sequencing"

    work_dir = _sv_workdir(data)
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = dd.get_variant_regions(data)
    # NOTE(review): _prep_bed is called here with (data, bed_file, work_dir);
    # confirm this matches the signature of the _prep_bed in scope.
    bed_file = _prep_bed(data, bed_file, work_dir)
    cov_file = _calculate_coverage(data, work_dir, bed_file,
                                   dd.get_align_bam(data), dd.get_sample_name(data))

    if "sv" not in data:
        data["sv"] = []
    data["sv"].append({"variantcaller": "seq2c", "coverage": cov_file})
    return [data]