def _add_config_regions(nblock_regions, ref_regions, data):
    """Extend the nblock regions with regions excluded by configuration.

    When a variant regions BED file is configured, everything in the
    reference outside of it is added to the regions we skip. When
    ``noalt_calling`` is enabled or ``altcontigs`` are excluded, every
    reference region on a contig that is not a non-alt contig is added too.

    Returns the merged set of intervals.

    Raises ValueError when subtracting the configured regions leaves the
    reference regions unchanged.
    """
    variant_bed = dd.get_variant_regions(data)
    if not variant_bed:
        combined = nblock_regions
    else:
        user_regions = pybedtools.BedTool(variant_bed)
        # work around problem with single region not subtracted correctly:
        # supply the same region twice.
        if len(user_regions) == 1:
            only_region = str(user_regions[0]).strip()
            doubled = "%s\n%s" % (only_region, only_region)
            user_regions = pybedtools.BedTool(doubled, from_string=True)
        uncalled = ref_regions.subtract(user_regions, nonamecheck=True)
        if uncalled == ref_regions:
            raise ValueError("Input variant_region file (%s) "
                             "excludes all genomic regions. Do the chromosome names "
                             "in the BED file match your genome (chr1 vs 1)?" % variant_bed)
        combined = _combine_regions([uncalled, nblock_regions], ref_regions)
    skip_alts = ("noalt_calling" in dd.get_tools_on(data)
                 or "altcontigs" in dd.get_exclude_regions(data))
    if skip_alts:
        from bcbio.heterogeneity import chromhacks

        def _on_alt_contig(r):
            return not chromhacks.is_nonalt(r.chrom)

        alt_regions = ref_regions.filter(_on_alt_contig)
        combined = _combine_regions([combined, alt_regions], ref_regions)
    return combined.merge()
def _add_config_regions(nblock_regions, ref_regions, data):
    """Add configuration-driven exclusions to the nblock regions.

    Reference regions not covered by a configured variant regions BED file
    are treated as regions to skip, as are regions on contigs that are not
    non-alt contigs when ``noalt_calling`` is on or ``altcontigs`` are
    excluded. The combined intervals are merged before being returned.

    Raises ValueError if the configured BED file removes nothing from the
    reference regions.
    """
    bed_path = dd.get_variant_regions(data)
    intervals = nblock_regions
    if bed_path:
        requested = pybedtools.BedTool(bed_path)
        if len(requested) == 1:
            # work around problem with single region not subtracted correctly.
            line = str(requested[0]).strip()
            requested = pybedtools.BedTool("\n".join([line, line]), from_string=True)
        outside_requested = ref_regions.subtract(requested, nonamecheck=True)
        if outside_requested == ref_regions:
            message = ("Input variant_region file (%s) "
                       "excludes all genomic regions. Do the chromosome names "
                       "in the BED file match your genome (chr1 vs 1)?")
            raise ValueError(message % bed_path)
        intervals = _combine_regions([outside_requested, nblock_regions], ref_regions)
    if "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data):
        from bcbio.heterogeneity import chromhacks
        to_remove = ref_regions.filter(lambda feature: not chromhacks.is_nonalt(feature.chrom))
        intervals = _combine_regions([intervals, to_remove], ref_regions)
    return intervals.merge()
def to_standardonly(in_file, ref_file, data):
    """Subset a VCF input file to standard chromosomes (1-22,X,Y,MT).

    Writes ``<in_file base>-stdchrs.vcf.gz`` using ``bcftools view`` limited
    to the reference contigs that ``chromhacks.is_nonalt`` accepts. Returns
    the bgzipped and indexed subset when it exists, otherwise the input file.
    """
    from bcbio.heterogeneity import chromhacks
    base = utils.splitext_plus(in_file)[0]
    out_file = "%s-stdchrs.vcf.gz" % base
    if not utils.file_exists(out_file):
        std_names = [contig.name for contig in ref.file_contigs(ref_file)
                     if chromhacks.is_nonalt(contig.name)]
        if std_names:
            with file_transaction(data, out_file) as tx_out_file:
                # bcftools region selection needs a bgzipped, indexed input.
                in_file = bgzip_and_index(in_file, data["config"])
                cmd = "bcftools view -o {tx_out_file} -O z {in_file} {stds}".format(
                    tx_out_file=tx_out_file, in_file=in_file, stds=",".join(std_names))
                do.run(cmd, "Subset to standard chromosomes")
    if utils.file_exists(out_file):
        return bgzip_and_index(out_file, data["config"])
    return in_file
def _maybe_limit_chromosomes(data):
    """Potentially limit chromosomes to avoid problematically named HLA contigs.

    HLAs have ':' characters in them which confuse downstream processing.
    Contigs that are not non-alt are also treated as problematic when
    ``noalt_calling`` is on or ``altcontigs`` are excluded.

    Returns the list of non-problematic contig names when at least one
    problematic contig exists; otherwise an empty list (no limiting).
    """
    skip_alts = ("noalt_calling" in dd.get_tools_on(data)
                 or "altcontigs" in dd.get_exclude_regions(data))
    keep = []
    problematic = []
    for contig in ref.file_contigs(dd.get_ref_file(data)):
        is_problem = (contig.name.find(":") > 0
                      or (skip_alts and not chromhacks.is_nonalt(contig.name)))
        target = problematic if is_problem else keep
        target.append(contig.name)
    return keep if problematic else []
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.

    Builds and runs a ``smoove call`` command for the prepared BAMs, excluding
    standard alt/decoy/unplaced contig patterns plus any remaining contigs
    that are not non-alt, and optionally an exclusion BED file.

    Returns a tuple of (VCF output file, SV exclusion BED file). If an output
    from the older ``-prep.vcf.gz`` naming already exists, it is returned
    without re-running.

    Raises subprocess.CalledProcessError when smoove fails with an error that
    ``_allowed_errors`` does not accept; accepted errors produce an empty VCF.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                                      os.path.dirname(tx_out_file)))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            # Plain-text forms of the patterns, computed once rather than per contig.
            clean_excludes = tuple(x.replace("~", "").replace("^", "") for x in std_excludes)

            def _is_std_exclude(n):
                return n.startswith(clean_excludes) or n.endswith(clean_excludes)

            # Only list contigs explicitly when the standard patterns do not already cover them.
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    # The command template is filled from the local variables defined above.
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                 samples=[dd.get_sample_name(d) for d in items])
                    else:
                        # Logger.exception requires a message; calling it with no
                        # arguments raises TypeError and hides the real failure.
                        logger.exception("smoove lumpy calling failed")
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.

    Builds and runs a ``smoove call`` command for the prepared BAMs, excluding
    standard alt/decoy/unplaced contig patterns plus any remaining contigs
    that are not non-alt, and optionally an exclusion BED file.

    Returns a tuple of (VCF output file, SV exclusion BED file). If an output
    from the older ``-prep.vcf.gz`` naming already exists, it is returned
    without re-running.

    Raises subprocess.CalledProcessError when smoove fails with an error that
    ``_allowed_errors`` does not accept; accepted errors produce an empty VCF.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                                      os.path.dirname(tx_out_file)))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            # Plain-text forms of the patterns, computed once rather than per contig.
            clean_excludes = tuple(x.replace("~", "").replace("^", "") for x in std_excludes)

            def _is_std_exclude(n):
                return n.startswith(clean_excludes) or n.endswith(clean_excludes)

            # Only list contigs explicitly when the standard patterns do not already cover them.
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    # The command template is filled from the local variables defined above.
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    # Hand over the message text rather than the exception object.
                    # NOTE(review): assumes _allowed_errors matches on text - confirm.
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                 samples=[dd.get_sample_name(d) for d in items])
                    else:
                        # Logger.exception requires a message; calling it with no
                        # arguments raises TypeError and hides the real failure.
                        logger.exception("smoove lumpy calling failed")
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
def main(ref_file):
    """Build a BED file of reference regions absent from the file at URL.

    Downloads URL into the current directory, subtracts it from the reference
    regions for ``ref_file``, keeps lines whose contig is non-alt, labels each
    with ``umap_k100_mappability`` and writes the result to OUT_FILE, which is
    then bgzipped and indexed. The download and the intermediate ``.tmp`` file
    are removed afterwards.

    Raises requests.HTTPError if the download returns an error status.
    """
    ref_bedtool = get_ref_bedtool(ref_file, {})
    mappable_file = os.path.basename(URL)
    r = requests.get(URL, stream=True)
    try:
        # Fail early instead of saving an HTTP error page as the input file.
        r.raise_for_status()
        with open(mappable_file, "wb") as f:
            shutil.copyfileobj(r.raw, f)
    finally:
        # Streamed responses hold their connection until closed.
        r.close()
    ref_bedtool.subtract(mappable_file, nonamecheck=True).saveas(OUT_FILE + ".tmp")
    with open(OUT_FILE + ".tmp") as in_handle:
        with open(OUT_FILE, "w") as out_handle:
            for line in in_handle:
                if chromhacks.is_nonalt(line.split()[0]):
                    out_handle.write("%s\tumap_k100_mappability\n" % line.strip())
    os.remove(OUT_FILE + ".tmp")
    vcfutils.bgzip_and_index(OUT_FILE)
    os.remove(mappable_file)
def _setup_variant_regions(data, out_dir):
    """Ensure we have variant regions for calling, using transcripts if not present.

    Writes a cleaned copy of the regions to ``bedprep`` in the work directory,
    keeping only regions whose contig is present in the reference and is a
    non-alt contig, then points the variant regions in ``data`` at that file.

    Returns the updated ``data``.
    """
    vr_file = (dd.get_variant_regions(data)
               or regions.get_sv_bed(data, "transcripts", out_dir=out_dir))
    ref_contigs = {contig.name for contig in ref.file_contigs(dd.get_ref_file(data))}
    bedprep_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep"))
    base_name = utils.splitext_plus(os.path.basename(vr_file))[0]
    out_file = os.path.join(bedprep_dir, "%s-rnaseq_clean.bed" % base_name)
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    for region in pybedtools.BedTool(vr_file):
                        if region.chrom in ref_contigs and chromhacks.is_nonalt(region.chrom):
                            out_handle.write(str(region))
    return dd.set_variant_regions(data, out_file)
def callable_chrom_filter(r):
    """Filter to callable region, potentially limiting by chromosomes.

    Accepts regions named "CALLABLE"; when ``noalt_calling`` (taken from the
    surrounding scope) is set, the region's contig must also be non-alt.
    """
    if r.name != "CALLABLE":
        return False
    return not noalt_calling or chromhacks.is_nonalt(r.chrom)
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    Runs MuTect, post-processes its VCF and then, depending on the configured
    indel caller (scalpel, pindel or SomaticIndelDetector), calls indels and
    combines them with the MuTect calls. Without a usable indel caller the
    MuTect output is symlinked to ``out_file``.

    ``out_file`` defaults to ``<first BAM base>-paired-variants.vcf.gz``.
    Existing outputs are reused rather than regenerated.

    Returns the path to the output VCF, including when an empty VCF is
    written because the inputs have no aligned reads in ``region``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Return the path here too, so the first run and a re-run with
            # the file already present give callers the same result.
            return out_file
        out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
        if not file_exists(out_file_orig):
            with file_transaction(config, out_file_orig) as tx_out_file:
                # Rationale: MuTect writes another table to stdout, which we don't need
                params += ["--vcf", tx_out_file, "-o", os.devnull]
                broad_runner.run_mutect(params)
        # Set outside the run above so it is available when the original output is reused.
        is_paired = "-I:normal" in params
        if not utils.file_uptodate(out_file_mutect, out_file_orig):
            out_file_mutect = _fix_mutect_output(out_file_orig, config, out_file_mutect, is_paired)
        indelcaller = vcfutils.get_indelcaller(base_config)
        if ("scalpel" in indelcaller.lower() and region and isinstance(region, (tuple, list))
                and chromhacks.is_nonalt(region[0])):
            # Scalpel InDels
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if scalpel.is_installed(items[0]["config"]):
                if not is_paired:
                    vcfutils.check_paired_problems(items)
                    scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                else:
                    scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif "pindel" in indelcaller.lower():
            from bcbio.structural import pindel
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if pindel.is_installed(items[0]["config"]):
                pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=ref_file,
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif (("somaticindeldetector" in indelcaller.lower() or "sid" in indelcaller.lower())
              and "appistry" in broad_runner.get_mutect_version()):
            # SomaticIndelDetector InDels
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(config, out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    return out_file