def get_multisample_vcf(fnames, name, caller, data):
    """Retrieve a multiple sample VCF file in a standard location.

    Handles inputs with multiple repeated input files from batches.
    """
    unique_fnames = []
    for f in fnames:
        if f not in unique_fnames:
            unique_fnames.append(f)
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    if len(unique_fnames) > 1:
        gemini_vcf = os.path.join(out_dir, "%s-%s.vcf.gz" % (name, caller))
        vrn_file_batch = None
        for variant in data.get("variants", []):
            if variant["variantcaller"] == caller and variant.get("vrn_file_batch"):
                vrn_file_batch = variant["vrn_file_batch"]
        if vrn_file_batch:
            utils.symlink_plus(vrn_file_batch, gemini_vcf)
            return gemini_vcf
        else:
            return vcfutils.merge_variant_files(unique_fnames, gemini_vcf,
                                                dd.get_ref_file(data), data["config"])
    else:
        gemini_vcf = os.path.join(out_dir, "%s-%s%s" %
                                  (name, caller, utils.splitext_plus(unique_fnames[0])[1]))
        utils.symlink_plus(unique_fnames[0], gemini_vcf)
        return gemini_vcf

def _bedpe_to_vcf(bedpe_file, sconfig_file, items):
    """Convert BEDPE output into a VCF file.
    """
    tovcf_script = do.find_cmd("bedpeToVcf")
    if tovcf_script:
        out_file = "%s.vcf.gz" % utils.splitext_plus(bedpe_file)[0]
        out_nogzip = out_file.replace(".vcf.gz", ".vcf")
        raw_file = "%s-raw.vcf" % utils.splitext_plus(bedpe_file)[0]
        if not utils.file_exists(out_file):
            if not utils.file_exists(raw_file):
                with file_transaction(raw_file) as tx_raw_file:
                    ref_file = tz.get_in(["reference", "fasta", "base"], items[0])
                    cmd = [sys.executable, tovcf_script, "-c", sconfig_file, "-f", ref_file,
                           "-b", bedpe_file, "-o", tx_raw_file]
                    do.run(cmd, "Convert lumpy bedpe output to VCF")
            prep_file = vcfutils.sort_by_ref(raw_file, items[0])
            if not utils.file_exists(out_nogzip):
                utils.symlink_plus(prep_file, out_nogzip)
        out_file = vcfutils.bgzip_and_index(out_nogzip, items[0]["config"])
        return out_file

def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(bam_file_ready, ref_file, data)
        sample_callable = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        offtarget_stats = callable.calculate_offtarget(bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "sample_callable": sample_callable,
                           "offtarget_stats": offtarget_stats}
        data = coverage.assign_interval(data)
        highdepth_bed = highdepth.identify(data)
        data["regions"]["highdepth"] = highdepth_bed
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    return [[data]]

def _amber_het_file(method, vrn_files, work_dir, paired):
    """Create file of BAFs in normal heterozygous positions compatible with AMBER.

    Two available methods:
      - pon -- Use panel of normals with likely heterozygous sites.
      - variants -- Use pre-existing variant calls, filtered to likely heterozygotes.

    https://github.com/hartwigmedical/hmftools/tree/master/amber
    https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
    """
    assert vrn_files, "Did not find compatible variant calling files for PURPLE inputs"
    from bcbio.heterogeneity import bubbletree
    if method == "variants":
        amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
        out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
        prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"],
                                             work_dir, paired, AmberWriter)
        utils.symlink_plus(prep_file, out_file)
        pcf_file = out_file + ".pcf"
        if not utils.file_exists(pcf_file):
            with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
                r_file = os.path.join(os.path.dirname(tx_out_file), "bafSegmentation.R")
                with open(r_file, "w") as out_handle:
                    out_handle.write(_amber_seg_script)
                cmd = "%s && %s --no-environ %s %s %s" % (utils.get_R_exports(), utils.Rscript_cmd(),
                                                          r_file, out_file, pcf_file)
                do.run(cmd, "PURPLE: AMBER baf segmentation")
    else:
        assert method == "pon"
        out_file = _run_amber(paired, work_dir)
    return out_file

def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    This does not yet handle mapping over 1 -> chr1 issues since this requires
    a ton of search/replace which slows down conversion.
    """
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean",
                                               dd.get_sample_name(data)))
    out_file = os.path.join(work_dir, "%s-noextras.bam" %
                            utils.splitext_plus(os.path.basename(in_bam))[0])
    if not utils.file_exists(out_file):
        out_file = os.path.join(work_dir, "%s-noextras.bam" % dd.get_sample_name(data))
    if not utils.file_uptodate(out_file, in_bam):
        with file_transaction(data, out_file) as tx_out_file:
            target_chroms = _target_chroms_and_header(in_bam, data)
            str_chroms = " ".join(target_chroms)
            rg_info = novoalign.get_rg_info(data["rgnames"])
            bcbio_py = sys.executable
            ref_file = dd.get_ref_file(data)
            local_bam = os.path.join(os.path.dirname(tx_out_file), os.path.basename(in_bam))
            cores = dd.get_cores(data)
            utils.symlink_plus(in_bam, local_bam)
            bam.index(local_bam, data["config"])
            cmd = ("samtools view -@ {cores} -h {local_bam} {str_chroms} | "
                   """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                   """cleanbam.fix_header("{ref_file}")' | """
                   "samtools view -@ {cores} -u - | "
                   "samtools addreplacerg -@ {cores} -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
            do.run(cmd.format(**locals()),
                   "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file

def cram_compress(in_bam):
    import os
    import subprocess
    from bcbio import utils
    from bcbio.distributed.transaction import file_transaction
    print(in_bam)
    ref_file = "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/seq/GRCh37.fa"
    out_file = "%s.cram" % os.path.splitext(in_bam)[0]
    jvm_opts = "-Xms1g -Xmx3g"
    if not utils.file_exists(out_file):
        print("cramming", out_file)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("cramtools {jvm_opts} cram "
                   "--input-bam-file {in_bam} "
                   "--capture-all-tags "
                   "--ignore-tags 'BD:BI' "
                   "--reference-fasta-file {ref_file} "
                   "--lossy-quality-score-spec '*8' "
                   "--output-cram-file {tx_out_file}")
            subprocess.check_call(cmd.format(**locals()), shell=True)
    if not utils.file_exists(out_file + ".crai"):
        print("indexing", out_file + ".crai")
        with file_transaction(out_file + ".crai") as tx_out_file:
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(out_file, tx_in_file)
            cmd = ("cramtools {jvm_opts} index "
                   "--input-file {tx_in_file}")
            subprocess.check_call(cmd.format(**locals()), shell=True)
    if os.path.exists(in_bam) and utils.file_exists(out_file):
        if in_bam != out_file and in_bam.endswith(".bam") and out_file.endswith(".cram"):
            os.remove(in_bam)
    return out_file

def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        assert sambamba, "Did not find sambamba for indexing"
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            try:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
                do.run(cmd.format(**locals()),
                       "Index BAM file with sambamba: %s" % os.path.basename(in_bam))
            except subprocess.CalledProcessError:
                cmd = "{samtools} index {in_bam} {tx_index_file}"
                do.run(cmd.format(**locals()),
                       "Backup single thread index of BAM file with samtools: %s" % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file

def link_bam_file(orig_file, new_dir):
    """Provide symlinks of BAM file and existing indexes.
    """
    new_dir = utils.safe_makedir(new_dir)
    sym_file = os.path.join(new_dir, os.path.basename(orig_file))
    utils.symlink_plus(orig_file, sym_file)
    return sym_file

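Every entry in this collection leans on utils.symlink_plus. For readers outside the bcbio codebase, here is a minimal sketch of the behavior these call sites appear to rely on: symlink the main file plus any existing companion indexes, falling back to a copy where symlinks are unavailable. The helper name and the extension list are assumptions for illustration, not bcbio's actual implementation.

import os
import shutil

def symlink_plus_sketch(orig, new):
    """Symlink a file plus any existing companion indexes (illustrative sketch)."""
    for ext in ["", ".idx", ".bai", ".tbi"]:
        if os.path.exists(orig + ext) and not os.path.lexists(new + ext):
            try:
                os.symlink(os.path.abspath(orig + ext), new + ext)
            except OSError:
                # Fall back to copying on filesystems without symlink support
                shutil.copyfile(orig + ext, new + ext)
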
def _install_kraken_db(datadir, args):
    """Install kraken minimal DB in genome folder.
    """
    kraken = os.path.join(datadir, "genomes/kraken")
    url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz"
    compress = os.path.join(kraken, os.path.basename(url))
    base, ext = utils.splitext_plus(os.path.basename(url))
    db = os.path.join(kraken, base)
    tooldir = args.tooldir or get_defaults()["tooldir"]
    requests.packages.urllib3.disable_warnings()
    last_mod = urllib.urlopen(url).info().getheader("Last-Modified")
    last_mod = dateutil.parser.parse(last_mod).astimezone(dateutil.tz.tzutc())
    if os.path.exists(os.path.join(tooldir, "bin", "kraken")):
        if not os.path.exists(db):
            is_new_version = True
        else:
            cur_file = glob.glob(os.path.join(kraken, "minikraken_*"))[0]
            cur_version = datetime.datetime.utcfromtimestamp(os.path.getmtime(cur_file))
            is_new_version = last_mod.date() > cur_version.date()
            if is_new_version:
                shutil.move(cur_file, cur_file.replace("minikraken", "old"))
        if not os.path.exists(kraken):
            utils.safe_makedir(kraken)
        if is_new_version:
            if not os.path.exists(compress):
                subprocess.check_call(["wget", "-O", compress, url, "--no-check-certificate"])
            cmd = ["tar", "-xzvf", compress, "-C", kraken]
            subprocess.check_call(cmd)
            last_version = glob.glob(os.path.join(kraken, "minikraken_*"))
            utils.symlink_plus(os.path.join(kraken, last_version[0]),
                               os.path.join(kraken, "minikraken"))
            utils.remove_safe(compress)
        else:
            print("You have the latest version %s." % last_mod)
    else:
        raise argparse.ArgumentTypeError("kraken not installed in tooldir %s." %
                                         os.path.join(tooldir, "bin", "kraken"))

def variantcall_sample(data, region=None, align_bams=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.
    """
    if out_file is None or not os.path.exists(out_file) or not os.path.lexists(out_file):
        utils.safe_makedir(os.path.dirname(out_file))
        sam_ref = data["sam_ref"]
        config = data["config"]
        caller_fns = get_variantcallers()
        caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")]
        if len(align_bams) == 1:
            items = [data]
        else:
            items = multi.get_orig_items(data)
            assert len(items) == len(align_bams)
        assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
        if not assoc_files:
            assoc_files = {}
        for bam_file in align_bams:
            bam.index(bam_file, data["config"], check_timestamp=False)
        do_phasing = data["config"]["algorithm"].get("phasing", False)
        call_file = "%s-raw%s" % utils.splitext_plus(out_file) if do_phasing else out_file
        call_file = caller_fn(align_bams, items, sam_ref, assoc_files, region, call_file)
        if do_phasing == "gatk":
            call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref, region, config)
            utils.symlink_plus(call_file, out_file)
    if region:
        data["region"] = region
    data["vrn_file"] = out_file
    return [data]

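utils.splitext_plus is not shown in this collection; from the call sites it evidently splits compound extensions such as .vcf.gz as a unit, so the "-raw" insertion above yields sample-raw.vcf.gz rather than sample.vcf-raw.gz (contrast the older variantcall_sample further down, which uses os.path.splitext). A minimal sketch, with the handled extensions assumed for illustration:

import os

def splitext_plus_sketch(fname):
    """Split a filename keeping double extensions like .vcf.gz together (sketch)."""
    base, ext = os.path.splitext(fname)
    if ext in (".gz", ".bz2"):
        base, ext2 = os.path.splitext(base)
        ext = ext2 + ext
    return base, ext

# splitext_plus_sketch("sample.vcf.gz") -> ("sample", ".vcf.gz")
# so "%s-raw%s" % (...) -> "sample-raw.vcf.gz"
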
def index(in_bam, config):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if (not utils.file_uptodate(index_file, in_bam) and
            not utils.file_uptodate(alt_index_file, in_bam)):
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
            else:
                cmd = "{samtools} index {tx_bam_file}"
            do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam))
    return index_file if utils.file_uptodate(index_file, in_bam) else alt_index_file

def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                utils.symlink_plus(vcf_file, priority_vcf.replace(".vcf.gz", ".vcf"))
                vcfutils.bgzip_and_index(priority_vcf.replace(".vcf.gz", ".vcf"),
                                         data["config"], remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    jvm_opts = config_utils.adjust_opts(
                        jvm_opts, {"algorithm": {"memory_adjust":
                                                 {"direction": "increase",
                                                  "maximum": "30000M",
                                                  "magnitude": dd.get_cores(data)}}})
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf

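The vawk command above emits one tab-delimited row per passing, non-reference variant. A hypothetical reader for that output, with column names inferred from the print statement rather than taken from bcbio itself:

import csv

PRIORITIZE_COLUMNS = ["caller", "sample", "chrom", "start", "end", "svtype",
                      "lof", "annotation", "split_read_support",
                      "paired_support_PE", "paired_support_PR"]

def read_prioritize_tsv(fname):
    """Yield prioritized SV calls as dictionaries (illustrative helper)."""
    with open(fname) as in_handle:
        for row in csv.reader(in_handle, delimiter="\t"):
            yield dict(zip(PRIORITIZE_COLUMNS, row))
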
def index(in_bam, config):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if (not utils.file_uptodate(index_file, in_bam) and
            not utils.file_uptodate(alt_index_file, in_bam)):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            # Define the single core samtools fallback up front so the retry
            # below always has a command available
            samtools_cmd = "{samtools} index {in_bam} {tx_index_file}"
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {in_bam} {tx_index_file}"
            else:
                assert tx_index_file.find(".bam.bai") > 0
                tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
                utils.symlink_plus(in_bam, tx_bam_file)
                cmd = "{samtools} index {tx_bam_file}"
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam),
                       log_error=False)
            except subprocess.CalledProcessError:
                do.run(samtools_cmd.format(**locals()),
                       "Index BAM file (single core): %s" % os.path.basename(in_bam))
    return index_file if utils.file_uptodate(index_file, in_bam) else alt_index_file

def _mint_trna_annotation(data):
    """
    use MINTmap to quantify tRNAs
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir

def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.
    """
    # not split, do nothing
    if "group_orig" not in data:
        return [[data]]
    # cancer tumor/normal
    elif vcfutils.get_paired_phenotype(data):
        out = []
        for i, sub_data in enumerate(data["group_orig"]):
            if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                if "combine" in data:
                    sub_data["combine"] = data["combine"]
                sub_data["vrn_file"] = data["vrn_file"]
            out.append([sub_data])
        return out
    # population or single sample
    else:
        out = []
        for sub_data in data["group_orig"]:
            sub_vrn_file = data["vrn_file"].replace(data["group"][0] + "-",
                                                    sub_data["name"][-1] + "-")
            if len(vcfutils.get_samples(data["vrn_file"])) > 1:
                vcfutils.select_sample(data["vrn_file"], sub_data["name"][-1],
                                       sub_vrn_file, data["config"])
            elif not os.path.exists(sub_vrn_file):
                utils.symlink_plus(data["vrn_file"], sub_vrn_file)
            if "combine" in data:
                sub_data["combine"] = data["combine"]
            sub_data["vrn_file"] = sub_vrn_file
            out.append([sub_data])
        return out

def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    # special case, empty files that have been cleaned
    if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
        needs_bgzip, needs_gunzip = False, False
    elif in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        if needs_convert or dd.get_trim_ends(data):
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif objectstore.is_remote(in_file) and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if (needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data)
            or objectstore.is_remote(in_file)):
        out_file = _bgzip_file(in_file, data["config"], work_dir,
                               needs_bgzip, needs_gunzip, needs_convert, data)
    else:
        out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data),
                                                     os.path.basename(in_file)))
        # We cannot symlink in CWL, but may be able to use inputs or copy
        if data.get("is_cwl"):
            # Has grabix indexes, we're okay to go
            if utils.file_exists(in_file + ".gbi"):
                return in_file
            else:
                return utils.copy_plus(in_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file

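_check_gzipped_input is defined elsewhere; the essential distinction it must draw is between plain gzip and BGZF, since only BGZF supports the block-level random access downstream indexing needs. One plausible core for such a check, based on the published BGZF header layout (a sketch, not bcbio's function):

def is_bgzf_sketch(fname):
    """Guess whether a gzipped file is BGZF by its header magic (sketch)."""
    with open(fname, "rb") as in_handle:
        header = in_handle.read(16)
    # BGZF: gzip magic (1f 8b), FEXTRA flag set, and a 'BC' extra subfield
    return (len(header) >= 14 and header[:2] == b"\x1f\x8b"
            and header[3] & 0x04 != 0 and header[12:14] == b"BC")
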
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run the MuTect paired analysis algorithm.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            return
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        _rename_allelic_fraction_field(out_file_mutect, config)
        disable_SID = True  # SID isn't great, so use Scalpel instead
        if "appistry" not in broad_runner.get_mutect_version() or disable_SID:
            # Scalpel InDels
            is_paired = "-I:normal" in params
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if scalpel.is_installed(items[0]["config"]):
                with file_transaction(out_file_indels) as tx_out_file2:
                    if not is_paired:
                        scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                    else:
                        scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        else:
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
    return out_file

def variantcall_sample(data, region=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.
    """
    safe_makedir(os.path.dirname(out_file))
    sam_ref = data["sam_ref"]
    config = data["config"]
    caller_fns = get_variantcallers()
    caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")]
    if isinstance(data["work_bam"], basestring):
        align_bams = [data["work_bam"]]
        items = [data]
    else:
        align_bams = data["work_bam"]
        items = data["work_items"]
    call_file = "%s-raw%s" % os.path.splitext(out_file)
    call_file = caller_fn(align_bams, items, sam_ref,
                          data["genome_resources"]["variation"], region, call_file)
    if data["config"]["algorithm"].get("phasing", False) == "gatk":
        call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref, region, config)
    utils.symlink_plus(call_file, out_file)
    if "work_items" in data:
        del data["work_items"]
    data["vrn_file"] = out_file
    return [data]

def normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
              rerun_effects=True, remove_oldeffects=False, nonrefonly=False, work_dir=None):
    """Normalizes variants and reruns SnpEFF for resulting VCF
    """
    if remove_oldeffects:
        out_file = "%s-noeff-nomultiallelic%s" % utils.splitext_plus(in_file)
    else:
        out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if work_dir:
        out_file = os.path.join(work_dir, os.path.basename(out_file))
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            ready_ma_file = _normalize(in_file, data, passonly=passonly,
                                       normalize_indels=normalize_indels,
                                       split_biallelic=split_biallelic,
                                       remove_oldeffects=remove_oldeffects,
                                       nonrefonly=nonrefonly,
                                       work_dir=work_dir)
            if rerun_effects:
                ann_ma_file, _ = effects.add_to_vcf(ready_ma_file, data)
                if ann_ma_file:
                    ready_ma_file = ann_ma_file
            utils.symlink_plus(ready_ma_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])

def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.
    """
    # not split, do nothing
    if "group_orig" not in data:
        return [[data]]
    # cancer tumor/normal
    elif vcfutils.get_paired_phenotype(data):
        out = []
        for i, sub_data in enumerate(get_orig_items(data)):
            if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                sub_data["vrn_file"] = data["vrn_file"]
            else:
                sub_data.pop("vrn_file", None)
            out.append([sub_data])
        return out
    # joint calling, do not split back up due to potentially large sample sizes
    elif tz.get_in(("config", "algorithm", "jointcaller"), data):
        return [[data]]
    # population or single sample
    else:
        out = []
        for sub_data in get_orig_items(data):
            sub_vrn_file = data["vrn_file"].replace(str(data["group"][0]) + "-",
                                                    str(sub_data["name"][-1]) + "-")
            if len(vcfutils.get_samples(data["vrn_file"])) > 1:
                vcfutils.select_sample(data["vrn_file"], str(sub_data["name"][-1]),
                                       sub_vrn_file, data["config"])
            elif not os.path.exists(sub_vrn_file):
                utils.symlink_plus(data["vrn_file"], sub_vrn_file)
            sub_data["vrn_file_batch"] = data["vrn_file"]
            sub_data["vrn_file"] = sub_vrn_file
            out.append([sub_data])
        return out

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, names["lane"])
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % names["lane"])
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)
    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outSAMunmapped Within")
    cmd += _read_group_option(names)
    fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = get_in(data, ("config", "algorithm", "strandedness"),
                          "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif"
    run_message = "Running STAR aligner on %s and %s." % (pair_file, ref_file)
    do.run(cmd.format(**locals()), run_message, None)
    out_file = bam.sam_to_bam(out_file, config)
    out_file = _fix_sam_header(out_file, config)
    if not file_exists(final_out):
        symlink_plus(out_file, final_out)
    return final_out

def _get_samples_to_process(fn, out_dir, config, force_single):
    """Parse csv file with one line per file. It will merge
    all files that have the same description name."""
    out_dir = os.path.abspath(out_dir)
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            cols = l.strip().split(",")
            if len(cols) > 0:
                if len(cols) < 2:
                    raise ValueError("Line needs 2 values: file and name.")
                if utils.file_exists(cols[0]) or is_gsm(cols[0]):
                    if cols[0].find(" ") > -1:
                        new_name = os.path.abspath(cols[0].replace(" ", "_"))
                        logger.warning("Spaces found in %s. Linked to %s." % (cols[0], new_name))
                        logger.warning("Please, avoid names with spaces in the future.")
                        utils.symlink_plus(os.path.abspath(cols[0]), new_name)
                        cols[0] = new_name
                    samples[cols[1]].append(cols)
                else:
                    logger.info("skipping %s, File doesn't exist." % cols[0])
    for sample, items in samples.items():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        elif is_gsm(items[0][0]):
            fn = "query_gsm"
            ext = ".fastq.gz"
        files = [os.path.abspath(fn_file[0]) if not is_gsm(fn_file[0]) else fn_file[0]
                 for fn_file in items]
        samples[sample] = [{'files': _check_paired(files, force_single),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': fn, 'anno': items[0][2:], 'config': config,
                            'name': sample, 'out_dir': out_dir}]
    return [samples[sample] for sample in samples]

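For context, the CSV parsed here has one input file per line, with the second column naming the sample used for merging and any further columns carried through as annotation metadata. A hypothetical input (paths invented for illustration):

    /data/run1/sampleA_L001.fastq.gz,sampleA,treated
    /data/run2/sampleA_L002.fastq.gz,sampleA,treated
    /data/run1/sampleB.bam,sampleB,control

Both sampleA lines share a description name, so they become a single fq_merge job producing sampleA.fastq.gz; the BAM line becomes a bam_merge job producing sampleB.bam.
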
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]

def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    from bcbio.variation import bedutils
    highdepth_beds = list(filter(lambda x: x is not None,
                                 list(set([tz.get_in(["config", "algorithm", "highdepth_regions"], x)
                                           for x in items]))))
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file

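bedtools subtract trims away the portions of -a intervals that overlap -b (it only drops whole intervals when -A is given), so calls here are clipped around the high depth and blacklist regions rather than discarded outright. A small illustration with invented coordinates:

    $ cat calls.bed          # chr1  100  500
    $ cat highdepth.bed      # chr1  200  300
    $ bedtools subtract -a calls.bed -b highdepth.bed
    chr1    100    200
    chr1    300    500
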
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if trim_reads:
        adapter = dd.get_adapters(data)[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]

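_cmd_cutadapt() is defined elsewhere in the module; as a rough idea of what it templates, a standalone cutadapt call covering the .clean/.fragments/.short outputs above might look like the following. The adapter shown is the common TruSeq small RNA 3' adapter; the real template and thresholds in bcbio may differ.

    cutadapt --adapter=TGGAATTCTCGGGTGCCAAGG --minimum-length=17 \
        --untrimmed-output=sample.fragments.fastq \
        --too-short-output=sample.short.fastq \
        -o sample.clean.fastq sample.fastq
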
def run(items):
    assert len(items) == 1, ("Expect one input to biological prioritization: %s" %
                             ", ".join([dd.get_sample_name(d) for d in items]))
    data = items[0]
    inputs = []
    for call in data.get("sv", []):
        vcf_file = call.get("vcf_file", call.get("vrn_file"))
        if vcf_file and vcf_file.endswith((".vcf", "vcf.gz")):
            pp_fn = POST_PRIOR_FNS.get(call["variantcaller"])
            if pp_fn:
                pp_fn = pp_fn(call)
            inputs.append((call["variantcaller"], vcf_file, pp_fn))
    if len(inputs) > 0:
        prioritize_by = tz.get_in(["config", "algorithm", "svprioritize"], data)
        if prioritize_by:
            work_dir = _sv_workdir(data)
            priority_files = [_prioritize_vcf(vcaller, vfile, prioritize_by, post_prior_fn,
                                              work_dir, data)
                              for vcaller, vfile, post_prior_fn in inputs]
            priority_tsv = _combine_files([xs[0] for xs in priority_files], work_dir, data)
            raw_files = {}
            for svcaller, fname in zip([xs[0] for xs in inputs],
                                       [xs[1] for xs in priority_files]):
                clean_fname = os.path.join(os.path.dirname(fname), "%s-%s-prioritized%s" %
                                           (dd.get_sample_name(data), svcaller,
                                            utils.splitext_plus(fname)[-1]))
                utils.symlink_plus(fname, clean_fname)
                raw_files[svcaller] = clean_fname
            data["sv"].append({"variantcaller": "sv-prioritize",
                               "vrn_file": priority_tsv,
                               "raw_files": raw_files})
    # Disabled on move to CWL, not used and tested with CNVkit changes
    # data = _cnv_prioritize(data)
    return [data]

def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        return [[data]]
    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file. "
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if options:
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapters found in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]

def index_cram(fname):
    out_file = "%s.crai" % fname
    if not utils.file_exists(out_file):
        print("Indexing", fname)
        with file_transaction(out_file) as tx_out_file:
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(fname, tx_in_file)
            subprocess.check_call(["cram_index", tx_in_file])
    return out_file

def create_tumor_bamdir(tumor, tumor_bam, normal_bam, work_dir):
    """Create expected input directory with tumor/normal BAMs in one place.
    """
    bam_dir = utils.safe_makedir(os.path.join(work_dir, tumor, "in_bams"))
    normal_bam_ready = os.path.join(bam_dir, os.path.basename(normal_bam))
    utils.symlink_plus(normal_bam, normal_bam_ready)
    tumor_bam_ready = os.path.join(bam_dir, os.path.basename(tumor_bam))
    utils.symlink_plus(tumor_bam, tumor_bam_ready)
    return bam_dir, normal_bam_ready

def index(in_cram, config):
    """Ensure CRAM file has a .crai index file using cram_index from scramble.
    """
    if not utils.file_exists(in_cram + ".crai"):
        with file_transaction(config, in_cram + ".crai") as tx_out_file:
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(in_cram, tx_in_file)
            cmd = "cram_index {tx_in_file}"
            subprocess.check_call(cmd.format(**locals()), shell=True)

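cram_index (from Staden io_lib, which also ships scramble) appears to write the .crai alongside its input without an option to choose the output path; symlinking the CRAM into the transaction directory is what lets the index land there before being moved into place. The shell equivalent of the transactional body, with an invented path:

    cram_index /path/to/txdir/sample.cram   # produces /path/to/txdir/sample.cram.crai
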
def _install_kraken_db(datadir, args):
    """Install kraken minimal DB in genome folder.
    """
    kraken = os.path.join(datadir, "genomes/kraken")
    url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz"
    compress = os.path.join(kraken, os.path.basename(url))
    base, ext = utils.splitext_plus(os.path.basename(url))
    db = os.path.join(kraken, base)
    tooldir = args.tooldir or get_defaults()["tooldir"]
    requests.packages.urllib3.disable_warnings()
    last_mod = urllib.request.urlopen(url).info().get('Last-Modified')
    last_mod = dateutil.parser.parse(last_mod).astimezone(dateutil.tz.tzutc())
    if os.path.exists(os.path.join(tooldir, "bin", "kraken")):
        if not os.path.exists(db):
            is_new_version = True
        else:
            cur_file = glob.glob(os.path.join(kraken, "minikraken_*"))[0]
            cur_version = datetime.datetime.utcfromtimestamp(os.path.getmtime(cur_file))
            is_new_version = last_mod.date() > cur_version.date()
            if is_new_version:
                shutil.move(cur_file, cur_file.replace('minikraken', 'old'))
        if not os.path.exists(kraken):
            utils.safe_makedir(kraken)
        if is_new_version:
            if not os.path.exists(compress):
                subprocess.check_call(["wget", "-O", compress, url, "--no-check-certificate"])
            cmd = ["tar", "-xzvf", compress, "-C", kraken]
            subprocess.check_call(cmd)
            last_version = glob.glob(os.path.join(kraken, "minikraken_*"))
            utils.symlink_plus(os.path.join(kraken, last_version[0]),
                               os.path.join(kraken, "minikraken"))
            utils.remove_safe(compress)
        else:
            print("You have the latest version %s." % last_mod)
    else:
        raise argparse.ArgumentTypeError("kraken not installed in tooldir %s." %
                                         os.path.join(tooldir, "bin", "kraken"))

def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    ref_file = dd.get_ref_file(data)
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": covinfo.raw_callable,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        data = samtools.run_and_save(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    elif dd.get_variant_regions(data):
        callable_region_bed, nblock_bed = \
            callable.block_regions(dd.get_variant_regions(data), bam_file, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": dd.get_variant_regions(data),
                           "sample_callable": dd.get_variant_regions(data)}
    return [[data]]

def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        assert sambamba, "Did not find sambamba for indexing"
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            try:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
                do.run(cmd.format(**locals()),
                       "Index BAM file with sambamba: %s" % os.path.basename(in_bam))
            except subprocess.CalledProcessError:
                cmd = "{samtools} index {in_bam} {tx_index_file}"
                do.run(cmd.format(**locals()),
                       "Backup single thread index of BAM file with samtools: %s" % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file

def normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
              rerun_effects=True, remove_oldeffects=False):
    """Normalizes variants and reruns SnpEFF for resulting VCF
    """
    if remove_oldeffects:
        out_file = "%s-noeff-nomultiallelic%s" % utils.splitext_plus(in_file)
    else:
        out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            ready_ma_file = _normalize(in_file, data, passonly=passonly,
                                       normalize_indels=normalize_indels,
                                       split_biallelic=split_biallelic,
                                       remove_oldeffects=remove_oldeffects)
            if rerun_effects:
                ann_ma_file, _ = effects.add_to_vcf(ready_ma_file, data)
                if ann_ma_file:
                    ready_ma_file = ann_ma_file
            utils.symlink_plus(ready_ma_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])

def _enforce_max_region_size(in_file, data):
    """Ensure we don't have any chunks in the region greater than 1Mb.

    Larger sections have high memory usage on VarDictJava and failures on VarDict.
    This creates minimum windows from the input BED file to avoid these issues.
    Downstream VarDict merging sorts out any variants across windows.
    """
    max_size = 1e6
    overlap_size = 250
    def _has_larger_regions(f):
        return any(r.stop - r.start > max_size for r in pybedtools.BedTool(f))
    out_file = "%s-regionlimit%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        if _has_larger_regions(in_file):
            with file_transaction(data, out_file) as tx_out_file:
                pybedtools.BedTool().window_maker(w=max_size,
                                                  s=max_size - overlap_size,
                                                  b=pybedtools.BedTool(in_file)).saveas(tx_out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file

def _grabix_index(data):
    """Create grabix index of bgzip input file.

    grabix does not allow specification of output file, so symlink the original
    file into a transactional directory.
    """
    in_file = data["bgzip_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    gbi_file = _get_grabix_index(in_file)
    # We always build grabix input so we can use it for counting reads and doing downsampling
    if not gbi_file or _is_partial_index(gbi_file):
        if gbi_file:
            utils.remove_safe(gbi_file)
        else:
            gbi_file = in_file + ".gbi"
        with file_transaction(data, gbi_file) as tx_gbi_file:
            tx_in_file = os.path.splitext(tx_gbi_file)[0]
            utils.symlink_plus(in_file, tx_in_file)
            do.run([grabix, "index", tx_in_file],
                   "Index input with grabix: %s" % os.path.basename(in_file))
    assert utils.file_exists(gbi_file)
    return [gbi_file]

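grabix (https://github.com/arq5x/grabix) provides random access into bgzipped files by line number, which is what the read counting and downsampling mentioned in the comment build on. Typical command line usage for orientation (file name invented):

    grabix index reads.fastq.gz         # writes reads.fastq.gz.gbi
    grabix size reads.fastq.gz          # total number of lines
    grabix grab reads.fastq.gz 1 1000   # extract lines 1-1000
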
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    params = {"min_coverage_for_downsampling": 10,
              "max_downsample_multiplier": 200}
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "sample_callable": covinfo.callable,
                           "coverage_depth_bed": covinfo.depth}
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]

def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.callable, bam_file_ready, ref_file, data)
        vrs_file = dd.get_variant_regions_merged(data)
        offtarget_stats = callable.calculate_offtarget_stats(bam_file_ready, data, vrs_file,
                                                             "variant_regions")
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "highdepth": covinfo.highdepth,
                           "sample_callable": covinfo.callable,
                           "coverage_bed": covinfo.coverage,
                           "avg_coverage": covinfo.avg_coverage,
                           "offtarget_stats": offtarget_stats}
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    return [[data]]

def _bedpe_to_vcf(bedpe_file, sconfig_file, items):
    """Convert BEDPE output into a VCF file.
    """
    tovcf_script = do.find_cmd("bedpeToVcf")
    if tovcf_script:
        out_file = "%s.vcf.gz" % utils.splitext_plus(bedpe_file)[0]
        out_nogzip = out_file.replace(".vcf.gz", ".vcf")
        raw_file = "%s-raw.vcf" % utils.splitext_plus(bedpe_file)[0]
        if not utils.file_exists(out_file):
            if not utils.file_exists(raw_file):
                with file_transaction(items[0], raw_file) as tx_raw_file:
                    cmd = [sys.executable, tovcf_script, "-c", sconfig_file,
                           "-f", dd.get_ref_file(items[0]), "-t", "LUMPY",
                           "-b", bedpe_file, "-o", tx_raw_file]
                    do.run(cmd, "Convert lumpy bedpe output to VCF")
            prep_file = vcfutils.sort_by_ref(raw_file, items[0])
            if not utils.file_exists(out_nogzip):
                utils.symlink_plus(prep_file, out_nogzip)
        out_file = vcfutils.bgzip_and_index(out_nogzip, items[0]["config"])
        return out_file

def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    from bcbio.variation import bedutils
    highdepth_beds = list(filter(lambda x: x is not None,
                                 list(set([tz.get_in(["config", "algorithm", "highdepth_regions"], x)
                                           for x in items]))))
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file

def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    This does not yet handle mapping over 1 -> chr1 issues since this requires
    a ton of search/replace which slows down conversion.
    """
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean",
                                               dd.get_sample_name(data)))
    out_file = os.path.join(work_dir, "%s-noextras.bam" %
                            utils.splitext_plus(os.path.basename(in_bam))[0])
    if not utils.file_uptodate(out_file, in_bam):
        with file_transaction(data, out_file) as tx_out_file:
            target_chroms = _target_chroms_and_header(in_bam, data)
            str_chroms = " ".join(target_chroms)
            rg_info = novoalign.get_rg_info(data["rgnames"])
            bcbio_py = sys.executable
            ref_file = dd.get_ref_file(data)
            local_bam = os.path.join(os.path.dirname(tx_out_file), os.path.basename(in_bam))
            utils.symlink_plus(in_bam, local_bam)
            bam.index(local_bam, data["config"])
            cmd = ("samtools view -h {local_bam} {str_chroms} | "
                   """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                   """cleanbam.fix_header("{ref_file}")' | """
                   "samtools view -u - | "
                   "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
            do.run(cmd.format(**locals()),
                   "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file

def _prep_genome(out_dir, data):
    """Create prepped reference directory for pisces.

    Requires a custom GenomeSize.xml file present.
    """
    genome_name = utils.splitext_plus(os.path.basename(dd.get_ref_file(data)))[0]
    out_dir = utils.safe_makedir(os.path.join(out_dir, genome_name))
    ref_file = dd.get_ref_file(data)
    utils.symlink_plus(ref_file, os.path.join(out_dir, os.path.basename(ref_file)))
    with open(os.path.join(out_dir, "GenomeSize.xml"), "w") as out_handle:
        out_handle.write('<sequenceSizes genomeName="%s">' % genome_name)
        for c in pysam.AlignmentFile("%s.dict" % utils.splitext_plus(ref_file)[0]).header["SQ"]:
            cur_ploidy = ploidy.get_ploidy([data], region=[c["SN"]])
            out_handle.write('<chromosome fileName="%s" contigName="%s" totalBases="%s" knownBases="%s" '
                             'isCircular="false" ploidy="%s" md5="%s"/>'
                             % (os.path.basename(ref_file), c["SN"], c["LN"], c["LN"],
                                cur_ploidy, c["M5"]))
        out_handle.write('</sequenceSizes>')
    return out_dir

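For orientation, the XML this writes contains one chromosome element per contig in the reference sequence dictionary. An illustrative two-contig output, with line breaks added for readability (the code writes a single line); the lengths are the real GRCh37 chr1/chr2 sizes, while the md5 values are placeholders:

    <sequenceSizes genomeName="GRCh37">
    <chromosome fileName="GRCh37.fa" contigName="1" totalBases="249250621" knownBases="249250621" isCircular="false" ploidy="2" md5="..."/>
    <chromosome fileName="GRCh37.fa" contigName="2" totalBases="243199373" knownBases="243199373" isCircular="false" ploidy="2" md5="..."/>
    </sequenceSizes>
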
def _trna_annotation(data):
    """
    use tDRmapper to quantify tRNAs
    """
    trna_ref = op.join(dd.get_srna_trna_file(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna", name))
    in_file = op.basename(data["clean_fastq"])
    tdrmapper = os.path.join(os.path.dirname(sys.executable), "TdrMappingScripts.pl")
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_ref) or not file_exists(tdrmapper):
        logger.info("There is no tRNA annotation to run TdrMapper.")
        return work_dir
    out_file = op.join(work_dir, in_file + ".hq_cs.mapped")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && perl {tdrmapper} {trna_ref} {in_file}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*mapped*"):
                    shutil.move(filename, work_dir)
    return work_dir

def _amber_het_file(method, vrn_files, work_dir, paired):
    """Create file of BAFs in normal heterozygous positions compatible with AMBER.

    Two available methods:
      - pon -- Use panel of normals with likely heterozygous sites.
      - variants -- Use pre-existing variant calls, filtered to likely heterozygotes.

    https://github.com/hartwigmedical/hmftools/tree/master/amber
    https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
    """
    assert vrn_files, "Did not find compatible variant calling files for TitanCNA inputs"
    from bcbio.heterogeneity import bubbletree
    if method == "variants":
        amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
        out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
        prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"],
                                             work_dir, paired, AmberWriter)
        utils.symlink_plus(prep_file, out_file)
    else:
        assert method == "pon"
        tumor_counts, normal_counts = gatkcnv.heterogzygote_counts(paired)
        out_file = _count_files_to_amber(tumor_counts, normal_counts, work_dir, paired.tumor_data)
    pcf_file = out_file + ".pcf"
    if not utils.file_exists(pcf_file):
        with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
            r_file = os.path.join(os.path.dirname(tx_out_file), "bafSegmentation.R")
            with open(r_file, "w") as out_handle:
                out_handle.write(_amber_seg_script)
            cmd = "%s && %s --no-environ %s %s %s" % (utils.get_R_exports(), utils.Rscript_cmd(),
                                                      r_file, out_file, pcf_file)
            do.run(cmd, "PURPLE: AMBER baf segmentation")
    return out_file

def _add_wham_classification(in_file, items):
    """Run WHAM classifier to assign a structural variant type to each call.
    """
    wham_sharedir = os.path.normpath(os.path.join(
        os.path.dirname(subprocess.check_output(["which", "WHAM-BAM"])),
        os.pardir, "share", "wham"))
    minclassfreq = 0.4  # Need at least this much support to classify a variant, otherwise unknown
    out_file = "%s-class%s" % utils.splitext_plus(in_file)
    training_data = os.path.join(wham_sharedir, "WHAM_training_data.txt")
    training_data = _prep_training_data(training_data, out_file, items[0])
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            with file_transaction(items[0], out_file) as tx_out_file:
                this_python = sys.executable
                cores = dd.get_cores(items[0])
                cmd = ("{this_python} {wham_sharedir}/classify_WHAM_vcf.py --proc {cores} "
                       "--minclassfreq {minclassfreq} {in_file} {training_data} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Classify WHAM calls")
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file

def variantcall_sample(data, region=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample."""
    if out_file is None or not os.path.exists(out_file) or not os.path.lexists(out_file):
        utils.safe_makedir(os.path.dirname(out_file))
        sam_ref = data["sam_ref"]
        config = data["config"]
        caller_fns = get_variantcallers()
        caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")]
        if isinstance(data["work_bam"], basestring):
            align_bams = [data["work_bam"]]
            items = [data]
        else:
            align_bams = data["work_bam"]
            items = data["group_orig"]
        call_file = "%s-raw%s" % utils.splitext_plus(out_file)
        call_file = caller_fn(align_bams, items, sam_ref,
                              data["genome_resources"]["variation"],
                              region, call_file)
        if data["config"]["algorithm"].get("phasing", False) == "gatk":
            call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref,
                                                    region, config)
        utils.symlink_plus(call_file, out_file)
    data["vrn_file"] = out_file
    return [data]
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.
    """
    # not split, do nothing
    if "group_orig" not in data:
        return [[data]]
    # cancer tumor/normal
    elif vcfutils.get_paired_phenotype(data):
        out = []
        for sub_data in get_orig_items(data):
            if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                sub_data["vrn_file"] = data["vrn_file"]
            else:
                sub_data.pop("vrn_file", None)
            out.append([sub_data])
        return out
    # population or single sample
    else:
        out = []
        for sub_data in get_orig_items(data):
            sub_vrn_file = data["vrn_file"].replace(str(data["group"][0]) + "-",
                                                    str(sub_data["name"][-1]) + "-")
            if len(vcfutils.get_samples(data["vrn_file"])) > 1:
                vcfutils.select_sample(data["vrn_file"], str(sub_data["name"][-1]),
                                       sub_vrn_file, data["config"])
            elif not os.path.exists(sub_vrn_file):
                utils.symlink_plus(data["vrn_file"], sub_vrn_file)
            sub_data["vrn_file_batch"] = data["vrn_file"]
            sub_data["vrn_file"] = sub_vrn_file
            out.append([sub_data])
        return out
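# A minimal illustration (not part of the pipeline) of the filename
# substitution in the population branch above: the per-sample file name is
# derived from the batch file by swapping the batch prefix for the sample
# name. The batch and sample names here are hypothetical.
batch_vrn_file = "/work/batch1-gatk.vcf.gz"
sample_vrn_file = batch_vrn_file.replace("batch1-", "NA12878-")
assert sample_vrn_file == "/work/NA12878-gatk.vcf.gz"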
def _enforce_max_region_size(in_file, data):
    """Ensure we don't have any chunks in the region greater than 20kb.

    VarDict memory usage depends on size of individual windows in the input file.
    This breaks regions into 20kb chunks with 250bp overlaps. 20kb gives
    ~1Gb/core memory usage and the overlaps avoid missing indels spanning a gap.
    Downstream VarDict merging sorts out any variants across windows.

    https://github.com/AstraZeneca-NGS/VarDictJava/issues/64
    """
    max_size = 20000
    overlap_size = 250
    def _has_larger_regions(f):
        return any(r.stop - r.start > max_size for r in pybedtools.BedTool(f))
    out_file = "%s-regionlimit%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        if _has_larger_regions(in_file):
            with file_transaction(data, out_file) as tx_out_file:
                pybedtools.BedTool().window_maker(w=max_size,
                                                  s=max_size - overlap_size,
                                                  b=pybedtools.BedTool(in_file)).saveas(tx_out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file
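# A standalone sketch of the windowing arithmetic above (assuming pybedtools
# and a bedtools binary are installed): a 50kb region becomes 20kb windows
# stepping by 20000 - 250 = 19750bp, so adjacent windows share a 250bp overlap.
import pybedtools

region = pybedtools.BedTool("chr1 0 50000", from_string=True)
windows = pybedtools.BedTool().window_maker(b=region, w=20000, s=19750)
# Expected windows: chr1:0-20000, chr1:19750-39750, chr1:39500-50000
# (the final window is truncated at the region end)
for w in windows:
    print(w.chrom, w.start, w.stop)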
def trim_srna_sample(data):
    """Remove 3' adapter for smallRNA-seq.

    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and adapter:
        adapter = adapter[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
    else:
        logger.debug("Skip trimming for: %s" % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def _run_ensemble(batch_id, vrn_files, config_file, base_dir, ref_file, data):
    """Run an ensemble call using merging and SVM-based approach in bcbio.variation."""
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
    out_bed_file = os.path.join(base_dir, "{0}-callregions.bed".format(batch_id))
    work_dir = "%s-work" % os.path.splitext(out_vcf_file)[0]
    if not utils.file_exists(out_vcf_file):
        _bcbio_variation_ensemble(vrn_files, out_vcf_file, ref_file, config_file,
                                  base_dir, data)
    if not utils.file_exists(out_vcf_file):
        base_vcf = glob.glob(os.path.join(work_dir, "prep", "*-cfilter.vcf"))[0]
        utils.symlink_plus(base_vcf, out_vcf_file)
    if not utils.file_exists(out_bed_file):
        multi_beds = glob.glob(os.path.join(work_dir, "prep", "*-multicombine.bed"))
        if len(multi_beds) > 0:
            utils.symlink_plus(multi_beds[0], out_bed_file)
    return {"variantcaller": "ensemble",
            "vrn_file": out_vcf_file,
            "bed_file": out_bed_file if os.path.exists(out_bed_file) else None}
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already)."""
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    if in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        if needs_convert:
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif objectstore.is_remote(in_file) and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if needs_bgzip or needs_gunzip or needs_convert or objectstore.is_remote(in_file):
        out_file = _bgzip_file(in_file, data["config"], work_dir,
                               needs_bgzip, needs_gunzip, needs_convert)
    else:
        out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data),
                                                     os.path.basename(in_file)))
        utils.symlink_plus(in_file, out_file)
    return out_file
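# A minimal sketch, not the bcbio implementation, of the kind of check
# _check_gzipped_input has to make: distinguishing a plain gzip input (needs
# re-compression) from one that is already bgzipped. BGZF files are gzip
# members with the FEXTRA flag set and a "BC" extra subfield (see the BGZF
# section of the SAM specification); checking the first four header bytes is
# a common heuristic, not a complete validation.
def _is_bgzipped(fname):
    with open(fname, "rb") as in_handle:
        header = in_handle.read(4)
    # \x1f\x8b: gzip magic; \x08: deflate method; \x04: FLG byte with FEXTRA set
    return header == b"\x1f\x8b\x08\x04"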
def _prepare_smoove_bams(full_bams, sr_bams, disc_bams, items, tx_work_dir):
    """Prepare BAMs for smoove, linking in pre-existing split/disc BAMs if present.

    Smoove can use pre-existing discordant and split BAMs prepared by samblaster
    if present as $sample.split.bam and $sample.disc.bam.
    """
    input_dir = utils.safe_makedir(tx_work_dir)
    out = []
    for full, sr, disc, data in zip(full_bams, sr_bams, disc_bams, items):
        if sr and disc:
            new_full = os.path.join(input_dir, "%s.bam" % dd.get_sample_name(data))
            new_sr = os.path.join(input_dir, "%s.split.bam" % dd.get_sample_name(data))
            new_disc = os.path.join(input_dir, "%s.disc.bam" % dd.get_sample_name(data))
            utils.symlink_plus(full, new_full)
            utils.symlink_plus(sr, new_sr)
            utils.symlink_plus(disc, new_disc)
            out.append(new_full)
        else:
            out.append(full)
    return out
def trim_srna_sample(data):
    """Remove 3' adapter for smallRNA-seq.

    Uses atropos with different parameters than for other pipelines.
    """
    data = umi_transform(data)
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    log_out = os.path.join(out_dir, "%s.log" % names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["files"][0] = out_file
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        data["log_trimming"] = log_out
        return [[data]]
    adapter = dd.get_adapters(data)
    is_4n = any([a == "4N" for a in adapter])
    adapter = [a for a in adapter if re.compile("^([NATGC]+)$").match(a)]
    if adapter and not trim_reads:
        trim_reads = True
        logger.info("An adapter is set in the config file but trim_reads is not true. "
                    "If you want to skip trimming, remove the adapter option from the config.")
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    if trim_reads:
        adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if not trim_reads or len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        if any([a for a in adapters if re.compile("^N+$").match(a)]):
            adapter_cmd = "-N %s" % adapter_cmd
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        if options.strip() == "-u 4 -u -4":
            options = ""
            is_4n = "4N"  # flag a second trimming pass for 4N protocols
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options were found in the YAML file. "
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if is_4n:
                    options = "-u 4 -u -4"
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "atropos with options %s for %s" % (options, names))
        data["log_trimming"] = log_out
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapters found in %s; this may indicate a lack of "
                        "small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["files"][0] = out_file
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
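# The "-u 4 -u -4" second pass above handles 4N protocols, where four random
# bases flank each end of the insert. In cutadapt/atropos, a positive -u
# removes bases from the 5' end and a negative -u from the 3' end, so the
# extra atropos run is equivalent to this pure-Python sketch (illustration
# only, not pipeline code):
def _strip_4n(seq):
    """Drop the 4nt random ends flanking the insert."""
    return seq[4:-4]

assert _strip_4n("NNNNACGTACGTNNNN") == "ACGTACGT"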
def _run_purple(paired, het_file, depth_file, work_dir):
    """Run PURPLE with pre-calculated AMBER and COBALT compatible inputs.

    XXX Need to add output conversion into VCF for standard formats
    """
    purple_dir = utils.safe_makedir(os.path.join(work_dir, "purple"))
    out_file = os.path.join(purple_dir, "%s.purple.cnv" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = ["PURPLE",
                   "-amber", os.path.dirname(het_file),
                   "-baf", het_file,
                   "-cobalt", os.path.dirname(depth_file),
                   "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"],
                   "-output_dir", os.path.dirname(tx_out_file),
                   "-ref_genome", "hg38" if dd.get_genome_build(paired.tumor_data) == "hg38" else "hg19",
                   "-run_dir", work_dir,
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor_sample", dd.get_sample_name(paired.tumor_data),
                   "-ref_sample", dd.get_sample_name(paired.normal_data)]
            # Avoid X11 display errors when writing plots
            cmd = "unset DISPLAY && %s" % " ".join([str(x) for x in cmd])
            do.run(cmd, "PURPLE: purity and ploidy estimation")
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(purple_dir, f))
    out_file_export = os.path.join(purple_dir,
                                   "%s-purple-cnv.tsv" % (dd.get_sample_name(paired.tumor_data)))
    if not utils.file_exists(out_file_export):
        utils.symlink_plus(out_file, out_file_export)
    out = {"variantcaller": "purple", "call_file": out_file_export,
           "plot": {}, "metrics": {}}
    for name, ext in [("copy_number", "copyNumber"), ("minor_allele", "minor_allele"),
                      ("variant", "variant")]:
        plot_file = os.path.join(purple_dir, "plot",
                                 "%s.%s.png" % (dd.get_sample_name(paired.tumor_data), ext))
        if os.path.exists(plot_file):
            out["plot"][name] = plot_file
    purity_file = os.path.join(purple_dir, "%s.purple.purity" % dd.get_sample_name(paired.tumor_data))
    with open(purity_file) as in_handle:
        header = in_handle.readline().replace("#", "").split("\t")
        vals = in_handle.readline().split("\t")
        for h, v in zip(header, vals):
            try:
                v = float(v)
            except ValueError:
                pass
            out["metrics"][h] = v
    return out
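# A standalone sketch of the purity-file parsing above, run against a
# hypothetical two-column example (the real .purple.purity header may carry
# different columns): the leading "#" is stripped from the header, each value
# is coerced to float when possible, and the result lands in out["metrics"].
import io

example = io.StringIO("#purity\tploidy\n0.85\t2.01\n")
header = example.readline().replace("#", "").strip().split("\t")
vals = example.readline().strip().split("\t")
metrics = {}
for h, v in zip(header, vals):
    try:
        v = float(v)
    except ValueError:
        pass
    metrics[h] = v
assert metrics == {"purity": 0.85, "ploidy": 2.01}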
def _get_samples_to_process(fn, out_dir, config, force_single, separators):
    """Parse a CSV file with one input file per line.

    Merges all files that share the same description (sample) name.
    """
    out_dir = os.path.abspath(out_dir)
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            if l.find("description") > 0:
                logger.info("Skipping header.")
                continue
            cols = l.strip().split(",")
            if len(cols) > 0:
                if len(cols) < 2:
                    raise ValueError("Line needs 2 values: file and name.")
                if utils.file_exists(cols[0]) or is_gsm(cols[0]) or is_srr(cols[0]):
                    if cols[0].find(" ") > -1:
                        new_name = os.path.abspath(cols[0].replace(" ", "_"))
                        logger.warning("Spaces found in %s; linked to %s." % (cols[0], new_name))
                        logger.warning("Please avoid names with spaces in the future.")
                        utils.symlink_plus(os.path.abspath(cols[0]), new_name)
                        cols[0] = new_name
                    samples[cols[1]].append(cols)
                else:
                    logger.info("Skipping %s: file does not exist." % cols[0])
    for sample, items in samples.items():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        elif is_gsm(items[0][0]):
            fn = "query_gsm"
            ext = ".fastq.gz"
        elif is_srr(items[0][0]):
            fn = "query_gsm"
            ext = ".fastq.gz"
        files = [os.path.abspath(fn_file[0]) if utils.file_exists(fn_file[0]) else fn_file[0]
                 for fn_file in items]
        samples[sample] = [{'files': _check_paired(files, force_single, separators),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': fn, 'anno': items[0][2:], 'config': config,
                            'name': sample, 'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
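# A hypothetical input CSV for the function above: one file per line, the
# sample description in the second column, and optional annotation columns
# after it. The two FASTQ rows share the "sampleA" description, so they are
# merged into a single sampleA.fastq.gz output (all paths and names here are
# made up):
#
#   samplename,description,batch
#   /data/run1/sampleA_L001_R1.fastq.gz,sampleA,b1
#   /data/run2/sampleA_L002_R1.fastq.gz,sampleA,b1
#   /data/run1/sampleB.bam,sampleB,b1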
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data, names=None):
    """Run alignment using TopHat v2."""
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)
    ref_file, runner = _determine_aligner_and_reference(ref_file, config)
    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")
    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")
    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True
    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    out_file = os.path.join(out_dir, "accepted_hits.bam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file, ref_file,
                                                        out_base, tx_out_dir, data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            cmd = [sys.executable, config_utils.get_program("tophat", config)]
            for k, v in options.items():
                if v is True:
                    cmd.append("--%s" % k)
                else:
                    assert not isinstance(v, bool)
                    cmd.append("--%s=%s" % (k, v))
            # tophat requires options before arguments, otherwise it silently ignores them
            cmd += files
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file))
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.bam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed_unmapped = _fix_unmapped(fixed, unmapped, data)
    fixed = merge_unmapped(fixed, fixed_unmapped, config)
    fixed = _add_rg(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"],
                          os.path.splitext(fixed)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
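# A standalone illustration of the option-to-flag conversion loop above:
# boolean True options become bare flags, everything else becomes --key=value
# (and TopHat must see all options before the positional arguments). Option
# values here are hypothetical.
options = {"no-mixed": True, "mate-inner-dist": 200}
flags = []
for k, v in options.items():
    flags.append("--%s" % k if v is True else "--%s=%s" % (k, v))
assert flags == ["--no-mixed", "--mate-inner-dist=200"]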
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses the -A flag, which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        raw_file = "%s-raw%s" % utils.splitext_plus(out_file)
        with file_transaction(items[0], raw_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs, region, out_file,
                                                   items=items, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(_vardict_options_from_config(items, config, out_file, target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require at least 50 reads of supporting coverage
                var2vcf_opts = " -v 50 " if dd.get_avg_coverage(items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = ("unset R_HOME && unset JAVA_HOME && export PATH=%s:$PATH && " %
                         os.path.dirname(utils.Rscript_cmd()))
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                       """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """
                       "| bcftools filter -i 'QUAL >= 0' "
                       "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = raw_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if raw_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
        if assoc_files.get("dbsnp"):
            annotation.add_dbsnp(raw_file, assoc_files["dbsnp"], items[0], out_file)
        else:
            utils.symlink_plus(raw_file, out_file)
    return out_file
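# Illustration of the allele-frequency handoff above: bcbio stores
# min_allele_fraction as a percentage (default 10), while VarDict's -f flag
# expects a fraction, hence the divide-by-100.
min_allele_fraction = 10  # percent, from the algorithm configuration
freq = float(min_allele_fraction) / 100.0
assert freq == 0.1  # passed to both vardict -f and var2vcf_valid.pl -f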
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect variants with VarDict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        raw_file = "%s-raw%s" % utils.splitext_plus(out_file)
        with file_transaction(items[0], raw_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(items[0]),
                                                   region, out_file, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region, out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts = " ".join(_vardict_options_from_config(items, config, out_file, target))
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require at least 50 reads of supporting coverage
                var2vcf_opts = " -v 50 " if dd.get_avg_coverage(items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                      "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                      """| %s -c 'from bcbio.variation import freebayes; """
                                      """freebayes.call_somatic("%s", "%s")' """
                                      % (sys.executable, paired.tumor_name, paired.normal_name))
                    freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                                   "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                                   % (os.path.join(os.path.dirname(sys.executable), "py"),
                                      0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = ("unset R_HOME && unset JAVA_HOME && export PATH=%s:$PATH && " %
                         os.path.dirname(utils.Rscript_cmd()))
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| {strandbias} "
                       "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """
                       "{freq_filter} "
                       "| bcftools filter -i 'QUAL >= 0' "
                       "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
        if assoc_files.get("dbsnp"):
            annotation.add_dbsnp(raw_file, assoc_files["dbsnp"], items[0], out_file)
        else:
            utils.symlink_plus(raw_file, out_file)
    return out_file
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data, names=None):
    """Run alignment using TopHat v2."""
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, config)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)
    ref_file, runner = _determine_aligner_and_reference(ref_file, config)
    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")
    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")
    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True
    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "%s.sam" % out_base)
    if file_exists(final_out):
        return final_out
    out_file = os.path.join(out_dir, "accepted_hits.sam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file, ref_file,
                                                        out_base, tx_out_dir, data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-convert-bam"] = True
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            tophat_runner = sh.Command(config_utils.get_program("tophat", config))
            ready_options = {}
            for k, v in options.iteritems():
                ready_options[k.replace("-", "_")] = v
            # tophat requires options before arguments,
            # otherwise it silently ignores them
            tophat_ready = tophat_runner.bake(**ready_options)
            cmd = str(tophat_ready.bake(*files))
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file), None)
    _fix_empty_readnames(out_file)
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.sam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed = merge_unmapped(fixed, unmapped, config)
    fixed = _fix_unmapped(fixed, config, names)
    fixed = bam.sort(fixed, config)
    fixed = bam.bam_to_sam(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run the MuTect paired analysis algorithm."""
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            return
        out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
        if not file_exists(out_file_orig):
            with file_transaction(config, out_file_orig) as tx_out_file:
                # Rationale: MuTect writes another table to stdout, which we don't need
                params += ["--vcf", tx_out_file, "-o", os.devnull]
                broad_runner.run_mutect(params)
        is_paired = "-I:normal" in params
        if not utils.file_uptodate(out_file_mutect, out_file_orig):
            out_file_mutect = _fix_mutect_output(out_file_orig, config, out_file_mutect, is_paired)
        indelcaller = vcfutils.get_indelcaller(base_config)
        if ("scalpel" in indelcaller.lower() and region and isinstance(region, (tuple, list))
              and chromhacks.is_autosomal_or_sex(region[0])):
            # Scalpel InDels
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if scalpel.is_installed(items[0]["config"]):
                if not is_paired:
                    vcfutils.check_paired_problems(items)
                    scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                else:
                    scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif "pindel" in indelcaller.lower():
            from bcbio.structural import pindel
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if pindel.is_installed(items[0]["config"]):
                pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file, ref_file=ref_file,
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif (("somaticindeldetector" in indelcaller.lower() or "sid" in indelcaller.lower())
              and "appistry" in broad_runner.get_mutect_version()):
            # SomaticIndelDetector InDels
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(config, out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    return out_file