def _vardict_options_from_config(items, config, out_file, target=None, is_rnaseq=False):
    var2vcf_opts = []
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    cores = dd.get_num_cores(items[0])
    if cores and cores > 1:
        opts += ["-th", str(cores)]
    # Disable SV calling for vardict, causes issues with regional analysis
    # by detecting SVs outside of target regions, which messes up merging
    # SV calling will be worked on as a separate step
    vardict_cl = get_vardict_command(items[0])
    version = programs.get_version_manifest(vardict_cl)
    if (vardict_cl and version and
          ((vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.5"))
           or (vardict_cl == "vardict" and LooseVersion(version) >= LooseVersion("2018.07.25")))):
        opts += ["--nosv"]
    if (vardict_cl and version and
          (vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.6"))):
        opts += ["--deldupvar"]
    # remove low mapping quality reads
    if not is_rnaseq:
        opts += ["-Q", "10"]
    # Remove QCfail reads, avoiding high depth repetitive regions
    opts += ["-F", "0x700"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += [str(x) for x in resources["options"]]
    resources = config_utils.get_resources("var2vcf", config)
    if resources.get("options"):
        var2vcf_opts += [str(x) for x in resources["options"]]
    if target and _is_bed_file(target):
        target = _enforce_max_region_size(target, items[0])
        opts += [target]  # this must be the last option
    return " ".join(opts), " ".join(var2vcf_opts)
def cpu_and_memory(programs, items):
    """Retrieve CPU and memory/core specified in configuration input.
    """
    assert len(items) > 0, "Finding job resources but no items to process"
    config = items[0]["config"]
    all_cores = []
    all_memory = []
    algs = [config_utils.get_algorithm_config(x) for x in items]
    progs = _get_resource_programs(programs, algs)
    # Calculate cores
    for prog in progs:
        resources = config_utils.get_resources(prog, config)
        all_cores.append(resources.get("cores", 1))
    if len(all_cores) == 0:
        all_cores.append(1)
    cores_per_job = max(all_cores)
    # Calculate memory. Use 1Gb memory usage per core as min baseline if not specified
    for prog in progs:
        resources = config_utils.get_resources(prog, config)
        memory = _get_prog_memory(resources, cores_per_job)
        if memory:
            all_memory.append(memory)
    if len(all_memory) == 0:
        all_memory.append(1)
    memory_per_core = max(all_memory)
    return cores_per_job, memory_per_core
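# Hedged illustration (not from the bcbio source): a standalone sketch of the
# aggregation cpu_and_memory performs, using a hypothetical per-program resources
# mapping with numeric Gb values. Cores and per-core memory are each taken as the
# maximum across the programs in play, defaulting to 1.
def _sketch_cpu_and_memory(resources_by_prog):
    cores = [r.get("cores", 1) for r in resources_by_prog.values()] or [1]
    memory = [r["memory"] for r in resources_by_prog.values() if r.get("memory")] or [1]
    return max(cores), max(memory)

# {"bwa": {"cores": 16, "memory": 1}, "samtools": {"cores": 4, "memory": 2}} -> (16, 2)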
def sort(in_bam, config, order="coordinate"):
    """Sort a BAM file, skipping if already present.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    if bam_already_sorted(in_bam, config, order):
        return in_bam

    sort_stem = _get_sort_stem(in_bam, order)
    sort_file = sort_stem + ".bam"
    if not utils.file_exists(sort_file):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, sort_file) as tx_sort_file:
            tx_sort_stem = os.path.splitext(tx_sort_file)[0]
            tx_dir = utils.safe_makedir(os.path.dirname(tx_sort_file))
            order_flag = "-n" if order == "queryname" else ""
            resources = config_utils.get_resources("samtools", config)
            mem = resources.get("memory", "2G")
            samtools_cmd = ("{samtools} sort -@ {cores} -m {mem} {order_flag} "
                            "{in_bam} {tx_sort_stem}")
            if sambamba:
                if tz.get_in(["resources", "sambamba"], config):
                    sm_resources = config_utils.get_resources("sambamba", config)
                    mem = sm_resources.get("memory", "2G")
                # sambamba uses total memory, not memory per core
                mem = config_utils.adjust_memory(mem, cores, "increase").upper()
                # Use samtools compatible natural sorting
                # https://github.com/lomereiter/sambamba/issues/132
                order_flag = "--natural-sort" if order == "queryname" else ""
                cmd = ("{sambamba} sort -t {cores} -m {mem} {order_flag} "
                       "-o {tx_sort_file} --tmpdir={tx_dir} {in_bam}")
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(cmd.format(**locals()),
                       "Sort BAM file (multi core, %s): %s to %s" %
                       (order, os.path.basename(in_bam), os.path.basename(sort_file)))
            except Exception:
                logger.exception("Multi-core sorting failed, reverting to single core")
                resources = config_utils.get_resources("samtools", config)
                mem = resources.get("memory", "2G")
                cores = 1
                order_flag = "-n" if order == "queryname" else ""
                do.run(samtools_cmd.format(**locals()),
                       "Sort BAM file (single core, %s): %s to %s" %
                       (order, os.path.basename(in_bam), os.path.basename(sort_file)))
    return sort_file
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, assoc_files.get("dbsnp"),
                                   region, out_file)
        assert broad_runner.gatk_type() == "restricted", \
            "Require full version of GATK 2.4+ for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            params += ["-T", "HaplotypeCaller",
                       "-o", tx_out_file,
                       "--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            if _joint_calling(items):  # Prepare gVCFs if doing joint calling
                params += ["--emitRefConfidence", "GVCF", "--variant_index_type", "LINEAR",
                           "--variant_index_parameter", "128000"]
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params)
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def _fix_gatk_header(exist_files, out_file, config):
    """Ensure consistent headers for VCF concatenation.

    Fixes problems for genomes that start with chrM by reheadering the first file.
    These files do haploid variant calling which lack the PID phasing key/value pair
    in FORMAT, so initial chrM samples cause errors during concatenation due to the
    lack of header merging. This fixes this by updating the first header.
    """
    from bcbio.variation import ploidy
    c, base_file = exist_files[0]
    replace_file = base_file
    items = [{"config": config}]
    if ploidy.get_ploidy(items, region=(c, 1, 2)) == 1:
        for c, x in exist_files[1:]:
            if ploidy.get_ploidy(items, (c, 1, 2)) > 1:
                replace_file = x
                break
    base_fix_file = os.path.join(os.path.dirname(out_file),
                                 "%s-fixheader%s" % utils.splitext_plus(os.path.basename(base_file)))
    with file_transaction(config, base_fix_file) as tx_out_file:
        header_file = "%s-header.vcf" % utils.splitext_plus(tx_out_file)[0]
        do.run("zgrep ^# %s > %s" % (replace_file, header_file),
               "Prepare header file for merging")
        resources = config_utils.get_resources("picard", config)
        ropts = []
        if "options" in resources:
            ropts += [str(x) for x in resources.get("options", [])]
        do.run("%s && picard FixVcfHeader HEADER=%s INPUT=%s OUTPUT=%s %s" %
               (utils.get_java_clprep(), header_file, base_file, base_fix_file, " ".join(ropts)),
               "Reheader initial VCF file in merge")
    bgzip_and_index(base_fix_file, config)
    return [base_fix_file] + [x for (c, x) in exist_files[1:]]
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                   region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, assoc_files.get("dbsnp"),
                                   assoc_files.get("cosmic"), region, out_file)
        assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
            "Require full version of GATK 3.5+ for mutect2 calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            params += ["-T", "MuTect2",
                       "-o", tx_out_file,
                       "--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("mutect2")
            broad_runner.run_gatk(params)
    return out_file
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = ("{bamtools} merge -list {bam_file_list} | "
                           "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
def _prep_config(items, paired, work_dir):
    """Run initial configuration, generating a run directory for Manta.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    out_file = os.path.join(work_dir, "runWorkflow.py")
    if not utils.file_exists(out_file) or _out_of_date(out_file):
        config_script = os.path.realpath(utils.which("configManta.py"))
        cmd = [utils.get_program_python("configManta.py"), config_script]
        if paired:
            if paired.normal_bam:
                cmd += ["--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam]
            else:
                cmd += ["--tumorBam=%s" % paired.tumor_bam]
        else:
            cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items]
        data = paired.tumor_data if paired else items[0]
        cmd += ["--referenceFasta=%s" % dd.get_ref_file(data), "--runDir=%s" % work_dir]
        if dd.get_coverage_interval(data) not in ["genome"]:
            cmd += ["--exome"]
        for region in _maybe_limit_chromosomes(data):
            cmd += ["--region", region]
        resources = config_utils.get_resources("manta", data["config"])
        if resources.get("options"):
            cmd += [str(x) for x in resources["options"]]
        # If we are removing polyX, avoid calling on small indels which require
        # excessively long runtimes on noisy WGS runs
        if "polyx" in dd.get_exclude_regions(data):
            cmd += ["--config", _prep_streamlined_config(config_script, work_dir)]
        do.run(cmd, "Configure manta SV analysis")
    return out_file
def _cutadapt_trim_cmd(fastq_files, quality_format, adapters, out_files, data):
    """Trimming with cutadapt, using version installed with bcbio-nextgen.
    """
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    if quality_format == "illumina":
        quality_base = "64"
    else:
        quality_base = "33"
    # --times=2 tries twice to remove adapters, which allows things like:
    # realsequenceAAAAAAadapter to remove both the poly-A and the adapter
    # this behavior might not be what we want; we could also do two or
    # more passes of cutadapt
    cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
    adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
    ropts = " ".join(str(x) for x in
                     config_utils.get_resources("cutadapt", data["config"]).get("options", []))
    base_cmd = ("{cutadapt} {ropts} --times=2 --quality-base={quality_base} "
                "--quality-cutoff=5 --format=fastq "
                "{adapter_cmd} ").format(**locals())
    if len(fastq_files) == 2:
        # support for the single-command paired trimming introduced in
        # cutadapt 1.8
        adapter_cmd = adapter_cmd.replace("-a ", "-A ")
        base_cmd += "{adapter_cmd} ".format(adapter_cmd=adapter_cmd)
        return _cutadapt_pe_cmd(fastq_files, out_files, quality_format, base_cmd, data)
    else:
        return _cutadapt_se_cmd(fastq_files, out_files, base_cmd, data)
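# Hedged illustration (not from the bcbio source): each configured adapter becomes its
# own "-a" argument, and for paired-end input the same list is repeated as "-A" so
# cutadapt also trims read 2. The adapter sequence here is a hypothetical example.
adapters = ["AGATCGGAAGAGC"]
adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
assert adapter_cmd == "-a AGATCGGAAGAGC"
assert adapter_cmd.replace("-a ", "-A ") == "-A AGATCGGAAGAGC"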
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with utils.curdir_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                   region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = ["-T", "MuTect2",
                      "-R", ref_file,
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            for a in annotation.get_gatk_annotations(items[0]["config"]):
                params += ["--annotation", a]
            paired = vcfutils.get_paired_bams(align_bams, items)
            params += _add_tumor_params(paired)
            params += _add_region_params(region, out_file, items)
            params += _add_assoc_params(assoc_files)
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner = broad.runner_from_config(items[0]["config"])
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = " ".join(broad_runner.cl_gatk(params, os.path.dirname(tx_out_file)))
            pp_cmd = _post_process_cl(paired)
            cmd = "{gatk_cmd} | {pp_cmd} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "MuTect2")
    out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
def run(name, chip_bam, input_bam, genome_build, out_dir, config):
    """Run macs2 for chip and input samples, avoiding errors due to samples.
    """
    # output file name needs to have the caller name
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        return out_file
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(config_utils.get_resources("macs2", config).get("options", ""))
    if genome_build not in HS and options.find("-g") == -1:
        raise ValueError("This %s genome doesn't have a pre-set value. "
                         "You can add specific values using resources "
                         "option for macs2 in the YAML file (-g genome_size). "
                         "Check Chip-seq configuration in "
                         "bcbio-nextgen documentation." % genome_build)
    genome_size = "" if options.find("-g") > -1 else "-g %s" % HS[genome_build]
    with utils.chdir(out_dir):
        cmd = _macs2_cmd()
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return out_file
def _bgzip_from_bam(bam_file, dirs, config):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = _get_bgzip_cmd(config)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    if not utils.file_exists(out_file_1):
        with file_transaction(out_file_1) as tx_out_file:
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                   checks=[do.file_reasonable_size(tx_out_file, bam_file)])
    return [x for x in [out_file_1, out_file_2] if x is not None]
def __init__(self, picard_ref, gatk_dir, config):
    resources = config_utils.get_resources("gatk", config)
    self._jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    self._picard_ref = config_utils.expand_path(picard_ref)
    self._gatk_dir = config_utils.expand_path(gatk_dir) or config_utils.expand_path(picard_ref)
    self._config = config
    self._gatk_version, self._picard_version = None, None
def run(data):
    # cmd line: java -Xmx1G -jar Oncofuse.jar input_file input_type tissue_type output_file
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == "GRCh37":  # assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ["star"]:
            input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ["tophat", "tophat2"]:
            input_file = _fix_tophat_junction_output(input_file)
    elif "hg19" not in genome_build:
        return None
    # handle cases when fusion file doesn't exist
    if not file_exists(input_file):
        return None
    out_file = os.path.join(input_dir, "oncofuse_out.txt")
    if file_exists(out_file):
        return out_file
    oncofuse = config_utils.get_program("oncofuse", config)
    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = [oncofuse]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        with file_transaction(data, out_file) as tx_out_file:
            cl += [input_file, input_type, tissue_type, tx_out_file]
            cmd = " ".join(cl)
            try:
                do.run(cmd, "oncofuse fusion detection", data)
            except Exception:
                do.run("touch %s && echo '# failed' >> %s" % (tx_out_file, tx_out_file),
                       "oncofuse failed", data)
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        novoalign.check_samtools_version(config)
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                       "{fastq_file} {pair_file} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def run_vep(data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    out_file = utils.append_stem(data["vrn_file"], "-vepeffects")
    assert data["vrn_file"].endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE",
                              "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout"] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--sift", "b", "--polyphen", "b", "--symbol", "--numbers", "--biotype",
                       "--total_length", "--canonical", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)] + \
                      dbnsfp_args + loftee_args
                cmd = "gunzip -c %s | %s | bgzip -c > %s" % (data["vrn_file"], " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        check_samtools_version()
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def _run_qsnp_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect somatic mutations with qSNP.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        out_file = out_file.replace(".gz", "")
        with file_transaction(config, out_file) as tx_out_file:
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    paired = get_paired_bams(align_bams, items)
                    qsnp = config_utils.get_program("qsnp", config)
                    resources = config_utils.get_resources("qsnp", config)
                    mem = " ".join(resources.get("jvm_opts", ["-Xms750m -Xmx4g"]))
                    qsnp_log = os.path.join(tmpdir, "qsnp.log")
                    qsnp_init = os.path.join(tmpdir, "qsnp.ini")
                    if region:
                        paired = _create_bam_region(paired, region, tmpdir)
                    _create_input(paired, tx_out_file, ref_file, assoc_files['dbsnp'], qsnp_init)
                    cl = ("{qsnp} {mem} -i {qsnp_init} -log {qsnp_log}")
                    do.run(cl.format(**locals()), "Genotyping paired variants with Qsnp", {})
        out_file = _filter_vcf(out_file)
        out_file = bgzip_and_index(out_file, config)
    return out_file
def _config_params(base_config, assoc_files, region, out_file):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = base_config["algorithm"].get("variant_regions")
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params += ["--minimum_mutation_cell_fraction", "%.2f" % (min_af / 100.0)]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    return params
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.
    """
    opts = []
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]
    variant_regions = bedutils.merge_overlaps(utils.get_in(config, ("algorithm", "variant_regions")),
                                              items[0])
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    if "--min-alternate-fraction" not in " ".join(opts) and "-F" not in " ".join(opts):
        # add minimum reportable allele frequency, for which FreeBayes defaults to 20
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 20)) / 100.0
        opts += ["--min-alternate-fraction", str(min_af)]
    return opts
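# Hedged note (illustrative, not from the bcbio source): min_allele_fraction is
# configured as a percentage and divided by 100 before being handed to FreeBayes,
# which expects a fraction.
assert float(10) / 100.0 == 0.1  # configured 10% -> "--min-alternate-fraction 0.1"
assert float(20) / 100.0 == 0.2  # default 20%   -> "--min-alternate-fraction 0.2"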
def _scalpel_options_from_config(items, config, out_file, region, tmp_path):
    opts = []
    opts += ["--format", "vcf", "--intarget"]  # output vcf, report only variants within bed regions
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--bed", target]
        else:
            tmp_bed = os.path.join(tmp_path, "tmp.bed")
            with file_transaction(tmp_bed) as tx_tmp_bed:
                if not isinstance(region, (list, tuple)):
                    message = ("Region must be a tuple - something odd just happened")
                    raise ValueError(message)
                chrom, start, end = region
                with open(tx_tmp_bed, "w") as out_handle:
                    out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
            opts += ["--bed", tmp_bed]
    resources = config_utils.get_resources("scalpel", config)
    if resources.get("options"):
        opts += resources["options"]
    if "--outratio" not in " ".join(opts):
        # add minimum reportable allele frequency, for which Scalpel defaults to 5
        # but other somatic tools in bcbio default to 10
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
        opts += ["--outratio", str(min_af)]
    return opts
def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file, todo="square"):
    """Run squaring or merging analysis using bcbio.variation.recall.
    """
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    # adjust memory by cores but leave room for run program memory
    memcores = int(math.ceil(float(cores) / 5.0))
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms250m", "-Xmx2g"]),
                                        {"algorithm": {"memory_adjust": {"direction": "increase",
                                                                         "magnitude": memcores}}})
    # Write unique VCFs and BAMs to input file
    input_file = "%s-inputs.txt" % os.path.splitext(out_file)[0]
    with open(input_file, "w") as out_handle:
        out_handle.write("\n".join(sorted(list(set(vrn_files)))) + "\n")
        if todo == "square":
            out_handle.write("\n".join(sorted(list(set(bam_files)))) + "\n")
    variantcaller = tz.get_in(("config", "algorithm", "jointcaller"), data).replace("-joint", "")
    cmd = ["bcbio-variation-recall", todo] + jvm_opts + broad.get_default_jvm_opts() + \
          ["-c", cores, "-r", bamprep.region_to_gatk(region)]
    if todo == "square":
        cmd += ["--caller", variantcaller]
    cmd += [out_file, ref_file, input_file]
    do.run(cmd, "%s in region: %s" % (cmd, bamprep.region_to_gatk(region)))
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "768M")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        _check_samtools_version()
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with utils.curdir_tmpdir() as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.5":
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                    target_regions=target_regions, want_bcf=False)
    cmd = ("{mpileup} "
           "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
           " --vcf-sample-list {sample_list} --output-vcf --variants "
           "> {out_file}")
    cmd = cmd.format(**locals())
    do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                utils.symlink_plus(vcf_file, priority_vcf.replace(".vcf.gz", ".vcf"))
                vcfutils.bgzip_and_index(priority_vcf.replace(".vcf.gz", ".vcf"),
                                         data["config"], remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    jvm_opts = config_utils.adjust_opts(jvm_opts,
                                                        {"algorithm": {"memory_adjust":
                                                                       {"direction": "increase",
                                                                        "maximum": "30000M",
                                                                        "magnitude": dd.get_cores(data)}}})
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
        simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes to avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with utils.curdir_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       "  -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def _apply_vqsr(in_file, ref_file, recal_file, tranch_file,
                sensitivity_cutoff, filter_type, data):
    """Apply VQSR based on the specified tranche, returning a filtered VCF file.
    """
    broad_runner = broad.runner_from_config(data["config"])
    base, ext = utils.splitext_plus(in_file)
    out_file = "{base}-{filter}filter{ext}".format(base=base, ext=ext, filter=filter_type)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "ApplyRecalibration",
                      "-R", ref_file,
                      "--input", in_file,
                      "--out", tx_out_file,
                      "--tranches_file", tranch_file,
                      "--recal_file", recal_file,
                      "--mode", filter_type]
            resources = config_utils.get_resources("gatk_apply_recalibration", data["config"])
            opts = resources.get("options", [])
            if not opts:
                opts += ["--ts_filter_level", sensitivity_cutoff]
            params += opts
            broad_runner.run_gatk(params)
    return out_file
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """GenotypeGVCFs from a merged GenomicsDB input: GATK4.

    No core scaling -- not yet supported in GATK4.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            params = ["-T", "GenotypeGVCFs",
                      "--variant", "gendb://%s" % genomics_db,
                      "-R", dd.get_ref_file(data),
                      "--output", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            params += ["-ploidy", str(ploidy.get_ploidy([data], region))]
            # Avoid slow genotyping runtimes with improved quality score calculation in GATK4
            # https://gatkforums.broadinstitute.org/gatk/discussion/11471/performance-troubleshooting-tips-for-genotypegvcfs/p1
            params += ["--use-new-qual-calculator"]
            resources = config_utils.get_resources("gatk", data["config"])
            params += [str(x) for x in resources.get("options", [])]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    resources = config_utils.get_resources("salmon", dd.get_config(data))
    params = ""
    if resources.get("options") is not None:
        params = " ".join([str(x) for x in resources.get("options", [])])
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} {params} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon." % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
        # sailfish.sleuthify_sailfish(tx_out_dir)
    return out_file
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Retrieve snpEff base command line.
    """
    resources = config_utils.get_resources("snpeff", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3g"])
    # scale by cores, defaulting to 2x base usage to ensure we have enough memory
    # for single core runs to use with human genomes
    jvm_opts = config_utils.adjust_opts(jvm_opts,
                                        {"algorithm": {"memory_adjust":
                                                       {"direction": "increase",
                                                        "magnitude": max(2, dd.get_cores(data))}}})
    memory = " ".join(jvm_opts)
    snpeff = config_utils.get_program("snpEff", data["config"])
    java_args = "-Djava.io.tmpdir=%s" % utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    export = "unset JAVA_HOME && export PATH=%s:$PATH && " % (utils.get_java_binpath())
    cmd = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return cmd.format(**locals())
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    rext = "-%s" % region.replace(":", "_").replace("-", "_") if region else "full"
    out_s, out_p1, out_p2, out_o1, out_o2 = [os.path.join(work_dir, "%s%s-%s.fq.gz" %
                                                          (base_name, rext, fext))
                                             for fext in ["s1", "p1", "p2", "o1", "o2"]]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2, out_o1, out_o2) as \
                (tx_out_s, tx_out_p1, tx_out_p2, tx_out_o1, tx_out_o2):
            cram_file = objectstore.cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                   "gz=1 collate=1 colsbs={max_mem} exclude=SECONDARY,SUPPLEMENTARY "
                   "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O={tx_out_o1} O2={tx_out_o2} "
                   "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            do.run(cmd.format(**locals()), "CRAM to fastq %s" % region if region else "")
    return [[out_p1, out_p2, out_s]]
def _config_params(base_config, assoc_files, region, out_file, items):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    # set low frequency calling parameter if adjusted
    # to set other MuTect parameters on contamination, pass options to resources for mutect
    # --fraction_contamination --minimum_normal_allele_fraction
    min_af = tz.get_in(["algorithm", "min_allele_fraction"], base_config)
    if min_af:
        params += ["--minimum_mutation_cell_fraction", "%.2f" % (min_af / 100.0)]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    # Output quality scores
    if "--enable_qscore_output" not in params:
        params.append("--enable_qscore_output")
    # drf not currently supported in MuTect to turn off duplicateread filter
    # params += gatk.standard_cl_params(items)
    return params
def _novoalign_args_from_config(config, need_quality=True):
    """Select novoalign options based on configuration parameters.
    """
    if need_quality:
        qual_format = config["algorithm"].get("quality_format", "").lower()
        qual_flags = ["-F", "ILMFQ" if qual_format == "illumina" else "STDFQ"]
    else:
        qual_flags = []
    multi_mappers = config["algorithm"].get("multiple_mappers")
    if multi_mappers is True:
        multi_flag = "Random"
    elif isinstance(multi_mappers, basestring):
        multi_flag = multi_mappers
    else:
        multi_flag = "None"
    multi_flags = ["-r"] + multi_flag.split()
    resources = config_utils.get_resources("novoalign", config)
    # default arguments for improved variant calling based on
    # comparisons to reference materials: turn off soft clipping and recalibrate
    if resources.get("options") is None:
        extra_args = ["-o", "FullNW", "-k"]
    else:
        extra_args = [str(x) for x in resources.get("options", [])]
    return qual_flags + multi_flags + extra_args
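# Hedged illustration (not from the bcbio source): the resources block in the sample
# YAML decides whether the "-o FullNW -k" defaults above are used at all. A
# hypothetical override such as:
#
#   resources:
#     novoalign:
#       options: ["-o", "FullNW"]
#
# keeps full Needleman-Wunsch alignment but drops the "-k" recalibration flag, since
# user-supplied options replace the defaults rather than extending them.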
def _run_break_point_inspector(data, variant_file, paired):
    output_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(variant_file)[0], "bpi")
    if not utils.file_exists(output_vcf):
        with file_transaction(data, output_vcf) as tx_output_vcf:
            cores = dd.get_num_cores(data)
            resources = config_utils.get_resources("break-point-inspector", data["config"])
            memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms1000m", "-Xmx2000m"]),
                                              {"algorithm": {"memory_adjust":
                                                             {"magnitude": cores,
                                                              "direction": "increase"}}})
            cmd = ["break-point-inspector"]
            cmd += memory
            cmd += ["-vcf", variant_file]
            if paired:
                cmd += ["-ref", paired.normal_bam, "-tumor", paired.tumor_bam]
            cmd += ["-output_vcf", tx_output_vcf]
            do.run(cmd, "Running Break Point Inspector for Manta SV calls")
    return output_vcf
def _run_qualimap(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if not os.path.exists(report_file):
        ds_bam = bam.downsample(bam_file, data, 1e7)
        bam_file = ds_bam if ds_bam else bam_file
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        resources = config_utils.get_resources("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem}")
        species = data["genome_resources"]["aliases"].get("ensembl", "").upper()
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)
def _config_params(base_config, assoc_files, region, out_file):
    """Add parameters based on configuration variables, associated files and genomic regions.
    """
    params = []
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    params += ["--fraction_contamination", contamination]
    dbsnp = assoc_files.get("dbsnp")
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    cosmic = assoc_files.get("cosmic")
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = base_config["algorithm"].get("variant_regions")
    region = subset_variant_regions(variant_regions, region, out_file)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    resources = config_utils.get_resources("mutect", base_config)
    if resources.get("options") is not None:
        params += [str(x) for x in resources.get("options", [])]
    return params
def _get_fgbio_options(data, umi_method):
    """Get adjustable, through resources, or default options for fgbio.
    """
    group_opts = ["--edits", "--min-map-q"]
    cons_opts = ["--min-input-base-quality"]
    if umi_method != "paired":
        cons_opts += ["--min-reads", "--min-consensus-base-quality"]
    defaults = {"--min-reads": "1",
                "--min-map-q": "1",
                "--min-consensus-base-quality": "13",
                "--min-input-base-quality": "2",
                "--edits": "1"}
    ropts = config_utils.get_resources("fgbio", data["config"]).get("options", [])
    assert len(ropts) % 2 == 0, "Expect even number of options for fgbio: %s" % ropts
    defaults.update(dict(tz.partition(2, ropts)))
    group_out = " ".join(["%s %s" % (x, defaults[x]) for x in group_opts])
    cons_out = " ".join(["%s %s" % (x, defaults[x]) for x in cons_opts])
    if umi_method != "paired":
        cons_out += " --output-per-base-tags=false"
    return group_out, cons_out
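# Hedged illustration (not from the bcbio source): fgbio resources options are a flat,
# even-length list of flag/value pairs; tz.partition(2, ...) pairs them up so user
# values override the defaults above. The "--min-reads" override is hypothetical.
from toolz import partition

defaults = {"--min-reads": "1", "--edits": "1"}
ropts = ["--min-reads", "2"]
defaults.update(dict(partition(2, ropts)))
assert defaults == {"--min-reads": "2", "--edits": "1"}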
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic from:
    https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/
    Then converts a tab delimited set of outputs into interleaved fastq.

    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names. If these are not present
    in the initial fastq may need to expand awk code to do this.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    rg_info = novoalign.get_rg_info(names)
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (r"paste <({fastq_file} | paste - - - -) "
                            r"<({pair_file} | paste - - - -) | "
                            r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                            r"""{{ """
                            r"""split($1, P1, " "); split($5, P5, " "); """
                            r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                            r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                            r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
        else:
            stream_input = fastq_file[2:-1]
    else:
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = (r"paste <(zcat {fastq_file} | paste - - - -) "
                            r"<(zcat {pair_file} | paste - - - -) | "
                            r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                            r"""{{ """
                            r"""split($1, P1, " "); split($5, P5, " "); """
                            r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                            r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                            r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
        else:
            stream_input = "zcat {fastq_file}"
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            if pair_file:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(**locals())
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tmp_dir} && unset JAVA_HOME && {stream_input} | "
                   "snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            do.run(cmd.format(**locals()) + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
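# Hedged illustration (not from the bcbio source): a pure-Python sketch of the read
# name handling the awk above performs before SNAP sees the interleaved records --
# names are truncated at the first space and given a /1 or /2 suffix unless the
# suffix is already there, and any remaining spaces become underscores.
def _sketch_fix_name(name, suffix):
    if not name.endswith(suffix):
        name = name.split(" ")[0] + suffix
    return name.replace(" ", "_")

assert _sketch_fix_name("@read1 extra info", "/1") == "@read1/1"
assert _sketch_fix_name("@read1 extra/2", "/2") == "@read1_extra/2"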
def _run_purecn_normaldb(paired, out):
    """Run PureCN with normaldb and native segmentation.

    paired is one t/n pair or a tumor-only sample.
    """
    sample = utils.to_single_data(paired.tumor_data)
    bed_file = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd()
    purecn_r = utils.R_package_script("PureCN", "extdata/PureCN.R", env="base")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    bam_file = dd.get_align_bam(sample)
    # germline and somatic - annotated variants with filters assigned
    variants_vcf = tz.get_in(["variants"], sample)[0].get("germline")
    # in a T/N case, there is no germline file - vrn file with all variants
    if not variants_vcf:
        variants_vcf = tz.get_in(["variants"], sample)[0].get("vrn_file")
    normaldb = tz.get_in(["config", "algorithm", "background", "cnv_reference",
                          "purecn_normaldb"], sample)
    mappingbiasfile = tz.get_in(["config", "algorithm", "background", "cnv_reference",
                                 "purecn_mapping_bias"], sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [rscript, purecn_r,
           "--out", work_dir,
           "--tumor", sample_coverage,
           "--sampleid", sample_name,
           "--vcf", variants_vcf,
           "--normaldb", normaldb,
           "--mapping-bias-file", mappingbiasfile,
           "--intervals", intervals,
           "--snp-blacklist", simple_repeat_bed,
           "--genome", genome,
           "--force", "--post-optimize",
           "--seed", "123", "--bootstrapn", "500",
           "--cores", dd.get_num_cores(sample)]
    resources = config_utils.get_resources("purecn", sample)
    if "options" in resources:
        cmd += [str(x) for x in resources.get("options", [])]
    # it is not recommended to use a matched normal sample in PureCN analysis,
    # because then it skips PON coverage normalization and denoising steps!
    # but still, if it is supplied, we use it
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"], normal_sample)
            cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="base"),
                                                              utils.get_R_exports(env="base"),
                                                              " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
            logger.debug("Saved PureCN output to " + work_dir)
        except subprocess.CalledProcessError as msg:
            logger.info("PureCN failed: %s" % msg)
    out_base, out, all_files = _get_purecn_files(paired, work_dir, require_exist=True)
    return out
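# Standalone sketch of how the PureCN command above is finally assembled: the R
# library path and R exports are prepended so the packaged PureCN.R script runs
# against bcbio's base R environment. The paths and exports below are
# hypothetical placeholders, not values bcbio produces.
def build_purecn_cmd_line(r_sitelib, r_exports, cmd_parts):
    return "export R_LIBS_USER=%s && %s && %s" % (r_sitelib, r_exports,
                                                  " ".join(str(x) for x in cmd_parts))


print(build_purecn_cmd_line("/opt/R/library", "export PATH=/opt/R/bin:$PATH",
                            ["Rscript", "PureCN.R", "--sampleid", "tumor1", "--cores", 8]))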
def _get_options_from_config(config): opts = [] resources = config_utils.get_resources("hisat2", config) if resources.get("options"): opts += [str(x) for x in resources["options"]] return opts
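# Sketch of the configuration shape this helper reads: a "resources" section with
# a "hisat2" entry carrying an "options" list. The flag values shown are
# illustrative, not bcbio defaults.
config = {"resources": {"hisat2": {"options": ["--no-unal", "--dta"]}}}
# config_utils.get_resources("hisat2", config) would surface {"options": [...]},
# so _get_options_from_config(config) yields ["--no-unal", "--dta"], ready to be
# spliced into the hisat2 command line.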
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Call variation with GATK's MuTect2. This requires the full non open-source version of GATK 3.5+. """ if out_file is None: out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0] if not utils.file_exists(out_file): paired = vcfutils.get_paired_bams(align_bams, items) broad_runner = broad.runner_from_config(items[0]["config"]) gatk_type = broad_runner.gatk_type() f1r2_file = None _prep_inputs(align_bams, ref_file, items) with file_transaction(items[0], out_file) as tx_out_file: params = [ "-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2", "--annotation", "ClippingRankSumTest", "--annotation", "DepthPerSampleHC" ] if gatk_type == "gatk4": params += ["--reference", ref_file] else: params += ["-R", ref_file] for a in annotation.get_gatk_annotations( items[0]["config"], include_baseqranksum=False): params += ["--annotation", a] # Avoid issues with BAM CIGAR reads that GATK doesn't like if gatk_type == "gatk4": params += ["--read-validation-stringency", "LENIENT"] params += _add_tumor_params(paired, items, gatk_type) params += _add_region_params(region, out_file, items, gatk_type) if all(is_paired(bam) for bam in align_bams) and ("mutect2_readmodel" in utils.get_in( items[0], "config", "tools_on")): orientation_filter = True else: orientation_filter = False if gatk_type == "gatk4" and orientation_filter: f1r2_file = "{}-f1r2.tar.gz".format( utils.splitext_plus(out_file)[0]) params += ["--f1r2-tar-gz", f1r2_file] # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm # Not yet clear how this helps or hurts in a general case. #params += _add_assoc_params(assoc_files) resources = config_utils.get_resources("mutect2", items[0]["config"]) if "options" in resources: params += [str(x) for x in resources.get("options", [])] assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \ "Require full version of GATK 3.5+ for mutect2 calling" broad_runner.new_resources("mutect2") gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file)) if gatk_type == "gatk4": tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus( out_file) tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus( tx_out_file) if orientation_filter: tx_f1r2_file = "{}-read-orientation-model.tar.gz" tx_f1r2_file = tx_f1r2_file.format( utils.splitext_plus(f1r2_file)[0]) tx_read_orient_cmd = _mutect2_read_filter( broad_runner, f1r2_file, tx_f1r2_file) filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file, tx_raw_file, ref_file, tx_f1r2_file) else: filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file, tx_raw_file, ref_file) if orientation_filter: cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {tx_read_orient_cmd} && {filter_cmd}" else: cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}" else: tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file) cmd = "{gatk_cmd} > {tx_raw_file}" do.run(cmd.format(**locals()), "MuTect2") out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file) return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def run_vep(in_file, data): """Annotate input VCF file with Ensembl variant effect predictor. """ if not vcfutils.vcf_has_variants(in_file): return None out_file = utils.append_stem(in_file, "-vepeffects") assert in_file.endswith(".gz") and out_file.endswith(".gz") if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: vep_dir, ensembl_name = prep_vep_cache( data["genome_build"], tz.get_in(["reference", "fasta", "base"], data)) if vep_dir: cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1) fork_args = ["--fork", str(cores)] if cores > 1 else [] vep = config_utils.get_program("vep", data["config"]) is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False) # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow if dd.get_ref_file_compressed(data): hgvs_compatible = True config_args = ["--fasta", dd.get_ref_file_compressed(data)] else: hgvs_compatible = False config_args = ["--fasta", dd.get_ref_file(data)] if is_human: plugin_fns = { "loftee": _get_loftee, "maxentscan": _get_maxentscan, "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion } plugins = ["loftee"] if "vep_splicesite_annotations" in dd.get_tools_on(data): # "genesplicer" too unstable so currently removed plugins += ["maxentscan", "spliceregion"] for plugin in plugins: plugin_args = plugin_fns[plugin](data) config_args += plugin_args config_args += ["--sift", "b", "--polyphen", "b"] if hgvs_compatible: config_args += ["--hgvs", "--shift_hgvs", "1"] if (dd.get_effects_transcripts(data).startswith("canonical") or tz.get_in( ("config", "algorithm", "clinical_reporting"), data)): config_args += ["--pick_allele"] if ensembl_name.endswith("_merged"): config_args += ["--merged"] ensembl_name = ensembl_name.replace("_merged", "") resources = config_utils.get_resources("vep", data["config"]) extra_args = [str(x) for x in resources.get("options", [])] cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \ ["--species", ensembl_name, "--no_stats", "--cache", "--offline", "--dir", vep_dir, "--symbol", "--numbers", "--biotype", "--total_length", "--canonical", "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory", "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_gnomad", "--pubmed", "--variant_class", "--allele_number"] + config_args perl_exports = utils.get_perl_exports() # Remove empty fields (';;') which can cause parsing errors downstream cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % ( perl_exports, " ".join(cmd), tx_out_file) do.run(cmd, "Ensembl variant effect predictor", data) if utils.file_exists(out_file): vcfutils.bgzip_and_index(out_file, data["config"]) return out_file
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                config_args, config_fields, prediction_fields = [], [], []
                if is_human:
                    plugin_fns = {"dbnsfp": _get_dbnsfp, "loftee": _get_loftee,
                                  "dbscsnv": _get_dbscsnv, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer}
                    plugins = tz.get_in(("config", "resources", "vep", "plugins"), data,
                                        ["dbnsfp", "loftee"])
                    for plugin in plugins:
                        plugin_args, plugin_fields = plugin_fns[plugin](data)
                        config_args += plugin_args
                        config_fields += plugin_fields
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    prediction_fields += ["PolyPhen", "SIFT"]
                    # Use HGVS by default, requires indexing the reference genome
                    config_args += ["--hgvs", "--shift_hgvs", "1",
                                    "--fasta", dd.get_ref_file(data)]
                    config_fields += ["HGVSc", "HGVSp"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick"]
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL",
                              "Feature", "EXON"] + prediction_fields + \
                             ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache", "--offline",
                       "--dir", vep_dir, "--symbol", "--numbers", "--biotype", "--total_length",
                       "--canonical", "--gene_phenotype", "--ccds",
                       "--fields", ",".join(std_fields + config_fields)] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
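# Small standalone sketch of the plugin dispatch pattern used in run_vep above:
# each plugin name maps to a callable returning (extra_args, extra_fields), and
# the requested plugins are looked up and accumulated. The two toy plugin
# functions below are illustrative stand-ins, not bcbio's real _get_* helpers.
def _toy_loftee(data):
    return ["--plugin", "LoF"], ["LoF", "LoF_filter"]


def _toy_maxentscan(data):
    return ["--plugin", "MaxEntScan"], ["MaxEntScan_diff"]


def collect_plugin_args(plugins, data):
    plugin_fns = {"loftee": _toy_loftee, "maxentscan": _toy_maxentscan}
    config_args, config_fields = [], []
    for plugin in plugins:
        plugin_args, plugin_fields = plugin_fns[plugin](data)
        config_args += plugin_args
        config_fields += plugin_fields
    return config_args, config_fields


print(collect_plugin_args(["loftee", "maxentscan"], {}))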
def align(fastq_file, pair_file, ref_file, names, align_dir, data): if not ref_file: logger.error( "STAR index not found. We don't provide the STAR indexes " "by default because they are very large. You can install " "the index for your genome with: bcbio_nextgen.py upgrade " "--aligners star --genomes genome-build-name --data") sys.exit(1) max_hits = 10 srna = True if data["analysis"].lower().startswith( "smallrna-seq") else False srna_opts = "" if srna: max_hits = 1000 srna_opts = "--alignIntronMax 1" config = data["config"] star_dirs = _get_star_dirnames(align_dir, data, names) if file_exists(star_dirs.final_out): data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data) return data star_path = config_utils.get_program("STAR", config) def _unpack_fastq(f): """Use process substitution instead of readFilesCommand for gzipped inputs. Prevents issues on shared filesystems that don't support FIFO: https://github.com/alexdobin/STAR/issues/143 """ if f and is_gzipped(f): return "<(gunzip -c %s)" % f else: return f fastq_files = (" ".join([ _unpack_fastq(fastq_file), _unpack_fastq(pair_file) ]) if pair_file else _unpack_fastq(fastq_file)) num_cores = dd.get_num_cores(data) gtf_file = dd.get_gtf_file(data) if ref_file.endswith("chrLength"): ref_file = os.path.dirname(ref_file) with file_transaction(data, align_dir) as tx_align_dir: tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names) tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames safe_makedir(tx_align_dir) safe_makedir(tx_out_dir) cmd = ( "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} " "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} " "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} " "--outStd BAM_Unsorted {srna_opts} " "--limitOutSJcollapsed 2000000 " "--outSAMtype BAM Unsorted " "--outSAMmapqUnique 60 " "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS)) cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else "" cmd += _read_group_option(names) if dd.get_fusion_caller(data): cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 " "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 " "--chimScoreSeparation 5 ") if "oncofuse" in dd.get_fusion_caller(data): cmd += "--chimOutType Junctions " else: cmd += "--chimOutType WithinBAM " strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"), "unstranded").lower() if strandedness == "unstranded" and not srna: cmd += " --outSAMstrandField intronMotif " if not srna: cmd += " --quantMode TranscriptomeSAM " resources = config_utils.get_resources("star", data["config"]) if resources.get("options", []): cmd += " " + " ".join( [str(x) for x in resources.get("options", [])]) cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out) cmd += " > {tx_final_out} " run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file) do.run(cmd.format(**locals()), run_message, None) data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data) return data
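# Small illustration of the process-substitution trick in _unpack_fastq above:
# gzipped FASTQs are handed to STAR as "<(gunzip -c file)" rather than via
# --readFilesCommand, which avoids FIFO problems on some shared filesystems.
# This sketch uses a simplified extension check in place of is_gzipped.
def unpack_fastq(path):
    return "<(gunzip -c %s)" % path if path and path.endswith(".gz") else path


print(unpack_fastq("sample_1.fastq.gz"))  # <(gunzip -c sample_1.fastq.gz)
print(unpack_fastq("sample_1.fastq"))     # sample_1.fastq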
def align_bam(in_bam, ref_file, names, align_dir, data): """Perform direct alignment of an input BAM file with BWA using pipes. This avoids disk IO by piping between processes: - samtools sort of input BAM to queryname - bedtools conversion to interleaved FASTQ - bwa-mem alignment - samtools conversion to BAM - samtools sort to coordinate """ config = data["config"] out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"])) samtools = config_utils.get_program("samtools", config) bedtools = config_utils.get_program("bedtools", config) resources = config_utils.get_resources("samtools", config) num_cores = config["algorithm"].get("num_cores", 1) # adjust memory for samtools since used for input and output max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3, "decrease").upper() if not utils.file_exists(out_file): with tx_tmpdir(data) as work_dir: with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file): if not hla_on(data) or needs_separate_hla(data): bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, "-", with_hla=False) else: bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, "-", with_hla=True) tx_out_prefix = os.path.splitext(tx_out_file)[0] prefix1 = "%s-in1" % tx_out_prefix cmd = ( "unset JAVA_HOME && " "{samtools} sort -n -l 1 -@ {num_cores} -m {max_mem} {in_bam} -T {prefix1} " "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout " "| {bwa_cmd} | ") cmd = cmd.format(**locals()) + tobam_cl do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None, [ do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam) ]) data["work_bam"] = out_file hla_file = "HLA-" + out_file if needs_separate_hla(data) and not utils.file_exists(hla_file): with tx_tmpdir(data) as work_dir: with postalign.tobam_cl(data, hla_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file): bwa_cmd = _get_bwa_mem_cmd(data, hla_file, ref_file, "-", with_hla=True) tx_out_prefix = os.path.splitext(tx_out_file)[0] prefix1 = "%s-in1" % tx_out_prefix cmd = ( "unset JAVA_HOME && " "{samtools} sort -n -l 1 -@ {num_cores} -m {max_mem} {in_bam} -T {prefix1} " "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout " "| {bwa_cmd} | ") cmd = cmd.format(**locals()) + tobam_cl do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None, [ do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam) ]) hla_file = _align_mem_hla(fastq_file, pair_file, ref_file, hla_file, names, rg_info, data) data["hla_bam"] = hla_file return data
def run(bam_file, data, out_dir): """Run qualimap to assess alignment quality metrics. """ # Qualimap results should be saved to a directory named after sample. # MultiQC (for parsing additional data) picks the sample name after the dir as follows: # <sample name>/raw_data_qualimapReport/insert_size_histogram.txt results_dir = os.path.join(out_dir, dd.get_sample_name(data)) resources = config_utils.get_resources("qualimap", data["config"]) options = " ".join(resources.get("options", "")) results_file = os.path.join(results_dir, "genome_results.txt") report_file = os.path.join(results_dir, "qualimapReport.html") utils.safe_makedir(results_dir) pdf_file = "qualimapReport.pdf" if not utils.file_exists(results_file) and not utils.file_exists( os.path.join(results_dir, pdf_file)): if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []): logger.info("Full qualimap analysis for %s may be slow." % bam_file) ds_bam = bam_file else: ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) bam_file = ds_bam if ds_bam else bam_file if options.find("PDF") > -1: options = "%s -outfile %s" % (options, pdf_file) num_cores = data["config"]["algorithm"].get("num_cores", 1) qualimap = config_utils.get_program("qualimap", data["config"]) max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores) with file_transaction(data, results_dir) as tx_results_dir: utils.safe_makedir(tx_results_dir) export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % ( utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir) cmd = ( "unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} " "--skip-duplicated --skip-dup-mode 0 " "-nt {num_cores} {options}") species = None if (tz.get_in(("genome_resources", "aliases", "human"), data, "") or dd.get_genome_build(data).startswith(("hg", "GRCh"))): species = "HUMAN" elif dd.get_genome_build(data).startswith(("mm", "GRCm")): species = "MOUSE" if species in ["HUMAN", "MOUSE"]: cmd += " -gd {species}" regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [ None, False, "None" ] else dd.get_variant_regions_merged(data)) if regions: regions = bedutils.merge_overlaps( bedutils.clean_file(regions, data), data) bed6_regions = _bed_to_bed6(regions, out_dir) cmd += " -gff {bed6_regions}" bcbio_env = utils.get_bcbio_env() do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env) tx_results_file = os.path.join(tx_results_dir, "genome_results.txt") cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % ( dd.get_sample_name(data), tx_results_file) do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data))) # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir): base_results_file = os.path.join(out_dir, os.path.basename(results_file)) shutil.copyfile(results_file, base_results_file) return { "base": base_results_file, "secondary": _find_qualimap_secondary_files(results_dir, base_results_file) }
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Call variation with GATK's HaplotypeCaller. This requires the full non open-source version of GATK. """ if out_file is None: out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0] if not utils.file_exists(out_file): num_cores = dd.get_num_cores(items[0]) broad_runner, params = \ _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores) gatk_type = broad_runner.gatk_type() assert gatk_type in ["restricted", "gatk4"], \ "Require full version of GATK 2.4+, or GATK4 for haplotype calling" with file_transaction(items[0], out_file) as tx_out_file: if _use_spark(num_cores, gatk_type): params += [ "-T", "HaplotypeCallerSpark", "--spark-master", "local[%s]" % num_cores, "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file), "--conf", "spark.driver.host=localhost", "--conf", "spark.network.timeout=800", "--conf", "spark.executor.heartbeatInterval=100" ] else: params += ["-T", "HaplotypeCaller"] params += [ "--annotation", "ClippingRankSumTest", "--annotation", "DepthPerSampleHC" ] # Enable hardware based optimizations in GATK 3.1+ if LooseVersion( broad_runner.gatk_major_version()) >= LooseVersion("3.1"): if _supports_avx(): # Scale down HMM thread default to avoid overuse of cores # https://github.com/bcbio/bcbio-nextgen/issues/2442 if gatk_type == "gatk4": params += ["--native-pair-hmm-threads", "1"] # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE # GATK3 needs to be explicitly set else: params += [ "--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING" ] # Prepare gVCFs if doing joint calling is_joint = False if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items): is_joint = True if gatk_type == "gatk4": params += ["--emit-ref-confidence", "GVCF"] else: params += ["--emitRefConfidence", "GVCF"] params += [ "--variant_index_type", "LINEAR", "--variant_index_parameter", "128000" ] # Set GQ banding to not be single GQ resolution # No recommended default but try to balance resolution and size # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands for boundary in [10, 20, 30, 40, 60, 80]: params += ["-GQB", str(boundary)] # Enable non-diploid calling in GATK 3.3+ if LooseVersion( broad_runner.gatk_major_version()) >= LooseVersion("3.3"): # GenomicsDB does not support non-diploid samples in GATK4 joint calling # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4 if not is_joint and gatk_type == "gatk4": params += [ "-ploidy", str(ploidy.get_ploidy(items, region)) ] resources = config_utils.get_resources("gatk-haplotype", items[0]["config"]) if "options" in resources: params += [str(x) for x in resources.get("options", [])] if gatk_type == "gatk4": # GATK4 Spark calling does not support bgzipped output, use plain VCFs if is_joint and _use_spark(num_cores, gatk_type): tx_out_file = tx_out_file.replace(".vcf.gz", ".vcf") params += ["--output", tx_out_file] else: params += ["-o", tx_out_file] broad_runner.new_resources("gatk-haplotype") memscale = { "magnitude": 0.9 * num_cores, "direction": "increase" } if num_cores > 1 else None try: broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale, parallel_gc=_use_spark( num_cores, gatk_type)) except subprocess.CalledProcessError as msg: # Spark failing on regions without any reads, write an empty VCF instead # 
https://github.com/broadinstitute/gatk/issues/4234 if (_use_spark(num_cores, gatk_type) and str( msg ).find("java.lang.UnsupportedOperationException: empty collection" ) >= 0 and str(msg).find("at org.apache.spark.rdd.RDD") >= 0): vcfutils.write_empty_vcf( tx_out_file, samples=[dd.get_sample_name(d) for d in items]) else: raise if tx_out_file.endswith(".vcf"): vcfutils.bgzip_and_index(tx_out_file, items[0]["config"]) # avoid bug in GATK where files can get output as non-compressed if out_file.endswith(".gz") and not os.path.exists(out_file + ".tbi"): with open(out_file, "r") as in_handle: is_plain_text = in_handle.readline().startswith("##fileformat") if is_plain_text: text_out_file = out_file out_file = out_file.replace(".vcf.gz", ".vcf") shutil.move(text_out_file, out_file) return vcfutils.bgzip_and_index(out_file, items[0]["config"])
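# Standalone sketch of the workaround at the end of haplotype_caller above: GATK
# can occasionally write a plain-text VCF to a path ending in .vcf.gz, so the
# first line is checked for the VCF header and, when plain text is found, the
# file is renamed to .vcf so it can be re-compressed and indexed.
import shutil


def fix_uncompressed_vcf(out_file):
    """Return a correctly named VCF path, renaming plain text mislabelled as .gz."""
    with open(out_file, "r") as in_handle:
        is_plain_text = in_handle.readline().startswith("##fileformat")
    if not is_plain_text:
        return out_file
    plain_file = out_file.replace(".vcf.gz", ".vcf")
    shutil.move(out_file, plain_file)
    return plain_file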
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Call variation with GATK's HaplotypeCaller. This requires the full non open-source version of GATK. """ if out_file is None: out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0] if not utils.file_exists(out_file): num_cores = dd.get_num_cores(items[0]) broad_runner, params = \ _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores) gatk_type = broad_runner.gatk_type() assert gatk_type in ["restricted", "gatk4"], \ "Require full version of GATK 2.4+, or GATK4 for haplotype calling" with file_transaction(items[0], out_file) as tx_out_file: if num_cores > 1 and gatk_type == "gatk4": params += [ "-T", "HaplotypeCallerSpark", "--sparkMaster", "local[%s]" % num_cores, "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file) ] else: params += ["-T", "HaplotypeCaller"] params += [ "--annotation", "ClippingRankSumTest", "--annotation", "DepthPerSampleHC" ] if gatk_type == "gatk4": params += ["--output", tx_out_file] else: params += ["-o", tx_out_file] # Enable hardware based optimizations in GATK 3.1+ if LooseVersion( broad_runner.gatk_major_version()) >= LooseVersion("3.1"): # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE if not gatk_type == "gatk4" and _supports_avx(): params += [ "--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING" ] # Prepare gVCFs if doing joint calling is_joint = False if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items): is_joint = True params += ["--emitRefConfidence", "GVCF"] if not gatk_type == "gatk4": params += [ "--variant_index_type", "LINEAR", "--variant_index_parameter", "128000" ] # Set GQ banding to not be single GQ resolution # No recommended default but try to balance resolution and size # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands for boundary in [10, 20, 30, 40, 60, 80]: params += ["-GQB", str(boundary)] # Enable non-diploid calling in GATK 3.3+ if LooseVersion( broad_runner.gatk_major_version()) >= LooseVersion("3.3"): # GenomicsDB does not support non-diploid samples in GATK4 joint calling # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4 if not is_joint and gatk_type == "gatk4": params += [ "-ploidy", str(ploidy.get_ploidy(items, region)) ] resources = config_utils.get_resources("gatk-haplotype", items[0]["config"]) if "options" in resources: params += [str(x) for x in resources.get("options", [])] broad_runner.new_resources("gatk-haplotype") memscale = { "magnitude": 0.9 * num_cores, "direction": "increase" } if num_cores > 1 else None broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale, parallel_gc=(num_cores > 1 and gatk_type == "gatk4")) return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def calculate(parallel, items, sysinfo, config, multiplier=1, max_multicore=None): """Determine cores and workers to use for this stage based on used programs. multiplier specifies the number of regions items will be split into during processing. max_multicore specifies an optional limit on the maximum cores. Can use to force single core processing during specific tasks. sysinfo specifies cores and memory on processing nodes, allowing us to tailor jobs for available resources. """ assert len(items) > 0, "Finding job resources but no items to process" all_cores = [] all_memory = [] # Provide 100Mb of additional memory for the system system_memory = 0.10 algs = [config_utils.get_algorithm_config(x) for x in items] progs = _get_resource_programs(parallel.get("progs", []), algs) # Calculate cores for prog in progs: resources = config_utils.get_resources(prog, config) all_cores.append(resources.get("cores", 1)) if len(all_cores) == 0: all_cores.append(1) cores_per_job = max(all_cores) if max_multicore: cores_per_job = min(cores_per_job, max_multicore) if "cores" in sysinfo: cores_per_job = min(cores_per_job, int(sysinfo["cores"])) total = parallel["cores"] if total > cores_per_job: num_jobs = total // cores_per_job else: num_jobs, cores_per_job = 1, total # Calculate memory. Use 1Gb memory usage per core as min baseline if not specified for prog in progs: resources = config_utils.get_resources(prog, config) memory = _get_prog_memory(resources, cores_per_job) if memory: all_memory.append(memory) if len(all_memory) == 0: all_memory.append(1) memory_per_core = max(all_memory) logger.debug("Resource requests: {progs}; memory: {memory}; cores: {cores}".format( progs=", ".join(progs), memory=", ".join("%.2f" % x for x in all_memory), cores=", ".join(str(x) for x in all_cores))) cores_per_job, memory_per_core = _ensure_min_resources(progs, cores_per_job, memory_per_core, min_memory=parallel.get("ensure_mem", {})) if cores_per_job == 1: memory_per_job = "%.2f" % memory_per_core num_jobs, mem_pct = _scale_jobs_to_memory(num_jobs, memory_per_core, sysinfo) else: cores_per_job, memory_per_job, mem_pct = _scale_cores_to_memory(cores_per_job, memory_per_core, sysinfo, system_memory) # For local runs with multiple jobs and multiple cores, potentially scale jobs down if num_jobs > 1 and parallel.get("type") == "local": memory_per_core = float(memory_per_job) / cores_per_job num_jobs, _ = _scale_jobs_to_memory(num_jobs, memory_per_core, sysinfo) # do not overschedule if we don't have extra items to process num_jobs = min(num_jobs, len(items) * multiplier) logger.debug("Configuring %d jobs to run, using %d cores each with %sg of " "memory reserved for each job" % (num_jobs, cores_per_job, str(memory_per_job))) parallel = copy.deepcopy(parallel) parallel["cores_per_job"] = cores_per_job parallel["num_jobs"] = num_jobs parallel["mem"] = str(memory_per_job) parallel["mem_pct"] = "%.2f" % mem_pct parallel["system_cores"] = sysinfo.get("cores", 1) return parallel
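# Worked example (standalone and simplified) of the core-splitting arithmetic in
# calculate above: jobs receive the integer quotient of the total allocation over
# the per-job request, clamped by max_multicore and the per-node core count, and
# fall back to a single job using everything when the request exceeds the total.
def split_cores(total_cores, cores_per_job, max_multicore=None, node_cores=None):
    if max_multicore:
        cores_per_job = min(cores_per_job, max_multicore)
    if node_cores:
        cores_per_job = min(cores_per_job, node_cores)
    if total_cores > cores_per_job:
        return total_cores // cores_per_job, cores_per_job
    return 1, total_cores


print(split_cores(16, 4))                 # (4, 4): four 4-core jobs from 16 cores
print(split_cores(16, 6, node_cores=4))   # (4, 4): a 6-core request clamped to 4-core nodes
print(split_cores(4, 8))                  # (1, 4): request larger than the allocation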
def run_vep(in_file, data): """Annotate input VCF file with Ensembl variant effect predictor. """ if not vcfutils.vcf_has_variants(in_file): return None out_file = utils.append_stem(in_file, "-vepeffects") assert in_file.endswith(".gz") and out_file.endswith(".gz") if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: vep_dir, ensembl_name = prep_vep_cache( data["genome_build"], tz.get_in(["reference", "fasta", "base"], data)) if vep_dir: cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1) fork_args = ["--fork", str(cores)] if cores > 1 else [] vep = config_utils.get_program("variant_effect_predictor.pl", data["config"]) is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False) if is_human: dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data) loftee_args, loftee_fields = _get_loftee(data) prediction_args = ["--sift", "b", "--polyphen", "b"] prediction_fields = ["PolyPhen", "SIFT"] else: dbnsfp_args, dbnsfp_fields = [], [] loftee_args, loftee_fields = [], [] prediction_args, prediction_fields = [], [] if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False): # In case of clinical reporting, we need one and only one variant per gene # http://useast.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick # Also use hgvs reporting but requires indexing the reference file clinical_args = [ "--pick", "--hgvs", "--shift_hgvs", "1", "--fasta", dd.get_ref_file(data) ] clinical_fields = ["HGVSc", "HGVSp"] else: clinical_args, clinical_fields = [], [] std_fields = [ "Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature", "EXON" ] + prediction_fields + [ "Protein_position", "BIOTYPE", "CANONICAL", "CCDS" ] resources = config_utils.get_resources("vep", data["config"]) extra_args = [str(x) for x in resources.get("options", [])] cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \ ["--species", ensembl_name, "--no_stats", "--cache", "--offline", "--dir", vep_dir, "--symbol", "--numbers", "--biotype", "--total_length", "--canonical", "--gene_phenotype", "--ccds", "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields + clinical_fields)] + \ prediction_args + dbnsfp_args + loftee_args + clinical_args perl_exports = utils.get_perl_exports() # Remove empty fields (';;') which can cause parsing errors downstream cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % ( perl_exports, " ".join(cmd), tx_out_file) do.run(cmd, "Ensembl variant effect predictor", data) if utils.file_exists(out_file): vcfutils.bgzip_and_index(out_file, data["config"]) return out_file
def _run_vqsr(in_file, ref_file, vrn_files, sensitivity_cutoff, filter_type, data): """Run variant quality score recalibration. """ cutoffs = [ "100.0", "99.99", "99.98", "99.97", "99.96", "99.95", "99.94", "99.93", "99.92", "99.91", "99.9", "99.8", "99.7", "99.6", "99.5", "99.0", "98.0", "90.0" ] if sensitivity_cutoff not in cutoffs: cutoffs.append(sensitivity_cutoff) cutoffs.sort() broad_runner = broad.runner_from_config(data["config"]) gatk_type = broad_runner.gatk_type() base = utils.splitext_plus(in_file)[0] recal_file = ("%s-vqsrrecal.vcf.gz" % base) if gatk_type == "gatk4" else ("%s.recal" % base) tranches_file = "%s.tranches" % base plot_file = "%s-plots.R" % base if not utils.file_exists(recal_file): with file_transaction(data, recal_file, tranches_file, plot_file) as (tx_recal, tx_tranches, tx_plot_file): params = [ "-T", "VariantRecalibrator", "-R", ref_file, "--mode", filter_type ] if gatk_type == "gatk4": params += [ "--variant", in_file, "--output", tx_recal, "--tranches-file", tx_tranches, "--rscript-file", tx_plot_file ] else: params += [ "--input", in_file, "--recal_file", tx_recal, "--tranches_file", tx_tranches, "--rscript_file", tx_plot_file ] params += _get_vqsr_training(filter_type, vrn_files, gatk_type) resources = config_utils.get_resources("gatk_variant_recalibrator", data["config"]) opts = resources.get("options", []) if not opts: for cutoff in cutoffs: opts += ["-tranche", str(cutoff)] for a in _get_vqsr_annotations(filter_type, data): opts += ["-an", a] params += opts cores = dd.get_cores(data) memscale = { "magnitude": 0.9 * cores, "direction": "increase" } if cores > 1 else None try: broad_runner.new_resources("gatk-vqsr") broad_runner.run_gatk(params, log_error=False, memscale=memscale, parallel_gc=True) except: # Can fail to run if not enough values are present to train. return None, None if gatk_type == "gatk4": vcfutils.bgzip_and_index(recal_file, data["config"]) return recal_file, tranches_file
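# Illustration of how the recalibration options above are assembled when no
# "gatk_variant_recalibrator" resource overrides are supplied: one -tranche flag
# per sensitivity cutoff plus one -an flag per annotation. The annotation names
# here are typical VQSR annotations used as examples, not the exact list returned
# by _get_vqsr_annotations.
def default_vqsr_opts(cutoffs, annotations):
    opts = []
    for cutoff in cutoffs:
        opts += ["-tranche", str(cutoff)]
    for annotation in annotations:
        opts += ["-an", annotation]
    return opts


print(default_vqsr_opts(["99.9", "99.0", "90.0"], ["QD", "FS", "MQ"]))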
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data): """Run evaluation of a caller against the truth set using rtg vcfeval. """ out_dir = os.path.join(base_dir, "rtg") if not utils.file_exists(os.path.join(out_dir, "done")): if os.path.exists(out_dir): shutil.rmtree(out_dir) vrn_file, rm_file, interval_bed = _prepare_inputs( vrn_file, rm_file, rm_interval_file, base_dir, data) rtg_ref = tz.get_in(["reference", "rtg"], data) assert rtg_ref and os.path.exists(rtg_ref), ( "Did not find rtg indexed reference file for validation:\n%s\n" "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref) # handle CWL where we have a reference to a single file in the RTG directory if os.path.isfile(rtg_ref): rtg_ref = os.path.dirname(rtg_ref) # get core and memory usage from standard configuration threads = min(dd.get_num_cores(data), 6) resources = config_utils.get_resources("rtg", data["config"]) memory = config_utils.adjust_opts( resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]), { "algorithm": { "memory_adjust": { "magnitude": threads, "direction": "increase" } } }) jvm_stack = [x for x in memory if x.startswith("-Xms")] jvm_mem = [x for x in memory if x.startswith("-Xmx")] jvm_stack = jvm_stack[0] if len(jvm_stack) > 0 else "-Xms500m" jvm_mem = jvm_mem[0].replace("-Xmx", "") if len(jvm_mem) > 0 else "3g" cmd = [ "rtg", "vcfeval", "--threads", str(threads), "-b", rm_file, "--bed-regions", interval_bed, "-c", vrn_file, "-t", rtg_ref, "-o", out_dir ] rm_samples = vcfutils.get_samples(rm_file) if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples: cmd += ["--sample=%s" % dd.get_sample_name(data)] cmd += [ "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file)) ] mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % ( utils.local_path_export(), jvm_stack, jvm_mem) cmd = mem_export + " && " + " ".join(cmd) do.run(cmd, "Validate calls using rtg vcfeval", data) out = { "fp": os.path.join(out_dir, "fp.vcf.gz"), "fn": os.path.join(out_dir, "fn.vcf.gz") } tp_calls = os.path.join(out_dir, "tp.vcf.gz") tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz") if os.path.exists(tp_baseline): out["tp"] = tp_baseline out["tp-calls"] = tp_calls else: out["tp"] = tp_calls return out
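# Minimal standalone sketch of the JVM option handling in _run_rtg_eval above:
# the -Xms value is passed through unchanged for RTG_JAVA_OPTS while the -Xmx
# value has its prefix stripped for RTG_MEM, with the same fallbacks the
# function uses.
def rtg_memory_opts(jvm_opts):
    stack = [x for x in jvm_opts if x.startswith("-Xms")]
    mem = [x for x in jvm_opts if x.startswith("-Xmx")]
    jvm_stack = stack[0] if stack else "-Xms500m"
    jvm_mem = mem[0].replace("-Xmx", "") if mem else "3g"
    return jvm_stack, jvm_mem


print(rtg_memory_opts(["-Xms500m", "-Xmx6g"]))  # ('-Xms500m', '6g')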
def _get_preseq_params(data, preseq_cmd, read_count):
    """Get preseq parameters through resources.

    If the "step" or "extrap" limit is not provided, calculate optimal values
    based on the read count.
    """
    defaults = {
        'seg_len': 100000,     # maximum segment length when merging paired end bam reads
        'steps': 300,          # number of points on the plot
        'extrap_fraction': 3,  # extrapolate up to X times read_count
        'extrap': None,        # extrapolate up to X reads
        'step': None,          # step size (number of reads between points on the plot)
        'options': '',
    }
    params = {}
    main_opts = [("-e", "-extrap"), ("-l", "-seg_len"), ("-s", "-step")]
    other_opts = config_utils.get_resources("preseq", data["config"]).get("options", [])
    if isinstance(other_opts, str):
        other_opts = [other_opts]
    for sht, lng in main_opts:
        if sht in other_opts:
            i = other_opts.index(sht)
        elif lng in other_opts:
            i = other_opts.index(lng)
        else:
            i = None
        if i is not None:
            params[lng[1:]] = other_opts[i + 1]
            other_opts = other_opts[:i] + other_opts[i + 2:]
    params['options'] = ' '.join(other_opts)
    for k, v in config_utils.get_resources("preseq", data["config"]).items():
        if k != 'options':
            params[k] = v
    params['steps'] = params.get('steps', defaults['steps'])
    if preseq_cmd == 'c_curve':
        params['extrap_fraction'] = 1
    else:
        if params.get('step') is None:
            if params.get('extrap') is None:
                unrounded__extrap = read_count * params.get('extrap_fraction',
                                                            defaults['extrap_fraction'])
                unrounded__step = unrounded__extrap // params['steps']
                if params.get('extrap_fraction') is not None:  # extrap_fraction explicitly provided
                    params['extrap'] = unrounded__extrap
                    params['step'] = unrounded__step
                else:
                    power_of_10 = 10 ** math.floor(math.log(unrounded__step, 10))
                    rounded__step = int(math.floor(unrounded__step // power_of_10) * power_of_10)
                    rounded__extrap = int(rounded__step) * params['steps']
                    params['step'] = rounded__step
                    params['extrap'] = rounded__extrap
            else:
                params['step'] = params['extrap'] // params['steps']
        elif params.get('extrap') is None:
            params['extrap'] = params['step'] * params['steps']
    params['step'] = params.get('step', defaults['step'])
    params['extrap'] = params.get('extrap', defaults['extrap'])
    params['seg_len'] = params.get('seg_len', defaults['seg_len'])
    logger.info("Preseq: running {steps} steps of size {step}, extrap limit {extrap}".format(**params))
    return params
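# Worked example (standalone) of the extrapolation rounding in _get_preseq_params
# above: with the default 300 plot points and an extrapolation of 3x the read
# count, the raw step is rounded down to a multiple of its leading power of ten
# and the extrapolation limit is recomputed from the rounded step.
import math


def preseq_step_extrap(read_count, steps=300, extrap_fraction=3):
    raw_extrap = read_count * extrap_fraction
    raw_step = raw_extrap // steps
    power_of_10 = 10 ** math.floor(math.log(raw_step, 10))
    step = int(math.floor(raw_step // power_of_10) * power_of_10)
    return step, step * steps


# 20 million reads -> a raw step of 200,000, already a multiple of 10^5, giving
# an extrapolation limit of 60 million reads.
print(preseq_step_extrap(20_000_000))  # (200000, 60000000)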
def summary(*samples): """Summarize all quality metrics together""" samples = list(utils.flatten(samples)) work_dir = dd.get_work_dir(samples[0]) multiqc = config_utils.get_program("multiqc", samples[0]["config"]) if not multiqc: logger.debug( "multiqc not found. Update bcbio_nextgen.py tools to fix this issue." ) out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc")) out_data = os.path.join(out_dir, "multiqc_data") out_file = os.path.join(out_dir, "multiqc_report.html") file_list = os.path.join(out_dir, "list_files.txt") work_samples = cwlutils.unpack_tarballs( [utils.deepish_copy(x) for x in samples], samples[0]) work_samples = _summarize_inputs(work_samples, out_dir) if not utils.file_exists(out_file): with tx_tmpdir(samples[0], work_dir) as tx_out: in_files = _get_input_files(work_samples, out_dir, tx_out) in_files += _merge_metrics(work_samples, out_dir) if _one_exists(in_files): with utils.chdir(out_dir): _create_config_file(out_dir, work_samples) input_list_file = _create_list_file(in_files, file_list) if dd.get_tmp_dir(samples[0]): export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir( samples[0]) else: export_tmp = "" path_export = utils.local_path_export() other_opts = config_utils.get_resources( "multiqc", samples[0]["config"]).get("options", []) other_opts = " ".join([str(x) for x in other_opts]) cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}" do.run(cmd.format(**locals()), "Run multiqc") if utils.file_exists( os.path.join(tx_out, "multiqc_report.html")): shutil.move( os.path.join(tx_out, "multiqc_report.html"), out_file) shutil.move(os.path.join(tx_out, "multiqc_data"), out_data) samples = _group_by_sample_and_batch(samples) if utils.file_exists(out_file) and samples: data_files = set() for i, data in enumerate(samples): data_files.add( os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt")) data_files.add( os.path.join(out_dir, "report", "metrics", "target_info.yaml")) data_files.add(os.path.join(out_dir, "multiqc_config.yaml")) [ data_files.add(f) for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*")) ] data_files = [f for f in data_files if f and utils.file_exists(f)] if "summary" not in samples[0]: samples[0]["summary"] = {} samples[0]["summary"]["multiqc"] = { "base": out_file, "secondary": data_files } data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json") data_json_final = _save_uploaded_data_json( samples, data_json, os.path.join(out_dir, "multiqc_data")) if data_json_final: samples[0]["summary"]["multiqc"]["secondary"].append( data_json_final) file_list_final = _save_uploaded_file_list(samples, file_list, out_dir) if file_list_final: samples[0]["summary"]["multiqc"]["secondary"].append( file_list_final) return [[data] for data in samples]
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''): """Create bgzipped fastq files from an input BAM file. """ # tools config = data["config"] bamtofastq = config_utils.get_program("bamtofastq", config) resources = config_utils.get_resources("bamtofastq", config) cores = config["algorithm"].get("num_cores", 1) max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores bgzip = tools.get_bgzip_cmd(config, is_retry) # files work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep")) out_file_1 = os.path.join( work_dir, "%s%s-1.fq.gz" % (os.path.splitext(os.path.basename(bam_file))[0], output_infix)) out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz") needs_retry = False if is_retry or not utils.file_exists(out_file_1): if not bam.is_paired(bam_file): out_file_2 = None with file_transaction(config, out_file_1) as tx_out_file: for f in [tx_out_file, out_file_1, out_file_2]: if f and os.path.exists(f): os.remove(f) fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file) prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0) if prep_cmd: fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0] if bam.is_paired(bam_file): prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1) fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2) if prep_cmd: fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd out_str = ( "F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null " "O2=/dev/null collate=1 colsbs={max_mem}") else: out_str = "S=>({fq1_bgzip_cmd})" bam_file = objectstore.cl_input(bam_file) extra_opts = " ".join( [str(x) for x in resources.get("options", [])]) cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str try: do.run(cmd.format(**locals()), "BAM to bgzipped fastq", checks=[do.file_reasonable_size(tx_out_file, bam_file)], log_error=False) except subprocess.CalledProcessError as msg: if not is_retry and "deflate failed" in str(msg): logger.info( "bamtofastq deflate IO failure preparing %s. Retrying with single core." % (bam_file)) needs_retry = True else: logger.exception() raise if needs_retry: return _bgzip_from_bam(bam_file, dirs, data, is_retry=True) else: return [ x for x in [out_file_1, out_file_2] if x is not None and utils.file_exists(x) ]
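# Standalone sketch of the collation memory calculation in _bgzip_from_bam above:
# the per-core memory string from the bamtofastq resources is converted to bytes
# and multiplied by the core count to size biobambam's colsbs sort buffer. The
# string parsing here is a simplified stand-in for config_utils.convert_to_bytes.
def collate_buffer_bytes(memory="1G", cores=1):
    units = {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}
    value, unit = memory[:-1], memory[-1].upper()
    return int(float(value) * units[unit]) * cores


print(collate_buffer_bytes("1G", 8))  # 8589934592 bytes for an 8-core run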