def bgzip_and_index(in_file, config=None, remove_orig=True, prep_cmd="", tabix_args=None, out_dir=None):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    if config is None:
        config = {}
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if out_dir:
        remove_orig = False
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if (not utils.file_exists(out_file) or not os.path.lexists(out_file)
          or (utils.file_exists(in_file) and not utils.file_uptodate(out_file, in_file))):
        assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file
        assert os.path.exists(in_file), "Input file %s not found" % in_file
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(config, out_file) as tx_out_file:
                bgzip = tools.get_bgzip_cmd(config)
                cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
                if prep_cmd:
                    prep_cmd = "| %s " % prep_cmd
                cmd = "{cat_cmd} {in_file} {prep_cmd} | {bgzip} -c > {tx_out_file}"
                try:
                    do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file))
                except subprocess.CalledProcessError:
                    # Race conditions: ignore errors where file has been deleted by another
                    if os.path.exists(in_file) and not os.path.exists(out_file):
                        raise
            if remove_orig:
                try:
                    os.remove(in_file)
                except OSError:  # Handle cases where run in parallel and file has been deleted
                    pass
    tabix_index(out_file, config, tabix_args=tabix_args)
    return out_file

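# Illustrative sketch (not bcbio code): the underlying bgzip + tabix pattern the helper
# above wraps, without the transactional and update-checking machinery. Assumes the
# bgzip and tabix executables are available on PATH; file names are placeholders.
import subprocess

def simple_bgzip_and_index(in_file, preset="vcf"):
    """Compress a text VCF/BED with bgzip and create a tabix index."""
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if not in_file.endswith(".gz"):
        with open(out_file, "wb") as out_handle:
            subprocess.check_call(["bgzip", "-c", in_file], stdout=out_handle)
    subprocess.check_call(["tabix", "-f", "-p", preset, out_file])
    return out_file
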
def _run_bwa_align(fastq_file, ref_file, out_file, config):
    aln_cl = [config_utils.get_program("bwa", config), "aln",
              "-n 2", "-k 2"]
    aln_cl += _bwa_args_from_config(config)
    aln_cl += [ref_file, fastq_file]
    cmd = "{cl} > {out_file}".format(cl=" ".join(aln_cl), out_file=out_file)
    do.run(cmd, "bwa aln: {f}".format(f=os.path.basename(fastq_file)), None)

def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    batch = sshared.get_cur_batch(items)
    ext = "-%s-cnv" % batch if batch else "-cnv"
    out_file = os.path.join(work_dir, "%s%s-%s.bed" % (os.path.splitext(os.path.basename(work_bams[0]))[0],
                                                       ext, chrom if chrom else "all"))
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(out_file)[0]
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(prep_str=_prep_load_script(work_bams, names, chrom, items),
                                                out_file=tx_out_file,
                                                local_sitelib=local_sitelib))
            rscript = config_utils.get_program("Rscript", items[0]["config"])
            try:
                do.run([rscript, rcode], "cn.mops CNV detection", items[0], log_error=False)
            except subprocess.CalledProcessError as msg:
                # cn.mops errors out if no CNVs found. Just write an empty file.
                if _allowed_cnmops_errorstates(str(msg)):
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write('track name=empty description="No CNVs found"\n')
                else:
                    logger.exception()
                    raise

def _run_delly(bam_files, chrom, sv_type, ref_file, work_dir, items):
    """Run delly, calling structural variations for the specified type.
    """
    out_file = os.path.join(work_dir, "%s-svs%s-%s.vcf"
                            % (os.path.splitext(os.path.basename(bam_files[0]))[0], sv_type, chrom))
    cores = min(utils.get_in(items[0], ("config", "algorithm", "num_cores"), 1),
                len(bam_files))
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            if not _has_variant_regions(items, out_file, chrom):
                vcfutils.write_empty_vcf(tx_out_file)
            else:
                exclude = ["-x", prepare_exclude_file(items, out_file, chrom)]
                cmd = ["delly", "-t", sv_type, "-g", ref_file, "-o", tx_out_file] + exclude + bam_files
                multi_cmd = "export OMP_NUM_THREADS=%s && " % cores
                try:
                    do.run(multi_cmd + " ".join(cmd), "delly structural variant")
                    # Delly will write nothing if no variants found
                    if not utils.file_exists(tx_out_file):
                        vcfutils.write_empty_vcf(tx_out_file)
                except subprocess.CalledProcessError as msg:
                    # delly returns an error exit code if there are no variants
                    if "No structural variants found" in str(msg):
                        vcfutils.write_empty_vcf(tx_out_file)
                    else:
                        raise

def align_bam(in_bam, ref_file, names, align_dir, data): """Perform direct alignment of an input BAM file with BWA using pipes. This avoids disk IO by piping between processes: - samtools sort of input BAM to queryname - bedtools conversion to interleaved FASTQ - bwa-mem alignment - samtools conversion to BAM - samtools sort to coordinate """ config = data["config"] out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"])) samtools = config_utils.get_program("samtools", config) bedtools = config_utils.get_program("bedtools", config) bwa = config_utils.get_program("bwa", config) resources = config_utils.get_resources("samtools", config) num_cores = config["algorithm"].get("num_cores", 1) # adjust memory for samtools since used for input and output max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3, "decrease") rg_info = novoalign.get_rg_info(names) if not utils.file_exists(out_file): with utils.curdir_tmpdir() as work_dir: with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file): tx_out_prefix = os.path.splitext(tx_out_file)[0] prefix1 = "%s-in1" % tx_out_prefix cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} " "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout " "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ") cmd = cmd.format(**locals()) + tobam_cl do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None, [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)]) return out_file
def _run_fastqc(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with
    large files, unless we're running a Standard/QC pipeline.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7)
                  if data.get("analysis", "").lower() not in ["standard"]
                  else None)
        bam_file = ds_bam if ds_bam else bam_file
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-t", str(num_cores), "-o", tx_tmp_dir, "-f", "bam", bam_file]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                fastqc_outdir = os.path.join(tx_tmp_dir,
                                             "%s_fastqc" % os.path.splitext(os.path.basename(bam_file))[0])
                if os.path.exists("%s.zip" % fastqc_outdir):
                    os.remove("%s.zip" % fastqc_outdir)
                if not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(fastqc_outdir, fastqc_out)
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    parser = FastQCParser(fastqc_out)
    stats = parser.get_fastqc_summary()
    return stats

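# Minimal sketch (not bcbio code) of the FastQC invocation the helper above assembles,
# without downsampling or transactional directories. Assumes the fastqc executable is
# on PATH and the output directory already exists.
import subprocess

def fastqc_sketch(bam_file, out_dir, cores=1):
    subprocess.check_call(["fastqc", "-t", str(cores), "-o", out_dir, "-f", "bam", bam_file])
    return out_dir
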
def align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
          names=None):
    """Perform a BWA alignment, generating a SAM file.
    """
    config = data["config"]
    sai1_file = os.path.join(align_dir, "%s_1.sai" % out_base)
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % out_base)
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not utils.file_exists(sam_file):
        if not utils.file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)
        if sai2_file and not utils.file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
        sam_cl = [config_utils.get_program("bwa", config), align_type, ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file) as tx_sam_file:
            cmd = "{cl} > {out_file}".format(cl=" ".join(sam_cl), out_file=tx_sam_file)
            do.run(cmd, "bwa {align_type}".format(**locals()), None)
    return sam_file

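# Illustrative sketch (not bcbio code) of the underlying bwa backtrack commands assembled
# by _run_bwa_align and align above, for a single-end input. Assumes bwa is on PATH and
# the reference has been indexed with `bwa index`.
import subprocess

def bwa_backtrack_single_sketch(ref_file, fastq_file, out_sam):
    sai_file = fastq_file + ".sai"
    with open(sai_file, "wb") as sai_handle:
        subprocess.check_call(["bwa", "aln", ref_file, fastq_file], stdout=sai_handle)
    with open(out_sam, "w") as sam_handle:
        subprocess.check_call(["bwa", "samse", ref_file, sai_file, fastq_file], stdout=sam_handle)
    return out_sam
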
def convert_to_kallisto(data):
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "fastq")
    out_file = os.path.join(kallisto_dir, "barcodes.batch")
    umis = config_utils.get_program("umis", dd.get_config(data))
    if file_exists(out_file):
        return out_file
    if dd.get_minimum_barcode_depth(data):
        cb_histogram = os.path.join(work_dir, "umis", samplename, "cb-histogram.txt")
        cb_cutoff = dd.get_minimum_barcode_depth(data)
        cb_options = "--cb_histogram {cb_histogram} --cb_cutoff {cb_cutoff}"
        cb_options = cb_options.format(**locals())
    else:
        cb_options = ""
    cmd = ("{umis} kallisto {cb_options} --out_dir {tx_kallisto_dir} {fq1}")
    with file_transaction(data, kallisto_dir) as tx_kallisto_dir:
        safe_makedir(tx_kallisto_dir)
        message = ("Transforming %s to Kallisto singlecell format. " % fq1)
        do.run(cmd.format(**locals()), message)
    return out_file

def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.

    Uses only the first read of each pair.
    """
    logger.info("Proportion of aligned reads < 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    stats = out = out_stats = None
    db = data['config']["algorithm"]["kraken"]
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genome", "kraken", "minikraken")
    else:
        if not os.path.exists(db):
            logger.info("kraken: no database found %s, skipping" % db)
            return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        files = data["files"]
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cl = (" ").join([config_utils.get_program("kraken", data["config"]),
                                 "--db", db, "--quick",
                                 "--preload", "--min-hits", "2",
                                 "--threads", str(num_cores),
                                 "--out", out, files[0], " 2>", out_stats])
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics

def slim_vcf(in_file, data):
    """Remove larger annotations which slow down VCF processing.
    """
    to_remove = ["ANN", "LOF"]
    to_remove_str = tuple(["##INFO=<ID=%s" % x for x in to_remove])
    in_file = vcfutils.bgzip_and_index(in_file, data, remove_orig=False)
    out_file = "%s-slim.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        cur_remove = []
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if not line.startswith("#"):
                    break
                elif line.startswith(to_remove_str):
                    cur_id = line.split("ID=")[-1].split(",")[0]
                    cur_remove.append("INFO/%s" % cur_id)
        with file_transaction(data, out_file) as tx_out_file:
            if cur_remove:
                cur_remove = ",".join(cur_remove)
                cmd = ("bcftools view -f 'PASS,.' {in_file} | "
                       "bcftools annotate -x {cur_remove} -O z -o {tx_out_file}")
            else:
                cmd = ("bcftools view -f 'PASS,.' {in_file} -O z -o {tx_out_file}")
            do.run(cmd.format(**locals()), "Create slim VCF")
    return out_file

def run(name, chip_bam, input_bam, genome_build, out_dir, config):
    """Run macs2 for chip and input samples, avoiding errors due to samples.
    """
    # output file name needs to have the caller name
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        return out_file
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(config_utils.get_resources("macs2", config).get("options", ""))
    if genome_build not in HS and options.find("-g") == -1:
        raise ValueError("The %s genome doesn't have a pre-set value. "
                         "You can add specific values using the resources "
                         "option for macs2 in the YAML file (-g genome_size). "
                         "Check the ChIP-seq configuration in the "
                         "bcbio-nextgen documentation." % genome_build)
    genome_size = "" if options.find("-g") > -1 else "-g %s" % HS[genome_build]
    with utils.chdir(out_dir):
        cmd = _macs2_cmd()
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please check the message and report the "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "by setting resources as explained in the docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return out_file

def bgzip_and_index(in_file, config, remove_orig=True, prep_cmd=""):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if not utils.file_exists(out_file) or not os.path.lexists(out_file):
        assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            if prep_cmd:
                cmd = "cat {in_file} | {prep_cmd} | {bgzip} -c > {tx_out_file}"
            else:
                cmd = "{bgzip} -c {in_file} > {tx_out_file}"
            try:
                do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file))
            except subprocess.CalledProcessError:
                # Race conditions: ignore errors where file has been deleted by another
                if os.path.exists(in_file) and not os.path.exists(out_file):
                    raise
        if remove_orig:
            try:
                os.remove(in_file)
            except OSError:  # Handle cases where run in parallel and file has been deleted
                pass
    tabix_index(out_file, config)
    return out_file

def _bgzip_from_bam(bam_file, dirs, config):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = _get_bgzip_cmd(config)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    if not utils.file_exists(out_file_1):
        with file_transaction(out_file_1) as tx_out_file:
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                   checks=[do.file_reasonable_size(tx_out_file, bam_file)])
    return [x for x in [out_file_1, out_file_2] if x is not None]

def merge_overlaps(in_file, data, distance=None, out_dir=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.
    """
    config = data["config"]
    if in_file:
        bedtools = config_utils.get_program("bedtools", config, default="bedtools")
        work_dir = tz.get_in(["dirs", "work"], data)
        if out_dir:
            bedprep_dir = out_dir
        elif work_dir:
            bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
        else:
            bedprep_dir = os.path.dirname(in_file)
        out_file = os.path.join(bedprep_dir, "%s-merged.bed" %
                                (utils.splitext_plus(os.path.basename(in_file))[0]))
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(data, out_file) as tx_out_file:
                distance = "-d %s" % distance if distance else ""
                cmd = "{bedtools} merge {distance} -i {in_file} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare merged BED file", data)
        vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file

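# Minimal sketch of the same bedtools merge call outside bcbio's transactional helpers.
# Assumes bedtools is on PATH and the input BED is position sorted (bedtools merge
# requires sorted input); the distance parameter mirrors the -d option used above.
import subprocess

def merge_bed_sketch(in_bed, out_bed, distance=None):
    cmd = ["bedtools", "merge", "-i", in_bed]
    if distance:
        cmd += ["-d", str(distance)]
    with open(out_bed, "w") as out_handle:
        subprocess.check_call(cmd, stdout=out_handle)
    return out_bed
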
def _bgzip_file(in_file, config, work_dir, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = objectstore.is_remote(in_file)
            in_file = objectstore.cl_input(in_file, unpack=needs_gunzip or needs_convert or needs_bgzip)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip and not needs_convert:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                bgzip = "| bgzip -c" if needs_convert else ""
                do.run("cat {in_file} {bgzip} > {tx_out_file}".format(**locals()), "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                     needs_gunzip, needs_convert))
    return out_file

def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with utils.curdir_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data

def _run_freebayes_caller(align_bams, items, ref_file, assoc_files, region=None,
                          out_file=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cmd = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {vcfallelicprimitives} | {vcfstreamsort} "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file, config)
    return ann_file

def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants",
                   "--regions=%s" % _bed_to_platypusin(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]),
                   "--output=-", "--logFileName", "/dev/null", "--verbosity=1"]
            cmd += ["--assemble=1"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            cmd += ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
                    "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
                    "--minVarFreq", "0.0"]
            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not tz.get_in(["config", "algorithm", "mark_duplicates"], data, True)
                   for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = " | %s | vcfallelicprimitives | vcfstreamsort | bgzip -c > %s" % (
                vcfutils.fix_ambiguous_cl(), tx_out_file)
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file

def align_bam(in_bam, ref_file, names, align_dir, data): """Perform realignment of input BAM file; uses unix pipes for avoid IO. """ config = data["config"] out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"])) novoalign = config_utils.get_program("novoalign", config) samtools = config_utils.get_program("samtools", config) resources = config_utils.get_resources("novoalign", config) num_cores = config["algorithm"].get("num_cores", 1) max_mem = resources.get("memory", "4G").upper() extra_novo_args = " ".join(_novoalign_args_from_config(config, False)) if not file_exists(out_file): with utils.curdir_tmpdir(data, base_dir=align_dir) as work_dir: with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file): rg_info = get_rg_info(names) tx_out_prefix = os.path.splitext(tx_out_file)[0] prefix1 = "%s-in1" % tx_out_prefix cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} " "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin " " -F BAMPE -c {num_cores} {extra_novo_args} | ") cmd = cmd.format(**locals()) + tobam_cl do.run(cmd, "Novoalign: %s" % names["sample"], None, [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)]) return out_file
def _piped_input_cl(data, region, tmp_dir, out_base_file, prep_params):
    """Retrieve the commandline for streaming input into preparation step.

    If marking duplicates, this requires writing an intermediate file since
    MarkDuplicates makes multiple passes over an input.
    """
    broad_runner = broad.runner_from_config(data["config"])
    cl = _gatk_extract_reads_cl(data, region, prep_params, tmp_dir)
    if prep_params["dup"] == "picard":
        sel_file = "%s-select%s" % os.path.splitext(out_base_file)
        if not utils.file_exists(sel_file):
            with file_transaction(sel_file) as tx_out_file:
                cl += ["-o", tx_out_file]
                do.run(cl, "GATK: PrintReads {0}".format(region), data)
        dup_metrics = "%s-dup.dup_metrics" % os.path.splitext(out_base_file)[0]
        compression = "5" if prep_params["realign"] == "gatk" else "0"
        cl = broad_runner.cl_picard("MarkDuplicates",
                                    [("INPUT", sel_file),
                                     ("OUTPUT", "/dev/stdout"),
                                     ("METRICS_FILE", dup_metrics),
                                     ("PROGRAM_RECORD_ID", "null"),
                                     ("COMPRESSION_LEVEL", compression),
                                     ("TMP_DIR", tmp_dir)])
    elif not prep_params["dup"]:
        sel_file = data["work_bam"]
    else:
        raise ValueError("Duplication approach not supported with GATK: %s" % prep_params["dup"])
    broad_runner.run_fn("picard_index", sel_file)
    return sel_file, " ".join(cl)

def run(calls, data):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data), "--outdir", work_dir]
    available_callers = 0
    for call in calls:
        if call["variantcaller"] in SUPPORTED:
            available_callers += 1
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if available_callers >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(dd.get_align_bam(data),
                                                             os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--boost_ins", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        calls.append({"variantcaller": "metasv", "vrn_file": out_file})
    return calls

def genebody_coverage2(in_file, config, out_prefix=None):
    """
    Check the 5'/3' bias across transcripts: takes a BAM file,
    converts it to bigwig and then runs geneBody_coverage2.py on it.
    """
    PROGRAM = "geneBody_coverage2.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)
    in_bigwig = bam2bigwig(in_file, config)
    prefix = "coverage"
    out_dir = os.path.join(os.path.dirname(in_bigwig), os.pardir, "coverage")
    safe_makedir(out_dir)
    out_prefix = out_dir + "/wiggle"
    #out_prefix = _get_out_prefix(in_bigwig, config, out_prefix, prefix)
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    cmd = str(coverage_run.bake(i=in_bigwig, r=bed, o=out_prefix, t="pdf"))
    do.run(cmd, "Calculating coverage of %s." % (in_bigwig), None)
    return coverage_plot_file

def _run_amber(paired, work_dir, lenient=False):
    """AMBER: calculate allele frequencies at likely heterozygous sites.

    The lenient flag allows AMBER runs on small test sets.
    """
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".pcf"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            key = "germline_het_pon"
            het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
            cmd = ["AMBER"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor", dd.get_sample_name(paired.tumor_data),
                   "-tumor_bam", dd.get_align_bam(paired.tumor_data),
                   "-reference", dd.get_sample_name(paired.normal_data),
                   "-reference_bam", dd.get_align_bam(paired.normal_data),
                   "-ref_genome", dd.get_ref_file(paired.tumor_data),
                   "-bed", het_bed,
                   "-output_dir", os.path.dirname(tx_out_file)]
            if lenient:
                cmd += ["-max_het_af_percent", "1.0"]
            try:
                do.run(cmd, "PURPLE: AMBER baf generation")
            except subprocess.CalledProcessError as msg:
                if not lenient and _amber_allowed_errors(str(msg)):
                    return _run_amber(paired, work_dir, True)
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(amber_dir, f))
    return out_file

def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read and discordant alignments from an input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with utils.curdir_tmpdir() as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file

def _amber_het_file(method, vrn_files, work_dir, paired):
    """Create file of BAFs in normal heterozygous positions compatible with AMBER.

    Two available methods:
      - pon -- Use panel of normals with likely heterozygous sites.
      - variants -- Use pre-existing variant calls, filtered to likely heterozygotes.

    https://github.com/hartwigmedical/hmftools/tree/master/amber
    https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
    """
    assert vrn_files, "Did not find compatible variant calling files for PURPLE inputs"
    from bcbio.heterogeneity import bubbletree
    if method == "variants":
        amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
        out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
        prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"],
                                             work_dir, paired, AmberWriter)
        utils.symlink_plus(prep_file, out_file)
        pcf_file = out_file + ".pcf"
        if not utils.file_exists(pcf_file):
            with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
                r_file = os.path.join(os.path.dirname(tx_out_file), "bafSegmentation.R")
                with open(r_file, "w") as out_handle:
                    out_handle.write(_amber_seg_script)
                cmd = "%s && %s --no-environ %s %s %s" % (utils.get_R_exports(), utils.Rscript_cmd(),
                                                          r_file, out_file, pcf_file)
                do.run(cmd, "PURPLE: AMBER baf segmentation")
    else:
        assert method == "pon"
        out_file = _run_amber(paired, work_dir)
    return out_file

def cutoff_w_expression(vcf_file, expression, data, name="+", filterext="",
                        extra_cmd="", limit_regions="variant_regions"):
    """Perform cutoff-based soft filtering using bcftools expressions like %QUAL < 20 || DP < 4.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else ""
                intervals = ""
                if limit_regions == "variant_regions":
                    variant_regions = dd.get_variant_regions(data)
                    if variant_regions:
                        intervals = "-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"])
                cmd = ("{bcftools} filter -O v {intervals} --soft-filter '{name}' "
                       "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Cutoff-based soft filtering %s with %s" % (vcf_file, expression), data)
            else:
                shutil.copy(vcf_file, out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file

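# Standalone sketch of the bcftools soft-filter call constructed above, with a typical
# quality/depth cutoff expression. Assumes bcftools and bgzip are on PATH; the file names
# and filter name are placeholders, not bcbio defaults.
import subprocess

def soft_filter_sketch(vcf_file, out_file, expression="QUAL < 20 || INFO/DP < 4", name="lowQualDepth"):
    cmd = ("bcftools filter -O v --soft-filter '{name}' -e '{expression}' -m '+' {vcf_file} "
           "| bgzip -c > {out_file}").format(**locals())
    subprocess.check_call(["/bin/bash", "-c", cmd])
    return out_file
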
def run_gatk(self, params, tmp_dir=None, log_error=True,
             data=None, region=None, memscale=None, parallel_gc=False, ld_preload=False):
    """Top level interface to running a GATK command.

    ld_preload injects required libraries for Java JNI calls:
    https://gatkforums.broadinstitute.org/gatk/discussion/8810/something-about-create-pon-workflow
    """
    needs_java7 = LooseVersion(self.get_gatk_version()) < LooseVersion("3.6")
    # For old Java requirements use global java 7
    if needs_java7:
        setpath.remove_bcbiopath()
    with tx_tmpdir(self._config) as local_tmp_dir:
        if tmp_dir is None:
            tmp_dir = local_tmp_dir
        cl = self.cl_gatk(params, tmp_dir, memscale=memscale, parallel_gc=parallel_gc)
        atype_index = (params.index("-T") if params.count("-T") > 0
                       else params.index("--analysis_type"))
        prog = params[atype_index + 1]
        cl = fix_missing_spark_user(cl, prog, params)
        if ld_preload:
            cl = "export LD_PRELOAD=%s/lib/libopenblas.so && %s" % (os.path.dirname(utils.get_bcbio_bin()), cl)
        do.run(cl, "GATK: {0}".format(prog), data, region=region, log_error=log_error)
    if needs_java7:
        setpath.prepend_bcbiopath()

def _run_cobalt(paired, work_dir):
    """Run Cobalt for counting read depth across genomic windows.

    PURPLE requires even 1000bp windows so use integrated counting solution
    directly rather than converting from CNVkit calculations. If this approach
    is useful it should be moved upstream to be available to other tools as
    an input comparison.

    https://github.com/hartwigmedical/hmftools/tree/master/count-bam-lines
    """
    cobalt_dir = utils.safe_makedir(os.path.join(work_dir, "cobalt"))
    out_file = os.path.join(cobalt_dir, "%s.cobalt" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = ["COBALT"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-reference", paired.normal_name, "-reference_bam", paired.normal_bam,
                   "-tumor", paired.tumor_name, "-tumor_bam", paired.tumor_bam,
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-output_dir", os.path.dirname(tx_out_file),
                   "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"]]
            cmd = "%s && %s" % (utils.get_R_exports(), " ".join([str(x) for x in cmd]))
            do.run(cmd, "PURPLE: COBALT read depth normalization")
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(cobalt_dir, f))
    return out_file

def run_mutect(self, params, tmp_dir=None):
    with tx_tmpdir(self._config) as local_tmp_dir:
        if tmp_dir is None:
            tmp_dir = local_tmp_dir
        cl = self.cl_mutect(params, tmp_dir)
        prog = "MuTect"
        do.run(cl, "MuTect: {0}".format(prog), None)

def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    from bcbio.variation import bedutils
    highdepth_beds = filter(lambda x: x is not None,
                            list(set([tz.get_in(["config", "algorithm", "highdepth_regions"], x)
                                      for x in items])))
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file

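# Sketch of the bedtools subtract step above without the bcbio plumbing: removes any
# regions present in remove_bed from in_bed. Assumes bedtools is on PATH; file names
# are placeholders.
import subprocess

def subtract_regions_sketch(in_bed, remove_bed, out_bed):
    with open(out_bed, "w") as out_handle:
        subprocess.check_call(["bedtools", "subtract", "-nonamecheck",
                               "-a", in_bed, "-b", remove_bed], stdout=out_handle)
    return out_bed
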
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
                 names=None):
    """
    Run alignment using Tophat v2.
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, runner = _determine_aligner_and_reference(ref_file, config)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.bam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                        ref_file, out_base,
                                                        tx_out_dir, data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            tophat_runner = sh.Command(config_utils.get_program("tophat", config))
            ready_options = {}
            for k, v in options.iteritems():
                ready_options[k.replace("-", "_")] = v
            # tophat requires options before arguments, otherwise it silently ignores them
            tophat_ready = tophat_runner.bake(**ready_options)
            cmd = "%s %s" % (sys.executable, str(tophat_ready.bake(*files)))
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file), None)
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.bam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed_unmapped = _fix_unmapped(fixed, unmapped, data)
    fixed = merge_unmapped(fixed, fixed_unmapped, config)
    fixed = _add_rg(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"],
                          os.path.splitext(fixed)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out

def run(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with
    large files, unless we're running a Standard/smallRNA-seq/QC pipeline.

    Handles fastqc 0.11+, which uses a single HTML file, and older versions that use
    a directory of files + images. The goal is to eventually move to only 0.11+.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7, work_dir=work_dir)
                  if data.get("analysis", "").lower() not in ["standard", "smallrna-seq"]
                  else None)
        bam_file = ds_bam if ds_bam else bam_file
        frmt = "bam" if bam_file.endswith("bam") else "fastq"
        fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0]
        fastqc_clean_name = dd.get_sample_name(data)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-d", tx_tmp_dir, "-t", str(num_cores), "--extract",
                      "-o", tx_tmp_dir, "-f", frmt, bam_file]
                do.run(cl, "FastQC: %s" % dd.get_sample_name(data))
                tx_fastqc_out = os.path.join(tx_tmp_dir, "%s_fastqc" % fastqc_name)
                tx_combo_file = os.path.join(tx_tmp_dir, "%s_fastqc.html" % fastqc_name)
                if not os.path.exists(sentry_file) and os.path.exists(tx_combo_file):
                    utils.safe_makedir(fastqc_out)
                    # Use sample name for reports instead of bam file name
                    with open(os.path.join(tx_fastqc_out, "fastqc_data.txt"), 'r') as fastqc_bam_name, \
                         open(os.path.join(tx_fastqc_out, "_fastqc_data.txt"), 'w') as fastqc_sample_name:
                        for line in fastqc_bam_name:
                            fastqc_sample_name.write(line.replace(os.path.basename(bam_file), fastqc_clean_name))
                    shutil.move(os.path.join(tx_fastqc_out, "_fastqc_data.txt"),
                                os.path.join(fastqc_out, 'fastqc_data.txt'))
                    shutil.move(tx_combo_file, sentry_file)
                    if os.path.exists("%s.zip" % tx_fastqc_out):
                        shutil.move("%s.zip" % tx_fastqc_out,
                                    os.path.join(fastqc_out, "%s.zip" % fastqc_clean_name))
                elif not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(tx_fastqc_out, fastqc_out)
    parser = FastQCParser(fastqc_out, dd.get_sample_name(data))
    stats = parser.get_fastqc_summary()
    parser.save_sections_into_file()
    return stats

def run(bam_file, data, out_dir): """Run qualimap to assess alignment quality metrics. """ # Qualimap results should be saved to a directory named after sample. # MultiQC (for parsing additional data) picks the sample name after the dir as follows: # <sample name>/raw_data_qualimapReport/insert_size_histogram.txt results_dir = os.path.join(out_dir, dd.get_sample_name(data)) resources = config_utils.get_resources("qualimap", data["config"]) options = " ".join(resources.get("options", "")) results_file = os.path.join(results_dir, "genome_results.txt") report_file = os.path.join(results_dir, "qualimapReport.html") utils.safe_makedir(results_dir) pdf_file = "qualimapReport.pdf" if not utils.file_exists(results_file) and not utils.file_exists( os.path.join(results_dir, pdf_file)): if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []): logger.info("Full qualimap analysis for %s may be slow." % bam_file) ds_bam = bam_file else: ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) bam_file = ds_bam if ds_bam else bam_file if options.find("PDF") > -1: options = "%s -outfile %s" % (options, pdf_file) num_cores = data["config"]["algorithm"].get("num_cores", 1) qualimap = config_utils.get_program("qualimap", data["config"]) max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores) with file_transaction(data, results_dir) as tx_results_dir: utils.safe_makedir(tx_results_dir) export = utils.local_path_export() cmd = ( "unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} " "--skip-duplicated --skip-dup-mode 0 " "-nt {num_cores} --java-mem-size={max_mem} {options}") species = None if (tz.get_in(("genome_resources", "aliases", "human"), data, "") or dd.get_genome_build(data).startswith(("hg", "GRCh"))): species = "HUMAN" elif dd.get_genome_build(data).startswith(("mm", "GRCm")): species = "MOUSE" if species in ["HUMAN", "MOUSE"]: cmd += " -gd {species}" regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [ None, False, "None" ] else dd.get_variant_regions_merged(data)) if regions: regions = bedutils.merge_overlaps( bedutils.clean_file(regions, data), data) bed6_regions = _bed_to_bed6(regions, out_dir) cmd += " -gff {bed6_regions}" bcbio_env = utils.get_bcbio_env() do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env) tx_results_file = os.path.join(tx_results_dir, "genome_results.txt") cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % ( dd.get_sample_name(data), tx_results_file) do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data))) # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir): base_results_file = os.path.join(out_dir, os.path.basename(results_file)) shutil.copyfile(results_file, base_results_file) return { "base": base_results_file, "secondary": _find_qualimap_secondary_files(results_dir, base_results_file) }
def _compres_bdg_files(out_dir):
    for fn in glob.glob(os.path.join(out_dir, "*bdg")):
        cmd = "gzip %s" % fn
        do.run(cmd, "compress bdg file: %s" % fn)

def run(bam_file, data, out_dir): """Run viral QC analysis: 1. Extract the unmapped reads 2. BWA-MEM to the viral sequences from GDC database https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files 3. Report viruses that are in more than 50% covered by at least 5x """ source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files' viral_target = "gdc-viral" out = {} viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target] if viral_refs and utils.file_exists(viral_refs[0]): viral_ref = viral_refs[0] viral_bam = os.path.join(utils.safe_makedir(out_dir), "%s-%s.bam" % (dd.get_sample_name(data), utils.splitext_plus(os.path.basename(viral_ref))[0])) out_file = "%s-completeness.txt" % utils.splitext_plus(viral_bam)[0] cores = dd.get_num_cores(data) if not utils.file_uptodate(out_file, bam_file): if not utils.file_uptodate(viral_bam, bam_file): with file_transaction(data, viral_bam) as tx_out_file: tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0] cmd = ("samtools view -u -f 4 {bam_file} | " "bamtofastq collate=0 | " "bwa mem -t {cores} {viral_ref} - | " "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} " "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}") do.run(cmd.format(**locals()), "Align unmapped reads to viral genome") total_reads = _count_reads(bam_file) assert total_reads > 0, 'Reads count is {total_reads}, is there a bug in counting the read count? {bam_file}'.format(**locals()) with file_transaction(data, out_file) as tx_out_file: sample_name = dd.get_sample_name(data) mosdepth_prefix = os.path.splitext(viral_bam)[0] cmd = ("mosdepth -t {cores} {mosdepth_prefix} {viral_bam} -n --thresholds 1,5,25 --by " "<(awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}' {viral_ref}.fai) && " "echo '## Viral sequences (from {source_link}) found in unmapped reads' > {tx_out_file} &&" "echo '## Sample: {sample_name}' >> {tx_out_file} && " "echo '#virus\tsize\tdepth\t1x\t5x\t25x\treads\treads_pct' >> {tx_out_file} && " "paste " "<(zcat {mosdepth_prefix}.regions.bed.gz) " "<(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) " "<(samtools idxstats {viral_bam} | grep -v '*') | " "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3 FS $15 FS $15/{total_reads}}}' | " "sort -n -r -k 5,5 >> {tx_out_file}") do.run(cmd.format(**locals()), "Analyse coverage of viral genomes") if chromhacks.get_EBV(data): ref_file = dd.get_ref_file(data) work_bam = dd.get_work_bam(data) ebv = chromhacks.get_EBV(data) mosdepth_prefix = os.path.splitext(work_bam)[0] + "-EBV" cmd = ("mosdepth -t {cores} {mosdepth_prefix} {work_bam} -n --thresholds 1,5,25 --by " "<(grep {ebv} {ref_file}.fai | awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}') && " "paste " "<(zcat {mosdepth_prefix}.regions.bed.gz) " "<(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) " "<(samtools idxstats {work_bam} | grep {ebv}) | " "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3 FS $15 FS $15/{total_reads}}}' | " "sort -n -r -k 5,5 >> {tx_out_file}") do.run(cmd.format(**locals()), "Analyse coverage of EBV") out["base"] = out_file out["secondary"] = [] return out
def summary(*samples): """Summarize all quality metrics together""" samples = utils.unpack_worlds(samples) work_dir = dd.get_work_dir(samples[0]) multiqc = config_utils.get_program("multiqc", samples[0]["config"]) if not multiqc: logger.debug( "multiqc not found. Update bcbio_nextgen.py tools to fix this issue." ) folders = [] opts = "" out_dir = os.path.join(work_dir, "multiqc") out_data = os.path.join(work_dir, "multiqc", "multiqc_data") out_file = os.path.join(out_dir, "multiqc_report.html") samples = _report_summary(samples, os.path.join(out_dir, "report")) for data in samples: for program, pfiles in tz.get_in(["summary", "qc"], data, {}).iteritems(): if isinstance(pfiles, dict): pfiles = [pfiles["base"]] + pfiles["secondary"] elif isinstance(pfiles, basestring): pfiles = [pfiles] folders.extend(pfiles) # XXX temporary workaround until we can handle larger inputs through MultiQC folders = list(set(folders)) # Back compatible -- to migrate to explicit specifications in input YAML folders += ["trimmed", "htseq-count/*summary"] if not utils.file_exists(out_file): with utils.chdir(work_dir): input_dir = [_check_multiqc_input(d) for d in folders] input_dir = _create_list_file(input_dir) export_tmp = "" if dd.get_tmp_dir(samples[0]): export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0]) if input_dir: cmd = "{export_tmp} {multiqc} -f -l {input_dir} -o {tx_out} {opts}" with tx_tmpdir(data, work_dir) as tx_out: do.run(cmd.format(**locals()), "Run multiqc") if utils.file_exists( os.path.join(tx_out, "multiqc_report.html")): shutil.move( os.path.join(tx_out, "multiqc_report.html"), out_file) shutil.move(os.path.join(tx_out, "multiqc_data"), out_data) out = [] for i, data in enumerate(samples): if i == 0: if utils.file_exists(out_file): data_files = glob.glob( os.path.join(out_dir, "multiqc_data", "*.txt")) data_files += glob.glob( os.path.join(out_dir, "report", "*", "*.bed")) data_files += glob.glob( os.path.join(out_dir, "report", "*", "*.txt")) data_files += glob.glob( os.path.join(out_dir, "report", "*", "*.tsv")) data_files += glob.glob(os.path.join(out_dir, "report", "*.R*")) if "summary" not in data: data["summary"] = {} data["summary"]["multiqc"] = { "base": out_file, "secondary": data_files } out.append(data) return [[d] for d in out]
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_jointcaller(d):
            vcinfo = variant.extract_population_vcinfo(d)
        elif dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in ["tumor"]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[key] and vcfutils.vcf_has_nonfiltered_variants(vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
            # Redirect stderr because it is incredibly noisy with "no intervals found" messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = ("{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                   "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and min") >= 0) or
                            (l.find("Input contains NaN, infinity or a value too large for dtype") >= 0))

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                if any([allowed_errors(l) for l in to_show]) or all([all_line_errors(l) for l in to_show]):
                    logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)

def summary(*samples): """Summarize all quality metrics together""" samples = list(utils.flatten(samples)) work_dir = dd.get_work_dir(samples[0]) multiqc = config_utils.get_program("multiqc", samples[0]["config"]) if not multiqc: logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.") out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc")) out_data = os.path.join(out_dir, "multiqc_data") out_file = os.path.join(out_dir, "multiqc_report.html") file_list = os.path.join(out_dir, "list_files.txt") work_samples = cwlutils.unpack_tarballs([utils.deepish_copy(x) for x in samples], samples[0]) work_samples = _summarize_inputs(work_samples, out_dir) if not utils.file_exists(out_file): with tx_tmpdir(samples[0], work_dir) as tx_out: in_files = _get_input_files(work_samples, out_dir, tx_out) in_files += _merge_metrics(work_samples, out_dir) if _one_exists(in_files): with utils.chdir(out_dir): _create_config_file(out_dir, work_samples) input_list_file = _create_list_file(in_files, file_list) if dd.get_tmp_dir(samples[0]): export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0]) else: export_tmp = "" path_export = utils.local_path_export() other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", []) other_opts = " ".join([str(x) for x in other_opts]) cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}" do.run(cmd.format(**locals()), "Run multiqc") if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")): shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file) shutil.move(os.path.join(tx_out, "multiqc_data"), out_data) samples = _group_by_sample_and_batch(samples) if utils.file_exists(out_file) and samples: data_files = set() for i, data in enumerate(samples): data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt")) data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml")) data_files.add(os.path.join(out_dir, "multiqc_config.yaml")) [data_files.add(f) for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*"))] data_files = [f for f in data_files if f and utils.file_exists(f)] if "summary" not in samples[0]: samples[0]["summary"] = {} samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files} data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json") data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data")) if data_json_final: samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final) # Prepare final file list and inputs for downstream usage file_list_final = _save_uploaded_file_list(samples, file_list, out_dir) if file_list_final: samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final) if any([cwlutils.is_cwl_run(d) for d in samples]): for indir in ["inputs", "report"]: tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir)) if not utils.file_exists(tarball): cmd = ["tar", "-czvpf", tarball, os.path.join(out_dir, indir)] do.run(cmd, "Compress multiqc inputs: %s" % indir) samples[0]["summary"]["multiqc"]["secondary"].append(tarball) if any([cwlutils.is_cwl_run(d) for d in samples]): samples = _add_versions(samples) return [[data] for data in samples]
def run(self, subcmd, opts, memscale=None): jvm_opts = get_picard_opts(self._config, memscale=memscale) cmd = ["export", "PATH=%s:\"$PATH\"" % utils.get_java_binpath(), "&&"] + \ [self._cmd] + jvm_opts + [subcmd] + ["%s=%s" % (x, y) for x, y in opts] + \ ["VALIDATION_STRINGENCY=SILENT"] do.run(utils.clear_java_home() + " && " + " ".join(cmd), "Picard: %s" % subcmd)
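# A minimal, standalone sketch of how the (key, value) option pairs above expand into a
# Picard-style command line. The tool path, JVM options and option values here are
# hypothetical examples for illustration, not values bcbio necessarily uses.
def build_picard_cl(cmd, jvm_opts, subcmd, opts):
    """Expand (key, value) pairs into KEY=VALUE arguments for a Picard subcommand."""
    return [cmd] + jvm_opts + [subcmd] + ["%s=%s" % (k, v) for k, v in opts] + \
           ["VALIDATION_STRINGENCY=SILENT"]

if __name__ == "__main__":
    cl = build_picard_cl("picard", ["-Xms750m", "-Xmx3500m"], "MarkDuplicates",
                         [("INPUT", "sample.bam"), ("OUTPUT", "sample-dedup.bam")])
    print(" ".join(cl))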
def _run_vardict_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect variants with Vardict. This is used for paired tumor / normal samples. """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: target = shared.subset_variant_regions(dd.get_variant_regions(items[0]), region, out_file, do_merge=True) paired = vcfutils.get_paired_bams(align_bams, items) if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[x for x in [paired.tumor_name, paired.normal_name] if x]) else: if not paired.normal_bam: ann_file = _run_vardict_caller(align_bams, items, ref_file, assoc_files, region, out_file) return ann_file vardict = get_vardict_command(items[0]) vcfstreamsort = config_utils.get_program("vcfstreamsort", config) strandbias = "testsomatic.R" var2vcf = "var2vcf_paired.pl" compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 # merge bed file regions as amplicon VarDict is only supported in single sample mode opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target) fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, []) for data in items): somatic_filter = "" freq_filter = "" else: var2vcf_opts += " -M " # this makes VarDict soft filter non-differential variants somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' " "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' " """| %s -c 'from bcbio.variation import freebayes; """ """freebayes.call_somatic("%s", "%s")' """ % (sys.executable, paired.tumor_name, paired.normal_name)) freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null " "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" % (os.path.join(os.path.dirname(sys.executable), "py"), 0, dd.get_aligner(paired.tumor_data))) jvm_opts = _get_jvm_opts(items[0], tx_out_file) py_cl = os.path.join(utils.get_bcbio_bin(), "py") setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports()) cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} " "| {strandbias} " "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} " "-N \"{paired.tumor_name}|{paired.normal_name}\" " """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """ "{freq_filter} " "| bcftools filter -i 'QUAL >= 0' " "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} " "{compress_cmd} > {tx_out_file}") do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) return out_file
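# min_allele_fraction is configured as a percentage but VarDict's -f flag expects a
# fraction. A standalone sketch of that conversion with the default of 10 (%); the
# config dictionary shape here is a simplified assumption.
def min_af_fraction(config, default_pct=10):
    pct = config.get("algorithm", {}).get("min_allele_fraction", default_pct)
    return float(pct) / 100.0

if __name__ == "__main__":
    print(min_af_fraction({"algorithm": {"min_allele_fraction": 2.5}}))  # -> 0.025
    print(min_af_fraction({}))                                           # -> 0.1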
def trim_srna_sample(data): """ Remove 3' adapters for smallRNA-seq. Uses atropos (a cutadapt fork) with different parameters than for other pipelines. """ in_file = data["files"][0] names = data["rgnames"]['sample'] work_dir = os.path.join(dd.get_work_dir(data), "trimmed") out_dir = os.path.join(work_dir, names) utils.safe_makedir(out_dir) out_file = replace_directory(append_stem(in_file, ".clean"), out_dir) trim_reads = data["config"]["algorithm"].get("trim_reads", True) if utils.file_exists(out_file): data["clean_fastq"] = out_file data["collapse"] = _collapse(data["clean_fastq"]) data["size_stats"] = _summary(data['collapse']) return [[data]] adapter = dd.get_adapters(data) if trim_reads and not adapter and error_dnapi: raise ValueError(error_dnapi) adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir) times = "" if len(adapters) == 1 else "--times %s" % len(adapters) if trim_reads and adapters: adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters)) out_noadapter_file = replace_directory( append_stem(in_file, ".fragments"), out_dir) out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir) log_out = os.path.join(out_dir, "%s.log" % names) atropos = _get_atropos() options = " ".join( data.get('resources', {}).get('atropos', {}).get("options", "")) cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "") if " ".join( data.get('resources', {}).get('cutadapt', {}).get("options", "")): raise ValueError( "Atropos is now used, but cutadapt options were found in the YAML file. " "See https://atropos.readthedocs.io/en/latest/") cmd = _cmd_atropos() if not utils.file_exists(out_file): with file_transaction(out_file) as tx_out_file: do.run(cmd.format(**locals()), "remove adapter for %s" % names) if utils.file_exists(log_out): content = open(log_out).read().replace( out_short_file, names) open(log_out, 'w').write(content) if options: in_file = append_stem(tx_out_file, ".tmp") utils.move_safe(tx_out_file, in_file) cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17" do.run(cmd.format(**locals()), "atropos with options %s for %s" % (options, names)) else: if not trim_reads: logger.debug("Skip trimming for: %s" % names) elif not adapters: logger.info("No adapters found in %s; this usually points to a lack" " of small RNA enrichment in your sample." % names) symlink_plus(in_file, out_file) data["clean_fastq"] = out_file data["collapse"] = _collapse(data["clean_fastq"]) data["size_stats"] = _summary(data['collapse']) return [[data]]
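# Each configured or predicted adapter becomes its own -a argument, and trimming is
# repeated when more than one adapter is present. A standalone sketch of that argument
# construction with made-up adapter sequences.
def adapter_args(adapters):
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    return " ".join("-a " + a for a in adapters), times

if __name__ == "__main__":
    print(adapter_args(["TGGAATTCTCGGGTGCCAAGG", "AGATCGGAAGAGC"]))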
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items): """Run lumpy-sv using smoove. """ batch = sshared.get_cur_batch(items) ext = "-%s-svs" % batch if batch else "-svs" name = "%s%s" % (dd.get_sample_name(items[0]), ext) out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name) sv_exclude_bed = sshared.prepare_exclude_file(items, out_file) old_out_file = os.path.join( work_dir, "%s%s-prep.vcf.gz" % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext)) if utils.file_exists(old_out_file): return old_out_file, sv_exclude_bed if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: cores = dd.get_num_cores(items[0]) out_dir = os.path.dirname(tx_out_file) ref_file = dd.get_ref_file(items[0]) full_bams = " ".join( _prepare_smoove_bams(full_bams, sr_bams, disc_bams, items, os.path.dirname(tx_out_file))) std_excludes = [ "~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy" ] def _is_std_exclude(n): clean_excludes = [ x.replace("~", "").replace("^", "") for x in std_excludes ] return any( [n.startswith(x) or n.endswith(x) for x in clean_excludes]) exclude_chrs = [ c.name for c in ref.file_contigs(ref_file) if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name) ] exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs) exclude_bed = ( "--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else "" tempdir = os.path.dirname(tx_out_file) cmd = ( "export TMPDIR={tempdir} && " "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} " "--name {name} --outdir {out_dir} " "{exclude_bed} {exclude_chrs} {full_bams}") with utils.chdir(tempdir): try: do.run(cmd.format(**locals()), "smoove lumpy calling", items[0]) except subprocess.CalledProcessError as msg: if _allowed_errors(str(msg)): vcfutils.write_empty_vcf( tx_out_file, config=items[0]["config"], samples=[dd.get_sample_name(d) for d in items]) else: logger.exception() raise vcfutils.bgzip_and_index(out_file, items[0]["config"]) return out_file, sv_exclude_bed
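# A standalone sketch of the contig-exclusion logic above: contigs matching the standard
# exclude patterns are left to smoove's own pattern handling, while any other non-primary
# contig is added to --excludechroms by name. The contig names below are assumed examples.
STD_EXCLUDES = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]

def is_std_exclude(name):
    clean = [x.replace("~", "").replace("^", "") for x in STD_EXCLUDES]
    return any(name.startswith(x) or name.endswith(x) for x in clean)

if __name__ == "__main__":
    contigs = ["chr1", "chrUn_KI270302v1", "chr1_KI270706v1_random", "HLA-A*01:01:01:01"]
    print([c for c in contigs if is_std_exclude(c)])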
def run_vep(in_file, data): """Annotate input VCF file with Ensembl variant effect predictor. """ if not vcfutils.vcf_has_variants(in_file): return None out_file = utils.append_stem(in_file, "-vepeffects") assert in_file.endswith(".gz") and out_file.endswith(".gz") if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: vep_dir, ensembl_name = prep_vep_cache( data["genome_build"], tz.get_in(["reference", "fasta", "base"], data)) if vep_dir: cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1) fork_args = ["--fork", str(cores)] if cores > 1 else [] vep = config_utils.get_program("vep", data["config"]) is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False) # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow if dd.get_ref_file_compressed(data): hgvs_compatible = True config_args = ["--fasta", dd.get_ref_file_compressed(data)] else: hgvs_compatible = False config_args = ["--fasta", dd.get_ref_file(data)] if is_human: plugin_fns = { "loftee": _get_loftee, "maxentscan": _get_maxentscan, "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion } plugins = ["loftee"] if "vep_splicesite_annotations" in dd.get_tools_on(data): # "genesplicer" too unstable so currently removed plugins += ["maxentscan", "spliceregion"] for plugin in plugins: plugin_args = plugin_fns[plugin](data) config_args += plugin_args config_args += ["--sift", "b", "--polyphen", "b"] if hgvs_compatible: config_args += ["--hgvs", "--shift_hgvs", "1"] if (dd.get_effects_transcripts(data).startswith("canonical") or tz.get_in( ("config", "algorithm", "clinical_reporting"), data)): config_args += ["--pick"] if ensembl_name.endswith("_merged"): config_args += ["--merged"] ensembl_name = ensembl_name.replace("_merged", "") resources = config_utils.get_resources("vep", data["config"]) extra_args = [str(x) for x in resources.get("options", [])] cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \ ["--species", ensembl_name, "--no_stats", "--cache", "--offline", "--dir", vep_dir, "--symbol", "--numbers", "--biotype", "--total_length", "--canonical", "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory", "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_exac", "--pubmed", "--variant_class"] + config_args perl_exports = utils.get_perl_exports() # Remove empty fields (';;') which can cause parsing errors downstream cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % ( perl_exports, " ".join(cmd), tx_out_file) do.run(cmd, "Ensembl variant effect predictor", data) if utils.file_exists(out_file): vcfutils.bgzip_and_index(out_file, data["config"]) return out_file
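# The sed step above removes empty ';;' INFO separators that can break downstream VCF
# parsers. A pure-Python sketch of the same idea on a toy VCF line; unlike the sed call it
# collapses whole runs of semicolons, which is a small simplification.
import re

def collapse_empty_info(line):
    if line.startswith("#"):
        return line
    return re.sub(";{2,}", ";", line)

if __name__ == "__main__":
    print(collapse_empty_info("chr1\t100\t.\tA\tG\t50\tPASS\tDP=10;;CSQ=x"))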
def run_peddy(samples, out_dir=None): vcf_file = None for d in samples: vcinfo = variant.get_active_vcinfo(d, use_ensemble=False) if vcinfo and vcinfo.get("vrn_file") and utils.file_exists( vcinfo["vrn_file"]): if vcinfo["vrn_file"] and dd.get_sample_name( d) in vcfutils.get_samples(vcinfo["vrn_file"]): vcf_file = vcinfo["vrn_file"] break data = samples[0] peddy = config_utils.get_program("peddy", data) if config_utils.program_installed( "peddy", data) else None if not peddy or not vcf_file or not is_human(data): logger.info( "peddy is not installed, not human or sample VCFs don't match, skipping correspondence checking " "for %s." % vcf_file) return samples batch = dd.get_batch(data) or dd.get_sample_name(data) if out_dir: peddy_dir = safe_makedir(out_dir) else: peddy_dir = safe_makedir( os.path.join(dd.get_work_dir(data), "qc", batch, "peddy")) ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir) peddy_prefix = os.path.join(peddy_dir, batch) peddy_report = peddy_prefix + ".html" peddyfiles = expected_peddy_files(peddy_report, batch) if file_exists(peddy_report): return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles) if file_exists(peddy_prefix + "-failed.log"): return samples num_cores = dd.get_num_cores(data) with tx_tmpdir(data) as tx_dir: peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix)) # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2 stderr_log = os.path.join(tx_dir, "run-stderr.log") sites_str = "--sites hg38" if dd.get_genome_build( data) == "hg38" else "" cmd = ( "{peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} " "{vcf_file} {ped_file} 2> {stderr_log}") message = "Running peddy on {vcf_file} against {ped_file}." try: do.run(cmd.format(**locals()), message.format(**locals())) except: to_show = collections.deque(maxlen=100) with open(stderr_log) as in_handle: for line in in_handle: to_show.append(line) def allowed_errors(l): return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0)) def all_line_errors(l): return (l.find("no intervals found for") >= 0) if any([allowed_errors(l) for l in to_show]) or all( [all_line_errors(l) for l in to_show]): logger.info( "Skipping peddy because no variants overlap with checks: %s" % batch) with open(peddy_prefix + "-failed.log", "w") as out_handle: out_handle.write( "peddy did not find overlaps with 1kg sites in VCF, skipping" ) return samples else: logger.warning("".join(to_show)) raise for ext in PEDDY_OUT_EXTENSIONS: if os.path.exists(peddy_prefix_tx + ext): shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext) return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
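# A minimal sketch of the error-classification idea used in run_peddy: keep only the last
# N lines of a noisy stderr log and decide whether the failure is an expected
# "no overlapping sites" case. The patterns mirror those checked above; the log lines in
# the example are made up.
import collections

def is_expected_peddy_failure(lines, tail=100):
    to_show = collections.deque(lines, maxlen=tail)
    if not to_show:
        return False
    allowed = lambda l: ("IndexError" in l and "is out of bounds for axis" in l) or \
                        ("n_components=" in l and "must be between 1 and n_features=" in l)
    all_line = lambda l: "no intervals found for" in l
    return any(allowed(l) for l in to_show) or all(all_line(l) for l in to_show)

if __name__ == "__main__":
    print(is_expected_peddy_failure(["no intervals found for chr1",
                                     "no intervals found for chr2"]))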
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with VarDict. var2vcf_valid uses -A flag which reports all alleles and improves sensitivity: https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191 """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: vrs = bedutils.population_variant_regions(items) target = shared.subset_variant_regions( vrs, region, out_file, items=items, do_merge=False) num_bams = len(align_bams) sample_vcf_names = [] # for individual sample names, given batch calling may be required for bamfile, item in zip(align_bams, items): # prepare commands sample = dd.get_sample_name(item) vardict = get_vardict_command(items[0]) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target) vcfstreamsort = config_utils.get_program("vcfstreamsort", config) compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else "" freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() py_cl = os.path.join(utils.get_bcbio_bin(), "py") jvm_opts = _get_jvm_opts(items[0], tx_out_file) setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports()) cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| {strandbias}" "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} " """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """ "| bcftools filter -i 'QUAL >= 0' " "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}") if num_bams > 1: temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1] tmp_out = temp_file_prefix + ".temp.vcf" tmp_out += ".gz" if out_file.endswith("gz") else "" sample_vcf_names.append(tmp_out) with file_transaction(item, tmp_out) as tx_tmp_file: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample]) else: cmd += " > {tx_tmp_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) else: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample]) else: cmd += " > {tx_out_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) if num_bams > 1: # N.B. merge_variant_files wants region in 1-based end-inclusive # coordinates. Thus use bamprep.region_to_gatk vcfutils.merge_variant_files(orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region)) return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with VarDict. """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: vrs = bedutils.population_variant_regions(items) target = shared.subset_variant_regions(vrs, region, out_file, do_merge=False) num_bams = len(align_bams) sample_vcf_names = [ ] # for individual sample names, given batch calling may be required for bamfile, item in itertools.izip(align_bams, items): # prepare commands sample = dd.get_sample_name(item) vardict = get_vardict_command(items[0]) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" opts = (" ".join( _vardict_options_from_config(items, config, out_file, target)) if _is_bed_file(target) else "") vcfstreamsort = config_utils.get_program( "vcfstreamsort", config) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" freq = float( utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 coverage_interval = utils.get_in( config, ("algorithm", "coverage_interval"), "exome") # for deep targeted panels, require 50 worth of coverage var2vcf_opts = " -v 50 " if dd.get_avg_coverage( items[0]) > 5000 else "" fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() jvm_opts = _get_jvm_opts(items[0], tx_out_file) r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname( utils.Rscript_cmd()) cmd = ( "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| {strandbias}" "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} " "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}" ) if num_bams > 1: temp_file_prefix = out_file.replace(".gz", "").replace( ".vcf", "") + item["name"][1] tmp_out = temp_file_prefix + ".temp.vcf" tmp_out += ".gz" if out_file.endswith("gz") else "" sample_vcf_names.append(tmp_out) with file_transaction(item, tmp_out) as tx_tmp_file: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample]) else: cmd += " > {tx_tmp_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) else: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample]) else: cmd += " > {tx_out_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) if num_bams > 1: # N.B. merge_variant_files wants region in 1-based end-inclusive # coordinates. Thus use bamprep.region_to_gatk vcfutils.merge_variant_files( orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region)) out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config) if assoc_files.get("dbsnp") else out_file) return out_file
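# In this older wrapper, very deep targeted panels get var2vcf_valid's -v option so that
# calls need substantial read support. A standalone sketch of that threshold decision,
# using the 5000x average-coverage cutoff from the code above.
def deep_panel_var2vcf_opts(avg_coverage):
    return " -v 50 " if avg_coverage > 5000 else ""

if __name__ == "__main__":
    print(repr(deep_panel_var2vcf_opts(8000)))  # ' -v 50 '
    print(repr(deep_panel_var2vcf_opts(300)))   # ''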
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data): """Run evaluation of a caller against the truth set using rtg vcfeval. """ out_dir = os.path.join(base_dir, "rtg") if not utils.file_exists(os.path.join(out_dir, "done")): if os.path.exists(out_dir): shutil.rmtree(out_dir) vrn_file, rm_file, interval_bed = _prepare_inputs( vrn_file, rm_file, rm_interval_file, base_dir, data) rtg_ref = tz.get_in(["reference", "rtg"], data) assert rtg_ref and os.path.exists(rtg_ref), ( "Did not find rtg indexed reference file for validation:\n%s\n" "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref) # handle CWL where we have a reference to a single file in the RTG directory if os.path.isfile(rtg_ref): rtg_ref = os.path.dirname(rtg_ref) # get core and memory usage from standard configuration threads = min(dd.get_num_cores(data), 6) resources = config_utils.get_resources("rtg", data["config"]) memory = config_utils.adjust_opts( resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]), { "algorithm": { "memory_adjust": { "magnitude": threads, "direction": "increase" } } }) jvm_stack = [x for x in memory if x.startswith("-Xms")] jvm_mem = [x for x in memory if x.startswith("-Xmx")] jvm_stack = jvm_stack[0] if len(jvm_stack) > 0 else "-Xms500m" jvm_mem = jvm_mem[0].replace("-Xmx", "") if len(jvm_mem) > 0 else "3g" cmd = [ "rtg", "vcfeval", "--threads", str(threads), "-b", rm_file, "--bed-regions", interval_bed, "-c", vrn_file, "-t", rtg_ref, "-o", out_dir ] rm_samples = vcfutils.get_samples(rm_file) if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples: cmd += ["--sample=%s" % dd.get_sample_name(data)] cmd += [ "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file)) ] mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % ( utils.local_path_export(), jvm_stack, jvm_mem) cmd = mem_export + " && " + " ".join(cmd) do.run(cmd, "Validate calls using rtg vcfeval", data) out = { "fp": os.path.join(out_dir, "fp.vcf.gz"), "fn": os.path.join(out_dir, "fn.vcf.gz") } tp_calls = os.path.join(out_dir, "tp.vcf.gz") tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz") if os.path.exists(tp_baseline): out["tp"] = tp_baseline out["tp-calls"] = tp_calls else: out["tp"] = tp_calls return out
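# rtg vcfeval reads its memory settings from RTG_JAVA_OPTS and RTG_MEM rather than a
# jvm_opts list, so the -Xms/-Xmx values are picked back out of the resource
# configuration. A standalone sketch with the same fallbacks as the code above.
def split_jvm_memory(memory_opts):
    stack = [x for x in memory_opts if x.startswith("-Xms")]
    heap = [x for x in memory_opts if x.startswith("-Xmx")]
    jvm_stack = stack[0] if stack else "-Xms500m"
    jvm_mem = heap[0].replace("-Xmx", "") if heap else "3g"
    return jvm_stack, jvm_mem

if __name__ == "__main__":
    print(split_jvm_memory(["-Xms500m", "-Xmx1500m"]))  # ('-Xms500m', '1500m')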
def consensus(peakfiles, consensusfile, data): """call consensus peaks from a set of narrow/broad peakfiles we use this method: https://bedops.readthedocs.io/en/latest/content/usage-examples/master-list.html """ if utils.file_exists(consensusfile): return consensusfile try: bedops = config_utils.get_program("bedops", data) except config_utils.CmdNotFound: logger.info("bedops not found, skipping consensus peak calling. do a " "--tools update to install bedops.") return None try: sortbed = config_utils.get_program("sort-bed", data) except config_utils.CmdNotFound: logger.info("sort-bed not found, skipping consensus peak calling. do " "--tools update to install sort-bed.") return None try: bedmap = config_utils.get_program("bedmap", data) except config_utils.CmdNotFound: logger.info("bedmap not found, skipping consensus peak calling. do a " "--tools update to install bedmap.") return None logger.info(f"Calling consensus peaks on {','.join(peakfiles)}") logger.info(f"Removing low quality peaks from {','.join(peakfiles)}") filteredpeaks = [] for fn in peakfiles: filteredpeak = NamedTemporaryFile(suffix=".bed", delete=False).name df = remove_low_quality_peaks(fn, qval=0.05) df.to_csv(filteredpeak, index=False, header=False, sep="\t") filteredpeaks.append(filteredpeak) peakfiles = filteredpeaks with file_transaction(consensusfile) as tx_consensus_file: with utils.tmpfile(suffix=".bed") as tmpbed: message = f"Move all peaks in {' '.join(peakfiles)} to a single file." mergepeakscmd = f"{bedops} -u {' '.join(peakfiles)} > {tmpbed}" do.run(mergepeakscmd, message) iteration = 0 while os.path.getsize(tmpbed): iteration = iteration + 1 iterationbed = NamedTemporaryFile(suffix=".bed", delete=False).name with utils.tmpfile(suffix="bed") as mergedbed, \ utils.tmpfile(suffix="bed") as intermediatebed, \ utils.tmpfile(suffix="bed") as leftoverbed, \ utils.tmpfile(suffix="bed") as tmpsolutionbed: mergecmd = (f"{bedops} -m --range 0:-1 {tmpbed} | " f"{bedops} -u --range 0:1 - > " f"{mergedbed}") message = f"Merging non-overlapping peaks, iteration {iteration}." do.run(mergecmd, message) nitems = len(open(mergedbed).readlines()) message = f"Considering {nitems} peaks, choosing the highest score for overlapping peaks." highscorecmd = ( f"{bedmap} --max-element {mergedbed} {tmpbed} |" f"{sortbed} - > " f"{iterationbed}") do.run(highscorecmd, message) message = f"Checking if there are peaks left to merge." anyleftcmd = ( f"{bedops} -n 1 {tmpbed} {iterationbed} > {intermediatebed}" ) do.run(anyleftcmd, message) shutil.move(intermediatebed, tmpbed) nitems = len(open(iterationbed).readlines()) message = f"Adding {nitems} peaks to consensus peaks." if utils.file_exists(tx_consensus_file): consensuscmd = ( f"{bedops} -u {tx_consensus_file} {iterationbed} > {tmpsolutionbed}" ) do.run(consensuscmd, message) shutil.move(tmpsolutionbed, tx_consensus_file) else: shutil.move(iterationbed, tx_consensus_file) return consensusfile
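# The loop above implements the BEDOPS "master list" recipe: merge overlapping peaks,
# keep the highest-scoring element per merged region, then repeat on what is left. A toy
# pure-Python illustration of one such iteration on (start, end, score) tuples; real runs
# operate on BED files via bedops/bedmap.
def best_per_overlap(peaks):
    peaks = sorted(peaks)
    clusters, current = [], [peaks[0]]
    for p in peaks[1:]:
        if p[0] < max(e for _, e, _ in current):  # overlaps the open cluster
            current.append(p)
        else:
            clusters.append(current)
            current = [p]
    clusters.append(current)
    return [max(c, key=lambda x: x[2]) for c in clusters]

if __name__ == "__main__":
    print(best_per_overlap([(1, 10, 5.0), (8, 20, 9.0), (30, 40, 2.0)]))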
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with FreeBayes for paired tumor/normal samples. Sources of options for FreeBayes: mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916 sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/ sga_generate_varcall_makefile.pl#L299 """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext( align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: paired = get_paired_bams(align_bams, items) assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering" freebayes = config_utils.get_program("freebayes", config) opts, no_target_regions = _freebayes_options_from_config( items, config, out_file, region) if no_target_regions: vcfutils.write_empty_vcf( tx_out_file, config, samples=[ x for x in [paired.tumor_name, paired.normal_name] if x ]) else: opts = " ".join(opts) opts += " --min-repeat-entropy 1" opts += " --no-partial-observations" opts = _add_somatic_opts(opts, paired) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" # For multi-sample outputs, ensure consistent order samples = ( "-s " + ",".join([dd.get_sample_name(d) for d in items])) if len(items) > 1 else "" fix_ambig = vcfutils.fix_ambiguous_cl() bcbio_py = sys.executable py_cl = os.path.join(os.path.dirname(sys.executable), "py") cl = ( "{freebayes} -f {ref_file} {opts} " "{paired.tumor_bam} {paired.normal_bam} " """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """ """| {bcbio_py} -c 'from bcbio.variation import freebayes; """ """freebayes.call_somatic("{paired.tumor_name}", "{paired.normal_name}")' """ "| {fix_ambig} | bcftools view {samples} -a - | " "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | " "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | " "vt normalize -n -r {ref_file} -q - | vcfuniqalleles | vt uniq - 2> /dev/null " "{compress_cmd} > {tx_out_file}") do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {}) ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config) return ann_file
def umi_transform(data): """ Transform each read by identifying the barcode and UMI for each read and putting the information in the read name """ fqfiles = data["files"] fqfiles.extend(list(repeat("", 4-len(fqfiles)))) fq1, fq2, fq3, fq4 = fqfiles umi_dir = os.path.join(dd.get_work_dir(data), "umis") safe_makedir(umi_dir) transform = dd.get_umi_type(data) if not transform: logger.info("No UMI transform specified, assuming pre-transformed data.") if is_transformed(fq1): logger.info("%s detected as pre-transformed, passing it on unchanged." % fq1) data["files"] = [fq1] return [[data]] else: logger.error("No UMI transform was specified, but %s does not look " "pre-transformed." % fq1) sys.exit(1) if file_exists(transform): transform_file = transform else: transform_file = get_transform_file(transform) if not file_exists(transform_file): logger.error( "The UMI transform can be specified as either a file or a " "bcbio-supported transform. Either the file %s does not exist " "or the transform is not supported by bcbio. Supported " "transforms are %s." % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS))) sys.exit(1) out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz" out_file = os.path.join(umi_dir, out_base) if file_exists(out_file): data["files"] = [out_file] return [[data]] cellular_barcodes = get_cellular_barcodes(data) if len(cellular_barcodes) > 1: split_option = "--separate_cb" else: split_option = "" umis = config_utils.get_program("umis", data, default="umis") cores = dd.get_num_cores(data) # skip transformation if the file already looks transformed with open_fastq(fq1) as in_handle: read = next(in_handle) if "UMI_" in read: data["files"] = [out_file] return [[data]] cmd = ("{umis} fastqtransform {split_option} {transform_file} " "--cores {cores} " "{fq1} {fq2} {fq3} {fq4} " "| seqtk seq -L 20 - | gzip > {tx_out_file}") message = ("Inserting UMI and barcode information into the read name of %s" % fq1) with file_transaction(out_file) as tx_out_file: do.run(cmd.format(**locals()), message) data["files"] = [out_file] return [[data]]
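# Pre-transformed FASTQs already carry the UMI in the read name, so inspecting the first
# read is enough to decide whether to skip the transform. A standalone sketch using gzip
# directly; the file name in the usage comment is a made-up example.
import gzip

def looks_transformed(fastq):
    opener = gzip.open if fastq.endswith(".gz") else open
    with opener(fastq, "rt") as in_handle:
        first = in_handle.readline()
    return "UMI_" in first

# looks_transformed("sample.umitransformed.fq.gz") -> True/False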
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''): """Create bgzipped fastq files from an input BAM file. """ # tools config = data["config"] bamtofastq = config_utils.get_program("bamtofastq", config) resources = config_utils.get_resources("bamtofastq", config) cores = config["algorithm"].get("num_cores", 1) max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores bgzip = tools.get_bgzip_cmd(config, is_retry) # files work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep")) out_file_1 = os.path.join( work_dir, "%s%s-1.fq.gz" % (os.path.splitext(os.path.basename(bam_file))[0], output_infix)) out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz") needs_retry = False if is_retry or not utils.file_exists(out_file_1): if not bam.is_paired(bam_file): out_file_2 = None with file_transaction(config, out_file_1) as tx_out_file: for f in [tx_out_file, out_file_1, out_file_2]: if f and os.path.exists(f): os.remove(f) fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file) prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0) if prep_cmd: fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0] if bam.is_paired(bam_file): prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1) fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2) if prep_cmd: fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd out_str = ( "F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null " "O2=/dev/null collate=1 colsbs={max_mem}") else: out_str = "S=>({fq1_bgzip_cmd})" bam_file = objectstore.cl_input(bam_file) extra_opts = " ".join( [str(x) for x in resources.get("options", [])]) cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str try: do.run(cmd.format(**locals()), "BAM to bgzipped fastq", checks=[do.file_reasonable_size(tx_out_file, bam_file)], log_error=False) except subprocess.CalledProcessError as msg: if not is_retry and "deflate failed" in str(msg): logger.info( "bamtofastq deflate IO failure preparing %s. Retrying with single core." % (bam_file)) needs_retry = True else: logger.exception() raise if needs_retry: return _bgzip_from_bam(bam_file, dirs, data, is_retry=True) else: return [ x for x in [out_file_1, out_file_2] if x is not None and utils.file_exists(x) ]
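# bamtofastq's collation buffer (colsbs) is sized from the configured per-core memory
# times the core count. A sketch of that conversion; the suffix handling below is an
# assumption mirroring typical "1G"/"500M" resource strings, not bcbio's exact helper.
def memory_to_bytes(mem, cores=1):
    units = {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}
    if mem[-1].upper() in units:
        return int(float(mem[:-1]) * units[mem[-1].upper()]) * cores
    return int(mem) * cores

if __name__ == "__main__":
    print(memory_to_bytes("1G", cores=8))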
def align(fastq_file, pair_file, ref_file, names, align_dir, data): if not ref_file: logger.error("STAR index not found. We don't provide the STAR indexes " "by default because they are very large. You can install " "the index for your genome with: bcbio_nextgen.py upgrade " "--aligners star --genomes genome-build-name --data") sys.exit(1) max_hits = 10 srna = True if data["analysis"].lower().startswith("smallrna-seq") else False srna_opts = "" if srna: max_hits = 1000 srna_opts = "--alignIntronMax 1" config = data["config"] star_dirs = _get_star_dirnames(align_dir, data, names) if file_exists(star_dirs.final_out): data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data) return data star_path = config_utils.get_program("STAR", config) def _unpack_fastq(f): """Use process substitution instead of readFilesCommand for gzipped inputs. Prevents issues on shared filesystems that don't support FIFO: https://github.com/alexdobin/STAR/issues/143 """ if f and is_gzipped(f): return "<(gunzip -c %s)" % f else: return f fastq_files = (" ".join([_unpack_fastq(fastq_file), _unpack_fastq(pair_file)]) if pair_file else _unpack_fastq(fastq_file)) num_cores = dd.get_num_cores(data) gtf_file = dd.get_gtf_file(data) if ref_file.endswith("chrLength"): ref_file = os.path.dirname(ref_file) with file_transaction(data, align_dir) as tx_align_dir: tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names) tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames safe_makedir(tx_align_dir) safe_makedir(tx_out_dir) cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} " "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} " "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} " "--outStd BAM_Unsorted {srna_opts} " "--limitOutSJcollapsed 2000000 " "--outSAMtype BAM Unsorted " "--outSAMmapqUnique 60 " "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS)) cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else "" cmd += _read_group_option(names) if dd.get_fusion_caller(data): cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 " "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 " "--chimScoreSeparation 5 ") if "oncofuse" in dd.get_fusion_caller(data): cmd += "--chimOutType Junctions " else: cmd += "--chimOutType WithinBAM " strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"), "unstranded").lower() if strandedness == "unstranded" and not srna: cmd += " --outSAMstrandField intronMotif " if not srna: cmd += " --quantMode TranscriptomeSAM " resources = config_utils.get_resources("star", data["config"]) if resources.get("options", []): cmd += " " + " ".join([str(x) for x in resources.get("options", [])]) cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out) cmd += " > {tx_final_out} " run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file) do.run(cmd.format(**locals()), run_message, None) data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data) return data
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file): """Run a paired VarScan analysis, also known as "somatic". """ max_read_depth = "1000" config = items[0]["config"] version = programs.jar_versioner("varscan", "VarScan")(config) if version < "v2.3.5": raise IOError( "Please install version 2.3.5 or better of VarScan with support " "for multisample calling and indels in VCF format.") varscan_jar = config_utils.get_jar( "VarScan", config_utils.get_program("varscan", config, "dir")) remove_zerocoverage = "grep -v -P '\t0\t\t$'" # No need for names in VarScan, hence the "_" tumor_bam, _, normal_bam, _ = get_paired_bams(align_bams, items) if not file_exists(out_file): base, ext = os.path.splitext(out_file) cleanup_files = [] for fname, mpext in [(normal_bam, "normal"), (tumor_bam, "tumor")]: mpfile = "%s-%s.mpileup" % (base, mpext) cleanup_files.append(mpfile) with file_transaction(mpfile) as mpfile_tx: mpileup = samtools.prep_mpileup([fname], ref_file, max_read_depth, config, target_regions=target_regions, want_bcf=False) cmd = "{mpileup} > {mpfile_tx}" cmd = cmd.format(**locals()) do.run(cmd, "samtools mpileup".format(**locals()), None, [do.file_exists(mpfile_tx)]) # Sometimes mpileup writes an empty file: in this case we # just skip the rest of the analysis (VarScan will hang otherwise) if any(os.stat(filename).st_size == 0 for filename in cleanup_files): write_empty_vcf(out_file) return # First index is normal, second is tumor normal_tmp_mpileup = cleanup_files[0] tumor_tmp_mpileup = cleanup_files[1] jvm_opts = _get_varscan_opts(config) varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic" " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}" " --output-vcf --min-coverage 5 --p-value 0.98") indel_file = base + ".indel.vcf" snp_file = base + ".snp.vcf" cleanup_files.append(indel_file) cleanup_files.append(snp_file) to_combine = [] with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp): varscan_cmd = varscan_cmd.format(**locals()) do.run(varscan_cmd, "Varscan".format(**locals()), None, None) if do.file_exists(snp_file): to_combine.append(snp_file) if do.file_exists(indel_file): to_combine.append(indel_file) if not to_combine: write_empty_vcf(out_file) return out_file = combine_variant_files([snp_file, indel_file], out_file, ref_file, config, region=target_regions) # Remove cleanup files for extra_file in cleanup_files: os.remove(extra_file) if os.path.getsize(out_file) == 0: write_empty_vcf(out_file) _fix_varscan_vcf(out_file, align_bams)
def run_vep(in_file, data): """Annotate input VCF file with Ensembl variant effect predictor. """ if not vcfutils.vcf_has_variants(in_file): return None out_file = utils.append_stem(in_file, "-vepeffects") assert in_file.endswith(".gz") and out_file.endswith(".gz") if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: vep_dir, ensembl_name = prep_vep_cache(data["genome_build"], tz.get_in(["reference", "fasta", "base"], data)) if vep_dir: cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1) fork_args = ["--fork", str(cores)] if cores > 1 else [] vep = config_utils.get_program("variant_effect_predictor.pl", data["config"]) is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False) if is_human: dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data) loftee_args, loftee_fields = _get_loftee(data) prediction_args = ["--sift", "b", "--polyphen", "b"] prediction_fields = ["PolyPhen", "SIFT"] else: dbnsfp_args, dbnsfp_fields = [], [] loftee_args, loftee_fields = [], [] prediction_args, prediction_fields = [], [] std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature", "EXON"] + prediction_fields + ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"] resources = config_utils.get_resources("vep", data["config"]) extra_args = [str(x) for x in resources.get("options", [])] cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \ ["--species", ensembl_name, "--no_stats", "--cache", "--offline", "--dir", vep_dir, "--symbol", "--numbers", "--biotype", "--total_length", "--canonical", "--ccds", "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)] + \ prediction_args + dbnsfp_args + loftee_args if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False): # In case of clinical reporting, we need one and only one # variant per gene # From the VEP docs: # "Pick once line of consequence data per variant, # including transcript-specific columns. Consequences are # chosen by the canonical, biotype status and length of the # transcript, along with the ranking of the consequence # type according to this table. This is the best method to # use if you are interested only in one consequence per # variant. cmd += ["--pick"] # TODO investigate hgvs reporting but requires indexing the reference file # cmd += ["--hgvs", "--shift-hgvs", "--fasta", dd.get_ref_file(data)] perllib = "export PERL5LIB=%s:$PERL5LIB" % _get_perllib() # Remove empty fields (';;') which can cause parsing errors downstream cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perllib, " ".join(cmd), tx_out_file) do.run(cmd, "Ensembl variant effect predictor", data) if utils.file_exists(out_file): vcfutils.bgzip_and_index(out_file, data["config"]) return out_file
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Call variation with GATK's MuTect2. This requires the full non open-source version of GATK 3.5+. """ if out_file is None: out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0] if not utils.file_exists(out_file): paired = vcfutils.get_paired_bams(align_bams, items) broad_runner = broad.runner_from_config(items[0]["config"]) gatk_type = broad_runner.gatk_type() _prep_inputs(align_bams, ref_file, items) with file_transaction(items[0], out_file) as tx_out_file: params = [ "-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2", "--annotation", "ClippingRankSumTest", "--annotation", "DepthPerSampleHC" ] if gatk_type == "gatk4": params += ["--reference", ref_file] else: params += ["-R", ref_file] for a in annotation.get_gatk_annotations( items[0]["config"], include_baseqranksum=False): params += ["--annotation", a] # Avoid issues with BAM CIGAR reads that GATK doesn't like if gatk_type == "gatk4": params += ["--read-validation-stringency", "LENIENT"] params += _add_tumor_params(paired, items, gatk_type) params += _add_region_params(region, out_file, items, gatk_type) # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm # Not yet clear how this helps or hurts in a general case. #params += _add_assoc_params(assoc_files) resources = config_utils.get_resources("mutect2", items[0]["config"]) if "options" in resources: params += [str(x) for x in resources.get("options", [])] assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \ "Require full version of GATK 3.5+ for mutect2 calling" broad_runner.new_resources("mutect2") gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file)) if gatk_type == "gatk4": tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus( out_file) tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus( tx_out_file) filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file, tx_raw_file, ref_file) cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}" else: tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file) cmd = "{gatk_cmd} > {tx_raw_file}" do.run(cmd.format(**locals()), "MuTect2") out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file) return vcfutils.bgzip_and_index(out_file, items[0]["config"])
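# MuTect2 calling is gated on GATK 3.5+, with version strings compared via LooseVersion
# so values like "3.5-0" and "3.6" order correctly. A standalone sketch mirroring the
# assertion above (distutils is deprecated in newer Pythons; packaging.version is the
# modern equivalent).
from distutils.version import LooseVersion

def supports_mutect2(gatk_major_version):
    return LooseVersion(gatk_major_version) >= LooseVersion("3.5")

if __name__ == "__main__":
    print(supports_mutect2("3.6"), supports_mutect2("3.4"))  # True False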
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect indels with Scalpel. This is used for paired tumor / normal samples. """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext( align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(config, out_file) as tx_out_file: paired = get_paired_bams(align_bams, items) if not paired.normal_bam: ann_file = _run_scalpel_caller(align_bams, items, ref_file, assoc_files, region, out_file) return ann_file vcffilter = config_utils.get_program("vcffilter", config) vcfstreamsort = config_utils.get_program("vcfstreamsort", config) perl_exports = utils.get_perl_exports() tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0] db_file = os.path.join(tmp_path, "main", "somatic.db") if not os.path.exists(db_file + ".dir"): if os.path.exists(tmp_path): utils.remove_safe(tmp_path) opts = " ".join( _scalpel_options_from_config(items, config, out_file, region, tmp_path)) opts += " --ref {}".format(ref_file) opts += " --dir %s" % tmp_path # caling cl = ( "{perl_exports} && " "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}" ) do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {}) # filtering to adjust input parameters bed_opts = " ".join( _scalpel_bed_file_opts(items, config, out_file, region, tmp_path)) use_defaults = True if use_defaults: scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf") # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher # to swap precision for sensitivity else: scalpel_tmp_file = os.path.join( tmp_path, "main/somatic-indel-filter.vcf.gz") with file_transaction(config, scalpel_tmp_file) as tx_indel_file: cmd = ( "{perl_exports} && " "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} " "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 " "| bgzip -c > {tx_indel_file}") do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {}) scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config) scalpel_tmp_file_common = bgzip_and_index( os.path.join(tmp_path, "main/common.indel.vcf"), config) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression( "chi2", config) bcftools_cmd_common = get_scalpel_bcftools_filter_expression( "reject", config) fix_ambig = vcfutils.fix_ambiguous_cl() cl2 = ( "vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) " "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | " " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}" ) do.run(cl2.format(**locals()), "Finalising Scalpel variants", {}) ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config) return ann_file
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data): """ Run macs2 for ChIP and input samples, guarding against errors caused by problematic samples (e.g. samples with no aligned reads). """ # output file name needs to include the caller name config = dd.get_config(data) out_file = os.path.join(out_dir, name + "_peaks_macs2.xls") macs2_file = os.path.join(out_dir, name + "_peaks.xls") if utils.file_exists(out_file): _compress_and_sort_bdg_files(out_dir, data) return _get_output_files(out_dir) macs2 = config_utils.get_program("macs2", config) antibody = dd.get_antibody(data) if antibody: antibody = antibody.lower() if antibody not in antibodies.SUPPORTED_ANTIBODIES: logger.error( f"{antibody} specified, but not listed as a supported antibody. Valid antibodies are {antibodies.SUPPORTED_ANTIBODIES}. If you know your antibody " f"should be called with narrow or broad peaks, supply 'narrow' or 'broad' as the antibody. " f"Defaulting to 'narrow' peak settings for this unsupported antibody.") antibody = 'narrow' antibody = antibodies.ANTIBODIES[antibody] logger.info( f"{antibody.name} specified, using {antibody.peaktype} peak settings." ) peaksettings = select_peak_parameters(antibody) elif method == "atac": logger.info("ATAC-seq specified, using narrow peak settings.") peaksettings = " " else: peaksettings = " " options = " ".join(resources.get("macs2", {}).get("options", "")) genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data)) genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size paired = "-f BAMPE" if bam.is_paired(chip_bam) else "" chip_reads = sum([x.aligned for x in bam.idxstats(chip_bam, data)]) if chip_reads == 0: logger.error( f"{chip_bam} has 0 reads. Please remove the sample and re-run") raise RuntimeWarning( f"macs2 terminated - no reads in {chip_bam}. Please remove the sample and re-run" ) with utils.chdir(out_dir): cmd = _macs2_cmd(data) cmd += peaksettings try: do.run(cmd.format(**locals()), "macs2 for %s" % name) utils.move_safe(macs2_file, out_file) except subprocess.CalledProcessError: raise RuntimeWarning( "macs2 terminated with an error. " "Please check the message and report the error if it is related to bcbio. " "You can add specific options for the sample " "by setting resources as explained in the docs: " "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources" ) _compress_and_sort_bdg_files(out_dir, data) return _get_output_files(out_dir)
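# The effective genome size is only added when the user has not supplied their own -g in
# the macs2 resource options. A standalone sketch of that decision with an assumed total
# genome length.
def genome_size_flag(options, total_sequence_length):
    return "" if options.find("-g") > -1 else "-g %s" % total_sequence_length

if __name__ == "__main__":
    print(genome_size_flag("--nomodel", 3099922541))
    print(genome_size_flag("-g hs --nomodel", 3099922541))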
def _run_purple(paired, het_file, depth_file, vrn_files, work_dir): """Run PURPLE with pre-calculated AMBER and COBALT compatible inputs. """ purple_dir = utils.safe_makedir(os.path.join(work_dir, "purple")) out_file = os.path.join( purple_dir, "%s.purple.cnv" % dd.get_sample_name(paired.tumor_data)) if not utils.file_exists(out_file): with file_transaction(paired.tumor_data, out_file) as tx_out_file: cmd = ["PURPLE"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \ ["-amber", os.path.dirname(het_file), "-baf", het_file, "-cobalt", os.path.dirname(depth_file), "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"], "-output_dir", os.path.dirname(tx_out_file), "-ref_genome", "hg38" if dd.get_genome_build(paired.tumor_data) == "hg38" else "hg19", "-run_dir", work_dir, "-threads", dd.get_num_cores(paired.tumor_data), "-tumor_sample", dd.get_sample_name(paired.tumor_data), "-ref_sample", dd.get_sample_name(paired.normal_data)] if vrn_files: cmd += ["-somatic_vcf", vrn_files[0]["vrn_file"]] # Avoid X11 display errors when writing plots cmd = "unset DISPLAY && %s" % " ".join([str(x) for x in cmd]) do.run(cmd, "PURPLE: purity and ploidy estimation") for f in os.listdir(os.path.dirname(tx_out_file)): if f != os.path.basename(tx_out_file): shutil.move(os.path.join(os.path.dirname(tx_out_file), f), os.path.join(purple_dir, f)) out_file_export = os.path.join( purple_dir, "%s-purple-cnv.tsv" % (dd.get_sample_name(paired.tumor_data))) if not utils.file_exists(out_file_export): utils.symlink_plus(out_file, out_file_export) out = { "variantcaller": "purple", "call_file": out_file_export, "vrn_file": titancna.to_vcf(out_file_export, "PURPLE", _get_header, _export_to_vcf, paired.tumor_data), "plot": {}, "metrics": {} } for name, ext in [("copy_number", "copyNumber"), ("minor_allele", "minor_allele"), ("variant", "variant")]: plot_file = os.path.join( purple_dir, "plot", "%s.%s.png" % (dd.get_sample_name(paired.tumor_data), ext)) if os.path.exists(plot_file): out["plot"][name] = plot_file purity_file = os.path.join( purple_dir, "%s.purple.purity" % dd.get_sample_name(paired.tumor_data)) with open(purity_file) as in_handle: header = in_handle.readline().replace("#", "").split("\t") vals = in_handle.readline().split("\t") for h, v in zip(header, vals): try: v = float(v) except ValueError: pass out["metrics"][h] = v return out
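# The PURPLE purity table is a two-line, tab-separated file: a commented header followed
# by the values. A standalone sketch of the parsing used above, with hypothetical column
# names and values.
def parse_purity(header_line, value_line):
    header = header_line.replace("#", "").rstrip("\n").split("\t")
    vals = value_line.rstrip("\n").split("\t")
    out = {}
    for h, v in zip(header, vals):
        try:
            v = float(v)
        except ValueError:
            pass
        out[h] = v
    return out

if __name__ == "__main__":
    print(parse_purity("#purity\tploidy\tstatus\n", "0.98\t2.01\tNORMAL\n"))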
def align(fastq_file, pair_file, ref_file, names, align_dir, data): max_hits = 10 srna = True if data["analysis"].lower().startswith( "smallrna-seq") else False srna_opts = "" if srna: max_hits = 1000 srna_opts = "--alignIntronMax 1" config = data["config"] out_prefix = os.path.join(align_dir, dd.get_lane(data)) out_file = out_prefix + "Aligned.out.sam" out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data)) if not ref_file: logger.error( "STAR index not found. We don't provide the STAR indexes " "by default because they are very large. You can install " "the index for your genome with: bcbio_nextgen.py upgrade " "--aligners star --genomes genome-build-name --data") sys.exit(1) final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"])) if file_exists(final_out): data = _update_data(final_out, out_dir, names, data) return data star_path = config_utils.get_program("STAR", config) fastq_files = " ".join([fastq_file, pair_file ]) if pair_file else fastq_file num_cores = dd.get_num_cores(data) gtf_file = dd.get_gtf_file(data) safe_makedir(align_dir) cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} " "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} " "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} " "--outStd SAM {srna_opts} " "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS)) cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else "" cmd += _read_group_option(names) fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False) if fusion_mode: cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 " "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 " "--chimScoreSeparation 5 " "--chimOutType WithinSAM ") strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"), "unstranded").lower() if strandedness == "unstranded" and not srna: cmd += " --outSAMstrandField intronMotif " if not srna: cmd += " --quantMode TranscriptomeSAM " with file_transaction(data, final_out) as tx_final_out: cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out) run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file) do.run(cmd.format(**locals()), run_message, None) data = _update_data(final_out, out_dir, names, data) return data