def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    qual_format = config["algorithm"].get("quality_format", "").lower()
    qual_flag = "ILMFQ" if qual_format == "illumina" else "STDFQ"
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -F {qual_flag} -c {num_cores} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                log_cmd("Novoalign: %s" % names["sample"], None, cmd)
                subprocess.check_call(cmd, shell=True)
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samblaster = config_utils.get_program("samblaster", data["config"])
    sambamba = config_utils.get_program("sambamba", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file):
        with file_transaction(sr_file) as tx_sr_file:
            with file_transaction(disc_file) as tx_disc_file:
                with utils.curdir_tmpdir() as tmpdir:
                    tobam_cmd = ("{sambamba} view -S -f bam -l 0 /dev/stdin | "
                                 "{sambamba} sort -t {cores} -m {mem} --tmpdir {tmpdir} "
                                 "-o {out_file} /dev/stdin")
                    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, **locals())
                    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, **locals())
                    cmd = ("{sambamba} sort -t {cores} -m {mem} --tmpdir={tmpdir} "
                           "-n -o /dev/stdout -l 0 {in_bam} | "
                           "{sambamba} view -h /dev/stdin | "
                           "{samblaster} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
                           "-o /dev/null")
                    do.run(cmd.format(**locals()), "samblaster: split and discordant reads", data)
    return sr_file, disc_file
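# The samblaster command above relies on bash process substitution (">(...)"),
# which plain /bin/sh does not support. A minimal sketch of how such a command
# can be launched from Python, assuming do.run ultimately defers to subprocess;
# the helper name run_bash below is hypothetical, not part of the codebase.
import subprocess

def run_bash(cmd):
    # executable="/bin/bash" is required so that >(...) substitutions are
    # parsed; with the default shell this raises a syntax error on many systems
    subprocess.check_call(cmd, shell=True, executable="/bin/bash")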
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = ("{bamtools} merge -list {bam_file_list} | "
                           "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Deduplicate and sort with samblaster, produces split read and discordant pair files.
    """
    samblaster = config_utils.get_program("samblaster", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    tobam_cmd = ("{samtools} sort {sort_opt} -@ {cores} -m {mem} -T {tmp_prefix}-{dext} {out_file} -")
    # full BAM -- associate more memory and cores
    cores, mem = _get_cores_memory(data, downscale=2)
    # Potentially downsample to maximum coverage here if not splitting and whole genome sample
    ds_cmd = None if data.get("align_split") else bam.get_maxcov_downsample_cl(data, "samtools")
    sort_opt = "-n" if data.get("align_split") and dd.get_mark_duplicates(data) else ""
    if ds_cmd:
        dedup_cmd = "%s %s > %s" % (tobam_cmd.format(out_file="", dext="full", **locals()), ds_cmd, tx_out_file)
    else:
        dedup_cmd = tobam_cmd.format(out_file="-o %s" % tx_out_file, dext="full", **locals())
    # split and discordant BAMs -- give less memory/cores since smaller files
    sort_opt = ""
    cores, mem = _get_cores_memory(data, downscale=4)
    splitter_cmd = tobam_cmd.format(out_file="-o %s" % tx_sr_file, dext="spl", **locals())
    discordant_cmd = tobam_cmd.format(out_file="-o %s" % tx_disc_file, dext="disc", **locals())
    # samblaster 0.1.22 and better require the -M flag for compatibility with bwa-mem
    cmd = ("{samblaster} --addMateTags -M --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
           "| {dedup_cmd}")
    return cmd.format(**locals())
def _piped_dedup_recal_cmd(data, prep_params, tmp_dir, out_file):
    """Generate de-duplication and recalibration commandline.
    """
    if prep_params["dup"] == "bamutil":
        assert prep_params["recal"] in ["bamutil", False], \
            ("Cannot handle recalibration approach %s with bamutil dedup" % prep_params["recal"])
        out_stream = "-.ubam" if prep_params["realign"] else "-.bam"
        return "| " + recalibrate.bamutil_dedup_recal_cl("-.ubam", out_stream, data,
                                                         prep_params["recal"] == "bamutil")
    elif prep_params["dup"] == "samtools":
        samtools = config_utils.get_program("samtools", data["config"])
        return "| " + "{samtools} rmdup - -".format(**locals())
    elif prep_params["dup"] == "biobambam":
        biobambam_md = config_utils.get_program("bammarkduplicates2", data["config"])
        num_cores = 1
        compression_level = 1 if prep_params.get("realign") else 9
        tmpfile = os.path.join(tmp_dir, "%s-md" % os.path.splitext(os.path.basename(out_file))[0])
        metrics_file = "%s-dupmetrics.txt" % (os.path.splitext(out_file)[0])
        return ("| {biobambam_md} level={compression_level} markthreads={num_cores} verbose=0 "
                "M={metrics_file} tmpfile={tmpfile}".format(**locals()))
    elif prep_params["dup"]:
        raise ValueError("Unexpected deduplication approach: %s" % prep_params["dup"])
    else:
        return ""
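# Usage sketch for the dispatcher above. prep_params mirrors the keys the
# function reads ("dup", "recal", "realign"); the values, tmp dir and output
# name are illustrative, and data is the usual bcbio sample dictionary.
prep_params = {"dup": "biobambam", "recal": False, "realign": False}
dedup_segment = _piped_dedup_recal_cmd(data, prep_params, "/tmp", "out.bam")
# -> "| .../bammarkduplicates2 level=9 markthreads=1 verbose=0 M=out-dupmetrics.txt tmpfile=/tmp/out-md"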
def index(in_bam, config):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(index_file) and not utils.file_exists(alt_index_file):
        try:
            sambamba = config_utils.get_program("sambamba", config)
        except config_utils.CmdNotFound:
            sambamba = None
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(index_file) as tx_index_file:
            samtools_cmd = "{samtools} index {in_bam} {tx_index_file}"
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {in_bam} {tx_index_file}"
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam),
                       log_error=False)
            except:
                do.run(samtools_cmd.format(**locals()),
                       "Index BAM file (single core): %s" % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, handling sorting of input/output with novosort.

    Uses unix pipes to avoid IO writing between steps:
     - novosort of input BAM by read name
     - alignment with novoalign
     - conversion to BAM with samtools
     - coordinate sorting with novosort
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novosort = config_utils.get_program("novosort", config)
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                rg_info = get_rg_info(names)
                cmd = ("{novosort} -c {num_cores} -m {max_mem} --compression 0 "
                       " -n -t {work_dir} {in_bam} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {novosort} -c {num_cores} -m {max_mem} -t {work_dir} "
                       " -o {tx_out_file} /dev/stdin")
                cmd = cmd.format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        samtools = config_utils.get_program("samtools", config)
        sambamba = config_utils.get_program("sambamba", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            try:
                cmd = "{sambamba} index -t {num_cores} {in_bam} {tx_index_file}"
                do.run(cmd.format(**locals()),
                       "Index BAM file with sambamba: %s" % os.path.basename(in_bam))
            except subprocess.CalledProcessError:
                cmd = "{samtools} index {in_bam} {tx_index_file}"
                do.run(cmd.format(**locals()),
                       "Backup single thread index of BAM file with samtools: %s" % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file
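# index() only reuses an existing .bai that is at least as new as its BAM. A
# minimal sketch of that freshness test, assuming utils.file_uptodate compares
# modification times; the real helper may add further checks.
import os

def file_uptodate(fname, cmp_fname):
    # an index is trustworthy only if it exists and postdates the data file
    return (os.path.exists(fname) and os.path.exists(cmp_fname)
            and os.path.getmtime(fname) >= os.path.getmtime(cmp_fname))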
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc.
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33." % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, data, in_qual="fastq-illumina")
        if pair_file:
            pair_file = fastq.groom(pair_file, data, in_qual="fastq-illumina")
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    samtools = config_utils.get_program("samtools", data["config"])
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} ")
    with file_transaction(data, out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file, name_sort=True)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        novoalign.check_samtools_version(config)
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                       "{fastq_file} {pair_file} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def merge(bamfiles, out_bam, config):
    assert all(map(is_bam, bamfiles)), ("Not all of the files to merge are BAM "
                                        "files: %s " % (bamfiles))
    assert all(map(utils.file_exists, bamfiles)), ("Not all of the files to merge "
                                                   "exist: %s" % (bamfiles))
    if len(bamfiles) == 1:
        return bamfiles[0]
    if os.path.exists(out_bam):
        return out_bam
    sambamba = _get_sambamba(config)
    # sambamba merge is disabled here; fall through to samtools/bamtools below
    sambamba = None
    samtools = config_utils.get_program("samtools", config)
    bamtools = config_utils.get_program("bamtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    with file_transaction(config, out_bam) as tx_out_bam:
        try:
            if sambamba:
                cmd = "{sambamba} merge -t {num_cores} {tx_out_bam} " + " ".join(bamfiles)
            else:
                cmd = "{samtools} merge -@ {num_cores} {tx_out_bam} " + " ".join(bamfiles)
            do.run(cmd.format(**locals()), "Merge %s into %s." % (bamfiles, out_bam))
        except subprocess.CalledProcessError:
            files = " -in ".join(bamfiles)
            cmd = "{bamtools} merge -in {files} -out {tx_out_bam}"
            do.run(cmd.format(**locals()),
                   "Error with other tools. Merge %s into %s with bamtools" % (bamfiles, out_bam))
    index(out_bam, config)
    return out_bam
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")
            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            opts += " -f {}".format(ref_file)
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl = ("{freebayes} --pooled-discrete --pvar 0.7"
                  " --genotype-qualities {opts} {paired.tumor_bam}"
                  " {paired.normal_bam} | {vcfsamplediff} -s VT"
                  " {paired.normal_name} {paired.tumor_name}"
                  " - {compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file, config)
    return ann_file
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes to avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    if not file_exists(out_file):
        with utils.curdir_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with utils.curdir_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cmd = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {vcfallelicprimitives} | {vcfstreamsort} "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file, config)
    return ann_file
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            # Recommended options from 1000 genomes low-complexity evaluation
            # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
            opts += " --min-repeat-entropy 1 --experimental-gls"
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file, config)
    return ann_file
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing
    extra Version information from VCF header lines.
    """
    config = items[0]["config"]
    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    # avoid shadowing the vcfutils module used below for empty VCF output
    vcfutils_cl = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils_cl} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}")
        logger.info(cmd.format(**locals()))
        do.run(cmd.format(**locals()), "Variant calling with samtools", {})
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing
    extra Version information from VCF header lines.
    """
    config = items[0]["config"]
    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    vcfutils = config_utils.get_program("vcfutils.pl", config)
    cmd = ("{mpileup} "
           "| {bcftools} {bcftools_opts} - "
           "| {vcfutils} varFilter -D {max_read_depth} "
           "| sed 's/,Version=3>/>/'"
           "> {out_file}")
    logger.info(cmd.format(**locals()))
    do.run(cmd.format(**locals()), "Variant calling with samtools", {})
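# Illustration of the header repair done by the sed expression above: affected
# samtools releases wrote FORMAT header lines with a trailing ",Version=3"
# attribute that GATK's VCF parser rejects. The header line below is
# representative only, not copied from a real file.
line = '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype likelihoods",Version=3>'
fixed = line.replace(",Version=3>", ">")  # same rewrite as: sed 's/,Version=3>/>/'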
def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Deduplicate and sort with samblaster, produces split read and discordant pair files.
    """
    samblaster = config_utils.get_program("samblaster", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=3)
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    for ext in ["spl", "disc", "full"]:
        utils.safe_makedir("%s-%s" % (tmp_prefix, ext))
    if data.get("align_split"):
        full_tobam_cmd = _nosort_tobam_cmd()
    else:
        full_tobam_cmd = ("samtools view -b -u - | "
                          "sambamba sort -t {cores} -m {mem} "
                          "--tmpdir {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    tobam_cmd = ("{samtools} sort -@ {cores} -m {mem} "
                 "-T {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    # samblaster 0.1.22 and better require the -M flag for compatibility with bwa-mem
    # https://github.com/GregoryFaust/samblaster/releases/tag/v.0.1.22
    if LooseVersion(programs.get_version_manifest("samblaster", data=data, required=True)) >= LooseVersion("0.1.22"):
        opts = "-M"
    else:
        opts = ""
    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, dext="spl", **locals())
    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, dext="disc", **locals())
    dedup_cmd = full_tobam_cmd.format(out_file=tx_out_file, dext="full", **locals())
    cmd = ("{samblaster} {opts} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
           "| {dedup_cmd}")
    return cmd.format(**locals())
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        check_samtools_version()
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "768M")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        _check_samtools_version()
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def _calc_regional_coverage(in_bam, chrom, start, end, samplename, work_dir, data):
    """Given a BAM and a region, calculate the coverage for each base in that region.

    Returns a pandas dataframe of the format:

        chrom position coverage name

    where the samplename column is the coverage at chrom:position.
    """
    region_bt = pybedtools.BedTool("%s\t%s\t%s\n" % (chrom, start, end), from_string=True).saveas()
    region_file = region_bt.fn
    coords = "%s:%s-%s" % (chrom, start, end)
    tx_tmp_file = os.path.join(work_dir, "coverage-%s-%s.txt" % (samplename, coords.replace(":", "_")))
    samtools = config_utils.get_program("samtools", data)
    bedtools = config_utils.get_program("bedtools", data)
    cmd = ("{samtools} view -b {in_bam} {coords} | "
           "{bedtools} coverage -a {region_file} -b - -d > {tx_tmp_file}")
    do.run(cmd.format(**locals()), "Plotting coverage for %s %s" % (samplename, coords))
    names = ["chrom", "start", "end", "offset", "coverage"]
    df = pd.io.parsers.read_table(tx_tmp_file, sep="\t", header=None, names=names).dropna()
    os.remove(tx_tmp_file)
    df["sample"] = samplename
    # the parsed chrom column is overwritten with the requested chromosome and
    # per-base positions derived from the region start plus the 1-based offset
    df["chrom"] = chrom
    df["position"] = df["start"] + df["offset"] - 1
    return df[["chrom", "position", "coverage", "sample"]]
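# A small worked example of the position arithmetic above: bedtools
# "coverage -d" reports a 1-based offset within the queried interval, so for a
# region starting at 100 the first covered base maps back to position 100.
start, offsets = 100, [1, 2, 3]
positions = [start + o - 1 for o in offsets]  # -> [100, 101, 102]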
def illumina_qual_bin(in_file, ref_file, out_dir, config):
    """Uses CRAM to perform Illumina 8-bin approaches to existing BAM files.

    Bins quality scores according to Illumina scheme:
    http://www.illumina.com/Documents/products/whitepapers/whitepaper_datacompression.pdf

    Also fixes output header to remove extra run groups added by CRAM during conversion.
    """
    index_file = ref_file + ".fai"
    assert os.path.exists(index_file), "Could not find FASTA reference index: %s" % index_file
    out_file = os.path.join(out_dir, "%s-qualbin%s" % os.path.splitext(os.path.basename(in_file)))
    cram_jar = config_utils.get_jar("cramtools", config_utils.get_program("cram", config, "dir"))
    samtools = config_utils.get_program("samtools", config)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            orig_header = "%s-header.sam" % os.path.splitext(out_file)[0]
            header_cmd = "{samtools} view -H -o {orig_header} {in_file}"
            cmd = ("java -jar {cram_jar} cram --input-bam-file {in_file} "
                   " --reference-fasta-file {ref_file} --preserve-read-names "
                   " --capture-all-tags --lossy-quality-score-spec '*8' "
                   "| java -jar {cram_jar} bam --output-bam-format "
                   " --reference-fasta-file {ref_file} "
                   "| {samtools} reheader {orig_header} - "
                   "> {tx_out_file}")
            logger.info("Quality binning with CRAM")
            subprocess.check_call(header_cmd.format(**locals()), shell=True)
            subprocess.check_call(cmd.format(**locals()), shell=True)
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, handling sorting of input/output with novosort.

    Uses unix pipes to avoid IO writing between steps:
     - novosort of input BAM by read name
     - alignment with novoalign
     - conversion to BAM with samtools
     - coordinate sorting with novosort
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novosort = config_utils.get_program("novosort", config)
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")
    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                rg_info = r"@RG\tID:{rg}\tPL:{pl}\tPU:{pu}\tSM:{sample}".format(**names)
                cmd = ("{novosort} -c {num_cores} -m {max_mem} --compression 0 "
                       " -n -t {work_dir} {in_bam} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} "
                       "| {samtools} view -b -S -u - "
                       "| {novosort} -c {num_cores} -m {max_mem} -t {work_dir} "
                       " -o {tx_out_file} /dev/stdin")
                subprocess.check_call(cmd.format(**locals()), shell=True)
    return out_file
def compress(in_bam, data):
    """Compress a BAM file to CRAM, providing indexed CRAM file.

    Does 8 bin compression of quality score and read name removal
    using bamUtils squeeze:
    http://genome.sph.umich.edu/wiki/BamUtil:_squeeze
    """
    out_file = "%s.cram" % os.path.splitext(in_bam)[0]
    cores = dd.get_num_cores(data)
    ref_file = dd.get_ref_file(data)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            samtools = config_utils.get_program("samtools", data["config"])
            bam = config_utils.get_program("bam", data["config"])
            to_cram = ("{samtools} view -T {ref_file} -@ {cores} "
                       "-C -x BD -x BI -o {tx_out_file}")
            try:
                cmd = ("{bam} squeeze --in {in_bam} --out -.ubam --keepDups "
                       "--binQualS=2,10,20,25,30,35,70 --binMid | " + to_cram)
                do.run(cmd.format(**locals()), "Compress BAM to CRAM")
            # Retry failures avoiding using bam squeeze which can cause issues
            except subprocess.CalledProcessError:
                cmd = (to_cram + " {in_bam}")
                do.run(cmd.format(**locals()), "Compress BAM to CRAM")
    index(out_file, data["config"])
    return out_file
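# Sketch of the quality binning requested from bamUtil squeeze above, assuming
# --binQualS lists the lower bound of each bin and --binMid substitutes the bin
# midpoint for every score in the bin; consult the bamUtil docs for the
# authoritative semantics. MAX_QUAL is an assumed ceiling for the top bin.
import bisect

BIN_STARTS = [2, 10, 20, 25, 30, 35, 70]
MAX_QUAL = 93

def bin_quality(q):
    # locate the bin whose lower bound is <= q, then report its midpoint
    i = max(bisect.bisect_right(BIN_STARTS, q) - 1, 0)
    hi = BIN_STARTS[i + 1] - 1 if i + 1 < len(BIN_STARTS) else MAX_QUAL
    return (BIN_STARTS[i] + hi) // 2  # e.g. bin_quality(22) -> 22, bin_quality(31) -> 32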
def _get_bgzip_cmd(config):
    """Retrieve command to use for bgzip, trying to use parallel pbgzip if available.
    """
    try:
        pbgzip = config_utils.get_program("pbgzip", config)
        return "%s -n %s " % (pbgzip, config["algorithm"].get("num_cores", 1))
    except config_utils.CmdNotFound:
        return config_utils.get_program("bgzip", config)
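# Minimal usage sketch: the returned string is meant to be spliced into a shell
# pipeline. The config dict and file names here are illustrative only.
config = {"algorithm": {"num_cores": 8}}
bgzip_cmd = _get_bgzip_cmd(config)  # "pbgzip -n 8 " if installed, else path to bgzip
cmd = "sort -k1,1 -k2,2n variants.vcf | %s > variants.vcf.gz" % bgzip_cmd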
def is_installed(config):
    """Check for qsnp installation on machine.
    """
    try:
        config_utils.get_program("qsnp", config)
        return True
    except config_utils.CmdNotFound:
        return False
def is_installed(config):
    """Check for scalpel installation on machine.
    """
    try:
        config_utils.get_program("scalpel-discovery", config)
        return True
    except config_utils.CmdNotFound:
        return False
def _run_gemini_stats(bam_file, data, out_dir):
    """Retrieve high level variant statistics from Gemini.
    """
    out = {}
    gemini_dbs = [d for d in
                  [tz.get_in(["population", "db"], x) for x in data.get("variants", [])] if d]
    if len(gemini_dbs) > 0:
        gemini_db = gemini_dbs[0]
        gemini_stat_file = "%s-stats.yaml" % os.path.splitext(gemini_db)[0]
        if not utils.file_uptodate(gemini_stat_file, gemini_db):
            gemini = config_utils.get_program("gemini", data["config"])
            tstv = subprocess.check_output([gemini, "stats", "--tstv", gemini_db])
            gt_counts = subprocess.check_output([gemini, "stats", "--gts-by-sample", gemini_db])
            dbsnp_count = subprocess.check_output([gemini, "query", gemini_db, "-q",
                                                   "SELECT count(*) FROM variants WHERE in_dbsnp==1"])
            out["Transition/Transversion"] = tstv.split("\n")[1].split()[-1]
            for line in gt_counts.split("\n"):
                parts = line.rstrip().split()
                if len(parts) > 0 and parts[0] != "sample":
                    name, hom_ref, het, hom_var, _, total = parts
                    out[name] = {}
                    out[name]["Variations (heterozygous)"] = int(het)
                    out[name]["Variations (homozygous)"] = int(hom_var)
                    # same total variations for all samples, keep that top level as well.
                    out["Variations (total)"] = int(total)
            out["Variations (in dbSNP)"] = int(dbsnp_count.strip())
            if out.get("Variations (total)") > 0:
                out["Variations (in dbSNP) pct"] = "%.1f%%" % (out["Variations (in dbSNP)"] /
                                                               float(out["Variations (total)"]) * 100.0)
            with open(gemini_stat_file, "w") as out_handle:
                yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
        else:
            with open(gemini_stat_file) as in_handle:
                out = yaml.safe_load(in_handle)
    else:
        vcf_file = dd.get_vrn_file(data)
        if isinstance(vcf_file, list):
            vcf_file = vcf_file[0]
        if vcf_file:
            out_file = "%s-bcfstats.tsv" % utils.splitext_plus(vcf_file)[0]
            bcftools = config_utils.get_program("bcftools", data["config"])
            if not utils.file_exists(out_file):
                cmd = ("{bcftools} stats -f PASS {vcf_file} > {out_file}")
                do.run(cmd.format(**locals()), "basic vcf stats %s" % data["name"][-1])
            with open(out_file) as in_handle:
                for line in in_handle:
                    if line.startswith("SN") and line.find("records") > -1:
                        cols = line.split()
                        out["Variations (total)"] = cols[-1]
    res = {}
    for k, v in out.iteritems():
        if not isinstance(v, dict):
            res.update({k: v})
        if k == data["name"][-1]:
            res.update(v)
    return res
def umi_transform(data):
    """Transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name.
    """
    fq1 = data["files"][0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if not transform:
        logger.info("No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info("%s detected as pre-transformed, passing it on unchanged." % fq1)
            data["files"] = [fq1]
            return data
        else:
            logger.error("No UMI transform was specified, but %s does not look "
                         "pre-transformed. Assuming non-umi data." % fq1)
            return data
    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s." % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return data
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return data
    cmd = ("{umis} fastqtransform {transform_file} "
           "--cores {cores} "
           "{fq1} "
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = "Inserting UMI and barcode information into the read name of %s" % fq1
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return data
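# Sketch of the "already transformed" check above: umis fastqtransform embeds
# the cell barcode and UMI into the read name, so the first header line of a
# transformed FASTQ contains a "UMI_" tag. The read name below is illustrative,
# not a real record.
read = "@HWI-ST808:130:H0B8YADXX:1:1101:2088:2222:UMI_ACGTACGT\n"
assert "UMI_" in read  # the same substring test umi_transform performs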
def run(bam_file, data, out_dir):
    """Run viral QC analysis:

    1. Extract the unmapped reads
    2. BWA-MEM to the viral sequences from GDC database
       https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files
    3. Report viruses that are more than 50% covered at 5x or higher
    """
    source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files'
    viral_target = "gdc-viral"
    out = {}
    viral_refs = [x for x in dd.get_viral_files(data)
                  if os.path.basename(x) == "%s.fa" % viral_target]
    if viral_refs and utils.file_exists(viral_refs[0]):
        viral_ref = viral_refs[0]
        viral_bam = os.path.join(utils.safe_makedir(out_dir),
                                 "%s-%s.bam" % (dd.get_sample_name(data),
                                                utils.splitext_plus(os.path.basename(viral_ref))[0]))
        out_file = "%s-completeness.txt" % utils.splitext_plus(viral_bam)[0]
        cores = dd.get_num_cores(data)
        samtools = config_utils.get_program("samtools", data["config"])
        bamtofastq = config_utils.get_program("bamtofastq", data["config"])
        bamsort = config_utils.get_program("bamsort", data["config"])
        if not utils.file_uptodate(out_file, bam_file):
            if not utils.file_uptodate(viral_bam, bam_file):
                with file_transaction(data, viral_bam) as tx_out_file:
                    tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                    tmpbam = "%s-tmpbam" % utils.splitext_plus(tx_out_file)[0]
                    # the weirdest bug: in bcbio 1.2.9a ipython runs (only ipython,
                    # not multicore) fail after this step with a bgzf error, see
                    # issue 3581. What helps is to samtools view the file to
                    # restore the proper EOF.
                    cmd = (f"{samtools} view -u -f 4 {bam_file} | "
                           f"{bamtofastq} collate=0 | "
                           f"bwa mem -t {cores} {viral_ref} - | "
                           f"{bamsort} tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                           f"inputformat=sam index=1 indexfilename={tmpbam}.bai O={tmpbam}.bam && "
                           f"{samtools} view -bh {tmpbam}.bam > {tx_out_file} && "
                           f"{samtools} index {tx_out_file}")
                    do.run(cmd, "Align unmapped reads to viral genome")
            total_reads = _count_reads(bam_file, data)
            assert total_reads > 0, \
                'Reads count is {total_reads}, is there a bug in counting the read count?\n{bam_file}'.format(**locals())
            with file_transaction(data, out_file) as tx_out_file:
                sample_name = dd.get_sample_name(data)
                mosdepth_prefix = os.path.splitext(viral_bam)[0]
                mosdepth = config_utils.get_program("mosdepth", data)
                cmd = ("{mosdepth} -t {cores} {mosdepth_prefix} {viral_bam} -n --thresholds 1,5,25 --by "
                       "<(awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}' {viral_ref}.fai) && "
                       "echo '## Viral sequences (from {source_link}) found in unmapped reads' > {tx_out_file} && "
                       "echo '## Sample: {sample_name}' >> {tx_out_file} && "
                       "echo '#virus\tsize\tdepth\t1x\t5x\t25x\treads\treads_pct' >> {tx_out_file} && "
                       "paste "
                       "<(zcat {mosdepth_prefix}.regions.bed.gz) "
                       "<(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) "
                       "<(samtools idxstats {viral_bam} | grep -v '*') | "
                       "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3 FS $15 FS $15/{total_reads}}}' | "
                       "sort -n -r -k 5,5 >> {tx_out_file}")
                do.run(cmd.format(**locals()), "Analyse coverage of viral genomes")
                if chromhacks.get_EBV(data):
                    ref_file = dd.get_ref_file(data)
                    work_bam = dd.get_work_bam(data)
                    ebv = chromhacks.get_EBV(data)
                    mosdepth_prefix = os.path.splitext(work_bam)[0] + "-EBV"
                    mosdepth = config_utils.get_program("mosdepth", data)
                    cmd = ("{mosdepth} -t {cores} {mosdepth_prefix} {work_bam} -n --thresholds 1,5,25 --by "
                           "<(grep {ebv} {ref_file}.fai | awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}') && "
                           "paste "
                           "<(zcat {mosdepth_prefix}.regions.bed.gz) "
                           "<(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) "
                           "<(samtools idxstats {work_bam} | grep {ebv}) | "
                           "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3 FS $15 FS $15/{total_reads}}}' | "
                           "sort -n -r -k 5,5 >> {tx_out_file}")
                    do.run(cmd.format(**locals()), "Analyse coverage of EBV")
        out["base"] = out_file
        out["secondary"] = []
    return out
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".
    """
    max_read_depth = "1000"
    config = items[0]["config"]
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError("Please install version 2.3.6 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    tumor_bam, tumor_name, normal_bam, normal_name = get_paired_bams(align_bams, items)
    if not file_exists(out_file):
        base, ext = os.path.splitext(out_file)
        cleanup_files = []
        for fname, mpext in [(normal_bam, "normal"), (tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file, max_read_depth, config,
                                                target_regions=target_regions, want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup", None, [do.file_exists(mpfile_tx)])
        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(out_file)
            return
        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]
        jvm_opts = _get_varscan_opts(config)
        varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98")
        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)
        to_combine = []
        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            varscan_cmd = varscan_cmd.format(**locals())
            do.run(varscan_cmd, "Varscan", None, None)
        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, normal_name, tumor_name)
        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, normal_name, tumor_name)
        if not to_combine:
            write_empty_vcf(out_file)
            return
        out_file = combine_variant_files([snp_file, indel_file],
                                         out_file, ref_file, config,
                                         region=target_regions)
        # Remove cleanup files
        for extra_file in cleanup_files:
            os.remove(extra_file)
        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)
    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)
    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
               "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
               "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
               "--outStd BAM_Unsorted {srna_opts} "
               "--limitOutSJcollapsed 2000000 "
               "--outSAMtype BAM Unsorted "
               "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
        cmd += _read_group_option(names)
        if _should_run_fusion(data):
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 "
                    "--chimOutType WithinSAM ")
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)
    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcffilter = config_utils.get_program("vcffilter", config)
            scalpel = config_utils.get_program("scalpel", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            tmp_path = os.path.dirname(tx_out_file)
            opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
            opts += " --ref {}".format(ref_file)
            opts += " --dir %s" % tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            cl = ("{scalpel} --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # somatic
            scalpel_tmp_file = bgzip_and_index(
                os.path.join(tmp_path, "main/somatic." + min_cov + "x.indel.vcf"), config)
            # common
            scalpel_tmp_file_common = bgzip_and_index(
                os.path.join(tmp_path, "main/common." + min_cov + "x.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " sed 's/sample_name/{paired.tumor_name}/g' | "
                   "{vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file, config)
    return ann_file
def run(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with
    large files, unless we're running a Standard/smallRNA-seq/QC pipeline.

    Handles fastqc 0.11+, which use a single HTML file and older versions that
    use a directory of files + images. The goal is to eventually move to only 0.11+
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_file = (bam.downsample(bam_file, data, 1e7, work_dir=work_dir)
                   if data.get("analysis", "").lower() not in ["standard", "smallrna-seq"]
                   else None)
        if ds_file is not None:
            bam_file = ds_file
        frmt = "bam" if bam_file.endswith("bam") else "fastq"
        fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0]
        fastqc_clean_name = dd.get_sample_name(data)
        # FastQC scales memory with threads (250mb per thread) so we avoid
        # very low memory usage
        num_cores = max(data["config"]["algorithm"].get("num_cores", 1), 2)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-d", tx_tmp_dir, "-t", str(num_cores), "--extract",
                      "-o", tx_tmp_dir, "-f", frmt, bam_file]
                cl = "%s %s %s" % (utils.java_freetype_fix(), utils.local_path_export(),
                                   " ".join([str(x) for x in cl]))
                do.run(cl, "FastQC: %s" % dd.get_sample_name(data))
                tx_fastqc_out = os.path.join(tx_tmp_dir, "%s_fastqc" % fastqc_name)
                tx_combo_file = os.path.join(tx_tmp_dir, "%s_fastqc.html" % fastqc_name)
                if not os.path.exists(sentry_file) and os.path.exists(tx_combo_file):
                    utils.safe_makedir(fastqc_out)
                    # Use sample name for reports instead of bam file name
                    with open(os.path.join(tx_fastqc_out, "fastqc_data.txt"), 'r') as fastqc_bam_name, \
                         open(os.path.join(tx_fastqc_out, "_fastqc_data.txt"), 'w') as fastqc_sample_name:
                        for line in fastqc_bam_name:
                            fastqc_sample_name.write(line.replace(os.path.basename(bam_file), fastqc_clean_name))
                    shutil.move(os.path.join(tx_fastqc_out, "_fastqc_data.txt"),
                                os.path.join(fastqc_out, 'fastqc_data.txt'))
                    shutil.move(tx_combo_file, sentry_file)
                    if os.path.exists("%s.zip" % tx_fastqc_out):
                        shutil.move("%s.zip" % tx_fastqc_out,
                                    os.path.join(fastqc_out, "%s.zip" % fastqc_clean_name))
                elif not os.path.exists(sentry_file):
                    raise ValueError("FastQC failed to produce output HTML file: %s" % os.listdir(tx_tmp_dir))
    logger.info("Produced HTML report %s" % sentry_file)
    parser = FastQCParser(fastqc_out, dd.get_sample_name(data))
    stats = parser.get_fastqc_summary()
    parser.save_sections_into_file()
    return stats
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(items[0]),
                                                   region, out_file, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = dd.get_variantcaller(items[0])
                vardict = "vardict-java" if not vardict.endswith("-perl") else "vardict"
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(_vardict_options_from_config(items, config, out_file, target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                remove_dup = vcfutils.remove_dup_cl()
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias} "
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
def _run_bwa_align(fastq_file, ref_file, out_file, config): aln_cl = [config_utils.get_program("bwa", config), "aln", "-n 2", "-k 2"] aln_cl += _bwa_args_from_config(config) aln_cl += [ref_file, fastq_file] cmd = "{cl} > {out_file}".format(cl=" ".join(aln_cl), out_file=out_file) do.run(cmd, "bwa aln: {f}".format(f=os.path.basename(fastq_file)), None)
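# For reference, the command assembled above renders like the following for
# hypothetical inputs; a quick restatement showing how the option list is
# joined into a single shell invocation (all paths here are made up):
def _example_bwa_aln_cmd():
    aln_cl = ["bwa", "aln", "-n 2", "-k 2", "/refs/hg38.fa", "/fastq/sample1.fq.gz"]
    out_file = "/work/sample1.sai"
    return "{cl} > {out_file}".format(cl=" ".join(aln_cl), out_file=out_file)

# _example_bwa_aln_cmd() ->
# 'bwa aln -n 2 -k 2 /refs/hg38.fa /fastq/sample1.fq.gz > /work/sample1.sai'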
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3,
                                         "decrease").upper()
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                if not hla_on(data) or needs_separate_hla(data):
                    bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, "-", with_hla=False)
                else:
                    bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, "-", with_hla=True)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("unset JAVA_HOME && "
                       "{samtools} sort -n -l 1 -@ {num_cores} -m {max_mem} {in_bam} -T {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa_cmd} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file),
                        do.file_reasonable_size(tx_out_file, in_bam)])
    data["work_bam"] = out_file
    # name the HLA BAM alongside the standard output, prefixing the basename
    # rather than the full path
    hla_file = os.path.join(os.path.dirname(out_file), "HLA-%s" % os.path.basename(out_file))
    if needs_separate_hla(data) and not utils.file_exists(hla_file):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, hla_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                bwa_cmd = _get_bwa_mem_cmd(data, hla_file, ref_file, "-", with_hla=True)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("unset JAVA_HOME && "
                       "{samtools} sort -n -l 1 -@ {num_cores} -m {max_mem} {in_bam} -T {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa_cmd} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem HLA alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file),
                        do.file_reasonable_size(tx_out_file, in_bam)])
        data["hla_bam"] = hla_file
    return data
def merge_bam_files(bam_files, work_dir, data, out_file=None, batch=None): """Merge multiple BAM files from a sample into a single BAM for processing. Checks system open file limit and merges in batches if necessary to avoid file handle limits. """ out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch) if not utils.file_exists(out_file): if len(bam_files) == 1 and bam.bam_already_sorted( bam_files[0], data["config"], "coordinate"): with file_transaction(data, out_file) as tx_out_file: _create_merge_filelist(bam_files, tx_out_file, data["config"]) out_file = bam_files[0] samtools = config_utils.get_program("samtools", data["config"]) do.run('{} quickcheck -v {}'.format(samtools, out_file), "Check for valid merged BAM after transfer") else: # sambamba opens 4 handles per file, so try to guess a reasonable batch size batch_size = (system.open_file_limit() // 4) - 100 if len(bam_files) > batch_size: bam_files = [ merge_bam_files(xs, work_dir, data, out_file, i) for i, xs in enumerate( utils.partition_all(batch_size, bam_files)) ] with tx_tmpdir(data) as tmpdir: with utils.chdir(tmpdir): with file_transaction(data, out_file) as tx_out_file: tx_bam_file_list = _create_merge_filelist( bam_files, tx_out_file, data["config"]) sambamba = config_utils.get_program( "sambamba", data["config"]) samtools = config_utils.get_program( "samtools", data["config"]) resources = config_utils.get_resources( "samtools", data["config"]) num_cores = dd.get_num_cores(data) max_mem = config_utils.adjust_memory( resources.get("memory", "1G"), 2, "decrease").upper() if bam.bam_already_sorted(bam_files[0], data["config"], "coordinate"): cmd = _sambamba_merge(bam_files) else: # Aim for 3.5Gb/core memory for BAM merging num_cores = config_utils.adjust_cores_to_mb_target( 3500, resources.get("memory", "2G"), num_cores) assert dd.get_mark_duplicates(data) cmd = _biobambam_merge_dedup_maxcov(data) do.run( cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file), None) do.run( '{} quickcheck -v {}'.format( samtools, tx_out_file), "Check for valid merged BAM") do.run('{} quickcheck -v {}'.format(samtools, out_file), "Check for valid merged BAM after transfer") _finalize_merge(out_file, bam_files, data["config"]) bam.index(out_file, data["config"]) return out_file
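# The batch size guard above derives from the per-process open file limit.
# A standalone sketch of the same calculation using the stdlib resource
# module (bcbio's system.open_file_limit() presumably wraps something
# similar); the "4 handles per file" figure comes from the comment above:
import resource

def open_file_batch_size(handles_per_file=4, headroom=100):
    """Largest merge batch that stays safely under the open file limit."""
    soft_limit, _hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
    return (soft_limit // handles_per_file) - headroom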
def run_vep(in_file, data): """Annotate input VCF file with Ensembl variant effect predictor. """ if not vcfutils.vcf_has_variants(in_file): return None out_file = utils.append_stem(in_file, "-vepeffects") assert in_file.endswith(".gz") and out_file.endswith(".gz") if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: vep_dir, ensembl_name = prep_vep_cache( data["genome_build"], tz.get_in(["reference", "fasta", "base"], data)) if vep_dir: cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1) fork_args = ["--fork", str(cores)] if cores > 1 else [] vep = config_utils.get_program("variant_effect_predictor.pl", data["config"]) is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False) config_args = [] if is_human: plugin_fns = { "dbnsfp": _get_dbnsfp, "loftee": _get_loftee, "dbscsnv": _get_dbscsnv, "maxentscan": _get_maxentscan, "genesplicer": _get_genesplicer } plugins = ["dbnsfp", "loftee", "dbscsnv"] if "vep_splicesite_annotations" in dd.get_tools_on(data): plugins += ["maxentscan", "genesplicer"] for plugin in plugins: plugin_args = plugin_fns[plugin](data) config_args += plugin_args config_args += ["--sift", "b", "--polyphen", "b"] # Use HGVS by default, requires indexing the reference genome config_args += [ "--hgvs", "--shift_hgvs", "1", "--fasta", dd.get_ref_file(data) ] if (dd.get_effects_transcripts(data).startswith("canonical") or tz.get_in( ("config", "algorithm", "clinical_reporting"), data)): config_args += ["--pick"] resources = config_utils.get_resources("vep", data["config"]) extra_args = [str(x) for x in resources.get("options", [])] cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \ ["--species", ensembl_name, "--no_stats", "--cache", "--offline", "--dir", vep_dir, "--symbol", "--numbers", "--biotype", "--total_length", "--canonical", "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory", "--protein", "--tsl", "--appris", "--gmaf", "--maf_1kg", "--maf_esp", "--maf_exac", "--pubmed", "--variant_class"] + config_args perl_exports = utils.get_perl_exports() # Remove empty fields (';;') which can cause parsing errors downstream cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % ( perl_exports, " ".join(cmd), tx_out_file) do.run(cmd, "Ensembl variant effect predictor", data) if utils.file_exists(out_file): vcfutils.bgzip_and_index(out_file, data["config"]) return out_file
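# The sed expression above (`/^#/! s/;;/;/g`) collapses doubled semicolons in
# INFO fields on non-header lines. An equivalent check in pure Python, handy
# for unit-testing the cleanup on a single VCF line; note this version is
# slightly stronger, collapsing runs of any length in one pass:
import re

def clean_info_semicolons(line):
    """Collapse ';;' runs on non-header VCF lines, mirroring the sed call."""
    if line.startswith("#"):
        return line
    return re.sub(";{2,}", ";", line)

# clean_info_semicolons("1\t100\t.\tA\tG\t50\tPASS\tDP=10;;AF=0.5")
# -> '1\t100\t.\tA\tG\t50\tPASS\tDP=10;AF=0.5'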
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" %
                              (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join([str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception("bamtofastq failed converting %s" % bam_file)
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True, output_infix=output_infix)
    else:
        return [x for x in [out_file_1, out_file_2]
                if x is not None and utils.file_exists(x)]
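# max_mem above scales a per-core memory string by the core count before
# handing it to bamtofastq as a collation block size. A sketch of the
# conversion, assuming config_utils.convert_to_bytes behaves like this
# simplified version (the real helper may differ in units or rounding):
def convert_to_bytes_sketch(mem_str):
    """Convert a '1G'/'500M'-style memory string to a byte count."""
    units = {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}
    suffix = mem_str[-1].upper()
    if suffix in units:
        return int(float(mem_str[:-1]) * units[suffix])
    return int(mem_str)

# 8 cores at "1G" each: convert_to_bytes_sketch("1G") * 8 -> 8589934592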
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    folders = []
    opts = ""
    out_dir = os.path.join(work_dir, "multiqc")
    out_data = os.path.join(work_dir, "multiqc", "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    for data in samples:
        for program, pfiles in tz.get_in(["summary", "qc"], data, {}).items():
            if isinstance(pfiles, dict):
                pfiles = pfiles["base"]
            folders.append(os.path.dirname(pfiles))
    # XXX temporary workaround until we can handle larger inputs through MultiQC
    folders = list(set(folders))
    if len(folders) > 250:
        logger.warning("Too many samples for MultiQC, only using first 250 entries.")
        folders = folders[:250]
        opts = "--flat"
    # Back compatible -- to migrate to explicit specifications in input YAML
    folders += ["trimmed", "htseq-count/*summary"]
    if not utils.file_exists(out_file):
        with utils.chdir(work_dir):
            input_dir = " ".join([_check_multiqc_input(d) for d in folders])
            export_tmp = ""
            if dd.get_tmp_dir(samples[0]):
                export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
            if input_dir.strip():
                cmd = "{export_tmp} {multiqc} -f {input_dir} -o {tx_out} {opts}"
                with tx_tmpdir(data, work_dir) as tx_out:
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(samples):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
        out.append(data)
    return [[d] for d in out]
def _run_vardict_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect variants with Vardict. This is used for paired tumor / normal samples. """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: target = shared.subset_variant_regions(dd.get_variant_regions(items[0]), region, out_file, do_merge=True) paired = vcfutils.get_paired_bams(align_bams, items) if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[x for x in [paired.tumor_name, paired.normal_name] if x]) else: if not paired.normal_bam: ann_file = _run_vardict_caller(align_bams, items, ref_file, assoc_files, region, out_file) return ann_file vcffilter = config_utils.get_program("vcffilter", config) vardict = dd.get_variantcaller(items[0]) vardict = "vardict-java" if not vardict.endswith("-perl") else "vardict" vcfstreamsort = config_utils.get_program("vcfstreamsort", config) strandbias = "testsomatic.R" var2vcf = "var2vcf_paired.pl" compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 # merge bed file regions as amplicon VarDict is only supported in single sample mode opts = " ".join(_vardict_options_from_config(items, config, out_file, target)) coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome") # for deep targeted panels, require 50 worth of coverage var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else "" fix_ambig = vcfutils.fix_ambiguous_cl() remove_dup = vcfutils.remove_dup_cl() if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, []) for data in items): somatic_filter = "" freq_filter = "" else: var2vcf_opts += " -M " # this makes VarDict soft filter non-differential variants somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' " "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' " "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" % os.path.join(os.path.dirname(sys.executable), "py")) freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null " "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" % (os.path.join(os.path.dirname(sys.executable), "py"), 0, dd.get_aligner(paired.tumor_data))) jvm_opts = _get_jvm_opts(items[0], tx_out_file) r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd()) cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} " "| {strandbias} " "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} " "-N \"{paired.tumor_name}|{paired.normal_name}\" " "{freq_filter} " "{somatic_filter} | {fix_ambig} | {remove_dup} | {vcfstreamsort} " "{compress_cmd} > {tx_out_file}") do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config) if assoc_files.get("dbsnp") else out_file) return out_file
def chipseq_count(data):
    """Count reads mapping to ChIP/ATAC consensus peaks with featureCounts.
    """
    method = dd.get_chip_method(data)
    if method == "chip":
        in_bam = dd.get_work_bam(data)
    elif method == "atac":
        if bam.is_paired(dd.get_work_bam(data)):
            in_bam = tz.get_in(("atac", "align", "NF"), data)
        else:
            in_bam = tz.get_in(("atac", "align", "full"), data)
    out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname",
                          out_dir=safe_makedir(out_dir))
    consensus_file = tz.get_in(("peaks_files", "consensus", "main"), data)
    saf_file = os.path.splitext(consensus_file)[0] + ".saf"
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "consensus")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        if method == "atac":
            if bam.is_paired(dd.get_work_bam(data)):
                data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
            else:
                data = tz.assoc_in(data, ("peak_counts", "full"), count_file)
        elif method == "chip":
            data = tz.assoc_in(data, ("peak_counts",), count_file)
        return [[data]]
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    cmd = ("{featureCounts} -F SAF -a {saf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {sorted_bam}")
    message = ("Count reads in {sorted_bam} overlapping {saf_file} using "
               "featureCounts.")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    if method == "atac":
        if bam.is_paired(dd.get_work_bam(data)):
            data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
        else:
            data = tz.assoc_in(data, ("peak_counts", "full"), count_file)
    elif method == "chip":
        data = tz.assoc_in(data, ("peak_counts",), count_file)
    return [[data]]
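# featureCounts reads peak regions in SAF format: GeneID, Chr, Start, End,
# Strand, with 1-based inclusive coordinates. The .saf file above is derived
# from the consensus peak BED elsewhere in the pipeline; a minimal sketch of
# such a conversion (a hypothetical helper, not the bcbio implementation):
def bed_to_saf_sketch(bed_file, saf_file):
    """Write a featureCounts SAF file from a BED file of peaks."""
    with open(bed_file) as bed, open(saf_file, "w") as saf:
        saf.write("GeneID\tChr\tStart\tEnd\tStrand\n")
        for i, line in enumerate(bed):
            parts = line.strip().split("\t")
            chrom, start, end = parts[0], int(parts[1]), int(parts[2])
            # BED is 0-based half-open; SAF is 1-based inclusive
            saf.write("peak_%d\t%s\t%d\t%d\t.\n" % (i + 1, chrom, start + 1, end))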
def trim_srna_sample(data):
    """Remove 3' adapter for smallRNA-seq.

    Uses cutadapt but with different parameters than for other pipelines.
    """
    data = umi_transform(data)
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    log_out = os.path.join(out_dir, "%s.log" % names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["files"][0] = out_file
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        data["log_trimming"] = log_out
        return [[data]]
    adapter = dd.get_adapters(data)
    is_4n = any([a == "4N" for a in adapter])
    adapter = [a for a in adapter if re.compile("^([NATGC]+)$").match(a)]
    if adapter and not trim_reads:
        trim_reads = True
        logger.info("An adapter is set in the config file, but trim_reads is not true. "
                    "If you want to skip trimming, remove the adapter option from the config.")
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    if trim_reads:
        adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
        times = "" if not trim_reads or len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        if any([a for a in adapters if re.compile("^N+$").match(a)]):
            adapter_cmd = "-N %s" % adapter_cmd
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        # atropos = _get_atropos()
        atropos = config_utils.get_program("atropos", data, default="atropos")
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        if options.strip() == "-u 4 -u -4":
            options = ""
            is_4n = "4N"
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options were found in the YAML file. "
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if is_4n:
                    options = "-u 4 -u -4"
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "atropos with parameters %s for %s" % (options, names))
        data["log_trimming"] = log_out
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapters found in %s. This can indicate a lack of "
                        "small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["files"][0] = out_file
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def split_namedpipe_cl(in_file, data): """Create a commandline suitable for use as a named pipe with reads in a given region. """ grabix = config_utils.get_program("grabix", data["config"]) start, end = data["align_split"] return "<({grabix} grab {in_file} {start} {end})".format(**locals())
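# The "<(...)" string returned above is bash process substitution: the
# downstream aligner sees a file-like argument that grabix feeds on the fly.
# Python's shell=True uses /bin/sh, which lacks this feature, so commands
# built this way need to execute under bash (bcbio's do.run handles shell
# selection). A self-contained demonstration with a harmless command in
# place of grabix:
import subprocess

def demo_process_substitution():
    out = subprocess.check_output("cat <(echo region-reads)",
                                  shell=True, executable="/bin/bash")
    return out.decode().strip()  # -> 'region-reads'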
def align(fastq_file, pair_file, ref_file, names, align_dir, data): max_hits = 10 srna = True if data["analysis"].lower().startswith( "smallrna-seq") else False srna_opts = "" if srna: max_hits = 1000 srna_opts = "--alignIntronMax 1" config = data["config"] out_prefix = os.path.join(align_dir, dd.get_lane(data)) out_file = out_prefix + "Aligned.out.sam" out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data)) if not ref_file: logger.error( "STAR index not found. We don't provide the STAR indexes " "by default because they are very large. You can install " "the index for your genome with: bcbio_nextgen.py upgrade " "--aligners star --genomes genome-build-name --data") sys.exit(1) final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"])) if file_exists(final_out): data = _update_data(final_out, out_dir, names, data) return data star_path = config_utils.get_program("STAR", config) fastq_files = " ".join([fastq_file, pair_file ]) if pair_file else fastq_file num_cores = dd.get_num_cores(data) gtf_file = dd.get_gtf_file(data) safe_makedir(align_dir) cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} " "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} " "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} " "--outStd SAM {srna_opts} " "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS)) cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else "" cmd += _read_group_option(names) fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False) if fusion_mode: cmd += (" --chimSegmentMin 15 --chimJunctionOverhangMin 15 " "--chimOutType WithinSAM ") strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"), "unstranded").lower() if strandedness == "unstranded" and not srna: cmd += " --outSAMstrandField intronMotif " cmd += " --quantMode TranscriptomeSAM " with file_transaction(data, final_out) as tx_final_out: cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out) run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file) do.run(cmd.format(**locals()), run_message, None) data = _update_data(final_out, out_dir, names, data) return data
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with VarDict. var2vcf_valid uses -A flag which reports all alleles and improves sensitivity: https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191 """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): raw_file = "%s-raw%s" % utils.splitext_plus(out_file) with file_transaction(items[0], raw_file) as tx_out_file: vrs = bedutils.population_variant_regions(items) target = shared.subset_variant_regions(vrs, region, out_file, items=items, do_merge=False) num_bams = len(align_bams) sample_vcf_names = [ ] # for individual sample names, given batch calling may be required for bamfile, item in zip(align_bams, items): # prepare commands sample = dd.get_sample_name(item) vardict = get_vardict_command(items[0]) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" opts = (" ".join( _vardict_options_from_config(items, config, out_file, target)) if _is_bed_file(target) else "") vcfstreamsort = config_utils.get_program( "vcfstreamsort", config) compress_cmd = "| bgzip -c" if tx_out_file.endswith( "gz") else "" freq = float( utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 coverage_interval = utils.get_in( config, ("algorithm", "coverage_interval"), "exome") # for deep targeted panels, require 50 worth of coverage var2vcf_opts = " -v 50 " if dd.get_avg_coverage( items[0]) > 5000 else "" fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() py_cl = os.path.join(utils.get_bcbio_bin(), "py") jvm_opts = _get_jvm_opts(items[0], tx_out_file) setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports()) cmd = ( "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| {strandbias}" "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} " """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """ "| bcftools filter -i 'QUAL >= 0' " "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}" ) if num_bams > 1: temp_file_prefix = raw_file.replace(".gz", "").replace( ".vcf", "") + item["name"][1] tmp_out = temp_file_prefix + ".temp.vcf" tmp_out += ".gz" if raw_file.endswith("gz") else "" sample_vcf_names.append(tmp_out) with file_transaction(item, tmp_out) as tx_tmp_file: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample]) else: cmd += " > {tx_tmp_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) else: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample]) else: cmd += " > {tx_out_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) if num_bams > 1: # N.B. merge_variant_files wants region in 1-based end-inclusive # coordinates. Thus use bamprep.region_to_gatk vcfutils.merge_variant_files( orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region)) if assoc_files.get("dbsnp"): annotation.add_dbsnp(raw_file, assoc_files["dbsnp"], items[0], out_file) else: utils.symlink_plus(raw_file, out_file) return out_file
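# min_allele_fraction is configured as a percentage and converted to a
# fraction for VarDict's -f flag, defaulting to 10 (percent) -> 0.1. A
# standalone restatement of that conversion:
def min_af_fraction(config, default_pct=10):
    """Convert the configured minimum allele fraction (percent) to a fraction."""
    pct = float(config.get("algorithm", {}).get("min_allele_fraction", default_pct))
    return pct / 100.0

# min_af_fraction({}) -> 0.1
# min_af_fraction({"algorithm": {"min_allele_fraction": 2.5}}) -> 0.025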
def summary(*samples): """Summarize all quality metrics together""" samples = list(utils.flatten(samples)) work_dir = dd.get_work_dir(samples[0]) multiqc = config_utils.get_program("multiqc", samples[0]["config"]) if not multiqc: logger.debug( "multiqc not found. Update bcbio_nextgen.py tools to fix this issue." ) out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc")) out_data = os.path.join(out_dir, "multiqc_data") out_file = os.path.join(out_dir, "multiqc_report.html") file_list = os.path.join(out_dir, "list_files.txt") work_samples = cwlutils.unpack_tarballs( [utils.deepish_copy(x) for x in samples], samples[0]) if not utils.file_exists(out_file): with tx_tmpdir(samples[0], work_dir) as tx_out: in_files = _get_input_files(work_samples, out_dir, tx_out) in_files += _merge_metrics(work_samples, out_dir) if _one_exists(in_files): with utils.chdir(out_dir): _create_config_file(out_dir, work_samples) input_list_file = _create_list_file(in_files, file_list) if dd.get_tmp_dir(samples[0]): export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir( samples[0]) else: export_tmp = "" path_export = utils.local_path_export() other_opts = config_utils.get_resources( "multiqc", samples[0]["config"]).get("options", []) other_opts = " ".join([str(x) for x in other_opts]) cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}" do.run(cmd.format(**locals()), "Run multiqc") if utils.file_exists( os.path.join(tx_out, "multiqc_report.html")): shutil.move( os.path.join(tx_out, "multiqc_report.html"), out_file) shutil.move(os.path.join(tx_out, "multiqc_data"), out_data) samples = _group_by_sample_and_batch(samples) if utils.file_exists(out_file) and samples: data_files = set() for i, data in enumerate(samples): data_files.add( os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt")) data_files.add( os.path.join(out_dir, "report", "metrics", "target_info.yaml")) data_files.add(os.path.join(out_dir, "multiqc_config.yaml")) data_files = [f for f in data_files if f and utils.file_exists(f)] if "summary" not in samples[0]: samples[0]["summary"] = {} samples[0]["summary"]["multiqc"] = { "base": out_file, "secondary": data_files } data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json") data_json_final = _save_uploaded_data_json( samples, data_json, os.path.join(out_dir, "multiqc_data")) if data_json_final: samples[0]["summary"]["multiqc"]["secondary"].append( data_json_final) file_list_final = _save_uploaded_file_list(samples, file_list, out_dir) if file_list_final: samples[0]["summary"]["multiqc"]["secondary"].append( file_list_final) return [[data] for data in samples]
def run(bam_file, data, out_dir): """Run qualimap to assess alignment quality metrics. """ # Qualimap results should be saved to a directory named after sample. # MultiQC (for parsing additional data) picks the sample name after the dir as follows: # <sample name>/raw_data_qualimapReport/insert_size_histogram.txt results_dir = os.path.join(out_dir, dd.get_sample_name(data)) resources = config_utils.get_resources("qualimap", data["config"]) options = " ".join(resources.get("options", "")) results_file = os.path.join(results_dir, "genome_results.txt") report_file = os.path.join(results_dir, "qualimapReport.html") utils.safe_makedir(results_dir) pdf_file = "qualimapReport.pdf" if not utils.file_exists(results_file) and not utils.file_exists( os.path.join(results_dir, pdf_file)): if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []): logger.info("Full qualimap analysis for %s may be slow." % bam_file) ds_bam = bam_file else: ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) bam_file = ds_bam if ds_bam else bam_file if options.find("PDF") > -1: options = "%s -outfile %s" % (options, pdf_file) num_cores = data["config"]["algorithm"].get("num_cores", 1) qualimap = config_utils.get_program("qualimap", data["config"]) max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores) with file_transaction(data, results_dir) as tx_results_dir: utils.safe_makedir(tx_results_dir) export = utils.local_path_export() cmd = ( "unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} " "--skip-duplicated --skip-dup-mode 0 " "-nt {num_cores} --java-mem-size={max_mem} {options}") species = None if (tz.get_in(("genome_resources", "aliases", "human"), data, "") or dd.get_genome_build(data).startswith(("hg", "GRCh"))): species = "HUMAN" elif dd.get_genome_build(data).startswith(("mm", "GRCm")): species = "MOUSE" if species in ["HUMAN", "MOUSE"]: cmd += " -gd {species}" regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [ None, False, "None" ] else dd.get_variant_regions_merged(data)) if regions: regions = bedutils.merge_overlaps( bedutils.clean_file(regions, data), data) bed6_regions = _bed_to_bed6(regions, out_dir) cmd += " -gff {bed6_regions}" bcbio_env = utils.get_bcbio_env() do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env) tx_results_file = os.path.join(tx_results_dir, "genome_results.txt") cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % ( dd.get_sample_name(data), tx_results_file) do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data))) # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir): base_results_file = os.path.join(out_dir, os.path.basename(results_file)) shutil.copyfile(results_file, base_results_file) return { "base": base_results_file, "secondary": _find_qualimap_secondary_files(results_dir, base_results_file) }
def runner_from_config(config, program="gatk"): return BroadRunner(_get_picard_ref(config), config_utils.get_program(program, config, "dir"), config)
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data, names=None): """ run alignment using Tophat v2 """ config = data["config"] options = get_in(config, ("resources", "tophat", "options"), {}) options = _set_fusion_mode(options, config) options = _set_quality_flag(options, data) options = _set_transcriptome_option(options, data, ref_file) options = _set_cores(options, config) options = _set_rg_options(options, names) options = _set_stranded_flag(options, config) ref_file, runner = _determine_aligner_and_reference(ref_file, config) # fusion search does not work properly with Bowtie2 if options.get("fusion-search", False): ref_file = ref_file.replace("/bowtie2", "/bowtie") if _tophat_major_version(config) == 1: raise NotImplementedError("Tophat versions < 2.0 are not supported, please " "download the newest version of Tophat here: " "http://tophat.cbcb.umd.edu") if _ref_version(ref_file) == 1 or options.get("fusion-search", False): options["bowtie1"] = True out_dir = os.path.join(align_dir, "%s_tophat" % out_base) final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"])) if file_exists(final_out): return final_out out_file = os.path.join(out_dir, "accepted_hits.bam") unmapped = os.path.join(out_dir, "unmapped.bam") files = [ref_file, fastq_file] if not file_exists(out_file): with file_transaction(config, out_dir) as tx_out_dir: safe_makedir(tx_out_dir) if pair_file and not options.get("mate-inner-dist", None): d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file, ref_file, out_base, tx_out_dir, data) options["mate-inner-dist"] = d options["mate-std-dev"] = d_stdev files.append(pair_file) options["output-dir"] = tx_out_dir options["no-coverage-search"] = True options["no-mixed"] = True tophat_runner = sh.Command(config_utils.get_program("tophat", config)) ready_options = {} for k, v in options.items(): ready_options[k.replace("-", "_")] = v # tophat requires options before arguments, # otherwise it silently ignores them tophat_ready = tophat_runner.bake(**ready_options) cmd = "%s %s" % (sys.executable, str(tophat_ready.bake(*files))) do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file), None) if pair_file and _has_alignments(out_file): fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.bam" % out_base), ref_file, config) else: fixed = out_file fixed_unmapped = _fix_unmapped(fixed, unmapped, data) fixed = merge_unmapped(fixed, fixed_unmapped, config) fixed = _add_rg(fixed, config, names) fixed = bam.sort(fixed, config) picard = broad.runner_from_path("picard", config) # set the contig order to match the reference file so GATK works fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"], os.path.splitext(fixed)[0] + ".picard.bam") fixed = fix_insert_size(fixed, config) if not file_exists(final_out): symlink_plus(fixed, final_out) return final_out
def align(fastq_file, pair_file, ref_file, names, align_dir, data): if not ref_file: logger.error( "STAR index not found. We don't provide the STAR indexes " "by default because they are very large. You can install " "the index for your genome with: bcbio_nextgen.py upgrade " "--aligners star --genomes genome-build-name --data") sys.exit(1) max_hits = 10 srna = True if data["analysis"].lower().startswith( "smallrna-seq") else False srna_opts = "" if srna: max_hits = 1000 srna_opts = "--alignIntronMax 1" config = data["config"] star_dirs = _get_star_dirnames(align_dir, data, names) if file_exists(star_dirs.final_out): data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data) return data star_path = config_utils.get_program("STAR", config) def _unpack_fastq(f): """Use process substitution instead of readFilesCommand for gzipped inputs. Prevents issues on shared filesystems that don't support FIFO: https://github.com/alexdobin/STAR/issues/143 """ if f and is_gzipped(f): return "<(gunzip -c %s)" % f else: return f fastq_files = (" ".join([ _unpack_fastq(fastq_file), _unpack_fastq(pair_file) ]) if pair_file else _unpack_fastq(fastq_file)) num_cores = dd.get_num_cores(data) gtf_file = dd.get_transcriptome_gtf(data) if not gtf_file: gtf_file = dd.get_gtf_file(data) if ref_file.endswith("chrLength"): ref_file = os.path.dirname(ref_file) with file_transaction(data, align_dir) as tx_align_dir: tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names) tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames safe_makedir(tx_align_dir) safe_makedir(tx_out_dir) cmd = ( "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} " "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} " "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} " "--outStd BAM_Unsorted {srna_opts} " "--limitOutSJcollapsed 2000000 " "--outSAMtype BAM Unsorted " "--outSAMmapqUnique 60 " "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS)) cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else "" cmd += _read_group_option(names) if dd.get_fusion_caller(data): if "arriba" in dd.get_fusion_caller(data): cmd += ( "--chimSegmentMin 10 --chimOutType WithinBAM SoftClip Junctions " "--chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 " "--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 " "--alignSJstitchMismatchNmax 5 -1 5 5 " "--chimSegmentReadGapMax 3 ") else: cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 " "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 " "--chimScoreSeparation 5 ") if "oncofuse" in dd.get_fusion_caller(data): cmd += "--chimOutType Junctions " else: cmd += "--chimOutType WithinBAM " strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"), "unstranded").lower() if strandedness == "unstranded" and not srna: cmd += " --outSAMstrandField intronMotif " if not srna: cmd += " --quantMode TranscriptomeSAM " resources = config_utils.get_resources("star", data["config"]) if resources.get("options", []): cmd += " " + " ".join( [str(x) for x in resources.get("options", [])]) cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out) cmd += " > {tx_final_out} " run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file) do.run(cmd.format(**locals()), run_message, None) data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data) return data
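# A standalone rendering of the readFilesIn argument built by _unpack_fastq
# above, for a hypothetical gzipped pair; this simplified version checks the
# ".gz" suffix directly rather than using bcbio's is_gzipped:
def star_read_files_arg(fq1, fq2=None):
    def unpack(f):
        return "<(gunzip -c %s)" % f if f and f.endswith(".gz") else f
    return " ".join([unpack(f) for f in (fq1, fq2) if f])

# star_read_files_arg("/fq/s1_1.fq.gz", "/fq/s1_2.fq.gz")
# -> '<(gunzip -c /fq/s1_1.fq.gz) <(gunzip -c /fq/s1_2.fq.gz)'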
def _count_reads(bam_file, data): samtools = config_utils.get_program("samtools", data) cmd = "%s idxstats %s | awk '{sum += $3 + $4} END {print sum}'" count = subprocess.check_output(cmd % (samtools, bam_file), shell=True) return int(count.strip())
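# samtools idxstats emits one row per contig: name, length, mapped read
# count, unmapped read count. Summing columns 3 and 4 therefore counts all
# reads recorded in the index. The same aggregation without awk, parsing the
# idxstats output directly:
import subprocess

def count_reads_py(samtools, bam_file):
    out = subprocess.check_output([samtools, "idxstats", bam_file])
    total = 0
    for row in out.decode().splitlines():
        fields = row.split("\t")
        total += int(fields[2]) + int(fields[3])
    return total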
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index,
                                        [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            variant_regions = config["algorithm"].get("variant_regions", None)
            cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
            if cur_region:
                params += ["-L", bamprep.region_to_gatk(cur_region),
                           "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                params += ["-nt", str(min(cores, 4))]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            jvm_opts = broad.get_gatk_framework_opts(config, memscale=memscale)
            cmd = [config_utils.get_program("gatk-framework", config)] + jvm_opts + params
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
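# CombineVariants requires each input to be tagged (--variant:vN) plus a
# matching --rod_priority_list naming the tags in order. A standalone
# restatement of the parameter construction above for two hypothetical
# inputs:
def combine_params_sketch(ready_files):
    params, priority_order = [], []
    for i, ready_file in enumerate(ready_files):
        name = "v%s" % i
        params.extend(["--variant:%s" % name, ready_file])
        priority_order.append(name)
    params.extend(["--rod_priority_list", ",".join(priority_order)])
    return params

# combine_params_sketch(["snps.vcf.gz", "indels.vcf.gz"]) ->
# ['--variant:v0', 'snps.vcf.gz', '--variant:v1', 'indels.vcf.gz',
#  '--rod_priority_list', 'v0,v1']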
def _run_vardict_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect variants with Vardict. This is used for paired tumor / normal samples. """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: vrs = bedutils.population_variant_regions(items) target = shared.subset_variant_regions(vrs, region, out_file, items=items, do_merge=True) paired = vcfutils.get_paired_bams(align_bams, items) if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[x for x in [paired.tumor_name, paired.normal_name] if x]) else: if not paired.normal_bam: ann_file = _run_vardict_caller(align_bams, items, ref_file, assoc_files, region, out_file) return ann_file vardict = get_vardict_command(items[0]) vcfstreamsort = config_utils.get_program("vcfstreamsort", config) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 # merge bed file regions as amplicon VarDict is only supported in single sample mode opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target) fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, []) for data in items): somatic_filter = "" freq_filter = "" else: var2vcf_opts += " -M " # this makes VarDict soft filter non-differential variants somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' " "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' " """| %s -c 'from bcbio.variation import freebayes; """ """freebayes.call_somatic("%s", "%s")' """ % (sys.executable, paired.tumor_name, paired.normal_name)) freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null " "| %s -x 'bcbio.variation.vardict.add_db_germline_flag(x)' " "| %s " "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" % (os.path.join(os.path.dirname(sys.executable), "py"), _lowfreq_linear_filter(0, True), os.path.join(os.path.dirname(sys.executable), "py"), 0, bam.aligner_from_header(paired.tumor_bam))) jvm_opts = _get_jvm_opts(items[0], tx_out_file) py_cl = os.path.join(utils.get_bcbio_bin(), "py") setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports()) contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file) cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} " "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} " "| awk 'NF>=48' | testsomatic.R " "| var2vcf_paired.pl -P 0.9 -m 4.25 {var2vcf_opts} " "-N \"{paired.tumor_name}|{paired.normal_name}\" " "| {contig_cl} {freq_filter} " "| bcftools filter -i 'QUAL >= 0' " "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} " "{compress_cmd} > {tx_out_file}") do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) return out_file
def run_mosdepth(data, target_name, bed_file, per_base=False, quantize=None, thresholds=None): """Run mosdepth generating distribution, region depth and per-base depth. """ MosdepthCov = collections.namedtuple( "MosdepthCov", ("dist", "per_base", "regions", "quantize", "thresholds")) bam_file = dd.get_align_bam(data) or dd.get_work_bam(data) work_dir = utils.safe_makedir( os.path.join(dd.get_work_dir(data), "coverage", dd.get_sample_name(data))) prefix = os.path.join(work_dir, "%s-%s" % (dd.get_sample_name(data), target_name)) old_dist_file = "%s.mosdepth.dist.txt" % (prefix) out = MosdepthCov( (old_dist_file if utils.file_uptodate( old_dist_file, bam_file) else "%s.mosdepth.%s.dist.txt" % (prefix, "region" if bed_file else "global")), ("%s.per-base.bed.gz" % prefix) if per_base else None, ("%s.regions.bed.gz" % prefix) if bed_file else None, ("%s.quantized.bed.gz" % prefix) if quantize else None, ("%s.thresholds.bed.gz" % prefix) if thresholds else None) if not utils.file_uptodate(out.dist, bam_file): bam.index(bam_file, dd.get_config(data)) with file_transaction(data, out.dist) as tx_out_file: tx_prefix = os.path.join(os.path.dirname(tx_out_file), os.path.basename(prefix)) num_cores = dd.get_cores(data) bed_arg = ("--by %s" % bed_file) if bed_file else "" perbase_arg = "" if per_base else "--no-per-base" mapq_arg = "-Q 1" if (per_base or quantize) else "" if quantize: quant_arg = "--quantize %s" % quantize[0] quant_export = " && ".join([ "export MOSDEPTH_Q%s=%s" % (i, x) for (i, x) in enumerate(quantize[1]) ]) quant_export += " && " else: quant_arg, quant_export = "", "" thresholds_cmdl = ( "-T " + ",".join([str(t) for t in thresholds])) if out.thresholds else "" mosdepth = config_utils.get_program("mosdepth", data) cmd = ( "{quant_export}{mosdepth} -t {num_cores} -F 1804 {mapq_arg} {perbase_arg} {bed_arg} {quant_arg} " "{tx_prefix} {bam_file} {thresholds_cmdl}") message = "Calculating coverage: %s %s" % ( dd.get_sample_name(data), target_name) do.run(cmd.format(**locals()), message.format(**locals())) if out.per_base: shutil.move( os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.per_base)), out.per_base) if out.regions: shutil.move( os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.regions)), out.regions) if out.quantize: shutil.move( os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.quantize)), out.quantize) if out.thresholds: shutil.move( os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.thresholds)), out.thresholds) return out
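# mosdepth labels quantized coverage bins via MOSDEPTH_Q<N> environment
# variables, which is what the quant_export string above builds. A
# standalone rendering for a hypothetical quantize spec of
# (breaks, labels) matching the quantize tuple used above:
def mosdepth_quantize_export(breaks, labels):
    exports = " && ".join(["export MOSDEPTH_Q%s=%s" % (i, x)
                           for i, x in enumerate(labels)])
    return ("%s && " % exports), ("--quantize %s" % breaks)

# mosdepth_quantize_export("0:1:4:", ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
# -> ('export MOSDEPTH_Q0=NO_COVERAGE && export MOSDEPTH_Q1=LOW_COVERAGE && '
#     'export MOSDEPTH_Q2=CALLABLE && ', '--quantize 0:1:4:')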
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with VarDict. var2vcf_valid uses -A flag which reports all alleles and improves sensitivity: https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191 """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: vrs = bedutils.population_variant_regions(items) target = shared.subset_variant_regions( vrs, region, out_file, items=items, do_merge=False) num_bams = len(align_bams) sample_vcf_names = [] # for individual sample names, given batch calling may be required for bamfile, item in zip(align_bams, items): # prepare commands sample = dd.get_sample_name(item) vardict = get_vardict_command(items[0]) opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target) vcfstreamsort = config_utils.get_program("vcfstreamsort", config) compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else "" fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() py_cl = os.path.join(utils.get_bcbio_bin(), "py") jvm_opts = _get_jvm_opts(items[0], tx_out_file) setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports()) contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file) lowfreq_filter = _lowfreq_linear_filter(0, False) cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} " "-N {sample} -b {bamfile} {opts} " "| teststrandbias.R " "| var2vcf_valid.pl -A -N {sample} -E {var2vcf_opts} " "| {contig_cl} | bcftools filter -i 'QUAL >= 0' | {lowfreq_filter} " "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}") if num_bams > 1: temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1] tmp_out = temp_file_prefix + ".temp.vcf" tmp_out += ".gz" if out_file.endswith("gz") else "" sample_vcf_names.append(tmp_out) with file_transaction(item, tmp_out) as tx_tmp_file: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample]) else: cmd += " > {tx_tmp_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) else: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample]) else: cmd += " > {tx_out_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) if num_bams > 1: # N.B. merge_variant_files wants region in 1-based end-inclusive # coordinates. Thus use bamprep.region_to_gatk vcfutils.merge_variant_files(orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region)) return out_file
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file,
                                                             region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file,
                                                       region, tmp_path))
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
                # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
                # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"),
                                                      config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            add_contig = vcfutils.add_contig_to_header_cl(dd.get_ref_file(items[0]), tx_out_file)
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} | {add_contig} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    return out_file