def variants(data):
    if "vrn_file" not in data:
        return data
    in_vcf = data['vrn_file']
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = data['work_bam']
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        jvm_opts = broad.get_gatk_framework_opts(data['config'])
        gatk_jar = config_utils.get_program("gatk", data['config'], "dir")
        bed_file = dd.get_variant_regions(data)
        sample = splitext_plus(os.path.basename(in_vcf))[0]
        cg_file = os.path.join(sample + "_with-gc.vcf.gz")
        parse_file = os.path.join(sample + "_cg-depth-parse.tsv")
        if not file_exists(cg_file):
            with file_transaction(cg_file) as tx_out:
                cmd = ("java -jar {gatk_jar}/GenomeAnalysisTK.jar -T VariantAnnotator -R {ref_file} "
                       "-L {bed_file} -I {in_bam} "
                       "-A GCContent --variant {in_vcf} --out {tx_out}")
                do.run(cmd.format(**locals()), " GC bias for %s" % in_vcf)
        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    print >>out_handle, "CG\tdepth\tsample"
                cmd = ("bcftools query -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R {bed_file} {cg_file} >> {out_tx}")
                do.run(cmd.format(**locals()), " query for %s" % in_vcf)
        logger.debug('parsing coverage: %s' % sample)
        # return df
    return data

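# Illustration only, not bcbio code: a minimal standalone sketch of the
# transactional-output pattern these functions share. Outputs are written to a
# temporary path and only moved into place on success, so interrupted runs never
# leave partial files behind. The real helper is
# bcbio.distributed.transaction.file_transaction, which also handles lists of
# outputs and cluster-safe temporary directories.
import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def _file_transaction_sketch(final_path):
    # Create a scratch directory next to the target so the final move is cheap.
    tmp_dir = tempfile.mkdtemp(dir=os.path.dirname(final_path) or ".")
    tx_path = os.path.join(tmp_dir, os.path.basename(final_path))
    try:
        yield tx_path
        # Only promote the file if the body actually produced it.
        if os.path.exists(tx_path):
            shutil.move(tx_path, final_path)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
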
def align(fastq_file, pair_file, ref_file, out_base, align_dir, data, names=None):
    """Perform a BWA alignment, generating a SAM file.
    """
    config = data["config"]
    sai1_file = os.path.join(align_dir, "%s_1.sai" % out_base)
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % out_base)
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not utils.file_exists(sam_file):
        if not utils.file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)
        if sai2_file and not utils.file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
        sam_cl = [config_utils.get_program("bwa", config), align_type, ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file) as tx_sam_file:
            cmd = "{cl} > {out_file}".format(cl=" ".join(sam_cl), out_file=tx_sam_file)
            do.run(cmd, "bwa {align_type}".format(**locals()), None)
    return sam_file

def _extract_split_and_discordants(in_bam, work_dir, data): """Retrieve split-read alignments from input BAM file. """ dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0]) sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0]) disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0]) samtools = config_utils.get_program("samtools", data["config"]) cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1) resources = config_utils.get_resources("sambamba", data["config"]) mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease") if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or utils.file_exists(dedup_file): with utils.curdir_tmpdir() as tmpdir: with file_transaction(sr_file) as tx_sr_file: with file_transaction(disc_file) as tx_disc_file: with file_transaction(dedup_file) as tx_dedup_file: samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file, tx_sr_file, tx_disc_file) out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0]) cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | " "{samtools} view -h - | ") cmd = cmd.format(**locals()) + samblaster_cl do.run(cmd, "samblaster: split and discordant reads", data) for fname in [sr_file, disc_file, dedup_file]: bam.index(fname, data["config"]) return dedup_file, sr_file, disc_file
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform a BWA alignment, generating a SAM file.
    """
    assert not data.get("align_split"), "Do not handle split alignments with non-piped bwa"
    config = data["config"]
    sai1_file = os.path.join(align_dir, "%s_1.sai" % names["lane"])
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % names["lane"])
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % names["lane"])
    if not utils.file_exists(sam_file):
        if not utils.file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)
        if sai2_file and not utils.file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
        rg_info = novoalign.get_rg_info(names)
        sam_cl = [config_utils.get_program("bwa", config), align_type, "-r", "'%s'" % rg_info,
                  ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file) as tx_sam_file:
            cmd = "{cl} > {out_file}".format(cl=" ".join(sam_cl), out_file=tx_sam_file)
            do.run(cmd, "bwa {align_type}".format(**locals()), None)
    return sam_file

def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data): """Provide prioritized tab delimited output for a single caller. """ sample = dd.get_sample_name(data) out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller)) simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller)) if not utils.file_exists(simple_vcf): gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data) # If we have a standard gene list we can skip BED based prioritization priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0] if gene_list: if vcf_file.endswith(".vcf.gz"): utils.symlink_plus(vcf_file, priority_vcf) else: assert vcf_file.endswith(".vcf") utils.symlink_plus(vcf_file, priority_vcf.replace(".vcf.gz", ".vcf")) vcfutils.bgzip_and_index(priority_vcf.replace(".vcf.gz", ".vcf"), data["config"], remove_orig=False) # otherwise prioritize based on BED and proceed else: if not utils.file_exists(priority_vcf): with file_transaction(data, priority_vcf) as tx_out_file: resources = config_utils.get_resources("bcbio_prioritize", data["config"]) jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]) jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust": {"direction": "increase", "maximum": "30000M", "magnitude": dd.get_cores(data)}}}) jvm_opts = " ".join(jvm_opts) export = utils.local_path_export() cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} " " -k {prioritize_by}") do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest") data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py"))) with file_transaction(data, simple_vcf) as tx_out_file: fusion_file = os.path.join(data_dir, "fusion_pairs.txt") opts = "" if os.path.exists(fusion_file): opts += " --known_fusion_pairs %s" % fusion_file if not gene_list: opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt") else: opts += " --gene_list %s" % gene_list cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}" do.run(cmd.format(**locals()), "Prioritize: simplified annotation output") simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"]) if post_prior_fn: simple_vcf = post_prior_fn(simple_vcf, work_dir, data) if not utils.file_uptodate(out_file, simple_vcf): with file_transaction(data, out_file) as tx_out_file: export = utils.local_path_export(env_cmd="vawk") cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} " """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """ "print CALLER,SNAME,$1,$2,I$END," """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,""" "I$LOF,I$SIMPLE_ANN," "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}") do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited") return out_file, simple_vcf
def align(fastq_file, pair_file, ref_file, out_base, align_dir, config, rg_name=None):
    """Perform a BWA alignment, generating a SAM file.
    """
    sai1_file = os.path.join(align_dir, "%s_1.sai" % out_base)
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % out_base)
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not file_exists(sam_file):
        if not file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)
        if sai2_file and not file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
        sam_cl = [config["program"]["bwa"], align_type, ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file) as tx_sam_file:
            with open(tx_sam_file, "w") as out_handle:
                logger.info(" ".join(sam_cl))
                subprocess.check_call(sam_cl, stdout=out_handle)
    return sam_file

def _segment_normalized_gatk(cnr_file, work_dir, paired):
    """Segmentation of normalized inputs using GATK4, converting into standard input formats.
    """
    work_dir = utils.safe_makedir(os.path.join(work_dir, "gatk-cnv"))
    seg_file = gatkcnv.model_segments(cnr_file, work_dir, paired)["seg"]
    std_seg_file = seg_file.replace(".cr.seg", ".seg")
    if not utils.file_uptodate(std_seg_file, seg_file):
        with file_transaction(std_seg_file) as tx_out_file:
            df = pd.read_csv(seg_file, sep="\t", comment="@", header=0,
                             names=["chrom", "loc.start", "loc.end", "num.mark", "seg.mean"])
            df.insert(0, "ID", [dd.get_sample_name(paired.tumor_data)] * len(df))
            df.to_csv(tx_out_file, sep="\t", header=True, index=False)
    std_cnr_file = os.path.join(work_dir, "%s.cnr" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_uptodate(std_cnr_file, cnr_file):
        with file_transaction(std_cnr_file) as tx_out_file:
            logdf = pd.read_csv(cnr_file, sep="\t", comment="@", header=0,
                                names=["chrom", "start", "end", "log2"])
            covdf = pd.read_csv(tz.get_in(["depth", "bins", "antitarget"], paired.tumor_data),
                                sep="\t", header=None,
                                names=["chrom", "start", "end", "orig.name", "depth", "gene"])
            df = pd.merge(logdf, covdf, on=["chrom", "start", "end"])
            del df["orig.name"]
            df = df[["chrom", "start", "end", "gene", "log2", "depth"]]
            df.insert(6, "weight", [1.0] * len(df))
            df.to_csv(tx_out_file, sep="\t", header=True, index=False)
    return std_cnr_file, std_seg_file

def gtf_to_fasta(gtf, ref_fasta, cds=False, out_file=None):
    """
    convert a GTF to FASTA format
    if cds=True, use the start/stop codons to output only the CDS
    """
    if out_file and file_exists(out_file):
        return out_file
    if not out_file:
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".fa").name
    tmp_file = out_file + ".tmp"
    if cds:
        cmd = "gffread -g {ref_fasta} -x {tx_tmp_file} {gtf}"
    else:
        cmd = "gffread -g {ref_fasta} -w {tx_tmp_file} {gtf}"
    message = "Converting %s to FASTA format." % gtf
    with file_transaction(tmp_file) as tx_tmp_file:
        do.run(cmd.format(**locals()), message)
    with file_transaction(out_file) as tx_out_file:
        with open(tmp_file) as in_handle, open(tx_out_file, "w") as out_handle:
            for line in in_handle:
                if line.startswith(">"):
                    line = line.split()[0] + "\n"
                out_handle.write(line)
    return out_file

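# Usage sketch for gtf_to_fasta (hypothetical file names): extract all transcript
# sequences, or CDS-only sequences when cds=True.
# transcripts_fa = gtf_to_fasta("annotation.gtf", "genome.fa")
# cds_fa = gtf_to_fasta("annotation.gtf", "genome.fa", cds=True, out_file="cds.fa")
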
def _add_variantcalls_to_output(out, data):
    """Call ploidy and convert into VCF and BED representations.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = dd.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call",
                   "--ploidy", str(dd.get_ploidy(data)),
                   "-o", tx_call_file, out["cns"]]
            if gender:
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                       outformat, "--sample-id", dd.get_sample_name(data),
                       "--ploidy", str(dd.get_ploidy(data)),
                       "-o", tx_out_file, call_file]
                if gender and gender.lower() == "male":
                    cmd += ["--male-reference"]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out

def _cutadapt_trim(fastq_files, quality_format, adapters, out_files, log_file, config):
    """Trimming with cutadapt, using version installed with bcbio-nextgen.

    Uses the system executable to find the version next to our Anaconda Python.
    TODO: Could we use cutadapt as a library to avoid this?
    """
    if all([file_exists(x) for x in out_files]):
        return out_files
    cmd = _cutadapt_trim_cmd(fastq_files, quality_format, adapters, out_files)
    if len(fastq_files) == 1:
        of = [out_files[0], log_file]
        message = "Trimming %s in single end mode with cutadapt." % (fastq_files[0])
        with file_transaction(config, of) as of_tx:
            of1_tx, log_tx = of_tx
            do.run(cmd.format(**locals()), message)
    else:
        of = out_files + [log_file]
        with file_transaction(config, of) as tx_out_files:
            of1_tx, of2_tx, log_tx = tx_out_files
            tmp_fq1 = append_stem(of1_tx, ".tmp")
            tmp_fq2 = append_stem(of2_tx, ".tmp")
            singles_file = of1_tx + ".single"
            message = "Trimming %s and %s in paired end mode with cutadapt." % (fastq_files[0],
                                                                                fastq_files[1])
            do.run(cmd.format(**locals()), message)
    return out_files

def _add_bed_to_output(out, data):
    """Call ploidy and convert into BED representation.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = dd.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call",
                   "--ploidy", str(dd.get_ploidy(data)),
                   "-o", tx_call_file, out["cns"]]
            if gender:
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    out_file = "%s.bed" % os.path.splitext(call_file)[0]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                   "bed", "--sample-id", dd.get_sample_name(data),
                   "--ploidy", str(dd.get_ploidy(data)),
                   "-o", tx_out_file, call_file]
            if gender and gender.lower() == "male":
                cmd += ["--male-reference"]
            do.run(cmd, "CNVkit export BED")
    out["call_file"] = call_file
    out["vrn_file"] = annotate.add_genes(out_file, data)
    return out

def cram_compress(in_bam):
    import os
    import subprocess
    from bcbio import utils
    from bcbio.distributed.transaction import file_transaction
    print in_bam
    ref_file = "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/seq/GRCh37.fa"
    out_file = "%s.cram" % os.path.splitext(in_bam)[0]
    jvm_opts = "-Xms1g -Xmx3g"
    if not utils.file_exists(out_file):
        print "cramming", out_file
        with file_transaction(out_file) as tx_out_file:
            cmd = ("cramtools {jvm_opts} cram "
                   "--input-bam-file {in_bam} "
                   "--capture-all-tags "
                   "--ignore-tags 'BD:BI' "
                   "--reference-fasta-file {ref_file} "
                   "--lossy-quality-score-spec '*8' "
                   "--output-cram-file {tx_out_file}")
            subprocess.check_call(cmd.format(**locals()), shell=True)
    if not utils.file_exists(out_file + ".crai"):
        print "indexing", out_file + ".crai"
        with file_transaction(out_file + ".crai") as tx_out_file:
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(out_file, tx_in_file)
            cmd = ("cramtools {jvm_opts} index "
                   "--input-file {tx_in_file}")
            subprocess.check_call(cmd.format(**locals()), shell=True)
    if os.path.exists(in_bam) and utils.file_exists(out_file):
        if in_bam != out_file and in_bam.endswith(".bam") and out_file.endswith(".cram"):
            os.remove(in_bam)
    return out_file

def _extract_split_and_discordants(in_bam, work_dir, data): """Retrieve split-read alignments from input BAM file. """ sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0]) disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0]) samblaster = config_utils.get_program("samblaster", data["config"]) sambamba = config_utils.get_program("sambamba", data["config"]) cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1) resources = config_utils.get_resources("sambamba", data["config"]) mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease") if not utils.file_exists(sr_file) or not utils.file_exists(disc_file): with file_transaction(sr_file) as tx_sr_file: with file_transaction(disc_file) as tx_disc_file: with utils.curdir_tmpdir() as tmpdir: tobam_cmd = ("{sambamba} view -S -f bam -l 0 /dev/stdin | " "{sambamba} sort -t {cores} -m {mem} --tmpdir {tmpdir} " "-o {out_file} /dev/stdin") splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, **locals()) discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, **locals()) cmd = ("{sambamba} sort -t {cores} -m {mem} --tmpdir={tmpdir} " "-n -o /dev/stdout -l 0 {in_bam} | " "{sambamba} view -h /dev/stdin | " "{samblaster} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) " "-o /dev/null") do.run(cmd.format(**locals()), "samblaster: split and discordant reads", data) return sr_file, disc_file
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data): """Provide prioritized tab delimited output for a single caller. """ sample = dd.get_sample_name(data) out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller)) if not utils.file_exists(out_file): priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0] if not utils.file_exists(priority_vcf): with file_transaction(data, priority_vcf) as tx_out_file: cmd = ("bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}") do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest") if post_prior_fn: priority_vcf = post_prior_fn(priority_vcf, work_dir, data) simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0] if not utils.file_exists(simple_vcf): with file_transaction(data, simple_vcf) as tx_out_file: transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir) if transcript_file: transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"]) ann_opt = "--gene_bed %s" % transcript_file else: ann_opt = "" cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}" do.run(cmd.format(**locals()), "Prioritize: simplified annotation output") simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"]) with file_transaction(data, out_file) as tx_out_file: cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} " """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """ "print CALLER,SNAME,$1,$2,I$END," """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,""" "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN," "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}") do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited") return out_file
def calc_variants_stats(data, args):
    in_vcf = data['vcf']
    ref_file = args.reference
    # gatk_jar = '/groups/bcbio/bcbio/toolplus/gatk/3.2-2-gec30cee/GenomeAnalysisTK.jar'
    jvm_opts = broad.get_gatk_framework_opts(data['config'])
    gatk_jar = config_utils.get_program("gatk", data['config'], "dir")
    bed_file = args.region
    sample = splitext_plus(op.basename(in_vcf))[0]
    in_bam = data['bam']
    cg_file = op.join(args.out, sample + "_with-gc.vcf.gz")
    parse_file = op.join(args.out, sample + "_cg-depth-parse.tsv")
    if not file_exists(cg_file):
        with file_transaction(cg_file) as tx_out:
            cmd = ("java -jar {gatk_jar}/GenomeAnalysisTK.jar -T VariantAnnotator -R {ref_file} "
                   "-L {bed_file} -I {in_bam} "
                   "-A GCContent --variant {in_vcf} --out {tx_out}")
            do.run(cmd.format(**locals()), " cg for %s" % in_vcf)
    if not file_exists(parse_file):
        with file_transaction(parse_file) as out_tx:
            with open(out_tx, 'w') as out_handle:
                print >>out_handle, "CG\tdepth\tsample"
            cmd = ("bcftools query -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R {bed_file} {cg_file} >> {out_tx}")
            do.run(cmd.format(**locals()), " query for %s" % in_vcf)
    logger.info('parsing coverage: %s' % sample)
    # return df
    return parse_file

def run(bam_file, data, out_dir):
    """Run viral QC analysis.
    """
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        viral_refs = [x for x in dd.get_viral_files(data)
                      if os.path.basename(x) == "%s.fa" % viral_target]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(utils.safe_makedir(out_dir),
                                     "%s-%s.bam" % (dd.get_sample_name(data),
                                                    utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        cores = dd.get_num_cores(data)
                        tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                        cmd = ("samtools view -u -f 4 {bam_file} | "
                               "bamtofastq collate=0 | "
                               "bwa mem -t {cores} {viral_ref} - | "
                               "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                               "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                        do.run(cmd.format(**locals()), "Compare unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                        for info in bam.idxstats(viral_bam, data):
                            if info.aligned > 0:
                                out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
            out["base"] = out_file
    return out

def calculate_tstv(args):
    """
    get tstv from bcftools stat for all, known and new variants
    """
    tstv = defaultdict(list)
    for in_vcf in args.bams:
        out_file = os.path.splitext(in_vcf)[0] + ".stats"
        known_file = os.path.splitext(in_vcf)[0] + ".known.stats"
        new_file = os.path.splitext(in_vcf)[0] + ".new.stats"
        sample = os.path.basename(in_vcf).split("-")[0]
        if not file_exists(out_file):
            with file_transaction(out_file) as tx_out:
                cmd = ("bcftools stats {in_vcf} > {tx_out}")
                do.run(cmd.format(**locals()), "ts/tv ratio for %s" % in_vcf)
        if not file_exists(new_file):
            with file_transaction(new_file) as tx_new:
                cmd = ("bcftools filter -i DB=0 {in_vcf} | bcftools stats /dev/stdin > {tx_new}")
                do.run(cmd.format(**locals()), "ts/tv ratio for %s" % in_vcf)
        if not file_exists(known_file):
            with file_transaction(known_file) as tx_known:
                cmd = ("bcftools filter -i DB=1 {in_vcf} | bcftools stats /dev/stdin > {tx_known}")
                do.run(cmd.format(**locals()), "ts/tv ratio for %s" % in_vcf)
        for fn, name in zip([out_file, known_file, new_file], ['all', 'known', 'new']):
            with open(fn) as in_handle:
                for line in in_handle:
                    if line.startswith("TSTV"):
                        tstv[sample].append(line.split()[4])
                        break
    df = pd.DataFrame(tstv, index=['all', 'known', 'new'])
    df.to_csv(args.out)

def _cnvkit_targets(raw_target_bed, access_bed, cov_interval, work_dir, data): """Create target and antitarget regions from target and access files. """ batch = dd.get_batch(data) or dd.get_sample_name(data) basename = os.path.splitext(os.path.basename(raw_target_bed))[0] target_bed = os.path.join(work_dir, "%s-%s.target.bed" % (basename, batch)) # back compatible with previous runs to avoid re-calculating target_bed_old = os.path.join(work_dir, "%s.target.bed" % basename) if utils.file_exists(target_bed_old): target_bed = target_bed_old if not utils.file_exists(target_bed): with file_transaction(data, target_bed) as tx_out_file: cmd = [_get_cmd(), "target", raw_target_bed, "--split", "-o", tx_out_file] bin_estimates = _cnvkit_coverage_bin_estimate(raw_target_bed, access_bed, cov_interval, work_dir, data) if bin_estimates.get("target"): cmd += ["--avg-size", str(bin_estimates["target"])] do.run(_prep_cmd(cmd, tx_out_file), "CNVkit target") antitarget_bed = os.path.join(work_dir, "%s-%s.antitarget.bed" % (basename, batch)) antitarget_bed_old = os.path.join(work_dir, "%s.antitarget.bed" % basename) # back compatible with previous runs to avoid re-calculating if os.path.exists(antitarget_bed_old): antitarget_bed = antitarget_bed_old if not os.path.exists(antitarget_bed): with file_transaction(data, antitarget_bed) as tx_out_file: cmd = [_get_cmd(), "antitarget", "-g", access_bed, target_bed, "-o", tx_out_file] bin_estimates = _cnvkit_coverage_bin_estimate(raw_target_bed, access_bed, cov_interval, work_dir, data) if bin_estimates.get("antitarget"): cmd += ["--avg-size", str(bin_estimates["antitarget"])] do.run(_prep_cmd(cmd, tx_out_file), "CNVkit antitarget") return target_bed, antitarget_bed
def _bgzip_from_cram(cram_file, dirs, data):
    """Create bgzipped fastq files from an input CRAM file in regions of interest.

    Returns a list with a single file, for single end CRAM files, or two
    files for paired end input.
    """
    region_file = (tz.get_in(["config", "algorithm", "variant_regions"], data)
                   if tz.get_in(["config", "algorithm", "coverage_interval"], data) in ["regional", "exome"]
                   else None)
    if region_file:
        regions = ["%s:%s-%s" % tuple(r) for r in pybedtools.BedTool(region_file)]
    else:
        regions = [None]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s-%s.fq.gz" %
                                          (utils.splitext_plus(os.path.basename(cram_file))[0], fext))
                             for fext in ["s1", "p1", "p2"]]
    if not utils.file_exists(out_s) and not utils.file_exists(out_p1):
        cram.index(cram_file)
        fastqs = _cram_to_fastq_regions(regions, cram_file, dirs, data)
        if len(fastqs[0]) == 1:
            with file_transaction(out_s) as tx_out_file:
                _merge_and_bgzip([xs[0] for xs in fastqs], tx_out_file, out_s)
        else:
            for i, out_file in enumerate([out_p1, out_p2]):
                ext = "/%s" % (i + 1)
                with file_transaction(out_file) as tx_out_file:
                    _merge_and_bgzip([xs[i] for xs in fastqs], tx_out_file, out_file, ext)
    if utils.file_exists(out_p1):
        return [out_p1, out_p2]
    else:
        assert utils.file_exists(out_s)
        return [out_s]

def _prioritize_vcf(caller, vcf_file, prioritize_by, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if not utils.file_exists(out_file):
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if not utils.file_exists(priority_vcf):
            with file_transaction(data, priority_vcf) as tx_out_file:
                cmd = "bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}"
                do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
        if not utils.file_exists(simple_vcf):
            with file_transaction(data, simple_vcf) as tx_out_file:
                cmd = "simple_sv_annotation.py -o - {priority_vcf} | bgzip -c > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
        simple_vcf = vcfutils.bgzip_and_index(simple_vcf, data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".")) """
                   "print CALLER,SNAME,$1,$2,I$END,I$SVTYPE,I$KNOWN,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file

def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file, config)
    return ann_file

def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            return
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        _rename_allelic_fraction_field(out_file_mutect, config)
        disable_SID = True  # SID isn't great, so use Scalpel instead
        if "appistry" not in broad_runner.get_mutect_version() or disable_SID:
            # Scalpel InDels
            is_paired = "-I:normal" in params
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if scalpel.is_installed(items[0]["config"]):
                with file_transaction(out_file_indels) as tx_out_file2:
                    if not is_paired:
                        scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                    else:
                        scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        else:
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
    return out_file

def variants(data, out_dir):
    """Variants QC metrics"""
    if "variants" not in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None
        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        print >>out_handle, "CG\tdepth\tsample"
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()), "Calculating GC content and depth for %s" % in_vcf)
            logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # These files will be copied to the final directory
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                remove_plus(cg_file)

def trim_adapters(fastq_files, dirs, config):
    QUALITY_CUTOFF = 5
    to_trim = _get_sequences_to_trim(config, ALIENTRIMMER_ADAPTERS)
    resources = config_utils.get_resources("AlienTrimmer", config)
    try:
        jarpath = config_utils.get_program("AlienTrimmer", config, "dir")
    # fall back on Cutadapt if AlienTrimmer is not installed
    # XXX: remove after it has been live for a while
    except:
        return trim_read_through(fastq_files, dirs, config)
    jarfile = config_utils.get_jar("AlienTrimmer", jarpath)
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]))
    base_cmd = ("java -jar {jvm_opts} {jarfile} -k 10 -l 20 ")
    fastq1 = fastq_files[0]
    supplied_quality_format = _get_quality_format(config)
    cores = config["algorithm"].get("num_cores", 0)
    out_files = _get_read_through_trimmed_outfiles(fastq_files, dirs)
    fastq1_out = out_files[0]
    if supplied_quality_format == "illumina":
        quality_flag = QUALITY_FLAGS[QUALITY_CUTOFF][0]
    else:
        quality_flag = QUALITY_FLAGS[QUALITY_CUTOFF][1]
    quality_flag = '-q ' + quality_flag
    if len(fastq_files) == 1:
        if file_exists(fastq1_out):
            return [fastq1_out]
        base_cmd += ("-i {fastq1} -o {tx_fastq1_out} -c {temp_file} "
                     "{quality_flag}")
        message = "Trimming %s from %s with AlienTrimmer." % (to_trim, fastq1)
    else:
        fastq2 = fastq_files[1]
        fastq2_out = out_files[1]
        if all(map(file_exists, [fastq1_out, fastq2_out])):
            return [fastq1_out, fastq2_out]
        base_cmd += ("-if {fastq1} -ir {fastq2} -of {tx_fastq1_out} "
                     "-or {tx_fastq2_out} -c {temp_file} {quality_flag}")
        message = ("Trimming %s from %s and %s with AlienTrimmer."
                   % (to_trim, fastq1, fastq2))
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        temp_file = temp.name
        for adapter in to_trim:
            temp.write(adapter + "\n")
        temp.close()
    if len(fastq_files) == 1:
        with file_transaction(fastq1_out) as tx_fastq1_out:
            do.run(base_cmd.format(**locals()), message)
        return [fastq1_out]
    else:
        with file_transaction([fastq1_out, fastq2_out]) as tx_out_files:
            tx_fastq1_out = tx_out_files[0]
            tx_fastq2_out = tx_out_files[1]
            do.run(base_cmd.format(**locals()), message)
        return [fastq1_out, fastq2_out]

def variants(data): if "vrn_file" not in data: return data if not dd.get_coverage(data): return data in_vcf = data["vrn_file"] work_dir = os.path.join(dd.get_work_dir(data), "report", "variants") with chdir(work_dir): in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) ref_file = dd.get_ref_file(data) assert ref_file, "Need the reference genome fasta file." bed_file = dd.get_variant_regions(data) sample = dd.get_sample_name(data) in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) cg_file = os.path.join(sample + "_with-gc.vcf.gz") parse_file = os.path.join(sample + "_gc-depth-parse.tsv") num_cores = dd.get_num_cores(data) broad_runner = broad.runner_from_config_safe(data["config"]) if in_bam and broad_runner and broad_runner.has_gatk(): if not file_exists(cg_file): with file_transaction(cg_file) as tx_out: params = [ "-T", "VariantAnnotator", "-R", ref_file, "-L", bed_file, "-I", in_bam, "-A", "GCContent", "-A", "Coverage", "--variant", in_vcf, "--out", tx_out, ] broad_runner.run_gatk(params) cg_file = vcfutils.bgzip_and_index(cg_file, data["config"]) if not file_exists(parse_file): with file_transaction(parse_file) as out_tx: with open(out_tx, "w") as out_handle: print >> out_handle, "CG\tdepth\tsample" cmd = ( "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R " "{bed_file} {cg_file} >> {out_tx}" ) do.run(cmd.format(**locals()), "Calculating GC content and depth for %s" % in_vcf) logger.debug("parsing coverage: %s" % sample) return data
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                vardict = config_utils.get_program("vardict", config)
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
                vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                sample = item["name"][1]
                cmd = ("{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file

def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        bam.index(bam_files[0], config)
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            samblaster = config_utils.get_program("samblaster", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(cmd.format(**locals()),
                                   "Merge bam files to %s" % os.path.basename(out_file), None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
        for b in bam_files:
            utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    bam.index(out_file, config)
    return out_file

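# Worked example of the batching logic above (assumed numbers, not taken from bcbio):
# with a soft open-file limit of 1024 and sambamba using roughly 4 handles per file,
# batch_size = 1024 // 4 - 100 = 156, so merging 400 BAMs first produces
# ceil(400 / 156) = 3 intermediate batch merges before the final merge of those batches.
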
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file, transcript_tpm_file, tx2gene]]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot("id", "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot("id", "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file), orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples

def _cut_file(self, in_file):
    """
    run cutadapt on a single file
    """
    adapters = self._get_adapters(self.chemistry)
    out_file = self.in2trimmed(in_file)
    if file_exists(out_file):
        return out_file
    cutadapt = sh.Command(self.stage_config.get("program", "cutadapt"))
    quality_format = self.quality_format
    if not quality_format:
        quality_format = self._detect_fastq_format(in_file)
    if quality_format == "sanger":
        logger.info("Quality format detected as sanger.")
        quality_base = 33
    elif quality_format == "illumina":
        logger.info("Quality format set to illumina 1.5/1.3")
        quality_base = 64
    else:
        logger.error("Quality format could not be detected. Quality "
                     "Detected or set as %s. It should be illumina "
                     "or sanger." % quality_format)
        exit(1)
    # if we want to trim the polya tails we have to first remove
    # the adapters and then trim the tail
    if self.stage_config.get("trim_polya", True):
        temp_cut = tempfile.NamedTemporaryFile(suffix=".fastq", dir=self.out_dir)
        # trim off adapters
        cmd = str(cutadapt.bake(in_file, self.options, adapters,
                                quality_base=quality_base, out=temp_cut.name))
        do.run(cmd, "Cutadapt trim of adapters of %s." % (in_file), None)
        with file_transaction(out_file) as temp_out:
            polya = ADAPTERS.get("polya")
            # trim off polya
            cmd = str(cutadapt.bake(temp_cut.name, self.options, "-a", polya, "-a",
                                    self._rc_adapters(polya), quality_base=quality_base,
                                    out=temp_out))
            do.run(cmd, "Cutadapt trim of polyA tail of %s." % (temp_cut.name), None)
        return out_file
    else:
        with file_transaction(out_file) as temp_out:
            cmd = str(cutadapt.bake(in_file, self.options, adapters, out=temp_out))
            do.run(cmd, "Cutadapt trim of %s." % (in_file))
        return out_file

def _cnvkit_targets(raw_target_bed, access_bed, cov_interval, work_dir, data): """Create target and antitarget regions from target and access files. """ target_bed = os.path.join(work_dir, "%s.target.bed" % os.path.splitext(os.path.basename(raw_target_bed))[0]) if not utils.file_uptodate(target_bed, raw_target_bed): with file_transaction(data, target_bed) as tx_out_file: cmd = [_get_cmd(), "target", raw_target_bed, "--split", "-o", tx_out_file] do.run(cmd, "CNVkit target") antitarget_bed = os.path.join(work_dir, "%s.antitarget.bed" % os.path.splitext(os.path.basename(raw_target_bed))[0]) if not utils.file_uptodate(antitarget_bed, target_bed): with file_transaction(data, antitarget_bed) as tx_out_file: cmd = [_get_cmd(), "antitarget", "-g", access_bed, target_bed, "-o", tx_out_file] do.run(cmd, "CNVkit antitarget") return target_bed, antitarget_bed
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                input_beds = filter(lambda x: x is not None,
                                    [_create_bed(c, out_file, data) for c in calls])
                if len(input_beds) > 0:
                    all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(input_beds):
                            out_handle.write(line)
                    pybedtools.BedTool(all_file).sort(stream=True)\
                        .merge(c=4, o="distinct", delim=",").saveas(tx_out_file)
    if utils.file_exists(out_file):
        calls.append({"variantcaller": "ensemble",
                      "vrn_file": out_file})
    return calls

def _bam_coverage(name, bam_input, data):
    """Run bamCoverage from deeptools"""
    cmd = ("{bam_coverage} --bam {bam_input} --outFileName {bw_output} "
           "--binSize 20 --effectiveGenomeSize {size} "
           "--smoothLength 60 --extendReads 150 --centerReads -p {cores} ")
    size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    cores = dd.get_num_cores(data)
    try:
        bam_coverage = config_utils.get_program("bamCoverage", data)
    except config_utils.CmdNotFound:
        logger.info("No bamCoverage found, skipping bamCoverage.")
        return None
    resources = config_utils.get_resources("bamCoverage", data["config"])
    if resources:
        options = resources.get("options")
        if options:
            cmd += " %s" % " ".join([str(x) for x in options])
    bw_output = os.path.join(os.path.dirname(bam_input), "%s.bw" % name)
    if utils.file_exists(bw_output):
        return bw_output
    with file_transaction(bw_output) as out_tx:
        do.run(cmd.format(**locals()), "Run bamCoverage in %s" % name)
    return bw_output

def gatk_rnaseq_calling(data):
    """
    use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".gvcf"
    num_cores = dd.get_num_cores(data)
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "HaplotypeCaller",
                  "-R", ref_file,
                  "-I", split_bam,
                  "-o", tx_out_file,
                  "-nct", str(num_cores),
                  "--emitRefConfidence", "GVCF",
                  "--variant_index_type", "LINEAR",
                  "--variant_index_parameter", "128000",
                  "-dontUseSoftClippedBases"]
        broad_runner.run_gatk(params)
    data = dd.set_vrn_file(data, out_file)
    return data

def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(variant_file), "Manta finished without output file %s" % variant_file
    out = []
    for data in items:
        sample_file = _select_sample(data, variant_file, work_dir)
        if "sv" not in data:
            data["sv"] = []
        effects_vcf, _ = effects.add_to_vcf(sample_file, data, "snpeff")
        data["sv"].append({"variantcaller": "manta",
                           "vrn_file": effects_vcf or sample_file})
        out.append(data)
    return out

def picard_mark_duplicates(picard, align_bam, remove_dups=False):
    base, ext = os.path.splitext(align_bam)
    base = base.replace(".", "-")
    dup_bam = "%s-dup%s" % (base, ext)
    dup_metrics = "%s-dup.dup_metrics" % base
    if not file_exists(dup_bam):
        with tx_tmpdir(picard._config) as tmp_dir:
            with file_transaction(picard._config, dup_bam, dup_metrics) as (tx_dup_bam, tx_dup_metrics):
                opts = [("INPUT", align_bam),
                        ("OUTPUT", tx_dup_bam),
                        ("TMP_DIR", tmp_dir),
                        ("REMOVE_DUPLICATES", "true" if remove_dups else "false"),
                        ("METRICS_FILE", tx_dup_metrics)]
                if picard.get_picard_version("MarkDuplicates") >= 1.82:
                    opts += [("PROGRAM_RECORD_ID", "null")]
                picard.run("MarkDuplicates", opts, memscale={"direction": "decrease", "magnitude": 2})
    return dup_bam, dup_metrics

def picard_fastq_to_bam(picard, fastq_one, fastq_two, out_dir, names, order="queryname"):
    """Convert fastq file(s) to BAM, adding sample, run group and platform information.
    """
    out_bam = os.path.join(out_dir, "%s-fastq.bam" %
                           os.path.splitext(os.path.basename(fastq_one))[0])
    if not file_exists(out_bam):
        with tx_tmpdir(picard._config) as tmp_dir:
            with file_transaction(picard._config, out_bam) as tx_out_bam:
                opts = [("FASTQ", fastq_one),
                        ("READ_GROUP_NAME", names["rg"]),
                        ("SAMPLE_NAME", names["sample"]),
                        ("PLATFORM_UNIT", names["pu"]),
                        ("PLATFORM", names["pl"]),
                        ("TMP_DIR", tmp_dir),
                        ("OUTPUT", tx_out_bam),
                        ("SORT_ORDER", order)]
                if fastq_two:
                    opts.append(("FASTQ2", fastq_two))
                picard.run("FastqToSam", opts)
    return out_bam

def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bowtie2 with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    bowtie2 = config_utils.get_program("bowtie2", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_index = index_transcriptome(gtf_file, ref_file, data)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    fastq_cmd = "-1 %s" % fastq_file if pair_file else "-U %s" % fastq_file
    pair_cmd = "-2 %s " % pair_file if pair_file else ""
    cmd = ("{bowtie2} -p {num_cores} -a -X 600 --rdg 6,5 --rfg 6,5 --score-min L,-.6,-.4 "
           "--no-discordant --no-mixed -x {gtf_index} {fastq_cmd} {pair_cmd} ")
    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file, name_sort=True)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data

def _run_snpeff(snp_in, out_format, data):
    snpeff_db, datadir = get_db(data)
    assert datadir is not None, \
        "Did not find snpEff resources in genome configuration: %s" % data["genome_resources"]
    assert os.path.exists(os.path.join(datadir, snpeff_db)), \
        "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir)
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv"
    out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        if ext.endswith(".gz"):
            bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"])
        else:
            bgzip_cmd = ""
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{snpeff_cmd} {config_args} -noLog -1 -i vcf -o {out_format} "
                   "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "snpEff effects", data)
    if ext.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file

def genotype_filter(vcf_file, expression, data, name, filterext=""):
    """Perform genotype based filtering using GATK with the provided expression.

    Adds FT tags to genotypes, rather than the general FILTER flag.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "VariantFiltration",
                      "-R", tz.get_in(["reference", "fasta", "base"], data),
                      "--variant", vcf_file,
                      "--out", tx_out_file,
                      "--genotypeFilterName", name,
                      "--genotypeFilterExpression", "'%s'" % expression]
            jvm_opts = broad.get_gatk_framework_opts(data["config"])
            cmd = [config_utils.get_program("gatk-framework", data["config"])] + jvm_opts + params
            do.run(cmd, "Filter with expression: %s" % expression)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file

def _prep_sample_cnvs(cnv_file, data):
    """Convert a multiple sample CNV file into a single BED file for a sample.

    Handles matching and fixing names where R converts numerical IDs (1234) into
    strings by adding an X (X1234), and converts other characters into '.'s.

    http://stat.ethz.ch/R-manual/R-devel/library/base/html/make.names.html
    """
    import pybedtools
    sample_name = tz.get_in(["rgnames", "sample"], data)

    def make_names(name):
        return re.sub("[^\w.]", '.', name)

    def matches_sample_name(feat):
        return (feat.name == sample_name or feat.name == "X%s" % sample_name
                or feat.name == make_names(sample_name))

    def update_sample_name(feat):
        feat.name = sample_name
        return feat

    sample_file = os.path.join(os.path.dirname(cnv_file), "%s-cnv.bed" % sample_name)
    if not utils.file_exists(sample_file):
        with file_transaction(data, sample_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                pybedtools.BedTool(cnv_file).filter(matches_sample_name).each(update_sample_name).saveas(tx_out_file)
    return sample_file

def _grabix_index(data):
    """Create grabix index of bgzip input file.

    grabix does not allow specification of output file, so symlink the original
    file into a transactional directory.
    """
    in_file = data["bgzip_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    gbi_file = _get_grabix_index(in_file)
    # We always build grabix input so we can use it for counting reads and doing downsampling
    if not gbi_file or _is_partial_index(gbi_file):
        if gbi_file:
            utils.remove_safe(gbi_file)
        else:
            gbi_file = in_file + ".gbi"
        with file_transaction(data, gbi_file) as tx_gbi_file:
            tx_in_file = os.path.splitext(tx_gbi_file)[0]
            utils.symlink_plus(in_file, tx_in_file)
            do.run([grabix, "index", tx_in_file],
                   "Index input with grabix: %s" % os.path.basename(in_file))
    assert utils.file_exists(gbi_file)
    return [gbi_file]
def salmon_quant_bam(bam_file, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = _libtype_string(bam_file, strandedness)
    cmd = ("{salmon} quant {libtype} -p {num_cores} -t {gtf_fa} "
           "-o {tx_out_dir} -a {bam_file} ")
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = "Quantifying transcripts in %s with Salmon." % bam_file
        do.run(cmd.format(**locals()), message, None)
    return out_file
def picard_sam_to_bam(picard, align_sam, fastq_bam, ref_file, is_paired=False):
    """Convert SAM to BAM, including unmapped reads from fastq BAM file.
    """
    if align_sam.endswith(".sam"):
        out_bam = "%s.bam" % os.path.splitext(align_sam)[0]
    elif align_sam.endswith("-align.bam"):
        out_bam = "%s.bam" % align_sam.replace("-align.bam", "")
    else:
        raise NotImplementedError("Input format not recognized")
    if not file_exists(out_bam):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_bam) as tx_out_bam:
                opts = [("UNMAPPED", fastq_bam),
                        ("ALIGNED", align_sam),
                        ("OUTPUT", tx_out_bam),
                        ("REFERENCE_SEQUENCE", ref_file),
                        ("TMP_DIR", tmp_dir),
                        ("PAIRED_RUN", ("true" if is_paired else "false"))]
                picard.run("MergeBamAlignment", opts)
    return out_bam
def sort_merge(in_file, data, out_dir=None):
    """Sort and merge a BED file, collapsing gene names.

    Output is a 3 or 4 column file (the 4th column values go comma-separated).
    """
    out_file = "%s-sortmerge.bed" % os.path.splitext(in_file)[0]
    bedtools = config_utils.get_program("bedtools", data, default="bedtools")
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file):
        column_opt = ""
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if not line.startswith(("#", "track", "browser")):
                    parts = line.split()
                    if len(parts) >= 4:
                        column_opt = "-c 4 -o distinct"
        with file_transaction(data, out_file) as tx_out_file:
            cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
            sort_cmd = get_sort_cmd()
            cmd = ("{cat_cmd} {in_file} | {sort_cmd} -k1,1 -k2,2n | "
                   "{bedtools} merge -i - {column_opt} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Sort and merge BED file", data)
    return out_file
def gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                           region=None, out_file=None, deep_coverage=False,
                           config=None):
    """Perform realignment of BAM file in specified regions
    """
    if out_file is None:
        out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        with curdir_tmpdir({"config": config}) as tmp_dir:
            with file_transaction(out_file) as tx_out_file:
                logger.info("GATK IndelRealigner: %s %s" % (os.path.basename(align_bam), region))
                cl = gatk_indel_realignment_cl(runner, align_bam, ref_file, intervals,
                                               tmp_dir, region, deep_coverage)
                cl += ["-o", tx_out_file]
                do.run(cl, "GATK indel realignment", {})
    return out_file
def merge_overlaps(in_file, data, distance=None, out_dir=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.
    """
    if in_file:
        bedtools = config_utils.get_program("bedtools", data["config"])
        work_dir = tz.get_in(["dirs", "work"], data)
        if out_dir:
            bedprep_dir = out_dir
        elif work_dir:
            bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
        else:
            bedprep_dir = os.path.dirname(in_file)
        out_file = os.path.join(bedprep_dir, "%s-merged.bed" %
                                (utils.splitext_plus(os.path.basename(in_file))[0]))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                distance = "-d %s" % distance if distance else ""
                cmd = "{bedtools} merge {distance} -i {in_file} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare merged BED file", data)
            vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file
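# Usage sketch, not part of the original module: collapse target intervals that
# sit within 10bp of each other before handing the BED to FreeBayes. The input
# path, output directory and distance are illustrative assumptions.
def _example_merge_overlaps(data):
    return merge_overlaps("targets.bed", data, distance=10, out_dir="bedprep")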
def pizzly(pizzly_path, gtf, gtf_fa, fraglength, cachefile, pizzlydir, fusions,
           samplename, data):
    outdir = os.path.join(pizzlydir, samplename)
    out_stem = os.path.join(outdir, samplename)
    pizzly_gtf = make_pizzly_gtf(gtf, os.path.join(pizzlydir, "pizzly.gtf"), data)
    sentinel = out_stem + "-flat-filtered.tsv"
    pizzlycalls = out_stem + ".json"
    if not file_exists(pizzlycalls):
        with file_transaction(data, outdir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            tx_out_stem = os.path.join(tx_out_dir, samplename)
            cmd = ("{pizzly_path} -k 31 --gtf {pizzly_gtf} --cache {cachefile} "
                   "--align-score 2 --insert-size {fraglength} --fasta {gtf_fa} "
                   "--output {tx_out_stem} {fusions}")
            message = "Running pizzly on %s." % fusions
            do.run(cmd.format(**locals()), message)
    flatfile = out_stem + "-flat.tsv"
    filteredfile = out_stem + "-flat-filtered.tsv"
    flatten_pizzly(pizzlycalls, flatfile, data)
    filter_pizzly(flatfile, filteredfile, data)
    return outdir
def _prepare_inputs(ma_fn, bam_file, out_dir):
    """
    Convert to fastq with counts
    """
    fixed_fa = os.path.join(out_dir, "file_reads.fa")
    count_name = dict()
    with file_transaction(fixed_fa) as out_tx:
        with open(out_tx, 'w') as out_handle:
            with open(ma_fn) as in_handle:
                h = in_handle.next()  # skip header line
                for line in in_handle:
                    cols = line.split("\t")
                    name_with_counts = "%s_x%s" % (cols[0], sum(map(int, cols[2:])))
                    count_name[cols[0]] = name_with_counts
                    print >>out_handle, ">%s\n%s" % (name_with_counts, cols[1])
    fixed_bam = os.path.join(out_dir, "align.bam")
    bam_handle = pysam.AlignmentFile(bam_file, "rb")
    with pysam.AlignmentFile(fixed_bam, "wb", template=bam_handle) as out_handle:
        for read in bam_handle.fetch():
            read.query_name = count_name[read.query_name]
            out_handle.write(read)
    return fixed_fa, fixed_bam
def _segs_to_vcf(in_file, data):
    """Convert output TitanCNA segs file into bgzipped VCF.
    """
    out_file = "%s.vcf" % utils.splitext_plus(in_file)[0]
    if not utils.file_exists(out_file + ".gz") and not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(_vcf_header)
                    out_handle.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL",
                                                "FILTER", "INFO", "FORMAT",
                                                dd.get_sample_name(data)]) + "\n")
                    header = in_handle.readline().strip().split("\t")
                    for line in in_handle:
                        cur = dict(zip(header, line.strip().split("\t")))
                        svtype = _get_svtype(cur["TITAN_call"])
                        info = ["SVTYPE=%s" % svtype,
                                "END=%s" % cur["End_Position.bp."],
                                "CN=%s" % cur["Copy_Number"],
                                "MajorCN=%s" % cur["MajorCN"],
                                "MinorCN=%s" % cur["MinorCN"],
                                "FOLD_CHANGE_LOG=%s" % cur["Median_logR"]]
                        out = [cur["Chromosome"], cur["Start_Position.bp."], ".", "N",
                               "<%s>" % svtype, ".", ".", ";".join(info), "GT", "0/1"]
                        out_handle.write("\t".join(out) + "\n")
    return vcfutils.bgzip_and_index(out_file, data["config"])
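# _segs_to_vcf relies on a module-level `_vcf_header` constant that is not shown
# here. The following is a minimal sketch of what such a header could contain,
# assuming standard VCF 4.1 metadata lines for the INFO and FORMAT keys written
# above; the exact text in the source module may differ.
_vcf_header_example = (
    '##fileformat=VCFv4.1\n'
    '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">\n'
    '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">\n'
    '##INFO=<ID=CN,Number=1,Type=Integer,Description="Total copy number">\n'
    '##INFO=<ID=MajorCN,Number=1,Type=Integer,Description="Copy number of the major allele">\n'
    '##INFO=<ID=MinorCN,Number=1,Type=Integer,Description="Copy number of the minor allele">\n'
    '##INFO=<ID=FOLD_CHANGE_LOG,Number=1,Type=Float,Description="Median log ratio of the segment">\n'
    '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')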
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            cores = dd.get_num_cores(data)
            if gatk_type == "gatk4":
                params = ["-T", "ApplyBQSRSpark", "--sparkMaster", "local[%s]" % cores,
                          "--input", in_file, "--output", tx_out_file,
                          "--bqsr_recal_file", data["prep_recal"],
                          "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
            else:
                params = ["-T", "PrintReads", "-R", dd.get_ref_file(data),
                          "-I", in_file, "-BQSR", data["prep_recal"],
                          "-o", tx_out_file]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=(cores > 1 and gatk_type == "gatk4"))
    bam.index(out_file, data["config"])
    return out_file
def _disambiguate_star_fusion_junctions(star_junction_file, contamination_bam,
                                        disambig_out_file, data):
    """
    Disambiguate detected fusions based on alignments to another species.
    """
    out_file = disambig_out_file
    fusiondict = {}
    with open(star_junction_file, "r") as in_handle:
        for my_line in in_handle:
            my_line_split = my_line.strip().split("\t")
            if len(my_line_split) < 10:
                continue
            fusiondict[my_line_split[9]] = my_line.strip("\n")
    with pysam.Samfile(contamination_bam, "rb") as samfile:
        for my_read in samfile:
            if my_read.is_unmapped or my_read.is_secondary:
                continue
            if my_read.qname in fusiondict:
                fusiondict.pop(my_read.qname)
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, 'w') as myhandle:
            for my_key in fusiondict:
                print(fusiondict[my_key], file=myhandle)
    return out_file
def run(bam_file, data, out_dir):
    out = {}
    if not tz.get_in(["config", "algorithm", "preseq"], data):
        return out
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, "samtools")
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)
    stats_file = os.path.join(out_dir, "%s.txt" % dd.get_sample_name(data))
    if not utils.file_exists(stats_file):
        utils.safe_makedir(out_dir)
        preseq = config_utils.get_program("preseq", data["config"])
        params = _get_preseq_params(data, int(samtools_stats["Total_reads"]))
        param_line = "-step {step} -extrap {extrap} -seg_len {seg_len}".format(**params)
        with file_transaction(data, stats_file) as tx_out_file:
            cmd = "{preseq} lc_extrap -bam -pe {bam_file} -o {tx_out_file} {param_line}".format(**locals())
            do.run(cmd, "preseq lc_extrap", data)
    out = _prep_real_counts(bam_file, data, samtools_stats)
    return {"base": stats_file, "metrics": out}
def _merge_metrics(yaml_data):
    """
    parse project.yaml file to get metrics for each bam
    """
    project = yaml_data
    out_file = os.path.join("metrics", "metrics.tsv")
    dt_together = []
    with file_transaction(out_file) as out_tx:
        for s in project['samples']:
            m = s['summary']['metrics']
            for me in m:
                if isinstance(m[me], list):
                    m[me] = ":".join(m[me])
            dt = pd.DataFrame(m, index=['1'])
            # dt = pd.DataFrame.from_dict(m)
            dt.columns = [k.replace(" ", "_").replace("(", "").replace(")", "")
                          for k in dt.columns]
            dt['sample'] = s['description']
            dt_together.append(dt)
        dt_together = utils.rbind(dt_together)
        dt_together.to_csv(out_tx, index=False, sep="\t")
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    rext = "-%s" % region.replace(":", "_").replace("-", "_") if region else "full"
    out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s%s-%s.fq.gz" % (base_name, rext, fext))
                             for fext in ["s1", "p1", "p2"]]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2) as (tx_out_s, tx_out_p1, tx_out_p2):
            cram_file = utils.remote_cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                   "gz=1 collate=1 colsbs={max_mem} "
                   "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O=/dev/null O2=/dev/null "
                   "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            do.run(cmd.format(**locals()), "CRAM to fastq %s" % (region if region else ""))
    return [[out_p1, out_p2, out_s]]
def regions_coverage(data, bed_file, bam_file, target_name, depth_thresholds=None):
    """Generate coverage over regions of interest using sambamba depth.

    sambamba can segfault with multiple threads, so this falls back to a
    single-threaded run in case of failures.
    """
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "coverage",
                                               dd.get_sample_name(data)))
    out_file = os.path.join(work_dir, target_name + "_regions_depth.bed")
    if utils.file_uptodate(out_file, bam_file) and utils.file_uptodate(out_file, bed_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        try:
            cmdl = sambamba.make_command(data, "depth region", bam_file, bed_file,
                                         depth_thresholds=depth_thresholds)
            cmdl += " -o " + tx_out_file
            message = "Calculating regions coverage of {target_name} in {bam_file}"
            do.run(cmdl, message.format(**locals()))
        except subprocess.CalledProcessError:
            cmdl = sambamba.make_command(data, "depth region", bam_file, bed_file,
                                         depth_thresholds=depth_thresholds, multicore=False)
            cmdl += " -o " + tx_out_file
            message = "Calculating regions coverage of {target_name} in {bam_file} -- single thread backup"
            do.run(cmdl, message.format(**locals()))
    return out_file
def cnvkit_background(background_cnns, out_file, items, target_bed=None, antitarget_bed=None):
    """Calculate background reference, handling flat case with no normal sample.
    """
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cmd = [_get_cmd(), "reference", "-f", dd.get_ref_file(items[0]), "-o", tx_out_file]
            gender = _get_batch_gender(items)
            if gender:
                cmd += ["--sample-sex", gender]
            if len(background_cnns) == 0:
                assert target_bed and antitarget_bed, "Missing CNNs and target BEDs for flat background"
                cmd += ["-t", target_bed, "-a", antitarget_bed]
            else:
                cmd += background_cnns
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit background")
    return out_file
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.
    """
    align_bam = dd.get_work_bam(data)
    umi_method, umi_tag = _check_umi_type(align_bam)
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
            # Improve speeds by avoiding compression read/write bottlenecks
            io_opts = "--async-io=true --compression=0"
            group_opts, cons_opts = _get_fgbio_options(data, umi_method)
            cons_method = "CallDuplexConsensusReads" if umi_method == "paired" else "CallMolecularConsensusReads"
            tempfile = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            cmd = ("unset JAVA_HOME && "
                   "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                   "-i {align_bam} | "
                   "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=unsorted "
                   "-i /dev/stdin -o /dev/stdout | "
                   "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out
def coverage_region_detailed_stats(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs for the regions
    specified in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        parse_file = os.path.join(sample + "_coverage.bed")
        if not (utils.file_uptodate(parse_file, cleaned_bed) and
                utils.file_uptodate(parse_file, in_bam)):
            with file_transaction(parse_file) as out_tx:
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                                             max_cov=1000)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
            parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
            parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample)
    return os.path.abspath(parse_file)
def _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data):
    """Retrieve intervals to run validation on, merging reference and callable BED files.
    """
    a_intervals = get_analysis_intervals(data, vrn_file, base_dir)
    if a_intervals:
        final_intervals = shared.remove_lcr_regions(a_intervals, [data])
        if rm_interval_file:
            caller = _get_caller(data)
            sample = dd.get_sample_name(data)
            combo_intervals = os.path.join(base_dir, "%s-%s-%s-wrm.bed" %
                                           (utils.splitext_plus(os.path.basename(final_intervals))[0],
                                            sample, caller))
            if not utils.file_uptodate(combo_intervals, final_intervals):
                with file_transaction(data, combo_intervals) as tx_out_file:
                    with utils.chdir(os.path.dirname(tx_out_file)):
                        # Copy files locally to avoid issues on shared filesystems
                        # where BEDtools has trouble accessing the same base
                        # files from multiple locations
                        a = os.path.basename(final_intervals)
                        b = os.path.basename(rm_interval_file)
                        try:
                            shutil.copyfile(final_intervals, a)
                        except IOError:
                            time.sleep(60)
                            shutil.copyfile(final_intervals, a)
                        try:
                            shutil.copyfile(rm_interval_file, b)
                        except IOError:
                            time.sleep(60)
                            shutil.copyfile(rm_interval_file, b)
                        cmd = "bedtools intersect -nonamecheck -a {a} -b {b} > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Intersect callable intervals for rtg vcfeval")
            final_intervals = combo_intervals
    else:
        assert rm_interval_file, "No intervals to subset analysis with for %s" % vrn_file
        final_intervals = shared.remove_lcr_regions(rm_interval_file, [data])
    return final_intervals
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion.

    Excludes high depth and centromere regions which contribute to long run times and
    false positive structural variant calls.
    """
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0],
                                     "-%s" % chrom if chrom else "")
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with shared.bedtools_tmpdir(items[0]):
            # Get a bedtool for the full region if no variant regions
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            if chrom:
                want_bedtool = pybedtools.BedTool(shared.subset_bed_by_chrom(want_bedtool.saveas().fn,
                                                                             chrom, items[0]))
            sv_exclude_bed = _get_sv_exclude_file(items)
            if sv_exclude_bed and len(want_bedtool) > 0:
                want_bedtool = want_bedtool.subtract(sv_exclude_bed, nonamecheck=True).saveas()
            if any(dd.get_coverage_interval(d) == "genome" for d in items):
                want_bedtool = pybedtools.BedTool(shared.remove_highdepth_regions(want_bedtool.saveas().fn, items))
            with file_transaction(items[0], out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool, nonamecheck=True).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file