def run(bam_file, data, out_dir):
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                                   target_file, target_file, None, data["config"])
        if utils.file_exists(hsmetric_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
def run(data):
    """Quantitative isoform expression by eXpress"""
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    tophat_index = get_in(data, ('genome_resources', 'rnaseq', 'transcriptome_index', 'tophat'))
    if not tophat_index:
        logger.info("Tophat index not found, skipping running eXpress.")
        return None
    tophat_fa = tophat_index.replace("ver", "fa")
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    safe_makedir(out_dir)
    express = config_utils.get_program("express", data['config'])
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return None
    if not file_exists(out_file):
        with tx_tmpdir() as tmp_dir:
            chdir(tmp_dir)
            ref_transcript = _do_fasta(tophat_fa)
            cmd = ("{express} {ref_transcript} {in_bam}")
            do.run(cmd.format(**locals()), "Run express", {})
            shutil.move("results.xprs", out_file)
    eff_count_file = _get_column(out_file, out_file.replace(".xprs", "_eff.counts"), 7)
    tpm_file = _get_column(out_file, out_file.replace("xprs", "tpm"), 14)
    fpkm_file = _get_column(out_file, out_file.replace("xprs", "fpkm"), 10)
    return (eff_count_file, tpm_file, fpkm_file)
def priority_total_coverage(data, out_dir):
    """
    Calculate coverage at 10 depth intervals in the priority regions.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    # guard uses `or`: a missing or gene-list BED both mean there is nothing to do
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        # data['priority_total_coverage'] = os.path.abspath(out_file)
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    # data['priority_total_coverage'] = os.path.abspath(out_file)
    return out_file
def _call_hla(hla_fq, out_dir, data):
    """Run OptiType HLA calling for a specific fastq input.
    """
    bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    out_dir = utils.safe_makedir(out_dir)
    with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
        config_file = os.path.join(tx_out_dir, "config.ini")
        with open(config_file, "w") as out_handle:
            razers3 = os.path.join(bin_dir, "razers3")
            if not os.path.exists(razers3):
                raise ValueError("Could not find razers3 executable at %s" % (razers3))
            out_handle.write(CONFIG_TMPL.format(razers3=razers3, cores=dd.get_cores(data)))
        resources = config_utils.get_resources("optitype", data["config"])
        if resources.get("options"):
            opts = " ".join([str(x) for x in resources["options"]])
        else:
            opts = ""
        cmd = ("OptiTypePipeline.py -v --dna {opts} -o {tx_out_dir} "
               "-i {hla_fq} -c {config_file}")
        do.run(cmd.format(**locals()), "HLA typing with OptiType")
        for outf in os.listdir(tx_out_dir):
            shutil.move(os.path.join(tx_out_dir, outf), os.path.join(out_dir, outf))
    out_file = glob.glob(os.path.join(out_dir, "*", "*_result.tsv"))
    assert len(out_file) == 1, "Expected one result file for OptiType, found %s" % out_file
    return out_file[0]
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    if not utils.file_exists(os.path.join(raw_work_dir, "%s.cnr" % out_base)):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            target_bed = tz.get_in(["config", "algorithm", "variant_regions"], data)
            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_file,
                   "-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    return {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
            "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "back_cnn": os.path.join(raw_work_dir, background_cnn)}
def _goleft_indexcov(bam_file, data, out_dir):
    """Use goleft indexcov to estimate coverage distributions using BAM index.

    Only used for whole genome runs as captures typically don't have enough
    data to be useful for index-only summaries.
    """
    if not dd.get_coverage_interval(data) == "genome":
        return []
    out_dir = utils.safe_makedir(os.path.join(out_dir, "indexcov"))
    out_files = [os.path.join(out_dir, "%s-indexcov.%s" % (dd.get_sample_name(data), ext))
                 for ext in ["roc", "ped", "bed.gz"]]
    if not utils.file_uptodate(out_files[-1], bam_file):
        with transaction.tx_tmpdir(data) as tmp_dir:
            tmp_dir = utils.safe_makedir(os.path.join(tmp_dir, dd.get_sample_name(data)))
            gender_chroms = [x.name for x in ref.file_contigs(dd.get_ref_file(data))
                             if chromhacks.is_sex(x.name)]
            gender_args = "--sex %s" % (",".join(gender_chroms)) if gender_chroms else ""
            cmd = "goleft indexcov --directory {tmp_dir} {gender_args} -- {bam_file}"
            try:
                do.run(cmd.format(**locals()), "QC: goleft indexcov")
            except subprocess.CalledProcessError as msg:
                if not ("indexcov: no usable" in str(msg) or
                        ("indexcov: expected" in str(msg) and "sex chromosomes, found:" in str(msg))):
                    raise
            for out_file in out_files:
                orig_file = os.path.join(tmp_dir, os.path.basename(out_file))
                if utils.file_exists(orig_file):
                    utils.copy_plus(orig_file, out_file)
    # MultiQC needs non-gzipped/BED inputs so unpack the file
    out_bed = out_files[-1].replace(".bed.gz", ".tsv")
    if utils.file_exists(out_files[-1]) and not utils.file_exists(out_bed):
        with transaction.file_transaction(data, out_bed) as tx_out_bed:
            cmd = "gunzip -c %s > %s" % (out_files[-1], tx_out_bed)
            do.run(cmd, "Unpack indexcov BED file")
    out_files[-1] = out_bed
    return [x for x in out_files if utils.file_exists(x)]
def run_mutect(self, params, tmp_dir=None):
    with tx_tmpdir(self._config) as local_tmp_dir:
        if tmp_dir is None:
            tmp_dir = local_tmp_dir
        cl = self.cl_mutect(params, tmp_dir)
        prog = "MuTect"
        do.run(cl, "MuTect: {0}".format(prog), None)
def _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv, using speedseq pipeline.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    out_file = os.path.join(work_dir, "%s%s.vcf"
                            % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with tx_tmpdir(items[0]) as tmpdir:
                full_bams = ",".join(full_bams)
                sr_bams = ",".join(sr_bams)
                disc_bams = ",".join(disc_bams)
                exclude = "-x %s" % sv_exclude_bed if utils.file_exists(sv_exclude_bed) else ""
                ref_file = dd.get_ref_file(items[0])
                # use our bcbio python for runs within lumpyexpress
                curpython_dir = os.path.dirname(sys.executable)
                cmd = ("export PATH={curpython_dir}:$PATH && "
                       "lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                       "{exclude} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(**locals()), "lumpyexpress", items[0])
    return vcfutils.sort_by_ref(out_file, items[0]), sv_exclude_bed
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_samples_with_pipelines(samples)
    final = []
    with tx_tmpdir(config) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def _callable_intersect(in_file, callable_bed, data):
    """Return list of original VCF SVs intersected by callable regions.

    Does not try to handle BNDs. We should resolve these and return where possible.
    """
    with tx_tmpdir(data) as tmpdir:
        in_bed = os.path.join(tmpdir, "%s-convert.bed"
                              % utils.splitext_plus(os.path.basename(in_file))[0])
        with utils.open_gzipsafe(in_file) as in_handle:
            with open(in_bed, "w") as out_handle:
                for parts in (l.split("\t") for l in in_handle if not l.startswith("#")):
                    start, end = _get_start_end(parts)
                    if end:
                        out_handle.write("\t".join([parts[0], start, end] + parts) + "\n")
        out_file = os.path.join(tmpdir, "%s-subset.tsv"
                                % utils.splitext_plus(os.path.basename(in_file))[0])
        cmd = "bedtools intersect -a {in_bed} -b {callable_bed} -wa -wb > {out_file}"
        do.run(cmd.format(**locals()), "Intersect VCF by callable")
        with open(out_file) as in_handle:
            for line in in_handle:
                yield line.rstrip().split("\t")[3:]
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read and discordant alignments from the input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease").upper()
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with tx_tmpdir(data) as tmpdir:
            with file_transaction(data, sr_file) as tx_sr_file:
                with file_transaction(data, disc_file) as tx_disc_file:
                    with file_transaction(data, dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        # basename keeps the name-sort temporary files inside tmpdir
                        out_base = os.path.join(tmpdir, "%s-namesort"
                                                % os.path.splitext(os.path.basename(in_bam))[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    logger.info("System YAML configuration: %s" % os.path.abspath(config_file))
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    pipelines, config = _pair_samples_with_pipelines(run_info_yaml, config)
    system.write_info(dirs, parallel, config)
    with tx_tmpdir(config if parallel.get("type") == "local" else None) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, samples in pipelines.items():
            for xs in pipeline(config, run_info_yaml, parallel, dirs, samples):
                pass
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3, "decrease").upper()
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, "-")
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa_cmd} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def test_create_tmpdir_in_a_specified_base_dir(self, mock_io):
    with tx_tmpdir(base_dir='somedir'):
        pass
    transaction.utils.get_abspath.assert_called_once_with('somedir/bcbiotx')
    transaction.utils.safe_makedir.assert_called_once_with(
        transaction.utils.get_abspath.return_value)
def run_gatk(self, params, tmp_dir=None, log_error=True,
             data=None, region=None, memscale=None, parallel_gc=False,
             ld_preload=False):
    """Top level interface to running a GATK command.

    ld_preload injects required libraries for Java JNI calls:
    https://gatkforums.broadinstitute.org/gatk/discussion/8810/something-about-create-pon-workflow
    """
    needs_java7 = LooseVersion(self.get_gatk_version()) < LooseVersion("3.6")
    # For old Java requirements use global java 7
    if needs_java7:
        setpath.remove_bcbiopath()
    with tx_tmpdir(self._config) as local_tmp_dir:
        if tmp_dir is None:
            tmp_dir = local_tmp_dir
        cl = self.cl_gatk(params, tmp_dir, memscale=memscale, parallel_gc=parallel_gc)
        atype_index = params.index("-T") if params.count("-T") > 0 \
            else params.index("--analysis_type")
        prog = params[atype_index + 1]
        cl = fix_missing_spark_user(cl, prog, params)
        if ld_preload:
            cl = "export LD_PRELOAD=%s/lib/libopenblas.so && %s" % (os.path.dirname(utils.get_bcbio_bin()), cl)
        do.run(cl, "GATK: {0}".format(prog), data, region=region, log_error=log_error)
    if needs_java7:
        setpath.prepend_bcbiopath()
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    if not utils.file_exists(os.path.join(raw_work_dir, "%s.cnr" % out_base)):
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            target_bed = tz.get_in(["config", "algorithm", "variant_regions"], data)
            cmd = ["batch"] + test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_file,
                   "-d", raw_work_dir, "--split",
                   "-p", str(tz.get_in(["config", "algorithm", "num_cores"], data, 1)),
                   "--output-reference", os.path.join(raw_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            args = cnvlib_cmd.parse_args(cmd)
            args.func(args)
            shutil.move(tx_work_dir, raw_work_dir)
    return {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
            "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "back_cnn": os.path.join(raw_work_dir, background_cnn)}
def _mirtop(input_fn, sps, db, out_dir, config):
    """
    Convert to GFF3 standard format
    """
    hairpin = os.path.join(db, "hairpin.fa")
    gtf = os.path.join(db, "mirbase.gff3")
    if not file_exists(hairpin) or not file_exists(gtf):
        logger.warning("%s or %s are not installed. Skipping." % (hairpin, gtf))
        return None
    out_gtf_fn = "%s.gtf" % utils.splitext_plus(os.path.basename(input_fn))[0]
    out_gff_fn = "%s.gff" % utils.splitext_plus(os.path.basename(input_fn))[0]
    export = _get_env()
    cmd = ("{export} mirtop gff --sps {sps} --hairpin {hairpin} "
           "--gtf {gtf} --format seqbuster -o {out_tx} {input_fn}")
    if not file_exists(os.path.join(out_dir, out_gtf_fn)) and \
       not file_exists(os.path.join(out_dir, out_gff_fn)):
        with tx_tmpdir() as out_tx:
            do.run(cmd.format(**locals()), "Do miRNA annotation for %s" % input_fn)
            with utils.chdir(out_tx):
                out_fn = out_gtf_fn if utils.file_exists(out_gtf_fn) else out_gff_fn
                if utils.file_exists(out_fn):
                    shutil.move(os.path.join(out_tx, out_fn),
                                os.path.join(out_dir, out_fn))
    out_fn = out_gtf_fn if utils.file_exists(os.path.join(out_dir, out_gtf_fn)) \
        else os.path.join(out_dir, out_gff_fn)
    if utils.file_exists(os.path.join(out_dir, out_fn)):
        return os.path.join(out_dir, out_fn)
def piped_bamprep(data, region=None, out_file=None):
    """Perform full BAM preparation using pipes to avoid intermediate disk IO.

    Handles recalibration and realignment of original BAMs.
    """
    data["region"] = region
    if not _need_prep(data):
        return [data]
    else:
        utils.safe_makedir(os.path.dirname(out_file))
        if region[0] == "nochrom":
            prep_bam = shared.write_nochr_reads(data["work_bam"], out_file, data["config"])
        elif region[0] == "noanalysis":
            prep_bam = shared.write_noanalysis_reads(data["work_bam"], region[1], out_file,
                                                     data["config"])
        else:
            if not utils.file_exists(out_file):
                with tx_tmpdir(data) as tmp_dir:
                    _piped_bamprep_region(data, region, out_file, tmp_dir)
            prep_bam = out_file
        bam.index(prep_bam, data["config"])
        data["work_bam"] = prep_bam
        return [data]
def _trna_annotation(data):
    """
    use tDRmapper to quantify tRNAs
    """
    trna_ref = op.join(dd.get_srna_trna_file(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna", name))
    in_file = op.basename(data["clean_fastq"])
    tdrmapper = os.path.join(os.path.dirname(sys.executable), "TdrMappingScripts.pl")
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_ref) or not file_exists(tdrmapper):
        logger.info("There is no tRNA annotation to run TdrMapper.")
        return work_dir
    out_file = op.join(work_dir, in_file + ".hq_cs.mapped")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && perl {tdrmapper} {trna_ref} {in_file}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*mapped*"):
                    shutil.move(filename, work_dir)
    return work_dir
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read and discordant alignments from the input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease").upper()
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with tx_tmpdir(data) as tmpdir:
            with file_transaction(data, sr_file) as tx_sr_file:
                with file_transaction(data, disc_file) as tx_disc_file:
                    with file_transaction(data, dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort"
                                                % os.path.splitext(os.path.basename(in_bam))[0])
                        cmd = ("{samtools} sort -n -@ {cores} -m {mem} -O sam -T {out_base} {in_bam} | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
def test_makes_unique_tmp_dir(self, mock_io):
    """Test that tx_tmpdir creates a tmp dir with a unique name using
    `tempfile.mkdtemp` inside the base dir."""
    with tx_tmpdir(None):
        pass
    transaction.tempfile.mkdtemp.assert_called_once_with(
        dir=transaction.utils.get_abspath.return_value)
def run_gatk(self, params, tmp_dir=None, log_error=True,
             data=None, region=None, memscale=None, parallel_gc=False):
    needs_java7 = LooseVersion(self.get_gatk_version()) < LooseVersion("3.6")
    # For old Java requirements use global java 7
    if needs_java7:
        setpath.remove_bcbiopath()
    with tx_tmpdir(self._config) as local_tmp_dir:
        if tmp_dir is None:
            tmp_dir = local_tmp_dir
        cl = self.cl_gatk(params, tmp_dir, memscale=memscale, parallel_gc=parallel_gc)
        atype_index = params.index("-T") if params.count("-T") > 0 \
            else params.index("--analysis_type")
        prog = params[atype_index + 1]
        do.run(cl, "GATK: {0}".format(prog), data, region=region, log_error=log_error)
    if needs_java7:
        setpath.prepend_bcbiopath()
def priority_total_coverage(data):
    """
    Calculate coverage at 10 depth intervals in the priority regions.
    """
    bed_file = dd.get_svprioritize(data)
    # guard uses `or`: skip when the BED is unset or missing on disk
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
def coverage(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for each region in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(out_dir)
    if not bed_file:
        return None
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000",
                           "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample)
    return os.path.abspath(parse_file)
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of an input BAM file, using unix pipes to avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    if not file_exists(out_file):
        with tx_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def priority_coverage(data, out_dir):
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return data
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
        with file_transaction(out_file) as tx_out_file:
            parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
            cmd = ("{sambamba} depth base -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "{in_bam} | {parse_cmd} > {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    return out_file
def _run_cnvkit_shared(data, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            cov_interval = dd.get_coverage_interval(data)
            raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
            # bail out if we ended up with no regions
            if not utils.file_exists(raw_target_bed):
                return {}
            target_bed = annotate.add_genes(raw_target_bed, data)
            # Do not parallelize cnvkit due to current issues with multi-processing
            cores = 1
            # cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
            #             len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_bed] + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            if cov_interval not in ["amplicon", "genome"]:
                at_avg, at_min, t_avg = _get_antitarget_size(access_bed, target_bed)
                if at_avg:
                    cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                            "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of an input BAM file, using unix pipes to avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    if not file_exists(out_file):
        with tx_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("unset JAVA_HOME && "
                       "{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = (cmd + tobam_cl).format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run titanCNA wrapper script on given ploidy and clusters.
    """
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(os.path.join(work_dir, "run_ploidy%s" % ploidy))
    cluster_dir = "%s_cluster%02d" % (sample, num_clusters)
    out_dir = os.path.join(ploidy_dir, cluster_dir)
    if not utils.file_uptodate(out_dir + ".titan.txt", cn_file):
        with tx_tmpdir(data) as tmp_dir:
            with utils.chdir(tmp_dir):
                cmd = ("{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} "
                       "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir}")
                do.run(cmd.format(**locals()),
                       "TitanCNA CNV detection: ploidy %s, cluster %s" % (ploidy, num_clusters))
            for fname in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")):
                shutil.move(fname, ploidy_dir)
            if os.path.exists(os.path.join(tmp_dir, "Rplots.pdf")):
                shutil.move(os.path.join(tmp_dir, "Rplots.pdf"),
                            os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir))
    return ploidy_dir
def coverage(data):
    """
    Calculate coverage at different completeness cutoffs
    for each region in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return data
    cleaned_bed = os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed"
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
                cleaned_bed = bed.decomment(bed_file, cleaned_bed)
                with file_transaction(parse_file) as out_tx:
                    cmd = ("sambamba depth region -F \"not unmapped\" -t {cores} "
                           "-C 1000 -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()), "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, bed_file, sample)
        _calculate_percentiles(parse_file, sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
def _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence, work_dir, items):
    """Run lumpy-sv, using speedseq pipeline.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    out_file = os.path.join(work_dir, "%s%s.vcf"
                            % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with tx_tmpdir(items[0]) as tmpdir:
                full_bams = ",".join(full_bams)
                sr_bams = ",".join(sr_bams)
                disc_bams = ",".join(disc_bams)
                exclude = "-x %s" % sv_exclude_bed if (sv_exclude_bed and utils.file_exists(sv_exclude_bed)) else ""
                ref_file = dd.get_ref_file(items[0])
                depths = []
                for sample, ev_files in previous_evidence.items():
                    for ev_type, ev_file in ev_files.items():
                        if utils.file_exists(ev_file):
                            depths.append("%s:%s" % (sample, ev_file))
                depth_arg = "-d %s" % ",".join(depths) if len(depths) > 0 else ""
                # use our bcbio python for runs within lumpyexpress
                exports = utils.local_path_export()
                cmd = ("{exports}lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                       "{exclude} {depth_arg} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(**locals()), "lumpyexpress", items[0])
    return vcfutils.sort_by_ref(out_file, items[0]), sv_exclude_bed
def priority_total_coverage(data):
    """
    Calculate coverage at 10 depth intervals in the priority regions.
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
def run(bam_file, data, out_dir):
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    return hsmetric_file
def _gids_to_genes(gids, ssm_locs, cnv_ssms, data):
    """Convert support ids for SNPs and SSMs into associated genes.
    """
    locs = collections.defaultdict(set)
    for gid in gids:
        cur_locs = []
        try:
            cur_locs.append(ssm_locs[gid])
        except KeyError:
            for ssm_loc in cnv_ssms.get(gid, []):
                cur_locs.append(ssm_locs[ssm_loc])
        for chrom, pos in cur_locs:
            locs[chrom].add(pos)
    genes = set([])
    with tx_tmpdir(data) as tmpdir:
        chrom_prefix = "chr" if next(ref.file_contigs(dd.get_ref_file(data))).name.startswith("chr") else ""
        loc_file = os.path.join(tmpdir, "battenberg_find_genes.bed")
        with open(loc_file, "w") as out_handle:
            for chrom in sorted(locs.keys()):
                for loc in sorted(list(locs[chrom])):
                    out_handle.write("%s%s\t%s\t%s\n" % (chrom_prefix, chrom, loc - 1, loc))
        ann_file = annotate.add_genes(loc_file, data, max_distance=10000)
        for r in pybedtools.BedTool(ann_file):
            for gene in r.name.split(","):
                if gene != ".":
                    genes.add(gene)
    return sorted(list(genes))
def _mint_trna_annotation(data):
    """
    use MINTmap to quantify tRNAs
    """
    name = dd.get_sample_name(data)
    work_dir = os.path.join(dd.get_work_dir(data), "trna_mint", name)
    if not dd.get_srna_mint_lookup(data):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                params = ["-T", "PrintReads",
                          "-R", ref_file,
                          "-I", in_bam,
                          "--out", tx_out_file,
                          "--filter_mismatching_base_and_quals",
                          "--filter_bases_not_stored",
                          "--filter_reads_with_N_cigar"]
                if dd.get_quality_format(data, "").lower() == "illumina":
                    params.append("--fix_misencoded_quality_scores")
                jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
                cmd = [config_utils.get_program("gatk-framework", data["config"])] + jvm_opts + params
                do.run(cmd, "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
def _mint_trna_annotation(data):
    """
    use MINTmap to quantify tRNAs
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
def run(data):
    """Proxy function to run the tool"""
    sample = data[0][0]
    work_dir = dd.get_work_dir(sample)
    out_dir = os.path.join(work_dir, "mirge")
    lib = _find_lib(sample)
    mirge = _find_mirge(sample)
    bowtie = _find_bowtie(sample)
    sps = dd.get_species(sample)
    species = SPS.get(sps, "")
    if not species:
        raise ValueError("species not supported (hsa, mmu, rno, dre, cel, dme): %s" % sps)
    if not lib:
        raise ValueError("-lib option is not set up in resources for mirge tool."
                         " See the warning lines above.")
    if not utils.file_exists(out_dir):
        with tx_tmpdir() as tmp_dir:
            sample_file = _create_sample_file(data, tmp_dir)
            do.run(_cmd().format(**locals()), "Running miRge2.0.")
            shutil.move(tmp_dir, out_dir)
    return [os.path.abspath(fn) for fn in glob.glob(os.path.join(out_dir, "*", "*"))]
def _run_qsnp_paired(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Detect somatic mutations with qSNP.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        out_file = out_file.replace(".gz", "")
        with file_transaction(config, out_file) as tx_out_file:
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    paired = get_paired_bams(align_bams, items)
                    qsnp = config_utils.get_program("qsnp", config)
                    resources = config_utils.get_resources("qsnp", config)
                    mem = " ".join(resources.get("jvm_opts", ["-Xms750m -Xmx4g"]))
                    qsnp_log = os.path.join(tmpdir, "qsnp.log")
                    qsnp_init = os.path.join(tmpdir, "qsnp.ini")
                    if region:
                        paired = _create_bam_region(paired, region, tmpdir)
                    _create_input(paired, tx_out_file, ref_file, assoc_files['dbsnp'], qsnp_init)
                    cl = ("{qsnp} {mem} -i {qsnp_init} -log {qsnp_log}")
                    do.run(cl.format(**locals()), "Genotyping paired variants with qSNP", {})
        out_file = _filter_vcf(out_file)
        out_file = bgzip_and_index(out_file, config)
    return out_file
def coverage(data):
    """
    Calculate coverage at different completeness cutoffs
    for each region in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage"))
    if not bed_file:
        return data
    cleaned_bed = os.path.join(work_dir,
                               os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed")
    cleaned_bed = bed.decomment(bed_file, cleaned_bed)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000",
                           "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        _calculate_percentiles(os.path.abspath(parse_file), sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
def dedup_bam(in_bam, data):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if _check_dedup(data):
        out_file = os.path.join(utils.safe_makedir(os.path.join(os.getcwd(), "align",
                                                                dd.get_sample_name(data))),
                                "%s-dedup%s" % utils.splitext_plus(os.path.basename(in_bam)))
        if not utils.file_exists(out_file):
            with tx_tmpdir(data) as tmpdir:
                with file_transaction(data, out_file) as tx_out_file:
                    bammarkduplicates = config_utils.get_program("bammarkduplicates", data["config"])
                    base_tmp = os.path.join(tmpdir, os.path.splitext(os.path.basename(tx_out_file))[0])
                    cores, mem = _get_cores_memory(data, downscale=2)
                    cmd = ("{bammarkduplicates} tmpfile={base_tmp}-markdup "
                           "markthreads={cores} I={in_bam} O={tx_out_file}")
                    do.run(cmd.format(**locals()), "De-duplication with biobambam")
        bam.index(out_file, data["config"])
        return out_file
    else:
        return in_bam
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None, samples=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    samples -- Pre-processed samples, useful if run inside of docker containers.
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    if samples:
        dockerized = True
    else:
        dockerized = False
        samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_samples_with_pipelines(samples)
    final = []
    with tx_tmpdir(config) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            if not dockerized:
                versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
def run(data):
    """Quantitative isoform expression by eXpress"""
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    config = data['config']
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    express = config_utils.get_program("express", data['config'])
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(out_dir) as tx_out_dir:
                bam_file = _prepare_bam_file(in_bam, tmp_dir, config)
                cmd = ("{express} --no-update-check -o {tx_out_dir} {strand} {gtf_fasta} {bam_file}")
                do.run(cmd.format(**locals()), "Run express on %s." % in_bam, {})
        shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    eff_count_file = _get_column(out_file, out_file.replace(".xprs", "_eff.counts"), 7)
    tpm_file = _get_column(out_file, out_file.replace("xprs", "tpm"), 14)
    fpkm_file = _get_column(out_file, out_file.replace("xprs", "fpkm"), 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            # pick targets, anti-targets and access files based on analysis type
            # http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
            cov_interval = dd.get_coverage_interval(data)
            base_regions = dd.get_variant_regions(data)
            # For genome calls, subset to regions within 10kb of genes
            if cov_interval == "genome":
                base_regions = annotate.subset_by_genes(base_regions, data, work_dir, pad=1e4)
            raw_target_bed = bedutils.merge_overlaps(base_regions, data, out_dir=work_dir)
            target_bed = annotate.add_genes(raw_target_bed, data)
            # bail out if we ended up with no regions
            if not utils.file_exists(target_bed):
                return {}
            if cov_interval == "amplicon":
                target_opts = ["--targets", target_bed, "--access", target_bed]
            elif cov_interval == "genome":
                target_opts = ["--targets", target_bed, "--access", dd.get_variant_regions(data)]
            else:
                target_opts = ["--targets", target_bed, "--access", access_file]
            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  target_opts + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
def test_makes_base_tmp_dir(self, mock_io):
    """Test that tx_tmpdir creates a base temporary directory."""
    with tx_tmpdir(None):
        pass
    transaction.utils.safe_makedir.assert_called_once_with(
        transaction.utils.get_abspath.return_value)
@contextlib.contextmanager  # decorator restored: required so the bare yield works with "with bedtools_tmpdir(data):"
def bedtools_tmpdir(data):
    with tx_tmpdir(data) as tmpdir:
        orig_tmpdir = tempfile.gettempdir()
        pybedtools.set_tempdir(tmpdir)
        yield
        if orig_tmpdir and os.path.exists(orig_tmpdir):
            pybedtools.set_tempdir(orig_tmpdir)
        else:
            tempfile.tempdir = None
def test_gets_base_tmpdir_name_from_config_or_cwd(self, mock_io, mocker):
    mocker.patch('bcbio.distributed.transaction._get_base_tmpdir')
    data = mock.Mock()
    with tx_tmpdir(data):
        pass
    cwd = transaction.os.getcwd.return_value
    transaction._get_base_tmpdir.assert_called_once_with(data, cwd)
    base_tmpdir = transaction._get_base_tmpdir.return_value
    transaction.utils.get_abspath.assert_called_once_with(base_tmpdir)
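# All of the examples above share one transactional pattern: do the work inside
# a temporary directory obtained from tx_tmpdir(), then move finished outputs
# into their final location, so an interrupted run never leaves partial files
# behind and reruns can skip completed work via file_exists checks. A minimal
# sketch of that pattern follows; `run_tool` is a hypothetical callable standing
# in for whatever command the caller wants to run, not a bcbio API.
import os
import shutil

from bcbio import utils
from bcbio.distributed.transaction import tx_tmpdir


def _tx_tmpdir_pattern_sketch(data, out_file, run_tool):
    """Illustrative only: the canonical tx_tmpdir usage shared by the functions above."""
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            # write results to the temporary location first
            tmp_out = os.path.join(tmp_dir, os.path.basename(out_file))
            run_tool(tmp_out)  # hypothetical tool invocation producing tmp_out
            # promote to the final path only after the tool succeeds
            shutil.move(tmp_out, out_file)
    return out_file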