def run(data):
    """Quantitative isoform expression with eXpress.
    """
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    tophat_index = get_in(data, ('genome_resources', 'rnaseq', 'transcriptome_index', 'tophat'))
    if not tophat_index:
        logger.info("Tophat index not found, skipping running eXpress.")
        return None
    tophat_fa = tophat_index.replace("ver", "fa")
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    safe_makedir(out_dir)
    express = config_utils.get_program("express", data['config'])
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return None
    if not file_exists(out_file):
        with tx_tmpdir() as tmp_dir:
            with chdir(tmp_dir):
                ref_transcript = _do_fasta(tophat_fa)
                cmd = ("{express} {ref_transcript} {in_bam}")
                do.run(cmd.format(**locals()), "Run express", {})
                shutil.move("results.xprs", out_file)
    eff_count_file = _get_column(out_file, out_file.replace(".xprs", "_eff.counts"), 7)
    tpm_file = _get_column(out_file, out_file.replace("xprs", "tpm"), 14)
    fpkm_file = _get_column(out_file, out_file.replace("xprs", "fpkm"), 10)
    return (eff_count_file, tpm_file, fpkm_file)
def report_summary(samples, run_parallel):
    """Run coverage report with the bcbiocov package.
    """
    work_dir = dd.get_work_dir(samples[0][0])
    parent_dir = utils.safe_makedir(os.path.join(work_dir, "report"))
    qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma")
    with utils.chdir(parent_dir):
        logger.info("copy qsignature")
        if qsignature_fn:
            if utils.file_exists(qsignature_fn) and not utils.file_exists("qsignature.ma"):
                shutil.copy(qsignature_fn, "qsignature.ma")
        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)
        out_dir = utils.safe_makedir("coverage")
        out_dir = utils.safe_makedir("variants")
        samples = run_parallel("coverage_report", samples)
        try:
            import bcbreport.prepare as bcbreport
            bcbreport.report(parent_dir)
        except:
            logger.info("skipping report. No bcbreport installed.")
        logger.info("summarize metrics")
        samples = _merge_metrics(samples)
    return samples
def coverage(data):
    """Calculate coverage at different completeness cutoffs
    for the regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                cmd = ("sambamba depth region -F \"not unmapped\" -t {cores} -C 1000 "
                       "-T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 -T 80 -T 100 "
                       "-L {bed_file} {in_bam} | sed 's/# chrom/chrom/' > {out_tx}")
                do.run(cmd.format(**locals()), "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, bed_file, sample)
        _calculate_percentiles(parse_file, sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
def _run_qsnp_paired(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Detect somatic mutations with qSNP.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        out_file = out_file.replace(".gz", "")
        with file_transaction(config, out_file) as tx_out_file:
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    paired = get_paired_bams(align_bams, items)
                    qsnp = config_utils.get_program("qsnp", config)
                    resources = config_utils.get_resources("qsnp", config)
                    mem = " ".join(resources.get("jvm_opts", ["-Xms750m -Xmx4g"]))
                    qsnp_log = os.path.join(tmpdir, "qsnp.log")
                    qsnp_init = os.path.join(tmpdir, "qsnp.ini")
                    if region:
                        paired = _create_bam_region(paired, region, tmpdir)
                    _create_input(paired, tx_out_file, ref_file, assoc_files['dbsnp'], qsnp_init)
                    cl = ("{qsnp} {mem} -i {qsnp_init} -log {qsnp_log}")
                    do.run(cl.format(**locals()), "Genotyping paired variants with Qsnp", {})
        out_file = _filter_vcf(out_file)
        out_file = bgzip_and_index(out_file, config)
    return out_file
def _generate_qseq(bc_dir, config):
    """Generate qseq files from Illumina bcl files if not present.

    More recent Illumina updates do not produce qseq files. Illumina's
    offline base caller (OLB) generates these starting with bcl, intensity
    and filter files.
    """
    if not os.path.exists(os.path.join(bc_dir, "finished.txt")):
        bcl2qseq_log = os.path.join(config["log_dir"], "setupBclToQseq.log")
        cmd = os.path.join(config["program"]["olb"], "bin", "setupBclToQseq.py")
        cl = [cmd, "-L", bcl2qseq_log, "-o", bc_dir, "--in-place", "--overwrite",
              "--ignore-missing-stats", "--ignore-missing-control"]
        # in OLB version 1.9, the -i flag changed to intensities instead of input
        version_cl = [cmd, "-v"]
        p = subprocess.Popen(version_cl, stdout=subprocess.PIPE)
        (out, _) = p.communicate()
        olb_version = float(out.strip().split()[-1].rsplit(".", 1)[0])
        if olb_version > 1.8:
            cl += ["-P", ".clocs"]
            cl += ["-b", bc_dir]
        else:
            cl += ["-i", bc_dir, "-p", os.path.split(bc_dir)[0]]
        subprocess.check_call(cl)
        with utils.chdir(bc_dir):
            processors = config["algorithm"].get("num_cores", 8)
            cl = config["program"].get("olb_make", "make").split() + ["-j", str(processors)]
            subprocess.check_call(cl)
def _download_prepped_genome(genome_build, data, name, need_remap):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand.
    """
    out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                              "inputs", "data", "genomes"))
    ref_dir = os.path.join(out_dir, genome_build, REMAP_NAMES.get(name, name))
    if not os.path.exists(ref_dir):
        target = REMAP_NAMES.get(name, name)
        if target in INPLACE_INDEX:
            ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
            INPLACE_INDEX[target](ref_file, ref_dir, data)
        else:
            with utils.chdir(out_dir):
                bucket = S3_INFO["bucket"]
                key = S3_INFO["key"].format(build=genome_build, target=REMAP_NAMES.get(name, name))
                cmd = ("gof3r get --no-md5 -k {key} -b {bucket} | pigz -d -c | tar -xvp")
                do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, name))
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
def copy_flowcell(dname, fastq_dir, sample_cfile, config):
    """Copy required files for processing using rsync, potentially to a remote server.
    """
    with utils.chdir(dname):
        reports = reduce(operator.add,
                         [glob.glob("*.xml"),
                          glob.glob("Data/Intensities/BaseCalls/*.xml"),
                          glob.glob("Data/Intensities/BaseCalls/*.xsl"),
                          glob.glob("Data/Intensities/BaseCalls/*.htm"),
                          ["Data/Intensities/BaseCalls/Plots", "Data/reports",
                           "Data/Status.htm", "Data/Status_Files", "InterOp"]])
        run_info = reduce(operator.add,
                          [glob.glob("run_info.yaml"),
                           glob.glob("*.csv")])
        fastq = glob.glob(os.path.join(fastq_dir.replace(dname + "/", "", 1), "*.gz"))
        configs = [sample_cfile.replace(dname + "/", "", 1)]
    include_file = os.path.join(dname, "transfer_files.txt")
    with open(include_file, "w") as out_handle:
        out_handle.write("+ */\n")
        for fname in configs + fastq + run_info + reports:
            out_handle.write("+ %s\n" % fname)
        out_handle.write("- *\n")
    # remote transfer
    if utils.get_in(config, ("process", "host")):
        dest = "%s@%s:%s" % (utils.get_in(config, ("process", "username")),
                             utils.get_in(config, ("process", "host")),
                             utils.get_in(config, ("process", "dir")))
    # local transfer
    else:
        dest = utils.get_in(config, ("process", "dir"))
    cmd = ["rsync", "-akmrtv", "--include-from=%s" % include_file, dname, dest]
    logger.info("Copying files to analysis machine")
    logger.info(" ".join(cmd))
    subprocess.check_call(cmd)
def _generate_fastq(fc_dir, config, compress_fastq):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    postprocess_dir = config.get("postprocess_dir", "")
    if postprocess_dir:
        fastq_dir = os.path.join(postprocess_dir, os.path.basename(fc_dir), "fastq")
    if not fastq_dir == fc_dir:  # and not os.path.exists(fastq_dir):
        with utils.chdir(basecall_dir):
            lanes = sorted(list(set([f.split("_")[1] for f in glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name, ",".join(lanes)]
            if postprocess_dir:
                cl += ["-o", fastq_dir]
            if compress_fastq:
                cl += ["--gzip"]
            logger2.debug("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)
    return fastq_dir
def generate_align_summary(bam_file, is_paired, sam_ref, sample_name, config, dirs):
    """Run alignment summarizing script to produce a pdf with align details.
    """
    with utils.chdir(dirs["work"]):
        with utils.curdir_tmpdir() as tmp_dir:
            graphs, summary, overrep = _graphs_and_summary(bam_file, sam_ref, is_paired,
                                                           tmp_dir, config)
        return _generate_pdf(graphs, summary, overrep, bam_file, sample_name,
                             dirs, config)
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale; need to explore --batchSize to reduce memory
    usage if needed.

    Does not support transactional directories yet, since GenomicsDB databases
    cannot be moved to new locations. We try to identify half-finished
    databases and restart:
    https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Known issue -- GenomicsDB workspace path core dumps on longer paths:
    (std::string::compare(char const*))
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir) or _incomplete_genomicsdb(out_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        with utils.chdir(os.path.dirname(out_file)):
            with file_transaction(data, out_dir) as tx_out_dir:
                broad_runner = broad.runner_from_config(data["config"])
                cores = dd.get_cores(data)
                params = ["-T", "GenomicsDBImport",
                          "--reader-threads", str(cores),
                          "--genomicsdb-workspace-path", os.path.relpath(out_dir, os.getcwd()),
                          "-L", bamprep.region_to_gatk(region)]
                for vrn_file in vrn_files:
                    vcfutils.bgzip_and_index(vrn_file, data["config"])
                    params += ["--variant", vrn_file]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
                broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
def _mint_trna_annotation(data):
    """Use MINTmap to quantify tRNAs.
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
def run(bam_file, data, out_dir):
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    return hsmetric_file
def _generate_qseq(bc_dir, config):
    """Generate qseq files from Illumina bcl files if not present.

    More recent Illumina updates do not produce qseq files. These can be
    generated from bcl, intensity and filter files with tools from the
    offline base caller OLB.
    """
    if not os.path.exists(os.path.join(bc_dir, "finished.txt")):
        log.info("Generating qseq files at %s" % bc_dir)
        bcl2qseq_log = os.path.join(config["log_dir"], "setupBclToQseq.log")
        cmd = os.path.join(config["program"]["olb"], "bin", "setupBclToQseq.py")
        cl = [cmd, "-L", bcl2qseq_log, "-o", bc_dir, "--in-place", "--overwrite"]
        # in OLB version 1.9, the -i flag changed to intensities instead of input
        version_cl = [cmd, "-v"]
        p = subprocess.Popen(version_cl, stdout=subprocess.PIPE)
        (out, _) = p.communicate()
        olb_version = float(out.strip().split()[-1].rsplit(".", 1)[0])
        if olb_version > 1.8:
            cl += ["-b", bc_dir]
        else:
            cl += ["-i", bc_dir, "-p", os.path.split(bc_dir)[0]]
        subprocess.check_call(cl)
        log.info("Qseq files generated.")
        with utils.chdir(bc_dir):
            try:
                processors = config["algorithm"]["num_cores"]
            except KeyError:
                processors = 8
            cl = config["program"].get("olb_make", "make").split() + ["-j", str(processors)]
            subprocess.check_call(cl)
def _run_fastqc(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with
    large files, unless we're running a Standard/QC pipeline.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7)
                  if data.get("analysis", "").lower() not in ["standard"] else None)
        bam_file = ds_bam if ds_bam else bam_file
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-t", str(num_cores), "-o", tx_tmp_dir, "-f", "bam", bam_file]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                fastqc_outdir = os.path.join(tx_tmp_dir,
                                             "%s_fastqc" % os.path.splitext(os.path.basename(bam_file))[0])
                if os.path.exists("%s.zip" % fastqc_outdir):
                    os.remove("%s.zip" % fastqc_outdir)
                if not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(fastqc_outdir, fastqc_out)
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    parser = FastQCParser(fastqc_out)
    stats = parser.get_fastqc_summary()
    return stats
def coverage(data):
    """Calculate coverage at different completeness cutoffs
    for the regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage"))
    if not bed_file:
        return data
    cleaned_bed = os.path.join(work_dir, os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed")
    cleaned_bed = bed.decomment(bed_file, cleaned_bed)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000", "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        _calculate_percentiles(os.path.abspath(parse_file), sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
def _run_biodata_upload(args):
    """Manage preparation of biodata on a local machine, uploading to S3 in pieces.
    """
    args = defaults.update_check_args(args, "biodata not uploaded")
    args = install.docker_image_arg(args)
    for gbuild in args.genomes:
        print("Preparing %s" % gbuild)
        if args.prepped:
            for target in ["samtools"] + args.aligners:
                genome.download_prepped_genome(gbuild, {}, target, False, args.prepped)
            print("Downloaded prepped %s to %s. Edit and re-run without --prepped to upload"
                  % (gbuild, args.prepped))
            return
        cl = ["upgrade", "--genomes", gbuild]
        for a in args.aligners:
            cl += ["--aligners", a]
        dmounts = mounts.prepare_system(args.datadir, DOCKER["biodata_dir"])
        manage.run_bcbio_cmd(args.image, dmounts, cl)
        print("Uploading %s" % gbuild)
        gdir = _get_basedir(args.datadir, gbuild)
        basedir, genomedir = os.path.split(gdir)
        assert genomedir == gbuild
        with utils.chdir(basedir):
            all_dirs = sorted(os.listdir(gbuild))
            _upload_biodata(gbuild, "seq", all_dirs)
            for aligner in args.aligners:
                _upload_biodata(gbuild, genome.REMAP_NAMES.get(aligner, aligner), all_dirs)
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5,
              "min": dd.get_coverage_depth_min(data), "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    depth_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")
    if not utils.file_uptodate(callable_file, bam_file):
        cmd = ["goleft", "depth", "--q", "1", "--mincov", str(params["min"]),
               "--processes", str(dd.get_num_cores(data)), "--ordered"]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, depth_file) as tx_depth_file:
            with utils.chdir(os.path.dirname(tx_depth_file)):
                tx_callable_file = tx_depth_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_depth_file.replace(".depth.bed", "")
                bam_ref_file = "%s-bamref.fa" % utils.splitext_plus(bam_file)[0]
                bam.fai_from_bam(dd.get_ref_file(data), bam_file, bam_ref_file + ".fai", data)
                cmd += ["--reference", bam_ref_file]
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                shutil.move(tx_callable_file, callable_file)
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return depth_file, final_callable, _extract_highdepth(final_callable, data), variant_regions_avg_cov
def run_has_samplesheet(fc_dir, config, require_single=True):
    """Checks if there's a suitable SampleSheet.csv present for the run.

    Returns the path to the samplesheet if one is found, None otherwise.
    """
    fc_name, _ = get_flowcell_info(fc_dir)
    sheet_dirs = config.get("samplesheet_directories", [])
    fcid_sheet = {}
    for ss_dir in (s for s in sheet_dirs if os.path.exists(s)):
        with utils.chdir(ss_dir):
            for ss in glob.glob("*.csv"):
                fc_ids = _get_flowcell_id(ss, require_single)
                for fcid in fc_ids:
                    if fcid:
                        fcid_sheet[fcid] = os.path.join(ss_dir, ss)
    # difflib handles human errors while entering data on the SampleSheet.
    # Only one best candidate is returned (if any). The 0.85 cutoff allows for
    # a maximum of 2 mismatches in the fcid.
    potential_fcids = difflib.get_close_matches(fc_name, fcid_sheet.keys(), 1, 0.85)
    if len(potential_fcids) > 0 and potential_fcids[0] in fcid_sheet:
        return fcid_sheet[potential_fcids[0]]
    else:
        return None
def coverage_region_detailed_stats(data, out_dir, extra_cutoffs=None):
    """Calculate coverage at different completeness cutoffs
    for the regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file or not utils.file_exists(bed_file):
        return []
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(data, parse_file) as out_tx:
                depth_thresholds = sorted(list(cutoffs | extra_cutoffs))
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=depth_thresholds)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        out_files = _calculate_percentiles(os.path.abspath(parse_file), sample,
                                           data=data, cutoffs=cutoffs)
    return [os.path.abspath(x) for x in out_files]
def run(name, chip_bam, input_bam, genome_build, out_dir, config):
    """Run macs2 for chip and input samples, avoiding errors due to samples.
    """
    # output file name needs to have the caller name
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        return out_file
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(config_utils.get_resources("macs2", config).get("options", ""))
    if genome_build not in HS and options.find("-g") == -1:
        raise ValueError("The %s genome doesn't have a pre-set value. "
                         "You can add specific values using the resources "
                         "option for macs2 in the YAML file (-g genome_size). "
                         "Check the ChIP-seq configuration in the "
                         "bcbio-nextgen documentation." % genome_build)
    genome_size = "" if options.find("-g") > -1 else "-g %s" % HS[genome_build]
    with utils.chdir(out_dir):
        cmd = _macs2_cmd()
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please check the message and report the "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "by setting resources as explained in the docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return out_file
def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.

    Uses only the first of the paired reads.
    """
    logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    stats = out = out_stats = None
    db = data['config']["algorithm"]["kraken"]
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genome", "kraken", "minikraken")
    else:
        if not os.path.exists(db):
            logger.info("kraken: no database found %s, skipping" % db)
            return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        files = data["files"]
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cl = (" ").join([config_utils.get_program("kraken", data["config"]),
                                 "--db", db, "--quick",
                                 "--preload", "--min-hits", "2",
                                 "--threads", str(num_cores),
                                 "--out", out, files[0], " 2>", out_stats])
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
def _start_processing(dname, sample_file, config):
    """Initiate processing: on a remote server or locally on a cluster.
    """
    to_remote = _remap_dirname(dname, os.path.join(utils.get_in(config, ("process", "dir")),
                                                   os.path.basename(dname)))
    args = {"work_dir": to_remote(os.path.join(dname, "analysis")),
            "run_config": to_remote(sample_file),
            "fc_dir": to_remote(dname)}
    # call a remote server
    if utils.get_in(config, ("process", "server")):
        print("%s/run?args=%s" % (utils.get_in(config, ("process", "server")), json.dumps(args)))
        requests.get(url="%s/run" % utils.get_in(config, ("process", "server")),
                     params={"args": json.dumps(args)})
    # submit to a cluster scheduler
    elif "submit_cmd" in config["process"] and "bcbio_batch" in config["process"]:
        with utils.chdir(utils.safe_makedir(args["work_dir"])):
            batch_script = "submit_bcbio.sh"
            with open(batch_script, "w") as out_handle:
                out_handle.write(config["process"]["bcbio_batch"].format(fcdir=args["fc_dir"],
                                                                         run_config=args["run_config"]))
            submit_cmd = utils.get_in(config, ("process", "submit_cmd"))
            subprocess.check_call(submit_cmd.format(batch_script=batch_script), shell=True)
    else:
        raise ValueError("Unexpected processing approach: %s" % config["process"])
def _upgrade_snpeff_data(galaxy_dir, args, remotes):
    """Install or upgrade snpEff databases, localized to the reference directory.
    """
    for dbkey, ref_file in genome.get_builds(galaxy_dir):
        resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
        with open(resource_file) as in_handle:
            resources = yaml.load(in_handle)
        snpeff_db, snpeff_base_dir = effects.get_db(ref_file, resources)
        if snpeff_db:
            snpeff_db_dir = os.path.join(snpeff_base_dir, snpeff_db)
            if not os.path.exists(snpeff_db_dir):
                print("Installing snpEff database %s in %s" % (snpeff_db, snpeff_base_dir))
                tooldir = args.tooldir or get_defaults()["tooldir"]
                config = {"resources": {"snpeff": {"jvm_opts": ["-Xms500m", "-Xmx1g"],
                                                   "dir": os.path.join(tooldir, "share", "java", "snpeff")}}}
                raw_version = programs.java_versioner("snpeff", "snpEff",
                                                      stdout_flag="snpEff version SnpEff")(config)
                snpeff_version = "".join([x for x in raw_version
                                          if x in set(string.digits + ".")]).replace(".", "_")
                dl_url = remotes["snpeff_dl_url"].format(snpeff_ver=snpeff_version, genome=snpeff_db)
                dl_file = os.path.basename(dl_url)
                with utils.chdir(snpeff_base_dir):
                    subprocess.check_call(["wget", "-c", "-O", dl_file, dl_url])
                    subprocess.check_call(["unzip", dl_file])
                    os.remove(dl_file)
                dl_dir = os.path.join(snpeff_base_dir, "data", snpeff_db)
                os.rename(dl_dir, snpeff_db_dir)
                os.rmdir(os.path.join(snpeff_base_dir, "data"))
def split_by_barcode(fastq1, fastq2, multiplex, base_name, dirs, config):
    """Split a fastq file into multiplex pieces using barcode details.
    """
    if not multiplex:
        return [("", "", fastq1, fastq2)]
    bc_dir = os.path.join(dirs["work"], "%s_barcode" % base_name)
    nomatch_file = "%s_unmatched_1_fastq.txt" % base_name
    metrics_file = "%s_bc.metrics" % base_name
    out_files = []
    for info in multiplex:
        fq_fname = lambda x: os.path.join(bc_dir, "%s_%s_%s_fastq.txt" % (base_name, info["barcode_id"], x))
        bc_file1 = fq_fname("1")
        bc_file2 = fq_fname("2") if fastq2 else None
        out_files.append((info["barcode_id"], info["name"], bc_file1, bc_file2))
    with utils.chdir(bc_dir):
        if not os.path.exists(nomatch_file) and not os.path.exists(metrics_file):
            tag_file = _make_tag_file(multiplex)
            cl = [config["program"]["barcode"], tag_file,
                  "%s_--b--_--r--_fastq.txt" % base_name,
                  fastq1]
            if fastq2:
                cl.append(fastq2)
            cl.append("--mismatch=%s" % config["algorithm"]["bc_mismatch"])
            cl.append("--metrics=%s" % metrics_file)
            if int(config["algorithm"]["bc_read"]) == 2:
                cl.append("--second")
            if int(config["algorithm"]["bc_position"]) == 5:
                cl.append("--five")
            if config["algorithm"].get("bc_allow_indels", True) is False:
                cl.append("--noindel")
            with utils.file_transaction(out_files + [nomatch_file, metrics_file]):
                subprocess.check_call(cl)
    out_files = [(b, n, f1, f2) for (b, n, f1, f2) in out_files if os.path.exists(f1)]
    return out_files
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    mirbase = op.abspath(op.dirname(dd.get_mirbase_ref(data[0][0])))
    species = dd.get_species(data[0][0])
    hairpin = op.join(mirbase, "hairpin.fa")
    mature = op.join(mirbase, "mature.fa")
    rfam_file = op.join(mirbase, "Rfam_for_miRDeep.fa")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} "
               "-f {rfam_file} -r simple -c -d -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(mature) and file_exists(rfam_file):
            do.run(cmd, "Running mirdeep2.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
def main(org_build, gtf_file, genome_fasta, genome_dir, cores, args):
    genome_dir = genome_dir if genome_dir else os.curdir
    build_dir = os.path.abspath(os.path.join(genome_dir, org_build))
    work_dir = os.path.join(build_dir, "tmpcbl")
    safe_makedir(work_dir)
    ens_version = supported_oldbuilds.get(org_build, ensembl_release)
    out_dir = os.path.join(build_dir,
                           "rnaseq-%s_%s" % (datetime.datetime.now().strftime("%Y-%m-%d"), ens_version))
    tophat_dir = os.path.join(out_dir, "tophat")
    gtf_file = os.path.abspath(gtf_file) if gtf_file else gtf_file
    if genome_fasta:
        genome_fasta = os.path.abspath(genome_fasta)
        work_fasta = os.path.join(work_dir, os.path.basename(genome_fasta))
        if not os.path.exists(work_fasta):
            shutil.copy(genome_fasta, work_fasta)
        genome_fasta = work_fasta
    with chdir(work_dir):
        if not genome_fasta:
            genome_fasta = get_genome_fasta(org_build)
        if not gtf_file:
            write_version(build=build_info[org_build])
            build = build_info[org_build]
            gtf_file = prepare_tx_gff(build, org_build)
        else:
            write_version(gtf_file=gtf_file)
            work_gtf = os.path.join(work_dir, "ref-transcripts.gtf")
            if not os.path.exists(work_gtf):
                shutil.copy(gtf_file, work_gtf)
            gtf_file = work_gtf
        gtf_file = clean_gtf(gtf_file, genome_fasta)
        db = _get_gtf_db(gtf_file)
        os.remove(gtf_file)
        gtf_file = db_to_gtf(db, gtf_file)
        gtf_to_refflat(gtf_file)
        gtf_to_bed(gtf_file)
        prepare_tx2gene(gtf_file)
        prepare_dexseq(gtf_file)
        mask_gff = prepare_mask_gtf(gtf_file)
        rrna_gtf = prepare_rrna_gtf(gtf_file)
        if file_exists(rrna_gtf):
            gtf_to_interval(rrna_gtf, genome_fasta)
        if args.tophat:
            prepare_tophat_index(gtf_file, org_build, genome_fasta)
        transcriptome_fasta = make_transcriptome_fasta(gtf_file, genome_fasta)
        if args.kallisto:
            prepare_kallisto_index(transcriptome_fasta, org_build)
        make_hisat2_splicesites(gtf_file)
        cleanup(work_dir, out_dir, org_build)
    rnaseq_dir = os.path.join(build_dir, "rnaseq")
    if os.path.exists(rnaseq_dir):
        if os.path.islink(rnaseq_dir):
            os.unlink(rnaseq_dir)
        else:
            shutil.rmtree(rnaseq_dir)
    os.symlink(out_dir, rnaseq_dir)
    tar_dirs = [os.path.relpath(out_dir)]
    tarball = create_tarball(tar_dirs, org_build)
def _files_to_copy(directory):
    """Retrieve files that should be remotely copied.
    """
    with utils.chdir(directory):
        image_redo_files = reduce(operator.add,
                                  [glob.glob("*.params"),
                                   glob.glob("Images/L*/C*"),
                                   ["RunInfo.xml", "runParameters.xml"]])
        qseqs = reduce(operator.add,
                       [glob.glob("Data/Intensities/*.xml"),
                        glob.glob("Data/Intensities/BaseCalls/*qseq.txt")])
        reports = reduce(operator.add,
                         [glob.glob("*.xml"),
                          glob.glob("Data/Intensities/BaseCalls/*.xml"),
                          glob.glob("Data/Intensities/BaseCalls/*.xsl"),
                          glob.glob("Data/Intensities/BaseCalls/*.htm"),
                          ["Data/Intensities/BaseCalls/Plots", "Data/reports",
                           "Data/Status.htm", "Data/Status_Files", "InterOp"]])
        run_info = reduce(operator.add,
                          [glob.glob("run_info.yaml"),
                           glob.glob("*.csv")])
        logs = reduce(operator.add, [["Logs", "Recipe", "Diag", "Data/RTALogs", "Data/Log.txt"]])
        fastq = ["Data/Intensities/BaseCalls/fastq"]
    return (sorted(image_redo_files + logs + reports + run_info + qseqs),
            sorted(reports + fastq + run_info))
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    hairpin, mature, species = "none", "none", "na"
    rfam_file = dd.get_mirdeep2_file(data[0][0])
    if file_exists(dd.get_mirbase_hairpin(data[0][0])):
        species = dd.get_species(data[0][0])
        hairpin = dd.get_mirbase_hairpin(data[0][0])
        mature = dd.get_mirbase_mature(data[0][0])
    logger.debug("Preparing for mirdeep2 analysis.")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} "
               "-f {rfam_file} -r simple -c -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(rfam_file):
            try:
                do.run(cmd, "Running mirdeep2.")
            except:
                logger.warning("mirdeep2 failed. Please report the error to "
                               "https://github.com/lpantano/mirdeep2_core/issues.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
def _upgrade_snpeff_data(galaxy_dir, args, remotes):
    """Install or upgrade snpEff databases, localized to the reference directory.
    """
    snpeff_version = effects.snpeff_version(args)
    if not snpeff_version:
        return
    for dbkey, ref_file in genome.get_builds(galaxy_dir):
        resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
        if os.path.exists(resource_file):
            with open(resource_file) as in_handle:
                resources = yaml.load(in_handle)
            snpeff_db, snpeff_base_dir = effects.get_db({"genome_resources": resources,
                                                         "reference": {"fasta": {"base": ref_file}}})
            if snpeff_db:
                snpeff_db_dir = os.path.join(snpeff_base_dir, snpeff_db)
                if os.path.exists(snpeff_db_dir) and _is_old_database(snpeff_db_dir, args):
                    shutil.rmtree(snpeff_db_dir)
                if not os.path.exists(snpeff_db_dir):
                    print("Installing snpEff database %s in %s" % (snpeff_db, snpeff_base_dir))
                    dl_url = remotes["snpeff_dl_url"].format(snpeff_ver=snpeff_version.replace(".", "_"),
                                                             genome=snpeff_db)
                    dl_file = os.path.basename(dl_url)
                    with utils.chdir(snpeff_base_dir):
                        subprocess.check_call(["wget", "-c", "-O", dl_file, dl_url])
                        subprocess.check_call(["unzip", dl_file])
                        os.remove(dl_file)
                    dl_dir = os.path.join(snpeff_base_dir, "data", snpeff_db)
                    shutil.move(dl_dir, snpeff_db_dir)
                    os.rmdir(os.path.join(snpeff_base_dir, "data"))
def unpack_tarballs(xs, data, use_subdir=True):
    """Unpack workflow tarballs into ready to use directories.
    """
    if isinstance(xs, dict):
        for k, v in xs.items():
            xs[k] = unpack_tarballs(v, data, use_subdir)
    elif isinstance(xs, (list, tuple)):
        xs = [unpack_tarballs(x, data, use_subdir) for x in xs]
    elif isinstance(xs, six.string_types):
        if os.path.isfile(xs.encode("utf-8", "ignore")) and xs.endswith("-wf.tar.gz"):
            if use_subdir:
                tarball_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "wf-inputs"))
            else:
                tarball_dir = dd.get_work_dir(data)
            out_dir = os.path.join(tarball_dir,
                                   os.path.basename(xs).replace("-wf.tar.gz", "").replace("--", os.path.sep))
            if not os.path.exists(out_dir):
                with utils.chdir(tarball_dir):
                    with tarfile.open(xs, "r:gz") as tar:
                        tar.extractall()
            assert os.path.exists(out_dir), out_dir
            # Default to representing the output directory
            xs = out_dir
            # Look for aligner indices
            for fname in os.listdir(out_dir):
                if fname.endswith(DIR_TARGETS):
                    xs = os.path.join(out_dir, fname)
                    break
                elif fname.endswith(BASENAME_TARGETS):
                    base = os.path.join(out_dir, utils.splitext_plus(os.path.basename(fname))[0])
                    xs = glob.glob("%s*" % base)
                    break
    return xs
def analyze_locally(dname, post_config_file, fastq_dir):
    """Run analysis directly on the local machine.
    """
    assert fastq_dir is not None
    post_config = load_config(post_config_file)
    analysis_dir = os.path.join(fastq_dir, os.pardir, "analysis")
    utils.safe_makedir(analysis_dir)
    with utils.chdir(analysis_dir):
        if post_config["algorithm"]["num_cores"] == "messaging":
            prog = post_config["analysis"]["distributed_process_program"]
        else:
            prog = post_config["analysis"]["process_program"]
        cl = [prog, post_config_file, dname]
        run_yaml = os.path.join(dname, "run_info.yaml")
        if os.path.exists(run_yaml):
            cl.append(run_yaml)
        subprocess.check_call(cl)
def main(cores=1):
    start_dir = os.getcwd()
    work_dir = utils.safe_makedir("/scratch/square")
    priorities = set(["1", "2"])
    list_file = get_input_list(start_dir, priorities)
    ensure_bam_index(list_file)
    # Ensure input CRAMs are indexed; gets IO bound quickly so limit cores
    cram_cores = min(int(cores), 6)
    for cindex in joblib.Parallel(cram_cores)(joblib.delayed(index_cram)(x)
                                              for x in find_crams(list_file)):
        print(cindex)
    with utils.chdir(work_dir):
        out_file = run_squaring(list_file, name, ref_file, cores)
        for ext in ["", ".tbi"]:
            new_file = os.path.join(start_dir, os.path.basename(out_file) + ext)
            if not utils.file_exists(new_file):
                shutil.copy(out_file + ext, new_file)
def main(config_file, queues=None, task_module=None, base_dir=None):
    if base_dir is None:
        base_dir = os.getcwd()
    if task_module is None:
        task_module = "bcbio.distributed.tasks"
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(base_dir, "log")
    signals.setup_logging.connect(celery_logger(config))
    setup_logging(config)
    logger.info("Starting distributed worker process: {0}".format(queues if queues else ""))
    with utils.chdir(base_dir):
        with utils.curdir_tmpdir() as work_dir:
            dirs = {"work": work_dir, "config": os.path.dirname(config_file)}
            with create_celeryconfig(task_module, dirs, config, os.path.abspath(config_file)):
                run_celeryd(work_dir, queues)
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """Run macs2 for chip and input samples, avoiding errors due to samples.
    """
    # output file name needs to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compress_and_sort_bdg_files(out_dir, data)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    antibody = antibodies.ANTIBODIES.get(dd.get_antibody(data).lower(), None)
    if antibody:
        logger.info(f"{antibody.name} specified, using {antibody.peaktype} peak settings.")
        peaksettings = select_peak_parameters(antibody)
    elif method == "atac":
        logger.info("ATAC-seq specified, using narrow peak settings.")
        peaksettings = " "
    else:
        peaksettings = " "
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(data)
        cmd += peaksettings
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error. "
                                 "Please check the message and report the "
                                 "error if it is related to bcbio. "
                                 "You can add specific options for the sample "
                                 "by setting resources as explained in the docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    _compress_and_sort_bdg_files(out_dir, data)
    return _get_output_files(out_dir)
def _run_cwltool(args):
    """Run with cwltool -- reference implementation.
    """
    main_file, json_file, project_name = _get_main_and_json(args.directory)
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "cwltool_work"))
    tmp_dir = utils.safe_makedir(os.path.join(work_dir, "tmpcwl"))
    os.environ["TMPDIR"] = tmp_dir
    flags = ["--tmpdir-prefix", tmp_dir, "--tmp-outdir-prefix", tmp_dir]
    if args.no_container:
        _remove_bcbiovm_path()
        flags += ["--no-container",
                  "--preserve-environment", "PATH",
                  "--preserve-environment", "HOME"]
    cmd = ["cwltool"] + flags + args.toolargs + ["--", main_file, json_file]
    with utils.chdir(work_dir):
        _run_tool(cmd, not args.no_container, work_dir)
def _generate_fastq(fc_dir, config):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    if not fastq_dir == fc_dir and not os.path.exists(fastq_dir):
        log.info("Generating fastq files for %s" % fc_dir)
        with utils.chdir(basecall_dir):
            lanes = sorted(list(set([f.split("_")[1] for f in glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name, ",".join(lanes)]
            log.info("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)
            log.info("Qseq to fastq conversion completed.")
    return fastq_dir
def _directory_tarball(dirname):
    """Create a tarball of a complex directory, avoiding complex secondaryFiles.

    Complex secondary files do not work on multiple platforms and are not
    portable to WDL, so for now we create a tarball that workers will unpack.
    """
    assert os.path.isdir(dirname)
    base_dir, tarball_dir = os.path.split(dirname)
    while base_dir and not os.path.exists(os.path.join(base_dir, "seq")):
        base_dir, extra_tarball = os.path.split(base_dir)
        tarball_dir = os.path.join(extra_tarball, tarball_dir)
    tarball = os.path.join(base_dir, "%s-wf.tar.gz" % (tarball_dir.replace(os.path.sep, "--")))
    if not utils.file_exists(tarball):
        with utils.chdir(base_dir):
            with tarfile.open(tarball, "w:gz") as tar:
                tar.add(tarball_dir)
    return tarball
def priority_coverage(data):
    AVERAGE_REGION_STRING_LENGTH = 100
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    batch_size = max_command_length() / AVERAGE_REGION_STRING_LENGTH
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        data['priority_coverage'] = os.path.abspath(out_file)
        return data
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        logger.debug("Calculating priority coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        with file_transaction(out_file) as tx_out_file:
            lcount = 0
            for chunk in robust_partition_all(batch_size, region_bed):
                coord_batch = []
                line_batch = ""
                for line in chunk:
                    lcount += 1
                    chrom = line.chrom
                    start = max(line.start, 0)
                    end = line.end
                    coords = "%s:%s-%s" % (chrom, start, end)
                    coord_batch.append(coords)
                    line_batch += "%s\t%s\t%s\n" % (chrom, start, end)
                if not coord_batch:
                    continue
                region_file = pybedtools.BedTool(line_batch, from_string=True).saveas().fn
                coord_string = " ".join(coord_batch)
                awk_string = r"""'BEGIN {OFS="\t"} {print $1,$2+$5,$2+$5,$4,$6"\t%s"}'""" % sample
                samtools = config_utils.get_program("samtools", data["config"])
                bedtools = config_utils.get_program("bedtools", data["config"])
                cmd = ("{samtools} view -b {in_bam} {coord_string} | "
                       "{bedtools} coverage -sorted -d -a {region_file} -b - | "
                       "awk {awk_string} >> {tx_out_file}")
                _silence_run(cmd.format(**locals()))
        data['priority_coverage'] = os.path.abspath(out_file)
    return data
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    setpath.prepend_bcbiopath()
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'"
                             % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys = _world_from_cwl(args.name, fnargs[1:], work_dir)
        # Can remove this awkward Docker merge when we do not need custom GATK3 installs
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys = None, {}
    with utils.chdir(work_dir):
        with contextlib.closing(log.setup_local_logging(parallel={"wrapper": "runfn"})):
            try:
                out = fn(fnargs)
            except:
                logger.exception()
                raise
            if argfile:
                try:
                    _write_out_argfile(argfile, out, fnargs, parallel, out_keys, work_dir)
                except:
                    logger.exception()
                    raise
                if argfile.endswith(".json"):
                    _write_wdl_outputs(argfile, out_keys)
def install_srna(species, gtf):
    out_file = os.path.join(SRNASEQ_DIR, "srna-transcripts.gtf")
    safe_makedir(SRNASEQ_DIR)
    if not os.path.exists(out_file):
        shutil.copyfile(gtf, out_file)
    try:
        from seqcluster import install
    except ImportError:
        raise ImportError("install seqcluster first, please.")
    with chdir(SRNASEQ_DIR):
        hairpin, miRNA = install._install_mirbase()
        cmd = ("grep -A 2 {species} {hairpin} | grep -v '\-\-$' | tr U T > hairpin.fa")
        do.run(cmd.format(**locals()), "set precursor.")
        cmd = ("grep -A 1 {species} {miRNA} > miRNA.str")
        do.run(cmd.format(**locals()), "set miRNA.")
        shutil.rmtree("mirbase")
    return out_file
def rmarkdown_draft(filename, template, package):
    """Create a draft rmarkdown file from an installed template.
    """
    if file_exists(filename):
        return filename
    draft_template = Template(
        'rmarkdown::draft("$filename", template="$template", package="$package", edit=FALSE)')
    draft_string = draft_template.substitute(filename=filename, template=template, package=package)
    report_dir = os.path.dirname(filename)
    rcmd = Rscript_cmd()
    with chdir(report_dir):
        do.run([rcmd, "--vanilla", "-e", draft_string],
               "Creating bcbioRNASeq quality control template.")
        do.run(["sed", "-i", "s/YYYY-MM-DD\///g", filename],
               "Editing bcbioRNAseq quality control template.")
    return filename
def _convert_fastq(srafn, outdir, single=False):
    """Convert an SRA archive to fastq files.
    """
    cmd = "fastq-dump --split-files --gzip {srafn}"
    sraid = os.path.basename(utils.splitext_plus(srafn)[0])
    if not single:
        out_file = [os.path.join(outdir, "%s_1.fastq.gz" % sraid),
                    os.path.join(outdir, "%s_2.fastq.gz" % sraid)]
        if not utils.file_exists(out_file[0]):
            with utils.chdir(outdir):
                do.run(cmd.format(**locals()), "Convert to fastq %s" % sraid)
        if not utils.file_exists(out_file[0]):
            raise IOError("SRA %s didn't convert, something happened." % srafn)
        return [out for out in out_file if utils.file_exists(out)]
    else:
        raise ValueError("Single-end SRA samples are not supported for now.")
def _bcbio_variation_ensemble(vrn_files, out_file, ref_file, config_file, base_dir, data):
    """Run a variant comparison using the bcbio.variation toolkit, given an input configuration.
    """
    vrn_files = [_handle_somatic_ensemble(v, data) for v in vrn_files]
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_jar = config_utils.get_jar("bcbio.variation",
                                  config_utils.get_program("bcbio_variation", data["config"], "dir"))
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = ["java"] + jvm_opts + java_args + ["-jar", bv_jar, "variant-ensemble", config_file,
                                             ref_file, out_file] + vrn_files
    with utils.chdir(base_dir):
        do.run(cmd, "Ensemble calling: %s" % os.path.basename(base_dir))
def _download_ref(url, ref_dir):
    # Lifted from Brad Chapman
    dl_file = os.path.basename(url)
    ref_file = None
    for supported_ext, extract_cmd in [(".gz", "gunzip"),
                                       (".tgz", ("tar", "zxvf"))]:
        if dl_file.endswith(supported_ext):
            ref_file = os.path.join(ref_dir, dl_file[:-len(supported_ext)])
            break
    assert ref_file is not None, url
    if not os.path.exists(ref_file):
        with utils.chdir(ref_dir):
            cl = ["wget", url]
            subprocess.check_call(cl)
            cl = list(flatten([extract_cmd, dl_file]))
            subprocess.check_call(cl)
    return ref_file
def _run_funnel(args):
    """Run funnel TES server with rabix bunny for CWL.
    """
    host = "localhost"
    port = "8088"
    main_file, json_file, project_name = _get_main_and_json(args.directory)
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "funnel_work"))
    log_file = os.path.join(work_dir, "%s-funnel.log" % project_name)
    # Create bunny configuration directory with TES backend
    orig_config_dir = os.path.join(os.path.dirname(os.path.realpath(utils.which("rabix"))), "config")
    work_config_dir = utils.safe_makedir(os.path.join(work_dir, "rabix_config"))
    for fname in os.listdir(orig_config_dir):
        if fname == "core.properties":
            with open(os.path.join(orig_config_dir, fname)) as in_handle:
                with open(os.path.join(work_config_dir, fname), "w") as out_handle:
                    for line in in_handle:
                        if line.startswith("backend.embedded.types"):
                            line = "backend.embedded.types=TES\n"
                        out_handle.write(line)
        else:
            shutil.copy(os.path.join(orig_config_dir, fname), os.path.join(work_config_dir, fname))
    flags = ["-c", work_config_dir,
             "-tes-url=http://%s:%s" % (host, port),
             "-tes-storage=%s" % work_dir]
    if args.no_container:
        _remove_bcbiovm_path()
        flags += ["--no-container"]
    cmd = ["rabix"] + flags + [main_file, json_file]
    funnelp = subprocess.Popen(["funnel", "server", "run",
                                "--Server.HostName", host,
                                "--Server.HTTPPort", port,
                                "--LocalStorage.AllowedDirs", work_dir,
                                "--Worker.WorkDir", os.path.join(work_dir, "funnel-work")])
    try:
        with utils.chdir(work_dir):
            _run_tool(cmd, not args.no_container, work_dir, log_file)
    finally:
        funnelp.kill()
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5,
              "min": dd.get_coverage_depth_min(data), "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions,
                                                   "variant_regions", file_prefix=prefix)
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = ["goleft", "depth", "--windowsize", str(params["window_size"]), "--q", "1",
               "--mincov", str(params["min"]), "--reference", ref_file,
               "--processes", str(dd.get_num_cores(data)), "--stats", "--ordered"]
        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_out_file:
                if not variant_regions:
                    variant_regions = "%s-genome.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(variant_regions, "w") as out_handle:
                        for c in shared.get_noalt_contigs(data):
                            out_handle.write("%s\t%s\t%s\n" % (c.name, 0, c.size))
                pybedtools.BedTool().window_maker(w=params["parallel_window_size"],
                                                  b=pybedtools.BedTool(variant_regions)).saveas(tx_out_file)
        cmd += ["--bed", window_file]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, out_file) as tx_out_file:
            with utils.chdir(os.path.dirname(tx_out_file)):
                tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_out_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                shutil.move(tx_callable_file, callable_file)
    return out_file, callable_file, _extract_highdepth(callable_file, data), variant_regions_avg_cov
def _run_bunny(args):
    """Run CWL with rabix bunny.
    """
    main_file, json_file, project_name = _get_main_and_json(args.directory)
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "bunny_work"))
    flags = ["-b", work_dir]
    log_file = os.path.join(work_dir, "%s-bunny.log" % project_name)
    if os.path.exists(work_dir):
        caches = [os.path.join(work_dir, d) for d in os.listdir(work_dir)
                  if os.path.isdir(os.path.join(work_dir, d))]
        if caches:
            flags += ["--cache-dir", max(caches, key=os.path.getmtime)]
    if args.no_container:
        _remove_bcbiovm_path()
        flags += ["--no-container"]
    cmd = ["rabix"] + flags + [main_file, json_file]
    with utils.chdir(work_dir):
        _run_tool(cmd, not args.no_container, work_dir, log_file)
def install_srna(species, gtf):
    out_file = os.path.join(SRNASEQ_DIR, "srna-transcripts.gtf")
    safe_makedir(SRNASEQ_DIR)
    if gtf:
        if not file_exists(out_file):
            shutil.copyfile(gtf, out_file)
    try:
        from seqcluster import install
    except ImportError:
        raise ImportError("install seqcluster first, please.")
    with chdir(SRNASEQ_DIR):
        hairpin, miRNA = install._install_mirbase()
        cmd = ("cat %s | awk '{if ($0~/>%s/){name=$0; print name} else if ($0~/^>/){name=0};if (name!=0 && $0!~/^>/){print $0;}}' | sed 's/U/T/g' > hairpin.fa")
        do.run(cmd % (hairpin, species), "set precursor.")
        cmd = ("grep -A 1 {species} {miRNA} > miRNA.str")
        do.run(cmd.format(**locals()), "set miRNA.")
        shutil.rmtree("mirbase")
    return out_file
def prep_vep_cache(dbkey, ref_file, tooldir=None, config=None):
    """Ensure correct installation of the VEP cache file.
    """
    if config is None:
        config = {}
    resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
    if os.path.exists(resource_file):
        with open(resource_file) as in_handle:
            resources = yaml.load(in_handle)
        ensembl_name = tz.get_in(["aliases", "ensembl"], resources)
        symlink_dir = _special_dbkey_maps(dbkey, ref_file)
        if ensembl_name and ensembl_name.find("_vep_") == -1:
            raise ValueError("%s is an incorrect ensembl value. "
                             "It should have _vep_ in the name. "
                             "Remove the line or fix the name to avoid this error." % ensembl_name)
        if symlink_dir and ensembl_name:
            species, vepv = ensembl_name.split("_vep_")
            return symlink_dir, species
        elif ensembl_name:
            species, vepv = ensembl_name.split("_vep_")
            vep_dir = utils.safe_makedir(os.path.normpath(os.path.join(
                os.path.dirname(os.path.dirname(ref_file)), "vep")))
            out_dir = os.path.join(vep_dir, species, vepv)
            if not os.path.exists(out_dir):
                tmp_dir = utils.safe_makedir(os.path.join(vep_dir, species, "txtmp"))
                eversion = vepv.split("_")[0]
                url = "ftp://ftp.ensembl.org/pub/release-%s/variation/VEP/%s.tar.gz" % (eversion, ensembl_name)
                with utils.chdir(tmp_dir):
                    subprocess.check_call(["wget", "--no-check-certificate", "-c", url])
                vep_path = "%s/bin/" % tooldir if tooldir else ""
                perl_exports = utils.get_perl_exports()
                cmd = ["%svep_install" % vep_path, "-a", "c", "-s", ensembl_name,
                       "-c", vep_dir, "-u", tmp_dir,
                       "--NO_UPDATE", "--VERSION", eversion]
                do.run("%s && %s" % (perl_exports, " ".join(cmd)),
                       "Prepare VEP directory for %s" % ensembl_name)
                cmd = ["%svep_convert_cache" % vep_path, "--species", species, "--version", vepv,
                       "--dir", vep_dir, "--force_overwrite", "--remove"]
                do.run("%s && %s" % (perl_exports, " ".join(cmd)),
                       "Convert VEP cache to tabix %s" % ensembl_name)
                for tmp_fname in os.listdir(tmp_dir):
                    os.remove(os.path.join(tmp_dir, tmp_fname))
                os.rmdir(tmp_dir)
            tmp_dir = os.path.join(vep_dir, "tmp")
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)
            return vep_dir, species
    return None, None
def run(_, data, out_dir):
    """Run kraken, generating a report in the specified directory and parsing metrics.
       Uses only the first read of paired data.
    """
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (dd.get_sample_name(data), ratio))
    logger.info("Running kraken to determine contaminant: %s" % dd.get_sample_name(data))
    # ratio = bam.get_aligned_reads(bam_file, data)
    out = out_stats = None
    db = tz.get_in(["config", "algorithm", "kraken"], data)
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        db = os.path.join(install._get_data_dir(), "genomes", "kraken", "minikraken")
    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(out_dir, "kraken_out")):
        work_dir = os.path.dirname(out_dir)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files_orig"][0] if dd.get_save_diskspace(data) else data["files"][0]
        if fn_file.endswith("bam"):
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--output {out} --fastq-input /dev/stdin 2> {out_stats}").format(**locals())
                do.run(cl, "kraken: %s" % dd.get_sample_name(data))
                if os.path.exists(out_dir):
                    shutil.rmtree(out_dir)
                shutil.move(tx_tmp_dir, out_dir)
    metrics = _parse_kraken_output(out_dir, db, data)
    return metrics
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                                      os.path.dirname(tx_out_file)))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            def _is_std_exclude(n):
                clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes]
                return any([n.startswith(x) or n.endswith(x) for x in clean_excludes])
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                 samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception("smoove lumpy calling failed")
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
def _run_kraken(data, ratio):
    """Run kraken, generating a report in the specified directory and parsing metrics.
       Uses only the first read of paired data.
    """
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    out = out_stats = None
    db = data['config']["algorithm"]["kraken"]
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genomes", "kraken", "minikraken")
    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files"][0]
        if fn_file.endswith("bam"):
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--out {out} --fastq-input /dev/stdin 2> {out_stats}").format(**locals())
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
def _upgrade_snpeff_data(galaxy_dir, args, remotes):
    """Install or upgrade snpEff databases, localized to reference directory.
    """
    snpeff_version = effects.snpeff_version(args)
    if not snpeff_version:
        return
    for dbkey, ref_file in genome.get_builds(galaxy_dir):
        resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
        if os.path.exists(resource_file):
            with open(resource_file) as in_handle:
                resources = yaml.safe_load(in_handle)
            snpeff_db, snpeff_base_dir = effects.get_db({"genome_resources": resources,
                                                         "reference": {"fasta": {"base": ref_file}}})
            if snpeff_db:
                snpeff_db_dir = os.path.join(snpeff_base_dir, snpeff_db)
                if os.path.exists(snpeff_db_dir) and _is_old_database(snpeff_db_dir, args):
                    shutil.rmtree(snpeff_db_dir)
                if not os.path.exists(snpeff_db_dir):
                    print("Installing snpEff database %s in %s" % (snpeff_db, snpeff_base_dir))
                    dl_url = remotes["snpeff_dl_url"].format(
                        snpeff_ver=snpeff_version.replace(".", "_"), genome=snpeff_db)
                    dl_file = os.path.basename(dl_url)
                    with utils.chdir(snpeff_base_dir):
                        subprocess.check_call(["wget", "--no-check-certificate", "-c", "-O", dl_file, dl_url])
                        subprocess.check_call(["unzip", dl_file])
                        os.remove(dl_file)
                    dl_dir = os.path.join(snpeff_base_dir, "data", snpeff_db)
                    shutil.move(dl_dir, snpeff_db_dir)
                    os.rmdir(os.path.join(snpeff_base_dir, "data"))
                if args.cwl:
                    create.directory_tarball(snpeff_db_dir)
def _run_analysis(fc_dir, remote_info, config, config_file):
    """Run local or distributed analysis, wait to finish.
    """
    run_yaml = _get_run_yaml(remote_info, fc_dir, config)
    analysis_dir = os.path.join(config["analysis"].get("base_dir", os.getcwd()),
                                os.path.basename(remote_info["directory"]))
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)
    with utils.chdir(analysis_dir):
        if config["algorithm"]["num_cores"] == "messaging":
            prog = config["analysis"].get("distributed_process_program",
                                          "distributed_nextgen_pipeline.py")
        else:
            prog = config["analysis"]["process_program"]
        cl = [prog, config_file, fc_dir]
        if run_yaml:
            cl.append(run_yaml)
        subprocess.check_call(cl)
    return analysis_dir
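# Note: almost every function in this section wraps its external command in
# utils.chdir(...) so that tools writing to the current working directory land in a
# controlled location. A generic version of that context manager takes only a few lines
# of standard-library Python; the sketch below illustrates the pattern and is not
# bcbio's own implementation.
import contextlib
import os

@contextlib.contextmanager
def chdir(new_dir):
    """Temporarily switch the working directory, restoring the original on exit."""
    cur_dir = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        os.chdir(cur_dir)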
def sort_by_ref(vcf_file, data):
    """Sort a VCF file by genome reference and position, adding contig information.
    """
    out_file = "%s-prep.vcf.gz" % utils.splitext_plus(vcf_file)[0]
    if not utils.file_uptodate(out_file, vcf_file):
        with file_transaction(data, out_file) as tx_out_file:
            header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
            with open(header_file, "w") as out_handle:
                for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                    out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
            cat_cmd = "zcat" if vcf_file.endswith("vcf.gz") else "cat"
            cmd = ("{cat_cmd} {vcf_file} | grep -v ^##contig | bcftools annotate -h {header_file} | "
                   "vt sort -m full -o {tx_out_file} -")
            with utils.chdir(os.path.dirname(tx_out_file)):
                do.run(cmd.format(**locals()), "Sort VCF by reference")
    return bgzip_and_index(out_file, data["config"])
def prepare_rsem_reference(gtf, multifasta, build):
    """
    gtf: path to GTF file (must have gene_id and transcript_id)
    multifasta: path to multifasta file
    build: name of organism build (e.g. hg19)
    """
    if not utils.which("rsem-prepare-reference"):
        logger.info("Skipping prepping RSEM reference because "
                    "rsem-prepare-reference could not be found.")
        return None
    command = PREPARE_REFERENCE.format(gtf=gtf, multifasta=multifasta, build=build)
    with transaction.tx_tmpdir(remove=False) as rsem_genome_dir:
        with utils.chdir(rsem_genome_dir):
            message = "Preparing rsem reference from %s" % gtf
            do.run(command, message)
    return rsem_genome_dir
def rnaseq_align_summary(bam_file, sam_ref, sample_name, config, dirs):
    """Summarize RNA-seq alignment QC, generating a PDF report and summary metrics.
    """
    qc_dir = utils.safe_makedir(os.path.join(dirs["work"], "qc"))
    genome_dir = os.path.dirname(os.path.dirname(sam_ref))
    refflat_file = config_utils.get_transcript_refflat(genome_dir)
    rrna_file = config_utils.get_rRNA_interval(genome_dir)
    if not utils.file_exists(rrna_file):
        rrna_file = "null"
    with utils.curdir_tmpdir() as tmp_dir:
        graphs, summary, overrep = \
            _rnaseq_graphs_and_summary(bam_file, sam_ref, refflat_file,
                                       rrna_file, qc_dir, tmp_dir, config)
    with utils.chdir(qc_dir):
        return {"pdf": _generate_pdf(graphs, summary, overrep, bam_file,
                                     sample_name, qc_dir, config),
                "metrics": summary}
def calculate(bam_file, data): """Calculate coverage in parallel using samtools depth through goleft. samtools depth removes duplicates and secondary reads from the counts: if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; """ params = { "window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data) } prefix = os.path.join( utils.safe_makedir( os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))), "%s-coverage" % (dd.get_sample_name(data))) depth_file = prefix + ".depth.bed" callable_file = prefix + ".callable.bed" variant_regions = dd.get_variant_regions_merged(data) if not utils.file_uptodate(callable_file, bam_file): cmd = [ "goleft", "depth", "--q", "1", "--mincov", str(params["min"]), "--processes", str(dd.get_num_cores(data)), "--ordered" ] with file_transaction(data, depth_file) as tx_depth_file: with utils.chdir(os.path.dirname(tx_depth_file)): tx_callable_file = tx_depth_file.replace( ".depth.bed", ".callable.bed") prefix = tx_depth_file.replace(".depth.bed", "") bam_ref_file = "%s-bamref.fa" % utils.splitext_plus( bam_file)[0] bam.fai_from_bam(dd.get_ref_file(data), bam_file, bam_ref_file + ".fai", data) cmd += ["--reference", bam_ref_file] cmd += ["--prefix", prefix, bam_file] bcbio_env = utils.get_bcbio_env() msg = "Calculate coverage: %s" % dd.get_sample_name(data) do.run(cmd, msg, env=bcbio_env) shutil.move(tx_callable_file, callable_file) final_callable = _subset_to_variant_regions(callable_file, variant_regions, data) return depth_file, final_callable
def kallisto_singlecell(fq1, kallisto_dir, gtf_file, fasta_file, data):
    """Quantify single-cell UMI data with kallisto pseudo.
    """
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    # unsure how to estimate from single end data, so go with a reasonable default
    frag_length = 250
    batch_file = umi.convert_to_kallisto(data)
    index = kallisto_index(gtf_file, fasta_file, data, kallisto_dir)
    cmd = ("{kallisto} pseudo --umi "
           "-t {num_cores} -o {tx_out_dir} -b {batch_file} -i {index}")
    with chdir(os.path.dirname(batch_file)):
        with file_transaction(data, quant_dir) as tx_out_dir:
            message = "Quantifying transcripts with Kallisto."
            do.run(cmd.format(**locals()), message, None)
    kallisto_table(kallisto_dir, index)
    return quant_dir
def main(org_build):
    """Prepare RNA-seq resources (GTF, refFlat, rRNA intervals, tophat index) for a build and upload to S3.
    """
    work_dir = os.path.join(os.getcwd(), org_build, "tmpcbl")
    out_dir = os.path.join(os.getcwd(), org_build,
                           "rnaseq-%s" % datetime.datetime.now().strftime("%Y-%m-%d"))
    tophat_dir = os.path.join(out_dir, "tophat")
    safe_makedir(work_dir)
    with chdir(work_dir):
        build = build_info[org_build]
        tx_gff = prepare_tx_gff(build, org_build)
        gtf_to_refflat(tx_gff)
        mask_gff = prepare_mask_gtf(tx_gff)
        rrna_gtf = prepare_rrna_gtf(tx_gff)
        gtf_to_interval(rrna_gtf, org_build)
        make_miso_events(tx_gff, org_build)
        prepare_tophat_index(tx_gff, org_build)
        cleanup(work_dir, out_dir)
    tar_dirs = [out_dir]
    upload_to_s3(tar_dirs, org_build)