def _bowtie_for_innerdist(start, fastq_file, pair_file, ref_file, out_base,
                          out_dir, data, remove_workdir=False):
    work_dir = os.path.join(out_dir, "innerdist_estimate")
    if os.path.exists(work_dir):
        shutil.rmtree(work_dir)
    safe_makedir(work_dir)
    extra_args = ["-s", str(start), "-u", "250000"]
    ref_file, bowtie_runner = _determine_aligner_and_reference(ref_file, data["config"])
    out_sam = bowtie_runner.align(fastq_file, pair_file, ref_file, {"lane": out_base},
                                  work_dir, data, extra_args)
    dists = []
    with closing(pysam.Samfile(out_sam)) as work_sam:
        for read in work_sam:
            if read.is_proper_pair and read.is_read1:
                dists.append(abs(read.isize) - 2 * read.rlen)
    if dists:
        median = float(numpy.median(dists))
        deviations = []
        for d in dists:
            deviations.append(abs(d - median))
        # this is the median absolute deviation estimator of the
        # standard deviation
        mad = 1.4826 * float(numpy.median(deviations))
        return int(median), int(mad)
    else:
        return None, None
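# Minimal, self-contained sketch of the estimator used above (illustrative only;
# `_mad_stddev` is not part of this module). For normally distributed data,
# 1.4826 * MAD approximates the standard deviation while staying robust to outliers.
def _mad_stddev(values):
    """Estimate the standard deviation from the median absolute deviation."""
    center = float(numpy.median(values))
    deviations = [abs(v - center) for v in values]
    return 1.4826 * float(numpy.median(deviations))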
def _run_fastqc(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with
    large files, unless we're running a Standard/QC pipeline.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7)
                  if data.get("analysis", "").lower() not in ["standard"] else None)
        bam_file = ds_bam if ds_bam else bam_file
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-t", str(num_cores), "-o", tx_tmp_dir, "-f", "bam", bam_file]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                fastqc_outdir = os.path.join(tx_tmp_dir, "%s_fastqc" %
                                             os.path.splitext(os.path.basename(bam_file))[0])
                if os.path.exists("%s.zip" % fastqc_outdir):
                    os.remove("%s.zip" % fastqc_outdir)
                if not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(fastqc_outdir, fastqc_out)
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    parser = FastQCParser(fastqc_out)
    stats = parser.get_fastqc_summary()
    return stats
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    out = []
    for item in run_details:
        # add algorithm details to configuration, avoid double specification
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            description = "%s-%s" % (item["name"], clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        item = add_reference_resources(item)
        # Create temporary directories and make absolute
        if utils.get_in(item, ("config", "resources", "tmp", "dir")):
            utils.safe_makedir(utils.get_in(item, ("config", "resources", "tmp", "dir")))
            item["config"]["resources"]["tmp"] = genome.abs_file_paths(
                utils.get_in(item, ("config", "resources", "tmp")))
        out.append(item)
    return out
def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Deduplicate and sort with samblaster, produces split read and discordant pair files.
    """
    samblaster = config_utils.get_program("samblaster", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=3)
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    for ext in ["spl", "disc", "full"]:
        utils.safe_makedir("%s-%s" % (tmp_prefix, ext))
    if data.get("align_split"):
        full_tobam_cmd = _nosort_tobam_cmd()
    else:
        full_tobam_cmd = ("samtools view -b -u - | "
                          "sambamba sort -t {cores} -m {mem} "
                          "--tmpdir {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    tobam_cmd = ("{samtools} sort -@ {cores} -m {mem} "
                 "-T {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    # samblaster 0.1.22 and better require the -M flag for compatibility with bwa-mem
    # https://github.com/GregoryFaust/samblaster/releases/tag/v.0.1.22
    if LooseVersion(programs.get_version_manifest("samblaster", data=data, required=True)) >= LooseVersion("0.1.22"):
        opts = "-M"
    else:
        opts = ""
    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, dext="spl", **locals())
    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, dext="disc", **locals())
    dedup_cmd = full_tobam_cmd.format(out_file=tx_out_file, dext="full", **locals())
    cmd = ("{samblaster} {opts} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
           "| {dedup_cmd}")
    return cmd.format(**locals())
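# For orientation, a sketch of the kind of shell command the template above expands to
# (paths and resource values are placeholders, not produced by this function; upstream
# aligner output is piped into samblaster on stdin):
#
#   samblaster -M --splitterFile >(samtools sort -@ 4 -m 2G -T tmp-spl -o sr.bam /dev/stdin) \
#              --discordantFile >(samtools sort -@ 4 -m 2G -T tmp-disc -o disc.bam /dev/stdin) \
#       | samtools view -b -u - \
#       | sambamba sort -t 4 -m 2G --tmpdir tmp-full -o dedup.bam /dev/stdin
#
# The >() process substitutions mean the returned command must run under bash rather than sh.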
def _run_kraken(data,ratio): """Run kraken, generating report in specified directory and parsing metrics. Using only first paired reads. """ logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]),ratio)) logger.info("Running kraken to determine contaminant: %s" % str(data["name"])) qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"])) kraken_out = os.path.join(qc_dir, "kraken") stats = out = out_stats = None db = data['config']["algorithm"]["kraken"] if db == "minikraken": db = os.path.join(_get_data_dir(),"genome","kraken","minikraken") else: if not os.path.exists(db): logger.info("kraken: no database found %s, skipping" % db) return {"kraken_report" : "null"} if not os.path.exists(os.path.join(kraken_out,"kraken_out")): work_dir = os.path.dirname(kraken_out) utils.safe_makedir(work_dir) num_cores = data["config"]["algorithm"].get("num_cores", 1) files = data["files"] with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir: with utils.chdir(tx_tmp_dir): out = os.path.join(tx_tmp_dir,"kraken_out") out_stats = os.path.join(tx_tmp_dir,"kraken_stats") cl = (" ").join([config_utils.get_program("kraken", data["config"]), "--db",db,"--quick", "--preload","--min-hits","2","--threads",str(num_cores), "--out", out, files[0]," 2>",out_stats]) do.run(cl,"kraken: %s" % data["name"][-1]) if os.path.exists(kraken_out): shutil.rmtree(kraken_out) shutil.move(tx_tmp_dir, kraken_out) metrics = _parse_kraken_output(kraken_out,db,data) return metrics
def _convert_bam_to_fastq(in_file, work_dir, data, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))
    qual_bin_method = config["algorithm"].get("quality_bin")
    if (qual_bin_method == "prealignment" or
         (isinstance(qual_bin_method, list) and "prealignment" in qual_bin_method)):
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        in_file = cram.illumina_qual_bin(in_file, data["sam_ref"], out_bindir, config)
    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(
                 os.path.splitext(os.path.basename(in_file))[0], x))
                 for x in ["1", "2"]]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
def convert_to_kallisto(data):
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "fastq")
    out_file = os.path.join(kallisto_dir, "barcodes.batch")
    umis = config_utils.get_program("umis", dd.get_config(data))
    if file_exists(out_file):
        return out_file
    if dd.get_minimum_barcode_depth(data):
        cb_histogram = os.path.join(work_dir, "umis", samplename, "cb-histogram.txt")
        cb_cutoff = dd.get_minimum_barcode_depth(data)
        cb_options = "--cb_histogram {cb_histogram} --cb_cutoff {cb_cutoff}"
        cb_options = cb_options.format(**locals())
    else:
        cb_options = ""
    cmd = ("{umis} kallisto {cb_options} --out_dir {tx_kallisto_dir} {fq1}")
    with file_transaction(data, kallisto_dir) as tx_kallisto_dir:
        safe_makedir(tx_kallisto_dir)
        message = ("Transforming %s to Kallisto singlecell format. " % fq1)
        do.run(cmd.format(**locals()), message)
    return out_file
def genebody_coverage2(in_file, config, out_prefix=None):
    """
    used to check the 5'/3' bias across transcripts, takes a bam file,
    converts it to bigwig and then uses that
    """
    PROGRAM = "geneBody_coverage2.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)
    in_bigwig = bam2bigwig(in_file, config)
    prefix = "coverage"
    out_dir = os.path.join(os.path.dirname(in_bigwig), os.pardir, "coverage")
    safe_makedir(out_dir)
    out_prefix = out_dir + "/wiggle"
    #out_prefix = _get_out_prefix(in_bigwig, config, out_prefix, prefix)
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    cmd = str(coverage_run.bake(i=in_bigwig, r=bed, o=out_prefix, t="pdf"))
    do.run(cmd, "Calculating coverage of %s." % (in_bigwig), None)
    return coverage_plot_file
def _get_out_dir(in_file, config, out_prefix, prefix):
    if not out_prefix:
        out_dir = os.path.join(_results_dir(config),
                               os.path.basename(in_file),
                               prefix)
        safe_makedir(out_dir)
        return out_dir
def run(in_file, ref, blastn_config, config):
    logger.info("Preparing the reference file for %s." % (ref.get("name")))
    ref_file = prepare_ref_file(ref, config)
    logger.info("Preparing the blast database for %s." % (ref.get("name")))
    blast_db = prepare_blast_db(ref_file, "nucl")
    logger.info("Blasting %s against %s." % (in_file, ref.get("name")))
    results_dir = build_results_dir(blastn_config, config)
    utils.safe_makedir(results_dir)
    out_file = os.path.join(results_dir,
                            replace_suffix(os.path.basename(in_file),
                                           ref.get("name") + "hits.tsv"))
    tmp_out = out_file + ".tmp"
    blast_results = blast_search(in_file, blast_db, tmp_out)
    #logger.info("Filtering results for at least %f percent of the "
    #            "sequences covered." % (0.5 * 100))
    #filtered_results = filter_results_by_length(blast_results, 0.5)
    #logger.info("Filtered output file here: %s" % (filtered_results))
    with open(blast_results) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        with open(out_file, "w") as out_handle:
            writer = csv.writer(out_handle, delimiter="\t")
            writer.writerow(HEADER_FIELDS.split(" "))
            for line in reader:
                writer.writerow(line)
    return out_file
def align(fastq_file, pair_file, ref_file, out_base, align_dir, config,
          rg_name=None):
    qual_format = config["algorithm"].get("quality_format", None)
    if qual_format is None or qual_format.lower() == "illumina":
        qual_flags = ["--solexa1.3-quals"]
    else:
        qual_flags = []
    cores = config.get("resources", {}).get("tophat", {}).get("cores", None)
    core_flags = ["-p", str(cores)] if cores else []
    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    out_file = os.path.join(out_dir, _out_fnames[0])
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            cl = [config["program"].get("tophat", "tophat")]
            cl += core_flags
            cl += qual_flags
            cl += ["-m", str(config["algorithm"].get("max_errors", 0)),
                   "--output-dir", tx_out_dir,
                   "--no-convert-bam"]
            if pair_file:
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                        ref_file, out_base,
                                                        tx_out_dir, config)
                cl += ["--mate-inner-dist", str(d),
                       "--mate-std-dev", str(d_stdev)]
                files.append(pair_file)
            cl += files
            child = subprocess.check_call(cl)
    out_file_final = os.path.join(out_dir, "%s.sam" % out_base)
    if not os.path.exists(out_file_final):
        os.symlink(out_file, out_file_final)
    return out_file_final
def rnaseq_vardict_variant_calling(data): sample = dd.get_sample_name(data) variation_dir = os.path.join(dd.get_work_dir(data), "variation") safe_makedir(variation_dir) out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz") if file_exists(out_file): data = dd.set_vrn_file(data, out_file) return data vardict_cmd = vardict.get_vardict_command(data) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" vcfstreamsort = config_utils.get_program("vcfstreamsort", data) compress_cmd = "| bgzip -c" freq = float(dd.get_min_allele_fraction(data, 20) / 100.0) var2vcf_opts = "-v 50" fix_ambig = vcfutils.fix_ambiguous_cl() remove_dup = vcfutils.remove_dup_cl() r_setup = ("unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(Rscript_cmd())) ref_file = dd.get_ref_file(data) bamfile = dd.get_work_bam(data) bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data)) opts = " -c 1 -S 2 -E 3 -g 4 " with file_transaction(out_file) as tx_out_file: jvm_opts = vardict._get_jvm_opts(data, tx_out_file) cmd = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} {bed_file} " "| {strandbias}" "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} " "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} " "> {tx_out_file}") message = "Calling RNA-seq variants with VarDict" do.run(cmd.format(**locals()), message) data = dd.set_vrn_file(data, out_file) return data
def _parse_novel(csv_file):
    """Create input of novel miRNAs from miRDeep2"""
    read = 0
    seen = set()
    safe_makedir("novel")
    with open("novel/hairpin.fa", "w") as fa_handle, open("novel/miRNA.str", "w") as str_handle:
        with open(csv_file) as in_handle:
            for line in in_handle:
                if line.startswith("mature miRBase miRNAs detected by miRDeep2"):
                    break
                if line.startswith("novel miRNAs predicted"):
                    read = 1
                    line = in_handle.next()
                    continue
                if read and line.strip():
                    cols = line.strip().split("\t")
                    name, start, score = cols[0], cols[16], cols[1]
                    m5p, m3p, pre = cols[13], cols[14], cols[15].replace('u', 't').upper()
                    m5p_start = cols[15].find(m5p) + 1
                    m3p_start = cols[15].find(m3p) + 1
                    m5p_end = m5p_start + len(m5p) - 1
                    m3p_end = m3p_start + len(m3p) - 1
                    if m5p in seen:
                        continue
                    print >>fa_handle, (">new-{name} {start}\n{pre}").format(**locals())
                    print >>str_handle, (">new-{name} ({score}) [new-{name}-5p:{m5p_start}-{m5p_end}] [new-{name}-3p:{m3p_start}-{m3p_end}]").format(**locals())
                    seen.add(m5p)
def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    safe_makedir(sailfish_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(sailfish_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    sailfish_idx = sailfish_index(gtf_file, ref_file, data, sailfish_dir)
    num_cores = dd.get_num_cores(data)
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, sailfish_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
def run(data): config = data[0][0]['config'] work_dir = dd.get_work_dir(data[0][0]) genome = dd.get_ref_file(data[0][0]) mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl") perl_exports = get_perl_exports() mirbase = op.abspath(op.dirname(dd.get_mirbase_ref(data[0][0]))) species = dd.get_species(data[0][0]) hairpin = op.join(mirbase, "hairpin.fa") mature = op.join(mirbase, "mature.fa") rfam_file = op.join(mirbase, "Rfam_for_miRDeep.fa") bam_file = op.join(work_dir, "align", "seqs.bam") seqs_dir = op.join(work_dir, "seqcluster", "prepare") collapsed = op.join(seqs_dir, "seqs.ma") out_dir = op.join(work_dir, "mirdeep2") out_file = op.join(out_dir, "result_res.csv") safe_makedir(out_dir) with chdir(out_dir): collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir) cmd = ("{perl_exports} && {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -d -P -t {species} -z res").format(**locals()) if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(mature) and file_exists(rfam_file): do.run(cmd.format(**locals()), "Running mirdeep2.") if file_exists(out_file): novel_db = _parse_novel(out_file, dd.get_species(data[0][0])) return novel_db
def variantcall_sample(data, region=None, out_file=None): """Parallel entry point for doing genotyping of a region of a sample. """ safe_makedir(os.path.dirname(out_file)) sam_ref = data["sam_ref"] config = data["config"] caller_fns = get_variantcallers() caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")] if isinstance(data["work_bam"], basestring): align_bams = [data["work_bam"]] items = [data] else: align_bams = data["work_bam"] items = data["work_items"] call_file = "%s-raw%s" % os.path.splitext(out_file) call_file = caller_fn(align_bams, items, sam_ref, data["genome_resources"]["variation"], region, call_file) if data["config"]["algorithm"].get("phasing", False) == "gatk": call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref, region, config) for ext in ["", ".idx"]: if not os.path.exists(out_file + ext): if os.path.exists(call_file + ext): try: os.symlink(call_file + ext, out_file + ext) except OSError, msg: if str(msg).find("File exists") == -1: raise
def rapmap_align(fq1, fq2, rapmap_dir, gtf_file, ref_file, algorithm, data):
    valid_algorithms = ["pseudo", "quasi"]
    assert algorithm in valid_algorithms, \
        "RapMap algorithm needs to be one of %s." % valid_algorithms
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    rapmap_index_loc = rapmap_index(gtf_file, ref_file, algorithm, data, rapmap_dir)
    num_cores = dd.get_num_cores(data)
    algorithm_subcommand = algorithm + "map"
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    cmd = "{rapmap} {algorithm_subcommand} -t {num_cores} -i {rapmap_index_loc} "
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += "-r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2} " if not is_gzipped(fq2) else "<(gzip -cd {fq2}) "
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += "-1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(out_file) as tx_out_file:
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        run_message = ("%smapping %s and %s to %s with RapMap. "
                       % (algorithm, fq1, fq2, rapmap_index_loc))
        do.run(cmd.format(**locals()), run_message, None)
    return out_file
def _install_kraken_db(datadir, args): """Install kraken minimal DB in genome folder. """ kraken = os.path.join(datadir, "genomes/kraken") url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz" compress = os.path.join(kraken, os.path.basename(url)) base, ext = utils.splitext_plus(os.path.basename(url)) db = os.path.join(kraken, base) tooldir = args.tooldir or get_defaults()["tooldir"] requests.packages.urllib3.disable_warnings() last_mod = urllib.urlopen(url).info().getheader("Last-Modified") last_mod = dateutil.parser.parse(last_mod).astimezone(dateutil.tz.tzutc()) if os.path.exists(os.path.join(tooldir, "bin", "kraken")): if not os.path.exists(db): is_new_version = True else: cur_file = glob.glob(os.path.join(kraken, "minikraken_*"))[0] cur_version = datetime.datetime.utcfromtimestamp(os.path.getmtime(cur_file)) is_new_version = last_mod.date() > cur_version.date() if is_new_version: shutil.move(cur_file, cur_file.replace("minikraken", "old")) if not os.path.exists(kraken): utils.safe_makedir(kraken) if is_new_version: if not os.path.exists(compress): subprocess.check_call(["wget", "-O", compress, url, "--no-check-certificate"]) cmd = ["tar", "-xzvf", compress, "-C", kraken] subprocess.check_call(cmd) last_version = glob.glob(os.path.join(kraken, "minikraken_*")) utils.symlink_plus(os.path.join(kraken, last_version[0]), os.path.join(kraken, "minikraken")) utils.remove_safe(compress) else: print "You have the latest version %s." % last_mod else: raise argparse.ArgumentTypeError("kraken not installed in tooldir %s." % os.path.join(tooldir, "bin", "kraken"))
def test_combine(self):
    to_combine = self.config["to_combine"]
    out_file = "results/%s/combined_counts.counts" % (STAGENAME)
    safe_makedir(os.path.dirname(out_file))
    df = htseq_count.combine_counts(to_combine, None, out_file=out_file)
    self.assertTrue(os.path.exists(out_file))
    self.assertTrue(os.path.getsize(out_file) > 0)
def gatk_rnaseq_calling(data): """Use GATK to perform gVCF variant calling on RNA-seq data """ from bcbio.bam import callable data = utils.deepish_copy(data) tools_on = dd.get_tools_on(data) if not tools_on: tools_on = [] tools_on.append("gvcf") data = dd.set_tools_on(data, tools_on) data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)]) out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variation", "rnaseq", "gatk-haplotype")) data = _setup_variant_regions(data, out_dir) out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data)) if not utils.file_exists(out_file): region_files = [] regions = [] for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data): str_region = "_".join([str(x) for x in cur_region]) region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variation", "rnaseq", "gatk-haplotype", "regions")), "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data), str_region)) region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {}, region=cur_region, out_file=region_file) region_files.append(region_file) regions.append(cur_region) out_file = vcfutils.concat_variant_files(region_files, out_file, regions, dd.get_ref_file(data), data["config"]) return dd.set_vrn_file(data, out_file)
def prepare_bowtie_index(genome_fasta, bowtie_dir):
    if os.path.exists(bowtie_dir + ".1.bt2"):
        return bowtie_dir
    safe_makedir(bowtie_dir)
    cmd = "bowtie2-build {genome_fasta} {bowtie_dir}"
    subprocess.check_call(cmd.format(**locals()), shell=True)
    return bowtie_dir
def sample_annotation(data): """ Annotate miRNAs using miRBase database with seqbuster tool """ names = data["rgnames"]['sample'] tools = dd.get_expression_caller(data) work_dir = os.path.join(dd.get_work_dir(data), "mirbase") out_dir = os.path.join(work_dir, names) utils.safe_makedir(out_dir) out_file = op.join(out_dir, names) if dd.get_mirbase_hairpin(data): mirbase = op.abspath(op.dirname(dd.get_mirbase_hairpin(data))) if utils.file_exists(data["collapse"]): data['transcriptome_bam'] = _align(data["collapse"], dd.get_mirbase_hairpin(data), out_file, data) data['seqbuster'] = _miraligner(data["collapse"], out_file, dd.get_species(data), mirbase, data['config']) else: logger.debug("Trimmed collapsed file is empty for %s." % names) else: logger.debug("No annotation file from miRBase.") sps = dd.get_species(data) if dd.get_species(data) else "None" logger.debug("Looking for mirdeep2 database for %s" % names) if file_exists(op.join(dd.get_work_dir(data), "mirdeep2", "novel", "hairpin.fa")): data['seqbuster_novel'] = _miraligner(data["collapse"], "%s_novel" % out_file, sps, op.join(dd.get_work_dir(data), "mirdeep2", "novel"), data['config']) if "trna" in tools: data['trna'] = _mint_trna_annotation(data) data = spikein.counts_spikein(data) return [[data]]
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, names["lane"])
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % names["lane"])
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)
    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outSAMunmapped Within")
    cmd += _read_group_option(names)
    fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = get_in(data, ("config", "algorithm", "strandedness"),
                          "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif"
    run_message = "Running STAR aligner on %s and %s." % (fastq_file, ref_file)
    do.run(cmd.format(**locals()), run_message, None)
    out_file = bam.sam_to_bam(out_file, config)
    out_file = _fix_sam_header(out_file, config)
    if not file_exists(final_out):
        symlink_plus(out_file, final_out)
    return final_out
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if trim_reads:
        adapter = dd.get_adapters(data)[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def run(bam_file, data, out_dir):
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file,
                               bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    return hsmetric_file
def report_summary(samples, run_parallel): """ Run coverage report with bcbiocov package """ work_dir = dd.get_work_dir(samples[0][0]) parent_dir = utils.safe_makedir(os.path.join(work_dir, "report")) qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma") with utils.chdir(parent_dir): logger.info("copy qsignature") if qsignature_fn: if utils.file_exists(qsignature_fn) and not utils.file_exists("qsignature.ma"): shutil.copy(qsignature_fn, "qsignature.ma") out_dir = utils.safe_makedir("fastqc") logger.info("summarize fastqc") with utils.chdir(out_dir): _merge_fastqc(samples) out_dir = utils.safe_makedir("coverage") out_dir = utils.safe_makedir("variants") samples = run_parallel("coverage_report", samples) try: import bcbreport.prepare as bcbreport bcbreport.report(parent_dir) except: logger.info("skipping report. No bcbreport installed.") pass logger.info("summarize metrics") samples = _merge_metrics(samples) return samples
def run(items): """Perform detection of structural variations with Manta. """ paired = vcfutils.get_paired(items) data = paired.tumor_data if paired else items[0] work_dir = _sv_workdir(data) variant_file = _get_out_file(work_dir, paired) if not utils.file_exists(variant_file): with file_transaction(data, work_dir) as tx_work_dir: utils.safe_makedir(tx_work_dir) tx_workflow_file = _prep_config(items, paired, tx_work_dir) _run_workflow(items, paired, tx_workflow_file, tx_work_dir) assert utils.file_exists(variant_file), "Manta finished without output file %s" % variant_file variant_file = shared.annotate_with_depth(variant_file, items) out = [] upload_counts = collections.defaultdict(int) for data in items: if "break-point-inspector" in dd.get_tools_on(data): if paired and paired.normal_bam and paired.tumor_name == dd.get_sample_name(data): variant_file = _run_break_point_inspector(data, variant_file, paired, work_dir) if "sv" not in data: data["sv"] = [] final_vcf = shared.finalize_sv(variant_file, data, items) vc = {"variantcaller": "manta", "do_upload": upload_counts[final_vcf] == 0, # only upload a single file per batch "vrn_file": final_vcf} evidence_bam = _get_evidence_bam(work_dir, data) if evidence_bam: vc["read_evidence"] = evidence_bam data["sv"].append(vc) upload_counts[final_vcf] += 1 out.append(data) return out
def summarize(calls, data, items): """Summarize results from multiple callers into a single flattened BED file. Approach: - Combine all calls found in all files - Filter files retaining those present with multiple levels of support. - Remove calls in high depth regions. - Remove calls with ends overlapping exclusion regions like low complexity regions. """ sample = tz.get_in(["rgnames", "sample"], data) work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural", sample, "ensemble")) with shared.bedtools_tmpdir(data): input_beds = filter(lambda xs: xs[1] is not None and utils.file_exists(xs[1]), [(c["variantcaller"], _create_bed(c, sample, work_dir, calls, data)) for c in calls]) if len(input_beds) > 0: out_file = combine_bed_by_size([xs[1] for xs in input_beds], sample, work_dir, data) if utils.file_exists(out_file): if len(input_beds) > N_FILTER_CALLERS: filter_file = _filter_ensemble(out_file, data) else: filter_file = out_file limit_file = shared.remove_highdepth_regions(filter_file, items) exclude_files = [f for f in [x.get("exclude_file") for x in calls] if f] exclude_file = exclude_files[0] if len(exclude_files) > 0 else None if exclude_file: noexclude_file, _ = sshared.exclude_by_ends(limit_file, exclude_file, data) else: noexclude_file = limit_file bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(noexclude_file), "bedprep")) if utils.file_exists(noexclude_file): calls.append({"variantcaller": "sv-ensemble", "input_beds": input_beds, "vrn_file": bedutils.clean_file(noexclude_file, data, bedprep_dir=bedprep_dir)}) return calls
def run(bam_file, data, out_dir):
    out = {}
    preseq_cmd = tz.get_in(["config", "algorithm", "preseq"], data)
    if not preseq_cmd:
        return out
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, "samtools")
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)
    stats_file = os.path.join(out_dir, "%s.txt" % dd.get_sample_name(data))
    if not utils.file_exists(stats_file):
        utils.safe_makedir(out_dir)
        preseq = config_utils.get_program("preseq", data["config"])
        params = _get_preseq_params(data, preseq_cmd, int(samtools_stats["Total_reads"]))
        param_line = "{options} -step {step} -seg_len {seg_len} "
        if preseq_cmd == "lc_extrap":
            param_line += "-extrap {extrap} "
        param_line = param_line.format(**params)
        with file_transaction(data, stats_file) as tx_out_file:
            cmd = "{preseq} {preseq_cmd} -bam -pe {bam_file} -o {tx_out_file} {param_line}".format(**locals())
            do.run(cmd.format(**locals()), "preseq " + preseq_cmd, data)
    out = _prep_real_counts(bam_file, data, samtools_stats)
    return {"base": stats_file, "metrics": out}
def variantcall_sample(data, region=None, align_bams=None, out_file=None): """Parallel entry point for doing genotyping of a region of a sample. """ if out_file is None or not os.path.exists(out_file) or not os.path.lexists(out_file): utils.safe_makedir(os.path.dirname(out_file)) sam_ref = data["sam_ref"] config = data["config"] caller_fns = get_variantcallers() caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")] if len(align_bams) == 1: items = [data] else: items = multi.get_orig_items(data) assert len(items) == len(align_bams) assoc_files = tz.get_in(("genome_resources", "variation"), data, {}) if not assoc_files: assoc_files = {} for bam_file in align_bams: bam.index(bam_file, data["config"], check_timestamp=False) do_phasing = data["config"]["algorithm"].get("phasing", False) call_file = "%s-raw%s" % utils.splitext_plus(out_file) if do_phasing else out_file call_file = caller_fn(align_bams, items, sam_ref, assoc_files, region, call_file) if do_phasing == "gatk": call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref, region, config) utils.symlink_plus(call_file, out_file) if region: data["region"] = region data["vrn_file"] = out_file return [data]
def compare_to_rm(data): """Compare final variant calls against reference materials of known calls. """ if isinstance(data, (list, tuple)): data = _normalize_cwl_inputs(data) toval_data = _get_validate(data) toval_data = cwlutils.unpack_tarballs(toval_data, toval_data) if toval_data: caller = _get_caller(toval_data) sample = dd.get_sample_name(toval_data) base_dir = utils.safe_makedir( os.path.join(toval_data["dirs"]["work"], "validate", sample, caller)) if isinstance(toval_data["vrn_file"], (list, tuple)): raise NotImplementedError( "Multiple input files for validation: %s" % toval_data["vrn_file"]) else: vrn_file = os.path.abspath(toval_data["vrn_file"]) rm_file = normalize_input_path( toval_data["config"]["algorithm"]["validate"], toval_data) rm_interval_file = _gunzip( normalize_input_path( toval_data["config"]["algorithm"].get("validate_regions"), toval_data), toval_data) rm_interval_file = bedutils.clean_file( rm_interval_file, toval_data, prefix="validateregions-", bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep"))) rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(toval_data), data.get("genome_build"), base_dir, data) rm_interval_file = (naming.handle_synonyms( rm_interval_file, dd.get_ref_file(toval_data), data.get("genome_build"), base_dir, data) if rm_interval_file else None) vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg") # RTG can fail on totally empty files. Call everything in truth set as false negatives if not vcfutils.vcf_has_variants(vrn_file): eval_files = _setup_call_false(rm_file, rm_interval_file, base_dir, toval_data, "fn") data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data) # empty validation file, every call is a false positive elif not vcfutils.vcf_has_variants(rm_file): eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir, toval_data, "fp") data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data) elif vmethod in ["rtg", "rtg-squash-ploidy"]: eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data, vmethod) eval_files = _annotate_validations(eval_files, toval_data) data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data) elif vmethod == "hap.py": data["validate"] = _run_happy_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data) elif vmethod == "bcbio.variation": data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir, sample, caller, toval_data) return [[data]]
def _sv_workdir(data):
    return utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                           dd.get_sample_name(data), "purecn"))
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data, names=None): """ run alignment using Tophat v2 """ config = data["config"] options = get_in(config, ("resources", "tophat", "options"), {}) options = _set_fusion_mode(options, config) options = _set_quality_flag(options, data) options = _set_transcriptome_option(options, data, ref_file) options = _set_cores(options, config) options = _set_rg_options(options, names) options = _set_stranded_flag(options, config) ref_file, runner = _determine_aligner_and_reference(ref_file, config) # fusion search does not work properly with Bowtie2 if options.get("fusion-search", False): ref_file = ref_file.replace("/bowtie2", "/bowtie") if _tophat_major_version(config) == 1: raise NotImplementedError( "Tophat versions < 2.0 are not supported, please " "download the newest version of Tophat here: " "http://tophat.cbcb.umd.edu") if _ref_version(ref_file) == 1 or options.get("fusion-search", False): options["bowtie1"] = True out_dir = os.path.join(align_dir, "%s_tophat" % out_base) final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"])) if file_exists(final_out): return final_out out_file = os.path.join(out_dir, "accepted_hits.bam") unmapped = os.path.join(out_dir, "unmapped.bam") files = [ref_file, fastq_file] if not file_exists(out_file): with file_transaction(config, out_dir) as tx_out_dir: safe_makedir(tx_out_dir) if pair_file and not options.get("mate-inner-dist", None): d, d_stdev = _estimate_paired_innerdist( fastq_file, pair_file, ref_file, out_base, tx_out_dir, data) options["mate-inner-dist"] = d options["mate-std-dev"] = d_stdev files.append(pair_file) options["output-dir"] = tx_out_dir options["no-coverage-search"] = True options["no-mixed"] = True cmd = [sys.executable, config_utils.get_program("tophat", config)] for k, v in options.items(): if v is True: cmd.append("--%s" % k) else: assert not isinstance(v, bool) cmd.append("--%s=%s" % (k, v)) # tophat requires options before arguments, otherwise it silently ignores them cmd += files do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file)) if pair_file and _has_alignments(out_file): fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.bam" % out_base), ref_file, config) else: fixed = out_file fixed_unmapped = _fix_unmapped(fixed, unmapped, data) fixed = merge_unmapped(fixed, fixed_unmapped, config) fixed = _add_rg(fixed, config, names) fixed = bam.sort(fixed, config) picard = broad.runner_from_path("picard", config) # set the contig order to match the reference file so GATK works fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"], os.path.splitext(fixed)[0] + ".picard.bam") fixed = fix_insert_size(fixed, config) if not file_exists(final_out): symlink_plus(fixed, final_out) return final_out
def setup(args): template, template_txt = name_to_config(args.template) run_info.validate_yaml(template_txt, args.template) base_item = template["details"][0] project_name, metadata, global_vars, md_file = _pname_and_metadata( args.metadata) remotes = _retrieve_remote([args.metadata, args.template]) inputs = args.input_files + remotes.get("inputs", []) + _find_remote_inputs(metadata) remote_retriever = None remote_config = None if hasattr(args, "systemconfig") and args.systemconfig and hasattr( args, "integrations"): config, _ = config_utils.load_system_config(args.systemconfig) for iname, retriever in args.integrations.items(): if iname in config: remote_retriever = retriever remote_config = remote_retriever.set_cache(config[iname]) inputs += remote_retriever.get_files(metadata, remote_config) raw_items = [ _add_metadata(item, metadata, remotes, args.only_metadata) for item in _prep_items_from_base( base_item, inputs, args.separators.split(","), args.force_single) ] items = [x for x in raw_items if x] _check_all_metadata_found(metadata, items) if remote_retriever and remote_config: items = remote_retriever.add_remotes(items, remote_config) out_dir = os.path.join(os.getcwd(), project_name) work_dir = utils.safe_makedir(os.path.join(out_dir, "work")) if hasattr(args, "relpaths") and args.relpaths: items = [_convert_to_relpaths(x, work_dir) for x in items] out_config_file = _write_template_config(template_txt, project_name, out_dir) if md_file: shutil.copyfile( md_file, os.path.join(out_dir, "config", os.path.basename(md_file))) items = _copy_to_configdir(items, out_dir) if len(items) == 0: print() print("Template configuration file created at: %s" % out_config_file) print( "Edit to finalize custom options, then prepare full sample config with:" ) print(" bcbio_nextgen.py -w template %s %s sample1.bam sample2.fq" % \ (out_config_file, project_name)) else: out_config_file = _write_config_file(items, global_vars, template, project_name, out_dir, remotes) print() print("Configuration file created at: %s" % out_config_file) print("Edit to finalize and run with:") print(" cd %s" % work_dir) print(" bcbio_nextgen.py ../config/%s" % os.path.basename(out_config_file)) if remotes.get("base"): remote_path = os.path.join(remotes["base"], os.path.basename(out_config_file)) s3.upload_file_boto(out_config_file, remote_path) print("Also uploaded to AWS S3 in %s" % remotes["base"]) print("Run directly with bcbio_vm.py run %s" % remote_path)
def run(bam_file, data, fastqc_out): """Run fastqc, generating report in specified directory and parsing metrics. Downsamples to 10 million reads to avoid excessive processing times with large files, unless we're running a Standard/smallRNA-seq/QC pipeline. Handles fastqc 0.11+, which use a single HTML file and older versions that use a directory of files + images. The goal is to eventually move to only 0.11+ """ sentry_file = os.path.join(fastqc_out, "fastqc_report.html") if not os.path.exists(sentry_file): work_dir = os.path.dirname(fastqc_out) utils.safe_makedir(work_dir) ds_file = (bam.downsample(bam_file, data, 1e7, work_dir=work_dir) if data.get("analysis", "").lower() not in ["standard", "smallrna-seq"] else None) if ds_file is not None: bam_file = ds_file frmt = "bam" if bam_file.endswith("bam") else "fastq" fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0] fastqc_clean_name = dd.get_sample_name(data) num_cores = data["config"]["algorithm"].get("num_cores", 1) with tx_tmpdir(data, work_dir) as tx_tmp_dir: with utils.chdir(tx_tmp_dir): cl = [ config_utils.get_program("fastqc", data["config"]), "-d", tx_tmp_dir, "-t", str(num_cores), "--extract", "-o", tx_tmp_dir, "-f", frmt, bam_file ] cl = "%s %s" % (utils.local_path_export(), " ".join( [str(x) for x in cl])) do.run(cl, "FastQC: %s" % dd.get_sample_name(data)) tx_fastqc_out = os.path.join(tx_tmp_dir, "%s_fastqc" % fastqc_name) tx_combo_file = os.path.join(tx_tmp_dir, "%s_fastqc.html" % fastqc_name) if not os.path.exists(sentry_file) and os.path.exists( tx_combo_file): utils.safe_makedir(fastqc_out) # Use sample name for reports instead of bam file name with open(os.path.join(tx_fastqc_out, "fastqc_data.txt"), 'r') as fastqc_bam_name, \ open(os.path.join(tx_fastqc_out, "_fastqc_data.txt"), 'w') as fastqc_sample_name: for line in fastqc_bam_name: fastqc_sample_name.write( line.replace(os.path.basename(bam_file), fastqc_clean_name)) shutil.move( os.path.join(tx_fastqc_out, "_fastqc_data.txt"), os.path.join(fastqc_out, 'fastqc_data.txt')) shutil.move(tx_combo_file, sentry_file) if os.path.exists("%s.zip" % tx_fastqc_out): shutil.move( "%s.zip" % tx_fastqc_out, os.path.join(fastqc_out, "%s.zip" % fastqc_clean_name)) elif not os.path.exists(sentry_file): raise ValueError( "FastQC failed to produce output HTML file: %s" % os.listdir(tx_tmp_dir)) logger.info("Produced HTML report %s" % sentry_file) parser = FastQCParser(fastqc_out, dd.get_sample_name(data)) stats = parser.get_fastqc_summary() parser.save_sections_into_file() return stats
def test_run(self):
    out_prefix = "results/tests/dss/test_dss"
    safe_makedir(os.path.dirname(out_prefix))
    result = dss.run(self.count_file, self.conds, ("untreat", "treat"),
                     out_prefix=out_prefix)
    self.assertTrue(file_exists(result))
def summarize_grading(samples, vkey="validate"): """Provide summaries of grading results across all samples. Handles both traditional pipelines (validation part of variants) and CWL pipelines (validation at top level) """ samples = list(utils.flatten(samples)) if not _has_grading_info(samples, vkey): return [[d] for d in samples] validate_dir = utils.safe_makedir( os.path.join(samples[0]["dirs"]["work"], vkey)) header = ["sample", "caller", "variant.type", "category", "value"] _summarize_combined(samples, vkey) validated, out = _group_validate_samples( samples, vkey, (["metadata", "validate_batch"], ["metadata", "batch" ], ["description"])) for vname, vitems in validated.items(): out_csv = os.path.join(validate_dir, "grading-summary-%s.csv" % vname) with open(out_csv, "w") as out_handle: writer = csv.writer(out_handle) writer.writerow(header) plot_data = [] plot_files = [] for data in sorted( vitems, key=lambda x: x.get("lane", dd.get_sample_name(x))): validations = [ variant.get(vkey) for variant in data.get("variants", []) if isinstance(variant, dict) ] validations = [v for v in validations if v] if len(validations) == 0 and vkey in data: validations = [data.get(vkey)] for validate in validations: if validate: validate["grading_summary"] = out_csv if validate.get("grading"): for row in _get_validate_plotdata_yaml( validate["grading"], data): writer.writerow(row) plot_data.append(row) elif validate.get("summary") and not validate.get( "summary") == "None": if isinstance(validate["summary"], (list, tuple)): plot_files.extend( list(set(validate["summary"]))) else: plot_files.append(validate["summary"]) if plot_files: plots = validateplot.classifyplot_from_plotfiles( plot_files, out_csv) elif plot_data: plots = validateplot.create(plot_data, header, 0, data["config"], os.path.splitext(out_csv)[0]) else: plots = [] for data in vitems: if data.get(vkey): data[vkey]["grading_plots"] = plots for variant in data.get("variants", []): if isinstance(variant, dict) and variant.get(vkey): variant[vkey]["grading_plots"] = plots out.append([data]) return out
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None, integrations=None): """Read run information from a passed YAML file. """ validate_yaml(run_info_yaml, run_info_yaml) with open(run_info_yaml) as in_handle: loaded = yaml.load(in_handle) fc_name, fc_date = None, None if dirs.get("flowcell"): try: fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell")) except ValueError: pass global_config = {} global_vars = {} resources = {} integration_config = {} if isinstance(loaded, dict): global_config = copy.deepcopy(loaded) del global_config["details"] if "fc_name" in loaded: fc_name = loaded["fc_name"].replace(" ", "_") if "fc_date" in loaded: fc_date = str(loaded["fc_date"]).replace(" ", "_") global_vars = global_config.pop("globals", {}) resources = global_config.pop("resources", {}) for iname in ["arvados"]: integration_config[iname] = global_config.pop(iname, {}) loaded = loaded["details"] if sample_names: loaded = [x for x in loaded if x["description"] in sample_names] if integrations: for iname, retriever in integrations.items(): if iname in config: config[iname] = retriever.set_cache(config[iname]) loaded = retriever.add_remotes(loaded, config[iname]) run_details = [] for i, item in enumerate(loaded): item = _normalize_files(item, dirs.get("flowcell")) if "lane" not in item: item["lane"] = str(i + 1) item["lane"] = _clean_characters(str(item["lane"])) if "description" not in item: if _item_is_bam(item): item["description"] = get_sample_name(item["files"][0]) else: raise ValueError( "No `description` sample name provided for input #%s" % (i + 1)) item["description"] = _clean_characters(str(item["description"])) if "upload" not in item: upload = global_config.get("upload", {}) # Handle specifying a local directory directly in upload if isinstance(upload, basestring): upload = {"dir": upload} if fc_name: upload["fc_name"] = fc_name if fc_date: upload["fc_date"] = fc_date upload["run_id"] = "" if upload.get("dir"): upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True) item["upload"] = upload item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars) item["algorithm"] = genome.abs_file_paths( item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS, fileonly_keys=ALGORITHM_FILEONLY_KEYS, do_download=all(not x for x in integrations.values())) item["genome_build"] = str(item.get("genome_build", "")) item["algorithm"] = _add_algorithm_defaults(item["algorithm"]) item["metadata"] = add_metadata_defaults(item.get("metadata", {})) item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date) if item.get("files"): item["files"] = [ genome.abs_file_paths( f, do_download=all(not x for x in integrations.values())) for f in item["files"] ] elif "files" in item: del item["files"] if item.get("vrn_file") and isinstance(item["vrn_file"], basestring): inputs_dir = utils.safe_makedir( os.path.join(dirs.get("work", os.getcwd()), "inputs", item["description"])) item["vrn_file"] = genome.abs_file_paths( item["vrn_file"], do_download=all(not x for x in integrations.values())) if os.path.isfile(item["vrn_file"]): item["vrn_file"] = vcfutils.bgzip_and_index(item["vrn_file"], config, remove_orig=False, out_dir=inputs_dir) if not tz.get_in(("metadata", "batch"), item): raise ValueError( "%s: Please specify a metadata batch for variant file (vrn_file) input.\n" % (item["description"]) + "Batching with a standard sample provides callable regions for validation." 
) item = _clean_metadata(item) item = _clean_algorithm(item) # Add any global resource specifications if "resources" not in item: item["resources"] = {} for prog, pkvs in resources.items(): if prog not in item["resources"]: item["resources"][prog] = {} if pkvs is not None: for key, val in pkvs.items(): item["resources"][prog][key] = val for iname, ivals in integration_config.items(): if ivals: if iname not in item: item[iname] = {} for k, v in ivals.items(): item[iname][k] = v run_details.append(item) _check_sample_config(run_details, run_info_yaml, config) return run_details
def work_dir(data):
    return utils.safe_makedir(os.path.join(dd.get_work_dir(data), "coverage",
                                           dd.get_sample_name(data), "sambamba"))
                        default=[], action="append")
    parser.add_argument("-q", "--queue", help="Queue to submit jobs to.")
    parser.add_argument("-p", "--tag", help="Tag name to label jobs on the cluster",
                        default="bcb-prep")
    parser.add_argument("-t", "--paralleltype",
                        choices=["local", "ipython"],
                        default="local", help="Run with ipython")
    args = parser.parse_args()
    out_dir = os.path.abspath(args.out)
    utils.safe_makedir(out_dir)
    try:
        system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    except ValueError as err:
        print(err)
        print("WARNING: Attempting to read bcbio_system.yaml in the current directory.")
        system_config = "bcbio_system.yaml"
    if utils.file_exists(system_config):
        with open(system_config) as in_handle:
            config = yaml.load(in_handle)
    else:
        print("WARNING: bcbio_system.yaml not found, creating own resources.")
def _run_qc_tools(bam_file, data): """Run a set of third party quality control tools, returning QC directory and metrics. :param bam_file: alignments in bam format :param data: dict with all configuration information :returns: dict with output of different tools """ from bcbio.qc import (atropos, contamination, coverage, damage, fastqc, kraken, qsignature, qualimap, samtools, picard, srna, umi, variant, viral, preseq, chipseq) tools = { "fastqc": fastqc.run, "atropos": atropos.run, "small-rna": srna.run, "samtools": samtools.run, "qualimap": qualimap.run, "qualimap_rnaseq": qualimap.run_rnaseq, "qsignature": qsignature.run, "contamination": contamination.run, "coverage": coverage.run, "damage": damage.run, "variants": variant.run, "peddy": peddy.run_qc, "kraken": kraken.run, "picard": picard.run, "umi": umi.run, "viral": viral.run, "preseq": preseq.run, "chipqc": chipseq.run } qc_dir = utils.safe_makedir( os.path.join(data["dirs"]["work"], "qc", data["description"])) metrics = {} qc_out = utils.deepish_copy(dd.get_summary_qc(data)) for program_name in dd.get_algorithm_qc(data): if not bam_file and program_name != "kraken": # kraken doesn't need bam continue if dd.get_phenotype(data) == "germline" and program_name != "variants": continue qc_fn = tools[program_name] cur_qc_dir = os.path.join(qc_dir, program_name) out = qc_fn(bam_file, data, cur_qc_dir) qc_files = None if out and isinstance(out, dict): # Check for metrics output, two cases: # 1. output with {"metrics"} and files ("base") if "metrics" in out: metrics.update(out.pop("metrics")) # 2. a dictionary of metrics elif "base" not in out: metrics.update(out) # Check for files only output if "base" in out: qc_files = out elif out and isinstance(out, basestring) and os.path.exists(out): qc_files = {"base": out, "secondary": []} if not qc_files: qc_files = _organize_qc_files(program_name, cur_qc_dir) if qc_files: qc_out[program_name] = qc_files metrics["Name"] = dd.get_sample_name(data) metrics["Quality format"] = dd.get_quality_format(data).lower() return {"qc": qc_out, "metrics": metrics}
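# Illustrative only: the loop above accepts a few return shapes from QC functions.
# A hypothetical tool following this contract could return a metrics dictionary,
# a files dictionary, or both combined, e.g.:
#
#     def example_qc(bam_file, data, out_dir):   # hypothetical, not a bcbio tool
#         stats_file = os.path.join(out_dir, "example.txt")
#         return {"metrics": {"Example_metric": 42},
#                 "base": stats_file, "secondary": []}
#
# A plain path string is also accepted and wrapped as {"base": path, "secondary": []}.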
def chipseq_count(data): """ count reads mapping to ChIP/ATAC consensus peaks with featureCounts """ method = dd.get_chip_method(data) if method == "chip": in_bam = dd.get_work_bam(data) elif method == "atac": if bam.is_paired(dd.get_work_bam(data)): in_bam = tz.get_in(("atac", "align", "NF"), data) else: in_bam = tz.get_in(("atac", "align", "full"), data) out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)) sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname", out_dir=safe_makedir(out_dir)) consensus_file = tz.get_in(("peaks_files", "consensus", "main"), data) if not consensus_file: return [[data]] saf_file = os.path.splitext(consensus_file)[0] + ".saf" work_dir = dd.get_work_dir(data) out_dir = os.path.join(work_dir, "consensus") safe_makedir(out_dir) count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts" summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary" if file_exists(count_file) and _is_fixed_count_file(count_file): if method == "atac": if bam.is_paired(dd.get_work_bam(data)): data = tz.assoc_in(data, ("peak_counts", "NF"), count_file) else: data = tz.assoc_in(data, ("peak_counts", "full"), count_file) elif method == "chip": data = tz.assoc_in(data, ("peak_counts"), count_file) return [[data]] featureCounts = config_utils.get_program("featureCounts", dd.get_config(data)) paired_flag = _paired_flag(in_bam) strand_flag = _strand_flag(data) cmd = ( "{featureCounts} -F SAF -a {saf_file} -o {tx_count_file} -s {strand_flag} " "{paired_flag} {sorted_bam}") message = ("Count reads in {sorted_bam} overlapping {saf_file} using " "featureCounts.") with file_transaction(data, [count_file, summary_file]) as tx_files: tx_count_file, tx_summary_file = tx_files do.run(cmd.format(**locals()), message.format(**locals())) fixed_count_file = _format_count_file(count_file, data) fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data) shutil.move(fixed_count_file, count_file) shutil.move(fixed_summary_file, summary_file) if method == "atac": if bam.is_paired(dd.get_work_bam(data)): data = tz.assoc_in(data, ("peak_counts", "NF"), count_file) else: data = tz.assoc_in(data, ("peak_counts", "full"), count_file) elif method == "chip": data = tz.assoc_in(data, ("peak_counts"), count_file) return [[data]]
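# For reference, the featureCounts SAF annotation used above is a five column,
# tab separated table (values below are made-up placeholders):
#
#     GeneID    Chr    Start    End      Strand
#     peak_1    chr1   10050    10500    .
#
# The .saf file here is expected to be derived from the consensus peak file
# elsewhere in the pipeline; this function only consumes it.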
def run_peddy(samples, out_dir=None): data = samples[0] batch = dd.get_batch(data) or dd.get_sample_name(data) if isinstance(batch, (list, tuple)): batch = batch[0] if out_dir: peddy_dir = safe_makedir(out_dir) else: peddy_dir = safe_makedir( os.path.join(dd.get_work_dir(data), "qc", batch, "peddy")) peddy_prefix = os.path.join(peddy_dir, batch) peddy_report = peddy_prefix + ".html" vcf_file = None for d in samples: vcinfo = variant.get_active_vcinfo(d, use_ensemble=False) if vcinfo and vcinfo.get("vrn_file") and utils.file_exists( vcinfo["vrn_file"]): if vcinfo["vrn_file"] and dd.get_sample_name( d) in vcfutils.get_samples(vcinfo["vrn_file"]): if vcinfo["vrn_file"] and vcfutils.vcf_has_nonfiltered_variants( vcinfo["vrn_file"]): vcf_file = vcinfo["vrn_file"] break peddy = config_utils.get_program("peddy", data) if config_utils.program_installed( "peddy", data) else None if not peddy or not vcf_file or not vcfanno.is_human(data): if not peddy: reason = "peddy executable not found" elif not vcfanno.is_human(data): reason = "sample is not human" else: assert not vcf_file reason = "no suitable VCF files found with the sample and non-filtered variants" msg = "Skipping peddy QC, %s: %s" % ( reason, [dd.get_sample_name(d) for d in samples]) with open(peddy_prefix + "-failed.log", "w") as out_handle: out_handle.write(msg) logger.info(msg) return samples if file_exists(peddy_prefix + "-failed.log"): return samples if not file_exists(peddy_report): ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir) num_cores = dd.get_num_cores(data) with tx_tmpdir(data) as tx_dir: peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix)) # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2 stderr_log = os.path.join(tx_dir, "run-stderr.log") sites_str = "--sites hg38" if dd.get_genome_build( data) == "hg38" else "" cmd = ( "{peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} " "{vcf_file} {ped_file} 2> {stderr_log}") message = "Running peddy on {vcf_file} against {ped_file}." try: do.run(cmd.format(**locals()), message.format(**locals())) except: to_show = collections.deque(maxlen=100) with open(stderr_log) as in_handle: for line in in_handle: to_show.append(line) def allowed_errors(l): return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0)) def all_line_errors(l): return (l.find("no intervals found for") >= 0) if any([allowed_errors(l) for l in to_show]) or all( [all_line_errors(l) for l in to_show]): logger.info( "Skipping peddy because no variants overlap with checks: %s" % batch) with open(peddy_prefix + "-failed.log", "w") as out_handle: out_handle.write( "peddy did not find overlaps with 1kg sites in VCF, skipping" ) return samples else: logger.warning("".join(to_show)) raise for ext in PEDDY_OUT_EXTENSIONS: if os.path.exists(peddy_prefix_tx + ext): shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext) peddyfiles = expected_peddy_files(peddy_report, batch) return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
def _get_program_file(dirs):
    if dirs.get("work"):
        base_dir = utils.safe_makedir(os.path.join(dirs["work"], "provenance"))
        return os.path.join(base_dir, "programs.txt")

def _sv_workdir(data):
    return utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                           dd.get_sample_name(data), "cnvkit"))

def _align(data, fastq_file, args):
    work_dir = os.path.join("align")
    work_dir = os.path.abspath(safe_makedir(work_dir))
    out_prefix = os.path.join(work_dir, "seqs_")
    bam_file = star_align(data, args, fastq_file, out_prefix, 1000)
    return bam_file

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    max_hits = 10
    srna = data["analysis"].lower().startswith("smallrna-seq")
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
           "--outStd SAM {srna_opts} "
           "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
    cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file)
    cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += (" --chimSegmentMin 15 --chimJunctionOverhangMin 15 "
                "--chimOutType WithinSAM ")
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded" and not srna:
        cmd += " --outSAMstrandField intronMotif "
    if dd.get_transcriptome_align(data) and not is_transcriptome_broken(data):
        cmd += " --quantMode TranscriptomeSAM "
    with file_transaction(data, final_out) as tx_final_out:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)
    data = _update_data(final_out, out_dir, names, data)
    return data

def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches and offers a point for
    adding additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools,
      including GC and repeat information
    - fix: uses the background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out_files = {}
    back_files = {}
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x),
                                      tz.get_in(["depth", "bins", "antitarget"], x)]
                                     for x in backgrounds], [])
        assert inputs, "Did not find inputs for sample batch: %s" % (
            " ".join(dd.get_sample_name(x) for x in items))
        for d in inputs:
            if tz.get_in(["depth", "bins", "target"], d):
                target_bed = tz.get_in(["depth", "bins", "target"], d)
                antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                   dd.get_sample_name(inputs[0]), "bins"))
        input_backs = set(filter(lambda x: x is not None,
                                 [dd.get_background_cnv_reference(d) for d in inputs]))
        if input_backs:
            assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
            back_file = list(input_backs)[0]
        else:
            back_file = cnvkit.cnvkit_background(cnns,
                                                 os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                                 backgrounds or inputs, target_bed, antitarget_bed)
        fix_cmd_inputs = []
        for data in inputs:
            work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                       dd.get_sample_name(data), "bins"))
            if tz.get_in(["depth", "bins", "target"], data):
                fix_file = os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
                fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"], data),
                                       tz.get_in(["depth", "bins", "antitarget"], data),
                                       back_file, fix_file, data))
                out_files[dd.get_sample_name(data)] = fix_file
                back_files[dd.get_sample_name(data)] = back_file
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)
    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)]
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
        out.append([data])
    return out

def setUp(self):
    self.out_dir = "test_picard"
    safe_makedir(self.out_dir)

def _run_cnvkit_shared_orig(inputs, backgrounds):
    """Original CNVkit implementation with full normalization and segmentation.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input)
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                         zip(["evaluate"] * len(inputs), inputs)
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_general_coverage(cdata, itype)
                                        for itype, cdata in samples_to_run])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_original_coverage(cdata, itype)
                                        for itype, cdata in samples_to_run])
        # Metrics are currently not calculated due to speed and the need for re-evaluation.
        # We could re-enable this with larger truth sets to evaluate background noise, but
        # want to reimplement it in a more general fashion as part of normalization.
        if False:
            coverage_cnns = reduce(operator.add,
                                   [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                    inputs + backgrounds)
                                    for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
            background_cnn = cnvkit_background(_select_background_cnns(coverage_cnns),
                                               background_cnn, inputs, target_bed, antitarget_bed)
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = cnvkit_background([x["file"] for x in coverage_cnns
                                                if x["itype"] == "background"],
                                               background_cnn, inputs, target_bed, antitarget_bed)
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs, ckouts) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds) for cnr, data in fixed_cnrs]
    return ckouts

def run_peddy(samples, out_dir=None):
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info("peddy is not installed, the sample is not human or sample VCFs don't match, "
                    "skipping correspondence checking for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)
    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirect stderr because peddy/cyvcf2 is incredibly noisy with "no intervals found" messages
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}"
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        except:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)
            if to_show[-1].find("IndexError") >= 0 and to_show[-1].find("is out of bounds for axis") >= 0:
                logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)

def main(config_file, view):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for project
    input_dir = config["dir"]["data"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = map(make_test, input_files, [config] * len(input_files))
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files

    logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "tophat":
            logger.info("Running Tophat on %s." % (curr_files))
            #tophat = repository["tophat"](config)
            tophat = Tophat(config)
            tophat_outputs = view.map(tophat, curr_files)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = repository[stage](config)
            view.map(disambiguate, curr_files)

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (bamfiles))
            name_sorted = view.map(sam.bam_name_sort, bamfiles)
            curr_files = view.map(sam.bam2sam, name_sorted)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            #rseq_args = zip(*product(curr_files, [config]))
            rseq_args = zip(*product(final_bamfiles, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file, RPKM_count_out)
            """
            annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"],
                                         ["ensembl_gene_id"], ["human"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs

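# Illustrative sketch of the minimal YAML layout main() above reads: it only relies on
# config["dir"]["data"], config["dir"]["results"], the ordered config["run"] stage list and the
# optional test_pipeline flag. The paths and stage selection below are example values; the keys
# each individual stage runner needs would come from the real project configuration.
EXAMPLE_PIPELINE_CONFIG = """
dir:
  data: /path/to/fastq
  results: results
test_pipeline: false
run:
  - fastqc
  - cutadapt
  - tophat
  - rnaseq_metrics
  - rseqc
"""
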
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        pct_coverage = (pybedtools.BedTool(raw_target_bed).total_coverage() /
                        float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     pct_coverage, raw_work_dir, inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                         zip(["evaluate"] * len(inputs), inputs)
        raw_coverage_cnns = [_cnvkit_coverage(cdata, bed, itype) for itype, cdata in samples_to_run
                             for bed in [target_bed, antitarget_bed]]
        coverage_cnns = reduce(operator.add,
                               [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                inputs + backgrounds)
                                for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data) for cnr, data in fixed_cnrs]
    return ckouts

def prep_cwl(samples, workflow_fn, out_dir, out_file, integrations=None,
             add_container_tag=None):
    """Output a CWL description with sub-workflows and steps.
    """
    if add_container_tag is None:
        container_tags = None
    elif add_container_tag.lower() == "quay_lookup":
        container_tags = {}
    else:
        container_tags = collections.defaultdict(lambda: add_container_tag)
    step_dir = utils.safe_makedir(os.path.join(out_dir, "steps"))
    get_retriever = GetRetriever(integrations, samples)
    variables, keyvals = _flatten_samples(samples, out_file, get_retriever)
    cur_remotes = _get_cur_remotes(keyvals)
    file_estimates = _calc_input_estimates(keyvals, get_retriever)
    out = _cwl_workflow_template(variables)
    parent_wfs = []
    step_parallelism = {}
    steps, wfoutputs = workflow_fn(samples)
    used_inputs = set([])
    for cur in workflow.generate(variables, steps, wfoutputs):
        if cur[0] == "step":
            _, name, parallel, inputs, outputs, image, programs, disk, cores, no_files = cur
            step_file = _write_tool(step_dir, name, inputs, outputs, parallel, image, programs,
                                    file_estimates, disk, cores, samples, cur_remotes, no_files,
                                    container_tags)
            out["steps"].append(_step_template(name, step_file, inputs, outputs, parallel,
                                               step_parallelism))
            used_inputs |= set(x["id"] for x in inputs)
        elif cur[0] == "expressiontool":
            _, name, inputs, outputs, expression, parallel = cur
            step_file = _write_expressiontool(step_dir, name, inputs, outputs, expression, parallel)
            out["steps"].append(_step_template(name, step_file, inputs, outputs, parallel,
                                               step_parallelism))
            used_inputs |= set(x["id"] for x in inputs)
        elif cur[0] == "upload":
            for output in cur[1]:
                wf_output = copy.deepcopy(output)
                if "outputSource" not in wf_output:
                    wf_output["outputSource"] = wf_output.pop("source")
                wf_output = _clean_record(wf_output)
                out["outputs"].append(wf_output)
        elif cur[0] == "wf_start":
            parent_wfs.append(out)
            out = _cwl_workflow_template(cur[1])
        elif cur[0] == "wf_finish":
            _, name, parallel, inputs, outputs, scatter = cur
            wf_out_file = "wf-%s.cwl" % name
            with open(os.path.join(out_dir, wf_out_file), "w") as out_handle:
                yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
            out = parent_wfs.pop(-1)
            out["steps"].append(_step_template(name, wf_out_file, inputs, outputs, parallel,
                                               step_parallelism, scatter))
            used_inputs |= set(x["id"] for x in inputs)
        else:
            raise ValueError("Unexpected workflow value %s" % str(cur))
        step_parallelism[name] = parallel
    with open(out_file, "w") as out_handle:
        out["inputs"] = [x for x in out["inputs"] if x["id"] in used_inputs]
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    sample_json = "%s-samples.json" % utils.splitext_plus(out_file)[0]
    out_clean = _clean_final_outputs(copy.deepcopy({k: v for k, v in keyvals.items()
                                                    if k in used_inputs}),
                                     get_retriever)
    with open(sample_json, "w") as out_handle:
        json.dump(out_clean, out_handle, sort_keys=True, indent=4, separators=(',', ': '))
    return out_file, sample_json

def run_mosdepth(data, target_name, bed_file, per_base=False, quantize=None, thresholds=None):
    """Run mosdepth generating distribution, region depth and per-base depth.
    """
    MosdepthCov = collections.namedtuple("MosdepthCov",
                                         ("dist", "per_base", "regions", "quantize", "thresholds"))
    bam_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "coverage",
                                               dd.get_sample_name(data)))
    prefix = os.path.join(work_dir, "%s-%s" % (dd.get_sample_name(data), target_name))
    old_dist_file = "%s.mosdepth.dist.txt" % (prefix)
    out = MosdepthCov((old_dist_file if utils.file_uptodate(old_dist_file, bam_file)
                       else "%s.mosdepth.%s.dist.txt" % (prefix, "region" if bed_file else "global")),
                      ("%s.per-base.bed.gz" % prefix) if per_base else None,
                      ("%s.regions.bed.gz" % prefix) if bed_file else None,
                      ("%s.quantized.bed.gz" % prefix) if quantize else None,
                      ("%s.thresholds.bed.gz" % prefix) if thresholds else None)
    if not utils.file_uptodate(out.dist, bam_file):
        with file_transaction(data, out.dist) as tx_out_file:
            tx_prefix = os.path.join(os.path.dirname(tx_out_file), os.path.basename(prefix))
            num_cores = dd.get_cores(data)
            bed_arg = ("--by %s" % bed_file) if bed_file else ""
            perbase_arg = "" if per_base else "--no-per-base"
            mapq_arg = "-Q 1" if (per_base or quantize) else ""
            if quantize:
                quant_arg = "--quantize %s" % quantize[0]
                quant_export = " && ".join(["export MOSDEPTH_Q%s=%s" % (i, x)
                                            for (i, x) in enumerate(quantize[1])])
                quant_export += " && "
            else:
                quant_arg, quant_export = "", ""
            thresholds_cmdl = ("-T " + ",".join([str(t) for t in thresholds])) if out.thresholds else ""
            cmd = ("{quant_export}mosdepth -t {num_cores} -F 1804 {mapq_arg} {perbase_arg} "
                   "{bed_arg} {quant_arg} {tx_prefix} {bam_file} {thresholds_cmdl}")
            message = "Calculating coverage: %s %s" % (dd.get_sample_name(data), target_name)
            do.run(cmd.format(**locals()), message.format(**locals()))
            if out.per_base:
                shutil.move(os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.per_base)),
                            out.per_base)
            if out.regions:
                shutil.move(os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.regions)),
                            out.regions)
            if out.quantize:
                shutil.move(os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.quantize)),
                            out.quantize)
            if out.thresholds:
                shutil.move(os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.thresholds)),
                            out.thresholds)
    return out

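# Illustrative call (not from the pipeline itself) showing the argument shapes run_mosdepth()
# expects: `quantize` is a (breakpoints-string, labels) pair used to build the --quantize flag
# and the MOSDEPTH_Q* environment exports, while `thresholds` becomes the comma-separated -T
# list. The bin boundaries, labels, BED path and the pre-existing bcbio sample dictionary
# `data` are assumptions for this example, not bcbio defaults.
cov = run_mosdepth(data, "coverage", "/path/to/regions.bed",
                   quantize=("0:1:4:200:", ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE", "HIGH_COVERAGE"]),
                   thresholds=[1, 5, 10, 20])
# cov.regions -> <sample>-coverage.regions.bed.gz; cov.quantize -> <sample>-coverage.quantized.bed.gz
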
def _make_outdir(config):
    """Make the "fastqc" output directory where the data files live.
    """
    outdir = os.path.join(config["dir"]["results"], "fastqc")
    safe_makedir(outdir)
    return outdir

def start(parallel, items, config, dirs=None, name=None, multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote functions.

    Returns a function used to process, in parallel, items with a given function.
    Allows sharing of a single cluster across multiple functions with identical
    resource requirements. Uses local execution for non-distributed clusters or
    completed jobs. A checkpoint directory keeps track of finished tasks, avoiding
    spinning up clusters for sections that have been previously processed.

    multiplier - Number of expected jobs per initial input item. Used to avoid
    underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can be
    used to restrict multicore usage when jobs run faster on single cores.
    """
    if name:
        checkpoint_dir = utils.safe_makedir(os.path.join(dirs["work"], "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)
    else:
        checkpoint_file = None
    sysinfo = system.get_info(dirs, parallel)
    items = [x for x in items if x is not None] if items else []
    max_multicore = int(max_multicore or sysinfo.get("cores", 1))
    parallel = resources.calculate(parallel, items, sysinfo, config,
                                   multiplier=multiplier, max_multicore=max_multicore)
    try:
        view = None
        if checkpoint_file and os.path.exists(checkpoint_file):
            logger.info("run local -- checkpoint passed: %s" % name)
            parallel["cores_per_job"] = 1
            parallel["num_jobs"] = 1
            parallel["checkpointed"] = True
            yield multi.runner(parallel, config)
        elif parallel["type"] == "ipython":
            with ipython.create(parallel, dirs, config) as view:
                yield ipython.runner(view, parallel, dirs, config)
        elif parallel["type"] == "clusterk":
            with clusterk.create(parallel) as queue:
                yield clusterk.runner(queue, parallel)
        else:
            yield multi.runner(parallel, config)
    except:
        if view is not None:
            ipython.stop(view)
        raise
    else:
        for x in ["cores_per_job", "num_jobs", "mem"]:
            parallel.pop(x, None)
        if checkpoint_file:
            with open(checkpoint_file, "w") as out_handle:
                out_handle.write("done\n")

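# Because start() yields its runner, it is intended to be used through a context manager (for
# example via contextlib.contextmanager), roughly as sketched below. The "process_alignment"
# task name and the pre-existing parallel/items/config/dirs objects are assumptions for this
# illustration, not taken from the code above.
import contextlib

start_cm = contextlib.contextmanager(start)
with start_cm(parallel, items, config, dirs=dirs, name="alignment", multiplier=2) as run_parallel:
    # run_parallel is the yielded runner; it dispatches the named task over the items
    processed = run_parallel("process_alignment", items)
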
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC (for parsing additional data) picks up the sample name from that directory:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                   "--skip-duplicated --skip-dup-mode 0 "
                   "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [None, False, "None"]
                       else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # The Qualimap output folder (results_dir) needs to be named after the sample (see comments
    # above). However, to keep its name after upload, we put the base QC file (results_file) into
    # the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file)}

def _merge_target_information(samples):
    metrics_dir = utils.safe_makedir("metrics")
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples

    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    original_variant_regions = set(dd.get_variant_regions_orig(data) for data in samples)

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])]),
        }

    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    if len(original_variant_regions) == 1 and list(original_variant_regions)[0] is not None:
        vcr_orig = list(original_variant_regions)[0]
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        info["variants_regions_info"] = {
            "bed": vcr_orig,
            "size": sum(len(x) for x in pybedtools.BedTool(dd.get_variant_regions_merged(data))),
            "regions": pybedtools.BedTool(vcr_clean).count(),
        }
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            info["variants_regions_info"]["genes"] = gene_num
    else:
        info["variants_regions_info"] = {
            "bed": "callable regions",
        }

    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = list(coverage_beds)[0]
        if cov_bed not in [None, "None"]:
            if vcr_orig and vcr_orig == cov_bed:
                info["coverage_bed_info"] = info["variants_regions_info"]
            else:
                clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True)
                info["coverage_bed_info"] = {
                    "bed": cov_bed,
                    "size": pybedtools.BedTool(cov_bed).total_coverage(),
                    "regions": pybedtools.BedTool(clean_bed).count(),
                }
                gene_num = annotate.count_genes(clean_bed, data)
                if gene_num is not None:
                    info["coverage_bed_info"]["genes"] = gene_num
        else:
            info["coverage_bed_info"] = info["variants_regions_info"]

    coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"] for data in samples)
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = list(coverage_intervals)[0]

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples

                write_seq = True
            else:
                write_seq = False
        elif write_seq:
            out_handle.write(line)
    if not os.path.exists(out_file + ".bwt"):
        subprocess.check_call(["bwa", "index", out_file])
    if not os.path.exists(out_file + ".ndx"):
        subprocess.check_call(["novoindex", out_file + ".ndx", out_file])
    hlas = []
    with open(out_file) as in_handle:
        for line in in_handle:
            if line.startswith(">"):
                hlas.append(line[1:].strip())
    return out_file, hlas

def samples_from_config(sample_yaml):
    with open(sample_yaml) as in_handle:
        config = yaml.safe_load(in_handle)
    by_batch = collections.defaultdict(dict)
    for s in config["details"]:
        by_batch[s["metadata"]["batch"]][s["metadata"]["phenotype"]] = s["description"]
    for bid in sorted(by_batch.keys()):
        yield by_batch[bid]["tumor"], by_batch[bid]["normal"]

if __name__ == "__main__":
    sample_config, hla_fa, cromwell_dir = sys.argv[1:]
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "work_lohhla"))
    for t, n in sorted(samples_from_config(sample_config)):
        run_sample(t, n, work_dir, cromwell_dir, hla_fa)
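
# Illustrative sketch of the bcbio sample YAML that samples_from_config() above expects: each
# entry under "details" needs metadata.batch, metadata.phenotype (tumor/normal) and a
# description, which become the (tumor, normal) name pairs yielded per batch. The sample and
# batch names below are made up for the example.
EXAMPLE_SAMPLE_YAML = """
details:
  - description: patient1-tumor
    metadata: {batch: patient1, phenotype: tumor}
  - description: patient1-normal
    metadata: {batch: patient1, phenotype: normal}
"""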