def run(bam_file, data, out_dir):
    """Run qualimap bamqc to assess alignment quality metrics.

    Output is written to ``<out_dir>/<sample name>``. The qualimap run is
    skipped when ``genome_results.txt`` or ``qualimapReport.pdf`` is already
    present there. ``genome_results.txt`` is then copied into ``out_dir``.

    Returns a dictionary with the copied ``base`` results file and the
    ``secondary`` files reported by ``_find_qualimap_secondary_files``.
    """
    sample_name = dd.get_sample_name(data)
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, sample_name)
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    have_results = (utils.file_exists(results_file)
                    or utils.file_exists(os.path.join(results_dir, pdf_file)))
    if not have_results:
        tools_on = tz.get_in(("config", "algorithm", "tools_on"), data, [])
        if "qualimap_full" in tools_on:
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            # Downsample unless a full analysis was requested; keep the
            # original BAM when downsampling returns nothing.
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            if ds_bam:
                bam_file = ds_bam
        if "PDF" in options:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                   "--skip-duplicated --skip-dup-mode 0 "
                   "-nt {num_cores} {options}").format(
                       export=export, qualimap=qualimap, bam_file=bam_file,
                       tx_results_dir=tx_results_dir, num_cores=num_cores, options=options)
            # Pass a species flag for human and mouse builds only
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd %s" % species
            # Prefer coverage regions, falling back to merged variant regions
            coverage = dd.get_coverage(data)
            if coverage not in [None, False, "None"]:
                regions = coverage
            else:
                regions = dd.get_variant_regions_merged(data)
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                cmd += " -gff %s" % _bed_to_bed6(regions, out_dir)
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd, "Qualimap: %s" % sample_name, env=bcbio_env)
            # Replace the BAM name recorded in the report with the sample name
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            fix_cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (sample_name, tx_results_file)
            do.run(fix_cmd, "Fix Name Qualimap for {}".format(sample_name))
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above).
    # However, in order to keep its name after upload, we need to put the base QC file
    # (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    secondary = _find_qualimap_secondary_files(results_dir, base_results_file)
    return {"base": base_results_file, "secondary": secondary}
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    """Build a RapMap index of the requested type for the transcriptome.

    ``algorithm`` is combined with the suffix ``index`` and must produce
    either ``pseudoindex`` or ``quasiindex``. A user supplied transcriptome
    FASTA is used when configured; otherwise a combined FASTA is created.
    An existing index, detected through its ``rapidx.jfhash`` file, is reused.

    Returns the index directory.
    Raises AssertionError for an unsupported index type.
    """
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    # The marker file lives inside the index directory, so it needs a path separator
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    message = "Creating rapmap {index_type} for {gtf_fa} with {kmersize} bp kmers."
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k {kmersize} -i {tx_out_dir} -t {gtf_fa}"
        # Templates are filled from the local variables defined above
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def _create_combined_fasta(data, out_dir):
    """Create a transcriptome FASTA covering every genome being disambiguated.

    A per-genome ``<build>.fa`` file is created in ``out_dir`` for each item
    returned by ``disambiguate.split`` (existing files are reused), and the
    results are concatenated into a single combined FASTA file.

    Returns the path to the combined FASTA file.
    """
    fasta_files = []
    for item in disambiguate.split([data]):
        odata = item[0]
        gtf_file = dd.get_gtf_file(odata)
        ref_file = dd.get_ref_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + ".fa")
        if not file_exists(out_file):
            out_file = _gtf_to_fasta(gtf_file, ref_file, out_file)
            out_file = _clean_gtf_fa(out_file, out_file)
        fasta_files.append(out_file)
    # The combined file is named after the build plus any disambiguation genomes
    name_parts = [os.path.join(out_dir, dd.get_genome_build(data))]
    if dd.get_disambiguate(data):
        name_parts = name_parts + dd.get_disambiguate(data)
    combined_file = "-".join(name_parts) + ".fa"
    if file_exists(combined_file):
        return combined_file
    with file_transaction(combined_file) as tx_out_file:
        cmd = "cat {inputs} > {output}".format(inputs=" ".join(fasta_files),
                                               output=tx_out_file)
        do.run(cmd, "Combining transcriptome FASTA files.")
    return combined_file
def run_vcfanno(vcf, anno_type, data):
    """Annotate a VCF file using vcfanno.

    Looks for ``<build>-<anno_type>.conf`` and ``<build>-<anno_type>.lua`` in
    the ``vcfanno`` directory next to the reference genome directory.

    Returns the input ``vcf`` unchanged (after logging a warning) when the
    annotation type is not supported or the configuration file is missing;
    otherwise returns the annotated output file, reusing it when it already
    exists.
    """
    if anno_type not in SUPPORTED_ANNOTATION_TYPES:
        # SUPPORTED_ANNOTATION_TYPES is not a local variable, so it has to be
        # passed to format explicitly rather than through locals().
        logger.warning(
            "{anno_type} is not a supported vcf annotation type with vcfanno. "
            "Supported types are {SUPPORTED_ANNOTATION_TYPES}".format(
                anno_type=anno_type,
                SUPPORTED_ANNOTATION_TYPES=SUPPORTED_ANNOTATION_TYPES))
        return vcf
    build = dd.get_genome_build(data)
    annodir = os.path.dirname(dd.get_ref_file(data))
    annodir = os.path.abspath(os.path.join(annodir, os.pardir, "vcfanno"))
    annostem = os.path.join(annodir, build + "-")
    conffn = annostem + anno_type + ".conf"
    luafn = annostem + anno_type + ".lua"
    if not utils.file_exists(conffn):
        logger.warning(
            "The vcfanno configuration {conffn} was not found for {build}, skipping.".format(
                conffn=conffn, build=build))
        return vcf
    base = os.path.splitext(vcf)[0]
    out_file = base + anno_type + "-annotated.vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    # The vcfanno directory itself is passed as the base path
    return vcfanno(vcf, out_file, conffn, data, annodir, luafn)
def ref_file_from_bam(bam_file, data):
    """Subset the reference FASTA to the contigs listed for a BAM file.

    Contig names come from ``idxstats`` (excluding ``*``) and are written to
    a ``-contigs.txt`` file used by ``seqtk subseq``. A newly created subset
    is indexed with ``ref.fasta_idx`` and picard.

    Returns a dictionary with the subset reference under ``base``.
    """
    ref_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "inputs", "ref"))
    new_ref = os.path.join(ref_dir, "%s-subset.fa" % dd.get_genome_build(data))
    if utils.file_exists(new_ref):
        return {"base": new_ref}
    with file_transaction(data, new_ref) as tx_out_file:
        contig_file = "%s-contigs.txt" % utils.splitext_plus(new_ref)[0]
        with open(contig_file, "w") as out_handle:
            # "*" collects reads without a contig and is not a real sequence
            contigs = [x.contig for x in idxstats(bam_file, data) if x.contig != "*"]
            for contig in contigs:
                out_handle.write("%s\n" % contig)
        cmd = "seqtk subseq -l 100 %s %s > %s" % (dd.get_ref_file(data), contig_file, tx_out_file)
        do.run(cmd, "Subset %s to BAM file contigs" % dd.get_genome_build(data))
    ref.fasta_idx(new_ref, data["config"])
    picard = broad.runner_from_path("picard", data["config"])
    picard.run_fn("picard_index_ref", new_ref)
    return {"base": new_ref}
def _get_input_args(bam_file, data, out_base, background):
    """Retrieve input args, depending on genome build.

    VerifyBamID2 only handles GRCh37 (1, 2, 3) not hg19, so for hg19 a pileup
    is generated (fixing chromosome naming) and passed in place of the BAM.

    Returns a two element list: the command line flag and its file argument.
    """
    if dd.get_genome_build(data) == "hg19":
        pileup_file = _create_pileup(bam_file, data, out_base, background)
        return ["--PileupFile", pileup_file]
    return ["--BamFile", bam_file]
def calling(data):
    """Main function to parallelize peak calling.

    Looks up the caller function named by ``data["peak_fn"]``, runs it in a
    per-sample directory under the work directory and stores the result in
    ``data["peaks_file"]``.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    chip_bam = dd.get_work_bam(data)
    # Samples without an input/control BAM pass None instead of raising KeyError
    input_bam = data.get("work_bam_input", None)
    caller_fn = get_callers()[data["peak_fn"]]
    name = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
    out_file = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir, data["config"])
    data["peaks_file"] = out_file
    return [[data]]
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.

    The run is skipped when the ``rds`` output is up to date relative to the
    normalized bin file, or when a previous failure log exists. Failures
    accepted by ``_allowed_errors`` are written to the failure log; any
    other failure is re-raised.

    Returns the PureCN output dictionary when the ``rds`` file exists,
    otherwise None.
    """
    tumor = paired.tumor_data
    segfns = {"cnvkit": _segment_normalized_cnvkit,
              "gatk-cnv": _segment_normalized_gatk}
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], tumor)
    if not utils.file_uptodate(out["rds"], cnr_file) and not utils.file_exists(failed_file):
        segment_fn = segfns[cnvkit.bin_approach(tumor)]
        cnr_file, seg_file = segment_fn(cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        variants = heterogeneity.get_variants(tumor, include_germline=False)
        vcf_file = germline.filter_to_pass_and_reject(variants[0]["vrn_file"], paired, out_dir=work_dir)
        with file_transaction(tumor, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            build = dd.get_genome_build(tumor)
            genome = "hg19" if build in ["GRCh37", "hg19"] else build
            cmd = ["PureCN.R", "--seed", "42",
                   "--out", tx_out_base,
                   "--rds", "%s.rds" % tx_out_base,
                   "--sampleid", dd.get_sample_name(tumor),
                   "--genome", genome,
                   "--vcf", vcf_file,
                   "--tumor", cnr_file,
                   "--segfile", seg_file,
                   "--funsegmentation", "Hclust",
                   "--maxnonclonal", "0.3"]
            num_cores = dd.get_num_cores(tumor)
            if num_cores > 1:
                cmd += ["--cores", str(num_cores)]
            try:
                cmd = "export R_LIBS_USER=%s && %s && %s" % (
                    utils.R_sitelib(), utils.get_R_exports(), " ".join(str(x) for x in cmd))
                do.run(cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as err:
                if not _allowed_errors(str(err)):
                    logger.exception()
                    raise
                logger.info("PureCN failed to find solution for %s: skipping" %
                            dd.get_sample_name(tumor))
                with open(failed_file, "w") as out_handle:
                    out_handle.write(str(err))
            # Move whatever outputs were produced next to the final output base
            tx_dir = os.path.dirname(tx_out_base)
            final_dir = os.path.dirname(out_base)
            for fname in all_files:
                tx_file = os.path.join(tx_dir, fname)
                if os.path.exists(tx_file):
                    shutil.move(tx_file, os.path.join(final_dir, fname))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    if out.get("rds") and os.path.exists(out["rds"]):
        return out
    return None
def get_genome(data):
    """Return the effective genome length for the sample's genome build.

    Uses the value in ``macs2.HS`` when the build is listed there, otherwise
    falls back to the summed sequence lengths of the reference file.
    """
    from bcbio.chipseq import macs2
    from bcbio.bam import fasta
    build = dd.get_genome_build(data)
    precomputed = macs2.HS
    if build not in precomputed:
        return sum(fasta.sequence_length(dd.get_ref_file(data)).values())
    return precomputed[build]
def rapmap_index(gtf_file, ref_file, data, out_dir):
    """Create a RapMap pseudoindex for the combined transcriptome FASTA.

    The index lives under ``<out_dir>/index/<genome build>``, with the
    disambiguation genomes appended to the name when configured. An existing
    index, detected through its ``rapidx.jfhash`` file, is reused.

    Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", data["config"])
    gtf_fa = create_combined_fasta(data, out_dir)
    # The marker file lives inside the index directory, so it needs a path separator
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} pseudoindex -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating RapMap pseudoindex for {gtf_fa}."
        # Templates are filled from the local variables defined above
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def is_human(data, builds=None):
    """Check if human, optionally with build number, search by name or extra GL contigs.

    ``builds`` may contain "37" and/or "38" to restrict the check; when it is
    empty, the ``human`` genome alias and both build families are accepted.
    Build names are matched by prefix.
    """
    def has_build37_contigs(data):
        # Look for GL contigs that are named in the hg19 or GRCh37 maps
        return any(
            contig.name in naming.GMAP["hg19"] or contig.name in naming.GMAP["GRCh37"]
            for contig in ref.file_contigs(dd.get_ref_file(data))
            if contig.name.startswith("GL") or "_gl" in contig.name)

    if not builds and tz.get_in(["genome_resources", "aliases", "human"], data):
        return True
    if not builds or "37" in builds:
        if dd.get_genome_build(data).startswith(("hg19", "GRCh37")) or has_build37_contigs(data):
            return True
    if not builds or "38" in builds:
        if dd.get_genome_build(data).startswith("hg38"):
            return True
    return False
def is_human(data, builds=None):
    """Check if human, optionally with build number, search by name or extra GL contigs.

    ``builds`` may contain "37" and/or "38" to restrict the check; when it is
    empty, the ``human`` genome alias and both build families are accepted.
    Build names must match exactly.
    """
    def has_build37_contigs(data):
        # Look for GL contigs that are named in the hg19 or GRCh37 maps
        return any(
            contig.name in naming.GMAP["hg19"] or contig.name in naming.GMAP["GRCh37"]
            for contig in ref.file_contigs(dd.get_ref_file(data))
            if contig.name.startswith("GL") or "_gl" in contig.name)

    check_all = not builds
    if check_all and tz.get_in(["genome_resources", "aliases", "human"], data):
        return True
    if check_all or "37" in builds:
        if dd.get_genome_build(data) in ("hg19", "GRCh37") or has_build37_contigs(data):
            return True
    if check_all or "38" in builds:
        if dd.get_genome_build(data) == "hg38":
            return True
    return False
def calling(data):
    """Main function to parallelize peak calling.

    Runs the caller named by ``data["peak_fn"]`` in a per-sample directory
    under the work directory and records its result in ``data["peaks_file"]``.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    chip_bam = dd.get_work_bam(data)
    input_bam = data.get("work_bam_input", None)
    peak_caller = get_callers()[data["peak_fn"]]
    sample_name = dd.get_sample_name(data)
    caller_dir = os.path.join(dd.get_work_dir(data), data["peak_fn"], sample_name)
    out_dir = utils.safe_makedir(caller_dir)
    data["peaks_file"] = peak_caller(sample_name, chip_bam, input_bam,
                                     dd.get_genome_build(data), out_dir,
                                     dd.get_chip_method(data), data["config"])
    return [[data]]
def run_arriba(data):
    """Run arriba fusion detection on the sample's work BAM.

    Builds outside ``SUPPORTED_BUILDS`` are skipped and ``data`` is returned
    unchanged. Otherwise ``data["arriba"]`` is set to the ``fusions`` and
    ``discarded`` output files; an existing fusions file is reused.
    """
    build = dd.get_genome_build(data)
    if build not in SUPPORTED_BUILDS:
        logger.info(f"{build} not supported for arriba, skipping.")
        return data

    sample_name = dd.get_sample_name(data)
    arriba_dir = os.path.join(dd.get_work_dir(data), "arriba", sample_name)
    utils.safe_makedir(arriba_dir)
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_ref_file(data)
    gtf = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    arriba = config_utils.get_program("arriba", data)
    fusion_file = os.path.join(arriba_dir, "fusions.tsv")
    discarded_fusion_file = os.path.join(arriba_dir, "fusions.discarded.tsv")
    outputs = {"fusions": fusion_file, "discarded": discarded_fusion_file}
    blacklist_file = get_arriba_blacklist_file(data)
    contig_list = ",".join(get_contigs(data))

    if not utils.file_exists(fusion_file):
        with file_transaction(fusion_file) as tx_fusion_file, \
             file_transaction(discarded_fusion_file) as tx_discarded_fusion_file:
            # Every fragment carries its own trailing space
            cmd_parts = [
                f"{arriba} -x {bam_file} -g {gtf} -a {ref_file} -o {tx_fusion_file} ",
                f"-O {tx_discarded_fusion_file} -T -P ",
                f"-i {contig_list} ",
            ]
            if blacklist_file:
                logger.info(
                    f"arriba blacklist file found, running blacklisting with {blacklist_file}.")
                cmd_parts.append(f"-b {blacklist_file} ")
            else:
                logger.info("arriba blacklist file not found, disabling blacklist filtering.")
                cmd_parts.append("-f blacklist ")
            known_fusions = dd.get_known_fusions(data)
            if known_fusions:
                cmd_parts.append(f"-k {known_fusions} ")
            do.run("".join(cmd_parts), f"Running arriba on {sample_name}.")

    data["arriba"] = outputs
    return data
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    """Build a RapMap index of the requested type with 31 bp kmers.

    ``algorithm`` is combined with the suffix ``index`` and must produce
    either ``pseudoindex`` or ``quasiindex``. A user supplied transcriptome
    FASTA is used when configured; otherwise a combined FASTA is created.
    An existing index, detected through its ``rapidx.jfhash`` file, is reused.

    Returns the index directory.
    Raises AssertionError for an unsupported index type.
    """
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    # The marker file lives inside the index directory, so it needs a path separator
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k 31 -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating rapmap {index_type} for {gtf_fa}."
        # Templates are filled from the local variables defined above
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def sailfish_index(gtf_file, ref_file, data, out_dir):
    """Create a sailfish index (k=25) for the combined transcriptome FASTA.

    The index lives under ``<out_dir>/index/<genome build>``, with the
    disambiguation genomes appended to the name when configured. An existing
    index, detected through its ``versionInfo.json`` file, is reused.

    Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    sailfish = config_utils.get_program("sailfish", data["config"])
    num_cores = dd.get_num_cores(data)
    gtf_fa = create_combined_fasta(data, out_dir)
    # The marker file lives inside the index directory, so it needs a path separator
    if file_exists(os.path.join(out_dir, "versionInfo.json")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{sailfish} index -p {num_cores} -t {gtf_fa} -o {tx_out_dir} -k 25"
        message = "Creating sailfish index for {gtf_fa}."
        # Templates are filled from the local variables defined above
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def calling(data):
    """Main function to parallelize peak calling.

    Runs the caller named by ``data["peak_fn"]`` in a per-sample directory
    under the work directory and records its result in ``data["peaks_file"]``.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    chip_bam = dd.get_work_bam(data)
    input_bam = data.get("work_bam_input", None)
    peak_caller = get_callers()[data["peak_fn"]]
    sample_name = dd.get_sample_name(data)
    caller_dir = os.path.join(dd.get_work_dir(data), data["peak_fn"], sample_name)
    out_dir = utils.safe_makedir(caller_dir)
    data["peaks_file"] = peak_caller(sample_name, chip_bam, input_bam,
                                     dd.get_genome_build(data), out_dir,
                                     dd.get_chip_method(data), data["config"])
    return [[data]]
def salmon_index(gtf_file, ref_file, data, out_dir):
    """Create a Salmon index (k=31) for the combined transcriptome FASTA.

    The index lives under ``<out_dir>/index/<genome build>``, with the
    disambiguation genomes appended to the name when configured. The index
    is rebuilt on every call; there is no check for an existing one.

    Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    gtf_fa = sailfish._create_combined_fasta(data, out_dir)
    # TODO: add memoization here so an existing index is reused
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        # Templates are filled from the local variables defined above
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def rapmap_pseudoindex(gtf_file, ref_file, data, out_dir):
    """Create a RapMap pseudoindex (k=31) for the combined transcriptome FASTA.

    The index lives under ``<out_dir>/pseudoindex/<genome build>``, with the
    disambiguation genomes appended to the name when configured. An existing
    index, detected through its ``rapidx.jfhash`` file, is reused.

    Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "pseudoindex", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    gtf_fa = sailfish._create_combined_fasta(data, out_dir)
    # The marker file lives inside the index directory, so it needs a path separator
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} pseudoindex -k 31 -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating rapmap pseudoindex for {gtf_fa}."
        # Templates are filled from the local variables defined above
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def get_genome(data):
    """Return the effective genome length for the sample's genome build.

    Uses the value in ``macs2.HS`` when the build is listed there, otherwise
    falls back to the summed sequence lengths of the reference file.
    """
    from bcbio.chipseq import macs2
    from bcbio.bam import fasta
    build = dd.get_genome_build(data)
    precomputed = macs2.HS
    if build not in precomputed:
        return sum(fasta.sequence_length(dd.get_ref_file(data)).values())
    return precomputed[build]
def salmon_index(gtf_file, ref_file, data, out_dir):
    """Create a Salmon index (k=31) for the combined transcriptome FASTA.

    The index lives under ``<out_dir>/index/<genome build>``, with the
    disambiguation genomes appended to the name when configured. The index
    is rebuilt on every call; there is no check for an existing one.

    Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    gtf_fa = sailfish._create_combined_fasta(data, out_dir)
    # TODO: add memoization here so an existing index is reused
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        # Templates are filled from the local variables defined above
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.

    Runs transcript counting, then sailfish, kallisto and salmon as selected
    by the expression and fusion callers, recording outputs in
    ``data["quant"]``. Returns the sample wrapped as ``[[data]]``.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}

    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]), "abundance.h5")

    # kallisto is also needed when pizzly is used for fusion calling
    if ("kallisto" in dd.get_expression_caller(data)
            or "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        kallisto_dir = data["kallisto_quant"]
        data["quant"]["tsv"] = os.path.join(kallisto_dir, "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(kallisto_dir, "abundance.h5")
        fusion_file = os.path.join(kallisto_dir, "fusion.txt")
        data["quant"]["fusion"] = fusion_file if os.path.exists(fusion_file) else None

    if "salmon" in dd.get_expression_caller(data):
        if not dd.get_quantify_genome_alignments(data):
            salmon_runner = salmon.run_salmon_reads
        elif dd.get_aligner(data).lower() == "star":
            salmon_runner = salmon.run_salmon_bam
        elif dd.get_genome_build(data) == "hg38":
            logger.warning(
                "Whole genome alignment-based Salmon quantification is "
                "only supported for the STAR aligner. Since this is hg38 we will fall "
                "back to the decoy method")
            salmon_runner = salmon.run_salmon_decoy
        else:
            logger.warning(
                "Whole genome alignment-based Salmon quantification is "
                "only supported for the STAR aligner. Falling back to the "
                "transcriptome-only method.")
            salmon_runner = salmon.run_salmon_reads
        data = to_single_data(salmon_runner(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]), "abundance.h5")
    return [[data]]
def ref_file_from_bam(bam_file, data):
    """Subset the reference FASTA to the contigs listed for a BAM file.

    Contig names come from ``idxstats`` (excluding ``*``) and are written to
    a ``-contigs.txt`` file used by ``seqtk subseq``. A newly created subset
    is indexed with ``ref.fasta_idx`` and picard.

    Returns a dictionary with the subset reference under ``base``.
    """
    ref_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "inputs", "ref"))
    new_ref = os.path.join(ref_dir, "%s-subset.fa" % dd.get_genome_build(data))
    if utils.file_exists(new_ref):
        return {"base": new_ref}
    with file_transaction(data, new_ref) as tx_out_file:
        contig_file = "%s-contigs.txt" % utils.splitext_plus(new_ref)[0]
        with open(contig_file, "w") as out_handle:
            # "*" collects reads without a contig and is not a real sequence
            contigs = [x.contig for x in idxstats(bam_file, data) if x.contig != "*"]
            for contig in contigs:
                out_handle.write("%s\n" % contig)
        cmd = "seqtk subseq -l 100 %s %s > %s" % (dd.get_ref_file(data), contig_file, tx_out_file)
        do.run(cmd, "Subset %s to BAM file contigs" % dd.get_genome_build(data))
    ref.fasta_idx(new_ref, data["config"])
    picard = broad.runner_from_path("picard", data["config"])
    picard.run_fn("picard_index_ref", new_ref)
    return {"base": new_ref}
def calling(data):
    """Main function to parallelize peak calling.

    Runs the caller named by ``data["peak_fn"]`` in a per-sample directory
    under the work directory, then runs greylisting. Results are stored in
    ``data["peaks_files"]`` and, when greylisting returns a value,
    ``data["greylist"]``.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    chip_bam = data.get("work_bam")
    input_bam = data.get("work_bam_input", None)
    peak_caller = get_callers()[data["peak_fn"]]
    sample_name = dd.get_sample_name(data)
    caller_dir = os.path.join(dd.get_work_dir(data), data["peak_fn"], sample_name)
    out_dir = utils.safe_makedir(caller_dir)
    out_files = peak_caller(sample_name, chip_bam, input_bam,
                            dd.get_genome_build(data), out_dir,
                            dd.get_chip_method(data), data["resources"], data)
    greylist_dir = greylisting(data)
    data.update({"peaks_files": out_files})
    if greylist_dir:
        data["greylist"] = greylist_dir
    return [[data]]
def calling(data):
    """Main function to parallelize peak calling.

    Both the ChIP and input BAMs are passed through ``_prepare_bam`` with the
    ENCODE blacklist BED from the genome resources before the caller named by
    ``data["peak_fn"]`` is run. Results are stored in ``data["peaks_files"]``.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    chip_bam = dd.get_work_bam(data)
    input_bam = data.get("work_bam_input", None)
    peak_caller = get_callers()[data["peak_fn"]]
    sample_name = dd.get_sample_name(data)
    caller_dir = os.path.join(dd.get_work_dir(data), data["peak_fn"], sample_name)
    out_dir = utils.safe_makedir(caller_dir)
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    config = data['config']
    chip_bam = _prepare_bam(chip_bam, encode_bed, config)
    input_bam = _prepare_bam(input_bam, encode_bed, config)
    out_files = peak_caller(sample_name, chip_bam, input_bam,
                            dd.get_genome_build(data), out_dir,
                            dd.get_chip_method(data), data["config"])
    data.update({"peaks_files": out_files})
    return [[data]]
def sailfish_index(gtf_file, ref_file, data, out_dir):
    """Create a sailfish index (k=25) for the combined transcriptome FASTA.

    The index lives under ``<out_dir>/index/<genome build>``, with the
    disambiguation genomes appended to the name when configured. An existing
    index, detected through its ``versionInfo.json`` file, is reused.

    Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    sailfish = config_utils.get_program("sailfish", data["config"])
    num_cores = dd.get_num_cores(data)
    gtf_fa = _create_combined_fasta(data, out_dir)
    # The marker file lives inside the index directory, so it needs a path separator
    if file_exists(os.path.join(out_dir, "versionInfo.json")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{sailfish} index -p {num_cores} -t {gtf_fa} -o {tx_out_dir} -k 25"
        message = "Creating sailfish index for {gtf_fa}."
        # Templates are filled from the local variables defined above
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def calling(data):
    """Main function to parallelize peak calling.

    Runs the caller named by ``data["peak_fn"]`` in a per-sample directory
    under the work directory, then runs greylisting. Results are stored in
    ``data["peaks_files"]`` and, when greylisting returns a value,
    ``data["greylist"]``.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    chip_bam = data.get("work_bam")
    input_bam = data.get("work_bam_input", None)
    peak_caller = get_callers()[data["peak_fn"]]
    sample_name = dd.get_sample_name(data)
    caller_dir = os.path.join(dd.get_work_dir(data), data["peak_fn"], sample_name)
    out_dir = utils.safe_makedir(caller_dir)
    out_files = peak_caller(sample_name, chip_bam, input_bam,
                            dd.get_genome_build(data), out_dir,
                            dd.get_chip_method(data), data["resources"], data)
    greylist_dir = greylisting(data)
    data.update({"peaks_files": out_files})
    if greylist_dir:
        data["greylist"] = greylist_dir
    return [[data]]
def _run_purple(paired, het_file, depth_file, vrn_files, work_dir):
    """Run PURPLE with pre-calculated AMBER and COBALT compatible inputs.

    PURPLE is run only when ``<sample>.purple.cnv`` is missing from the
    ``purple`` directory under ``work_dir``. The result is exposed as
    ``<sample>-purple-cnv.tsv`` and converted to VCF.

    Returns a dictionary with ``variantcaller``, ``call_file``, ``vrn_file``,
    the ``plot`` files found on disk and the ``metrics`` read from the
    ``.purple.purity`` file (numeric values converted to float).
    """
    tumor = paired.tumor_data
    tumor_name = dd.get_sample_name(tumor)
    purple_dir = utils.safe_makedir(os.path.join(work_dir, "purple"))
    out_file = os.path.join(purple_dir, "%s.purple.cnv" % tumor_name)
    if not utils.file_exists(out_file):
        with file_transaction(tumor, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            cmd = ["PURPLE"] + _get_jvm_opts(tx_out_file, tumor) + \
                  ["-amber", os.path.dirname(het_file), "-baf", het_file,
                   "-cobalt", os.path.dirname(depth_file),
                   "-gc_profile", dd.get_variation_resources(tumor)["gc_profile"],
                   "-output_dir", tx_dir,
                   "-ref_genome", "hg38" if dd.get_genome_build(tumor) == "hg38" else "hg19",
                   "-run_dir", work_dir,
                   "-threads", dd.get_num_cores(tumor),
                   "-tumor_sample", tumor_name,
                   "-ref_sample", dd.get_sample_name(paired.normal_data)]
            if vrn_files:
                cmd += ["-somatic_vcf", vrn_files[0]["vrn_file"]]
            # Avoid X11 display errors when writing plots
            cmd = "unset DISPLAY && %s" % " ".join([str(x) for x in cmd])
            do.run(cmd, "PURPLE: purity and ploidy estimation")
            # The transaction moves tx_out_file itself; move the other outputs by hand
            for fname in os.listdir(tx_dir):
                if fname != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(tx_dir, fname), os.path.join(purple_dir, fname))
    out_file_export = os.path.join(purple_dir, "%s-purple-cnv.tsv" % tumor_name)
    if not utils.file_exists(out_file_export):
        utils.symlink_plus(out_file, out_file_export)
    out = {"variantcaller": "purple", "call_file": out_file_export,
           "vrn_file": titancna.to_vcf(out_file_export, "PURPLE", _get_header, _export_to_vcf, tumor),
           "plot": {}, "metrics": {}}
    for name, ext in [("copy_number", "copyNumber"), ("minor_allele", "minor_allele"),
                      ("variant", "variant")]:
        plot_file = os.path.join(purple_dir, "plot", "%s.%s.png" % (tumor_name, ext))
        if os.path.exists(plot_file):
            out["plot"][name] = plot_file
    purity_file = os.path.join(purple_dir, "%s.purple.purity" % tumor_name)
    with open(purity_file) as in_handle:
        # Drop the line terminator so the final column name and value do not
        # carry a trailing newline.
        header = in_handle.readline().rstrip("\r\n").replace("#", "").split("\t")
        vals = in_handle.readline().rstrip("\r\n").split("\t")
    for h, v in zip(header, vals):
        try:
            v = float(v)
        except ValueError:
            # Non-numeric metrics are kept as strings
            pass
        out["metrics"][h] = v
    return out
def find_annotations(data, retriever=None):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling

    Returns a list of configuration files, each followed by its matching
    ``.lua`` file when that file exists. Configurations that are missing or
    fail their input check are skipped with a warning.
    """
    conf_files = dd.get_vcfanno(data)
    if isinstance(conf_files, (list, tuple)):
        # Copy so appending defaults neither modifies the configured value in
        # place nor fails on a tuple.
        conf_files = list(conf_files)
    else:
        conf_files = [conf_files]
    for c in _default_conf_files(data, retriever):
        if c not in conf_files:
            conf_files.append(c)
    conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    out = []
    annodir = os.path.normpath(os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno"))
    if not retriever:
        annodir = os.path.abspath(annodir)
    for conf_file in conf_files:
        if objectstore.is_remote(conf_file) or (os.path.exists(conf_file) and os.path.isfile(conf_file)):
            conffn = conf_file
        elif not retriever:
            conffn = os.path.join(annodir, conf_file + ".conf")
        else:
            conffn = conf_file + ".conf"
        luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        if retriever:
            # Keep only the files the retriever resolved to remote locations
            conffn, luafn = [(x if objectstore.is_remote(x) else None)
                             for x in retriever.add_remotes([conffn, luafn], data["config"])]
        if not conffn:
            continue
        if conf_file in conf_checkers and not conf_checkers[conf_file](data, retriever):
            logger.warning("Skipping vcfanno configuration: %s. Not all input files found." % conf_file)
        elif not objectstore.file_exists_or_remote(conffn):
            build = dd.get_genome_build(data)
            logger.warning(
                "The vcfanno configuration {conffn} was not found for {build}, skipping.".format(
                    conffn=conffn, build=build))
        else:
            out.append(conffn)
            if luafn and objectstore.file_exists_or_remote(luafn):
                out.append(luafn)
    return out
def _sample_template(sample, out_dir):
    """Write R code to get ChIPQC results for one sample.

    The script is written to ``<out_dir>/chipqc.r`` only when the sample's
    genome build is in ``supported`` and a ``main`` peaks file is available.

    Returns the path to the R script, or None when nothing was written.
    """
    bam_fn = dd.get_work_bam(sample)
    genome = dd.get_genome_build(sample)
    if genome not in supported:
        return None
    # A sample without peak calls has no "peaks_files" entry; treat it as empty
    peaks = (sample.get("peaks_files") or {}).get("main")
    if not peaks:
        return None
    r_code = ("library(ChIPQC);\n"
              "sample = ChIPQCsample(\"{bam_fn}\","
              "\"{peaks}\", "
              "annotation = \"{genome}\","
              ");\n"
              "ChIPQCreport(sample);\n")
    r_code_fn = os.path.join(out_dir, "chipqc.r")
    with open(r_code_fn, 'w') as out_handle:
        out_handle.write(r_code.format(bam_fn=bam_fn, peaks=peaks, genome=genome))
    return r_code_fn
def _sample_template(sample, out_dir):
    """Write R code to get ChIPQC results for one sample.

    The script is written to ``<out_dir>/chipqc.r`` only when the sample's
    genome build is in ``supported`` and a ``main`` peaks file is available.

    Returns the path to the R script, or None when nothing was written.
    """
    bam_fn = dd.get_work_bam(sample)
    genome = dd.get_genome_build(sample)
    if genome not in supported:
        return None
    # A sample without peak calls has no "peaks_files" entry; treat it as empty
    peaks = (sample.get("peaks_files") or {}).get("main")
    if not peaks:
        return None
    r_code = ("library(ChIPQC);\n"
              "sample = ChIPQCsample(\"{bam_fn}\","
              "\"{peaks}\", "
              "annotation = \"{genome}\","
              ");\n"
              "ChIPQCreport(sample);\n")
    r_code_fn = os.path.join(out_dir, "chipqc.r")
    with open(r_code_fn, 'w') as out_handle:
        out_handle.write(r_code.format(bam_fn=bam_fn, peaks=peaks, genome=genome))
    return r_code_fn
def _generate_estimates(bam_file, out_base, failed_file, exts, data):
    """Run verifybamid2 contamination estimation and fix sample names in its output.

    Failures reporting insufficient markers or no reads in the target regions
    are written to ``failed_file`` and skipped; other failures are logged and
    re-raised. On success the ``.selfSM`` file is rewritten with corrected
    sample names and the outputs named by ``exts`` plus ``.selfSM`` are
    copied next to ``out_base``.
    """
    build = "b38" if dd.get_genome_build(data) == "hg38" else "b37"
    background = {"dataset": "1000g.phase3", "nvars": "100k", "build": build}
    with file_transaction(data, out_base) as tx_out_base:
        cmd = ["verifybamid2", background["dataset"], background["nvars"], background["build"],
               "--Reference", dd.get_ref_file(data), "--Output", tx_out_base]
        cmd += _get_input_args(bam_file, data, out_base, background)
        try:
            do.run(cmd, "VerifyBamID contamination checks")
        except subprocess.CalledProcessError as err:
            def allowed_errors(l):
                return ("Insufficient Available markers" in l
                        or "No reads found in any of the regions" in l)
            if not any(allowed_errors(l) for l in str(err).split("\n")):
                logger.warning(str(err))
                raise
            logger.info("Skipping VerifyBamID, not enough overlapping markers found: %s" %
                        dd.get_sample_name(data))
            with open(failed_file, "w") as out_handle:
                out_handle.write(str(err))
        else:
            # Fix any sample name problems, for pileups
            self_sm = tx_out_base + ".selfSM"
            self_sm_orig = tx_out_base + ".selfSM.orig"
            shutil.move(self_sm, self_sm_orig)
            with open(self_sm_orig) as in_handle, open(self_sm, "w") as out_handle:
                pending_name = None
                for line in in_handle:
                    if line.startswith("DefaultSampleName"):
                        line = line.replace("DefaultSampleName", dd.get_sample_name(data))
                    # work around bug in finding SM from BAM RG at end of line:
                    # a single-field line holds the sample name for the next line
                    if len(line.strip().split("\t")) == 1:
                        pending_name = line.strip()
                        continue
                    if pending_name:
                        fields = line.split("\t")
                        fields[0] = pending_name
                        line = "\t".join(fields)
                        pending_name = None
                    if line:
                        out_handle.write(line)
            for ext in exts + [".selfSM"]:
                tx_file = tx_out_base + ext
                if os.path.exists(tx_file):
                    shutil.copy(tx_file, out_base + ext)
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    """Build a RapMap index for the combined transcriptome FASTA.

    ``algorithm`` must be ``pseudo`` or ``quasi``; it is suffixed with
    ``index`` to form the RapMap sub-command. The index is written to
    ``<out_dir>/<index_type>/<genome build>`` with any disambiguation genomes
    appended using ``-``.

    Returns the index directory. ``gtf_file`` and ``ref_file`` are not used.
    """
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    # Previously called the misspelled ``dd.get_disambguate``, which raised
    # AttributeError whenever disambiguation was configured.
    disambiguate = dd.get_disambiguate(data)
    if disambiguate:
        out_dir = "-".join([out_dir] + disambiguate)
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    # Look for the marker file inside the index directory; plain string
    # concatenation dropped the path separator so the check never matched.
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k 31 -i {tx_out_dir} -t {gtf_fa}".format(
            rapmap=rapmap, index_type=index_type, tx_out_dir=tx_out_dir, gtf_fa=gtf_fa)
        message = "Creating rapmap {index_type} for {gtf_fa}.".format(
            index_type=index_type, gtf_fa=gtf_fa)
        do.run(cmd, message, None)
    return out_dir
def _bam_coverage(name, bam_input, data):
    """Run bamCoverage from deeptools

    Writes ``<name>.bw`` next to ``bam_input`` and returns its path. Returns
    None when the bamCoverage program cannot be found. An existing output file
    is reused without re-running.
    """
    # Write to the transactional path so an interrupted run cannot leave a
    # partial file at the final location, where the existence check below would
    # treat it as finished.
    cmd = ("{bam_coverage} -b {bam_input} -o {out_tx} "
           "--binSize 20 --effectiveGenomeSize {size} "
           "--smoothLength 60 --extendReads 150 --centerReads -p {cores}")
    size = int(get_genome(dd.get_genome_build(data)))
    cores = dd.get_num_cores(data)
    try:
        bam_coverage = config_utils.get_program("bamCoverage", data)
    except config_utils.CmdNotFound:
        logger.info("No bamCoverage found, skipping bamCoverage.")
        return None
    bw_output = os.path.join(os.path.dirname(bam_input), "%s.bw" % name)
    if utils.file_exists(bw_output):
        return bw_output
    with file_transaction(bw_output) as out_tx:
        do.run(cmd.format(bam_coverage=bam_coverage, bam_input=bam_input,
                          out_tx=out_tx, size=size, cores=cores),
               "Run bamCoverage in %s" % name)
    return bw_output
def quantitate_expression_parallel(samples, run_parallel):
    """Run the expression quantification steps through ``run_parallel``.

    All programs run here should be multithreaded to take advantage of the
    threaded run_parallel environment. Configuration is read from the first
    sample only. Returns the samples as updated by the last step.
    """
    data = samples[0][0]
    to_index = determine_indexes_to_make(samples)
    samples = run_parallel("generate_transcript_counts", samples)
    optional_callers = (("cufflinks", "run_cufflinks"),
                        ("stringtie", "run_stringtie_expression"))
    for caller, task in optional_callers:
        if caller in dd.get_expression_caller(data):
            samples = run_parallel(task, samples)
    needs_kallisto = ("kallisto" in dd.get_expression_caller(data)
                      or dd.get_fusion_mode(data)
                      or "pizzly" in dd.get_fusion_caller(data, []))
    if needs_kallisto:
        run_parallel("run_kallisto_index", [to_index])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if "sailfish" in dd.get_expression_caller(data):
        run_parallel("run_sailfish_index", [to_index])
        samples = run_parallel("run_sailfish", samples)

    # always run salmon
    run_parallel("run_salmon_index", [to_index])
    if not dd.get_quantify_genome_alignments(data):
        salmon_task = "run_salmon_reads"
    elif dd.get_aligner(data).lower() == "star":
        salmon_task = "run_salmon_bam"
    elif dd.get_genome_build(data) == "hg38":
        logger.warning(
            "Whole genome alignment-based Salmon quantification is "
            "only supported for the STAR aligner. Since this is hg38 we will fall "
            "back to the decoy method")
        salmon_task = "run_salmon_decoy"
    else:
        logger.warning(
            "Whole genome alignment-based Salmon quantification is "
            "only supported for the STAR aligner. Falling back to the "
            "transcriptome-only method.")
        salmon_task = "run_salmon_reads"
    samples = run_parallel(salmon_task, samples)

    samples = run_parallel("detect_fusions", samples)
    return samples
def find_annotations(data):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling

    Returns a list of configuration file paths, each followed by its matching
    ``.lua`` file when one exists alongside it.
    """
    configured = dd.get_vcfanno(data)
    # Work on a private list: appending to the configured value would modify
    # the sample's configuration in place and fails outright for tuples.
    if not configured:
        conf_files = []
    elif isinstance(configured, (list, tuple)):
        conf_files = list(configured)
    else:
        conf_files = [configured]
    for default_conf in _default_conf_files(data):
        if default_conf not in conf_files:
            conf_files.append(default_conf)
    conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    out = []
    annodir = os.path.normpath(
        os.path.abspath(
            os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir,
                         "config", "vcfanno")))
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
        else:
            conffn = os.path.join(annodir, conf_file + ".conf")
        if conf_file in conf_checkers and not conf_checkers[conf_file](data):
            logger.warning(
                "Skipping vcfanno configuration: %s. Not all input files found."
                % conf_file)
        elif not utils.file_exists(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping."
            )
            logger.warning(CONF_NOT_FOUND.format(conffn=conffn, build=build))
        else:
            out.append(conffn)
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
            if os.path.exists(luafn):
                out.append(luafn)
    return out
def run_vcfanno(vcf, conf_files, data, data_basepath=None):
    """Annotate a VCF file using vcfanno.

    Each entry in ``conf_files`` is either a path to an existing configuration
    file or a name looked up as ``<name>.conf`` / ``<name>.lua`` in the
    ``config/vcfanno`` directory one level above the reference file's
    directory. Entries whose configuration file is missing are skipped with a
    warning.

    Returns the annotated ``.vcf.gz`` path, or the input ``vcf`` unchanged when
    no configuration file could be found.
    """
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(
        os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conf_fns = []
    lua_fns = []
    anno_type = None
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        else:
            # The last named (non-path) configuration decides the output suffix.
            anno_type = os.path.basename(conf_file)
            conffn = os.path.join(annodir, anno_type + ".conf")
            luafn = os.path.join(annodir, anno_type + ".lua")
        if not utils.file_exists(conffn):
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping."
            )
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning(CONF_NOT_FOUND.format(conffn=conffn, build=build))
        else:
            conf_fns.append(conffn)
            lua_fns.append(luafn)
    if not conf_fns:
        return vcf
    if not anno_type:
        anno_type = "gemini"
    out_file = utils.splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    out_file = vcfanno(vcf, out_file, conf_fns, data, data_basepath or basepath,
                       lua_fns)
    return out_file
def kallisto_index(gtf_file, ref_file, data, out_dir):
    """Build a kallisto index, reusing an existing one when present.

    The index is named after the genome build, with any disambiguation genomes
    appended using ``-``, and lives at ``<out_dir>/index/<stem>/<stem>.idx``.
    The configured transcriptome FASTA is indexed when available, otherwise a
    combined FASTA from ``sailfish.create_combined_fasta``.

    Returns the index file path. ``gtf_file`` and ``ref_file`` are not used.
    """
    stem_parts = [dd.get_genome_build(data)]
    if dd.get_disambiguate(data):
        stem_parts = stem_parts + dd.get_disambiguate(data)
    out_stem = "-".join(stem_parts)
    out_file = os.path.join(out_dir, "index", out_stem, out_stem + ".idx")
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    gtf_fa = dd.get_transcriptome_fasta(data)
    if not gtf_fa:
        gtf_fa = sailfish.create_combined_fasta(data)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cmd = "{kallisto} index -k 31 -i {tx_out_file} {gtf_fa}".format(
                kallisto=kallisto, tx_out_file=tx_out_file, gtf_fa=gtf_fa)
            message = "Creating Kallisto index for {gtf_fa}.".format(gtf_fa=gtf_fa)
            do.run(cmd, message, None)
    return out_file
def salmon_index(gtf_file, ref_file, data, out_dir):
    """Build a Salmon index, reusing an existing one when present.

    The index directory is ``<out_dir>/index/<genome build>`` with any
    disambiguation genomes appended using ``-``. An index is considered
    complete when it contains ``versionInfo.json``.

    Returns the index directory. ``gtf_file`` and ``ref_file`` are not used.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    # Previously called the misspelled ``dd.get_disambguate``, which raised
    # AttributeError whenever disambiguation was configured.
    disambiguate = dd.get_disambiguate(data)
    if disambiguate:
        out_dir = "-".join([out_dir] + disambiguate)
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    gtf_fa = dd.get_transcriptome_fasta(data)
    if not gtf_fa:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}".format(
            salmon=salmon, num_cores=num_cores, tx_out_dir=tx_out_dir, gtf_fa=gtf_fa)
        message = "Creating Salmon index for {gtf_fa}.".format(gtf_fa=gtf_fa)
        do.run(cmd, message, None)
    return out_dir
def kallisto_index(gtf_file, ref_file, data, out_dir):
    """Build a kallisto index, reusing an existing one when present.

    The index is named after the genome build, with any disambiguation genomes
    appended using ``-``, and lives at ``<out_dir>/index/<stem>/<stem>.idx``.
    The configured transcriptome FASTA is indexed when available, otherwise a
    combined FASTA from ``sailfish.create_combined_fasta``.

    Returns the index file path. ``gtf_file`` and ``ref_file`` are not used.
    """
    stem_parts = [dd.get_genome_build(data)]
    if dd.get_disambiguate(data):
        stem_parts = stem_parts + dd.get_disambiguate(data)
    out_stem = "-".join(stem_parts)
    out_file = os.path.join(out_dir, "index", out_stem, out_stem + ".idx")
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    gtf_fa = dd.get_transcriptome_fasta(data)
    if not gtf_fa:
        gtf_fa = sailfish.create_combined_fasta(data)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cmd = "{kallisto} index -k 31 -i {tx_out_file} {gtf_fa}".format(
                kallisto=kallisto, tx_out_file=tx_out_file, gtf_fa=gtf_fa)
            message = "Creating Kallisto index for {gtf_fa}.".format(gtf_fa=gtf_fa)
            do.run(cmd, message, None)
    return out_file
def salmon_index(gtf_file, ref_file, data, out_dir):
    """Build a Salmon index, reusing an existing one when present.

    The index directory is ``<out_dir>/index/<genome build>`` with any
    disambiguation genomes appended using ``-``. An index is considered
    complete when it contains ``versionInfo.json``.

    Returns the index directory. ``gtf_file`` and ``ref_file`` are not used.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    # Previously called the misspelled ``dd.get_disambguate``, which raised
    # AttributeError whenever disambiguation was configured.
    disambiguate = dd.get_disambiguate(data)
    if disambiguate:
        out_dir = "-".join([out_dir] + disambiguate)
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    gtf_fa = dd.get_transcriptome_fasta(data)
    if not gtf_fa:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}".format(
            salmon=salmon, num_cores=num_cores, tx_out_dir=tx_out_dir, gtf_fa=gtf_fa)
        message = "Creating Salmon index for {gtf_fa}.".format(gtf_fa=gtf_fa)
        do.run(cmd, message, None)
    return out_dir
def determine_indexes_to_make(samples):
    """Return one sample per distinct transcriptome index.

    Samples are grouped by the combined FASTA path derived from their work
    directory, genome build and disambiguation genomes; the first sample of
    each group is kept so every index is only built once.
    """
    samples = [to_single_data(x) for x in samples]
    indexes = set()
    tomake = []
    for data in samples:
        out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
        out_stem = os.path.join(out_dir, dd.get_genome_build(data))
        # Append the disambiguation genomes once; the block was duplicated and
        # added the suffix twice.
        disambiguate = dd.get_disambiguate(data)
        if disambiguate:
            out_stem = "-".join([out_stem] + disambiguate)
        combined_file = out_stem + ".fa"
        if combined_file not in indexes:
            tomake.append(data)
            indexes.add(combined_file)
    return tomake
def get_coords(data):
    """Retrieve coordinates of genes of interest for prioritization.

    Can read from CIViC input data or a supplied BED file of chrom, start, end
    and gene information.

    Yields ``(category, coords)`` pairs for the ``LOH`` and ``amplification``
    categories, where ``coords`` maps gene name to ``(chrom, start, end)``.
    Entries from the priority file are added on top of the built-in
    ``_COORDS`` for the sample's genome build.
    """
    for category, vtypes in [("LOH", {"LOSS", "HETEROZYGOSITY"}),
                             ("amplification", {"AMPLIFICATION"})]:
        # Copy before adding genes: get_in hands back the dictionary stored in
        # the module-level _COORDS, so updating it in place would leak one
        # sample's priority genes into every later call.
        out = dict(tz.get_in([category, dd.get_genome_build(data)], _COORDS, {}))
        priority_file = dd.get_svprioritize(data)
        if priority_file:
            priority_name = os.path.basename(priority_file)
            if priority_name.find("civic") >= 0:
                for chrom, start, end, gene in _civic_regions(priority_file, vtypes,
                                                              dd.get_disease(data)):
                    out[gene] = (chrom, start, end)
            elif priority_name.find(".bed") >= 0:
                for line in utils.open_gzipsafe(priority_file):
                    parts = line.strip().split("\t")
                    # Lines with fewer than four columns carry no gene name.
                    if len(parts) >= 4:
                        chrom, start, end, gene = parts[:4]
                        out[gene] = (chrom, int(start), int(end))
        yield category, out
def _get_input_args(bam_file, data, out_base, background=None):
    """Retrieve input args, depending on genome build.

    VerifyBamID2 only handles GRCh37 (1, 2, 3) not hg19, so need to generate
    a pileup for hg19 and fix chromosome naming.

    ``background`` is accepted so the four-argument call made by
    ``_generate_estimates`` does not raise TypeError; it is not used here.

    Returns ``["--PileupFile", <pileup>]`` for hg19 and
    ``["--BamFile", bam_file]`` for every other build.
    """
    if dd.get_genome_build(data) not in ["hg19"]:
        return ["--BamFile", bam_file]
    out_file = "%s-mpileup.txt" % out_base
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            mpileup_cl = samtools.prep_mpileup(
                [bam_file], dd.get_ref_file(data), data["config"], want_bcf=False,
                target_regions=_get_autosomal_bed(data, tx_out_file))
            # Strip the "chr" prefix so contig names match GRCh37 naming.
            cl = ("{mpileup_cl} | sed 's/^chr//' > {tx_out_file}")
            do.run(cl.format(mpileup_cl=mpileup_cl, tx_out_file=tx_out_file),
                   "Create pileup from BAM input")
    return ["--PileupFile", out_file]
def process_intervals(data):
    """Prepare intervals file

    Runs PureCN's ``IntervalFile.R`` on the structural variant BED file, or on
    the cleaned variant regions when there is none. An existing
    ``<bed basename>.txt`` is reused.

    Returns the interval file path, or None when no BED file is available.
    The path is also returned when the PureCN command fails, in which case
    the file may not exist.
    """
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None
    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("base")
    interval_file_r = utils.R_package_script("PureCN",
                                             "extdata/IntervalFile.R",
                                             env="base")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    tools_off = dd.get_tools_off(data)
    cmd = [rscript, interval_file_r,
           "--in-file", bed_file,
           "--fasta", ref_file,
           "--out-file", ready_file]
    # Only add the flag when wanted instead of placing an empty string in the
    # command.
    if not (tools_off and "purecn_offtarget" in tools_off):
        cmd.append("--off-target")
    cmd += ["--genome", genome,
            "--export", optimized_bed,
            "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (
            utils.R_sitelib(env="base"),
            utils.get_R_exports(env="base"),
            " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError:
        logger.info("PureCN failed to prepare intervals")
    else:
        # Report success only when the command actually completed.
        logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped minimap2 alignment of fastq input files into a BAM.

    The output is ``<sample name>-sort.bam`` in ``align_dir``, or the split
    output chosen by ``alignprep.setup_combine`` when ``align_split`` is set.
    Alignment is skipped when the output (or the final combined file) exists.

    Returns ``data`` with ``work_bam`` set to the output file.
    """
    sample_name = dd.get_sample_name(data)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(sample_name))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"
    if not pair_file:
        pair_file = ""
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    already_done = (utils.file_exists(out_file)
                    or (final_file is not None and utils.file_exists(final_file)))
    if not already_done:
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            # If a single index present, index_dir points to that
            index_file = None
            if index_dir and os.path.isfile(index_dir):
                index_dir = os.path.dirname(index_dir)
                index_file = os.path.join(
                    index_dir, "%s-%s.mmi" % (dd.get_genome_build(data), preset))
            # Fall back to the reference FASTA when no usable index was found.
            if not index_file or not os.path.exists(index_file):
                index_file = dd.get_ref_file(data)
            cmd = (
                "minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                "{fastq_file} {pair_file} | ").format(
                    preset=preset, rg_info=rg_info, num_cores=num_cores,
                    index_file=index_file, fastq_file=fastq_file, pair_file=pair_file)
            do.run(cmd + tobam_cl,
                   "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
def create_combined_tx2gene(data):
    """Concatenate per-genome tx2gene CSV files into a single ``tx2gene.csv``.

    One ``<genome build>-tx2gene.csv`` is created (or reused) per item returned
    by ``disambiguate.split``; the files are then joined with ``cat`` under
    ``<work dir>/inputs/transcriptome``. Returns the combined file path.
    """
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    tx2gene_files = []
    for item in disambiguate.split([data]):
        odata = item[0]
        gtf_file = dd.get_gtf_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + "-tx2gene.csv")
        if not file_exists(out_file):
            out_file = gtf.tx2genefile(gtf_file, out_file, tsv=False)
        tx2gene_files.append(out_file)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if not file_exists(combined_file):
        tx2gene_file_string = " ".join(tx2gene_files)
        with file_transaction(data, combined_file) as tx_out_file:
            cmd = "cat {tx2gene_file_string} > {tx_out_file}".format(
                tx2gene_file_string=tx2gene_file_string, tx_out_file=tx_out_file)
            do.run(cmd, "Combining tx2gene CSV files.")
    return combined_file
def find_annotations(data):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations if not specified:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling

    Returns a list of configuration file paths, each followed by its matching
    ``.lua`` file when one exists alongside it.
    """
    conf_files = dd.get_vcfanno(data)
    if not conf_files:
        conf_files = _default_conf_files(data)
    # Work on a private list: appending "gemini" below would otherwise modify
    # the sample's configuration in place and fails outright for tuples.
    if isinstance(conf_files, (list, tuple)):
        conf_files = list(conf_files)
    else:
        conf_files = [conf_files]
    gemini_tools = ["gemini", "gemini_orig", "gemini_allvariants", "vcf2db_expand"]
    if any(x in dd.get_tools_on(data) for x in gemini_tools):
        if annotate_gemini(data) and "gemini" not in conf_files:
            conf_files.append("gemini")
    out = []
    annodir = os.path.normpath(os.path.abspath(os.path.join(
        os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno")))
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
        else:
            conffn = os.path.join(annodir, conf_file + ".conf")
        if not utils.file_exists(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning(CONF_NOT_FOUND.format(conffn=conffn, build=build))
        else:
            out.append(conffn)
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
            if os.path.exists(luafn):
                out.append(luafn)
    return out
def get_coords(data):
    """Retrieve coordinates of genes of interest for prioritization.

    Can read from CIViC input data or a supplied BED file of chrom, start, end
    and gene information.

    Yields ``(category, coords)`` pairs for the ``LOH`` and ``amplification``
    categories, where ``coords`` maps gene name to ``(chrom, start, end)``.
    Entries from the priority file are added on top of the built-in
    ``_COORDS`` for the sample's genome build.
    """
    for category, vtypes in [("LOH", {"LOSS", "HETEROZYGOSITY"}),
                             ("amplification", {"AMPLIFICATION"})]:
        # Copy before adding genes: get_in hands back the dictionary stored in
        # the module-level _COORDS, so updating it in place would leak one
        # sample's priority genes into every later call.
        out = dict(tz.get_in([category, dd.get_genome_build(data)], _COORDS, {}))
        priority_file = dd.get_svprioritize(data)
        if priority_file:
            priority_name = os.path.basename(priority_file)
            if priority_name.find("civic") >= 0:
                for chrom, start, end, gene in _civic_regions(priority_file, vtypes,
                                                              dd.get_disease(data)):
                    out[gene] = (chrom, start, end)
            elif priority_name.find(".bed") >= 0:
                for line in utils.open_gzipsafe(priority_file):
                    parts = line.strip().split("\t")
                    # Lines with fewer than four columns carry no gene name.
                    if len(parts) >= 4:
                        chrom, start, end, gene = parts[:4]
                        out[gene] = (chrom, int(start), int(end))
        yield category, out
def run_vcfanno(vcf, conf_files, data, data_basepath=None):
    """Annotate a VCF file using vcfanno and return a bgzipped, indexed result.

    Each entry in ``conf_files`` is either a path to an existing configuration
    file or a name looked up as ``<name>.conf`` / ``<name>.lua`` in the
    ``config/vcfanno`` directory one level above the reference file's
    directory. Entries whose configuration file is missing are skipped with a
    warning.

    Returns the value of ``vcfutils.bgzip_and_index`` for the annotated file,
    or the input ``vcf`` unchanged when no configuration file could be found.
    """
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(os.path.join(os.path.dirname(dd.get_ref_file(data)),
                                            os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conf_fns = []
    lua_fns = []
    anno_type = None
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        else:
            # The last named (non-path) configuration decides the output suffix.
            anno_type = os.path.basename(conf_file)
            conffn = os.path.join(annodir, anno_type + ".conf")
            luafn = os.path.join(annodir, anno_type + ".lua")
        if not utils.file_exists(conffn):
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning(CONF_NOT_FOUND.format(conffn=conffn, build=build))
        else:
            conf_fns.append(conffn)
            lua_fns.append(luafn)
    # Nothing to annotate with: hand back the input rather than referencing an
    # output path that was never assigned.
    if not conf_fns:
        return vcf
    if not anno_type:
        anno_type = "gemini"
    out_file = utils.splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
    if not utils.file_exists(out_file):
        out_file = vcfanno(vcf, out_file, conf_fns, data, data_basepath or basepath,
                           lua_fns)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def run_vcfanno(vcf, anno_type, data, data_basepath=None):
    """Annotate a VCF file using vcfanno with a single named configuration.

    ``<anno_type>.conf`` and ``<anno_type>.lua`` are looked up in the
    ``config/vcfanno`` directory one level above the reference file's
    directory.

    Returns the annotated ``.vcf.gz`` path (reusing an existing file), or the
    input ``vcf`` unchanged when the configuration file is missing.
    """
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(os.path.join(os.path.dirname(dd.get_ref_file(data)),
                                            os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conffn = os.path.join(annodir, anno_type + ".conf")
    luafn = os.path.join(annodir, anno_type + ".lua")
    CONF_NOT_FOUND = (
        "The vcfanno configuration {conffn} was not found for {build}, skipping.")
    if not utils.file_exists(conffn):
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning(CONF_NOT_FOUND.format(conffn=conffn, build=build))
        return vcf
    out_file = utils.splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    out_file = vcfanno(vcf, out_file, conffn, data, data_basepath or basepath, luafn)
    return out_file
def support_gemini_orig(data):
    """Return True when the sample's genome build is hg19 or GRCh37."""
    genome_build = dd.get_genome_build(data)
    return genome_build in {"hg19", "GRCh37"}
def _merge_target_information(samples, metrics_dir):
    """Write ``target_info.yaml`` summarizing genome and target regions.

    Genome, variant region and coverage BED details are only reported when
    they are identical across all samples; values are read from the first
    sample. Nothing is rewritten when the file already exists. Returns
    ``samples`` unchanged.
    """
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples

    genomes = {dd.get_genome_build(s) for s in samples}
    coverage_beds = {dd.get_coverage(s) for s in samples}
    original_variant_regions = {dd.get_variant_regions_orig(s) for s in samples}

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum(c.size for c in contigs),
        }

    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    if len(original_variant_regions) == 1:
        vcr_orig = next(iter(original_variant_regions))
    if vcr_orig is not None:
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        merged_regions = pybedtools.BedTool(dd.get_variant_regions_merged(data))
        info["variants_regions_info"] = {
            "bed": vcr_orig,
            "size": sum(len(x) for x in merged_regions),
            "regions": pybedtools.BedTool(vcr_clean).count(),
        }
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            info["variants_regions_info"]["genes"] = gene_num
    else:
        info["variants_regions_info"] = {
            "bed": "callable regions",
        }

    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = next(iter(coverage_beds))
        if cov_bed in [None, "None"]:
            info["coverage_bed_info"] = info["variants_regions_info"]
        elif vcr_orig and vcr_orig == cov_bed:
            info["coverage_bed_info"] = info["variants_regions_info"]
        else:
            clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True)
            info["coverage_bed_info"] = {
                "bed": cov_bed,
                "size": pybedtools.BedTool(cov_bed).total_coverage(),
                "regions": pybedtools.BedTool(clean_bed).count(),
            }
            gene_num = annotate.count_genes(clean_bed, data)
            if gene_num is not None:
                info["coverage_bed_info"]["genes"] = gene_num

    coverage_intervals = {s["config"]["algorithm"]["coverage_interval"] for s in samples}
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = next(iter(coverage_intervals))

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
def _merge_target_information(samples, metrics_dir):
    """Write ``target_info.yaml`` summarizing genome and target regions.

    Genome, variant region and coverage BED details are only reported when
    they are identical across all samples; values are read from the first
    sample. Nothing is rewritten when the file already exists. Returns
    ``samples`` unchanged.
    """
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples

    genomes = {dd.get_genome_build(s) for s in samples}
    coverage_beds = {dd.get_coverage(s) for s in samples}
    original_variant_regions = {dd.get_variant_regions_orig(s) for s in samples}

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum(c.size for c in contigs),
        }

    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    if len(original_variant_regions) == 1:
        vcr_orig = next(iter(original_variant_regions))
    if vcr_orig is not None:
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        merged_regions = pybedtools.BedTool(dd.get_variant_regions_merged(data))
        info["variants_regions_info"] = {
            "bed": vcr_orig,
            "size": sum(len(x) for x in merged_regions),
            "regions": pybedtools.BedTool(vcr_clean).count(),
        }
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            info["variants_regions_info"]["genes"] = gene_num
    else:
        info["variants_regions_info"] = {
            "bed": "callable regions",
        }

    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = next(iter(coverage_beds))
        if cov_bed in [None, "None"]:
            info["coverage_bed_info"] = info["variants_regions_info"]
        elif vcr_orig and vcr_orig == cov_bed:
            info["coverage_bed_info"] = info["variants_regions_info"]
        else:
            clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True)
            info["coverage_bed_info"] = {
                "bed": cov_bed,
                "size": pybedtools.BedTool(cov_bed).total_coverage(),
                "regions": pybedtools.BedTool(clean_bed).count(),
            }
            gene_num = annotate.count_genes(clean_bed, data)
            if gene_num is not None:
                info["coverage_bed_info"]["genes"] = gene_num

    coverage_intervals = {s["config"]["algorithm"]["coverage_interval"] for s in samples}
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = next(iter(coverage_intervals))

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
def get_build_string(data):
    """Return the genome build, joined with any disambiguation genomes by ``-``."""
    genome_build = dd.get_genome_build(data)
    extra_genomes = dd.get_disambiguate(data)
    if not extra_genomes:
        return genome_build
    return "-".join([genome_build] + extra_genomes)
def run(bam_file, data, out_dir):
    """Run qualimap bamqc to assess alignment quality metrics.

    Returns a dictionary with the ``base`` results file (copied into
    ``out_dir``) and the ``secondary`` files found in the sample directory.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    sample_name = dd.get_sample_name(data)
    results_dir = os.path.join(out_dir, sample_name)
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    have_results = (utils.file_exists(results_file)
                    or utils.file_exists(os.path.join(results_dir, pdf_file)))
    if not have_results:
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            # Use the downsampled BAM when one is returned.
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            if ds_bam:
                bam_file = ds_bam
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                   "--skip-duplicated --skip-dup-mode 0 "
                   "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species:
                cmd += " -gd {species}"
            coverage = dd.get_coverage(data)
            if coverage not in [None, False, "None"]:
                regions = coverage
            else:
                regions = dd.get_variant_regions_merged(data)
            bed6_regions = None
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            full_cmd = cmd.format(export=export, qualimap=qualimap, bam_file=bam_file,
                                  tx_results_dir=tx_results_dir, num_cores=num_cores,
                                  options=options, species=species,
                                  bed6_regions=bed6_regions)
            do.run(full_cmd, "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env)
            # Replace the BAM name in the report with the sample name.
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    secondary = _find_qualimap_secondary_files(results_dir, base_results_file)
    return {"base": base_results_file, "secondary": secondary}
def _run_purple(paired, het_file, depth_file, work_dir):
    """Run PURPLE with pre-calculated AMBER and COBALT compatible inputs.

    XXX Need to add output conversion into VCF for standard formats

    Outputs are written to ``<work_dir>/purple``. Returns a dictionary with the
    ``variantcaller`` name, the exported ``call_file``, any ``plot`` files that
    exist and the ``metrics`` read from the two-line ``.purple.purity`` file.
    """
    tumor_name = dd.get_sample_name(paired.tumor_data)
    purple_dir = utils.safe_makedir(os.path.join(work_dir, "purple"))
    out_file = os.path.join(purple_dir, "%s.purple.cnv" % tumor_name)
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            cmd = ["PURPLE", "-amber", os.path.dirname(het_file), "-baf", het_file,
                   "-cobalt", os.path.dirname(depth_file),
                   "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"],
                   "-output_dir", tx_dir,
                   "-ref_genome", "hg38" if dd.get_genome_build(paired.tumor_data) == "hg38" else "hg19",
                   "-run_dir", work_dir,
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor_sample", tumor_name,
                   "-ref_sample", dd.get_sample_name(paired.normal_data)]
            # Avoid X11 display errors when writing plots
            cmd = "unset DISPLAY && %s" % " ".join([str(x) for x in cmd])
            do.run(cmd, "PURPLE: purity and ploidy estimation")
            # Move the additional outputs; the main file is handled by the
            # surrounding transaction.
            for f in os.listdir(tx_dir):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(tx_dir, f), os.path.join(purple_dir, f))
    out_file_export = os.path.join(purple_dir, "%s-purple-cnv.tsv" % tumor_name)
    if not utils.file_exists(out_file_export):
        utils.symlink_plus(out_file, out_file_export)
    out = {"variantcaller": "purple", "call_file": out_file_export,
           "plot": {}, "metrics": {}}
    for name, ext in [("copy_number", "copyNumber"), ("minor_allele", "minor_allele"),
                      ("variant", "variant")]:
        plot_file = os.path.join(purple_dir, "plot", "%s.%s.png" % (tumor_name, ext))
        if os.path.exists(plot_file):
            out["plot"][name] = plot_file
    purity_file = os.path.join(purple_dir, "%s.purple.purity" % tumor_name)
    with open(purity_file) as in_handle:
        # Drop the line terminator before splitting; otherwise the last metric
        # name (and a non-numeric last value) keeps a trailing newline.
        header = in_handle.readline().rstrip("\r\n").replace("#", "").split("\t")
        vals = in_handle.readline().rstrip("\r\n").split("\t")
    for h, v in zip(header, vals):
        try:
            v = float(v)
        except ValueError:
            # Keep non-numeric values as strings.
            pass
        out["metrics"][h] = v
    return out