def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, names["lane"])
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % names["lane"])
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)
    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outSAMunmapped Within")
    cmd += _read_group_option(names)
    fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = get_in(data, ("config", "algorithm", "strandedness"),
                          "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif"
    run_message = "Running STAR aligner on %s and %s." % (fastq_file, ref_file)
    do.run(cmd.format(**locals()), run_message, None)
    out_file = bam.sam_to_bam(out_file, config)
    out_file = _fix_sam_header(out_file, config)
    if not file_exists(final_out):
        symlink_plus(out_file, final_out)
    return final_out
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data): """Perform piped alignment of fastq input files, generating sorted output BAM. """ pair_file = pair_file if pair_file else "" out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"])) qual_format = data["config"]["algorithm"].get("quality_format", "").lower() if data.get("align_split"): final_file = out_file out_file, data = alignprep.setup_combine(final_file, data) fastq_file = alignprep.split_namedpipe_cl(fastq_file, data) if pair_file: pair_file = alignprep.split_namedpipe_cl(pair_file, data) else: final_file = None if qual_format == "illumina": fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data) if pair_file: pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data) rg_info = novoalign.get_rg_info(names) if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)): # If we cannot do piping, use older bwa aln approach if not _can_use_mem(fastq_file, data): out_file = _align_backtrack(fastq_file, pair_file, ref_file, out_file, names, rg_info, data) else: out_file = _align_mem(fastq_file, pair_file, ref_file, out_file, names, rg_info, data) data["work_bam"] = out_file return data
def align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
          names=None):
    """Perform a BWA alignment, generating a SAM file.
    """
    config = data["config"]
    sai1_file = os.path.join(align_dir, "%s_1.sai" % out_base)
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % out_base)
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not utils.file_exists(sam_file):
        if not utils.file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)
        if sai2_file and not utils.file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
        sam_cl = [config_utils.get_program("bwa", config), align_type, ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file) as tx_sam_file:
            cmd = "{cl} > {out_file}".format(cl=" ".join(sam_cl), out_file=tx_sam_file)
            do.run(cmd, "bwa {align_type}".format(**locals()), None)
    return sam_file
def combine_bed_by_size(input_beds, sample, work_dir, data, delim=","): """Combine a set of BED files, breaking into individual size chunks. """ out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample) if len(input_beds) > 0: size_beds = [] for e_start, e_end in validate.EVENT_SIZES: base, ext = os.path.splitext(out_file) size_out_file = "%s-%s_%s%s" % (base, e_start, e_end, ext) if not utils.file_exists(size_out_file): with file_transaction(data, size_out_file) as tx_out_file: with shared.bedtools_tmpdir(data): all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0] has_regions = False with open(all_file, "w") as out_handle: for line in fileinput.input(input_beds): chrom, start, end, event_str = line.split()[:4] event = event_str.split("_", 1)[0] size = int(end) - int(start) if size >= e_start and size < e_end or event == "BND": out_handle.write(line) has_regions = True if has_regions: pybedtools.BedTool(all_file).sort(stream=True)\ .merge(c=4, o="distinct", delim=delim).saveas(tx_out_file) if utils.file_exists(size_out_file): ann_size_out_file = annotate.add_genes(size_out_file, data) size_beds.append(ann_size_out_file) if len(size_beds) > 0: out_file = bedutils.combine(size_beds, out_file, data) return out_file
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    aligner_index = _get_aligner_index(aligner, data)
    align_dir = utils.safe_makedir(os.path.join(*align_dir_parts))
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_index, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_index,
                                 ref_file, names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        if data.get("align_split") and dd.get_mark_duplicates(data):
            # If merging later with bamsormadup we need query sorted inputs,
            # but CWL requires a bai file. Create a fake one to make it happy.
            bam.fake_index(data["work_bam"], data)
        else:
            bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data
def _fetch_chrom_sizes(config):
    PROGRAM = "fetchChromSizes"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/ "
                     "to download it." % (PROGRAM))
        exit(1)
    if "annotation" not in config:
        logger.error("'annotation' must be in the yaml file. See example "
                     "configuration files.")
        exit(1)
    if "name" not in config["annotation"]:
        logger.error("'name' must be in the yaml file under 'annotation'. "
                     "See example configuration files.")
        exit(1)
    genome = config["annotation"]["name"]
    chrom_size_file = os.path.join(_results_dir(config), genome + ".sizes")
    if file_exists(chrom_size_file):
        return chrom_size_file
    with file_transaction(chrom_size_file) as tmp_chrom_size_file:
        sh.fetchChromSizes(genome, _out=tmp_chrom_size_file)
    if not file_exists(chrom_size_file):
        logger.error("chromosome size file does not exist. Check "
                     "'annotation': 'name' to make sure it is valid.")
        exit(1)
    return chrom_size_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data): """Perform piped alignment of fastq input files, generating sorted output BAM. """ pair_file = pair_file if pair_file else "" out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"])) if data.get("align_split"): final_file = out_file out_file, data = alignprep.setup_combine(final_file, data) fastq_file = alignprep.split_namedpipe_cl(fastq_file, data) if pair_file: pair_file = alignprep.split_namedpipe_cl(pair_file, data) else: final_file = None samtools = config_utils.get_program("samtools", data["config"]) novoalign = config_utils.get_program("novoalign", data["config"]) resources = config_utils.get_resources("novoalign", data["config"]) num_cores = data["config"]["algorithm"].get("num_cores", 1) max_mem = resources.get("memory", "1G") extra_novo_args = " ".join(_novoalign_args_from_config(data["config"])) rg_info = get_rg_info(names) if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)): with utils.curdir_tmpdir(data) as work_dir: with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file): tx_out_prefix = os.path.splitext(tx_out_file)[0] cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} " " -c {num_cores} {extra_novo_args} | ") cmd = cmd.format(**locals()) + tobam_cl do.run(cmd, "Novoalign: %s" % names["sample"], None, [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)]) data["work_bam"] = out_file return data
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with utils.curdir_tmpdir() as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort"
                                                % os.path.splitext(os.path.basename(in_bam))[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
def _run_amber(paired, work_dir, lenient=False):
    """AMBER: calculate allele frequencies at likely heterozygous sites.

    lenient flag allows amber runs on small test sets.
    """
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".pcf"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            key = "germline_het_pon"
            het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
            cmd = ["AMBER"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor", dd.get_sample_name(paired.tumor_data),
                   "-tumor_bam", dd.get_align_bam(paired.tumor_data),
                   "-reference", dd.get_sample_name(paired.normal_data),
                   "-reference_bam", dd.get_align_bam(paired.normal_data),
                   "-ref_genome", dd.get_ref_file(paired.tumor_data),
                   "-bed", het_bed,
                   "-output_dir", os.path.dirname(tx_out_file)]
            if lenient:
                cmd += ["-max_het_af_percent", "1.0"]
            try:
                do.run(cmd, "PURPLE: AMBER baf generation")
            except subprocess.CalledProcessError as msg:
                if not lenient and _amber_allowed_errors(str(msg)):
                    return _run_amber(paired, work_dir, True)
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(amber_dir, f))
    return out_file
def _run_delly(bam_files, chrom, sv_type, ref_file, work_dir, items):
    """Run delly, calling structural variations for the specified type.
    """
    out_file = os.path.join(work_dir, "%s-svs%s-%s.vcf"
                            % (os.path.splitext(os.path.basename(bam_files[0]))[0], sv_type, chrom))
    cores = min(utils.get_in(items[0], ("config", "algorithm", "num_cores"), 1),
                len(bam_files))
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            if not _has_variant_regions(items, out_file, chrom):
                vcfutils.write_empty_vcf(tx_out_file)
            else:
                exclude = ["-x", prepare_exclude_file(items, out_file, chrom)]
                cmd = ["delly", "-t", sv_type, "-g", ref_file, "-o", tx_out_file] + exclude + bam_files
                multi_cmd = "export OMP_NUM_THREADS=%s && " % cores
                try:
                    do.run(multi_cmd + " ".join(cmd), "delly structural variant")
                    # Delly will write nothing if no variants found
                    if not utils.file_exists(tx_out_file):
                        vcfutils.write_empty_vcf(tx_out_file)
                except subprocess.CalledProcessError as msg:
                    # delly returns an error exit code if there are no variants
                    if "No structural variants found" in str(msg):
                        vcfutils.write_empty_vcf(tx_out_file)
                    else:
                        raise
    return out_file
def split_gtf(gtf, sample_size=None, out_dir=None):
    """
    split a GTF file into two equal parts, randomly selecting genes.
    sample_size will select up to sample_size genes in total
    """
    if out_dir:
        part1_fn = os.path.basename(os.path.splitext(gtf)[0]) + ".part1.gtf"
        part2_fn = os.path.basename(os.path.splitext(gtf)[0]) + ".part2.gtf"
        part1 = os.path.join(out_dir, part1_fn)
        part2 = os.path.join(out_dir, part2_fn)
        if file_exists(part1) and file_exists(part2):
            return part1, part2
    else:
        part1 = tempfile.NamedTemporaryFile(delete=False, suffix=".part1.gtf").name
        part2 = tempfile.NamedTemporaryFile(delete=False, suffix=".part2.gtf").name
    db = get_gtf_db(gtf)
    gene_ids = set([x['gene_id'][0] for x in db.all_features()])
    if not sample_size or (sample_size and sample_size > len(gene_ids)):
        sample_size = len(gene_ids)
    gene_ids = set(random.sample(gene_ids, sample_size))
    part1_ids = set(random.sample(gene_ids, sample_size // 2))
    part2_ids = gene_ids.difference(part1_ids)
    with open(part1, "w") as part1_handle:
        for gene in part1_ids:
            for feature in db.children(gene):
                part1_handle.write(str(feature) + "\n")
    with open(part2, "w") as part2_handle:
        for gene in part2_ids:
            for feature in db.children(gene):
                part2_handle.write(str(feature) + "\n")
    return part1, part2
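# Illustrative sketch (not part of the module): the two-way random split above
# on a toy set of gene ids. Sorting before sampling keeps random.sample happy
# on newer Python versions, where sampling directly from a set is deprecated.
#
# import random
# gene_ids = {"g1", "g2", "g3", "g4"}
# part1 = set(random.sample(sorted(gene_ids), len(gene_ids) // 2))
# part2 = gene_ids - part1
# assert part1 | part2 == gene_ids and not part1 & part2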
def bgzip_and_index(in_file, config=None, remove_orig=True, prep_cmd="", tabix_args=None, out_dir=None): """bgzip and tabix index an input file, handling VCF and BED. """ if config is None: config = {} out_file = in_file if in_file.endswith(".gz") else in_file + ".gz" if out_dir: remove_orig = False out_file = os.path.join(out_dir, os.path.basename(out_file)) if (not utils.file_exists(out_file) or not os.path.lexists(out_file) or (utils.file_exists(in_file) and not utils.file_uptodate(out_file, in_file))): assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file assert os.path.exists(in_file), "Input file %s not found" % in_file if not utils.file_uptodate(out_file, in_file): with file_transaction(config, out_file) as tx_out_file: bgzip = tools.get_bgzip_cmd(config) cat_cmd = "zcat" if in_file.endswith(".gz") else "cat" if prep_cmd: prep_cmd = "| %s " % prep_cmd cmd = "{cat_cmd} {in_file} {prep_cmd} | {bgzip} -c > {tx_out_file}" try: do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file)) except subprocess.CalledProcessError: # Race conditions: ignore errors where file has been deleted by another if os.path.exists(in_file) and not os.path.exists(out_file): raise if remove_orig: try: os.remove(in_file) except OSError: # Handle cases where run in parallel and file has been deleted pass tabix_index(out_file, config, tabix_args=tabix_args) return out_file
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion, incorporating variant regions and chromosome.

    Excludes low complexity regions (if `remove_lcr` is set) and centromere
    regions, both of which contribute to long run times and false positive
    structural variant calls.
    """
    out_file = "%s-exclude.bed" % utils.splitext_plus(base_file)[0]
    all_vrs = _get_variant_regions(items)
    ready_region = (shared.subset_variant_regions(tz.first(all_vrs), chrom, base_file, items)
                    if len(all_vrs) > 0 else chrom)
    with shared.bedtools_tmpdir(items[0]):
        # Get a bedtool for the full region if no variant regions
        if ready_region == chrom:
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            lcr_bed = shared.get_lcr_bed(items)
            if lcr_bed:
                want_bedtool = want_bedtool.subtract(pybedtools.BedTool(lcr_bed))
        else:
            want_bedtool = pybedtools.BedTool(ready_region).saveas()
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(want_bedtool) > 0:
            want_bedtool = want_bedtool.subtract(sv_exclude_bed).saveas()
        if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
            with file_transaction(out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
def make_large_exons_gtf(gtf_file):
    """
    Save all exons > 1000 bases to a separate file for estimating the
    insert size distribution
    """
    out_dir = os.path.abspath(os.path.join(os.path.dirname(gtf_file), "tophat"))
    out_file = os.path.join(out_dir, "large_exons.gtf")
    if file_exists(out_file):
        return out_file
    dbfn = gtf_file + ".db"
    if not file_exists(dbfn):
        db = gffutils.create_db(gtf_file, dbfn=dbfn, keep_order=True,
                                merge_strategy='merge', force=False,
                                infer_gene_extent=False)
    else:
        db = gffutils.FeatureDB(dbfn)
    processed_count = 0
    kept_exons = []
    for exon in db.features_of_type('exon'):
        processed_count += 1
        if processed_count % 10000 == 0:
            print("Processed %d exons." % processed_count)
        if exon.end - exon.start > 1000:
            kept_exons.append(exon)
    with open(out_file, "w") as out_handle:
        print("Writing %d large exons to %s." % (len(kept_exons), out_file))
        for exon in kept_exons:
            out_handle.write(str(exon) + "\n")
    return out_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"
    pair_file = pair_file if pair_file else ""
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            index_file = None
            # Skip trying to use indices now as they provide only slight speed-ups
            # and give inconsistent outputs in BAM headers.
            # If a single index is present, index_dir points to that:
            # if index_dir and os.path.isfile(index_dir):
            #     index_dir = os.path.dirname(index_dir)
            #     index_file = os.path.join(index_dir, "%s-%s.mmi" % (dd.get_genome_build(data), preset))
            if not index_file or not os.path.exists(index_file):
                index_file = dd.get_ref_file(data)
            cmd = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                   "{fastq_file} {pair_file} | ")
            do.run(cmd.format(**locals()) + tobam_cl,
                   "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
            else:
                cmd = "{samtools} index {tx_bam_file}"
            do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file
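# Illustrative usage sketch (hypothetical inputs, not part of the module):
#
#   bai = index("sample1-sort.bam", config={"algorithm": {"num_cores": 4}})
#   # -> "sample1-sort.bam.bai", or the alternate "sample1-sort.bai" when
#   #    only that form already exists and is current relative to the BAM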
def update_loc_file(galaxy_base, loc_type, genome_build, ref_loc):
    ref_loc = os.path.abspath(ref_loc)
    loc_file = get_loc_file(galaxy_base, loc_type)
    if not loc_file:
        return None
    formatter = get_locformatter(loc_type)
    builds = []
    tmp_out = tempfile.NamedTemporaryFile(delete=False).name
    if file_exists(loc_file):
        with open(loc_file) as in_handle, open(tmp_out, "w") as out_handle:
            for line in in_handle:
                if line.startswith("#"):
                    out_handle.write(line)
                else:
                    parts = line.strip().split()
                    build = parts[1]
                    builds.append(build)
                    if build != genome_build:
                        out_handle.write(line)
                    else:
                        out_handle.write(formatter(genome_build, ref_loc))
        shutil.copyfile(tmp_out, loc_file)
    if genome_build not in builds or not file_exists(loc_file):
        with open(loc_file, "a") as out_handle:
            out_handle.write(formatter(genome_build, ref_loc))
    return loc_file
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if not utils.file_exists(out_file):
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if not utils.file_exists(priority_vcf):
            with file_transaction(data, priority_vcf) as tx_out_file:
                cmd = ("bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}")
                do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        if post_prior_fn:
            priority_vcf = post_prior_fn(priority_vcf, work_dir, data)
        simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
        if not utils.file_exists(simple_vcf):
            with file_transaction(data, simple_vcf) as tx_out_file:
                transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
                if transcript_file:
                    transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                    ann_opt = "--gene_bed %s" % transcript_file
                else:
                    ann_opt = ""
                cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
        simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file
def _combine_sample_regions_batch(batch, items):
    """Combine sample regions within a group of batched samples.
    """
    config = items[0]["config"]
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "regions"))
    analysis_file = os.path.join(work_dir, "%s-analysis_blocks.bed" % batch)
    no_analysis_file = os.path.join(work_dir, "%s-noanalysis_blocks.bed" % batch)
    if not utils.file_exists(analysis_file) or _needs_region_update(analysis_file, items):
        # Combine all nblocks into a final set of intersecting regions
        # without callable bases. HT @brentp for intersection approach
        # https://groups.google.com/forum/?fromgroups#!topic/bedtools-discuss/qA9wK4zN8do
        bed_regions = [pybedtools.BedTool(x["regions"]["nblock"])
                       for x in items if "regions" in x]
        if len(bed_regions) == 0:
            analysis_file, no_analysis_file = None, None
        else:
            with file_transaction(items[0], analysis_file, no_analysis_file) as (tx_afile, tx_noafile):
                def intersect_two(a, b):
                    return a.intersect(b, u=True, nonamecheck=True)
                nblock_regions = reduce(intersect_two, bed_regions).saveas(
                    "%s-nblock%s" % utils.splitext_plus(tx_afile))
                ref_file = tz.get_in(["reference", "fasta", "base"], items[0])
                ref_regions = get_ref_bedtool(ref_file, config)
                min_n_size = int(config["algorithm"].get("nomap_split_size", 250))
                block_filter = NBlockRegionPicker(ref_regions, config, min_n_size)
                final_nblock_regions = nblock_regions.filter(
                    block_filter.include_block).saveas().each(block_filter.expand_block).saveas(
                        "%s-nblockfinal%s" % utils.splitext_plus(tx_afile))
                final_regions = ref_regions.subtract(final_nblock_regions,
                                                     nonamecheck=True).merge(d=min_n_size)
                _write_bed_regions(items[0], final_regions, tx_afile, tx_noafile)
    if analysis_file and utils.file_exists(analysis_file):
        return analysis_file, no_analysis_file
    else:
        return None, None
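# Minimal standalone sketch (not part of the module) of the intersection idea
# above: regions without callable bases in *every* sample of a batch, via
# pairwise pybedtools intersects. BED paths are hypothetical.
#
# from functools import reduce
# import pybedtools
#
# def shared_nblocks(nblock_beds):
#     bts = [pybedtools.BedTool(f) for f in nblock_beds]
#     return reduce(lambda a, b: a.intersect(b, u=True), bts)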
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    mirbase = op.abspath(op.dirname(dd.get_mirbase_ref(data[0][0])))
    species = dd.get_species(data[0][0])
    hairpin = op.join(mirbase, "hairpin.fa")
    mature = op.join(mirbase, "mature.fa")
    rfam_file = op.join(mirbase, "Rfam_for_miRDeep.fa")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} "
               "-f {rfam_file} -r simple -c -d -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(mature) and file_exists(rfam_file):
            do.run(cmd, "Running mirdeep2.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
def get_summary_metrics(self, align_metrics, dup_metrics, insert_metrics=None,
                        hybrid_metrics=None, vrn_vals=None, rnaseq_metrics=None):
    """Retrieve a high level summary of interesting metrics.
    """
    with open(align_metrics) as in_handle:
        align_vals = self._parse_align_metrics(in_handle)
    if dup_metrics:
        with open(dup_metrics) as in_handle:
            dup_vals = self._parse_dup_metrics(in_handle)
    else:
        dup_vals = {}
    (insert_vals, hybrid_vals, rnaseq_vals) = (None, None, None)
    if insert_metrics and file_exists(insert_metrics):
        with open(insert_metrics) as in_handle:
            insert_vals = self._parse_insert_metrics(in_handle)
    if hybrid_metrics and file_exists(hybrid_metrics):
        with open(hybrid_metrics) as in_handle:
            hybrid_vals = self._parse_hybrid_metrics(in_handle)
    if rnaseq_metrics and file_exists(rnaseq_metrics):
        with open(rnaseq_metrics) as in_handle:
            rnaseq_vals = self._parse_rnaseq_metrics(in_handle)
    return self._tabularize_metrics(align_vals, dup_vals, insert_vals,
                                    hybrid_vals, vrn_vals, rnaseq_vals)
def _variant_filtration_indel(snp_file, ref_file, vrn_files, config):
    """Filter indel variant calls using GATK best practice recommendations.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type,
                                 ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                      ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        assert "train_indels" in vrn_files, "Need indel training file specified"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal, "--tranches_file", tx_tranches])
                if LooseVersion(broad_runner.get_gatk_version()) >= LooseVersion("2.7"):
                    params.extend(["--numBadVariants", "3000"])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    toval_data = _get_validate(data)
    if toval_data:
        if isinstance(toval_data["vrn_file"], (list, tuple)):
            vrn_file = [os.path.abspath(x) for x in toval_data["vrn_file"]]
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(
            toval_data["config"]["algorithm"].get("validate_regions"), toval_data), toval_data)
        rm_genome = toval_data["config"]["algorithm"].get("validate_genome_build")
        sample = toval_data["name"][-1].replace(" ", "_")
        caller = _get_caller(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate",
                                                   sample, caller))
        val_config_file = _create_validate_config_file(vrn_file, rm_file, rm_interval_file,
                                                       rm_genome, base_dir, toval_data)
        work_dir = os.path.join(base_dir, "work")
        out = {"summary": os.path.join(work_dir, "validate-summary.csv"),
               "grading": os.path.join(work_dir, "validate-grading.yaml"),
               "discordant": os.path.join(work_dir, "%s-eval-ref-discordance-annotate.vcf" % sample)}
        if not utils.file_exists(out["discordant"]) or not utils.file_exists(out["grading"]):
            bcbio_variation_comparison(val_config_file, base_dir, toval_data)
        concordant_candidates = [os.path.join(work_dir, "%s-%s-concordance.vcf" % (sample, ext))
                                 for ext in ["eval-ref", "ref-eval"]]
        out["concordant"] = [f for f in concordant_candidates if os.path.exists(f)][0]
        data["validate"] = out
    return [[data]]
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
def regions_coverage(chanjo_db, batch_name, out_dir):
    """
    create BED file of coverage of all regions from a Chanjo database
    """
    if not utils.file_exists(chanjo_db):
        return None
    out_file = os.path.join(out_dir, batch_name + "-all-regions.bed.gz")
    if utils.file_exists(out_file):
        return out_file
    conn = sqlite3.connect(chanjo_db)
    c = conn.cursor()
    q = c.execute("SELECT contig, start, end, strand, coverage, completeness, "
                  "sample_id "
                  "FROM interval_data "
                  "JOIN interval ON interval_data.parent_id=interval.id ")
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file + ".tmp", "w") as out_handle:
            out_handle.write("\t".join(["#chr", "start", "end", "sample",
                                        "strand", "coverage", "completeness"]) + "\n")
            for line in q:
                line = [str(x) for x in line]
                # chanjo reports coordinates as 1 based instead of 0 based
                start = str(int(line[1]) - 1)
                out_handle.write("\t".join([line[0], start, line[2], line[6],
                                            line[3], line[4], line[5]]) + "\n")
        bt = BedTool(tx_out_file + ".tmp").sort().bgzip()
        shutil.move(bt, tx_out_file)
    return out_file
def _mint_trna_annotation(data):
    """
    use MINTmap to quantify tRNAs
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
def sample_annotation(data):
    """
    Annotate miRNAs using miRBase database with seqbuster tool
    """
    names = data["rgnames"]['sample']
    tools = dd.get_expression_caller(data)
    work_dir = os.path.join(dd.get_work_dir(data), "mirbase")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = op.join(out_dir, names)
    if dd.get_mirbase_hairpin(data):
        mirbase = op.abspath(op.dirname(dd.get_mirbase_hairpin(data)))
        if utils.file_exists(data["collapse"]):
            data['transcriptome_bam'] = _align(data["collapse"], dd.get_mirbase_hairpin(data),
                                               out_file, data)
            data['seqbuster'] = _miraligner(data["collapse"], out_file, dd.get_species(data),
                                            mirbase, data['config'])
        else:
            logger.debug("Trimmed collapsed file is empty for %s." % names)
    else:
        logger.debug("No annotation file from miRBase.")
    sps = dd.get_species(data) if dd.get_species(data) else "None"
    logger.debug("Looking for mirdeep2 database for %s" % names)
    if file_exists(op.join(dd.get_work_dir(data), "mirdeep2", "novel", "hairpin.fa")):
        data['seqbuster_novel'] = _miraligner(data["collapse"], "%s_novel" % out_file, sps,
                                              op.join(dd.get_work_dir(data), "mirdeep2", "novel"),
                                              data['config'])
    if "trna" in tools:
        data['trna'] = _mint_trna_annotation(data)
    data = spikein.counts_spikein(data)
    return [[data]]
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    use_gemini_quick = (do_db_build(samples, check_gemini=False)
                        and any(vcfutils.vcf_has_variants(f) for f in fnames))
    if not utils.file_exists(gemini_db) and use_gemini_quick:
        use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
        if use_gemini:
            with file_transaction(gemini_db) as tx_gemini_db:
                gemini = config_utils.get_program("gemini", data["config"])
                if "program_versions" in data["config"].get("resources", {}):
                    gemini_ver = programs.get_version("gemini", config=data["config"])
                else:
                    gemini_ver = None
                # Recent versions of gemini allow loading only passing variants
                load_opts = ""
                if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                    load_opts += " --passonly"
                # For small test files, skip gene table loading which takes a long time
                if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                    if _is_small_vcf(gemini_vcf):
                        load_opts += " --skip-gene-tables"
                    if "/test_automated_output/" in gemini_vcf:
                        load_opts += " --test-mode"
                num_cores = data["config"]["algorithm"].get("num_cores", 1)
                cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
                cmd = cmd.format(**locals())
                do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    return [[(name, caller), {"db": gemini_db if utils.file_exists(gemini_db) else None,
                              "vcf": gemini_vcf if is_batch else None}]]
def summarize(calls, data, items):
    """Summarize results from multiple callers into a single flattened BED file.

    Approach:
    - Combine all calls found in all files
    - Filter files retaining those present with multiple levels of support.
    - Remove calls in high depth regions.
    - Remove calls with ends overlapping exclusion regions like low complexity regions.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        input_beds = [(c["variantcaller"], _create_bed(c, sample, work_dir, calls, data))
                      for c in calls]
        input_beds = [xs for xs in input_beds if xs[1] is not None and utils.file_exists(xs[1])]
    if len(input_beds) > 0:
        out_file = combine_bed_by_size([xs[1] for xs in input_beds], sample, work_dir, data)
        if utils.file_exists(out_file):
            if len(input_beds) > N_FILTER_CALLERS:
                filter_file = _filter_ensemble(out_file, data)
            else:
                filter_file = out_file
            limit_file = shared.remove_highdepth_regions(filter_file, items)
            exclude_files = [f for f in [x.get("exclude_file") for x in calls] if f]
            exclude_file = exclude_files[0] if len(exclude_files) > 0 else None
            if exclude_file:
                noexclude_file, _ = sshared.exclude_by_ends(limit_file, exclude_file, data)
            else:
                noexclude_file = limit_file
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(noexclude_file), "bedprep"))
            if utils.file_exists(noexclude_file):
                calls.append({"variantcaller": "sv-ensemble",
                              "input_beds": input_beds,
                              "vrn_file": bedutils.clean_file(noexclude_file, data,
                                                              bedprep_dir=bedprep_dir)})
    return calls
def _create_combined_fasta(data, out_dir):
    """
    if there are genomes to be disambiguated, create a FASTA file of
    all of the transcripts for all genomes
    """
    items = disambiguate.split([data])
    fasta_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_gtf_file(odata)
        ref_file = dd.get_ref_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + ".fa")
        if file_exists(out_file):
            fasta_files.append(out_file)
        else:
            out_file = _gtf_to_fasta(gtf_file, ref_file, out_file)
            out_file = _clean_gtf_fa(out_file, out_file)
            fasta_files.append(out_file)
    out_stem = os.path.join(out_dir, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    combined_file = out_stem + ".fa"
    if file_exists(combined_file):
        return combined_file
    fasta_file_string = " ".join(fasta_files)
    cmd = "cat {fasta_file_string} > {tx_out_file}"
    with file_transaction(combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining transcriptome FASTA files.")
    return combined_file
def run_freebayes(align_bam, ref_file, config, dbsnp=None, region=None,
                  out_file=None):
    """Detect small polymorphisms with FreeBayes.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        logger.info("Genotyping with FreeBayes: {region} {fname}".format(
            region=region, fname=os.path.basename(align_bam)))
        with file_transaction(out_file) as tx_out_file:
            cl = [config["program"].get("freebayes", "freebayes"),
                  "-b", align_bam, "-v", tx_out_file, "-f", ref_file,
                  "--left-align-indels"]
            cl += _freebayes_options_from_config(config["algorithm"], out_file, region)
            subprocess.check_call(cl)
    return out_file
def _create_bed(call, sample, work_dir, calls, data):
    """Create a simplified BED file from caller specific input.
    """
    out_file = os.path.join(work_dir, "%s-ensemble-%s.bed" % (sample, call["variantcaller"]))
    if call.get("vrn_file") and not utils.file_uptodate(out_file, call["vrn_file"]):
        with file_transaction(data, out_file) as tx_out_file:
            convert_fn = CALLER_TO_BED.get(call["variantcaller"])
            if convert_fn:
                vrn_file = call["vrn_file"]
                if call["variantcaller"] in SUBSET_BY_ENSEMBLE:
                    ecalls = [x for x in calls
                              if x["variantcaller"] in SUBSET_BY_ENSEMBLE[call["variantcaller"]]]
                    if len(ecalls) > 0:
                        vrn_file = _subset_by_ensemble(call["vrn_file"], ecalls[0]["vrn_file"], data)
                convert_fn(vrn_file, call["variantcaller"], tx_out_file)
    if utils.file_exists(out_file):
        return out_file
def _grabix_index(data):
    """Create grabix index of bgzip input file.

    grabix does not allow specification of output file, so symlink the
    original file into a transactional directory.
    """
    in_file = data["bgzip_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    gbi_file = _get_grabix_index(in_file)
    # We always build grabix input so we can use it for counting reads and doing downsampling
    if not gbi_file or _is_partial_index(gbi_file):
        if gbi_file:
            utils.remove_safe(gbi_file)
        else:
            gbi_file = in_file + ".gbi"
        with file_transaction(data, gbi_file) as tx_gbi_file:
            tx_in_file = os.path.splitext(tx_gbi_file)[0]
            utils.symlink_plus(in_file, tx_in_file)
            do.run([grabix, "index", tx_in_file],
                   "Index input with grabix: %s" % os.path.basename(in_file))
    assert utils.file_exists(gbi_file)
    return [gbi_file]
def _bgzip_from_cram(cram_file, dirs, data):
    """Create bgzipped fastq files from an input CRAM file in regions of interest.

    Returns a list with a single file, for single end CRAM files, or two
    files for paired end input.
    """
    import pybedtools
    region_file = (tz.get_in(["config", "algorithm", "variant_regions"], data)
                   if tz.get_in(["config", "algorithm", "coverage_interval"], data)
                   in ["regional", "exome", "amplicon"]
                   else None)
    if region_file:
        regions = ["%s:%s-%s" % tuple(r[:3]) for r in pybedtools.BedTool(region_file)]
    else:
        regions = [None]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s-%s.fq.gz" %
                                          (utils.splitext_plus(os.path.basename(cram_file))[0], fext))
                             for fext in ["s1", "p1", "p2"]]
    if (not utils.file_exists(out_s)
            and (not utils.file_exists(out_p1) or not utils.file_exists(out_p2))):
        cram.index(cram_file, data["config"])
        fastqs, part_dir = _cram_to_fastq_regions(regions, cram_file, dirs, data)
        if len(fastqs[0]) == 1:
            with file_transaction(data, out_s) as tx_out_file:
                _merge_and_bgzip([xs[0] for xs in fastqs], tx_out_file, out_s)
        else:
            for i, out_file in enumerate([out_p1, out_p2]):
                if not utils.file_exists(out_file):
                    ext = "/%s" % (i + 1)
                    with file_transaction(data, out_file) as tx_out_file:
                        _merge_and_bgzip([xs[i] for xs in fastqs], tx_out_file, out_file, ext)
        shutil.rmtree(part_dir)
    if utils.file_exists(out_p1):
        return [out_p1, out_p2]
    else:
        assert utils.file_exists(out_s)
        return [out_s]
def align(fastq_file, pair_file, ref_file, out_base, align_dir, config,
          extra_args=None, names=None):
    """Alignment with bowtie2.
    """
    out_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cl = [config_utils.get_program("bowtie2", config)]
            cl += _bowtie2_args_from_config(config)
            cl += extra_args if extra_args is not None else []
            cl += ["-q",
                   "--sensitive",
                   "-X", 2000,  # default is too selective for most data
                   "-x", ref_file]
            if pair_file:
                cl += ["-1", fastq_file, "-2", pair_file]
            else:
                cl += ["-U", fastq_file]
            cl += ["-S", tx_out_file]
            cl = [str(i) for i in cl]
            do.run(cl, "Aligning %s and %s with Bowtie2." % (fastq_file, pair_file), None)
    return out_file
def rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file, ref_file, data):
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    rapmap_idx = rapmap_index(gtf_file, ref_file, data, rapmap_dir)
    num_cores = dd.get_num_cores(data)
    rapmap = config_utils.get_program("rapmap", data["config"])
    cmd = "{rapmap} pseudomap -i {rapmap_idx} -t {num_cores} "
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    message = "pseudomapping transcripts in {fq1} and {fq2}."
    with file_transaction(data, out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
def prepare_mask_gtf(gtf):
    """
    make a mask file of usually-masked RNA biotypes
    """
    mask_biotype = ["rRNA", "Mt_rRNA", "misc_RNA", "snRNA", "snoRNA",
                    "tRNA", "Mt_tRNA"]
    mask_chrom = ["MT"]
    out_file = os.path.join(os.path.dirname(gtf), "ref-transcripts-mask.gtf")
    if file_exists(out_file):
        return out_file
    biotype_lookup = _biotype_lookup_fn(gtf)
    # if we can't find a biotype column, skip this
    if not biotype_lookup:
        return None
    db = _get_gtf_db(gtf)
    with open(out_file, "w") as out_handle:
        for g in db.all_features():
            biotype = biotype_lookup(g)
            if (biotype in mask_biotype) or (g.chrom in mask_chrom):
                out_handle.write(str(g) + "\n")
    return out_file
def salmon_quant_bam(bam_file, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = _libtype_string(bam_file, strandedness)
    cmd = ("{salmon} quant {libtype} -p {num_cores} -t {gtf_fa} "
           "-o {tx_out_dir} -a {bam_file} ")
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = "Quantifying transcripts in %s with Salmon." % bam_file
        do.run(cmd.format(**locals()), message, None)
    return out_file
def run(bam_file, data, out_dir):
    out = {}
    if not tz.get_in(["config", "algorithm", "preseq"], data):
        return out
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, "samtools")
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)
    stats_file = os.path.join(out_dir, "%s.txt" % dd.get_sample_name(data))
    if not utils.file_exists(stats_file):
        utils.safe_makedir(out_dir)
        preseq = config_utils.get_program("preseq", data["config"])
        params = _get_preseq_params(data, int(samtools_stats["Total_reads"]))
        param_line = "-step {step} -extrap {extrap} -seg_len {seg_len}".format(**params)
        with file_transaction(data, stats_file) as tx_out_file:
            cmd = "{preseq} lc_extrap -bam -pe {bam_file} -o {tx_out_file} {param_line}".format(**locals())
            do.run(cmd, "preseq lc_extrap", data)
    out = _prep_real_counts(bam_file, data, samtools_stats)
    return {"base": stats_file, "metrics": out}
def genotype_filter(vcf_file, expression, data, name, filterext=""): """Perform genotype based filtering using GATK with the provided expression. Adds FT tags to genotypes, rather than the general FILTER flag. """ base, ext = utils.splitext_plus(vcf_file) out_file = "{base}-filter{filterext}{ext}".format(**locals()) if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: params = [ "-T", "VariantFiltration", "-R", tz.get_in(["reference", "fasta", "base"], data), "--variant", vcf_file, "--out", tx_out_file, "--genotypeFilterName", name, "--genotypeFilterExpression", "'%s'" % expression ] jvm_opts = broad.get_gatk_framework_opts(data["config"]) cmd = [config_utils.get_program("gatk-framework", data["config"]) ] + jvm_opts + params do.run(cmd, "Filter with expression: %s" % expression) if out_file.endswith(".vcf.gz"): out_file = vcfutils.bgzip_and_index(out_file, data["config"]) return out_file
def bam2bigwig(in_file, config, out_prefix=None):
    """
    assumes the library preparation was not strand specific for now
    """
    PROGRAM = "bam2wig.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)
    prefix = "bigwig"
    chrom_size_file = config["annotation"].get("chrom_size_file", None)
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    if not chrom_size_file:
        chrom_size_file = _fetch_chrom_sizes(config)
    wiggle_file = out_prefix + ".wig"
    if not file_exists(wiggle_file):
        bam2wig = sh.Command(which(PROGRAM))
        bam2wig(i=in_file, s=chrom_size_file, o=out_prefix)
    bigwig_file = out_prefix + ".bw"
    return wig2bigwig(wiggle_file, chrom_size_file, bigwig_file)
def square_batch_region(data, region, bam_files, vrn_files, out_file):
    """Perform squaring of a batch in a supplied region, with input BAMs
    """
    from bcbio.variation import sentieon
    if not utils.file_exists(out_file):
        jointcaller = tz.get_in(("config", "algorithm", "jointcaller"), data)
        if jointcaller in ["%s-joint" % x for x in SUPPORTED["general"]]:
            _square_batch_bcbio_variation(data, region, bam_files, vrn_files,
                                          out_file, "square")
        elif jointcaller in ["%s-merge" % x for x in SUPPORTED["general"]]:
            _square_batch_bcbio_variation(data, region, bam_files, vrn_files,
                                          out_file, "merge")
        elif jointcaller in ["%s-joint" % x for x in SUPPORTED["gatk"]]:
            gatkjoint.run_region(data, region, vrn_files, out_file)
        elif jointcaller in ["%s-joint" % x for x in SUPPORTED["gvcf"]]:
            merge_gvcfs(data, region, vrn_files, out_file)
        elif jointcaller in ["%s-joint" % x for x in SUPPORTED["sentieon"]]:
            sentieon.run_gvcftyper(vrn_files, out_file, region, data)
        else:
            raise ValueError("Unexpected joint calling approach: %s." % jointcaller)
    if region:
        data["region"] = region
    data = _fix_orig_vcf_refs(data)
    data["vrn_file"] = out_file
    return [data]
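# Illustrative sketch (not part of the module): how the "<caller>-joint" /
# "<caller>-merge" naming convention above resolves against a hypothetical
# SUPPORTED table.
#
# SUPPORTED = {"general": ["freebayes", "platypus"], "gatk": ["gatk-haplotype"]}
# jointcaller = "freebayes-joint"
# assert jointcaller in ["%s-joint" % x for x in SUPPORTED["general"]]
# assert jointcaller not in ["%s-merge" % x for x in SUPPORTED["general"]]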
def _save_uploaded_data_json(samples, data_json_work, out_dir):
    """Fixes all absolute work-rooted paths to relative final-rooted paths
    """
    if not utils.file_exists(data_json_work):
        return None
    upload_path_mapping = dict()
    for sample in samples:
        upload_path_mapping.update(get_all_upload_paths_from_sample(sample))
    if not upload_path_mapping:
        return data_json_work
    with io.open(data_json_work, encoding="utf-8") as f:
        data = json.load(f, object_pairs_hook=OrderedDict)
    upload_base = samples[0]["upload"]["dir"]
    data = walk_json(data, lambda s: _work_path_to_rel_final_path(s, upload_path_mapping,
                                                                  upload_base))
    data_json_final = os.path.join(out_dir, "multiqc_data_final.json")
    with io.open(data_json_final, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    return data_json_final
def run_align(*data):
    """
    Prepare data to run alignment step, only once for each project
    """
    work_dir = dd.get_work_dir(data[0][0])
    out_dir = op.join(work_dir, "seqcluster", "prepare")
    seq_out = op.join(out_dir, "seqs.fastq")
    bam_dir = op.join(work_dir, "align")
    new_bam_file = op.join(bam_dir, "seqs.bam")
    tools = dd.get_expression_caller(data[0][0])
    if not file_exists(new_bam_file):
        sample = process_alignment(data[0][0], [seq_out, None])
        bam_file = dd.get_work_bam(sample[0][0])
        shutil.move(bam_file, new_bam_file)
        shutil.move(bam_file + ".bai", new_bam_file + ".bai")
        shutil.rmtree(op.join(bam_dir, sample[0][0]["rgnames"]['sample']))
    for sample in data:
        sample[0]["align_bam"] = sample[0]["clean_fastq"]
        sample[0]["work_bam"] = new_bam_file
    if "mirdeep2" in tools:
        novel_db = mirdeep.run(data)
    return data
def gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                           region=None, out_file=None, deep_coverage=False,
                           config=None):
    """Perform realignment of BAM file in specified regions
    """
    if out_file is None:
        out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        with curdir_tmpdir({"config": config}) as tmp_dir:
            with file_transaction(out_file) as tx_out_file:
                logger.info("GATK IndelRealigner: %s %s" %
                            (os.path.basename(align_bam), region))
                cl = gatk_indel_realignment_cl(runner, align_bam, ref_file,
                                               intervals, tmp_dir, region, deep_coverage)
                cl += ["-o", tx_out_file]
                do.run(cl, "GATK indel realignment", {})
    return out_file
def clipping_profile(in_file, config, out_prefix=None):
    """
    estimate the clipping profile of the reads
    """
    PROGRAM = "clipping_profile.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)
    prefix = "clipping"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    clip_plot_file = out_prefix + ".clipping_profile.pdf"
    if file_exists(clip_plot_file):
        return clip_plot_file
    clip_run = sh.Command(which(PROGRAM))
    clip_run(i=in_file, o=out_prefix)
    # hack to get around the fact that clipping_profile saves the file in
    # the script execution directory
    # sh.mv("clipping_profile.pdf", clip_plot_file)
    return clip_plot_file
def kallisto_table(kallisto_dir, index):
    """
    convert kallisto output to a count table where the rows are
    equivalence classes and the columns are cells
    """
    quant_dir = os.path.join(kallisto_dir, "quant")
    out_file = os.path.join(quant_dir, "matrix.csv")
    if file_exists(out_file):
        return out_file
    tsvfile = os.path.join(quant_dir, "matrix.tsv")
    ecfile = os.path.join(quant_dir, "matrix.ec")
    cellsfile = os.path.join(quant_dir, "matrix.cells")
    fastafile = os.path.splitext(index)[0] + ".fa"
    fasta_names = fasta.sequence_names(fastafile)
    ec_names = get_ec_names(ecfile, fasta_names)
    df = pd.read_table(tsvfile, header=None, names=["ec", "cell", "count"])
    df["ec"] = [ec_names[x] for x in df["ec"]]
    df = df.pivot(index='ec', columns='cell', values='count')
    cellnames = get_cell_names(cellsfile)
    colnames = [cellnames[x] for x in df.columns]
    df.columns = colnames
    df.to_csv(out_file)
    return out_file
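# Illustrative sketch (not part of the module): the long -> wide pivot above
# on toy data, showing rows become equivalence classes and columns cells.
#
# import pandas as pd
# toy = pd.DataFrame({"ec": ["ec1", "ec1", "ec2"],
#                     "cell": ["A", "B", "A"],
#                     "count": [3, 1, 7]})
# print(toy.pivot(index="ec", columns="cell", values="count"))
# # cell    A    B
# # ec
# # ec1   3.0  1.0
# # ec2   7.0  NaN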
def by_regions(items):
    """Plot for a union set of combined ensemble regions across all of the data items.
    """
    work_dir = os.path.join(dd.get_work_dir(items[0]), "structural", "coverage")
    safe_makedir(work_dir)
    out_file = os.path.join(work_dir, "%s-coverage.pdf" % (dd.get_sample_name(items[0])))
    if file_exists(out_file):
        items = _add_regional_coverage_plot(items, out_file)
    else:
        bed_files = _get_ensemble_bed_files(items)
        merged = bed.merge(bed_files)
        breakpoints = breakpoints_by_caller(bed_files)
        if merged:
            priority_merged = _prioritize_plot_regions(merged, items[0])
            out_file = plot_multiple_regions_coverage(items, out_file, items[0],
                                                      priority_merged, breakpoints)
            items = _add_regional_coverage_plot(items, out_file)
    return items
def _freebayes_custom(in_file, ref_file, data):
    """Custom FreeBayes filtering using bcbio.variation, tuned to human NA12878 results.

    Experimental: for testing new methods.
    """
    if vcfutils.get_paired_phenotype(data):
        return None
    config = data["config"]
    bv_ver = programs.get_version("bcbio_variation", config=config)
    if LooseVersion(bv_ver) < LooseVersion("0.1.1"):
        return None
    out_file = "%s-filter%s" % os.path.splitext(in_file)
    if not utils.file_exists(out_file):
        tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
        bv_jar = config_utils.get_jar("bcbio.variation",
                                      config_utils.get_program("bcbio_variation", config, "dir"))
        resources = config_utils.get_resources("bcbio_variation", config)
        jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
        java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
        cmd = ["java"] + jvm_opts + java_args + ["-jar", bv_jar, "variant-filter",
                                                 "freebayes", in_file, ref_file]
        do.run(cmd, "Custom FreeBayes filtering using bcbio.variation")
    return out_file
def pizzly(pizzly_path, gtf, gtf_fa, fraglength, cachefile, pizzlydir, fusions,
           samplename, data):
    outdir = os.path.join(pizzlydir, samplename)
    out_stem = os.path.join(outdir, samplename)
    pizzly_gtf = make_pizzly_gtf(gtf, os.path.join(pizzlydir, "pizzly.gtf"), data)
    sentinel = out_stem + "-flat-filtered.tsv"
    pizzlycalls = out_stem + ".json"
    if not file_exists(pizzlycalls):
        with file_transaction(data, outdir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            tx_out_stem = os.path.join(tx_out_dir, samplename)
            cmd = ("{pizzly_path} -k 31 --gtf {pizzly_gtf} --cache {cachefile} "
                   "--align-score 2 --insert-size {fraglength} --fasta {gtf_fa} "
                   "--output {tx_out_stem} {fusions}")
            message = ("Running pizzly on %s." % fusions)
            do.run(cmd.format(**locals()), message)
    flatfile = out_stem + "-flat.tsv"
    filteredfile = out_stem + "-flat-filtered.tsv"
    flatten_pizzly(pizzlycalls, flatfile, data)
    filter_pizzly(flatfile, filteredfile, data)
    return outdir
def _snpeff_args_from_config(data):
    """Retrieve snpEff arguments supplied through input configuration.
    """
    config = data["config"]
    args = ["-hgvs"]
    # General supplied arguments
    resources = config_utils.get_resources("snpeff", config)
    if resources.get("options"):
        args += [str(x) for x in resources.get("options", [])]
    # cancer specific calling arguments
    if vcfutils.get_paired_phenotype(data):
        args += ["-cancer"]
    effects_transcripts = dd.get_effects_transcripts(data)
    if effects_transcripts in set(["canonical_cancer"]):
        _, snpeff_base_dir = get_db(data)
        canon_list_file = os.path.join(snpeff_base_dir, "transcripts",
                                       "%s.txt" % effects_transcripts)
        if not utils.file_exists(canon_list_file):
            raise ValueError("Cannot find expected file for effects_transcripts: %s" %
                             canon_list_file)
        args += ["-canonList", canon_list_file]
    elif effects_transcripts == "canonical" or tz.get_in(("config", "algorithm",
                                                          "clinical_reporting"), data):
        args += ["-canon"]
    return args
# Decorated as a context manager (assumes contextlib is imported at module
# level) so the yield/finally pattern below works in a `with` block.
@contextlib.contextmanager
def _work_handles(in_files, dirs, ext):
    """Create working handles for input files and close on completion.
    """
    out_dir = safe_makedir(os.path.join(dirs["work"], "trim"))
    out_handles = {}
    in_handles = {}
    name_map = {}
    for in_file in in_files:
        out_file = os.path.join(out_dir, "{base}{ext}".format(
            base=os.path.splitext(os.path.basename(in_file))[0], ext=ext))
        name_map[in_file] = out_file
        if not file_exists(out_file):
            in_handles[in_file] = open(in_file)
            out_handles[in_file] = open(out_file, "w")
    try:
        yield in_handles, out_handles, name_map
    finally:
        for h in in_handles.values():
            h.close()
        for h in out_handles.values():
            h.close()
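# Hedged usage sketch for _work_handles, assuming the contextmanager
# decorator above; file names here are hypothetical.
def _demo_work_handles():
    in_files = ["sample1.fastq", "sample2.fastq"]
    dirs = {"work": "work"}
    with _work_handles(in_files, dirs, ".trim.fastq") as (in_handles, out_handles,
                                                          name_map):
        for in_file, in_handle in in_handles.items():
            out_handle = out_handles[in_file]
            for line in in_handle:
                out_handle.write(line)  # real trimming logic would go here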
def merge_overlaps(in_file, data, distance=None, out_dir=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like
    FreeBayes that don't collapse BEDs prior to using them.
    """
    if in_file:
        bedtools = config_utils.get_program("bedtools", data["config"])
        work_dir = tz.get_in(["dirs", "work"], data)
        if out_dir:
            bedprep_dir = out_dir
        elif work_dir:
            bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
        else:
            bedprep_dir = os.path.dirname(in_file)
        out_file = os.path.join(bedprep_dir, "%s-merged.bed" %
                                (utils.splitext_plus(os.path.basename(in_file))[0]))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                distance = "-d %s" % distance if distance else ""
                cmd = "{bedtools} merge {distance} -i {in_file} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare merged BED file", data)
        vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file
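# A pure-Python sketch of what the `bedtools merge -d` call above does to
# (chrom, start, end) intervals; illustration only, bedtools remains the tool
# actually run by merge_overlaps.
def _demo_merge_intervals(intervals, distance=0):
    merged = []
    for chrom, start, end in sorted(intervals):
        if merged and merged[-1][0] == chrom and start <= merged[-1][2] + distance:
            # Overlapping or within `distance`: extend the previous interval
            merged[-1] = (chrom, merged[-1][1], max(merged[-1][2], end))
        else:
            merged.append((chrom, start, end))
    return merged

# The docstring's example: 1:1-100 and 1:90-100 collapse into a single 1:1-100
# _demo_merge_intervals([("1", 1, 100), ("1", 90, 100)]) == [("1", 1, 100)]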
def gatk_rnaseq_calling(data):
    """
    use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".gvcf"
    num_cores = dd.get_num_cores(data)
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "HaplotypeCaller",
                  "-R", ref_file,
                  "-I", split_bam,
                  "-o", tx_out_file,
                  "-nct", str(num_cores),
                  "--emitRefConfidence", "GVCF",
                  "--variant_index_type", "LINEAR",
                  "--variant_index_parameter", "128000",
                  "-dontUseSoftClippedBases"]
        broad_runner.run_gatk(params)
    data = dd.set_vrn_file(data, out_file)
    return data
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bowtie2 with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    bowtie2 = config_utils.get_program("bowtie2", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_index = index_transcriptome(gtf_file, ref_file, data)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    fastq_cmd = "-1 %s" % fastq_file if pair_file else "-U %s" % fastq_file
    pair_cmd = "-2 %s " % pair_file if pair_file else ""
    cmd = ("{bowtie2} -p {num_cores} -a -X 600 --rdg 6,5 --rfg 6,5 "
           "--score-min L,-.6,-.4 --no-discordant --no-mixed "
           "-x {gtf_index} {fastq_cmd} {pair_cmd} ")
    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file, name_sort=True)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def _maybe_add_alignment(algorithm, sample, out):
    if _has_alignment_file(algorithm, sample):
        for (fname, ext, isplus) in [(sample.get("work_bam"), "ready", False),
                                     (dd.get_disc_bam(sample), "disc", True),
                                     (dd.get_sr_bam(sample), "sr", True)]:
            if fname and os.path.exists(fname):
                if fname.endswith("bam"):
                    ftype, fext = "bam", ".bai"
                elif fname.endswith("cram"):
                    ftype, fext = "cram", ".crai"
                else:
                    raise ValueError("Unexpected alignment file type %s" % fname)
                out.append({"path": fname, "type": ftype, "plus": isplus, "ext": ext})
                if utils.file_exists(fname + fext):
                    out.append({"path": fname + fext, "type": ftype + fext,
                                "plus": isplus, "index": True, "ext": ext})
    return out
def picard_mark_duplicates(picard, align_bam, remove_dups=False):
    base, ext = os.path.splitext(align_bam)
    base = base.replace(".", "-")
    dup_bam = "%s-dup%s" % (base, ext)
    dup_metrics = "%s-dup.dup_metrics" % base
    if not file_exists(dup_bam):
        with tx_tmpdir(picard._config) as tmp_dir:
            with file_transaction(picard._config, dup_bam, dup_metrics) as \
                    (tx_dup_bam, tx_dup_metrics):
                opts = [("INPUT", align_bam),
                        ("OUTPUT", tx_dup_bam),
                        ("TMP_DIR", tmp_dir),
                        ("REMOVE_DUPLICATES", "true" if remove_dups else "false"),
                        ("METRICS_FILE", tx_dup_metrics)]
                if picard.get_picard_version("MarkDuplicates") >= 1.82:
                    opts += [("PROGRAM_RECORD_ID", "null")]
                picard.run("MarkDuplicates", opts,
                           memscale={"direction": "decrease", "magnitude": 2})
    return dup_bam, dup_metrics
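# Hedged usage sketch: a Picard runner built the same way gatk_rnaseq_calling
# builds one above; the BAM path here is hypothetical.
def _demo_mark_duplicates(data):
    picard = broad.runner_from_config(dd.get_config(data))
    dup_bam, dup_metrics = picard_mark_duplicates(picard, "sample1-sort.bam",
                                                  remove_dups=False)
    return dup_bam, dup_metrics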
def _get_vcf(x, key):
    """Retrieve VCF file with the given key if it exists, handling bgzipped.
    """
    out = []
    fname = utils.get_in(x, key)
    if fname:
        if fname.endswith(".gz"):
            out.append({"path": fname, "type": "vcf.gz",
                        "ext": x["variantcaller"], "variantcaller": x["variantcaller"]})
            if utils.file_exists(fname + ".tbi"):
                out.append({"path": fname + ".tbi", "type": "vcf.gz.tbi",
                            "index": True, "ext": x["variantcaller"],
                            "variantcaller": x["variantcaller"]})
        else:
            out.append({"path": fname, "type": "vcf",
                        "ext": x["variantcaller"], "variantcaller": x["variantcaller"]})
    return out
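# Sketch of the record shape _get_vcf returns for a bgzipped call file with a
# tabix index on disk (paths and caller name are hypothetical):
#   [{"path": "s1-freebayes.vcf.gz", "type": "vcf.gz",
#     "ext": "freebayes", "variantcaller": "freebayes"},
#    {"path": "s1-freebayes.vcf.gz.tbi", "type": "vcf.gz.tbi", "index": True,
#     "ext": "freebayes", "variantcaller": "freebayes"}]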
def picard_fastq_to_bam(picard, fastq_one, fastq_two, out_dir, names,
                        order="queryname"):
    """Convert fastq file(s) to BAM, adding sample, run group and platform information.
    """
    out_bam = os.path.join(out_dir, "%s-fastq.bam" %
                           os.path.splitext(os.path.basename(fastq_one))[0])
    if not file_exists(out_bam):
        with tx_tmpdir(picard._config) as tmp_dir:
            with file_transaction(picard._config, out_bam) as tx_out_bam:
                opts = [("FASTQ", fastq_one),
                        ("READ_GROUP_NAME", names["rg"]),
                        ("SAMPLE_NAME", names["sample"]),
                        ("PLATFORM_UNIT", names["pu"]),
                        ("PLATFORM", names["pl"]),
                        ("TMP_DIR", tmp_dir),
                        ("OUTPUT", tx_out_bam),
                        ("SORT_ORDER", order)]
                if fastq_two:
                    opts.append(("FASTQ2", fastq_two))
                picard.run("FastqToSam", opts)
    return out_bam
def _run_snpeff(snp_in, out_format, data):
    snpeff_db, datadir = get_db(data)
    assert datadir is not None, \
        "Did not find snpEff resources in genome configuration: %s" % data["genome_resources"]
    assert os.path.exists(os.path.join(datadir, snpeff_db)), \
        "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir)
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv"
    out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        if ext.endswith(".gz"):
            bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"])
        else:
            bgzip_cmd = ""
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{snpeff_cmd} {config_args} -noLog -1 -i vcf -o {out_format} "
                   "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "snpEff effects", data)
    if ext.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file