def gatk_rnaseq_calling(data):
    """
    use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".vcf"
    bgzipped_file = out_file + ".gz"
    num_cores = dd.get_num_cores(data)
    if file_exists(bgzipped_file):
        data = dd.set_vrn_file(data, bgzipped_file)
        return data
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "HaplotypeCaller",
                  "-R", ref_file,
                  "-I", split_bam,
                  "-o", tx_out_file,
                  "-nct", str(num_cores),
                  "--emitRefConfidence", "GVCF",
                  "--variant_index_type", "LINEAR",
                  "--variant_index_parameter", "128000",
                  "-dontUseSoftClippedBases"]
        broad_runner.run_gatk(params)
    bgzip_and_index(out_file, dd.get_config(data))
    data = dd.set_vrn_file(data, bgzipped_file)
    return data

def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    if file_exists(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    filtered_bam = bam.filter_primary(sorted_bam, data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, count_file) as tx_count_file:
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    shutil.move(fixed_count_file, count_file)
    return count_file

def gatk_filter_rnaseq(data, vrn_file, out_file):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "VariantFiltration",
                  "-R", ref_file,
                  "-V", vrn_file,
                  "--clusterWindowSize", "35",
                  "--clusterSize", "3",
                  "--filterExpression", "\"'FS > 30.0'\"",
                  "--filterName", "FS",
                  "--filterExpression", "\"'QD < 2.0'\"",
                  "--filterName", "QD",
                  "-o", tx_out_file]
        jvm_opts = broad.get_gatk_framework_opts(dd.get_config(data),
                                                 os.path.dirname(tx_out_file))
        do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Filter variants.")
    return out_file

def calculate_complexity_metrics(work_bam, data):
    """
    the work_bam should have duplicates marked but not removed
    mitochondrial reads should be removed
    """
    bedtools = config_utils.get_program("bedtools", dd.get_config(data))
    work_dir = dd.get_work_dir(data)
    metrics_dir = os.path.join(work_dir, "metrics", "atac")
    utils.safe_makedir(metrics_dir)
    metrics_file = os.path.join(metrics_dir,
                                f"{dd.get_sample_name(data)}-atac-metrics.csv")
    # complexity metrics only make sense for paired-end reads
    if not bam.is_paired(work_bam):
        return data
    if utils.file_exists(metrics_file):
        data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
        return data
    # BAM file must be sorted by read name
    work_bam = bam.sort(work_bam, dd.get_config(data), order="queryname")
    with file_transaction(metrics_file) as tx_metrics_file:
        with open(tx_metrics_file, "w") as out_handle:
            out_handle.write("mt,m0,m1,m2\n")
        cmd = (f"{bedtools} bamtobed -bedpe -i {work_bam} | "
               "awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$4,$6,$9,$10}' | "
               "sort | "
               "uniq -c | "
               "awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} "
               "($1==2){m2=m2+1}{m0=m0+1}{mt=mt+$1}END{printf \"%d,%d,%d,%d\\n\", mt,m0,m1,m2}' >> "
               f"{tx_metrics_file}")
        message = f"Calculating ATAC-seq complexity metrics on {work_bam}, saving as {metrics_file}."
        do.run(cmd, message)
    data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
    return data

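# A minimal, hypothetical sketch (not part of bcbio) of how ENCODE-style library
# complexity ratios could be derived from the metrics CSV written above. Column
# meanings are inferred from the awk pipeline: mt = total read pairs, m0 = distinct
# fragment positions, m1 = positions seen exactly once, m2 = positions seen exactly
# twice. The helper name below is illustrative only.
import pandas as pd

def read_atac_complexity_ratios(metrics_file):
    """Return NRF, PBC1 and PBC2 computed from an -atac-metrics.csv file."""
    row = pd.read_csv(metrics_file).iloc[0]
    return {"NRF": row["m0"] / row["mt"],    # non-redundant fraction
            "PBC1": row["m1"] / row["m0"],   # PCR bottlenecking coefficient 1
            "PBC2": row["m1"] / row["m2"]}   # PCR bottlenecking coefficient 2
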
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    resources = config_utils.get_resources("salmon", dd.get_config(data))
    params = ""
    if resources.get("options") is not None:
        params = " ".join([str(x) for x in resources.get("options", [])])
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} {params} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file

def gatk_filter_rnaseq(data, vrn_file, out_file):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "VariantFiltration",
                  "-R", ref_file,
                  "-V", vrn_file,
                  "--clusterWindowSize", "35",
                  "--clusterSize", "3",
                  "--filterExpression", "\"'FS > 30.0'\"",
                  "--filterName", "FS",
                  "--filterExpression", "\"'QD < 2.0'\"",
                  "--filterName", "QD",
                  "-o", tx_out_file]
        jvm_opts = broad.get_gatk_framework_opts(dd.get_config(data))
        do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Filter variants.")
    return out_file

def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    filtered_bam = bam.filter_primary(sorted_bam, data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    return count_file

def gatk_splitreads(data):
    """
    use GATK to split reads with Ns in the CIGAR string, hard clipping regions
    that end up in introns
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    deduped_bam = dd.get_deduped_bam(data)
    base, ext = os.path.splitext(deduped_bam)
    split_bam = base + ".splitN" + ext
    if dd.get_quality_format(data) == "illumina":
        quality_flag = ["--fix_misencoded_quality_scores", "-fixMisencodedQuals"]
    else:
        quality_flag = []
    if file_exists(split_bam):
        data = dd.set_split_bam(data, split_bam)
        return data
    with file_transaction(split_bam) as tx_split_bam:
        params = ["-T", "SplitNCigarReads",
                  "-R", ref_file,
                  "-I", deduped_bam,
                  "-o", tx_split_bam,
                  "-rf", "ReassignOneMappingQuality",
                  "-RMQF", "255",
                  "-RMQT", "60",
                  "-rf", "UnmappedRead",
                  "-U", "ALLOW_N_CIGAR_READS"] + quality_flag
        broad_runner.run_gatk(params)
    bam.index(split_bam, dd.get_config(data))
    data = dd.set_split_bam(data, split_bam)
    return data

def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = clean_ATAC(data)
    # for ATAC-seq, this will be the NF BAM
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f" falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]

def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    resources = config_utils.get_resources("salmon", dd.get_config(data))
    params = ""
    if resources.get("options") is not None:
        params = " ".join([str(x) for x in resources.get("options", [])])
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} {params} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
        # sailfish.sleuthify_sailfish(tx_out_dir)
    return out_file

def rapmap_align(fq1, fq2, rapmap_dir, gtf_file, ref_file, algorithm, data):
    valid_algorithms = ["pseudo", "quasi"]
    assert algorithm in valid_algorithms, \
        "RapMap algorithm needs to be one of %s." % valid_algorithms
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        bam.index(out_file, dd.get_config(data))
        return out_file
    rapmap_index_loc = rapmap_index(gtf_file, ref_file, algorithm, data, rapmap_dir)
    num_cores = dd.get_num_cores(data)
    algorithm_subcommand = algorithm + "map"
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    cmd = "{rapmap} {algorithm_subcommand} -t {num_cores} -i {rapmap_index_loc} "
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += "-r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2} " if not is_gzipped(fq2) else "<(gzip -cd {fq2}) "
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += "-1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(out_file) as tx_out_file:
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        run_message = ("%smapping %s and %s to %s with Rapmap. "
                       % (algorithm, fq1, fq2, rapmap_index_loc))
        do.run(cmd.format(**locals()), run_message, None)
    bam.index(out_file, dd.get_config(data))
    return out_file

def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data) or dd.get_align_bam(data)
    out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))
    if dd.get_aligner(data) == "star":
        out_dir = os.path.join(out_dir, "%s_%s" % (dd.get_sample_name(data), dd.get_aligner(data)))
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname",
                          out_dir=safe_makedir(out_dir))
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    filtered_bam = bam.filter_primary(sorted_bam, data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")
    resources = config_utils.get_resources("featureCounts", data["config"])
    if resources:
        options = resources.get("options")
        if options:
            cmd += " %s" % " ".join([str(x) for x in options])
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    return count_file

def gatk_joint_calling(data, vrn_files, ref_file):
    joint_file = os.path.join("variation", "joint.vcf")
    out_file = os.path.join("variation", "combined.vcf")
    bgzjoint_file = os.path.join("variation", "joint.vcf.gz")
    bgzout_file = os.path.join("variation", "combined.vcf.gz")
    if not file_exists(bgzout_file):
        joint_file = _run_genotype_gvcfs(data, vrn_files, ref_file, joint_file)
        bgzip_and_index(joint_file, dd.get_config(data))
        out_file = gatk_filter_rnaseq(data, bgzjoint_file, out_file)
        bgzip_and_index(out_file, dd.get_config(data))
    return bgzout_file

def chipseq_count(data):
    """
    count reads mapping to ChIP/ATAC consensus peaks with featureCounts
    """
    method = dd.get_chip_method(data)
    if method == "chip":
        in_bam = dd.get_work_bam(data)
    elif method == "atac":
        in_bam = tz.get_in(("atac", "align", "NF"), data)
    out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname",
                          out_dir=safe_makedir(out_dir))
    consensus_file = tz.get_in(("peaks_files", "consensus", "main"), data)
    saf_file = os.path.splitext(consensus_file)[0] + ".saf"
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "consensus")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        if method == "atac":
            data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
        elif method == "chip":
            data = tz.assoc_in(data, ("peak_counts",), count_file)
        return [[data]]
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    cmd = ("{featureCounts} -F SAF -a {saf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {sorted_bam}")
    message = ("Count reads in {sorted_bam} overlapping {saf_file} using "
               "featureCounts.")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    if method == "atac":
        data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
    elif method == "chip":
        data = tz.assoc_in(data, ("peak_counts",), count_file)
    return [[data]]

def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    safe_makedir(salmon_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(salmon_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    gtf_fa = sailfish._create_combined_fasta(data, salmon_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--numBootstraps 30 --useVBOpt "
    with file_transaction(data, salmon_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file

def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(out_dir + "rapidx.jfhash"):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    message = "Creating rapmap {index_type} for {gtf_fa} with {kmersize} bp kmers."
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k {kmersize} -i {tx_out_dir} -t {gtf_fa}"
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir

def collect_oxog_metrics(data):
    """
    extracts 8-oxoguanine (OxoG) artifact metrics from CollectSequencingArtifacts
    output so we don't have to run CollectOxoGMetrics.
    """
    input_base = os.path.join(dd.get_work_dir(data), "metrics", "artifact",
                              dd.get_sample_name(data), dd.get_sample_name(data))
    if not utils.file_exists(input_base + ".pre_adapter_detail_metrics"):
        return None
    OUT_SUFFIXES = [".oxog_metrics"]
    picard = broad.runner_from_path("picard", dd.get_config(data))
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "oxog", dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    ref_file = dd.get_ref_file(data)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = [("--INPUT_BASE", input_base),
                  ("--OUTPUT_BASE", out_base),
                  ("--REFERENCE_SEQUENCE", ref_file)]
        picard.run("ConvertSequencingArtifactToOxoG", params)
    return out_files

def trim_adapters(data):
    to_trim = [x for x in data["files"] if x is not None]
    logger.info("Trimming low quality ends and read through adapter "
                "sequence from %s." % (", ".join(to_trim)))
    out_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    config = dd.get_config(data)
    return _trim_adapters(to_trim, out_dir, config)

def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    safe_makedir(salmon_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(salmon_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    gtf_fa = sailfish._create_combined_fasta(data, salmon_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--numBootstraps 30 --useVBOpt "
    with file_transaction(data, salmon_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file

def _salmon_quant_reads(fq1, fq2, salmon_dir, index, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    cmd = ("{salmon} quant -l A -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file

def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    if file_exists(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {in_bam}")
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, count_file) as tx_count_file:
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    os.rename(fixed_count_file, count_file)
    return count_file

def dedup_bismark(data):
    """Remove alignments to the same position in the genome from the Bismark
    mapping output using deduplicate_bismark
    """
    input_file = datadict.get_work_bam(data)
    input_file = bam.sort(input_file, datadict.get_config(data), order="queryname")
    sample_name = datadict.get_sample_name(data)
    output_dir = os.path.join(datadict.get_work_dir(data), 'dedup', sample_name)
    output_dir = utils.safe_makedir(output_dir)
    input_file_name, input_file_extension = os.path.splitext(os.path.basename(input_file))
    output_file = os.path.join(output_dir,
                               f'{input_file_name}.deduplicated{input_file_extension}')
    if utils.file_exists(output_file):
        data = datadict.set_work_bam(data, output_file)
        return [[data]]
    deduplicate_bismark = config_utils.get_program('deduplicate_bismark', data['config'])
    command = f'{deduplicate_bismark} --output_dir {output_dir} {input_file}'
    with transaction.file_transaction(output_dir):
        do.run(command, 'remove duplicate alignments')
    data = datadict.set_work_bam(data, output_file)
    return [[data]]

def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """
    Run macs2 for chip and input samples, avoiding errors due to samples.
    """
    # the output file name needs to include the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compres_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    _compres_bdg_files(out_dir)
    return _get_output_files(out_dir)

def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(files[0])
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir

def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data, os.path.dirname(kallisto_dir))
    fusion_flag = "--fusion" if dd.get_fusion_mode(data) or dd.get_fusion_caller(data) else ""
    single_flag = "--single" if not fq2 else ""
    fraglength_flag = "--fragment-length=200" if not fq2 else ""
    sd_flag = "--sd=25" if not fq2 else ""
    bootstrap_flag = "--bootstrap-samples=30"
    fq2 = "" if not fq2 else fq2
    if not fq2:
        logger.warning("kallisto was run on single-end data and we set the "
                       "estimated fragment length to 200 and the standard "
                       "deviation to 25, if these don't reflect your data then "
                       "the results may be inaccurate. Use with caution. See "
                       "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
                       "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = "Quantifying transcripts with kallisto."
        do.run(cmd.format(**locals()), message, None)
    return quant_dir

def collect_artifact_metrics(data):
    """Run CollectSequencingArtifacts to collect pre-adapter ligation artifact metrics
    https://gatk.broadinstitute.org/hc/en-us/articles/360037429491-CollectSequencingArtifactMetrics-Picard-
    use picard wrapper rather than gatk - works for gatk4 and gatk3 projects
    refactor - move to broad/picardrun
    """
    OUT_SUFFIXES = [".bait_bias_detail_metrics", ".error_summary_metrics",
                    ".pre_adapter_detail_metrics", ".pre_adapter_summary_metrics"]
    picard = broad.runner_from_path("picard", dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    bam_file = dd.get_work_bam(data)
    if not bam_file:
        return None
    if "collectsequencingartifacts" in dd.get_tools_off(data):
        return None
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "artifact", dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = [("-REFERENCE_SEQUENCE", ref_file),
                  ("-INPUT", bam_file),
                  ("-OUTPUT", out_base)]
        # picard runner sets VALIDATION_STRINGENCY
        picard.run("CollectSequencingArtifactMetrics", params)
    return out_files

def starts_by_depth(bam_file, data, sample_size=10000000):
    """
    Return a set of x, y points where x is the number of reads sequenced and
    y is the number of unique start sites identified.
    If sample size < total reads in a file the file will be downsampled.
    """
    config = dd.get_config(data)
    binsize = (sample_size // 100) + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    buffer = []
    downsampled = bam.downsample(bam_file, data, sample_size)
    with bam.open_samfile(downsampled) as samfile:
        for read in samfile:
            if read.is_unmapped:
                continue
            counted += 1
            buffer.append(str(read.tid) + ":" + str(read.pos))
            if counted % binsize == 0:
                seen_starts.update(buffer)
                buffer = []
                num_reads.append(counted)
                starts.append(len(seen_starts))
        seen_starts.update(buffer)
        num_reads.append(counted)
        starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})

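# A minimal sketch (not part of bcbio) of how the DataFrame returned by
# starts_by_depth could be used to eyeball library saturation: if the curve of
# unique start sites flattens as the read count grows, additional sequencing adds
# little new information. Assumes matplotlib is available; the helper name is
# illustrative only.
import matplotlib.pyplot as plt

def plot_saturation(df, out_file="saturation.png"):
    """Plot unique start sites versus reads sampled from starts_by_depth output."""
    ax = df.plot(x="reads", y="starts", legend=False)
    ax.set_xlabel("reads sampled")
    ax.set_ylabel("unique start sites")
    plt.savefig(out_file)
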
def collect_artifact_metrics(data):
    """Run CollectSequencingArtifacts to collect pre-adapter ligation artifact metrics
    https://gatk.broadinstitute.org/hc/en-us/articles/360037429491-CollectSequencingArtifactMetrics-Picard-
    """
    OUT_SUFFIXES = [".bait_bias_detail_metrics", ".error_summary_metrics",
                    ".pre_adapter_detail_metrics", ".pre_adapter_summary_metrics"]
    broad_runner = broad.runner_from_config(dd.get_config(data))
    gatk_type = broad_runner.gatk_type()
    ref_file = dd.get_ref_file(data)
    bam_file = dd.get_work_bam(data)
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "artifact", dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = ["-T", "CollectSequencingArtifactMetrics",
                  "-R", ref_file,
                  "-I", bam_file,
                  "-O", out_base]
        broad_runner.run_gatk(params, log_error=False, parallel_gc=True)
    return out_files

def collect_oxog_metrics(data):
    """
    extracts 8-oxoguanine (OxoG) artifact metrics from CollectSequencingArtifacts
    output so we don't have to run CollectOxoGMetrics.
    """
    input_base = os.path.join(dd.get_work_dir(data), "metrics", "artifact",
                              dd.get_sample_name(data), dd.get_sample_name(data))
    if not utils.file_exists(input_base + ".pre_adapter_detail_metrics"):
        return None
    OUT_SUFFIXES = [".oxog_metrics"]
    broad_runner = broad.runner_from_config(dd.get_config(data))
    gatk_type = broad_runner.gatk_type()
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "oxog", dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    ref_file = dd.get_ref_file(data)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = ["-T", "ConvertSequencingArtifactToOxoG",
                  "--INPUT_BASE", input_base,
                  "-O", out_base,
                  "-R", ref_file]
        broad_runner.run_gatk(params, log_error=False, parallel_gc=True)
        # multiqc <= 1.9 looks for INPUT not INPUT_BASE for these files
        # see (https://github.com/ewels/MultiQC/pull/1310)
        cmd = f"sed 's/INPUT_BASE/INPUT/g' {out_base}.oxog_metrics -i"
        do.run(cmd, f"Fixing {out_base}.oxog_metrics to work with MultiQC.")
    return out_files

def salmon_decoy_index(gtf_file, data, out_dir):
    input_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    decoy_transcriptome = os.path.join(input_dir, sailfish.get_build_string(data) + "-decoy.fa")
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    decoy_sequence_file = get_decoy_sequence_file(data)
    decoy_name_file = get_decoy_name_file(data)
    gtf_fa = create_decoy_transcriptome(gtf_fa, get_decoy_sequence_file(data), decoy_transcriptome)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        logger.info("Transcriptome index for %s detected, skipping building." % gtf_fa)
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = ("{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa} "
               "--decoys {decoy_name_file} ")
        message = "Creating decoy-aware Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir

def _salmon_quant_reads(fq1, fq2, salmon_dir, index, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    cmd = ("{salmon} quant -l A -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file

def convert_to_kallisto(data):
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "fastq")
    out_file = os.path.join(kallisto_dir, "barcodes.batch")
    umis = config_utils.get_program("umis", dd.get_config(data))
    if file_exists(out_file):
        return out_file
    if dd.get_minimum_barcode_depth(data):
        cb_histogram = os.path.join(work_dir, "umis", samplename, "cb-histogram.txt")
        cb_cutoff = dd.get_minimum_barcode_depth(data)
        cb_options = "--cb_histogram {cb_histogram} --cb_cutoff {cb_cutoff}"
        cb_options = cb_options.format(**locals())
    else:
        cb_options = ""
    cmd = "{umis} kallisto {cb_options} --out_dir {tx_kallisto_dir} {fq1}"
    with file_transaction(data, kallisto_dir) as tx_kallisto_dir:
        safe_makedir(tx_kallisto_dir)
        message = "Transforming %s to Kallisto singlecell format. " % fq1
        do.run(cmd.format(**locals()), message)
    return out_file

def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        logger.info("Transcriptome index for %s detected, skipping building." % gtf_fa)
        return out_dir
    files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(files[0])
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir

def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    sentinel_file = os.path.join(quant_dir, "abundance.h5")
    if os.path.exists(sentinel_file):
        return quant_dir
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data, os.path.dirname(kallisto_dir))
    fusion_flag = "--fusion" if dd.get_fusion_mode(data) or dd.get_fusion_caller(data) else ""
    single_flag = "--single" if not fq2 else ""
    fraglength_flag = "--fragment-length=200" if not fq2 else ""
    sd_flag = "--sd=25" if not fq2 else ""
    bootstrap_flag = "--bootstrap-samples=30"
    fq2 = "" if not fq2 else fq2
    if not fq2:
        logger.warning("kallisto was run on single-end data and we set the "
                       "estimated fragment length to 200 and the standard "
                       "deviation to 25, if these don't reflect your data then "
                       "the results may be inaccurate. Use with caution. See "
                       "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
                       "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = "Quantifying transcripts with kallisto."
        do.run(cmd.format(**locals()), message, None)
    return quant_dir

def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """
    Run macs2 for chip and input samples, avoiding errors due to samples.
    """
    # the output file name needs to include the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compres_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = HS.get(genome_build, bam.fasta.total_sequence_length(dd.get_ref_file(data)))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    _compres_bdg_files(out_dir)
    return _get_output_files(out_dir)

def split_ATAC(data, bam_file=None):
    """
    splits a BAM into nucleosome-free (NF) and mono/di/tri nucleosome BAMs based
    on the estimated insert sizes
    uses the current working BAM file if no BAM file is supplied
    """
    sambamba = config_utils.get_program("sambamba", data)
    num_cores = dd.get_num_cores(data)
    base_cmd = f'{sambamba} view --format bam --nthreads {num_cores} '
    bam_file = bam_file if bam_file else dd.get_work_bam(data)
    out_stem = os.path.splitext(bam_file)[0]
    split_files = {}
    # we can only split these fractions from paired runs
    if not bam.is_paired(bam_file):
        split_files["full"] = bam_file
        data = tz.assoc_in(data, ['atac', 'align'], split_files)
        return data
    for arange in ATACRanges.values():
        out_file = f"{out_stem}-{arange.label}.bam"
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                cmd = base_cmd + \
                    f'-F "template_length > {arange.min} and template_length < {arange.max}" ' + \
                    f'{bam_file} > {tx_out_file}'
                message = f'Splitting {arange.label} regions from {bam_file}.'
                do.run(cmd, message)
        bam.index(out_file, dd.get_config(data))
        split_files[arange.label] = out_file
    split_files["full"] = bam_file
    data = tz.assoc_in(data, ['atac', 'align'], split_files)
    return data

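# ATACRanges is referenced above but not defined in this listing. A hypothetical
# illustration (not bcbio's actual definition) of the kind of structure split_ATAC
# expects: a mapping of fraction labels to named ranges with label/min/max fields.
# The boundary values below are placeholders chosen for illustration only.
from collections import namedtuple

ATACRange = namedtuple("ATACRange", ["label", "min", "max"])
EXAMPLE_ATAC_RANGES = {
    "NF": ATACRange("NF", -125, 125),  # nucleosome-free fragments
    "MN": ATACRange("MN", 150, 300),   # mono-nucleosome fragments
    "DN": ATACRange("DN", 300, 500),   # di-nucleosome fragments
    "TN": ATACRange("TN", 500, 700),   # tri-nucleosome fragments
}
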
def filter_multimappers(align_file, data):
    """
    Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = ('-F "not unmapped {paired_filter} and not duplicate and '
                   '[XA] == null and [SA] == null and not supplementary " ')
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file

def create_ataqv_report(samples):
    """
    make the ataqv report from a set of ATAC-seq samples
    """
    data = samples[0][0]
    new_samples = []
    reportdir = os.path.join(dd.get_work_dir(data), "qc", "ataqv")
    sentinel = os.path.join(reportdir, "index.html")
    if utils.file_exists(sentinel):
        ataqv_output = {"base": sentinel, "secondary": get_ataqv_report_files(reportdir)}
        new_data = []
        for data in dd.sample_data_iterator(samples):
            data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
            new_data.append(data)
        return dd.get_samples_from_datalist(new_data)
    mkarv = config_utils.get_program("mkarv", dd.get_config(data))
    ataqv_files = []
    for data in dd.sample_data_iterator(samples):
        qc = dd.get_summary_qc(data)
        ataqv_file = tz.get_in(("ataqv", "base"), qc, None)
        if ataqv_file and utils.file_exists(ataqv_file):
            ataqv_files.append(ataqv_file)
    if not ataqv_files:
        return samples
    ataqv_json_file_string = " ".join(ataqv_files)
    with file_transaction(reportdir) as txreportdir:
        cmd = f"{mkarv} {txreportdir} {ataqv_json_file_string}"
        message = f"Creating ataqv report from {ataqv_json_file_string}."
        do.run(cmd, message)
    new_data = []
    ataqv_output = {"base": sentinel, "secondary": get_ataqv_report_files(reportdir)}
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
        new_data.append(data)
    return dd.get_samples_from_datalist(new_data)

def gatk_filter_rnaseq(vrn_file, data):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["VariantFiltration",
                      "-R", ref_file,
                      "-V", vrn_file,
                      "--cluster-window-size", "35",
                      "--cluster-size", "3",
                      "--filter-expression", "'FS > 30.0'",
                      "--filter-name", "FS",
                      "--filter-expression", "'QD < 2.0'",
                      "--filter-name", "QD",
                      "--output", tx_out_file]
            # Use GATK4 for filtering, tools_off is for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config, os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config), "Filter RNA-seq variants.")
    return out_file

def filter_multimappers(align_file, data):
    """
    Bowtie2 does not seem to have an equivalent of bowtie's -m 1 flag; the
    options that come close do not do the same thing. Bowtie2 sets the XS flag
    for reads mapping in more than one place, so we can just filter on that.
    This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter} and not duplicate" '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file

def gatk_filter_rnaseq(vrn_file, data):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["VariantFiltration",
                      "-R", ref_file,
                      "-V", vrn_file,
                      "--cluster-window-size", "35",
                      "--cluster-size", "3",
                      "--filter-expression", "'FS > 30.0'",
                      "--filter-name", "FS",
                      "--filter-expression", "'QD < 2.0'",
                      "--filter-name", "QD",
                      "--output", tx_out_file]
            jvm_opts = broad.get_gatk_opts(dd.get_config(data), os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params), "Filter RNA-seq variants.")
    return out_file

def filter_multimappers(align_file, data):
    """
    Bowtie2 does not seem to have an equivalent of bowtie's -m 1 flag; the
    options that come close do not do the same thing. Bowtie2 sets the XS flag
    for reads mapping in more than one place, so we can just filter on that.
    This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    if file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter}"'
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    return out_file

def filter_multimappers(align_file, data):
    """
    Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = ('-F "not unmapped {paired_filter} and [XA] == null and '
                   '[SA] == null and not supplementary " ')
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file

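# For reference, a sketch of the command line the template above expands to for a
# paired-end BAM. This is a hedged illustration; the file names, core count and
# region BED below are made-up placeholders:
#
#   sambamba view -h --nthreads 8 -f bam -L callable.bed \
#     -F "not unmapped and paired and proper_pair and [XA] == null and [SA] == null and not supplementary " \
#     sample.bam > sample.unique.bam
#
# The -F expression keeps mapped, properly paired primary alignments that carry
# neither an XA (alternative hit) nor an SA (supplementary/chimeric) tag.
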
def gatk_rnaseq_calling(data):
    """
    use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".gvcf"
    num_cores = dd.get_num_cores(data)
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "HaplotypeCaller",
                  "-R", ref_file,
                  "-I", split_bam,
                  "-o", tx_out_file,
                  "-nct", str(num_cores),
                  "--emitRefConfidence", "GVCF",
                  "--variant_index_type", "LINEAR",
                  "--variant_index_parameter", "128000",
                  "-dontUseSoftClippedBases",
                  "-stand_call_conf", "20.0",
                  "-stand_emit_conf", "20.0"]
        broad_runner.run_gatk(params)
    data = dd.set_vrn_file(data, out_file)
    return data

def _gtf_to_fasta(gtf_file, ref_file, out_file, data=None):
    gtf_to_fasta = config_utils.get_program("gtf_to_fasta", dd.get_config(data))
    with file_transaction(data, out_file) as tx_gtf_fa:
        cmd = "{gtf_to_fasta} {gtf_file} {ref_file} {tx_gtf_fa}"
        message = "Extracting genomic sequences of {gtf_file}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file

def includes_missingalt(data):
    """
    As of GATK 4.1.0.0, variants with missing alts are generated
    (see https://github.com/broadinstitute/gatk/issues/5650)
    """
    MISSINGALT_VERSION = LooseVersion("4.1.0.0")
    version = LooseVersion(broad.get_gatk_version(config=dd.get_config(data)))
    return version >= MISSINGALT_VERSION

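# A tiny illustration (not part of bcbio) of the LooseVersion comparison used
# above: distutils-style version objects compare componentwise, so any GATK
# release at or beyond 4.1.0.0 is treated as emitting missing-alt records.
from distutils.version import LooseVersion

assert LooseVersion("4.1.9.0") >= LooseVersion("4.1.0.0")
assert not (LooseVersion("4.0.12.0") >= LooseVersion("4.1.0.0"))
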
def get_star_version(data):
    star_path = config_utils.get_program("STAR", dd.get_config(data))
    cmd = "%s --version" % star_path
    # universal_newlines=True gives text output so the "STAR_" substring check
    # works under Python 3
    subp = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT, shell=True,
                            universal_newlines=True)
    with contextlib.closing(subp.stdout) as stdout:
        for line in stdout:
            if "STAR_" in line:
                version = line.split("STAR_")[1].strip()
    return version

def trim_adapters(data):
    to_trim = [x for x in data["files"] if x is not None and is_fastq(x)]
    if not to_trim:
        return data["files"]
    logger.info("Trimming low quality ends and read through adapter "
                "sequence from %s." % (", ".join(to_trim)))
    out_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "trimmed"))
    config = dd.get_config(data)
    name = dd.get_sample_name(data)
    return _trim_adapters(to_trim, out_dir, name, config)

def _index_spikein(fasta, out_dir, data, kmer=31):
    out_dir = safe_makedir(os.path.join(out_dir, "index"))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmer} -p {num_cores} -i {tx_out_dir} -t {fasta}"
        message = "Creating Salmon index for {fasta}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir

def rapmap_pseudoindex(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "pseudoindex", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    gtf_fa = sailfish._create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(out_dir + "rapidx.jfhash"):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} pseudoindex -k 31 -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating rapmap pseudoindex for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir

def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    gtf_fa = sailfish._create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    ### TODO PUT MEMOIZATION HERE
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir

def run_pizzly(data):
    work_dir = dd.get_work_dir(data)
    pizzlydir = os.path.join(work_dir, "pizzly")
    samplename = dd.get_sample_name(data)
    gtf = dd.get_gtf_file(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    fraglength = get_fragment_length(data)
    cachefile = os.path.join(pizzlydir, "pizzly.cache")
    fusions = kallisto.get_kallisto_fusions(data)
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    outdir = pizzly(pizzlypath, gtf, gtf_fa, fraglength, cachefile, pizzlydir,
                    fusions, samplename, data)
    return outdir

def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(out_dir + "rapidx.jfhash"):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k 31 -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating rapmap {index_type} for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir

def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir