def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    report_file = os.path.join(out_dir, "qualimapReport.html")
    raw_file = os.path.join(out_dir, "rnaseq_qc_results.txt")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        utils.safe_makedir(out_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file,
                                   single_end, library)
        do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
        cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
            dd.get_sample_name(data), raw_file)
        do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, out_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average insert size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
def _detect_rRNA(data, out_dir):
    out_file = os.path.join(out_dir, "rRNA_metrics.txt")
    if not utils.file_exists(out_file):
        gtf_file = dd.get_gtf_file(data)
        quant = tz.get_in(["quant", "tsv"], data)
        if not quant:
            salmon_dir = dd.get_salmon_dir(data)
            if salmon_dir:
                quant = os.path.join(salmon_dir, "quant", "quant.sf")
        logger.info("Calculating RNA-seq rRNA metrics for %s." % quant)
        rrna_features = gtf.get_rRNA(gtf_file)
        transcripts = set([x[1] for x in rrna_features if x])
        if not (transcripts and quant and utils.file_exists(quant)):
            return {'rRNA': "NA", "rRNA_rate": "NA"}
        sample_table = pd.read_csv(quant, sep="\t")
        rrna_exp = list(map(float, sample_table[sample_table["Name"].isin(transcripts)]["NumReads"]))
        total_exp = list(map(float, sample_table["NumReads"]))
        rrna = sum(rrna_exp)
        if sum(total_exp) == 0:
            rrna_rate = "NA"
        else:
            rrna_rate = float(rrna) / sum(total_exp)
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write(",".join(["rRNA", str(rrna)]) + "\n")
                out_handle.write(",".join(["rRNA_rate", str(rrna_rate)]) + "\n")
    return _read_memoized_rrna(out_file)
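# _read_memoized_rrna is referenced above but not defined in this listing.
# A minimal sketch of what it could look like, assuming the two-line
# "key,value" format written by _detect_rRNA above; the name comes from the
# caller, the body is an illustration rather than the project's actual code.
def _read_memoized_rrna(out_file):
    """Read back the cached rRNA metrics written by _detect_rRNA."""
    metrics = {}
    with open(out_file) as in_handle:
        for line in in_handle:
            key, value = line.strip().split(",")
            metrics[key] = value
    return metrics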
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        to_run.append("gemini")
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = ("unset R_HOME && export PATH=%s:$PATH && "
               % os.path.dirname(Rscript_cmd()))
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    with file_transaction(out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
               "-N {sample} -b {bamfile} {opts} {bed_file} "
               "| {strandbias}"
               "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
               "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
               "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
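# The "-c 1 -S 2 -E 3 -g 4" options above tell VarDict which columns of the
# input BED hold the chromosome, region start, region end and gene name, so
# they are expected to line up with the first four columns gtf_to_bed emits.
# Illustrative rows only, not real data:
#
#   chr1    11868   14409   DDX11L1
#   chr1    14403   29570   WASH7P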
def _create_combined_fasta(data, out_dir):
    """
    if there are genomes to be disambiguated, create a FASTA file of
    all of the transcripts for all genomes
    """
    items = disambiguate.split([data])
    fasta_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_gtf_file(odata)
        ref_file = dd.get_ref_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + ".fa")
        if file_exists(out_file):
            fasta_files.append(out_file)
        else:
            out_file = _gtf_to_fasta(gtf_file, ref_file, out_file)
            out_file = _clean_gtf_fa(out_file, out_file)
            fasta_files.append(out_file)
    out_stem = os.path.join(out_dir, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    combined_file = out_stem + ".fa"
    if file_exists(combined_file):
        return combined_file
    fasta_file_string = " ".join(fasta_files)
    cmd = "cat {fasta_file_string} > {tx_out_file}"
    with file_transaction(combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining transcriptome FASTA files.")
    return combined_file
def _detect_rRNA(data):
    gtf_file = dd.get_gtf_file(data)
    count_file = dd.get_count_file(data)
    rrna_features = gtf.get_rRNA(gtf_file)
    genes = [x[0] for x in rrna_features if x]
    count_table = pd.read_csv(count_file, sep="\t", names=["id", "counts"])
    # .ix is gone from modern pandas; select the rRNA genes by id instead
    return {'rRNA': sum(count_table[count_table["id"].isin(genes)]["counts"])}
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    if file_exists(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {in_bam}")
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, count_file) as tx_count_file:
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    os.rename(fixed_count_file, count_file)
    return count_file
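# _paired_flag and _strand_flag are helpers referenced above but not shown
# in this listing. A minimal sketch of plausible implementations, assuming
# bcbio's strandedness vocabulary and featureCounts' conventions (-p counts
# fragments for paired-end data; -s takes 0 for unstranded, 1 for stranded,
# 2 for reversely stranded); treat this as an illustration, not the
# project's actual code.
def _paired_flag(bam_file):
    """Return "-p" when counting fragments from a paired-end BAM."""
    return "-p" if bam.is_paired(bam_file) else ""

def _strand_flag(data):
    """Map bcbio strandedness onto the featureCounts -s argument."""
    strand_flag = {"unstranded": "0",
                   "firststrand": "2",
                   "secondstrand": "1"}
    return strand_flag[dd.get_strandedness(data)]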
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = dd.get_work_bam(data)
    if data["work_bam_rep"] != "":
        rep_bam = data["work_bam_rep"]
    else:
        rep_bam = ""
    input_bam = data["work_bam_input"]
    caller_fn = get_callers()[data["rmats_fn"]]
    name = dd.get_sample_name(data)
    fastq_file = fastq.get_fastq_files(data)
    read_len = bam.fastq.estimate_read_length(fastq_file[0])
    # bucket the estimated read length into the supported sizes, keeping
    # reads of exactly 50 or 75 bp from falling through to the 100 bucket
    if read_len <= 50:
        read_len = 50
    elif read_len <= 75:
        read_len = 75
    else:
        read_len = 100
    if len(fastq_file) > 1:
        read_pair = "paired"
    else:
        read_pair = "single"
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              data["rmats_fn"], name))
    out_file = caller_fn(name, chip_bam, rep_bam, input_bam,
                         dd.get_gtf_file(data), out_dir, read_len,
                         read_pair, data["config"])
    data["rmats_file"] = out_file
    return [[data]]
def _get_ericscript_db(self, data):
    transcript_file = dd.get_gtf_file(data)
    if transcript_file and os.path.exists(transcript_file):
        transcript_dir = os.path.dirname(transcript_file)
        ericscript_dirs = glob.glob(os.path.join(transcript_dir, "ericscript",
                                                 "ericscript_db*"))
        if ericscript_dirs:
            return sorted(ericscript_dirs)[-1]
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        with file_transaction(data, results_dir) as tx_out_dir:
            utils.safe_makedir(tx_out_dir)
            raw_file = os.path.join(tx_out_dir, "rnaseq_qc_results.txt")
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_out_dir, gtf_file,
                                       single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), raw_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
def get_known_splicesites_file(align_dir, data):
    gtf_file = dd.get_gtf_file(data)
    splicesites = os.path.join(os.path.dirname(gtf_file),
                               "ref-transcripts-splicesites.txt")
    if not file_exists(splicesites):
        splicesites = create_splicesites_file(gtf_file, align_dir, data)
    return splicesites
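# create_splicesites_file is referenced above but not defined in this
# listing. A sketch of one way to implement it, assuming the
# hisat2_extract_splice_sites.py script bundled with HISAT2 is on the PATH;
# only the helper's name comes from the caller, the body is illustrative.
def create_splicesites_file(gtf_file, align_dir, data):
    """Extract known splice sites from a GTF with HISAT2's bundled script."""
    out_file = os.path.join(align_dir, "ref-transcripts-splicesites.txt")
    if file_exists(out_file):
        return out_file
    safe_makedir(align_dir)
    cmd = "hisat2_extract_splice_sites.py {gtf_file} > {tx_out_file}"
    with file_transaction(data, out_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Creating splice sites file for %s." % gtf_file)
    return out_file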
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    filtered_bam = bam.filter_primary(sorted_bam, data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    return count_file
def run_stringtie_expression(data):
    """
    estimate expression from Stringtie, using the bcbio datadict
    does not do transcriptome assembly
    """
    bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    sample_name = dd.get_sample_name(data)
    out_dir = os.path.join("stringtie", sample_name)
    isoform_fpkm = os.path.join(out_dir, sample_name + ".isoform.fpkm")
    gene_fpkm = os.path.join(out_dir, sample_name + ".fpkm")
    if file_exists(isoform_fpkm) and file_exists(gene_fpkm):
        data = dd.set_cufflinks_dir(data, out_dir)
        data = dd.set_fpkm(data, gene_fpkm)
        data = dd.set_fpkm_isoform(data, isoform_fpkm)
        return data
    with file_transaction(data, out_dir) as tx_out_dir:
        exon_file, transcript_file = _stringtie_expression(bam, gtf_file,
                                                           num_cores, tx_out_dir)
        df = _parse_ballgown(transcript_file)
        _write_fpkms(df, tx_out_dir, sample_name)
    data = dd.set_cufflinks_dir(data, out_dir)
    data = dd.set_fpkm(data, gene_fpkm)
    data = dd.set_fpkm_isoform(data, isoform_fpkm)
    return data
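# _parse_ballgown is referenced above but not defined in this listing. A
# minimal sketch, assuming StringTie's ballgown t_data.ctab layout
# (tab-separated, with t_name, gene_id, gene_name and FPKM among its
# columns); illustrative only.
def _parse_ballgown(transcript_file):
    """Load the ballgown transcript table produced by stringtie -b."""
    return pd.read_csv(transcript_file, sep="\t", header=0)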
def run(data):
    """Quantitative isoform expression by eXpress"""
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    config = data['config']
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    express = config_utils.get_program("express", data['config'])
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(out_dir) as tx_out_dir:
                bam_file = _prepare_bam_file(in_bam, tmp_dir, config)
                cmd = ("{express} --no-update-check -o {tx_out_dir} {strand} "
                       "{gtf_fasta} {bam_file}")
                do.run(cmd.format(**locals()), "Run express on %s." % in_bam, {})
        shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    eff_count_file = _get_column(out_file, out_file.replace(".xprs", "_eff.counts"), 7)
    tpm_file = _get_column(out_file, out_file.replace("xprs", "tpm"), 14)
    fpkm_file = _get_column(out_file, out_file.replace("xprs", "fpkm"), 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for
    eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33." % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, in_qual="fastq-illumina", data=data)
        if pair_file:
            pair_file = fastq.groom(pair_file, in_qual="fastq-illumina", data=data)
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} | samtools view -bhS - > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    cmd = ("{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
           "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    if dd.get_analysis(data).lower() == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks can use
    if dd.get_assemble_transcripts(data):
        cmd += "--dta-cufflinks "
    if dd.get_analysis(data).lower() == "rna-seq":
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        cmd += "--known-splicesite-infile {splicesites} "
    message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
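# _get_stranded_flag is referenced above but not defined in this listing. A
# sketch assuming HISAT2's --rna-strandness vocabulary (F/R for single-end,
# FR/RF for paired-end) and bcbio's strandedness keys; illustrative, not the
# project's actual code.
def _get_stranded_flag(data, paired):
    strand_map = {"firststrand": "RF" if paired else "R",
                  "secondstrand": "FR" if paired else "F"}
    flag = strand_map.get(dd.get_strandedness(data))
    return "--rna-strandness %s" % flag if flag else ""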
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table
    with all samples
    """
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    dexseq_gff = dd.get_dexseq_gff(samples[0][0])
    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)
    # combine Cufflinks files
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0])
                                        for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)
    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)
    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
               "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
               "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
               "--outStd SAM {srna_opts} "
               "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
        cmd += _read_group_option(names)
        fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
        if fusion_mode:
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 "
                    "--chimOutType WithinSAM ")
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)
    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
def run_salmon_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        salmon_dir = os.path.join(work_dir, "salmon")
        gtf_file = dd.get_gtf_file(data)
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        salmon_index(gtf_file, fasta_file, data, salmon_dir)
    return samples
def run_rapmap_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        rapmap_dir = os.path.join(work_dir, "rapmap")
        gtf_file = dd.get_gtf_file(data)
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        rapmap_index(gtf_file, fasta_file, "quasi", data, rapmap_dir)
    return samples
def tagcount(data):
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)
    if gtf_file:
        gene_map_file = os.path.join(dd.get_work_dir(data), "annotation",
                                     os.path.splitext(gtf_file)[0] + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""
    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [umi_matrix_file, umi_matrix_file + ".rownames",
                  umi_matrix_file + ".colnames"]
    if has_umi_matrix(data):
        umi_matrix_flag = " --umi_matrix {tx_umi_matrix_full} "
    else:
        umi_matrix_flag = ""
    cmd += umi_matrix_flag
    cmd += " {bam} {tx_out_file_full}"
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_out_files[3] + ".full"
        do.run(cmd.format(**locals()), message)
        cmd = "{umis} sparse {tx_out_file_full} {tx_out_file}"
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)
        if has_umi_matrix(data):
            cmd = "{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}"
            message = "Converting %s to sparse format." % tx_umi_matrix_full
            do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
def run_kallisto_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        kallisto_dir = os.path.join(work_dir, "kallisto")
        gtf_file = dd.get_gtf_file(data)
        assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        kallisto_index(gtf_file, fasta_file, data, kallisto_dir)
    return samples
def _get_files(data):
    mapped = bam.mapped(data["work_bam"], data["config"])
    in_file = bam.sort(mapped, data["config"], order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    sample_name = dd.get_sample_name(data)
    out_file = os.path.join(out_dir, sample_name + ".counts")
    stats_file = os.path.join(out_dir, sample_name + ".stats")
    return in_file, gtf_file, out_file, stats_file
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
           "--outStd SAM {srna_opts} "
           "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
    cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file)
    cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded" and not srna:
        cmd += " --outSAMstrandField intronMotif "
    if dd.get_transcriptome_align(data) and not is_transcriptome_broken(data):
        cmd += " --quantMode TranscriptomeSAM "
    with file_transaction(data, final_out) as tx_final_out:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)
    data = _update_data(final_out, out_dir, names, data)
    return data
def _detect_rRNA(data):
    gtf_file = dd.get_gtf_file(data)
    count_file = dd.get_count_file(data)
    rrna_features = gtf.get_rRNA(gtf_file)
    genes = [x[0] for x in rrna_features if x]
    if not genes:
        return {'rRNA': "NA", "rRNA_rate": "NA"}
    count_table = pd.read_csv(count_file, sep="\t", names=["id", "counts"])
    rrna = sum(count_table[count_table["id"].isin(genes)]["counts"])
    rrna_rate = float(rrna) / sum(count_table["counts"])
    return {'rRNA': str(rrna), 'rRNA_rate': str(rrna_rate)}
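# The count file read above is assumed to be a headerless, two-column,
# tab-separated table of gene id and read count, matching the
# names=["id", "counts"] call. Illustrative rows, not real data:
#
#   ENSG00000243485    0
#   ENSG00000278267    12
#   5S_rRNA            1543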
def run_sailfish_index(*samples):
    samples = [utils.to_single_data(x) for x in samples]
    Build = namedtuple('Build', ['build', 'ref', 'gtf'])
    builds = {Build(get_build_string(x), dd.get_ref_file(x), dd.get_gtf_file(x))
              for x in samples}
    data = samples[0]
    indexdirs = {}
    for build in builds:
        indexdirs[build.build] = sailfish_index(build.ref, build.gtf, data,
                                                build.build)
    return [[x] for x in samples]
def run_salmon_bam(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    bam_file = dd.get_transcriptome_bam(data)
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_bam(bam_file, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    return [[data]]
def run_sailfish_index(*samples):
    fq1, _ = dd.get_input_sequence_files(samples[0][0])
    kmer_size = estimate_kmer_size(fq1)
    Build = namedtuple('Build', ['build', 'ref', 'gtf'])
    builds = {Build(get_build_string(x), dd.get_ref_file(x), dd.get_gtf_file(x))
              for x in dd.sample_data_iterator(samples)}
    data = samples[0][0]
    indexdirs = {}
    for build in builds:
        indexdirs[build.build] = sailfish_index(build.ref, build.gtf, data,
                                                build.build, kmer_size)
    return samples
def stringtie_merge(*samples):
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data) for data
                                       in dd.sample_data_iterator(samples)]))
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = stringtie.merge(to_merge, ref_file, gtf_file, num_cores, data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
def run_salmon_bam(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    bam_file = dd.get_transcriptome_bam(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = salmon_quant_bam(bam_file, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
def index(ref_file, out_dir, data):
    """Create a bismark index in the defined reference directory.
    """
    (ref_dir, local_file) = os.path.split(ref_file)
    gtf_file = dd.get_gtf_file(data)
    bismark = config_utils.find_program("bismark", data["config"])
    if not utils.file_exists(gtf_file):
        raise ValueError("%s not found, could not create a bismark index." % (gtf_file))
    if not utils.file_exists(out_dir):
        with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
            num_cores = dd.get_cores(data)
            cmd = ("{bismark} --bowtie2 -p {num_cores} -n 1 -o {tx_out_dir} "
                   "--basename {sample} --unmapped {ref_file} {in_fastq}")
            do.run(cmd.format(**locals()), "Index bismark")
            if os.path.exists(out_dir):
                shutil.rmtree(out_dir)
            shutil.move(tx_out_dir, out_dir)
    return out_dir
def run_rapmap_pseudoalign(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    rapmap_dir = os.path.join(work_dir, "rapmap", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file, fasta_file, data)
    data = dd.set_work_bam(data, out_file)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def run_sailfish(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    sailfish_dir = os.path.join(work_dir, "sailfish", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    stranded = dd.get_strandedness(data).lower()
    out_file = sailfish(fq1, fq2, sailfish_dir, gtf_file, fasta_file, stranded, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, sailfish_dir)
    return [[data]]
def _detect_rRNA(data):
    sample = dd.get_sample_name(data)
    gtf_file = dd.get_gtf_file(data)
    tidy_file = dd.get_sailfish_tidy(data)
    rrna_features = gtf.get_rRNA(gtf_file)
    transcripts = set([x[1] for x in rrna_features if x])
    if not (transcripts and tidy_file):
        return {'rRNA': "NA", "rRNA_rate": "NA"}
    count_table = pd.read_csv(tidy_file, sep="\t")
    sample_table = count_table[count_table["sample"].isin([sample])]
    # materialize the map generators so the totals can be summed more than once
    rrna_exp = list(map(float, sample_table[sample_table["id"].isin(transcripts)]["numreads"]))
    total_exp = list(map(float, sample_table["numreads"]))
    rrna = sum(rrna_exp)
    if sum(total_exp) == 0:
        return {'rRNA': str(rrna), 'rRNA_rate': "NA"}
    rrna_rate = float(rrna) / sum(total_exp)
    return {'rRNA': str(rrna), 'rRNA_rate': str(rrna_rate)}
def cufflinks_merge(*samples):
    to_merge = set(filter_missing(flatten([dd.get_assembled_gtf(data) for data
                                           in dd.sample_data_iterator(samples)])))
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores,
                                 samples[0][0])
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
def index(ref_file, out_dir, data):
    """Create a STAR index in the defined reference directory.
    """
    (ref_dir, local_file) = os.path.split(ref_file)
    gtf_file = dd.get_gtf_file(data)
    if not utils.file_exists(gtf_file):
        raise ValueError("%s not found, could not create a star index." % (gtf_file))
    if not utils.file_exists(out_dir):
        with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
            num_cores = dd.get_cores(data)
            cmd = ("STAR --genomeDir {tx_out_dir} --genomeFastaFiles {ref_file} "
                   "--runThreadN {num_cores} "
                   "--runMode genomeGenerate --sjdbOverhang 99 --sjdbGTFfile {gtf_file}")
            do.run(cmd.format(**locals()), "Index STAR")
            if os.path.exists(out_dir):
                shutil.rmtree(out_dir)
            shutil.move(tx_out_dir, out_dir)
    return out_dir
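# --sjdbOverhang is pinned to 99 above; the STAR manual recommends read
# length minus one, with 100 given as a near-universal default. A sketch of
# deriving it from the data instead (estimate_read_length as used with
# bam.fastq elsewhere in this listing; treat the wiring as an assumption):
#
#   read_len = bam.fastq.estimate_read_length(fastq_file)
#   sjdb_overhang = max(read_len - 1, 1)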
def run_salmon_reads(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, salmon_dir)
    return [[data]]
def run_ataqv(data):
    if not dd.get_chip_method(data) == "atac":
        return None
    work_dir = dd.get_work_dir(data)
    sample_name = dd.get_sample_name(data)
    out_dir = os.path.join(work_dir, "qc", sample_name, "ataqv")
    peak_file = get_full_peaks(data)
    bam_file = get_unfiltered_bam(data)
    out_file = os.path.join(out_dir, sample_name + ".ataqv.json.gz")
    if not peak_file:
        logger.info(f"Full peak file for {sample_name} not found, skipping ataqv")
        return None
    if not bam_file:
        logger.info(f"Unfiltered BAM file for {sample_name} not found, skipping ataqv")
        return None
    if utils.file_exists(out_file):
        return out_file
    tss_bed_file = os.path.join(out_dir, "TSS.bed")
    tss_bed_file = gtf.get_tss_bed(dd.get_gtf_file(data), tss_bed_file, data,
                                   padding=0)
    if chromhacks.is_human(data):
        organism = "human"
        autosomal_reference_flag = ""
    elif chromhacks.is_mouse(data):
        organism = "mouse"
        autosomal_reference_flag = ""
    else:
        autosomal_reference = os.path.join(out_dir, "autosomal.txt")
        autosomal_reference = _make_autosomal_reference_file(autosomal_reference, data)
        organism = "None"
        autosomal_reference_flag = f"--autosomal-reference-file {autosomal_reference} "
    ataqv = config_utils.get_program("ataqv", data)
    mitoname = chromhacks.get_mitochondrial_chroms(data)[0]
    if not ataqv:
        logger.info("ataqv executable not found, skipping running ataqv.")
        return None
    with file_transaction(out_file) as tx_out_file:
        cmd = (f"{ataqv} --peak-file {peak_file} --name {sample_name} "
               f"--metrics-file {tx_out_file} "
               f"--tss-file {tss_bed_file} {autosomal_reference_flag} "
               f"--ignore-read-groups --mitochondrial-reference-name {mitoname} "
               f"--tss-extension 1000 "
               f"{organism} {bam_file}")
        message = f"Running ataqv on {sample_name}."
        do.run(cmd, message)
    return out_file
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq") or analysis == "smallrna-seq":
        if "qualimap" not in dd.get_tools_off(data):
            if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
                to_run.append("qualimap_rnaseq")
            else:
                logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
        to_run.append("atropos")
    if "coverage_qc" not in dd.get_tools_off(data):
        to_run.append("samtools")
    if analysis.startswith(("standard", "variant", "variant2")):
        if "coverage_qc" not in dd.get_tools_off(data):
            to_run += ["coverage", "picard"]
        to_run += ["qsignature", "variants"]
        if peddy.is_human(data):
            to_run += ["peddy"]
        if vcfutils.get_paired([data]):
            to_run += ["viral"]
        if damage.should_filter([data]):
            to_run += ["damage"]
    if dd.get_umi_consensus(data):
        to_run += ["umi"]
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")
    return to_run
def run_kallisto_rnaseq(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    assert fq2, ("bcbio doesn't support kallisto for single-end reads, we can "
                 "add support for this if you open up an issue about it here: "
                 "https://github.com/bcbio/bcbio-nextgen/issues")
    out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
def run_kallisto_rnaseq(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename)
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    assert fq2, ("We don't support kallisto for single-end reads and fusion "
                 "calling with pizzly does not accept single end reads.")
    out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    cmd = ("{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} "
           "{stranded_flag} {rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    if dd.get_analysis(data).lower() == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks/stringtie can use
    if dd.get_transcript_assembler(data):
        cmd += "--dta-cufflinks "
    if dd.get_analysis(data).lower() == "rna-seq":
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        if not file_exists(splicesites):
            splicesites = create_splicesites_file(gtf_file, align_dir, data)
        # empty splicesite files means there is no splicing, so skip this option
        # if there is no splicing for this organism
        if file_exists(splicesites):
            cmd += "--known-splicesite-infile {splicesites} "
    # apply additional hisat2 options
    cmd += " ".join(_get_options_from_config(data))
    message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
    with file_transaction(data, out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
def _detect_rRNA(data):
    sample = dd.get_sample_name(data)
    gtf_file = dd.get_gtf_file(data)
    salmon_dir = dd.get_salmon_dir(data)
    quant = os.path.join(salmon_dir, "quant", "quant.sf")
    rrna_features = gtf.get_rRNA(gtf_file)
    transcripts = set([x[1] for x in rrna_features if x])
    if not (transcripts and utils.file_exists(quant)):
        return {'rRNA': "NA", "rRNA_rate": "NA"}
    sample_table = pd.read_csv(quant, sep="\t")
    # materialize the map generators so the totals can be summed more than once
    rrna_exp = list(map(float, sample_table[sample_table["Name"].isin(transcripts)]["NumReads"]))
    total_exp = list(map(float, sample_table["NumReads"]))
    rrna = sum(rrna_exp)
    if sum(total_exp) == 0:
        return {'rRNA': str(rrna), 'rRNA_rate': "NA"}
    rrna_rate = float(rrna) / sum(total_exp)
    return {'rRNA': str(rrna), 'rRNA_rate': str(rrna_rate)}
def index(ref_file, out_dir, data):
    """Create a bismark index in the defined reference directory.
    """
    (ref_dir, local_file) = os.path.split(ref_file)
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    bismark = config_utils.find_program("bismark", data["config"])
    if not utils.file_exists(gtf_file):
        raise ValueError("%s not found, could not create a bismark index." % (gtf_file))
    if not utils.file_exists(out_dir):
        with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
            num_cores = dd.get_cores(data)
            other_opts = config_utils.get_resources("bismark", data["config"]).get("options", [])
            other_opts = " ".join([str(x) for x in other_opts]).strip()
            cmd = ("{bismark} {other_opts} --bowtie2 -p {num_cores} -n 1 -o {tx_out_dir} "
                   "--basename {sample} --unmapped {ref_file} {in_fastq}")
            do.run(cmd.format(**locals()), "Index bismark")
            if os.path.exists(out_dir):
                shutil.rmtree(out_dir)
            shutil.move(tx_out_dir, out_dir)
    return out_dir
def combine_express(samples, combined):
    """Combine tpm, effective counts and fpkm from express results"""
    if not combined:
        return None
    to_combine = [dd.get_express_counts(x) for x in
                  dd.sample_data_iterator(samples) if dd.get_express_counts(x)]
    gtf_file = dd.get_gtf_file(samples[0][0])
    isoform_to_gene_file = os.path.join(os.path.dirname(combined),
                                        "isoform_to_gene.txt")
    isoform_to_gene_file = express.isoform_to_gene_name(
        gtf_file, isoform_to_gene_file, next(dd.sample_data_iterator(samples)))
    if len(to_combine) > 0:
        eff_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_counts"
        eff_counts_combined = count.combine_count_files(to_combine,
                                                        eff_counts_combined_file,
                                                        ext=".counts")
        to_combine = [dd.get_express_tpm(x) for x in
                      dd.sample_data_iterator(samples) if dd.get_express_tpm(x)]
        tpm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_tpm"
        tpm_counts_combined = count.combine_count_files(to_combine,
                                                        tpm_counts_combined_file)
        to_combine = [dd.get_express_fpkm(x) for x in
                      dd.sample_data_iterator(samples) if dd.get_express_fpkm(x)]
        fpkm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_fpkm"
        fpkm_counts_combined = count.combine_count_files(to_combine,
                                                         fpkm_counts_combined_file,
                                                         ext=".fpkm")
        return {'counts': eff_counts_combined, 'tpm': tpm_counts_combined,
                'fpkm': fpkm_counts_combined,
                'isoform_to_gene': isoform_to_gene_file}
    return {}
def sample_summary(bam_file, data, out_dir):
    """Run RNA-SeQC on a single RNAseq sample, writing to specified output directory.
    """
    metrics_file = os.path.join(out_dir, "metrics.tsv")
    if not file_exists(metrics_file):
        with file_transaction(data, out_dir) as tx_out_dir:
            config = data["config"]
            ref_file = data["sam_ref"]
            genome_dir = os.path.dirname(os.path.dirname(ref_file))
            gtf_file = dd.get_gtf_file(data)
            sample_file = os.path.join(safe_makedir(tx_out_dir), "sample_file.txt")
            _write_sample_id_file(data, bam_file, sample_file)
            runner = rnaseqc_runner_from_config(config)
            rna_file = config_utils.get_rRNA_sequence(genome_dir)
            bam.index(bam_file, config)
            single_end = not bam.is_paired(bam_file)
            runner.run(sample_file, ref_file, rna_file, gtf_file, tx_out_dir,
                       single_end)
            # we don't need this large directory for just the report
            shutil.rmtree(os.path.join(tx_out_dir, data["description"]))
    return _parse_rnaseqc_metrics(metrics_file, data["name"][-1])
def run_pizzly(data):
    work_dir = dd.get_work_dir(data)
    pizzlydir = os.path.join(work_dir, "pizzly")
    samplename = dd.get_sample_name(data)
    gtf = dd.get_gtf_file(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    stripped_fa = os.path.splitext(os.path.basename(gtf_fa))[0] + "-noversions.fa"
    stripped_fa = os.path.join(pizzlydir, stripped_fa)
    gtf_fa = fasta.strip_transcript_versions(gtf_fa, stripped_fa)
    fraglength = get_fragment_length(data)
    cachefile = os.path.join(pizzlydir, "pizzly.cache")
    fusions = kallisto.get_kallisto_fusions(data)
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    outdir = pizzly(pizzlypath, gtf, gtf_fa, fraglength, cachefile, pizzlydir,
                    fusions, samplename, data)
    return outdir
def _rnaseq_qualimap(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    ref_file = dd.get_ref_file(data)
    single_end = not bam.is_paired(bam_file)
    if not utils.file_exists(report_file):
        utils.safe_makedir(out_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(config, bam_file, out_dir, gtf_file, single_end)
        do.run(cmd, "Qualimap for {}".format(data["name"][-1]))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, out_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Fragment Length Mean": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
def run_salmon_reads(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
def run(data):
    """Quantitative isoform expression by eXpress"""
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    config = data['config']
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    express = config_utils.get_program("express", data['config'])
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_dir) as tx_out_dir:
                bam_file = _prepare_bam_file(in_bam, tmp_dir, config)
                cmd = ("{express} --no-update-check -o {tx_out_dir} {strand} "
                       "{gtf_fasta} {bam_file}")
                do.run(cmd.format(**locals()), "Run express on %s." % in_bam, {})
        shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    eff_count_file = _get_column(out_file, out_file.replace(".xprs", "_eff.counts"),
                                 7, data=data)
    tpm_file = _get_column(out_file, out_file.replace("xprs", "tpm"), 14, data=data)
    fpkm_file = _get_column(out_file, out_file.replace("xprs", "fpkm"), 10, data=data)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
def estimate_expression(samples, run_parallel):
    samples = run_parallel("generate_transcript_counts", samples)
    combined = count.combine_count_files([x[0]["count_file"] for x in samples
                                          if "count_file" in x[0]])
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    samples = run_parallel("run_cufflinks", samples)
    # gene
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    to_combine = [x[0]["fpkm"] for x in samples if "fpkm" in x[0]]
    fpkm_combined = count.combine_count_files(to_combine, fpkm_combined_file)
    # isoform
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    to_combine_isoform = [x[0]["fpkm_isoform"] for x in samples
                          if "fpkm_isoform" in x[0]]
    fpkm_isoform_combined = count.combine_count_files(to_combine_isoform,
                                                      fpkm_isoform_combined_file,
                                                      ".isoform.fpkm")
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = [dd.get_dexseq_counts(data[0]) for data in samples]
    # drop missing entries; a bare filter() object would always be truthy below
    to_combine_dexseq = [x for x in to_combine_dexseq if x]
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
    else:
        dexseq_combined = None
    for x in samples:
        x[0]["combined_counts"] = combined
        if annotated:
            x[0]["annotated_combined_counts"] = annotated
        if fpkm_combined:
            x[0]["combined_fpkm"] = fpkm_combined
        if fpkm_isoform_combined:
            x[0]["combined_fpkm_isoform"] = fpkm_isoform_combined
        if dexseq_combined:
            x[0] = dd.set_dexseq_counts(x[0], dexseq_combined_file)
    return samples
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bowtie2 with settings for aligning to the transcriptome for
    eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    bowtie2 = config_utils.get_program("bowtie2", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_index = index_transcriptome(gtf_file, ref_file, data)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    pair_cmd = "-2 %s " % pair_file if pair_file else ""
    cmd = ("{bowtie2} -p {num_cores} -a -X 600 --rdg 6,5 --rfg 6,5 "
           "--score-min L,-.6,-.4 --no-discordant --no-mixed "
           "-x {gtf_index} -1 {fastq_file} {pair_cmd} "
           "| samtools view -hbS - > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def run_salmon_decoy(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    index = salmon_decoy_index(gtf_file, data, os.path.dirname(salmon_dir))
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, data, index)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    data = dd.update_summary_qc(data, "salmon",
                                base=dd.get_salmon_fraglen_file(data))
    return [[data]]
def count(data):
    """
    count reads mapping to genes using featureCounts
    falls back on htseq_count method if featureCounts is not found
    """
    in_bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    if file_exists(count_file):
        return count_file
    config = data["config"]
    try:
        featureCounts = config_utils.get_program("featureCounts", config)
    except config_utils.CmdNotFound:
        logger.info("featureCounts not found, falling back to htseq-count "
                    "for feature counting. You can upgrade the tools to "
                    "install featureCount with bcbio_nextgen.py upgrade "
                    "--tools.")
        return htseq_count(data)
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(config)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {in_bam}")
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, count_file) as tx_count_file:
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    os.rename(fixed_count_file, count_file)
    return count_file
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    filtered_bam = bam.filter_primary(sorted_bam, data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    return count_file
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    results_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir, gtf_file,
                                       single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            tx_results_file = os.path.join(tx_results_dir, "rnaseq_qc_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data, results_dir))
    metrics.update({"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    # Qualimap output folder (results_dir) needs to be named after the sample
    # (see comments above). However, in order to keep its name after upload, we
    # need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file),
            "metrics": metrics}
def create_combined_tx2gene(data):
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    items = disambiguate.split([data])
    tx2gene_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_gtf_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + "-tx2gene.csv")
        if file_exists(out_file):
            tx2gene_files.append(out_file)
        else:
            out_file = gtf.tx2genefile(gtf_file, out_file, tsv=False)
            tx2gene_files.append(out_file)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file
    tx2gene_file_string = " ".join(tx2gene_files)
    cmd = "cat {tx2gene_file_string} > {tx_out_file}"
    with file_transaction(data, combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining tx2gene CSV files.")
    return combined_file
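# With tsv=False, gtf.tx2genefile is expected to write comma-separated
# transcript-to-gene pairs, so the combined tx2gene.csv looks like
# (illustrative ids only):
#
#   ENST00000456328,ENSG00000223972
#   ENST00000450305,ENSG00000223972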
def _stringtie_expression(bam, data, out_dir="."):
    """
    only estimate expression with StringTie, do not assemble new transcripts
    """
    gtf_file = dd.get_transcriptome_gtf(data, dd.get_gtf_file(data))
    num_cores = dd.get_num_cores(data)
    error_message = "The %s file for %s is missing. StringTie has an error."
    stringtie = config_utils.get_program("stringtie", data, default="stringtie")
    # don't assemble transcripts unless asked
    exp_flag = ("-e" if "stringtie" not in dd.get_transcript_assembler(data)
                else "")
    base_cmd = ("{stringtie} {exp_flag} -b {out_dir} -p {num_cores} -G {gtf_file} "
                "-o {out_gtf} {bam}")
    transcript_file = os.path.join(out_dir, "t_data.ctab")
    exon_file = os.path.join(out_dir, "e_data.ctab")
    out_gtf = os.path.join(out_dir, "stringtie-assembly.gtf")
    if file_exists(transcript_file):
        return exon_file, transcript_file, out_gtf
    cmd = base_cmd.format(**locals())
    do.run(cmd, "Running Stringtie on %s." % bam)
    assert file_exists(exon_file), error_message % ("exon", exon_file)
    assert file_exists(transcript_file), error_message % ("transcript", transcript_file)
    # return the same tuple as the memoized branch above
    return exon_file, transcript_file, out_gtf
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    raw_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        utils.safe_makedir(results_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(data, bam_file, results_dir, gtf_file,
                                   single_end, library)
        do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
        cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
            dd.get_sample_name(data), raw_file)
        do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    resources = config_utils.get_resources("vardict", data)
    if resources.get("options"):
        opts += " ".join([str(x) for x in resources["options"]])
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
               "-N {sample} -b {bamfile} {opts} {bed_file} "
               "| {strandbias}"
               "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
               "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
               "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data