def _maybe_add_transcriptome_alignment(sample, out):
    """Append the sample's transcriptome BAM to ``out`` when it is usable.

    ``out`` is extended in place with a dict describing the BAM and is also
    returned. It comes back untouched when ``sample`` has no transcriptome
    BAM recorded or the recorded path fails the ``utils.file_exists`` check.
    """
    bam_path = dd.get_transcriptome_bam(sample)
    if not bam_path or not utils.file_exists(bam_path):
        return out
    entry = {"path": bam_path, "type": "bam", "ext": "transcriptome"}
    out.append(entry)
    return out
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Steps, in order:

    * store the featureCounts result under ``data["count_file"]``;
    * run oncofuse when fusion mode is on and no fusion caller is configured;
    * when transcriptome alignment is requested, align with bowtie2 either
      from the disambiguated reads or, if no transcriptome BAM is recorded,
      from the input sequence files;
    * add spike-in counts.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)

    wants_oncofuse = dd.get_fusion_mode(data, False) and not dd.get_fusion_caller(data)
    if wants_oncofuse:
        fusion_file = oncofuse.run(data)
        if fusion_file:
            data = dd.set_oncofuse_file(data, fusion_file)

    if dd.get_transcriptome_align(data):
        if not dd.get_disambiguate(data):
            file1, file2 = dd.get_input_sequence_files(data)
        else:
            # A disambiguated transcriptome file needs a fresh bowtie2
            # alignment of the reads extracted from the work BAM.
            logger.info("Aligning to the transcriptome with bowtie2 using the "
                        "disambiguated reads.")
            fastq_paths = alignprep._bgzip_from_bam(
                data["work_bam"], data["dirs"], data,
                is_retry=False, output_infix='-transcriptome')
            if len(fastq_paths) != 2:
                # Single-end: only the first fastq is used.
                file1, file2 = fastq_paths[0], None
            else:
                file1, file2 = fastq_paths
            data = bowtie2.align_transcriptome(file1, file2, dd.get_ref_file(data), data)

        if not dd.get_transcriptome_bam(data):
            reference = dd.get_ref_file(data)
            logger.info("Transcriptome alignment was flagged to run, but the "
                        "transcriptome BAM file was not found. Aligning to the "
                        "transcriptome with bowtie2.")
            data = bowtie2.align_transcriptome(file1, file2, reference, data)

    data = spikein.counts_spikein(data)
    return [[data]]
def run(data):
    """Quantify isoform expression with eXpress.

    Returns ``data`` unchanged when the sample has no transcriptome BAM.
    Otherwise eXpress is run (unless the ``.xprs`` result already exists),
    columns 7, 14 and 10 of the result are extracted with ``_get_column``
    into the ``_eff.counts``, ``.tpm`` and ``.fpkm`` files, and those files
    are recorded on ``data`` before it is returned.
    """
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    config = data['config']
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    # All outputs share this prefix. Building sibling names from it, instead
    # of str.replace("xprs", ...) over the full path, keeps work directories
    # and sample names that happen to contain "xprs" intact.
    out_prefix = os.path.join(out_dir, name)
    out_file = out_prefix + ".xprs"
    express = config_utils.get_program("express", data['config'])
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(out_dir) as tx_out_dir:
                bam_file = _prepare_bam_file(in_bam, tmp_dir, config)
                cmd = ("{express} --no-update-check -o {tx_out_dir} {strand} {gtf_fasta} {bam_file}")
                do.run(cmd.format(**locals()), "Run express on %s." % in_bam, {})
            # Runs after the transaction block has closed: the result is
            # read from out_dir, not from the transactional directory.
            shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    eff_count_file = _get_column(out_file, out_prefix + "_eff.counts", 7)
    tpm_file = _get_column(out_file, out_prefix + ".tpm", 14)
    fpkm_file = _get_column(out_file, out_prefix + ".fpkm", 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Stores the featureCounts result under ``data["count_file"]``, runs
    oncofuse in fusion mode, and aligns to the transcriptome with bowtie2
    when transcriptome alignment is requested but no transcriptome BAM is
    recorded. Reads come from the work BAM when disambiguation is enabled,
    otherwise from the input sequence files.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False):
        fusion_file = oncofuse.run(data)
        if fusion_file:
            data = dd.set_oncofuse_file(data, fusion_file)

    needs_alignment = dd.get_transcriptome_align(data) and not dd.get_transcriptome_bam(data)
    if not needs_alignment:
        return [[data]]

    if dd.get_disambiguate(data):
        fastq_paths = alignprep._bgzip_from_bam(
            data["work_bam"], data["dirs"], data["config"],
            is_retry=False, output_infix='-transcriptome')
        if len(fastq_paths) != 2:
            # Single-end: only the first fastq is used.
            file1, file2 = fastq_paths[0], None
        else:
            file1, file2 = fastq_paths
    else:
        file1, file2 = dd.get_input_sequence_files(data)

    reference = dd.get_ref_file(data)
    logger.info("Transcriptome alignment was flagged to run, but the "
                "transcriptome BAM file was not found. Aligning to the "
                "transcriptome with bowtie2.")
    data = bowtie2.align_transcriptome(file1, file2, reference, data)
    return [[data]]
def tagcount(data):
    """Count UMI-tagged transcriptome alignments into a sparse matrix.

    Runs ``umis fasttagcount`` on the sample's transcriptome BAM, then
    ``umis sparse`` on the result (and on the UMI matrix when
    ``has_umi_matrix(data)`` is true). The ``.mtx`` path is recorded as the
    sample's count file. If that file already exists the commands are skipped.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    bam = dd.get_transcriptome_bam(data)
    sample_dir = os.path.join(dd.get_work_dir(data), "umis", dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        return [[dd.set_count_file(data, out_file)]]

    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = ""
    if dd.get_positional_umi(data, False):
        positional = "--positional"

    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)

    gene_map_flag = ""
    if gtf_file:
        annotation_dir = os.path.join(dd.get_work_dir(data), "annotation")
        gtf_stem = os.path.basename(os.path.splitext(gtf_file)[0])
        gene_map_target = os.path.join(annotation_dir, gtf_stem + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_target, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)

    count_message = "Counting alignments of transcripts in %s." % bam
    umis = _umis_cmd(data)
    count_template = ("{umis} fasttagcount --cb_cutoff {cutoff} "
                      "{gene_map_flag} "
                      "{positional} "
                      "--cb_histogram {cb_histogram}")
    sidecar_suffixes = ("", ".rownames", ".colnames")
    umi_matrix_file = out_prefix + "-dupes.mtx"
    # Indices 0-2 are the count matrix files, 3-5 the UMI ("dupes") ones.
    out_files = ([out_file + suffix for suffix in sidecar_suffixes] +
                 [umi_matrix_file + suffix for suffix in sidecar_suffixes])
    if has_umi_matrix(data):
        count_template += " --umi_matrix {tx_umi_matrix_full} "
    else:
        count_template += ""
    count_template += " {bam} {tx_out_file_full}"

    with file_transaction(out_files) as tx_out_files:
        tx_counts = tx_out_files[0]
        tx_counts_full = tx_counts + ".full"
        tx_dupes = tx_out_files[3]
        tx_dupes_full = tx_dupes + ".full"
        do.run(count_template.format(umis=umis, cutoff=cutoff,
                                     gene_map_flag=gene_map_flag,
                                     positional=positional,
                                     cb_histogram=cb_histogram,
                                     tx_umi_matrix_full=tx_dupes_full,
                                     bam=bam,
                                     tx_out_file_full=tx_counts_full),
               count_message)
        sparse_cmd = "{umis} sparse {tx_out_file_full} {tx_out_file}".format(
            umis=umis, tx_out_file_full=tx_counts_full, tx_out_file=tx_counts)
        do.run(sparse_cmd, "Converting %s to sparse format." % tx_counts_full)
        if has_umi_matrix(data):
            sparse_cmd = "{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}".format(
                umis=umis, tx_umi_matrix_full=tx_dupes_full, tx_umi_matrix=tx_dupes)
            do.run(sparse_cmd, "Converting %s to sparse format." % tx_dupes_full)

    data = dd.set_count_file(data, out_file)
    return [[data]]
def run(data):
    """Quantify isoform expression with eXpress.

    Returns ``None`` when either the tophat transcriptome index or the
    transcriptome BAM is missing. Otherwise returns the tuple
    ``(eff_count_file, tpm_file, fpkm_file)`` built from columns 7, 14 and
    10 of the eXpress result via ``_get_column``.
    """
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    tophat_index = get_in(data, ('genome_resources', 'rnaseq', 'transcriptome_index', 'tophat'))
    if not tophat_index:
        logger.info("Tophat index not found, skipping running eXpress.")
        return None
    # Swap only the trailing ".ver" extension for ".fa". A blanket
    # replace("ver", "fa") would also rewrite directory names such as
    # "server" or "version" earlier in the path.
    if tophat_index.endswith(".ver"):
        tophat_fa = tophat_index[:-len(".ver")] + ".fa"
    else:
        # Unexpected index name: keep the historical behaviour.
        tophat_fa = tophat_index.replace("ver", "fa")
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    # Sibling outputs are derived from this prefix rather than by replacing
    # "xprs" anywhere in the full path.
    out_prefix = os.path.join(out_dir, name)
    out_file = out_prefix + ".xprs"
    safe_makedir(out_dir)
    express = config_utils.get_program("express", data['config'])
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return None
    if not file_exists(out_file):
        with tx_tmpdir() as tmp_dir:
            # NOTE(review): "results.xprs" is moved via a relative path
            # below, so this presumably relies on chdir() switching the
            # working directory to tmp_dir -- confirm which chdir is imported.
            chdir(tmp_dir)
            ref_transcript = _do_fasta(tophat_fa)
            cmd = ("{express} {ref_transcript} {in_bam}")
            do.run(cmd.format(**locals()), "Run express", {})
            shutil.move("results.xprs", out_file)
    eff_count_file = _get_column(out_file, out_prefix + "_eff.counts", 7)
    tpm_file = _get_column(out_file, out_prefix + ".tpm", 14)
    fpkm_file = _get_column(out_file, out_prefix + ".fpkm", 10)
    return (eff_count_file, tpm_file, fpkm_file)
def run(data):
    """Quantify isoform expression with eXpress.

    Returns ``data`` unchanged when the sample has no transcriptome BAM.
    Otherwise eXpress is run (unless the ``.xprs`` result already exists),
    columns 7, 14 and 10 of the result are extracted with ``_get_column``
    into the ``_eff.counts``, ``.tpm`` and ``.fpkm`` files, and those files
    are recorded on ``data`` before it is returned.
    """
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    if not in_bam:
        logger.info(
            "Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    # All outputs share this prefix. Building sibling names from it, instead
    # of str.replace("xprs", ...) over the full path, keeps work directories
    # and sample names that happen to contain "xprs" intact.
    out_prefix = os.path.join(out_dir, name)
    out_file = out_prefix + ".xprs"
    express = config_utils.get_program("express", data['config'])
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with file_transaction(out_dir) as tx_out_dir:
            cmd = (
                "{express} --no-update-check -o {tx_out_dir} {strand} {gtf_fasta} {in_bam}"
            )
            do.run(cmd.format(**locals()), "Run express on %s." % in_bam, {})
        # Runs after the transaction block has closed: the result is read
        # from out_dir, not from the transactional directory.
        shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    eff_count_file = _get_column(out_file, out_prefix + "_eff.counts", 7)
    tpm_file = _get_column(out_file, out_prefix + ".tpm", 14)
    fpkm_file = _get_column(out_file, out_prefix + ".fpkm", 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
def tagcount(data):
    """Count UMI-tagged transcriptome alignments into a sparse matrix.

    Runs ``umis fasttagcount`` on the sample's transcriptome BAM, then
    ``umis sparse`` on the result (and on the UMI matrix when
    ``has_umi_matrix(data)`` is true). The ``.mtx`` path is recorded as the
    sample's count file. If that file already exists the commands are skipped.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)

    if gtf_file:
        # Use only the GTF's base name. os.path.join discards every earlier
        # component when a later one is absolute, so joining the full GTF
        # path would place the map next to the GTF instead of under the
        # work directory's "annotation" folder.
        gtf_stem = os.path.basename(os.path.splitext(gtf_file)[0])
        gene_map_file = os.path.join(dd.get_work_dir(data), "annotation",
                                     gtf_stem + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""

    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    # Indices 0-2 are the count matrix files, 3-5 the UMI ("dupes") ones.
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [umi_matrix_file, umi_matrix_file + ".rownames",
                  umi_matrix_file + ".colnames"]
    if has_umi_matrix(data):
        # Still a template: the placeholder is filled by cmd.format() below.
        umi_matrix_flag = " --umi_matrix {tx_umi_matrix_full} "
    else:
        umi_matrix_flag = ""
    cmd += umi_matrix_flag
    cmd += " {bam} {tx_out_file_full}"
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_out_files[3] + ".full"
        # The command templates are filled from the local variables above.
        do.run(cmd.format(**locals()), message)
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)
        if has_umi_matrix(data):
            cmd = ("{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}")
            message = "Converting %s to sparse format." % tx_umi_matrix_full
            do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
def run_salmon_bam(data):
    """Quantify the sample's transcriptome BAM with salmon.

    Calls ``salmon_quant_bam`` with ``<work_dir>/salmon/<sample>`` as the
    output directory, then records the returned file and that directory on
    ``data``. Returns the updated sample wrapped as ``[[data]]``.
    """
    sample = dd.get_sample_name(data)
    salmon_dir = os.path.join(dd.get_work_dir(data), "salmon", sample)
    annotation_gtf = dd.get_gtf_file(data)
    transcriptome_bam = dd.get_transcriptome_bam(data)
    reference_fasta = dd.get_ref_file(data)
    quant_file = salmon_quant_bam(transcriptome_bam, salmon_dir, annotation_gtf,
                                  reference_fasta, data)
    data = dd.set_salmon_dir(dd.set_salmon(data, quant_file), salmon_dir)
    return [[data]]
def run_salmon_bam(data):
    """Quantify the sample's transcriptome BAM with salmon.

    Calls ``salmon_quant_bam`` with ``<work_dir>/salmon/<sample>`` as the
    output directory, then records on ``data`` the returned file, the
    output directory, and the fragment-length file reported by
    ``_get_fraglen_file``. Returns the updated sample as ``[[data]]``.
    """
    sample = dd.get_sample_name(data)
    salmon_dir = os.path.join(dd.get_work_dir(data), "salmon", sample)
    annotation_gtf = dd.get_gtf_file(data)
    transcriptome_bam = dd.get_transcriptome_bam(data)
    reference_fasta = dd.get_ref_file(data)
    quant_file = salmon_quant_bam(transcriptome_bam, salmon_dir, annotation_gtf,
                                  reference_fasta, data)

    data = dd.set_salmon(data, quant_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    fraglen_file = _get_fraglen_file(salmon_dir)
    data = dd.set_salmon_fraglen_file(data, fraglen_file)
    return [[data]]
def run_salmon_bam(data):
    """Quantify the sample's transcriptome BAM with salmon.

    Calls ``salmon_quant_bam`` with ``<work_dir>/salmon/<sample>`` as the
    output directory, then records on ``data`` the returned file, the
    output directory and the fragment-length file, and registers that
    fragment-length file as the "salmon" entry of the QC summary.
    Returns the updated sample as ``[[data]]``.
    """
    sample = dd.get_sample_name(data)
    salmon_dir = os.path.join(dd.get_work_dir(data), "salmon", sample)
    annotation_gtf = dd.get_gtf_file(data)
    transcriptome_bam = dd.get_transcriptome_bam(data)
    quant_file = salmon_quant_bam(transcriptome_bam, salmon_dir, annotation_gtf, data)

    data = dd.set_salmon(data, quant_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    fraglen_file = _get_fraglen_file(salmon_dir)
    data = dd.set_salmon_fraglen_file(data, fraglen_file)
    # Read the path back through the accessor, as the QC entry is based on
    # whatever is now stored on the sample.
    qc_base = dd.get_salmon_fraglen_file(data)
    data = dd.update_summary_qc(data, "salmon", base=qc_base)
    return [[data]]
def run_salmon_bam(data):
    """Quantify the sample's transcriptome BAM with salmon.

    Asserts that the GTF and the reference FASTA pass ``file_exists``, calls
    ``salmon_quant_bam`` with ``<work_dir>/salmon/<sample>`` as the output
    directory, and records the returned file and that directory on ``data``.
    Returns the updated sample as ``[[data]]``.

    Raises AssertionError when the GTF or the reference FASTA is missing.
    """
    sample = dd.get_sample_name(data)
    salmon_dir = os.path.join(dd.get_work_dir(data), "salmon", sample)
    annotation_gtf = dd.get_gtf_file(data)
    transcriptome_bam = dd.get_transcriptome_bam(data)
    assert file_exists(annotation_gtf), "%s was not found, exiting." % annotation_gtf
    reference_fasta = dd.get_ref_file(data)
    assert file_exists(reference_fasta), "%s was not found, exiting." % reference_fasta
    quant_file = salmon_quant_bam(transcriptome_bam, salmon_dir, annotation_gtf,
                                  reference_fasta, data)
    data = dd.set_salmon_dir(dd.set_salmon(data, quant_file), salmon_dir)
    return [[data]]
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Stores the featureCounts result under ``data["count_file"]``, runs
    oncofuse in fusion mode, and aligns to the transcriptome with bowtie2
    when RSEM is requested but no transcriptome BAM is recorded.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False):
        fusion_file = oncofuse.run(data)
        if fusion_file:
            data = dd.set_oncofuse_file(data, fusion_file)

    # RSEM needs a transcriptome BAM; when the aligner did not produce one,
    # create it with bowtie2.
    rsem_needs_bam = dd.get_rsem(data) and not dd.get_transcriptome_bam(data)
    if not rsem_needs_bam:
        return [[data]]

    file1, file2 = dd.get_input_sequence_files(data)
    reference = dd.get_ref_file(data)
    logger.info("RSEM was flagged to run, but the transcriptome BAM file "
                "was not found. Aligning to the transcriptome with bowtie2.")
    data = bowtie2.align_transcriptome(file1, file2, reference, data)
    return [[data]]
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    The featureCounts result is stored under ``data["count_file"]``. In
    fusion mode oncofuse is run and its output, if any, recorded. When RSEM
    is requested and the sample has no transcriptome BAM, the input reads
    are aligned to the transcriptome with bowtie2.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)

    fusion_file = oncofuse.run(data) if dd.get_fusion_mode(data, False) else None
    if fusion_file:
        data = dd.set_oncofuse_file(data, fusion_file)

    # RSEM needs a transcriptome BAM; when the aligner did not produce one,
    # create it with bowtie2.
    if dd.get_rsem(data) and not dd.get_transcriptome_bam(data):
        read1, read2 = dd.get_input_sequence_files(data)
        reference = dd.get_ref_file(data)
        logger.info(
            "RSEM was flagged to run, but the transcriptome BAM file "
            "was not found. Aligning to the transcriptome with bowtie2.")
        data = bowtie2.align_transcriptome(read1, read2, reference, data)

    return [[data]]
def tagcount(data):
    """Count UMI-tagged transcriptome alignments with ``umis tagcount``.

    Writes ``<work_dir>/umis/<sample>/<sample>.counts`` unless that file
    already exists, and records it as the sample's count file either way.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    bam = dd.get_transcriptome_bam(data)
    sample_dir = os.path.join(dd.get_work_dir(data), "umis", dd.get_sample_name(data))
    out_file = os.path.join(sample_dir, dd.get_sample_name(data) + ".counts")

    if not file_exists(out_file):
        umis = config_utils.get_program("umis", data, default="umis")
        safe_makedir(sample_dir)
        cutoff = dd.get_minimum_barcode_depth(data)
        cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
        message = "Counting alignments of transcripts in %s." % bam
        template = ("{umis} tagcount --positional --cb_cutoff {cutoff} --cb_histogram "
                    "{cb_histogram} {bam} {tx_out_file}")
        with file_transaction(out_file) as tx_out_file:
            command = template.format(umis=umis, cutoff=cutoff,
                                      cb_histogram=cb_histogram, bam=bam,
                                      tx_out_file=tx_out_file)
            do.run(command, message)

    data = dd.set_count_file(data, out_file)
    return [[data]]
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Stores the featureCounts result under ``data["count_file"]`` and runs
    oncofuse in fusion mode. For disambiguation runs it stops there, logging
    that RSEM is not supported. Otherwise, when RSEM is requested and no
    transcriptome BAM is recorded, the input reads are aligned to the
    transcriptome with bowtie2.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False):
        fusion_file = oncofuse.run(data)
        if fusion_file:
            data = dd.set_oncofuse_file(data, fusion_file)

    if dd.get_disambiguate(data):
        logger.info("RSEM is not supported yet for disambiguation protocols. "
                    "See https://github.com/chapmanb/bcbio-nextgen/issues/859")
        return [[data]]

    # RSEM needs a transcriptome BAM; when the aligner did not produce one,
    # create it with bowtie2.
    rsem_needs_bam = dd.get_rsem(data) and not dd.get_transcriptome_bam(data)
    if rsem_needs_bam:
        read1, read2 = dd.get_input_sequence_files(data)
        reference = dd.get_ref_file(data)
        logger.info("RSEM was flagged to run, but the transcriptome BAM file "
                    "was not found. Aligning to the transcriptome with bowtie2.")
        data = bowtie2.align_transcriptome(read1, read2, reference, data)
    return [[data]]
def tagcount(data):
    """Count UMI-tagged transcriptome alignments into a sparse matrix.

    Runs ``umis tagcount --sparse`` to write
    ``<work_dir>/umis/<sample>/<sample>.mtx`` (plus its ``.rownames`` and
    ``.colnames`` companions) unless the ``.mtx`` file already exists, and
    records the ``.mtx`` path as the sample's count file either way.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    bam = dd.get_transcriptome_bam(data)
    sample_dir = os.path.join(dd.get_work_dir(data), "umis", dd.get_sample_name(data))
    out_file = os.path.join(sample_dir, dd.get_sample_name(data) + ".mtx")

    if not file_exists(out_file):
        umis = config_utils.get_program("umis", data, default="umis")
        safe_makedir(sample_dir)
        cutoff = dd.get_minimum_barcode_depth(data)
        cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
        positional = ""
        if dd.get_positional_umi(data, False):
            positional = "--positional"
        message = "Counting alignments of transcripts in %s." % bam
        template = ("{umis} tagcount {positional} --cb_cutoff {cutoff} --sparse "
                    "--cb_histogram {cb_histogram} {bam} {tx_out_file}")
        out_files = [out_file + suffix for suffix in ("", ".rownames", ".colnames")]
        with file_transaction(out_files) as tx_out_files:
            # Only the matrix path is passed on the command line.
            command = template.format(umis=umis, positional=positional,
                                      cutoff=cutoff, cb_histogram=cb_histogram,
                                      bam=bam, tx_out_file=tx_out_files[0])
            do.run(command, message)

    data = dd.set_count_file(data, out_file)
    return [[data]]
def tagcount(data):
    """Count UMI-tagged transcriptome alignments into a sparse matrix.

    Runs ``umis fasttagcount`` on the sample's transcriptome BAM and then
    ``umis sparse`` on the result. The ``.mtx`` path is recorded as the
    sample's count file. If that file already exists the commands are skipped.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_file = os.path.join(sample_dir, dd.get_sample_name(data) + ".mtx")
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    # NOTE(review): this flag is computed but never placed in the
    # fasttagcount command below -- confirm whether it should be passed.
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    gtf_file = dd.get_transcriptome_gtf(data, None)

    if gtf_file:
        # Use only the GTF's base name. os.path.join discards every earlier
        # component when a later one is absolute, so joining the full GTF
        # path would place the map next to the GTF instead of under the
        # work directory's "annotation" folder.
        gtf_stem = os.path.basename(os.path.splitext(gtf_file)[0])
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation", gtf_stem + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""

    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag}"
           "--cb_histogram {cb_histogram} {bam} {tx_out_file_full}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        # The command templates are filled from the local variables above.
        do.run(cmd.format(**locals()), message)
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]