def quantitate_expression_parallel(samples, run_parallel):
    """
    quantitate expression, all programs run here should be multithreaded to
    take advantage of the threaded run_parallel environment
    """
    data = samples[0][0]
    samples = run_parallel("generate_transcript_counts", samples)
    if "cufflinks" in dd.get_expression_caller(data):
        samples = run_parallel("run_cufflinks", samples)
    if "stringtie" in dd.get_expression_caller(data):
        samples = run_parallel("run_stringtie_expression", samples)
    if ("kallisto" in dd.get_expression_caller(data) or
            dd.get_fusion_mode(data) or
            "pizzly" in dd.get_fusion_caller(data, [])):
        samples = run_parallel("run_kallisto_index", [samples])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if "sailfish" in dd.get_expression_caller(data):
        samples = run_parallel("run_sailfish_index", [samples])
        samples = run_parallel("run_sailfish", samples)
    # always run salmon
    samples = run_parallel("run_salmon_index", [samples])
    samples = run_parallel("run_salmon_reads", samples)
    samples = run_parallel("detect_fusions", samples)
    return samples
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with
    merging of multiple calls.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}
    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]), "abundance.h5")
    if ("kallisto" in dd.get_expression_caller(data) or
            "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        data["quant"]["tsv"] = os.path.join(data["kallisto_quant"], "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(data["kallisto_quant"], "abundance.h5")
        if os.path.exists(os.path.join(data["kallisto_quant"], "fusion.txt")):
            data["quant"]["fusion"] = os.path.join(data["kallisto_quant"], "fusion.txt")
        else:
            data["quant"]["fusion"] = None
    if "salmon" in dd.get_expression_caller(data):
        data = to_single_data(salmon.run_salmon_reads(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]), "abundance.h5")
    return [[data]]
def quantitate_expression_noparallel(samples, run_parallel):
    """
    run transcript quantitation for algorithms that don't run in parallel
    """
    data = samples[0][0]
    if "express" in dd.get_expression_caller(data):
        samples = run_parallel("run_express", samples)
    if "dexseq" in dd.get_expression_caller(data):
        samples = run_parallel("run_dexseq", samples)
    return samples
def quantitate_expression_parallel(samples, run_parallel):
    """
    quantitate expression, all programs run here should be multithreaded to
    take advantage of the threaded run_parallel environment
    """
    data = samples[0][0]
    samples = run_parallel("generate_transcript_counts", samples)
    samples = run_parallel("run_sailfish", samples)
    samples = sailfish.combine_sailfish(samples)
    if "cufflinks" in dd.get_expression_caller(data):
        samples = run_parallel("run_cufflinks", samples)
    if "stringtie" in dd.get_expression_caller(data):
        samples = run_parallel("run_stringtie_expression", samples)
    return samples
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with
    merging of multiple calls.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}
    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]), "abundance.h5")
    if ("kallisto" in dd.get_expression_caller(data) or
            "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        data["quant"]["tsv"] = os.path.join(data["kallisto_quant"], "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(data["kallisto_quant"], "abundance.h5")
        if os.path.exists(os.path.join(data["kallisto_quant"], "fusion.txt")):
            data["quant"]["fusion"] = os.path.join(data["kallisto_quant"], "fusion.txt")
        else:
            data["quant"]["fusion"] = None
    if "salmon" in dd.get_expression_caller(data):
        if dd.get_quantify_genome_alignments(data):
            if dd.get_aligner(data).lower() != "star":
                if dd.get_genome_build(data) == "hg38":
                    logger.warning("Whole genome alignment-based Salmon quantification is "
                                   "only supported for the STAR aligner. Since this is hg38 we will fall "
                                   "back to the decoy method.")
                    data = to_single_data(salmon.run_salmon_decoy(data)[0])
                else:
                    logger.warning("Whole genome alignment-based Salmon quantification is "
                                   "only supported for the STAR aligner. Falling back to the "
                                   "transcriptome-only method.")
                    data = to_single_data(salmon.run_salmon_reads(data)[0])
            else:
                data = to_single_data(salmon.run_salmon_bam(data)[0])
        else:
            data = to_single_data(salmon.run_salmon_reads(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]), "abundance.h5")
    return [[data]]
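# quantitate() above leans on to_single_data to unwrap the nested [[data]]
# records that CWL steps pass around. Below is a minimal sketch of what such
# an unwrapping helper could look like, assuming the bcbio convention that
# each parallelized step returns a single-item list; the name
# _to_single_data_sketch is hypothetical and this illustrates the pattern,
# not the actual bcbio implementation.
def _to_single_data_sketch(data):
    """Unwrap a one-element list produced by a parallelized step."""
    # hypothetical illustration: real bcbio code lives in its pipeline utils
    if isinstance(data, (list, tuple)) and len(data) == 1:
        return data[0]
    return data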
def sample_annotation(data):
    """
    Annotate miRNAs using miRBase database with seqbuster tool
    """
    names = data["rgnames"]['sample']
    tools = dd.get_expression_caller(data)
    work_dir = os.path.join(dd.get_work_dir(data), "mirbase")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = op.join(out_dir, names)
    if dd.get_mirbase_hairpin(data):
        mirbase = op.abspath(op.dirname(dd.get_mirbase_hairpin(data)))
        if utils.file_exists(data["collapse"]):
            data['transcriptome_bam'] = _align(data["collapse"], dd.get_mirbase_hairpin(data),
                                               out_file, data)
            data['seqbuster'] = _miraligner(data["collapse"], out_file, dd.get_species(data),
                                            mirbase, data['config'])
        else:
            logger.debug("Trimmed collapsed file is empty for %s." % names)
    else:
        logger.debug("No annotation file from miRBase.")

    sps = dd.get_species(data) if dd.get_species(data) else "None"
    logger.debug("Looking for mirdeep2 database for %s" % names)
    if file_exists(op.join(dd.get_work_dir(data), "mirdeep2", "novel", "hairpin.fa")):
        data['seqbuster_novel'] = _miraligner(data["collapse"], "%s_novel" % out_file, sps,
                                              op.join(dd.get_work_dir(data), "mirdeep2", "novel"),
                                              data['config'])

    if "trna" in tools:
        data['trna'] = _mint_trna_annotation(data)

    data = spikein.counts_spikein(data)
    return [[data]]
def run_cluster(*data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    sample = data[0][0]
    tools = dd.get_expression_caller(data[0][0])
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = data[0][0]["cluster_bam"]
    if "seqcluster" in tools:
        gtf_file = (dd.get_transcriptome_gtf(sample) if dd.get_transcriptome_gtf(sample)
                    else dd.get_srna_gtf_file(sample))
        sample["seqcluster"] = _cluster(bam_file, data[0][0]["seqcluster_prepare_ma"],
                                        out_dir, dd.get_ref_file(sample), gtf_file)
        sample["report"] = _report(sample, dd.get_ref_file(sample))
    if "mirge" in tools:
        sample["mirge"] = mirge.run(data)
    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])
    out_novel = _make_isomir_counts(data, "seqbuster_novel", op.join(work_dir, "mirdeep2"), "_novel")
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    data = spikein.combine_spikein(data)
    return data
def run_prepare(*data):
    """
    Run seqcluster prepare to merge all samples in one file
    """
    out_dir = os.path.join(dd.get_work_dir(data[0][0]), "seqcluster", "prepare")
    out_dir = os.path.abspath(safe_makedir(out_dir))
    prepare_dir = os.path.join(out_dir, "prepare")
    tools = dd.get_expression_caller(data[0][0])
    if len(tools) == 0:
        logger.info("You didn't specify any expression caller tool. "
                    "You can add one to the YAML file: "
                    "expression_caller: [trna, seqcluster, mirdeep2]")
    fn = []
    for sample in data:
        name = sample[0]["rgnames"]['sample']
        fn.append("%s\t%s" % (sample[0]['collapse'], name))
    args = namedtuple('args', 'debug print_debug minc minl maxl out')
    args = args(False, False, 2, 17, 40, out_dir)
    ma_out = op.join(out_dir, "seqs.ma")
    seq_out = op.join(out_dir, "seqs.fastq")
    # require a sequence to be seen in at least 10% of samples, minimum one
    min_shared = max(int(len(fn) / 10.0), 1)
    if not file_exists(ma_out):
        seq_l, sample_l = prepare._read_fastq_files(fn, args)
        with file_transaction(ma_out) as ma_tx:
            with open(ma_tx, 'w') as ma_handle:
                with open(seq_out, 'w') as seq_handle:
                    logger.info("Preparing seqs.fastq with -minl 17 -maxl 40 -minc 2 "
                                "--min_shared %s" % min_shared)
                    prepare._create_matrix_uniq_seq(sample_l, seq_l, ma_handle,
                                                    seq_handle, min_shared)
    for sample in data:
        sample[0]["seqcluster_prepare_ma"] = ma_out
        sample[0]["seqcluster_prepare_fastq"] = seq_out
    return data
def run_cluster(*data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    sample = data[0][0]
    tools = dd.get_expression_caller(data[0][0])
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = data[0][0]["work_bam"]
    if "seqcluster" in tools:
        sample["seqcluster"] = _cluster(bam_file, data[0][0]["seqcluster_prepare_ma"],
                                        out_dir, dd.get_ref_file(sample),
                                        dd.get_srna_gtf_file(sample))
        sample["report"] = _report(sample, dd.get_ref_file(sample))
    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])
    out_novel = _make_isomir_counts(data, "seqbuster_novel", op.join(work_dir, "mirdeep2"), "_novel")
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    return data
def sample_annotation(data):
    """
    Annotate miRNAs using miRBase database with seqbuster tool
    """
    names = data["rgnames"]['sample']
    tools = dd.get_expression_caller(data)
    work_dir = os.path.join(dd.get_work_dir(data), "mirbase")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = op.join(out_dir, names)
    if dd.get_mirbase_hairpin(data):
        mirbase = op.abspath(op.dirname(dd.get_mirbase_hairpin(data)))
        data['seqbuster'] = _miraligner(data["collapse"], out_file, dd.get_species(data),
                                        mirbase, data['config'])
    else:
        logger.debug("No annotation file from miRBase.")

    sps = dd.get_species(data) if dd.get_species(data) else "None"
    logger.debug("Looking for mirdeep2 database for %s" % names)
    if file_exists(op.join(dd.get_work_dir(data), "mirdeep2", "novel", "hairpin.fa")):
        data['seqbuster_novel'] = _miraligner(data["collapse"], "%s_novel" % out_file, sps,
                                              op.join(dd.get_work_dir(data), "mirdeep2", "novel"),
                                              data['config'])

    if "trna" in tools:
        data['trna'] = _trna_annotation(data)

    data = spikein.counts_spikein(data)
    return [[data]]
def run_prepare(*data):
    """
    Run seqcluster prepare to merge all samples in one file
    """
    out_dir = os.path.join(dd.get_work_dir(data[0][0]), "seqcluster", "prepare")
    out_dir = os.path.abspath(safe_makedir(out_dir))
    prepare_dir = os.path.join(out_dir, "prepare")
    tools = dd.get_expression_caller(data[0][0])
    if len(tools) == 0:
        logger.info("You didn't specify any expression caller tool. "
                    "You can add one to the YAML file: "
                    "expression_caller: [trna, seqcluster, mirdeep2]")
    fn = []
    for sample in data:
        name = sample[0]["rgnames"]['sample']
        fn.append("%s\t%s" % (sample[0]['collapse'], name))
    args = namedtuple('args', 'debug print_debug minc minl maxl out')
    args = args(False, False, 2, 17, 40, out_dir)
    ma_out = op.join(out_dir, "seqs.ma")
    seq_out = op.join(out_dir, "seqs.fastq")
    # require a sequence to be seen in at least 10% of samples, minimum one
    min_shared = max(int(len(fn) / 10.0), 1)
    if not file_exists(ma_out):
        seq_l, sample_l = prepare._read_fastq_files(fn, args)
        with file_transaction(ma_out) as ma_tx:
            with open(ma_tx, 'w') as ma_handle:
                with open(seq_out, 'w') as seq_handle:
                    prepare._create_matrix_uniq_seq(sample_l, seq_l, ma_handle,
                                                    seq_handle, min_shared)
    for sample in data:
        sample[0]["seqcluster_prepare_ma"] = ma_out
        sample[0]["seqcluster_prepare_fastq"] = seq_out
    return data
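# run_prepare builds one "<collapsed fastq path>\t<sample name>" line per
# sample and hands them to seqcluster's prepare module, using a namedtuple
# as a stand-in for seqcluster's argparse namespace. A small standalone
# sketch of that input contract with made-up file names; the field order
# (debug, print_debug, minc, minl, maxl, out) mirrors the namedtuple above
# and is the only interface assumed here.
from collections import namedtuple

Args = namedtuple('args', 'debug print_debug minc minl maxl out')
example_args = Args(False, False, 2, 17, 40, "seqcluster/prepare")
example_fn = ["sample1/collapsed.fastq\tsample1",
              "sample2/collapsed.fastq\tsample2"]
# min_shared scales with cohort size: 10% of the samples, but at least one
example_min_shared = max(int(len(example_fn) / 10.0), 1)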
def quantitate_expression_parallel(samples, run_parallel):
    """
    quantitate expression, all programs run here should be multithreaded to
    take advantage of the threaded run_parallel environment
    """
    data = samples[0][0]
    to_index = determine_indexes_to_make(samples)
    samples = run_parallel("generate_transcript_counts", samples)
    if "cufflinks" in dd.get_expression_caller(data):
        samples = run_parallel("run_cufflinks", samples)
    if "stringtie" in dd.get_expression_caller(data):
        samples = run_parallel("run_stringtie_expression", samples)
    if ("kallisto" in dd.get_expression_caller(data) or
            dd.get_fusion_mode(data) or
            "pizzly" in dd.get_fusion_caller(data, [])):
        run_parallel("run_kallisto_index", [to_index])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if "sailfish" in dd.get_expression_caller(data):
        run_parallel("run_sailfish_index", [to_index])
        samples = run_parallel("run_sailfish", samples)
    # always run salmon
    run_parallel("run_salmon_index", [to_index])
    if dd.get_quantify_genome_alignments(data):
        if dd.get_aligner(data).lower() != "star":
            if dd.get_genome_build(data) == "hg38":
                logger.warning("Whole genome alignment-based Salmon quantification is "
                               "only supported for the STAR aligner. Since this is hg38 we will fall "
                               "back to the decoy method.")
                samples = run_parallel("run_salmon_decoy", samples)
            else:
                logger.warning("Whole genome alignment-based Salmon quantification is "
                               "only supported for the STAR aligner. Falling back to the "
                               "transcriptome-only method.")
                samples = run_parallel("run_salmon_reads", samples)
        else:
            samples = run_parallel("run_salmon_bam", samples)
    else:
        samples = run_parallel("run_salmon_reads", samples)
    samples = run_parallel("detect_fusions", samples)
    return samples
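# The run_parallel argument threaded through these functions is a dispatcher
# that maps a task name to a function and applies it across samples. Below is
# a minimal serial stand-in for local testing, assuming tasks take one sample
# item and return a list of results; the name _serial_run_parallel and the
# dict-based registry are hypothetical, and the real bcbio implementation
# distributes work across processes or a cluster instead.
def _serial_run_parallel(task_registry):
    """Return a run_parallel-like callable backed by a plain dict of tasks."""
    def run_parallel(name, items):
        out = []
        for item in items:
            # each task returns a list of updated sample entries
            result = task_registry[name](item)
            out.extend(result if isinstance(result, list) else [result])
        return out
    return run_parallel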
def run_align(*data):
    """
    Prepare data to run alignment step, only once for each project
    """
    work_dir = dd.get_work_dir(data[0][0])
    out_dir = op.join(work_dir, "seqcluster", "prepare")
    seq_out = op.join(out_dir, "seqs.fastq")
    bam_dir = op.join(work_dir, "align")
    new_bam_file = op.join(bam_dir, "seqs.bam")
    tools = dd.get_expression_caller(data[0][0])
    if not file_exists(new_bam_file):
        sample = process_alignment(data[0][0], [seq_out, None])
        bam_file = dd.get_work_bam(sample[0][0])
        shutil.move(bam_file, new_bam_file)
        shutil.move(bam_file + ".bai", new_bam_file + ".bai")
        shutil.rmtree(op.join(bam_dir, sample[0][0]["rgnames"]['sample']))
    for sample in data:
        sample[0]["align_bam"] = sample[0]["clean_fastq"]
    if "mirdeep2" in tools:
        novel_db = mirdeep.run(data)
    return data
def run_align(*data):
    """
    Prepare data to run alignment step, only once for each project
    """
    work_dir = dd.get_work_dir(data[0][0])
    out_dir = op.join(work_dir, "seqcluster", "prepare")
    seq_out = op.join(out_dir, "seqs.fastq")
    bam_dir = op.join(work_dir, "align")
    new_bam_file = op.join(bam_dir, "seqs.bam")
    tools = dd.get_expression_caller(data[0][0])
    if not file_exists(new_bam_file):
        sample = process_alignment(data[0][0], [seq_out, None])
        bam_file = dd.get_work_bam(sample[0][0])
        shutil.move(bam_file, new_bam_file)
        shutil.move(bam_file + ".bai", new_bam_file + ".bai")
        shutil.rmtree(op.join(bam_dir, sample[0][0]["rgnames"]['sample']))
    for sample in data:
        # sample[0]["align_bam"] = sample[0]["clean_fastq"]
        sample[0]["cluster_bam"] = new_bam_file
    if "mirdeep2" in tools:
        novel_db = mirdeep.run(data)
    return data
def _maybe_add_sailfish_files(algorithm, sample, out):
    """Append the sailfish quantification directory to the upload list, if run."""
    if "sailfish" in dd.get_expression_caller(sample):
        out.append({"path": dd.get_sailfish_dir(sample),
                    "type": "directory",
                    "ext": os.path.join("sailfish", dd.get_sample_name(sample))})
    return out
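# Helpers with this _maybe_add_* shape act as accumulators: each inspects the
# sample and conditionally appends upload records to a shared list. A small
# hypothetical usage sketch of the chaining pattern; _collect_upload_files is
# an invented name, and the surrounding bcbio upload code is not shown here.
def _collect_upload_files(algorithm, sample):
    out = []
    out = _maybe_add_sailfish_files(algorithm, sample, out)
    # ...further _maybe_add_* helpers would chain onto `out` the same way
    return out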