def should_run_fusion(with_caller, config):
    """Decide whether fusion calling applies to ``with_caller``.

    Each setting is read through the ``dd`` accessor first; when that yields
    a falsy value it is looked up under ``("algorithm", ...)`` in ``config``.

    Returns the falsy fusion-mode value when fusion mode is off. Otherwise
    returns ``True`` when no fusion caller is configured (``None``) or the
    configured caller equals ``with_caller``, else ``False``.
    """
    mode = dd.get_fusion_mode(config)
    if not mode:
        mode = utils.get_in(config, ("algorithm", "fusion_mode"), False)
    caller = dd.get_fusion_caller(config)
    if not caller:
        caller = utils.get_in(config, ("algorithm", "fusion_caller"), None)
    if not mode:
        return mode
    return caller in (None, with_caller)
def detect_fusions(data):
    """Run the configured fusion callers for one sample.

    ``data`` is first normalised with ``to_single_data``. The deprecated
    ``fusion_mode`` flag, when set, is translated into the ``oncofuse`` and
    ``pizzly`` callers and a deprecation warning is logged.

    For each enabled caller:
      * oncofuse: a truthy result is stored via ``dd.set_oncofuse_file``.
      * pizzly: a truthy output directory is stored via ``dd.set_pizzly_dir``
        and ``data["fusion"]`` is set to the per-sample ``fasta``/``json``
        paths inside that directory.
      * ericscript: run only; its return value is not recorded.

    Returns ``[[data]]``.
    """
    data = to_single_data(data)
    # support the old style of fusion mode calling
    if dd.get_fusion_mode(data, False):
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning(
            "``fusion_mode`` is deprecated in favor of turning on "
            "callers with ``fusion_caller``. It will run pizzly and "
            "oncofuse for now, but will eventually have support "
            "dropped.")
    fusion_caller = dd.get_fusion_caller(data, [])
    if "oncofuse" in fusion_caller:
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if "pizzly" in fusion_caller:
        pizzly_dir = pizzly.run_pizzly(data)
        if pizzly_dir:
            data = dd.set_pizzly_dir(data, pizzly_dir)
            sample_name = dd.get_sample_name(data)
            data["fusion"] = {
                "fasta": os.path.join(pizzly_dir, "%s.fusions.fasta" % sample_name),
                "json": os.path.join(pizzly_dir, "%s.json" % sample_name),
            }
    if "ericscript" in fusion_caller:
        # NOTE(review): the result was previously bound to an unused local;
        # confirm whether it should be stored on ``data``.
        ericscript.run(data)
    return [[data]]
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.

    Generates transcript counts, then runs sailfish, kallisto and salmon when
    they are requested (kallisto also runs when ``pizzly`` is a fusion
    caller). Each tool that runs overwrites ``data["quant"]["tsv"]`` and
    ``data["quant"]["hdf5"]``; kallisto additionally sets
    ``data["quant"]["fusion"]`` to its ``fusion.txt`` or ``None`` when that
    file does not exist. Returns ``[[data]]``.
    """
    # unwrap up to two levels of single-item nesting
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}

    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        sailfish_out = data["sailfish"]
        data["quant"]["tsv"] = sailfish_out
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(sailfish_out),
                                             "abundance.h5")

    wants_kallisto = "kallisto" in dd.get_expression_caller(data)
    if wants_kallisto or "pizzly" in dd.get_fusion_caller(data, []):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        kallisto_out = data["kallisto_quant"]
        fusion_file = os.path.join(kallisto_out, "fusion.txt")
        data["quant"]["tsv"] = os.path.join(kallisto_out, "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(kallisto_out, "abundance.h5")
        data["quant"]["fusion"] = fusion_file if os.path.exists(fusion_file) else None

    if "salmon" in dd.get_expression_caller(data):
        data = to_single_data(salmon.run_salmon_reads(data)[0])
        salmon_out = data["salmon"]
        data["quant"]["tsv"] = salmon_out
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(salmon_out),
                                             "abundance.h5")
    return [[data]]
def should_run_fusion(with_caller, config):
    """Decide whether fusion calling applies to ``with_caller``.

    Each setting is read through the ``dd`` accessor first; when that yields
    a falsy value it is looked up under ``("algorithm", ...)`` in ``config``.

    Returns the falsy fusion-mode value when fusion mode is off. Otherwise
    returns ``True`` when no fusion caller is configured (``None``) or the
    configured caller equals ``with_caller``, else ``False``.
    """
    mode = dd.get_fusion_mode(config)
    if not mode:
        mode = utils.get_in(config, ("algorithm", "fusion_mode"), False)
    caller = dd.get_fusion_caller(config)
    if not caller:
        caller = utils.get_in(config, ("algorithm", "fusion_caller"), None)
    if not mode:
        return mode
    return caller in (None, with_caller)
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    """Quantify transcripts with ``kallisto quant``.

    Output goes to ``<kallisto_dir>/quant``. If ``abundance.h5`` already
    exists there the run is skipped. ``--fusion`` is passed when fusion mode
    or a fusion caller is configured. For single-end input (``fq2`` falsy)
    ``--single`` is used with a fixed fragment length of 200 and standard
    deviation of 25, and a warning is logged. 30 bootstrap samples are
    always requested.

    Returns the path of the quantification directory.
    """
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    sentinel_file = os.path.join(quant_dir, "abundance.h5")
    if os.path.exists(sentinel_file):
        return quant_dir
    num_cores = dd.get_num_cores(data)
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data,
                           os.path.dirname(kallisto_dir))
    fusion_requested = dd.get_fusion_mode(data) or dd.get_fusion_caller(data)
    fusion_flag = "--fusion" if fusion_requested else ""
    single_end = not fq2
    single_flag = "--single" if single_end else ""
    fraglength_flag = "--fragment-length=200" if single_end else ""
    sd_flag = "--sd=25" if single_end else ""
    bootstrap_flag = "--bootstrap-samples=30"
    if single_end:
        # normalise None/other falsy values so nothing is appended to the command
        fq2 = ""
        logger.warning(
            "kallisto was run on single-end data and we set the "
            "estimated fragment length to 200 and the standard "
            "deviation to 25, if these don't reflect your data then "
            "the results may be inaccurate. Use with caution. See "
            "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
            "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = "Quantifying transcripts with kallisto."
        # explicit keyword arguments instead of **locals() so the template's
        # dependencies are visible
        do.run(cmd.format(kallisto=kallisto, fusion_flag=fusion_flag,
                          num_cores=num_cores, single_flag=single_flag,
                          fraglength_flag=fraglength_flag, sd_flag=sd_flag,
                          bootstrap_flag=bootstrap_flag, tx_out_dir=tx_out_dir,
                          index=index, fq1=fq1, fq2=fq2),
               message, None)
    return quant_dir
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.

    Generates transcript counts, then runs sailfish, kallisto and salmon when
    they are requested (kallisto also runs when ``pizzly`` is a fusion
    caller). Each tool that runs overwrites ``data["quant"]["tsv"]`` and
    ``data["quant"]["hdf5"]``; kallisto additionally sets
    ``data["quant"]["fusion"]`` to its ``fusion.txt`` or ``None`` when that
    file does not exist. Returns ``[[data]]``.
    """
    # unwrap up to two levels of single-item nesting
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}

    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        sailfish_out = data["sailfish"]
        data["quant"]["tsv"] = sailfish_out
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(sailfish_out),
                                             "abundance.h5")

    wants_kallisto = "kallisto" in dd.get_expression_caller(data)
    if wants_kallisto or "pizzly" in dd.get_fusion_caller(data, []):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        kallisto_out = data["kallisto_quant"]
        fusion_file = os.path.join(kallisto_out, "fusion.txt")
        data["quant"]["tsv"] = os.path.join(kallisto_out, "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(kallisto_out, "abundance.h5")
        data["quant"]["fusion"] = fusion_file if os.path.exists(fusion_file) else None

    if "salmon" in dd.get_expression_caller(data):
        data = to_single_data(salmon.run_salmon_reads(data)[0])
        salmon_out = data["salmon"]
        data["quant"]["tsv"] = salmon_out
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(salmon_out),
                                             "abundance.h5")
    return [[data]]
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    """Quantify transcripts with ``kallisto quant``.

    Output goes to ``<kallisto_dir>/quant``. If ``abundance.h5`` already
    exists there the run is skipped, so repeated calls do not re-quantify
    a finished sample. ``--fusion`` is passed when fusion mode or a fusion
    caller is configured. For single-end input (``fq2`` falsy) ``--single``
    is used with a fixed fragment length of 200 and standard deviation of
    25, and a warning is logged. 30 bootstrap samples are always requested.

    Returns the path of the quantification directory.
    """
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    # skip the work when a previous run already produced its final output
    sentinel_file = os.path.join(quant_dir, "abundance.h5")
    if os.path.exists(sentinel_file):
        return quant_dir
    num_cores = dd.get_num_cores(data)
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data,
                           os.path.dirname(kallisto_dir))
    fusion_requested = dd.get_fusion_mode(data) or dd.get_fusion_caller(data)
    fusion_flag = "--fusion" if fusion_requested else ""
    single_end = not fq2
    single_flag = "--single" if single_end else ""
    fraglength_flag = "--fragment-length=200" if single_end else ""
    sd_flag = "--sd=25" if single_end else ""
    bootstrap_flag = "--bootstrap-samples=30"
    if single_end:
        # normalise None/other falsy values so nothing is appended to the command
        fq2 = ""
        logger.warning("kallisto was run on single-end data and we set the "
                       "estimated fragment length to 200 and the standard "
                       "deviation to 25, if these don't reflect your data then "
                       "the results may be inaccurate. Use with caution. See "
                       "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
                       "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = "Quantifying transcripts with kallisto."
        # explicit keyword arguments instead of **locals() so the template's
        # dependencies are visible
        do.run(cmd.format(kallisto=kallisto, fusion_flag=fusion_flag,
                          num_cores=num_cores, single_flag=single_flag,
                          fraglength_flag=fraglength_flag, sd_flag=sd_flag,
                          bootstrap_flag=bootstrap_flag, tx_out_dir=tx_out_dir,
                          index=index, fq1=fq1, fq2=fq2),
               message, None)
    return quant_dir
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Stores the featureCounts result in ``data["count_file"]``, runs oncofuse
    for legacy ``fusion_mode`` samples that have no fusion caller set, and,
    when transcriptome alignment is requested, aligns to the transcriptome
    with bowtie2 (from disambiguated reads, or when no transcriptome BAM is
    present). Spike-in counting runs last. Returns ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)

    legacy_fusion = dd.get_fusion_mode(data, False) and not dd.get_fusion_caller(data)
    if legacy_fusion:
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)

    if dd.get_transcriptome_align(data):
        # to create a disambiguated transcriptome file realign with bowtie2
        if dd.get_disambiguate(data):
            logger.info("Aligning to the transcriptome with bowtie2 using the "
                        "disambiguated reads.")
            work_bam = data["work_bam"]
            fastq_paths = alignprep._bgzip_from_bam(
                work_bam, data["dirs"], data,
                is_retry=False, output_infix='-transcriptome')
            # a single returned path means single-end reads
            file1 = fastq_paths[0]
            file2 = fastq_paths[1] if len(fastq_paths) == 2 else None
            data = bowtie2.align_transcriptome(file1, file2,
                                               dd.get_ref_file(data), data)
        else:
            file1, file2 = dd.get_input_sequence_files(data)
        if not dd.get_transcriptome_bam(data):
            reference = dd.get_ref_file(data)
            logger.info("Transcriptome alignment was flagged to run, but the "
                        "transcriptome BAM file was not found. Aligning to the "
                        "transcriptome with bowtie2.")
            data = bowtie2.align_transcriptome(file1, file2, reference, data)
    return [[spikein.counts_spikein(data)]]
def quantitate_expression_parallel(samples, run_parallel):
    """Quantitate expression for all samples through ``run_parallel``.

    All programs run here should be multithreaded to take advantage of the
    threaded run_parallel environment.

    The caller configuration is read from the first sample only. Transcript
    counts are always generated; cufflinks, stringtie, kallisto and sailfish
    run when requested (kallisto also for fusion mode or the ``pizzly``
    fusion caller); salmon and fusion detection always run. Returns the
    samples produced by the last step.
    """
    first = samples[0][0]
    samples = run_parallel("generate_transcript_counts", samples)

    simple_steps = (("cufflinks", "run_cufflinks"),
                    ("stringtie", "run_stringtie_expression"))
    for caller, step in simple_steps:
        if caller in dd.get_expression_caller(first):
            samples = run_parallel(step, samples)

    needs_kallisto = ("kallisto" in dd.get_expression_caller(first)
                      or dd.get_fusion_mode(first)
                      or "pizzly" in dd.get_fusion_caller(first, []))
    if needs_kallisto:
        samples = run_parallel("run_kallisto_index", [samples])
        samples = run_parallel("run_kallisto_rnaseq", samples)

    if "sailfish" in dd.get_expression_caller(first):
        samples = run_parallel("run_sailfish_index", [samples])
        samples = run_parallel("run_sailfish", samples)

    # always run salmon
    samples = run_parallel("run_salmon_index", [samples])
    samples = run_parallel("run_salmon_reads", samples)

    return run_parallel("detect_fusions", samples)
def quantitate_expression_parallel(samples, run_parallel):
    """Quantitate expression for all samples through ``run_parallel``.

    All programs run here should be multithreaded to take advantage of the
    threaded run_parallel environment.

    The caller configuration is read from the first sample only. Transcript
    counts are always generated; cufflinks, stringtie, kallisto and sailfish
    run when requested (kallisto also for fusion mode or the ``pizzly``
    fusion caller); salmon and fusion detection always run. Returns the
    samples produced by the last step.
    """
    first = samples[0][0]
    samples = run_parallel("generate_transcript_counts", samples)

    simple_steps = (("cufflinks", "run_cufflinks"),
                    ("stringtie", "run_stringtie_expression"))
    for caller, step in simple_steps:
        if caller in dd.get_expression_caller(first):
            samples = run_parallel(step, samples)

    needs_kallisto = ("kallisto" in dd.get_expression_caller(first)
                      or dd.get_fusion_mode(first)
                      or "pizzly" in dd.get_fusion_caller(first, []))
    if needs_kallisto:
        samples = run_parallel("run_kallisto_index", [samples])
        samples = run_parallel("run_kallisto_rnaseq", samples)

    if "sailfish" in dd.get_expression_caller(first):
        samples = run_parallel("run_sailfish_index", [samples])
        samples = run_parallel("run_sailfish", samples)

    # always run salmon
    samples = run_parallel("run_salmon_index", [samples])
    samples = run_parallel("run_salmon_reads", samples)

    return run_parallel("detect_fusions", samples)
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Stores the featureCounts result in ``data["count_file"]``, runs oncofuse
    for legacy ``fusion_mode`` samples that have no fusion caller set, and,
    when transcriptome alignment is requested, aligns to the transcriptome
    with bowtie2 (from disambiguated reads, or when no transcriptome BAM is
    present). Spike-in counting runs last. Returns ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)

    legacy_fusion = dd.get_fusion_mode(data, False) and not dd.get_fusion_caller(data)
    if legacy_fusion:
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)

    if dd.get_transcriptome_align(data):
        # to create a disambiguated transcriptome file realign with bowtie2
        if dd.get_disambiguate(data):
            logger.info("Aligning to the transcriptome with bowtie2 using the "
                        "disambiguated reads.")
            work_bam = data["work_bam"]
            fastq_paths = alignprep._bgzip_from_bam(
                work_bam, data["dirs"], data,
                is_retry=False, output_infix='-transcriptome')
            # a single returned path means single-end reads
            file1 = fastq_paths[0]
            file2 = fastq_paths[1] if len(fastq_paths) == 2 else None
            data = bowtie2.align_transcriptome(file1, file2,
                                               dd.get_ref_file(data), data)
        else:
            file1, file2 = dd.get_input_sequence_files(data)
        if not dd.get_transcriptome_bam(data):
            reference = dd.get_ref_file(data)
            logger.info("Transcriptome alignment was flagged to run, but the "
                        "transcriptome BAM file was not found. Aligning to the "
                        "transcriptome with bowtie2.")
            data = bowtie2.align_transcriptome(file1, file2, reference, data)
    return [[spikein.counts_spikein(data)]]
def detect_fusions(data):
    """Run the oncofuse and pizzly fusion callers when they are enabled.

    The deprecated ``fusion_mode`` flag, when set, switches on both callers
    and logs a deprecation warning. Truthy caller results are recorded on
    ``data`` through the matching ``dd`` setter. Returns ``[[data]]``.
    """
    # support the old style of fusion mode calling
    legacy_mode = dd.get_fusion_mode(data, False)
    if legacy_mode:
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning("``fusion_mode`` is deprecated in favor of turning on "
                       "callers with ``fusion_caller``. It will run pizzly and "
                       "oncofuse for now, but will eventually have support "
                       "dropped.")

    if "oncofuse" in dd.get_fusion_caller(data, []):
        fusion_file = oncofuse.run(data)
        if fusion_file:
            data = dd.set_oncofuse_file(data, fusion_file)

    if "pizzly" in dd.get_fusion_caller(data, []):
        out_dir = pizzly.run_pizzly(data)
        if out_dir:
            data = dd.set_pizzly_dir(data, out_dir)

    return [[data]]
def test_get_fusion_caller():
    """``dd.get_fusion_caller`` should return the value stored under
    ``config -> algorithm -> fusion_caller``."""
    expected = 'FUSION_CALLER'
    data = {'config': {'algorithm': {'fusion_caller': expected}}}
    assert dd.get_fusion_caller(data) == expected
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.

    Generates transcript counts, then runs sailfish, kallisto and salmon when
    they are requested (kallisto also runs when ``pizzly`` is a fusion
    caller). Each tool that runs overwrites ``data["quant"]["tsv"]`` and
    ``data["quant"]["hdf5"]``; kallisto additionally sets
    ``data["quant"]["fusion"]``.

    Salmon mode: read-based unless genome-alignment quantification is
    requested; then BAM-based for the STAR aligner, otherwise a warning is
    logged and the decoy method (hg38) or read-based method (other builds)
    is used. Returns ``[[data]]``.
    """
    # unwrap up to two levels of single-item nesting
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}

    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        sailfish_out = data["sailfish"]
        data["quant"]["tsv"] = sailfish_out
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(sailfish_out),
                                             "abundance.h5")

    wants_kallisto = "kallisto" in dd.get_expression_caller(data)
    if wants_kallisto or "pizzly" in dd.get_fusion_caller(data, []):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        kallisto_out = data["kallisto_quant"]
        fusion_file = os.path.join(kallisto_out, "fusion.txt")
        data["quant"]["tsv"] = os.path.join(kallisto_out, "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(kallisto_out, "abundance.h5")
        data["quant"]["fusion"] = fusion_file if os.path.exists(fusion_file) else None

    if "salmon" in dd.get_expression_caller(data):
        if not dd.get_quantify_genome_alignments(data):
            salmon_runner = salmon.run_salmon_reads
        elif dd.get_aligner(data).lower() == "star":
            salmon_runner = salmon.run_salmon_bam
        elif dd.get_genome_build(data) == "hg38":
            logger.warning(
                "Whole genome alignment-based Salmon quantification is "
                "only supported for the STAR aligner. Since this is hg38 we will fall "
                "back to the decoy method")
            salmon_runner = salmon.run_salmon_decoy
        else:
            logger.warning(
                "Whole genome alignment-based Salmon quantification is "
                "only supported for the STAR aligner. Falling back to the "
                "transcriptome-only method.")
            salmon_runner = salmon.run_salmon_reads
        data = to_single_data(salmon_runner(data)[0])
        salmon_out = data["salmon"]
        data["quant"]["tsv"] = salmon_out
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(salmon_out),
                                             "abundance.h5")
    return [[data]]
def _set_transcriptome_option(options, data, ref_file):
    """Add the transcriptome source to ``options`` and return it.

    A tophat transcriptome index is preferred over a GTF file: when the index
    is configured, exists on disk and neither ``fusion_mode`` nor a fusion
    caller is set, ``options["transcriptome-index"]`` is set to the index
    path without its extension. Otherwise ``options["GTF"]`` is set when a
    GTF file is available. ``ref_file`` is not used.
    """
    index_path = get_in(data, ("genome_resources", "rnaseq",
                               "transcriptome_index", "tophat"))
    fusion_requested = (get_in(data, ("config", "algorithm", "fusion_mode"), False)
                        or dd.get_fusion_caller(data))
    use_index = index_path and file_exists(index_path) and not fusion_requested
    if use_index:
        options["transcriptome-index"] = os.path.splitext(index_path)[0]
    else:
        gtf_path = dd.get_gtf_file(data)
        if gtf_path:
            options["GTF"] = gtf_path
    return options
def quantitate_expression_parallel(samples, run_parallel):
    """Quantitate expression for all samples through ``run_parallel``.

    All programs run here should be multithreaded to take advantage of the
    threaded run_parallel environment.

    The caller configuration is read from the first sample only. Index
    building uses the result of ``determine_indexes_to_make`` and its return
    value is discarded. Salmon always runs: read-based unless
    genome-alignment quantification is requested; then BAM-based for the
    STAR aligner, otherwise a warning is logged and the decoy method (hg38)
    or read-based method (other builds) is used. Fusion detection runs last.
    """
    first = samples[0][0]
    to_index = determine_indexes_to_make(samples)
    samples = run_parallel("generate_transcript_counts", samples)

    simple_steps = (("cufflinks", "run_cufflinks"),
                    ("stringtie", "run_stringtie_expression"))
    for caller, step in simple_steps:
        if caller in dd.get_expression_caller(first):
            samples = run_parallel(step, samples)

    needs_kallisto = ("kallisto" in dd.get_expression_caller(first)
                      or dd.get_fusion_mode(first)
                      or "pizzly" in dd.get_fusion_caller(first, []))
    if needs_kallisto:
        run_parallel("run_kallisto_index", [to_index])
        samples = run_parallel("run_kallisto_rnaseq", samples)

    if "sailfish" in dd.get_expression_caller(first):
        run_parallel("run_sailfish_index", [to_index])
        samples = run_parallel("run_sailfish", samples)

    # always run salmon
    run_parallel("run_salmon_index", [to_index])
    if not dd.get_quantify_genome_alignments(first):
        salmon_step = "run_salmon_reads"
    elif dd.get_aligner(first).lower() == "star":
        salmon_step = "run_salmon_bam"
    elif dd.get_genome_build(first) == "hg38":
        logger.warning(
            "Whole genome alignment-based Salmon quantification is "
            "only supported for the STAR aligner. Since this is hg38 we will fall "
            "back to the decoy method")
        salmon_step = "run_salmon_decoy"
    else:
        logger.warning(
            "Whole genome alignment-based Salmon quantification is "
            "only supported for the STAR aligner. Falling back to the "
            "transcriptome-only method.")
        salmon_step = "run_salmon_reads"
    samples = run_parallel(salmon_step, samples)

    return run_parallel("detect_fusions", samples)
def detect_fusions(data):
    """Run the configured fusion callers for one sample.

    ``data`` is first normalised with ``to_single_data``. The deprecated
    ``fusion_mode`` flag, when set, is translated into the ``oncofuse`` and
    ``pizzly`` callers and a deprecation warning is logged.

    For each enabled caller:
      * oncofuse: a truthy result is stored via ``dd.set_oncofuse_file``.
      * pizzly: a truthy output directory is stored via ``dd.set_pizzly_dir``
        and ``data["fusion"]`` is set to the per-sample ``fasta``/``json``
        paths inside that directory.
      * ericscript: run only; its return value is not recorded.

    Returns ``[[data]]``.
    """
    data = to_single_data(data)
    # support the old style of fusion mode calling
    if dd.get_fusion_mode(data, False):
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning("``fusion_mode`` is deprecated in favor of turning on "
                       "callers with ``fusion_caller``. It will run pizzly and "
                       "oncofuse for now, but will eventually have support "
                       "dropped.")
    fusion_caller = dd.get_fusion_caller(data, [])
    if "oncofuse" in fusion_caller:
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if "pizzly" in fusion_caller:
        pizzly_dir = pizzly.run_pizzly(data)
        if pizzly_dir:
            data = dd.set_pizzly_dir(data, pizzly_dir)
            sample_name = dd.get_sample_name(data)
            data["fusion"] = {
                "fasta": os.path.join(pizzly_dir, "%s.fusions.fasta" % sample_name),
                "json": os.path.join(pizzly_dir, "%s.json" % sample_name),
            }
    if "ericscript" in fusion_caller:
        # NOTE(review): the result was previously bound to an unused local;
        # confirm whether it should be stored on ``data``.
        ericscript.run(data)
    return [[data]]
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with STAR and return the updated ``data``.

    Logs an error and exits the process with status 1 when ``ref_file`` is
    falsy. If the final output for this sample already exists, the aligner
    is not run and ``data`` is only updated. Otherwise a STAR command line
    is assembled, its BAM output on stdout is piped through the command
    returned by ``postalign.sam_to_sortbam_cl`` and redirected into the
    transactional final output file.

    ``pair_file`` may be falsy for single-end input.
    """
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    # analyses whose name starts with "smallrna-seq" get different settings
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    # nothing to do when the final output is already present
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    def _unpack_fastq(f):
        """Use process substitution instead of readFilesCommand for gzipped inputs.

        Prevents issues on shared filesystems that don't support FIFO:
        https://github.com/alexdobin/STAR/issues/143
        """
        if f and is_gzipped(f):
            return "<(gunzip -c %s)" % f
        else:
            return f
    fastq_files = (" ".join([_unpack_fastq(fastq_file), _unpack_fastq(pair_file)])
                   if pair_file else _unpack_fastq(fastq_file))
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    # a reference ending in "chrLength" is replaced by its containing directory
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)
    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        # The {placeholders} are filled from locals() at the end of this
        # block, so the local variable names here are part of the template:
        # do not rename them. Only the %s (alignment tags) is filled here.
        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
               "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
               "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
               "--outStd BAM_Unsorted {srna_opts} "
               "--limitOutSJcollapsed 2000000 "
               "--outSAMtype BAM Unsorted "
               "--outSAMmapqUnique 60 "
               "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        # chimeric-read options are only added when a fusion caller is set
        if dd.get_fusion_caller(data):
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 ")
            if "oncofuse" in dd.get_fusion_caller(data):
                cmd += "--chimOutType Junctions "
            else:
                cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        # user-supplied extra options from the "star" resources section
        resources = config_utils.get_resources("star", data["config"])
        if resources.get("options", []):
            cmd += " " + " ".join([str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with STAR and return the updated ``data``.

    Logs an error and exits the process with status 1 when ``ref_file`` is
    falsy. If the final output for this sample already exists, the aligner
    is not run and ``data`` is only updated. Otherwise a STAR command line
    is assembled, its BAM output on stdout is piped through the command
    returned by ``postalign.sam_to_sortbam_cl`` and redirected into the
    transactional final output file.

    ``pair_file`` may be falsy for single-end input.
    """
    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    # analyses whose name starts with "smallrna-seq" get different settings
    srna = True if data["analysis"].lower().startswith(
        "smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    # nothing to do when the final output is already present
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names,
                            data)
        return data

    star_path = config_utils.get_program("STAR", config)

    def _unpack_fastq(f):
        """Use process substitution instead of readFilesCommand for gzipped inputs.

        Prevents issues on shared filesystems that don't support FIFO:
        https://github.com/alexdobin/STAR/issues/143
        """
        if f and is_gzipped(f):
            return "<(gunzip -c %s)" % f
        else:
            return f

    fastq_files = (" ".join([
        _unpack_fastq(fastq_file),
        _unpack_fastq(pair_file)
    ]) if pair_file else _unpack_fastq(fastq_file))
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    # a reference ending in "chrLength" is replaced by its containing directory
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)
    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        # The {placeholders} are filled from locals() at the end of this
        # block, so the local variable names here are part of the template:
        # do not rename them. Only the %s (alignment tags) is filled here.
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd BAM_Unsorted {srna_opts} "
            "--limitOutSJcollapsed 2000000 "
            "--outSAMtype BAM Unsorted "
            "--outSAMmapqUnique 60 "
            "--outSAMunmapped Within --outSAMattributes %s " %
            " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file,
                                      gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        # chimeric-read options are only added when a fusion caller is set
        if dd.get_fusion_caller(data):
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 ")
            if "oncofuse" in dd.get_fusion_caller(data):
                cmd += "--chimOutType Junctions "
            else:
                cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data,
                                    ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        # user-supplied extra options from the "star" resources section
        resources = config_utils.get_resources("star", data["config"])
        if resources.get("options", []):
            cmd += " " + " ".join(
                [str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
def _should_run_fusion(data):
    """Return the fusion caller setting read from ``data``.

    The raw setting is returned, so callers can use it as a truthiness check
    for whether fusion calling was requested.
    """
    fusion_caller = dd.get_fusion_caller(data)
    return fusion_caller
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with STAR and return the updated ``data``.

    Logs an error and exits the process with status 1 when ``ref_file`` is
    falsy. If the final output for this sample already exists, the aligner
    is not run and ``data`` is only updated. Otherwise a STAR command line
    is assembled, its BAM output on stdout is piped through the command
    returned by ``postalign.sam_to_sortbam_cl`` and redirected into the
    transactional final output file.

    ``pair_file`` may be falsy for single-end input.
    """
    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    # analyses whose name starts with "smallrna-seq" get different settings
    srna = True if data["analysis"].lower().startswith(
        "smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    # nothing to do when the final output is already present
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names,
                            data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file
                            ]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    # a reference ending in "chrLength" is replaced by its containing directory
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)
    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        # The {placeholders} are filled from locals() at the end of this
        # block, so the local variable names here are part of the template:
        # do not rename them. Only the %s (alignment tags) is filled here.
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd BAM_Unsorted {srna_opts} "
            "--limitOutSJcollapsed 2000000 "
            "--outSAMtype BAM Unsorted "
            "--outSAMunmapped Within --outSAMattributes %s " %
            " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file,
                                      gtf_file) if not srna else ""
        # the gzip check looks at the first FASTQ only
        cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
        cmd += _read_group_option(names)
        # chimeric-read options are added for fusion_mode or any fusion caller
        fusion_mode = utils.get_in(data,
                                   ("config", "algorithm", "fusion_mode"),
                                   False)
        fusion_mode = fusion_mode or dd.get_fusion_caller(data)
        if fusion_mode:
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 "
                    "--chimOutType WithinBAM ")
        strandedness = utils.get_in(data,
                                    ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data