def generate_transcript_counts(data):
    """Count reads per feature and make sure a transcriptome alignment exists.

    The result of ``featureCounts.count`` is stored under
    ``data["count_file"]``. When transcriptome alignment is flagged:

    * with disambiguation enabled, the work BAM is converted back to
      bgzipped fastq and aligned to the transcriptome with bowtie2;
    * otherwise the original input sequence files are selected.

    After either path, bowtie2 is run if no transcriptome BAM is recorded
    on ``data``. Spike-in counting is applied last.

    Returns:
        ``[[data]]`` -- the updated sample wrapped in two nested lists.
    """
    data["count_file"] = featureCounts.count(data)

    if dd.get_transcriptome_align(data):
        if dd.get_disambiguate(data):
            # To create a disambiguated transcriptome file the reads have to
            # be realigned with bowtie2, starting from the work BAM.
            logger.info("Aligning to the transcriptome with bowtie2 using the "
                        "disambiguated reads.")
            fastq_paths = alignprep._bgzip_from_bam(
                data["work_bam"], data["dirs"], data,
                is_retry=False, output_infix='-transcriptome')
            is_paired = len(fastq_paths) == 2
            # Single-end output yields one fastq; the mate slot is None.
            file1, file2 = fastq_paths if is_paired else (fastq_paths[0], None)
            data = bowtie2.align_transcriptome(
                file1, file2, dd.get_ref_file(data), data)
        else:
            file1, file2 = dd.get_input_sequence_files(data)

        if not dd.get_transcriptome_bam(data):
            ref_file = dd.get_ref_file(data)
            logger.info("Transcriptome alignment was flagged to run, but the "
                        "transcriptome BAM file was not found. Aligning to the "
                        "transcriptome with bowtie2.")
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)

    data = spikein.counts_spikein(data)
    return [[data]]
def generate_transcript_counts(data):
    """Count reads per feature, then run optional fusion and transcriptome steps.

    The result of ``featureCounts.count`` is stored under
    ``data["count_file"]``. If fusion mode is enabled, oncofuse is run and
    any file it returns is recorded on ``data``. If transcriptome alignment
    is flagged and no transcriptome BAM is recorded, the reads are aligned
    to the transcriptome with bowtie2.

    Returns:
        ``[[data]]`` -- the updated sample wrapped in two nested lists.
    """
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)

    # Done unless a transcriptome alignment was requested and is still missing.
    if not dd.get_transcriptome_align(data) or dd.get_transcriptome_bam(data):
        return [[data]]

    if dd.get_disambiguate(data):
        # Start from the work BAM, converted back to bgzipped fastq, rather
        # than from the original input files.
        fastq_paths = alignprep._bgzip_from_bam(
            data["work_bam"], data["dirs"], data["config"],
            is_retry=False, output_infix='-transcriptome')
        if len(fastq_paths) == 2:
            file1, file2 = fastq_paths
        else:
            # Single-end: only one fastq comes back, so there is no mate.
            file1, file2 = fastq_paths[0], None
    else:
        file1, file2 = dd.get_input_sequence_files(data)

    ref_file = dd.get_ref_file(data)
    logger.info("Transcriptome alignment was flagged to run, but the "
                "transcriptome BAM file was not found. Aligning to the "
                "transcriptome with bowtie2.")
    data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
def _update_data(align_file, out_dir, names, data):
    """Record the alignment output paths on ``data`` and return it.

    ``align_file`` is stored as both the work BAM and the align BAM. When
    transcriptome alignment is flagged and ``is_transcriptome_broken(data)``
    is false, the transcriptome file is relocated via
    ``_move_transcriptome_file`` and recorded as the transcriptome BAM.
    """
    data = dd.set_align_bam(dd.set_work_bam(data, align_file), align_file)

    if not dd.get_transcriptome_align(data) or is_transcriptome_broken(data):
        return data

    moved_transcriptome = _move_transcriptome_file(out_dir, names)
    return dd.set_transcriptome_bam(data, moved_transcriptome)
def generate_transcript_counts(data):
    """Count reads per feature, then run optional fusion and transcriptome steps.

    Steps, in order:

    1. ``featureCounts.count`` output is stored under ``data["count_file"]``.
    2. If fusion mode is enabled, oncofuse is run; a truthy result is
       recorded on ``data``.
    3. If transcriptome alignment is flagged and no transcriptome BAM is
       recorded, the reads are aligned to the transcriptome with bowtie2.

    Returns:
        ``[[data]]`` -- the updated sample wrapped in two nested lists.
    """
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)

    needs_alignment = (dd.get_transcriptome_align(data)
                       and not dd.get_transcriptome_bam(data))
    if needs_alignment:
        if dd.get_disambiguate(data):
            # Use the work BAM, converted back to bgzipped fastq, as input.
            fastq_paths = alignprep._bgzip_from_bam(
                data["work_bam"], data["dirs"], data["config"],
                is_retry=False, output_infix='-transcriptome')
            is_paired = len(fastq_paths) == 2
            # Single-end output yields one fastq; the mate slot is None.
            file1, file2 = fastq_paths if is_paired else (fastq_paths[0], None)
        else:
            file1, file2 = dd.get_input_sequence_files(data)

        ref_file = dd.get_ref_file(data)
        logger.info("Transcriptome alignment was flagged to run, but the "
                    "transcriptome BAM file was not found. Aligning to the "
                    "transcriptome with bowtie2.")
        data = bowtie2.align_transcriptome(file1, file2, ref_file, data)

    return [[data]]
def _update_data(align_file, out_dir, names, data):
    """Record the alignment output paths on ``data`` and return it.

    ``align_file`` is stored as both the work BAM and the align BAM. When
    transcriptome alignment is flagged and ``is_transcriptome_broken()`` is
    false, the transcriptome file is relocated via
    ``_move_transcriptome_file`` and recorded as the transcriptome BAM.
    """
    data = dd.set_align_bam(dd.set_work_bam(data, align_file), align_file)

    if not dd.get_transcriptome_align(data) or is_transcriptome_broken():
        return data

    moved_transcriptome = _move_transcriptome_file(out_dir, names)
    return dd.set_transcriptome_bam(data, moved_transcriptome)
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with STAR, piping its SAM output into a sorted BAM.

    The final BAM is ``<align_dir>/<lane>_star/<sample>.bam``. If it already
    exists, STAR is not run and ``data`` is only updated with the existing
    paths. Analyses whose name starts with "smallrna-seq" allow up to 1000
    multimappers and pass ``--alignIntronMax 1``; all others allow 10.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: mate fastq, or a falsy value for single-end data.
        ref_file: STAR genome directory; a falsy value aborts the process.
        names: mapping providing at least ``"sample"``.
        align_dir: directory for STAR output.
        data: sample dictionary; read for configuration, returned updated.

    Returns:
        ``data`` updated through ``_update_data``.

    Raises:
        SystemExit: with status 1 when ``ref_file`` is falsy.
    """
    srna = data["analysis"].lower().startswith("smallrna-seq")
    max_hits, srna_opts = (1000, "--alignIntronMax 1") if srna else (10, "")
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    # Not referenced by the visible template; retained because the command
    # is expanded from locals() below.
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        # Already aligned: just point data at the existing outputs.
        return _update_data(final_out, out_dir, names, data)

    star_path = config_utils.get_program("STAR", config)
    if pair_file:
        fastq_files = " ".join([fastq_file, pair_file])
    else:
        fastq_files = fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    safe_makedir(align_dir)

    # The {placeholders} are filled from this function's local variables by
    # cmd.format(**locals()) below, so those local names must not change.
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
           "--outStd SAM {srna_opts} "
           "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
    cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file)
    if is_gzipped(fastq_file):
        cmd += " --readFilesCommand zcat "
    cmd += _read_group_option(names)

    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"

    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if not srna and strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif "

    if dd.get_transcriptome_align(data) and not is_transcriptome_broken(data):
        cmd += " --quantMode TranscriptomeSAM "

    with file_transaction(data, final_out) as tx_final_out:
        # STAR writes SAM to stdout (--outStd SAM); pipe it into the sorter.
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    return _update_data(final_out, out_dir, names, data)
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with STAR, piping its SAM output into a sorted BAM.

    The final BAM is ``<align_dir>/<lane>_star/<sample>.bam``. If it already
    exists, STAR is not run and ``data`` is only updated with the existing
    paths. Analyses whose name starts with "smallrna-seq" allow up to 1000
    multimappers and pass ``--alignIntronMax 1``; all others allow 10.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: mate fastq, or a falsy value for single-end data.
        ref_file: STAR genome directory; a falsy value aborts the process.
        names: mapping providing at least ``"sample"``.
        align_dir: directory for STAR output.
        data: sample dictionary; read for configuration, returned updated.

    Returns:
        ``data`` updated through ``_update_data``.

    Raises:
        SystemExit: with status 1 when ``ref_file`` is falsy.
    """
    srna = data["analysis"].lower().startswith("smallrna-seq")
    max_hits, srna_opts = (1000, "--alignIntronMax 1") if srna else (10, "")
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    # Not referenced by the visible template; retained because the command
    # is expanded from locals() below.
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        # Already aligned: just point data at the existing outputs.
        return _update_data(final_out, out_dir, names, data)

    star_path = config_utils.get_program("STAR", config)
    if pair_file:
        fastq = " ".join([fastq_file, pair_file])
    else:
        fastq = fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)
    safe_makedir(align_dir)

    # The {placeholders} are filled from this function's local variables by
    # cmd.format(**locals()) below, so those local names must not change.
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
           "--outStd SAM {srna_opts} "
           "--outSAMunmapped Within --outSAMattributes %s" % " ".join(ALIGN_TAGS))
    if is_gzipped(fastq_file):
        cmd += " --readFilesCommand zcat "
    cmd += _read_group_option(names)

    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"

    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if not srna and strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif "

    if dd.get_transcriptome_align(data) and not is_transcriptome_broken():
        cmd += " --quantMode TranscriptomeSAM "

    with file_transaction(data, final_out) as tx_final_out:
        # STAR writes SAM to stdout (--outStd SAM); pipe it into the sorter.
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    return _update_data(final_out, out_dir, names, data)