def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc

    The output is named after the work BAM with ".transcriptome" inserted
    before the extension. If that file already exists, no alignment is run
    and it is only recorded on ``data``.

    fastq_file -- first (or only) FASTQ file of reads
    pair_file -- second FASTQ file for paired-end reads, or a false value
                 (None/"") for single-end input
    ref_file -- reference handed to ``index_transcriptome``
    data -- sample dictionary; ``data["config"]`` supplies programs/settings

    Returns ``data`` with the transcriptome BAM set.
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        # Pass the file names as logging arguments so the placeholders in
        # the message are actually filled in.
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33.", fastq_file, pair_file)
        fastq_file = fastq.groom(fastq_file, data, in_qual="fastq-illumina")
        if pair_file:
            pair_file = fastq.groom(pair_file, data, in_qual="fastq-illumina")
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    samtools = config_utils.get_program("samtools", data["config"])
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} | {samtools} view -bhS - > {tx_out_file}")
    with file_transaction(data, out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        # Single-end input must contribute nothing to the command line;
        # formatting None directly would hand bwa a file literally named "None".
        do.run(cmd.format(bwa=bwa, args=args, num_cores=num_cores,
                          gtf_fasta=gtf_fasta, fastq_file=fastq_file,
                          pair_file=pair_file or "", samtools=samtools,
                          tx_out_file=tx_out_file),
               message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc

    The output is named after the work BAM with ".transcriptome" inserted
    before the extension. If that file already exists, no alignment is run
    and it is only recorded on ``data``.

    fastq_file -- first (or only) FASTQ file of reads
    pair_file -- second FASTQ file for paired-end reads, or a false value
                 (None/"") for single-end input
    ref_file -- reference handed to ``index_transcriptome``
    data -- sample dictionary; ``data["config"]`` supplies programs/settings

    Returns ``data`` with the transcriptome BAM set.
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        # Pass the file names as logging arguments so the placeholders in
        # the message are actually filled in.
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33.", fastq_file, pair_file)
        fastq_file = fastq.groom(fastq_file, in_qual="fastq-illumina", data=data)
        if pair_file:
            pair_file = fastq.groom(pair_file, in_qual="fastq-illumina", data=data)
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    cmd = (
        "{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
        "{pair_file} | samtools view -bhS - > {tx_out_file}"
    )
    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        # Single-end input must contribute nothing to the command line;
        # formatting None directly would hand bwa a file literally named "None".
        do.run(cmd.format(bwa=bwa, args=args, num_cores=num_cores,
                          gtf_fasta=gtf_fasta, fastq_file=fastq_file,
                          pair_file=pair_file or "",
                          tx_out_file=tx_out_file),
               message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    Align reads to the transcriptome with bowtie2 for eXpress/RSEM/etc.

    The output is named after the work BAM with ".transcriptome" inserted
    before the extension. When that file is already present the alignment is
    skipped; in both cases the path is recorded on ``data``, which is returned.

    fastq_file -- first (or only) FASTQ file of reads
    pair_file -- second FASTQ file for paired-end reads, or a false value
    ref_file -- reference handed to ``index_transcriptome``
    data -- sample dictionary; ``data["config"]`` supplies programs/settings
    """
    stem, suffix = os.path.splitext(dd.get_work_bam(data))
    out_file = stem + ".transcriptome" + suffix
    if not utils.file_exists(out_file):
        bowtie2 = config_utils.get_program("bowtie2", data["config"])
        gtf_file = dd.get_gtf_file(data)
        gtf_index = index_transcriptome(gtf_file, ref_file, data)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        # Paired input uses -1/-2, unpaired input uses -U.
        if pair_file:
            fastq_cmd = "-1 %s" % fastq_file
            pair_cmd = "-2 %s " % pair_file
        else:
            fastq_cmd = "-U %s" % fastq_file
            pair_cmd = ""
        cmd = (
            "{bowtie2} -p {num_cores} -a -X 600 --rdg 6,5 --rfg 6,5 --score-min L,-.6,-.4 --no-discordant --no-mixed -x {gtf_index} {fastq_cmd} {pair_cmd} "
        )
        with file_transaction(data, out_file) as tx_out_file:
            message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
            # Pipe the alignments into the name-sorting BAM conversion command.
            cmd += "| " + postalign.sam_to_sortbam_cl(
                data, tx_out_file, name_sort=True)
            # The whole pipeline string is formatted in one go, so the
            # placeholders are resolved from the local names above.
            do.run(cmd.format(**locals()), message)
    return dd.set_transcriptome_bam(data, out_file)
def _update_data(align_file, out_dir, names, data):
    """
    Record the alignment outputs on ``data`` and return it.

    ``align_file`` is stored as both the work BAM and the align BAM. The
    transcriptome BAM is moved and recorded only when transcriptome
    alignment is requested and ``is_transcriptome_broken()`` is false.
    """
    data = dd.set_align_bam(dd.set_work_bam(data, align_file), align_file)
    # Nothing more to do unless a usable transcriptome alignment is expected.
    if not dd.get_transcriptome_align(data) or is_transcriptome_broken():
        return data
    moved_bam = _move_transcriptome_file(out_dir, names)
    return dd.set_transcriptome_bam(data, moved_bam)
def _update_data(align_file, out_dir, names, data):
    """
    Record the alignment outputs on ``data`` and return it.

    ``align_file`` is stored as both the work BAM and the align BAM. The
    transcriptome BAM is moved and recorded only when transcriptome
    alignment is requested and ``is_transcriptome_broken(data)`` is false.
    """
    with_work = dd.set_work_bam(data, align_file)
    data = dd.set_align_bam(with_work, align_file)
    transcriptome_usable = (dd.get_transcriptome_align(data)
                            and not is_transcriptome_broken(data))
    if transcriptome_usable:
        data = dd.set_transcriptome_bam(
            data, _move_transcriptome_file(out_dir, names))
    return data
def _update_data(align_file, out_dir, names, data):
    """
    Record the alignment outputs on ``data`` and return it.

    ``align_file`` is stored as both the work BAM and the align BAM, the
    transcriptome BAM is moved via ``_move_transcriptome_file`` and recorded,
    and the splice junction file, when one is found, is converted to BED and
    recorded as the junction BED.
    """
    data = dd.set_work_bam(data, align_file)
    data = dd.set_align_bam(data, align_file)
    transcriptome_file = _move_transcriptome_file(out_dir, names)
    data = dd.set_transcriptome_bam(data, transcriptome_file)
    sjfile = get_splicejunction_file(out_dir, data)
    # Only convert when a junction file was actually found; a false value
    # (presumably None when the file is missing) must not reach junction2bed.
    if sjfile:
        sjbed = junction2bed(sjfile)
        data = dd.set_junction_bed(data, sjbed)
    return data
def _update_data(align_file, out_dir, names, data):
    """
    Record the alignment outputs on ``data`` and return it.

    Stores ``align_file`` as the work and align BAM, records the moved
    transcriptome BAM, and, if a splice junction file is available, records
    its BED conversion as the junction BED.
    """
    data = dd.set_work_bam(data, align_file)
    data = dd.set_align_bam(data, align_file)
    data = dd.set_transcriptome_bam(
        data, _move_transcriptome_file(out_dir, names))
    junctions = get_splicejunction_file(out_dir, data)
    # Without a junction file there is nothing to convert.
    if not junctions:
        return data
    return dd.set_junction_bed(data, junction2bed(junctions))
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    Align reads to the transcriptome with bowtie2 for eXpress/RSEM/etc.

    The output is named after the work BAM with ".transcriptome" inserted
    before the extension. When that file is already present the alignment is
    skipped; in both cases the path is recorded on ``data``, which is returned.

    fastq_file -- first (or only) FASTQ file of reads
    pair_file -- second FASTQ file for paired-end reads, or a false value
    ref_file -- reference handed to ``index_transcriptome``
    data -- sample dictionary; ``data["config"]`` supplies programs/settings
    """
    stem, suffix = os.path.splitext(dd.get_work_bam(data))
    out_file = stem + ".transcriptome" + suffix
    if not file_exists(out_file):
        config = data["config"]
        bowtie2_exe = config_utils.get_program("bowtie2", config)
        index_prefix = index_transcriptome(dd.get_gtf_file(data), ref_file, data)
        threads = data["config"]["algorithm"].get("num_cores", 1)
        # Paired input uses -1/-2, unpaired input uses -U.
        if pair_file:
            reads_opt = "-1 %s" % fastq_file
            mates_opt = "-2 %s " % pair_file
        else:
            reads_opt = "-U %s" % fastq_file
            mates_opt = ""
        template = ("{bowtie2} -p {num_cores} -a -X 600 --rdg 6,5 --rfg 6,5 --score-min L,-.6,-.4 --no-discordant --no-mixed -x {gtf_index} {fastq_cmd} {pair_cmd} | samtools view -hbS - > {tx_out_file}")
        with file_transaction(out_file) as tx_out_file:
            message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
            command = template.format(bowtie2=bowtie2_exe,
                                      num_cores=threads,
                                      gtf_index=index_prefix,
                                      fastq_cmd=reads_opt,
                                      pair_cmd=mates_opt,
                                      tx_out_file=tx_out_file)
            do.run(command, message)
    return dd.set_transcriptome_bam(data, out_file)
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment

    Stores the featureCounts result under ``data["count_file"]``. In fusion
    mode, stores the oncofuse output under ``data["oncofuse_file"]`` when
    oncofuse produces one. When a DEXSeq GFF is configured, records the
    DEXSeq counts.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        # Run oncofuse once and reuse its result rather than invoking it a
        # second time just to store the same output.
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data["oncofuse_file"] = oncofuse_file
    if dd.get_dexseq_gff(data, None):
        data = dd.set_dexseq_counts(data, dexseq.bcbio_run(data))
    # if RSEM was run, stick the transcriptome BAM file into the datadict
    if dd.get_aligner(data).lower() == "star" and dd.get_rsem(data):
        base, ext = os.path.splitext(dd.get_work_bam(data))
        data = dd.set_transcriptome_bam(data, base + ".transcriptome" + ext)
    return [[data]]
def run_rapmap_align(data):
    """
    Run ``rapmap_align`` in "quasi" mode for one sample.

    Exactly two input files are treated as a read pair; otherwise only the
    first file is used and the mate is None. Output goes under
    ``<work_dir>/rapmap/<sample name>``, and the resulting file is recorded
    as the transcriptome BAM.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    sample = dd.get_sample_name(data)
    inputs = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    fq1, fq2 = inputs if len(inputs) == 2 else (inputs[0], None)
    rapmap_dir = os.path.join(work_dir, "rapmap", sample)
    gtf_file = dd.get_gtf_file(data)
    fasta_file = dd.get_ref_file(data)
    aligned = rapmap_align(fq1, fq2, rapmap_dir, gtf_file, fasta_file,
                           "quasi", data)
    return [[dd.set_transcriptome_bam(data, aligned)]]
def run_rapmap_pseudoalign(data):
    """
    Run ``rapmap_pseudoalign`` for one sample.

    Exactly two input files are treated as a read pair; otherwise only the
    first file is used and the mate is None. The GTF and reference FASTA
    are asserted to exist before the run. Output goes under
    ``<work_dir>/rapmap/<sample name>``, and the resulting file is recorded
    as the transcriptome BAM.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    sample = dd.get_sample_name(data)
    inputs = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    fq1, fq2 = inputs if len(inputs) == 2 else (inputs[0], None)
    rapmap_dir = os.path.join(work_dir, "rapmap", sample)

    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file

    pseudo_out = rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file,
                                    fasta_file, data)
    return [[dd.set_transcriptome_bam(data, pseudo_out)]]
def _update_data(align_file, out_dir, names, data):
    """
    Record the alignment outputs on ``data`` and return it.

    ``align_file`` is stored as both the work BAM and the align BAM, and the
    file returned by ``_move_transcriptome_file`` is stored as the
    transcriptome BAM.
    """
    # Both BAM slots point at the same alignment file.
    for set_bam in (dd.set_work_bam, dd.set_align_bam):
        data = set_bam(data, align_file)
    moved_bam = _move_transcriptome_file(out_dir, names)
    return dd.set_transcriptome_bam(data, moved_bam)