def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run hisat2 on one sample and register the resulting BAM as its work BAM.

    Args:
        fastq_file: reads passed to hisat2 as ``-1`` (paired) or ``-U`` (single).
        pair_file: second-mate reads passed as ``-2``; a falsy value selects
            single-end mode.
        ref_file: value handed to hisat2's ``-x`` option.
        names: forwarded to ``_get_rg_flags`` to build the read-group flags.
        align_dir: directory in which ``<lane>.bam`` is created.
        data: sample configuration, read only through the ``dd`` accessors
            and the helper functions called here.

    Returns:
        ``data`` as returned by ``dd.set_work_bam`` with the output BAM path.
    """
    paired = bool(pair_file)
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"

    # An existing output means there is nothing to run; just record it.
    if file_exists(out_file):
        return dd.set_work_bam(data, out_file)

    # Each fragment carries its own trailing space.  The placeholders are
    # resolved from this function's local variables right before execution,
    # so the local names used inside the braces must not be renamed.
    pieces = [
        "{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} ",
        "{rg_flags} ",
        "-1 {fastq_file} -2 {pair_file} " if paired else "-U {fastq_file} ",
    ]
    analysis = dd.get_analysis(data).lower()
    if analysis == "smallrna-seq":
        pieces.append("-k 1000 ")
    # if assembling transcripts, set flags that cufflinks can use
    if dd.get_assemble_transcripts(data):
        pieces.append("--dta-cufflinks ")
    if analysis == "rna-seq":
        # The splice-site table is looked up in the same directory as the GTF.
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        pieces.append("--known-splicesite-infile {splicesites} ")

    message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        # hisat2's output is piped into the command produced by
        # postalign.sam_to_sortbam_cl, which targets the transactional path.
        pieces.append(" | " + postalign.sam_to_sortbam_cl(data, tx_out_file))
        cmd = "".join(pieces)
        do.run(cmd.format(**locals()), message)
    return dd.set_work_bam(data, out_file)
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with hisat2 and store the output BAM on the sample.

    Args:
        fastq_file: reads passed to hisat2 as ``-1`` (paired) or ``-U`` (single).
        pair_file: second-mate reads passed as ``-2``; a falsy value selects
            single-end mode.
        ref_file: value handed to hisat2's ``-x`` option.
        names: forwarded to ``_get_rg_flags`` to build the read-group flags.
        align_dir: directory in which ``<lane>.bam`` is created.
        data: sample configuration, read only through the ``dd`` accessors
            and the helper functions called here.

    Returns:
        ``data`` as returned by ``dd.set_work_bam`` with the output BAM path.
    """
    paired = bool(pair_file)
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    # Skip the alignment when the output BAM is already present.
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    # The placeholders below are filled from locals() just before running,
    # so the local variable names must match the names inside the braces.
    cmd = ("{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
           "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    # Normalise the analysis name once so both comparisons below are
    # case-insensitive.
    analysis = dd.get_analysis(data).lower()
    if analysis == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks can use
    if dd.get_assemble_transcripts(data):
        cmd += "--dta-cufflinks "
    if analysis == "rna-seq":
        # The splice-site table is looked up in the same directory as the GTF.
        # gtf_file has to be fetched here; it is not a parameter of this
        # function.
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        cmd += "--known-splicesite-infile {splicesites} "
    message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        # hisat2's output is piped into the command produced by
        # postalign.sam_to_sortbam_cl, which targets the transactional path.
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
def assemble_transcripts(run_parallel, samples):
    """Assemble transcripts per sample and merge them, when enabled.

    Assembly strategy rationale, implemented as suggested in
    http://www.nature.com/nprot/journal/v7/n3/full/nprot.2012.016.html:

    * run Cufflinks without a reference GTF for each individual sample
    * merge the assemblies with Cuffmerge using a reference GTF

    Args:
        run_parallel: callable invoked as ``run_parallel(task_name, items)``.
        samples: sample collection; only ``samples[0][0]`` is inspected to
            decide whether transcript assembly is enabled.

    Returns:
        The result of the ``cufflinks_merge`` step when assembly is enabled,
        otherwise ``samples`` unchanged.
    """
    first_sample = samples[0][0]
    if not dd.get_assemble_transcripts(first_sample):
        return samples
    assembled = run_parallel("cufflinks_assemble", samples)
    # The merge step receives all assembled samples as a single work item.
    return run_parallel("cufflinks_merge", [assembled])