def prinseq_task(opc, out_dir, input_1, input_2, basename, opts, tasks):
    ''' Builds the Task that runs prinseq on one pair of fastq files.
        The command writes its "good" output under the prefix
        out_dir/basename and then moves <prefix>_1.fastq and
        <prefix>_2.fastq onto the final target names.
        Params :
            opc - options object; only opc.path_logs is read here
            out_dir - directory that receives the output files
            input_1 - path of the 1/left fastq file
            input_2 - path of the 2/right fastq file
            basename - prefix for all output files; also the task name
            opts - extra command-line text inserted into the prinseq call
            tasks - the tasks that this task is dependent on
    '''
    # Final target names embed the names of the input files.
    final_left = '{0!s}/{1!s}_1_{2!s}'.format(
        out_dir, basename, os.path.basename(input_1))
    final_right = '{0!s}/{1!s}_2_{2!s}'.format(
        out_dir, basename, os.path.basename(input_2))
    targets = [final_left, final_right]

    # Intermediate files that the trailing `mv` commands rename
    # (presumably the names prinseq derives from --out_good).
    intermediates = []
    for mate in range(1, 3):
        intermediates.append(
            '{0!s}/{1!s}_{2!s}.fastq'.format(out_dir, basename, mate))

    template = (
        'perl {0!s} -fastq {1!s} -fastq2 {2!s} --out_format 3 --out_good {3!s}/{4!s} '
        '--out_bad null --trim_qual_left 20 --trim_qual_right 20 --trim_qual_type min '
        '--min_len 55 --trim_tail_left 8 --trim_tail_right 8 {5!s} -log; mv {6!s} {7!s};'
        ' mv {8!s} {9!s};')
    command = template.format(
        tool_path_check(TOOLS_DICT['prinseq'].full_exe[0]),
        input_1, input_2, out_dir, basename, opts,
        intermediates[0], targets[0], intermediates[1], targets[1])

    log_out, log_err = gen_logs(opc.path_logs, basename)
    return Task(command=command, dependencies=tasks, name=basename,
                stdout=log_out, stderr=log_err, targets=targets)
def busco_task(opc, dbs, assembly_path, assembly_name, out_dir, reference_name, cpu_cap, tasks):
    ''' Builds the Task that runs busco on an assembly.
        Params :
            opc - options object; only opc.path_logs is read here
            dbs - mapping of database objects; the entry
                  'busco_' + reference_name supplies call_path
            assembly_path - path of the assembly passed with -i
            assembly_name - label used in the output and task names
            out_dir - directory the command changes into before running
            reference_name - name of the reference to be used by busco
            cpu_cap - the cpu limit to be given to busco (-c) and to the Task
            tasks - a list of tasks that this task is dependent on
    '''
    targets = ['{0!s}/run_busco_{1!s}_{2!s}'.format(
        out_dir, assembly_name, reference_name)]
    reference_db = dbs['busco_' + reference_name]

    template = ('cd {0!s}; python {1!s} '
                '-o busco_{3!s}_{2!s} -i {4!s} -l {5!s}/{2!s}_odb9 -m '
                'tran -f -c {6!s}')
    command = template.format(
        out_dir,
        tool_path_check(TOOLS_DICT['busco'].full_exe[0]),
        reference_name,
        assembly_name,
        assembly_path,
        reference_db.call_path,
        cpu_cap)

    task_name = '_'.join(['busco', reference_name, assembly_name])
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, cpu=cpu_cap, stdout=log_out, stderr=log_err)
def trinity_task(opc, path_assembly, out_dir, fastq, fastq2, unpaired, cpu_cap_trin, cpu_cap_bfly, mem_trin, mem_bfly, normalize_flag, tasks):
    ''' Builds the Task that runs trinity and copies Trinity.fasta
        to path_assembly.
        Params :
            opc - options object; only opc.path_logs is read here
            path_assembly - destination of the assembled fasta (the target)
            out_dir - directory under which the 'trinity' output dir is made
            fastq - list of 1/left fastq files
            fastq2 - list of 2/right fastq files
            unpaired - list of unpaired fastq files; appended to --left
                       when paired reads exist, otherwise given as --single
            cpu_cap_trin - value passed to --CPU
            cpu_cap_bfly - only used for the cpu count reserved by the Task
            mem_trin - value (in G) passed to --max_memory
            mem_bfly - handed to the template but not referenced by it, so
                       it does not appear in the command
            normalize_flag - truthy to add --normalize_reads
            tasks - a list of tasks that this task is dependent on
    '''
    normalize_opt = '--normalize_reads' if normalize_flag else ''

    # Paired input takes priority; --single is only used without paired reads.
    if fastq != []:
        read_args = ('--left ' + ','.join(fastq + unpaired)
                     + ' --right ' + ','.join(fastq2))
    elif unpaired != []:
        read_args = '--single ' + ','.join(unpaired)
    else:
        read_args = ''

    targets = [path_assembly]
    template = (
        '{0!s} --seqType fq {1!s} --CPU {2!s} --max_memory {3!s}G --bflyCalculateCPU {4!s} '
        '--output {6!s}/trinity; cp {6!s}/trinity/Trinity.fasta {7!s};')
    # mem_bfly keeps its slot so the {6}/{7} indices above stay valid.
    command = template.format(
        tool_path_check(TOOLS_DICT['trinity'].full_exe[0]),
        read_args, cpu_cap_trin, mem_trin, normalize_opt, mem_bfly,
        out_dir, targets[0])

    task_name = 'trinity_assembly'
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    # Reserve the larger of the two cpu limits for this task.
    reserved_cpu = max(cpu_cap_trin, cpu_cap_bfly)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, cpu=reserved_cpu,
                stdout=log_out, stderr=log_err)
def filter_task(assembly_path, assembly_name, out_dir, quant_file_list, tpm_threshold, tpm_column_index, tasks, log_flag=True, opc=None):
    ''' Builds the Task that filters an assembly's contigs by TPM using
        filter_contigs_by_tpm.py from statics.PATH_UTIL.
        Params :
            assembly_path - path of the assembly to filter
            assembly_name - label used in the output and task names
            out_dir - directory that receives the filtered fasta
            quant_file_list - quant files, each passed as --quant_files
            tpm_threshold - value passed to --tpm; also part of the names
            tpm_column_index - TPM column index: transrate uses older
                               salmon; use index = 2. Newer salmon: index = 3
            tasks - a list of tasks that this task is dependent on
            log_flag - when true (and opc is given) stdout/stderr go to
                       log files from gen_logs
            opc - options object providing path_logs; optional
    '''
    trgs = [
        '{0!s}/{1!s}_{2!s}tpm.fasta'.format(out_dir, assembly_name, tpm_threshold)
    ]
    quants = ''.join(' --quant_files ' + x for x in quant_file_list)
    cmd = (
        'python {0!s}/filter_contigs_by_tpm.py --assembly {1!s} --tpm {2!s} {3!s} '
        '--out {4!s} --tpm_column_index {5!s}').format(
            statics.PATH_UTIL, assembly_path, tpm_threshold, quants,
            trgs[0], tpm_column_index)
    name = 'filt_{0!s}_{1!s}tpm'.format(assembly_name, tpm_threshold)
    # The defaults are log_flag=True and opc=None; without this guard that
    # combination failed on opc.path_logs. Log only when there is
    # somewhere to log to.
    if log_flag and opc is not None:
        out, err = gen_logs(opc.path_logs, name)
    else:
        out, err = None, None
    return Task(command=cmd, dependencies=tasks, targets=trgs, name=name,
                stdout=out, stderr=err)
def salmon_task(opc, index, left, right, out_name, gene_map, out_dir, cpu_cap, tasks):
    ''' Builds the Task that runs `salmon quant` on paired reads.
        Params :
            opc - options object; only opc.path_logs is read here
            index - salmon index passed with -i
            left - 1/left reads passed with -1
            right - 2/right reads passed with -2
            out_name - sub-directory of out_dir that receives the output
            gene_map - transcript-to-gene map for --geneMap; may be empty,
                       in which case no gene-level output is requested
            out_dir - parent output directory
            cpu_cap - thread count passed with -p and reserved by the Task
            tasks - a list of tasks that this task is dependent on
    '''
    trgs = ['{0!s}/{1!s}/quant.sf'.format(out_dir, out_name)]
    # Previously trans_gene_map was only bound for a non-empty gene_map,
    # so an empty map raised UnboundLocalError when the command was built.
    if gene_map:
        trans_gene_map = ' --geneMap {0!s}'.format(gene_map)
        # Gene-level quantification is only written when a gene map is given.
        trgs.append('{0!s}/{1!s}/quant.genes.sf'.format(out_dir, out_name))
    else:
        trans_gene_map = ''
    cmd = (
        '{0!s} quant -i {1!s} -l IU -1 {2!s} -2 {3!s} -o {4!s}/{5!s} '
        '{6!s} -p {7!s} --dumpEq'
    ).format(
        tool_path_check(TOOLS_DICT['salmon'].full_exe[0]), index, left,
        right, out_dir, out_name, trans_gene_map, cpu_cap)
    name = os.path.basename(index) + '_' + os.path.basename(left)
    out, err = gen_logs(opc.path_logs, name)
    return Task(command=cmd, dependencies=tasks, targets=trgs, name=name,
                stdout=out, stderr=err, cpu=cpu_cap)
def salmon_unpaired_task(opc, index, unpaired, out_name, gene_map, out_dir, cpu_cap, tasks):
    ''' Builds the Task that runs `salmon quant` on unpaired reads and
        copies the result files next to the output directory as
        <out_name>_quant.sf (and <out_name>_quant.genes.sf).
        Params :
            opc - options object; only opc.path_logs is read here
            index - salmon index passed with -i
            unpaired - unpaired reads passed with -r
            out_name - sub-directory of out_dir that receives the output
            gene_map - transcript-to-gene map for --geneMap; may be empty,
                       in which case no gene-level output is requested
            out_dir - parent output directory
            cpu_cap - thread count passed with -p and reserved by the Task
            tasks - a list of tasks that this task is dependent on
    '''
    trgs = ['{0!s}/{1!s}_quant.sf'.format(out_dir, out_name)]
    template = (
        '{0!s} quant -i {1!s} -l U -r {2!s} -o {3!s}/{4!s} {5!s} '
        '-p {6!s} --dumpEq --extraSensitive; cp {3!s}/{4!s}/quant.sf '
        '{3!s}/{4!s}_quant.sf')
    # Previously trans_gene_map was only bound for a non-empty gene_map,
    # so an empty map raised UnboundLocalError when the command was built.
    if gene_map:
        trans_gene_map = ' --geneMap {0!s}'.format(gene_map)
        # Gene-level output only exists when a gene map is supplied, so the
        # copy and the target are added only in that case.
        trgs.append('{0!s}/{1!s}_quant.genes.sf'.format(out_dir, out_name))
        template += ('; cp {3!s}/{4!s}/quant.genes.sf '
                     '{3!s}/{4!s}_quant.genes.sf')
    else:
        trans_gene_map = ''
    cmd = template.format(
        tool_path_check(TOOLS_DICT['salmon'].full_exe[0]), index, unpaired,
        out_dir, out_name, trans_gene_map, cpu_cap)
    name = 'salmon_unpaired_' + os.path.basename(
        index) + '_' + os.path.basename(unpaired)
    out, err = gen_logs(opc.path_logs, name)
    return Task(command=cmd, dependencies=tasks, targets=trgs, name=name,
                stdout=out, stderr=err, cpu=cpu_cap)
def trimmomatic_task(opc, out_dir, left, right, cpu_cap, basename, tasks):
    ''' Builds the Task that runs trimmomatic in PE mode on a read pair.
        Params :
            opc - options object; only opc.path_logs is read here
            out_dir - directory that receives the output files
            left - path of the 1/left fastq file
            right - path of the 2/right fastq file
            cpu_cap - value passed to -threads and reserved by the Task
            basename - prefix for all output files; also the task name
            tasks - a list of tasks that this task is dependent on
    '''
    prefix = '{0!s}/{1!s}'.format(out_dir, basename)
    left_name = os.path.basename(left)
    right_name = os.path.basename(right)

    # Only the "_1_"/"_2_" files are declared as targets; the "_1s_"/"_2s_"
    # files are passed to the command but not tracked.
    targets = [prefix + '_1_' + left_name, prefix + '_2_' + right_name]
    singles = [prefix + '_1s_' + left_name, prefix + '_2s_' + right_name]

    template = ('java -jar {0!s} PE -threads {3!s} {1!s} {2!s} {5!s} {4!s} {7!s} '
                '{6!s} ILLUMINACLIP:{8!s}:2:30:10 LEADING:3 TRAILING:3 '
                'SLIDINGWINDOW:4:15 MINLEN:35')
    command = template.format(
        tool_path_check(TOOLS_DICT['trimmomatic'].full_exe[0]),
        left, right, cpu_cap,
        singles[0], targets[0], singles[1], targets[1],
        TOOLS_DICT['trimmomatic'].full_exe[1])

    log_out, log_err = gen_logs(opc.path_logs, basename)
    return Task(command=command, dependencies=tasks, name=basename,
                stdout=log_out, stderr=log_err, targets=targets, cpu=cpu_cap)
def signalp_task(opc, path_orfs, out_dir, tasks):
    ''' Builds the Task that runs signalp on an ORF file.
        Params :
            opc - options object; only opc.path_logs is read here
            path_orfs - input file; its base name up to the first '.'
                        names the output and the task
            out_dir - directory that receives <name>.signalp
            tasks - a list of tasks that this task is dependent on
    '''
    orf_label = os.path.basename(path_orfs).partition('.')[0]
    result_file = '{0!s}/{1!s}.signalp'.format(out_dir, orf_label)
    targets = [result_file]

    executable = tool_path_check(TOOLS_DICT['signalp'].full_exe[0])
    command = '{0!s} -f short -n {1!s} {2!s}'.format(
        executable, result_file, path_orfs)

    task_name = 'signalp_' + orf_label
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def blast_augment_task(opc, db, blast, tasks):
    ''' Builds the Task that runs addStitleToBlastTab.py on a blast table,
        writing the result to <blast>_ex.
        Params :
            opc - options object; only opc.path_logs is read here
            db - database path; db + '.stitle' is passed as --db2Name
            blast - path of the blast table to augment
            tasks - a list of tasks that this task is dependent on
    '''
    title_map = db + '.stitle'
    augmented = '{0!s}_ex'.format(blast)
    targets = [augmented]

    command = (
        'python {0!s}/addStitleToBlastTab.py --db2Name {1!s} --blast {2!s} > {3!s}'
    ).format(statics.PATH_UTIL, title_map, blast, augmented)

    task_name = 'Blast_Augmentation_' + os.path.basename(blast)
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def assembly_to_bed_task(opc, path_assembly, out_dir, tasks):
    ''' Builds the Task that runs fasta_to_bed_count_length.py on an
        assembly, producing <out_dir>/<assembly_name>.bed.
        Params :
            opc - options object; only opc.path_logs is read here
            path_assembly - assembly fasta; its base name up to the first
                            '.fa' names the output and the task
            out_dir - directory that receives the bed file
            tasks - a list of tasks that this task is dependent on
    '''
    label = os.path.basename(path_assembly).partition('.fa')[0]
    bed_file = '{0!s}/{1!s}.bed'.format(out_dir, label)
    targets = [bed_file]

    command = 'python {0!s}/fasta_to_bed_count_length.py {1!s} {2!s}'.format(
        statics.PATH_UTIL, path_assembly, bed_file)

    task_name = 'fasta_to_bed_' + label
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def pipeplot_task(opc, dbs, annotation_table, out_dir, tasks):
    ''' Builds the Task that runs pipePlot.py inside <out_dir>/plots.
        Only plots/cogMultiple.png is declared as a target.
        Params :
            opc - options object; only opc.path_logs is read here
            dbs - mapping of database objects; dbs['nog_categories']
                  supplies call_path for --nog_categories
            annotation_table - table passed with -i
            out_dir - directory under which the 'plots' dir is created
            tasks - a list of tasks that this task is dependent on
    '''
    targets = ['{0!s}/plots/cogMultiple.png'.format(out_dir)]

    template = ('mkdir -p {0!s}/plots ; cd {0!s}/plots ; '
                'python {1!s}/pipePlot.py -i {2!s} --nog_categories {3!s};')
    command = template.format(
        out_dir,
        statics.PATH_UTIL,
        annotation_table,
        dbs['nog_categories'].call_path)

    task_name = 'pipeplot'
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def gff3_task(opc, path_assembly, out_path, opts, tasks):
    ''' Builds the Task that runs annot_table_gff3.py to write out_path.
        Params :
            opc - options object; only opc.path_logs is read here
            path_assembly - fasta passed with --fasta
            out_path - output file passed with --outfile (the target)
            opts - mapping of option name to string value; each entry is
                   appended to the command as '--<name> <value>'
            tasks - a list of tasks that this task is dependent on
    '''
    targets = [out_path]
    base_command = (
        'python {0!s}/annot_table_gff3.py --fasta {1!s} --outfile {2!s} '
    ).format(statics.PATH_UTIL, path_assembly, out_path)

    # One '--key value' fragment per option, in the mapping's own order.
    extra_flags = []
    for key in opts:
        extra_flags.append('--' + key + ' ' + opts[key])
    command = base_command + ' '.join(extra_flags)

    task_name = 'build_gff3_' + os.path.basename(out_path)
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def pfam_seq_task(opc, dbs, path_orfs, out_dir, cpu_cap, tasks):
    ''' Builds the Task that runs the configured hmmer executable against
        the pfam database, writing <out_dir>/<name>.pfam_tblout.
        Params :
            opc - options object; only opc.path_logs is read here
            dbs - mapping of database objects; dbs['pfam'] supplies call_path
            path_orfs - input file; its base name up to the first '.'
                        names the output and the task
            out_dir - directory that receives the table
            cpu_cap - value passed to --cpu and reserved by the Task
            tasks - a list of tasks that this task is dependent on
    '''
    orf_label = os.path.basename(path_orfs).partition('.')[0]
    table_file = '{0!s}/{1!s}.pfam_tblout'.format(out_dir, orf_label)
    targets = [table_file]

    executable = tool_path_check(TOOLS_DICT['hmmer'].full_exe[0])
    command = '{0!s} --cpu {1!s} --tblout {2!s} {3!s} {4!s}'.format(
        executable, cpu_cap, table_file, dbs['pfam'].call_path, path_orfs)

    task_name = 'pfam_tblout_' + orf_label
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err, cpu=cpu_cap)
def gene_trans_map_task(opc, path_assembly, out_dir, tasks):
    ''' Builds the Task that runs the second trinity executable
        (full_exe[1], a perl script) on an assembly and redirects its
        output to <out_dir>/<assembly_name>.gene_trans_map.
        Params :
            opc - options object; only opc.path_logs is read here
            path_assembly - assembly fasta; its base name up to the first
                            '.fa' names the output and the task
            out_dir - directory that receives the map
            tasks - a list of tasks that this task is dependent on
    '''
    label = os.path.basename(path_assembly).partition('.fa')[0]
    map_file = '{0!s}/{1!s}.gene_trans_map'.format(out_dir, label)
    targets = [map_file]

    script = tool_path_check(TOOLS_DICT['trinity'].full_exe[1])
    command = 'perl {0!s} {1!s} > {2!s}'.format(script, path_assembly, map_file)

    task_name = 'gene_trans_map_' + label
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def tmhmm_task(opc, path_orfs, out_dir, tasks):
    ''' Builds the Task that runs tmhmm with --short from inside out_dir,
        reading path_orfs on stdin and writing <out_dir>/<name>.tmhmm.
        Params :
            opc - options object; only opc.path_logs is read here
            path_orfs - input file; its base name up to the first '.'
                        names the output and the task
            out_dir - working directory and destination of the result
            tasks - a list of tasks that this task is dependent on
    '''
    orf_label = os.path.basename(path_orfs).partition('.')[0]
    result_file = '{0!s}/{1!s}.tmhmm'.format(out_dir, orf_label)
    targets = [result_file]

    executable = tool_path_check(TOOLS_DICT['tmhmm'].full_exe[0])
    command = 'cd {0!s}; {1!s} --short < {2!s} > {3!s}'.format(
        out_dir, executable, path_orfs, result_file)

    task_name = 'tmhmm_' + orf_label
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def kegg_task(opc, annotation_table, out_dir, tasks, kegg_map_id='ko01100'):
    ''' Builds the Task that runs color_pathways2.py inside
        <out_dir>/kegg_maps for one KEGG map.
        Params :
            opc - options object; only opc.path_logs is read here
            annotation_table - table passed with --transcriptomeKO
            out_dir - directory under which 'kegg_maps' is created
            tasks - a list of tasks that this task is dependent on
            kegg_map_id - map id passed with --path; also names the
                          <id>.pdf and <id>_KO.txt targets
    '''
    maps_dir = '{0!s}/kegg_maps'.format(out_dir)
    pdf_target = '{0!s}/{1!s}.pdf'.format(maps_dir, kegg_map_id)
    ko_target = '{0!s}/{1!s}_KO.txt'.format(maps_dir, kegg_map_id)
    targets = [pdf_target, ko_target]

    template = ('mkdir -p {3!s} ; cd {3!s} ; python {0!s}/color_pathways2.py --path {1!s} '
                ' --transcriptomeKO {2!s} --output {3!s}')
    command = template.format(
        statics.PATH_UTIL, kegg_map_id, annotation_table, maps_dir)

    task_name = 'draw_kegg_map_{0!s}_{1!s}'.format(
        os.path.basename(annotation_table), kegg_map_id)
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def rnammer_task(opc, path_assembly, out_dir, tasks):
    ''' Builds the Task that runs the configured rnammer wrapper on an
        assembly from inside out_dir.
        Params :
            opc - options object; only opc.path_logs is read here
            path_assembly - assembly passed with --transcriptome; its base
                            name up to the first '.fa' names the target
                            and the task
            out_dir - working directory; the target
                      <assembly_name>.fasta.rnammer.gff is expected there
            tasks - a list of tasks that this task is dependent on
    '''
    assembly_name = os.path.basename(path_assembly).split('.fa')[0]
    path_to_rnammer = os.path.dirname(TOOLS_DICT['rnammer'].folder_name)
    trgs = ['{0!s}/{1!s}.fasta.rnammer.gff'.format(out_dir, assembly_name)]
    # The template used to reference {4!s} while only four arguments
    # (indices 0-3) are supplied, so .format() raised IndexError on every
    # call. path_to_rnammer is the fourth argument, i.e. index 3.
    cmd = ("cd {0!s}; {1!s} --transcriptome {2!s} --path_to_rnammer {3!s} "
           "--org_type euk; cd -").format(
               out_dir,
               tool_path_check(TOOLS_DICT['rnammer'].full_exe[0]),
               path_assembly,
               path_to_rnammer)
    name = 'rnammer_' + assembly_name
    out, err = gen_logs(opc.path_logs, name)
    return Task(command=cmd, dependencies=tasks, targets=trgs, name=name,
                stdout=out, stderr=err)
def transdecoder_longorfs_task(opc, path_assembly, path_transdecoder_output, cpu_cap, tasks):
    ''' Builds the Task that runs the first transdecoder executable on an
        assembly from inside path_transdecoder_output.
        Params :
            opc - options object; opc.path_logs and opc.assembly_name are
                  read here
            path_assembly - assembly passed with -t; its base name up to
                            the first '.fa' is used in the task name
            path_transdecoder_output - working directory (created if needed)
            cpu_cap - cpu count reserved by the Task; it is not part of
                      the command line
            tasks - a list of tasks that this task is dependent on
    '''
    label = os.path.basename(path_assembly).partition('.fa')[0]
    # The targets directory is named after opc.assembly_name, not after
    # the label derived from path_assembly.
    orf_dir = os.path.join(path_transdecoder_output,
                           opc.assembly_name + '.fasta.transdecoder_dir')
    targets = [
        '{0!s}/longest_orfs.pep'.format(orf_dir),
        '{0!s}/longest_orfs.gff3'.format(orf_dir),
        '{0!s}/longest_orfs.cds'.format(orf_dir),
    ]

    executable = tool_path_check(TOOLS_DICT['transdecoder'].full_exe[0])
    command = "mkdir -p {0!s}; cd {0!s}; {1!s} -t {2!s}".format(
        path_transdecoder_output, executable, path_assembly)

    task_name = 'TransDecoder_LongORFs_' + label
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err, cpu=cpu_cap)
def sam_sort_task(opc, out_dir, bam_file, out_name, tasks):
    ''' Builds the Task that runs `samtools sort` on a bam file.
        The command is given <out_dir>/<out_name> as its second argument
        and the declared target is <out_dir>/<out_name>.bam.
        Params :
            opc - options object; only opc.path_logs is read here
            out_dir - directory that receives the sorted file
            bam_file - bam file to sort
            out_name - output name without the .bam extension
            tasks - a list of tasks that this task is dependent on
    '''
    targets = ['{0!s}/{1!s}.bam'.format(out_dir, out_name)]
    command = 'samtools sort {0!s} {1!s}/{2!s}'.format(
        bam_file, out_dir, out_name)

    task_name = '_'.join(['sam_sort', os.path.basename(bam_file), out_name])
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def salmon_gene_map_task(opc, out_dir, assembly_name, gene_trans_map, tasks):
    ''' Builds the Task that swaps the two columns of gene_trans_map with
        awk, because salmon requires the map in reverse column order
        (transcript first, then gene). The awk program joins the two
        columns with a single space.
        Params :
            opc - options object; only opc.path_logs is read here
            out_dir - directory that receives <assembly_name>.trans_gene_map
            assembly_name - label used in the output and task names
            gene_trans_map - input map file
            tasks - a list of tasks that this task is dependent on
    '''
    swapped_map = '{0!s}/{1!s}.trans_gene_map'.format(out_dir, assembly_name)
    targets = [swapped_map]

    # Doubled braces are literal braces for awk after .format().
    command = '''awk '{{ print $2 " " $1}}' {0!s} > {1!s}'''.format(
        gene_trans_map, swapped_map)

    task_name = 'salmon_gene_map_task_' + assembly_name
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def build_bowtie_task(opc, path_assembly, assembly_name, out_dir, tasks):
    ''' Builds the Task that runs the first bowtie2 executable to index
        an assembly under the prefix <out_dir>/<assembly_name>.
        Only <prefix>.1.bt2 is declared as a target.
        Params :
            opc - options object; only opc.path_logs is read here
            path_assembly - fasta passed with -f
            assembly_name - index prefix name; also part of the task name
            out_dir - directory that receives the index
            tasks - a list of tasks that this task is dependent on
    '''
    targets = ['{0!s}/{1!s}.1.bt2'.format(out_dir, assembly_name)]

    executable = tool_path_check(TOOLS_DICT['bowtie2'].full_exe[0])
    command = '{0!s} --offrate 1 -f {1!s} {2!s}/{3!s}'.format(
        executable, path_assembly, out_dir, assembly_name)

    task_name = 'build_bowtie_' + assembly_name
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def diamond_task(opc, blast_type, out_dir, path_query, ref, cpu_cap, tasks):
    ''' Builds the Task that runs diamond and then `diamond view` to
        convert the intermediate .daa file into the final table.
        valid blast_types: "blastx", "blastp"
        Params :
            opc - options object; only opc.path_logs is read here
            blast_type - diamond sub-command; also part of the names
            out_dir - output directory, also passed as --tmpdir
            path_query - query file; its base name up to the first '.'
                         is used in the target and task names
            ref - database passed with --db
            cpu_cap - value passed to --threads and reserved by the Task
            tasks - a list of tasks that this task is dependent on
    '''
    ref_label = os.path.basename(ref)
    query_label = os.path.basename(path_query).partition('.')[0]

    final_table = '{0!s}/{1!s}_{2!s}.diamond_{3!s}'.format(
        out_dir, query_label, ref_label, blast_type)
    targets = [final_table]
    # Prefix of the intermediate archive; the view step reads <prefix>.daa.
    daa_prefix = '{0!s}/diamond_{1!s}_{2!s}'.format(
        out_dir, ref_label, blast_type)

    template = ('{0!s} {1!s} --db {2!s} --query {3!s} --daa {4!s} --tmpdir {5!s} '
                '--max-target-seqs 20 --sensitive --threads {6!s} --evalue 0.001; {0!s} view '
                '--daa {4!s}.daa --out {7!s};')
    command = template.format(
        tool_path_check(TOOLS_DICT['diamond'].full_exe[0]),
        blast_type, ref, path_query, daa_prefix, out_dir, cpu_cap,
        final_table)

    task_name = 'diamond_{0!s}_{1!s}_{2!s}'.format(
        blast_type, ref_label, query_label)
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, cpu=cpu_cap,
                targets=targets, name=task_name,
                stdout=log_out, stderr=log_err)
def kallisto_task(opc, index, out_dir, out_name, left, right, tasks):
    ''' Builds the Task that runs `kallisto quant` on a read pair.
        No targets are declared for this task.
        Params :
            opc - options object; only opc.path_logs is read here
            index - kallisto index passed with -i
            out_dir - parent output directory
            out_name - sub-directory passed (with out_dir) to -o
            left - first reads file
            right - second reads file
            tasks - a list of tasks that this task is dependent on
    '''
    targets = []  # NO TARGETS

    executable = tool_path_check(TOOLS_DICT['kallisto'].full_exe[0])
    command = '{0!s} quant -i {1!s} -o {2!s}/{3!s} {4!s} {5!s}'.format(
        executable, index, out_dir, out_name, left, right)

    task_name = 'kallisto'
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def build_kallisto_task(opc, assembly_path, assembly_name, out_dir, tasks):
    ''' Builds the Task that runs `kallisto index` on an assembly,
        writing the index to <out_dir>/<assembly_name>_kallisto.
        No targets are declared for this task.
        Params :
            opc - options object; only opc.path_logs is read here
            assembly_path - fasta to index
            assembly_name - label used in the index file name
            out_dir - directory that receives the index
            tasks - a list of tasks that this task is dependent on
    '''
    targets = []  # NO TARGETS

    executable = tool_path_check(TOOLS_DICT['kallisto'].full_exe[0])
    command = '{0!s} index -i {1!s}/{2!s}_kallisto {3!s}'.format(
        executable, out_dir, assembly_name, assembly_path)

    task_name = 'build_kallisto'
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def cat_task(opc, out_dir, left, right, basename, tasks):
    ''' Builds the Task that concatenates the left files into
        <out_dir>/<basename>_1.fastq and the right files into
        <out_dir>/<basename>_2.fastq.
        Params :
            opc - options object; only opc.path_logs is read here
            out_dir - directory that receives the two combined files
            left - iterable of 1/left file paths
            right - iterable of 2/right file paths
            basename - prefix of the two output files
            tasks - a list of tasks that this task is dependent on
    '''
    combined_left = '{0!s}/{1!s}_1.fastq'.format(out_dir, basename)
    combined_right = '{0!s}/{1!s}_2.fastq'.format(out_dir, basename)
    targets = [combined_left, combined_right]

    command = 'cat {0!s} > {1!s}; cat {2!s} > {3!s}'.format(
        ' '.join(left), combined_left, ' '.join(right), combined_right)

    # NOTE(review): the name is the fixed text 'cat_basename', not derived
    # from the basename argument - confirm whether that is intended.
    task_name = 'cat_basename'
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, name=task_name,
                stdout=log_out, stderr=log_err, targets=targets)
def express_task(opc, bowtie2_index, assembly_path, out_dir, out_name, bam_input, tasks):
    ''' Builds the Task that runs express in a scratch directory
        <out_dir>/<out_name>, moves results.xprs to
        <out_dir>/<out_name>.xprs and removes the scratch directory.
        Params :
            opc - options object; only opc.path_logs is read here
            bowtie2_index - only its base name is used, in the task name
            assembly_path - assembly passed to express
            out_dir - parent output directory
            out_name - name of the scratch dir and of the .xprs result
            bam_input - alignment file passed to express
            tasks - a list of tasks that this task is dependent on
    '''
    result_file = '{0!s}/{1!s}.xprs'.format(out_dir, out_name)
    targets = [result_file]

    template = ('mkdir {1!s}/{2!s}; {0!s} --output-dir {1!s}/{2!s} {3!s} {4!s}; mv '
                '{1!s}/{2!s}/results.xprs {5!s}; rm -rf {1!s}/{2!s};')
    command = template.format(
        tool_path_check(TOOLS_DICT['express'].full_exe[0]),
        out_dir, out_name, assembly_path, bam_input, result_file)

    task_name = '_'.join(
        ['express', os.path.basename(bowtie2_index), out_name])
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def intersect_bed_task(opc, out_dir, bam_file, bed_reference, output_name, tasks):
    ''' Builds the Task that runs the configured bedtools executable with
        -abam/-b and redirects the bed output to
        <out_dir>/<output_name>.bed.
        Params :
            opc - options object; only opc.path_logs is read here
            out_dir - directory that receives the bed file
            bam_file - alignment passed with -abam
            bed_reference - bed file passed with -b
            output_name - name of the output without the .bed extension
            tasks - a list of tasks that this task is dependent on
    '''
    bed_output = '{0!s}/{1!s}.bed'.format(out_dir, output_name)
    targets = [bed_output]

    # The executable is invoked directly, without an 'intersect' sub-command.
    executable = tool_path_check(TOOLS_DICT['bedtools'].full_exe[0])
    command = '{0!s} -abam {1!s} -b {2!s} -wb -bed > {3!s}'.format(
        executable, bam_file, bed_reference, bed_output)

    task_name = '_'.join(
        ['intersect_bed', os.path.basename(bed_reference), output_name])
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def assembly_stats_task(opc, out_dir, assembly, tasks):
    ''' Builds the Task that runs assembly_stats.py on an assembly and
        redirects its output to <out_dir>/assembly_stats.json.
        Params :
            opc - options object; only opc.path_logs is read here
            out_dir - directory that receives assembly_stats.json
            assembly - path of the assembly to summarise
            tasks - a list of tasks that this task is dependant on
                    (trinity_task)
    '''
    stats_file = '{0!s}/assembly_stats.json'.format(out_dir)
    targets = [stats_file]

    command = 'python {0!s}/assembly_stats.py {1!s} > {2!s}'.format(
        statics.PATH_UTIL, assembly, stats_file)

    task_name = 'assembly_stats_' + os.path.basename(assembly)
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err)
def build_salmon_task(opc, path_assembly, assembly_name, out_dir, cpu_cap, tasks):
    ''' Builds the Task that runs `salmon index` on an assembly, writing
        the index to <out_dir>/<assembly_name>_salmon.
        Params :
            opc - options object; only opc.path_logs is read here
            path_assembly - fasta passed with --transcripts
            assembly_name - label used in the index and task names
            out_dir - directory that receives the index
            cpu_cap - value passed to --threads and reserved by the Task
            tasks - a list of tasks that this task is dependent on
    '''
    targets = ['{0!s}/{1!s}_salmon'.format(out_dir, assembly_name)]

    template = ('{0!s} index --transcripts {1!s} --index {2!s}/{3!s}_salmon '
                '--threads {4!s} --type quasi')
    command = template.format(
        tool_path_check(TOOLS_DICT['salmon'].full_exe[0]),
        path_assembly, out_dir, assembly_name, cpu_cap)

    task_name = 'build_salmon_' + assembly_name
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err, cpu=cpu_cap)
def bowtie2_unpaired_task(opc, bowtie2_index, out_dir, fastq, out_name, opt, cpu_cap, tasks):
    ''' Builds the Task that aligns unpaired reads with the second bowtie2
        executable and pipes the result through `samtools view -Sb` into
        <out_dir>/<out_name>.bam.
        Params :
            opc - options object; only opc.path_logs is read here
            bowtie2_index - index passed with -x; its base name is part of
                            the task name
            out_dir - directory that receives the bam file
            fastq - reads passed with -U
            out_name - name of the output without the .bam extension
            opt - index (0, 1 or 2) selecting one of the preset option
                  strings defined below
            cpu_cap - value passed to --threads and reserved by the Task
            tasks - a list of tasks that this task is dependent on
    '''
    alignment_presets = [
        '-a -t --end-to-end',
        '-t --local',
        '-k 200 --end-to-end',
    ]
    seed_length = 22  # value passed to -L

    bam_output = '{0!s}/{1!s}.bam'.format(out_dir, out_name)
    targets = [bam_output]

    template = ('{0!s} {1!s} -L {2!s} -N 1 --threads {3!s} -x {4!s} -U '
                '{5!s} | samtools view -Sb - > {6!s} ')
    command = template.format(
        tool_path_check(TOOLS_DICT['bowtie2'].full_exe[1]),
        alignment_presets[opt], seed_length, cpu_cap,
        bowtie2_index, fastq, bam_output)

    task_name = '_'.join(
        ['bowtie2', os.path.basename(bowtie2_index), out_name])
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=targets,
                name=task_name, stdout=log_out, stderr=log_err, cpu=cpu_cap)