def signalp_task(path_orfs, out_dir, tasks):
    '''Build the Task that runs the signalp executable on an ORF file.

    The single target is ``<out_dir>/<stem>.signalp`` where ``<stem>`` is
    the part of the input's basename before its first dot.

    Params :
        path_orfs - path of the file given to signalp as its last argument
        out_dir - directory in which the ``.signalp`` target is placed
        tasks - the tasks that this task is dependent on
    '''
    stem = os.path.basename(path_orfs).split('.')[0]
    targets = ['{0!s}/{1!s}.signalp'.format(out_dir, stem)]
    signalp_exe = fg.tool_path_check(TOOLS_DICT['signalp'].full_exe[0])
    command = '{0!s} -f short -n {1!s} {2!s}'.format(
        signalp_exe, targets[-1], path_orfs)
    task_name = 'signalp_' + stem
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=targets,
                name=task_name,
                stdout=log_out,
                stderr=log_err)
def salmon_unpaired_task(index, unpaired, out_name, gene_map, out_dir, cpu_cap, tasks):
    '''Build the Task that runs ``salmon quant`` on unpaired reads.

    This variant registers no targets.

    Params :
        index - value passed to -i; its basename is part of the task name
        unpaired - value passed to -r
        out_name - path component appended to out_dir for the -o option
        gene_map - value passed to --geneMap
        out_dir - parent of the path given to -o
        cpu_cap - value passed to -p and recorded as the Task's cpu
        tasks - the tasks that this task is dependent on
    '''
    salmon_exe = fg.tool_path_check(TOOLS_DICT['salmon'].full_exe[0])
    template = ('{0!s} quant -i {1!s} -l U -r {2!s} -o {3!s}/{4!s} '
                '--geneMap {5!s} -p {6!s} --extraSensitive')
    command = template.format(salmon_exe, index, unpaired, out_dir, out_name,
                              gene_map, cpu_cap)
    task_name = 'salmon_unpaired_' + os.path.basename(index)
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[],
                name=task_name,
                stdout=log_out,
                stderr=log_err,
                cpu=cpu_cap)
Ejemplo n.º 3
0
def signalp_task(opc, path_orfs, out_dir, tasks):
    '''Build the Task that runs the signalp executable on an ORF file.

    The single target is ``<out_dir>/<stem>.signalp`` where ``<stem>`` is
    the part of the input's basename before its first dot.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        path_orfs - path of the file given to signalp as its last argument
        out_dir - directory in which the ``.signalp`` target is placed
        tasks - the tasks that this task is dependent on
    '''
    stem = os.path.basename(path_orfs).split('.')[0]
    targets = ['{0!s}/{1!s}.signalp'.format(out_dir, stem)]
    signalp_exe = tool_path_check(TOOLS_DICT['signalp'].full_exe[0])
    command = '{0!s} -f short -n {1!s} {2!s}'.format(
        signalp_exe, targets[-1], path_orfs)
    task_name = 'signalp_' + stem
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=targets,
                name=task_name,
                stdout=log_out,
                stderr=log_err)
def prinseq_task(opc, out_dir, input_1, input_2, basename, opts, tasks):
    '''Build the Task that runs prinseq (through perl) on a pair of fastq files.

    After prinseq, the command moves ``<out_dir>/<basename>_1.fastq`` and
    ``<out_dir>/<basename>_2.fastq`` (presumably what prinseq writes for
    --out_good) onto the registered targets, which are named after the
    basenames of the two inputs.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        out_dir - directory receiving the output files
        input_1 - path of the 1/left fastq file
        input_2 - path of the 2/right fastq file
        basename - the basename for all output files; also the task name
        opts - extra text inserted into the prinseq command line
        tasks - the tasks that this task is dependent on
    '''
    final_left = '{0!s}/{1!s}_1_{2!s}'.format(
        out_dir, basename, os.path.basename(input_1))
    final_right = '{0!s}/{1!s}_2_{2!s}'.format(
        out_dir, basename, os.path.basename(input_2))
    # Intermediate files that the command renames to the final targets.
    raw_left = '{0!s}/{1!s}_{2!s}.fastq'.format(out_dir, basename, 1)
    raw_right = '{0!s}/{1!s}_{2!s}.fastq'.format(out_dir, basename, 2)

    prinseq_exe = tool_path_check(TOOLS_DICT['prinseq'].full_exe[0])
    template = (
        'perl {0!s} -fastq {1!s} -fastq2 {2!s} --out_format 3 --out_good {3!s}/{4!s} '
        '--out_bad null --trim_qual_left 20 --trim_qual_right 20 --trim_qual_type min '
        '--min_len 55 --trim_tail_left 8 --trim_tail_right 8 {5!s} -log; mv {6!s} {7!s};'
        ' mv {8!s} {9!s};')
    command = template.format(prinseq_exe, input_1, input_2, out_dir,
                              basename, opts, raw_left, final_left,
                              raw_right, final_right)

    log_out, log_err = gen_logs(opc.path_logs, basename)
    return Task(command=command,
                dependencies=tasks,
                name=basename,
                stdout=log_out,
                stderr=log_err,
                targets=[final_left, final_right])
def salmon_unpaired_task(opc, index, unpaired, out_name, gene_map, out_dir,
                         cpu_cap, tasks):
    '''Build the Task that runs ``salmon quant`` on unpaired reads.

    After salmon finishes, the command copies ``quant.sf`` and
    ``quant.genes.sf`` out of ``<out_dir>/<out_name>/`` to
    ``<out_dir>/<out_name>_quant.sf`` and ``<out_dir>/<out_name>_quant.genes.sf``,
    which are the registered targets.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        index - value passed to -i; its basename is part of the task name
        unpaired - value passed to -r; its basename is part of the task name
        out_name - path component appended to out_dir for the -o option
        gene_map - value passed to --geneMap; when empty (or None) the
                   option is left out of the command
        out_dir - parent of the path given to -o
        cpu_cap - value passed to -p and recorded as the Task's cpu
        tasks - the tasks that this task is dependent on
    '''
    trgs = [
        '{0!s}/{1!s}_quant.sf'.format(out_dir, out_name),
        '{0!s}/{1!s}_quant.genes.sf'.format(out_dir, out_name)
    ]
    # Always bind the option text so the command can be formatted even when
    # no gene map is supplied.
    # NOTE(review): without --geneMap salmon presumably does not write
    # quant.genes.sf, so the second cp/target may not be produced - confirm.
    trans_gene_map = ''
    if gene_map:
        trans_gene_map = ' --geneMap {0!s}'.format(gene_map)
    cmd = (
        '{0!s} quant -i {1!s} -l U -r {2!s} -o {3!s}/{4!s} {5!s} '
        '-p {6!s} --dumpEq --extraSensitive; cp {3!s}/{4!s}/quant.sf '
        '{3!s}/{4!s}_quant.sf; cp {3!s}/{4!s}/quant.genes.sf '
        '{3!s}/{4!s}_quant.genes.sf').format(
            tool_path_check(TOOLS_DICT['salmon'].full_exe[0]), index, unpaired,
            out_dir, out_name, trans_gene_map, cpu_cap)
    name = 'salmon_unpaired_' + os.path.basename(
        index) + '_' + os.path.basename(unpaired)
    out, err = gen_logs(opc.path_logs, name)
    return Task(command=cmd,
                dependencies=tasks,
                targets=trgs,
                name=name,
                stdout=out,
                stderr=err,
                cpu=cpu_cap)
def kallisto_task(index, out_dir, out_name, left, right, tasks):
    '''Build the Task that runs ``kallisto quant``.

    No targets are registered and the task name is always 'kallisto'.

    Params :
        index - value passed to -i
        out_dir - parent of the path given to -o
        out_name - path component appended to out_dir for the -o option
        left - first positional read argument
        right - second positional read argument
        tasks - the tasks that this task is dependent on
    '''
    kallisto_exe = fg.tool_path_check(TOOLS_DICT['kallisto'].full_exe[0])
    command = '{0!s} quant -i {1!s} -o {2!s}/{3!s} {4!s} {5!s}'.format(
        kallisto_exe, index, out_dir, out_name, left, right)
    task_name = 'kallisto'
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[],
                name=task_name,
                stdout=log_out,
                stderr=log_err)
def busco_task(opc, dbs, assembly_path, assembly_name, out_dir, reference_name,
               cpu_cap, tasks):
    '''Build the Task that runs busco (through python) from inside out_dir.

    The registered target is
    ``<out_dir>/run_busco_<assembly_name>_<reference_name>``.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        dbs - mapping looked up with the key 'busco_<reference_name>'; the
              entry's call_path is used to build the -l argument
        assembly_path - value passed to -i
        assembly_name - used in the output, target and task names
        out_dir - directory the command changes into before running busco
        reference_name - name of the reference to be used by busco
        cpu_cap - the cpu limit given to busco (-c) and recorded on the Task
        tasks - a list of tasks that this task is dependent on
    '''
    run_dir = '{0!s}/run_busco_{1!s}_{2!s}'.format(
        out_dir, assembly_name, reference_name)
    reference_db = dbs['busco_' + reference_name]
    busco_exe = tool_path_check(TOOLS_DICT['busco'].full_exe[0])
    template = ('cd {0!s}; python {1!s} '
                '-o busco_{3!s}_{2!s} -i {4!s} -l {5!s}/{2!s}_odb9 -m '
                'tran -f -c {6!s}')
    command = template.format(out_dir, busco_exe, reference_name,
                              assembly_name, assembly_path,
                              reference_db.call_path, cpu_cap)
    task_name = 'busco_' + reference_name + '_' + assembly_name
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[run_dir],
                name=task_name,
                cpu=cpu_cap,
                stdout=log_out,
                stderr=log_err)
def trimmomatic_task(opc, out_dir, left, right, cpu_cap, basename, tasks):
    '''Build the Task that runs trimmomatic in PE mode on a read pair.

    The targets are ``<out_dir>/<basename>_1_<left basename>`` and
    ``<out_dir>/<basename>_2_<right basename>``; two further ``_1s_`` /
    ``_2s_`` paths are passed on the command line but not registered as
    targets.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        out_dir - directory receiving the output files
        left - path of the first input file
        right - path of the second input file
        cpu_cap - value passed to -threads and recorded as the Task's cpu
        basename - prefix of every output file; also the task name
        tasks - the tasks that this task is dependent on
    '''
    prefix = '{0!s}/{1!s}'.format(out_dir, basename)

    def tagged(tag, read_path):
        # <out_dir>/<basename><tag><basename of the read file>
        return prefix + tag + os.path.basename(read_path)

    kept = [tagged('_1_', left), tagged('_2_', right)]
    single = [tagged('_1s_', left), tagged('_2s_', right)]

    trimmomatic_jar = tool_path_check(TOOLS_DICT['trimmomatic'].full_exe[0])
    # Second configured entry is the file used in the ILLUMINACLIP step.
    clip_file = TOOLS_DICT['trimmomatic'].full_exe[1]
    template = (
        'java -jar {0!s} PE -threads {3!s} {1!s} {2!s} {5!s} {4!s} {7!s} '
        '{6!s} ILLUMINACLIP:{8!s}:2:30:10 LEADING:3 TRAILING:3 '
        'SLIDINGWINDOW:4:15 MINLEN:35')
    command = template.format(trimmomatic_jar, left, right, cpu_cap,
                              single[0], kept[0], single[1], kept[1],
                              clip_file)

    log_out, log_err = gen_logs(opc.path_logs, basename)
    return Task(command=command,
                dependencies=tasks,
                name=basename,
                stdout=log_out,
                stderr=log_err,
                targets=kept,
                cpu=cpu_cap)
def build_salmon_task(path_assembly, assembly_name, out_dir, cpu_cap, tasks):
    '''Build the Task that runs ``salmon index`` for an assembly.

    The index path, ``<out_dir>/<assembly_name>_salmon``, is the target.

    Params :
        path_assembly - value passed to --transcripts
        assembly_name - used in the index path and the task name
        out_dir - directory in which the index path is placed
        cpu_cap - value passed to --threads and recorded as the Task's cpu
        tasks - the tasks that this task is dependent on
    '''
    targets = ['{0!s}/{1!s}_salmon'.format(out_dir, assembly_name)]
    salmon_exe = fg.tool_path_check(TOOLS_DICT['salmon'].full_exe[0])
    template = ('{0!s} index --transcripts {1!s} --index {2!s}/{3!s}_salmon '
                '--threads {4!s} --type quasi')
    command = template.format(salmon_exe, path_assembly, out_dir,
                              assembly_name, cpu_cap)
    task_name = 'build_salmon_' + assembly_name
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=targets,
                name=task_name,
                stdout=log_out,
                stderr=log_err,
                cpu=cpu_cap)
def gene_trans_map_task(path_assembly, out_dir, tasks):
    '''Build the Task that writes a gene_trans_map file for an assembly.

    Runs the second executable configured for 'trinity' on the assembly and
    redirects its stdout to ``<out_dir>/<name>.gene_trans_map``, where
    ``<name>`` is the assembly basename up to the first '.fa'.

    Params :
        path_assembly - path of the assembly given to the executable
        out_dir - directory in which the target is written
        tasks - the tasks that this task is dependent on
    '''
    assembly_stem = os.path.basename(path_assembly).split('.fa')[0]
    map_path = '{0!s}/{1!s}.gene_trans_map'.format(out_dir, assembly_stem)
    mapper_exe = fg.tool_path_check(TOOLS_DICT['trinity'].full_exe[1])
    command = '{0!s} {1!s} > {2!s}'.format(mapper_exe, path_assembly, map_path)
    task_name = 'gene_trans_map_' + assembly_stem
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[map_path],
                name=task_name,
                stdout=log_out,
                stderr=log_err)
def tmhmm_task(path_orfs, out_dir, tasks):
    '''Build the Task that runs tmhmm on an ORF file from inside out_dir.

    The ORF file is fed on stdin and stdout is redirected to the target,
    ``<out_dir>/<stem>.tmhmm``, where ``<stem>`` is the part of the input's
    basename before its first dot.

    Params :
        path_orfs - path of the file redirected into tmhmm
        out_dir - directory the command changes into; also holds the target
        tasks - the tasks that this task is dependent on
    '''
    stem = os.path.basename(path_orfs).split('.')[0]
    report = '{0!s}/{1!s}.tmhmm'.format(out_dir, stem)
    tmhmm_exe = fg.tool_path_check(TOOLS_DICT['tmhmm'].full_exe[0])
    command = 'cd {0!s}; {1!s} --short < {2!s} > {3!s}'.format(
        out_dir, tmhmm_exe, path_orfs, report)
    task_name = 'tmhmm_' + stem
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[report],
                name=task_name,
                stdout=log_out,
                stderr=log_err)
 def ret():
     '''Report readiness and, if possible, point transrate at the files in reads_dir.

     Returns False while any task in other_dependencies is unfinished, or
     when its finished() call raises Task.ExitCodeException. Otherwise looks
     in reads_dir for files whose path contains the basename of each
     original left/right/single read file. When every input is matched (and
     there is at least one left or single file) transrate_task.command is
     rebuilt to use the matched files; if not, a message is printed and the
     command is left alone. Returns True in both of those cases.
     '''
     for t in other_dependencies:
         try:
             # Query the dependency currently being iterated.
             if not t.finished():
                 return False
         except Task.ExitCodeException:
             return False

     assembly_files = [os.path.join(reads_dir, f)
                       for f in sorted(os.listdir(reads_dir))]

     def first_matches(originals):
         # For each original path keep the first candidate whose path
         # contains its basename; inputs without a match are dropped.
         found = []
         for f in originals:
             hits = [g for g in assembly_files if os.path.basename(f) in g]
             if hits:
                 found.append(hits[0])
         return found

     new_lefts = first_matches(lefts)
     new_rights = first_matches(rights)
     new_singles = first_matches(singles)

     all_matched = (len(new_lefts) == len(lefts)
                    and len(new_rights) == len(rights)
                    and len(new_singles) == len(singles))
     if all_matched and len(new_lefts) + len(new_singles) != 0:
         # Single-end files are appended to the --left list.
         left_arg = ','.join(new_lefts + new_singles)
         right_arg = ','.join(new_rights)
         left_arg = '--left ' + left_arg if left_arg else ''
         right_arg = '--right ' + right_arg if right_arg else ''
         cmd = '{0!s} --assembly {1!s} {2!s} {3!s} --threads {4!s} {5!s} --output {6!s}'.format(
                fg.tool_path_check(TOOLS_DICT['transrate'].full_exe[0]), assembly_path, left_arg,
                right_arg, cpu_cap, reference, transrate_dir)
         transrate_task.command = cmd
     else:
         print('Unable to match input files with trimmed output. Continuing transrate using input files instead.')
     return True
def build_kallisto_task(assembly_path, assembly_name, out_dir, tasks):
    '''Build the Task that runs ``kallisto index`` for an assembly.

    The index is written to ``<out_dir>/<assembly_name>_kallisto``. No
    targets are registered and the task name is always 'build_kallisto'.

    Params :
        assembly_path - path of the assembly given to kallisto
        assembly_name - used to name the index file
        out_dir - directory in which the index path is placed
        tasks - the tasks that this task is dependent on
    '''
    kallisto_exe = fg.tool_path_check(TOOLS_DICT['kallisto'].full_exe[0])
    command = '{0!s} index -i {1!s}/{2!s}_kallisto {3!s}'.format(
        kallisto_exe, out_dir, assembly_name, assembly_path)
    task_name = 'build_kallisto'
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[],
                name=task_name,
                stdout=log_out,
                stderr=log_err)
def salmon_task(opc, index, left, right, out_name, gene_map, out_dir, cpu_cap,
                tasks):
    '''Build the Task that runs ``salmon quant`` on paired reads.

    The registered targets are ``quant.sf`` and ``quant.genes.sf`` inside
    ``<out_dir>/<out_name>/``.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        index - value passed to -i; its basename is part of the task name
        left - value passed to -1; its basename is part of the task name
        right - value passed to -2
        out_name - path component appended to out_dir for the -o option
        gene_map - value passed to --geneMap; when empty (or None) the
                   option is left out of the command
        out_dir - parent of the path given to -o
        cpu_cap - value passed to -p and recorded as the Task's cpu
        tasks - the tasks that this task is dependent on
    '''
    trgs = [
        '{0!s}/{1!s}/quant.sf'.format(out_dir, out_name),
        '{0!s}/{1!s}/quant.genes.sf'.format(out_dir, out_name)
    ]
    # Always bind the option text so the command can be formatted even when
    # no gene map is supplied.
    # NOTE(review): without --geneMap salmon presumably does not write
    # quant.genes.sf, so the second target may never appear - confirm.
    trans_gene_map = ''
    if gene_map:
        trans_gene_map = ' --geneMap {0!s}'.format(gene_map)
    cmd = (
        '{0!s} quant -i {1!s} -l IU -1 {2!s} -2 {3!s} -o {4!s}/{5!s} '
        '{6!s} -p {7!s} --dumpEq'
    ).format(
        tool_path_check(TOOLS_DICT['salmon'].full_exe[0]),
        index,
        left,
        right,
        out_dir,
        out_name,
        trans_gene_map,
        cpu_cap)
    name = os.path.basename(index) + '_' + os.path.basename(left)
    out, err = gen_logs(opc.path_logs, name)
    return Task(command=cmd,
                dependencies=tasks,
                targets=trgs,
                name=name,
                stdout=out,
                stderr=err,
                cpu=cpu_cap)
def trinity_task(opc, path_assembly, out_dir, fastq, fastq2, unpaired,
                 cpu_cap_trin, cpu_cap_bfly, mem_trin, mem_bfly,
                 normalize_flag, tasks):
    '''Build the Task that runs trinity and copies its assembly to path_assembly.

    Trinity writes to ``<out_dir>/trinity``; afterwards the command copies
    ``Trinity.fasta`` from there to path_assembly, the only target.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        path_assembly - destination of the copied Trinity.fasta
        out_dir - parent directory of the trinity output directory
        fastq - list of 1/left fastq files; when non-empty, --left/--right
                are used and the unpaired files are appended to --left
        fastq2 - list of 2/right fastq files
        unpaired - list of unpaired fastq files; passed with --single only
                   when fastq is empty
        cpu_cap_trin - value passed to --CPU
        cpu_cap_bfly - only used when choosing the Task's cpu (the larger of
                       this and cpu_cap_trin)
        mem_trin - value passed to --max_memory (suffixed with G)
        mem_bfly - handed to the format call but not referenced by the
                   command template
        normalize_flag - when truthy, '--normalize_reads' is added
        tasks - a list of tasks that this task is dependent on
    '''
    if normalize_flag:
        normalize_opt = '--normalize_reads'
    else:
        normalize_opt = ''

    read_args = ''
    if unpaired != [] and fastq == []:
        read_args += '--single ' + ','.join(unpaired)
    if fastq != []:
        read_args += '--left ' + ','.join(fastq + unpaired)
        read_args += ' --right ' + ','.join(fastq2)

    targets = [path_assembly]
    trinity_exe = tool_path_check(TOOLS_DICT['trinity'].full_exe[0])
    template = (
        '{0!s} --seqType fq {1!s} --CPU {2!s} --max_memory {3!s}G --bflyCalculateCPU {4!s} '
        '--output {6!s}/trinity; cp {6!s}/trinity/Trinity.fasta {7!s};'
    )
    command = template.format(trinity_exe, read_args, cpu_cap_trin, mem_trin,
                              normalize_opt, mem_bfly, out_dir, targets[0])

    task_name = 'trinity_assembly'
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=targets,
                name=task_name,
                cpu=max(cpu_cap_trin, cpu_cap_bfly),
                stdout=log_out,
                stderr=log_err)
def pfam_seq_task(path_orfs, out_dir, cpu_cap, tasks):
    '''Build the Task that runs the configured hmmer executable against PATH_PFAM_DATABASE.

    The tabular output (--tblout) goes to ``<out_dir>/<stem>.pfam_tblout``,
    where ``<stem>`` is the part of the input's basename before its first
    dot; that file is the only target.

    Params :
        path_orfs - path of the sequence file given as the last argument
        out_dir - directory in which the target is written
        cpu_cap - value passed to --cpu and recorded as the Task's cpu
        tasks - the tasks that this task is dependent on
    '''
    stem = os.path.basename(path_orfs).split('.')[0]
    tblout = '{0!s}/{1!s}.pfam_tblout'.format(out_dir, stem)
    hmmer_exe = fg.tool_path_check(TOOLS_DICT['hmmer'].full_exe[0])
    command = '{0!s} --cpu {1!s} --tblout {2!s} {3!s} {4!s}'.format(
        hmmer_exe, cpu_cap, tblout, PATH_PFAM_DATABASE, path_orfs)
    task_name = 'pfam_tblout_' + stem
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[tblout],
                name=task_name,
                stdout=log_out,
                stderr=log_err,
                cpu=cpu_cap)
def salmon_task(index, left, right, out_name, gene_map, out_dir, cpu_cap, tasks):
    '''Build the Task that runs ``salmon quant`` on paired reads.

    After salmon finishes, the command copies ``quant.sf`` and
    ``quant.genes.sf`` out of ``<out_dir>/<out_name>/`` to
    ``<out_dir>/<out_name>_quant.sf`` and ``<out_dir>/<out_name>_quant.genes.sf``,
    which are the registered targets.

    Params :
        index - value passed to -i; its basename is part of the task name
        left - value passed to -1; its basename is part of the task name
        right - value passed to -2
        out_name - path component appended to out_dir for the -o option
        gene_map - value passed to --geneMap
        out_dir - parent of the path given to -o
        cpu_cap - value passed to -p and recorded as the Task's cpu
        tasks - the tasks that this task is dependent on
    '''
    targets = [
        '{0!s}/{1!s}_quant.sf'.format(out_dir, out_name),
        '{0!s}/{1!s}_quant.genes.sf'.format(out_dir, out_name),
    ]
    salmon_exe = fg.tool_path_check(TOOLS_DICT['salmon'].full_exe[0])
    template = (
        '{0!s} quant -i {1!s} -l IU -1 {2!s} -2 {3!s} -o {4!s}/{5!s} '
        '--geneMap {6!s} -p {7!s} --extraSensitive; cp '
        '{4!s}/{5!s}/quant.sf {4!s}/{5!s}_quant.sf; cp '
        '{4!s}/{5!s}/quant.genes.sf {4!s}/{5!s}_quant.genes.sf')
    command = template.format(salmon_exe, index, left, right, out_dir,
                              out_name, gene_map, cpu_cap)
    task_name = 'salmon_' + os.path.basename(index) + '_' + os.path.basename(left)
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=targets,
                name=task_name,
                stdout=log_out,
                stderr=log_err,
                cpu=cpu_cap)
Ejemplo n.º 18
0
def tmhmm_task(opc, path_orfs, out_dir, tasks):
    '''Build the Task that runs tmhmm on an ORF file from inside out_dir.

    The ORF file is fed on stdin and stdout is redirected to the target,
    ``<out_dir>/<stem>.tmhmm``, where ``<stem>`` is the part of the input's
    basename before its first dot.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        path_orfs - path of the file redirected into tmhmm
        out_dir - directory the command changes into; also holds the target
        tasks - the tasks that this task is dependent on
    '''
    stem = os.path.basename(path_orfs).split('.')[0]
    report = '{0!s}/{1!s}.tmhmm'.format(out_dir, stem)
    tmhmm_exe = tool_path_check(TOOLS_DICT['tmhmm'].full_exe[0])
    command = 'cd {0!s}; {1!s} --short < {2!s} > {3!s}'.format(
        out_dir, tmhmm_exe, path_orfs, report)
    task_name = 'tmhmm_' + stem
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[report],
                name=task_name,
                stdout=log_out,
                stderr=log_err)
Ejemplo n.º 19
0
def pfam_seq_task(opc, dbs, path_orfs, out_dir, cpu_cap, tasks):
    '''Build the Task that runs the configured hmmer executable against the pfam database.

    The tabular output (--tblout) goes to ``<out_dir>/<stem>.pfam_tblout``,
    where ``<stem>`` is the part of the input's basename before its first
    dot; that file is the only target.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        dbs - mapping with a 'pfam' entry whose call_path is the database
        path_orfs - path of the sequence file given as the last argument
        out_dir - directory in which the target is written
        cpu_cap - value passed to --cpu and recorded as the Task's cpu
        tasks - the tasks that this task is dependent on
    '''
    stem = os.path.basename(path_orfs).split('.')[0]
    tblout = '{0!s}/{1!s}.pfam_tblout'.format(out_dir, stem)
    hmmer_exe = tool_path_check(TOOLS_DICT['hmmer'].full_exe[0])
    pfam_db = dbs['pfam'].call_path
    command = '{0!s} --cpu {1!s} --tblout {2!s} {3!s} {4!s}'.format(
        hmmer_exe, cpu_cap, tblout, pfam_db, path_orfs)
    task_name = 'pfam_tblout_' + stem
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[tblout],
                name=task_name,
                stdout=log_out,
                stderr=log_err,
                cpu=cpu_cap)
Ejemplo n.º 20
0
def gene_trans_map_task(opc, path_assembly, out_dir, tasks):
    '''Build the Task that writes a gene_trans_map file for an assembly.

    Runs, through perl, the second executable configured for 'trinity' on
    the assembly and redirects its stdout to
    ``<out_dir>/<name>.gene_trans_map``, where ``<name>`` is the assembly
    basename up to the first '.fa'.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        path_assembly - path of the assembly given to the script
        out_dir - directory in which the target is written
        tasks - the tasks that this task is dependent on
    '''
    assembly_stem = os.path.basename(path_assembly).split('.fa')[0]
    map_path = '{0!s}/{1!s}.gene_trans_map'.format(out_dir, assembly_stem)
    mapper_script = tool_path_check(TOOLS_DICT['trinity'].full_exe[1])
    command = 'perl {0!s} {1!s} > {2!s}'.format(
        mapper_script, path_assembly, map_path)
    task_name = 'gene_trans_map_' + assembly_stem
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[map_path],
                name=task_name,
                stdout=log_out,
                stderr=log_err)
def build_blast_task(path_db, out_dir, dbtype, tasks, log_flag=True):
    '''Build the Task that creates a blast database from a gzipped file.

    The command pipes ``gunzip -c <path_db>`` into the configured blast
    executable, which reads stdin (``-in -``). No targets are registered.

    Params :
        path_db - gzipped input; the part of its basename before the first
                  dot is used as the -title value and in the task name
        out_dir - value passed to -out
        dbtype - value passed to -dbtype
        tasks - the tasks that this task is dependent on
        log_flag - when falsy, no log files are generated and the Task gets
                   None for both stdout and stderr
    '''
    trgs = []
    # title doesn't seem to change the out name .. it's still xx.gz.psq, etc? CHECK.
    # The database file is the path_db parameter.
    title = os.path.basename(path_db).split('.')[0]
    cmd = 'gunzip -c {0!s} | {1!s} -in - -dbtype {2!s} -title {3!s} -out {4!s}'.format(
        path_db, fg.tool_path_check(TOOLS_DICT['blast'].full_exe[0]), dbtype, title, out_dir)
    name = 'build_blastplus_db_' + title
    out, err = fg.GEN_LOGS(name) if log_flag else (None, None)
    return Task(command=cmd, dependencies=tasks, targets=trgs, name=name, stdout=out, stderr=err)
def intersect_bed_task(out_dir, bam_file, bed_reference, output_name, tasks):
    '''Build the Task that runs ``bedtools intersect`` on a bam file and a bed reference.

    Stdout of the command is redirected to ``<out_dir>/<output_name>.bed``,
    the only target.

    Params :
        out_dir - directory in which the target is written
        bam_file - value passed to -abam
        bed_reference - value passed to -b
        output_name - stem of the target file; also part of the task name
        tasks - the tasks that this task is dependent on
    '''
    bed_out = '{0!s}/{1!s}.bed'.format(out_dir, output_name)
    bedtools_exe = fg.tool_path_check(TOOLS_DICT['bedtools'].full_exe[0])
    command = '{0!s} intersect -abam {1!s} -b {2!s} -wb -bed > {3!s}'.format(
        bedtools_exe, bam_file, bed_reference, bed_out)
    task_name = 'intersect_bed_' + output_name
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[bed_out],
                name=task_name,
                stdout=log_out,
                stderr=log_err)
def build_bowtie_task(path_assembly, assembly_name, out_dir, tasks):
    '''Build the Task that runs the first configured bowtie2 executable to index an assembly.

    The index base name is ``<out_dir>/<assembly_name>``; the file
    ``<base>.1.bt2`` is registered as the target.

    Params :
        path_assembly - path of the assembly given to the executable
        assembly_name - index base name; also part of the task name
        out_dir - directory in which the index is placed
        tasks - the tasks that this task is dependent on
    '''
    first_index_file = '{0!s}/{1!s}.1.bt2'.format(out_dir, assembly_name)
    builder_exe = fg.tool_path_check(TOOLS_DICT['bowtie2'].full_exe[0])
    command = '{0!s} --offrate 1 -f {1!s} {2!s}/{3!s}'.format(
        builder_exe, path_assembly, out_dir, assembly_name)
    task_name = 'build_bowtie_' + assembly_name
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[first_index_file],
                name=task_name,
                stdout=log_out,
                stderr=log_err)
def express_task(assembly_path, out_dir, out_name, bam_input, tasks):
    '''Build the Task that runs express and keeps only its results.xprs file.

    The command creates ``<out_dir>/<out_name>``, runs express with that as
    its output directory, moves ``results.xprs`` to
    ``<out_dir>/<out_name>.xprs`` (the target) and removes the directory.

    Params :
        assembly_path - first positional argument given to express
        out_dir - directory holding the temporary directory and the target
        out_name - name of the temporary directory and stem of the target
        bam_input - second positional argument given to express
        tasks - the tasks that this task is dependent on
    '''
    xprs_path = '{0!s}/{1!s}.xprs'.format(out_dir, out_name)
    express_exe = fg.tool_path_check(TOOLS_DICT['express'].full_exe[0])
    template = (
        'mkdir {1!s}/{2!s}; {0!s} --output-dir {1!s}/{2!s} {3!s} {4!s}; mv '
        '{1!s}/{2!s}/results.xprs {5!s}; rm -rf {1!s}/{2!s};')
    command = template.format(express_exe, out_dir, out_name, assembly_path,
                              bam_input, xprs_path)
    task_name = 'express_' + out_name
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[xprs_path],
                name=task_name,
                stdout=log_out,
                stderr=log_err)
def rnammer_task(path_assembly, out_dir, tasks):
    '''Build the Task that runs the configured rnammer wrapper on an assembly.

    The command changes into out_dir, runs the executable with
    --transcriptome and --path_to_rnammer, then changes back. The target is
    ``<out_dir>/<name>.fasta.rnammer.gff`` where ``<name>`` is the assembly
    basename up to the first '.fa'.

    Params :
        path_assembly - value passed to --transcriptome
        out_dir - directory the command runs in; also holds the target
        tasks - the tasks that this task is dependent on
    '''
    assembly_name = os.path.basename(path_assembly).split('.fa')[0]
    path_to_rnammer = os.path.dirname(TOOLS_DICT['rnammer'].folder_name)
    trgs = ['{0!s}/{1!s}.fasta.rnammer.gff'.format(out_dir, assembly_name)]
    # path_to_rnammer is the fourth format argument, i.e. index 3; a higher
    # index raises IndexError because only four arguments are supplied.
    cmd = ("cd {0!s}; {1!s} --transcriptome {2!s}  --path_to_rnammer {3!s} "
           "--org_type euk; cd -").format(
               out_dir, fg.tool_path_check(TOOLS_DICT['rnammer'].full_exe[0]),
               path_assembly, path_to_rnammer)
    name = 'rnammer_' + assembly_name
    # Use the same fg-qualified log helper as the sibling task builders.
    out, err = fg.GEN_LOGS(name)
    return Task(command=cmd, dependencies=tasks, targets=trgs, name=name, stdout=out, stderr=err)
def transdecoder_longorfs_task(path_assembly, path_transdecoder_output, cpu_cap, tasks):
    '''Build the Task that runs the first configured transdecoder executable on an assembly.

    The command creates and enters path_transdecoder_output before running
    the executable with ``-t <path_assembly>``. The targets are
    ``longest_orfs.pep``, ``.gff3`` and ``.cds`` inside
    ``<path_transdecoder_output>/<name>.fasta.transdecoder_dir``, where
    ``<name>`` is the assembly basename up to the first '.fa'.

    Params :
        path_assembly - value passed to -t
        path_transdecoder_output - working directory of the command
        cpu_cap - recorded as the Task's cpu; handed to the format call but
                  not referenced by the command template
        tasks - the tasks that this task is dependent on
    '''
    assembly_stem = os.path.basename(path_assembly).split('.fa')[0]
    orf_dir = os.path.join(path_transdecoder_output,
                           assembly_stem + '.fasta.transdecoder_dir')
    targets = [
        '{0!s}/longest_orfs.pep'.format(orf_dir),
        '{0!s}/longest_orfs.gff3'.format(orf_dir),
        '{0!s}/longest_orfs.cds'.format(orf_dir),
    ]
    longorfs_exe = fg.tool_path_check(TOOLS_DICT['transdecoder'].full_exe[0])
    command = "mkdir -p {0!s}; cd {0!s}; {1!s} -t {2!s}".format(
        path_transdecoder_output, longorfs_exe, path_assembly, cpu_cap)
    task_name = 'TransDecoder_LongORFs_' + assembly_stem
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=targets,
                name=task_name,
                stdout=log_out,
                stderr=log_err,
                cpu=cpu_cap)
Ejemplo n.º 27
0
def rnammer_task(opc, path_assembly, out_dir, tasks):
    '''Build the Task that runs the configured rnammer wrapper on an assembly.

    The command changes into out_dir, runs the executable with
    --transcriptome and --path_to_rnammer, then changes back. The target is
    ``<out_dir>/<name>.fasta.rnammer.gff`` where ``<name>`` is the assembly
    basename up to the first '.fa'.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        path_assembly - value passed to --transcriptome
        out_dir - directory the command runs in; also holds the target
        tasks - the tasks that this task is dependent on
    '''
    assembly_name = os.path.basename(path_assembly).split('.fa')[0]
    path_to_rnammer = os.path.dirname(TOOLS_DICT['rnammer'].folder_name)
    trgs = ['{0!s}/{1!s}.fasta.rnammer.gff'.format(out_dir, assembly_name)]
    # path_to_rnammer is the fourth format argument, i.e. index 3; a higher
    # index raises IndexError because only four arguments are supplied.
    cmd = ("cd {0!s}; {1!s} --transcriptome {2!s}  --path_to_rnammer {3!s} "
           "--org_type euk; cd -").format(
           out_dir, tool_path_check(TOOLS_DICT['rnammer'].full_exe[0]),
           path_assembly, path_to_rnammer)
    name = 'rnammer_' + assembly_name
    out, err = gen_logs(opc.path_logs, name)
    return Task(command=cmd, dependencies=tasks, targets=trgs, name=name, stdout=out, stderr=err)
def trimmomatic_unpaired_task(out_dir, input1, cpu_cap, basename, tasks):
    '''Build the Task that runs trimmomatic in SE mode on one read file.

    The target is ``<out_dir>/<basename>_<input basename>``; an
    ``_orphans_`` path is also passed on the command line but is not
    registered as a target.

    Params :
        out_dir - directory receiving the output files
        input1 - path of the input read file
        cpu_cap - value passed to -threads and recorded as the Task's cpu
        basename - prefix of the output files; also the task name
        tasks - the tasks that this task is dependent on
    '''
    read_file = os.path.basename(input1)
    trimmed = '{0!s}/{1!s}_{2!s}'.format(out_dir, basename, read_file)
    orphaned = '{0!s}/{1!s}_orphans_{2!s}'.format(out_dir, basename, read_file)

    trimmomatic_jar = fg.tool_path_check(TOOLS_DICT['trimmomatic'].full_exe[0])
    # Third configured entry is the file used in the ILLUMINACLIP step.
    clip_file = TOOLS_DICT['trimmomatic'].full_exe[2]
    template = (
        'java -jar {0!s} SE -threads {4!s} {1!s} {2!s} {3!s} ILLUMINACLIP:'
        '{5!s}:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:35')
    command = template.format(trimmomatic_jar, input1, trimmed, orphaned,
                              cpu_cap, clip_file)

    log_out, log_err = fg.GEN_LOGS(basename)
    return Task(command=command,
                dependencies=tasks,
                name=basename,
                stdout=log_out,
                stderr=log_err,
                targets=[trimmed],
                cpu=cpu_cap)
Ejemplo n.º 29
0
def transdecoder_longorfs_task(opc, path_assembly, path_transdecoder_output, cpu_cap, tasks):
    '''Build the Task that runs the first configured transdecoder executable on an assembly.

    The command creates and enters path_transdecoder_output before running
    the executable with ``-t <path_assembly>``. The targets are
    ``longest_orfs.pep``, ``.gff3`` and ``.cds`` inside
    ``<path_transdecoder_output>/<opc.assembly_name>.fasta.transdecoder_dir``.
    The task name, by contrast, uses the assembly basename up to the first
    '.fa'.

    Params :
        opc - object providing assembly_name (target directory) and
              path_logs (handed to gen_logs)
        path_assembly - value passed to -t
        path_transdecoder_output - working directory of the command
        cpu_cap - recorded as the Task's cpu; handed to the format call but
                  not referenced by the command template
        tasks - the tasks that this task is dependent on
    '''
    assembly_stem = os.path.basename(path_assembly).split('.fa')[0]
    orf_dir = os.path.join(path_transdecoder_output,
                           opc.assembly_name + '.fasta.transdecoder_dir')
    targets = [
        '{0!s}/longest_orfs.pep'.format(orf_dir),
        '{0!s}/longest_orfs.gff3'.format(orf_dir),
        '{0!s}/longest_orfs.cds'.format(orf_dir),
    ]
    longorfs_exe = tool_path_check(TOOLS_DICT['transdecoder'].full_exe[0])
    command = "mkdir -p {0!s}; cd {0!s}; {1!s} -t {2!s}".format(
        path_transdecoder_output, longorfs_exe, path_assembly, cpu_cap)
    task_name = 'TransDecoder_LongORFs_' + assembly_stem
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=targets,
                name=task_name,
                stdout=log_out,
                stderr=log_err,
                cpu=cpu_cap)
def bowtie2_task(bowtie2_index, out_dir, fastq1, fastq2, out_name, opt, cpu_cap, tasks):
    '''Build the Task that aligns a read pair with bowtie2 and stores the result as bam.

    The second configured bowtie2 executable is piped into
    ``samtools view -Sb -`` and the result is redirected to
    ``<out_dir>/<out_name>.bam``, the only target.

    Params :
        bowtie2_index - value passed to -x
        out_dir - directory in which the bam target is written
        fastq1 - value passed to -1
        fastq2 - value passed to -2
        out_name - stem of the bam file; also part of the task name
        opt - index selecting the option preset: 0 gives
              '-a -t --end-to-end', 1 gives '-t --local'
        cpu_cap - value passed to --threads and recorded as the Task's cpu
        tasks - the tasks that this task is dependent on
    '''
    presets = ['-a -t --end-to-end', '-t --local']
    bam_path = '{0!s}/{1!s}.bam'.format(out_dir, out_name)
    aligner_exe = fg.tool_path_check(TOOLS_DICT['bowtie2'].full_exe[1])
    preset = presets[opt]
    seed_length = 22  # value given to -L
    template = (
        '{0!s} {1!s} -L {2!s} -N 1 --maxins 800 --threads {3!s} -x {4!s} -1 '
        '{5!s} -2 {6!s} | samtools view -Sb - > {7!s} ')
    command = template.format(aligner_exe, preset, seed_length, cpu_cap,
                              bowtie2_index, fastq1, fastq2, bam_path)
    task_name = 'bowtie2_' + out_name
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[bam_path],
                name=task_name,
                stdout=log_out,
                stderr=log_err,
                cpu=cpu_cap)
def prinseq_unpaired_task(out_dir, input1, basename, opts, tasks):
    '''Build the Task that runs prinseq (through perl) on a single fastq file.

    After prinseq, the command moves ``<out_dir>/<basename>.fastq`` onto the
    target, ``<out_dir>/<basename>_<input basename>``.

    Params :
        out_dir - directory receiving the output file
        input1 - path of the fastq file passed to -fastq
        basename - the basename for the output file; also the task name
        opts - extra text inserted into the prinseq command line
        tasks - the tasks that this task is dependent on
    '''
    final_path = '{0!s}/{1!s}_{2!s}'.format(
        out_dir, basename, os.path.basename(input1))
    prinseq_exe = fg.tool_path_check(TOOLS_DICT['prinseq'].full_exe[0])
    template = (
        'perl {0!s} -fastq {1!s} --out_format 3 --out_good {2!s}/{3!s} --out_bad null '
        '--trim_qual_left 20 --trim_qual_right 20 --trim_qual_type min --min_len 35 '
        '--trim_tail_left 8 --trim_tail_right 8 {4!s} -log; mv {2!s}/{3!s}.fastq {5!s}')
    command = template.format(prinseq_exe, input1, out_dir, basename, opts,
                              final_path)
    log_out, log_err = fg.GEN_LOGS(basename)
    return Task(command=command,
                dependencies=tasks,
                name=basename,
                stdout=log_out,
                stderr=log_err,
                targets=[final_path])
def build_bowtie_task(opc, path_assembly, assembly_name, out_dir, tasks):
    '''Build the Task that runs the first configured bowtie2 executable to index an assembly.

    The index base name is ``<out_dir>/<assembly_name>``; the file
    ``<base>.1.bt2`` is registered as the target.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        path_assembly - path of the assembly given to the executable
        assembly_name - index base name; also part of the task name
        out_dir - directory in which the index is placed
        tasks - the tasks that this task is dependent on
    '''
    first_index_file = '{0!s}/{1!s}.1.bt2'.format(out_dir, assembly_name)
    builder_exe = tool_path_check(TOOLS_DICT['bowtie2'].full_exe[0])
    command = '{0!s} --offrate 1 -f {1!s} {2!s}/{3!s}'.format(
        builder_exe, path_assembly, out_dir, assembly_name)
    task_name = 'build_bowtie_' + assembly_name
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks,
                targets=[first_index_file], name=task_name,
                stdout=log_out, stderr=log_err)
def cegma_task(out_dir, assembly, cpu_cap, tasks):
    '''Build the Task that runs cegma on an assembly.

    The target is ``<out_dir>/<name>.completeness_report`` where ``<name>``
    is the assembly basename up to the first '.fa'. The task name is always
    'cegma'.

    Params :
        out_dir - directory used for the -o prefix and the target
        assembly - value passed to -g
        cpu_cap - number of threads to be used by cegma (-T); also recorded
                  as the Task's cpu
        tasks - a list of tasks that this task is dependent on
    '''
    assembly_stem = os.path.basename(assembly).split('.fa')[0]
    report = '{0!s}/{1!s}.completeness_report'.format(out_dir, assembly_stem)
    cegma_exe = fg.tool_path_check(TOOLS_DICT['cegma'].full_exe[0])
    command = '{0!s} -g {1!s} -v -o {3!s}/{2!s} -T {4!s}'.format(
        cegma_exe, assembly, assembly_stem, out_dir, cpu_cap)
    task_name = 'cegma'
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[report],
                name=task_name,
                cpu=cpu_cap,
                stdout=log_out,
                stderr=log_err)
def diamond_task(blast_type, out_dir, path_query, ref, cpu_cap, tasks):
    '''Build the Task that runs a diamond search followed by ``diamond view``.

    The search writes a daa file under the prefix
    ``<out_dir>/diamond_<ref basename>_<blast_type>``; ``diamond view`` then
    converts ``<prefix>.daa`` into the target,
    ``<out_dir>/<query stem>_<ref basename>.diamond_<blast_type>``.

    Params :
        blast_type - diamond subcommand; valid values: "blastx", "blastp"
        out_dir - output directory, also passed as --tmpdir
        path_query - value passed to --query; the part of its basename
                     before the first dot is the query stem
        ref - value passed to --db
        cpu_cap - value passed to --threads and recorded as the Task's cpu
        tasks - the tasks that this task is dependent on
    '''
    ref_name = os.path.basename(ref)
    query_stem = os.path.basename(path_query).split('.')[0]
    result_path = '{0!s}/{1!s}_{2!s}.diamond_{3!s}'.format(
        out_dir, query_stem, ref_name, blast_type)
    daa_prefix = '{0!s}/diamond_{1!s}_{2!s}'.format(
        out_dir, ref_name, blast_type)
    diamond_exe = fg.tool_path_check(TOOLS_DICT['diamond'].full_exe[0])
    template = (
        '{0!s} {1!s} --db {2!s} --query {3!s} --daa {4!s} --tmpdir {5!s} '
        '--max-target-seqs 20 --sensitive --threads {6!s} --evalue 0.001; {0!s} view '
        '--daa {4!s}.daa --out {7!s};')
    command = template.format(diamond_exe, blast_type, ref, path_query,
                              daa_prefix, out_dir, cpu_cap, result_path)
    task_name = 'diamond_{0!s}_{1!s}_{2!s}'.format(
        blast_type, ref_name, query_stem)
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                cpu=cpu_cap,
                targets=[result_path],
                name=task_name,
                stdout=log_out,
                stderr=log_err)
def trimmomatic_task(out_dir, left, right, cpu_cap, basename, tasks):
    '''Build the Task that runs trimmomatic in PE mode on a read pair.

    The targets are ``<out_dir>/<basename>_1_<left basename>`` and
    ``<out_dir>/<basename>_2_<right basename>``; two further ``_1s_`` /
    ``_2s_`` paths are passed on the command line but not registered as
    targets.

    Params :
        out_dir - directory receiving the output files
        left - path of the first input file
        right - path of the second input file
        cpu_cap - value passed to -threads and recorded as the Task's cpu
        basename - prefix of every output file; also the task name
        tasks - the tasks that this task is dependent on
    '''
    def out_path(pattern, read_path):
        # Fill a '<out_dir>/<basename>..<read file>' pattern for one input.
        return pattern.format(out_dir, basename, os.path.basename(read_path))

    kept = [out_path('{0!s}/{1!s}_1_{2!s}', left),
            out_path('{0!s}/{1!s}_2_{2!s}', right)]
    single = [out_path('{0!s}/{1!s}_1s_{2!s}', left),
              out_path('{0!s}/{1!s}_2s_{2!s}', right)]

    trimmomatic_jar = fg.tool_path_check(TOOLS_DICT['trimmomatic'].full_exe[0])
    # Second configured entry is the file used in the ILLUMINACLIP step.
    clip_file = TOOLS_DICT['trimmomatic'].full_exe[1]
    template = (
        'java -jar {0!s} PE -threads {3!s} {1!s} {2!s} {5!s} {4!s} {7!s} '
        '{6!s} ILLUMINACLIP:{8!s}:2:30:10 LEADING:3 TRAILING:3 '
        'SLIDINGWINDOW:4:15 MINLEN:35')
    command = template.format(trimmomatic_jar, left, right, cpu_cap,
                              single[0], kept[0], single[1], kept[1],
                              clip_file)

    log_out, log_err = fg.GEN_LOGS(basename)
    return Task(command=command,
                dependencies=tasks,
                name=basename,
                stdout=log_out,
                stderr=log_err,
                targets=kept,
                cpu=cpu_cap)
Ejemplo n.º 36
0
def diamond_task(opc, blast_type, out_dir, path_query, ref, cpu_cap, tasks):
    '''Build the Task that runs a diamond search followed by ``diamond view``.

    The search writes a daa file under the prefix
    ``<out_dir>/diamond_<ref basename>_<blast_type>``; ``diamond view`` then
    converts ``<prefix>.daa`` into the target,
    ``<out_dir>/<query stem>_<ref basename>.diamond_<blast_type>``.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        blast_type - diamond subcommand; valid values: "blastx", "blastp"
        out_dir - output directory, also passed as --tmpdir
        path_query - value passed to --query; the part of its basename
                     before the first dot is the query stem
        ref - value passed to --db
        cpu_cap - value passed to --threads and recorded as the Task's cpu
        tasks - the tasks that this task is dependent on
    '''
    ref_name = os.path.basename(ref)
    query_stem = os.path.basename(path_query).split('.')[0]
    result_path = '{0!s}/{1!s}_{2!s}.diamond_{3!s}'.format(
        out_dir, query_stem, ref_name, blast_type)
    daa_prefix = '{0!s}/diamond_{1!s}_{2!s}'.format(
        out_dir, ref_name, blast_type)
    diamond_exe = tool_path_check(TOOLS_DICT['diamond'].full_exe[0])
    template = (
        '{0!s} {1!s} --db {2!s} --query {3!s} --daa {4!s} --tmpdir {5!s} '
        '--max-target-seqs 20 --sensitive --threads {6!s} --evalue 0.001; {0!s} view '
        '--daa {4!s}.daa --out {7!s};')
    command = template.format(diamond_exe, blast_type, ref, path_query,
                              daa_prefix, out_dir, cpu_cap, result_path)
    task_name = 'diamond_{0!s}_{1!s}_{2!s}'.format(
        blast_type, ref_name, query_stem)
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command,
                dependencies=tasks,
                cpu=cpu_cap,
                targets=[result_path],
                name=task_name,
                stdout=log_out,
                stderr=log_err)
def kallisto_task(opc, index, out_dir, out_name, left, right, tasks):
    '''Build the Task that runs ``kallisto quant``.

    No targets are registered and the task name is always 'kallisto'.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        index - value passed to -i
        out_dir - parent of the path given to -o
        out_name - path component appended to out_dir for the -o option
        left - first positional read argument
        right - second positional read argument
        tasks - the tasks that this task is dependent on
    '''
    kallisto_exe = tool_path_check(TOOLS_DICT['kallisto'].full_exe[0])
    command = '{0!s} quant -i {1!s} -o {2!s}/{3!s} {4!s} {5!s}'.format(
        kallisto_exe, index, out_dir, out_name, left, right)
    task_name = 'kallisto'
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    # This task deliberately declares no targets.
    return Task(command=command, dependencies=tasks, targets=[],
                name=task_name, stdout=log_out, stderr=log_err)
def build_kallisto_task(opc, assembly_path, assembly_name, out_dir, tasks):
    '''Build the Task that runs ``kallisto index`` for an assembly.

    The index is written to ``<out_dir>/<assembly_name>_kallisto``. No
    targets are registered and the task name is always 'build_kallisto'.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        assembly_path - path of the assembly given to kallisto
        assembly_name - used to name the index file
        out_dir - directory in which the index path is placed
        tasks - the tasks that this task is dependent on
    '''
    kallisto_exe = tool_path_check(TOOLS_DICT['kallisto'].full_exe[0])
    command = '{0!s} index -i {1!s}/{2!s}_kallisto {3!s}'.format(
        kallisto_exe, out_dir, assembly_name, assembly_path)
    task_name = 'build_kallisto'
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    # This task deliberately declares no targets.
    return Task(command=command, dependencies=tasks, targets=[],
                name=task_name, stdout=log_out, stderr=log_err)
def build_blast_task(path_db, out_dir, dbtype, tasks):
    '''Build the Task that creates a blast database from a gzipped file.

    The command pipes ``gunzip -c <path_db>`` into the configured blast
    executable, which reads stdin (``-in -``). No targets are registered.

    Params :
        path_db - gzipped input; the part of its basename before the first
                  dot is used as the -title value and in the task name
        out_dir - value passed to -out
        dbtype - value passed to -dbtype
        tasks - the tasks that this task is dependent on
    '''
    # NOTE(review): -title does not seem to change the output file names
    # (still xx.gz.psq etc.) - to be checked.
    db_title = os.path.basename(path_db).split('.')[0]
    blast_exe = tool_path_check(TOOLS_DICT['blast'].full_exe[0])
    template = ('gunzip -c {0!s} | {1!s} -in - -dbtype {2!s} '
                '-title {3!s} -out {4!s}')
    command = template.format(path_db, blast_exe, dbtype, db_title, out_dir)
    task_name = 'build_blastplus_db_' + db_title
    log_out, log_err = gen_db_logs(task_name)
    return Task(command=command, dependencies=tasks, targets=[],
                name=task_name, stdout=log_out, stderr=log_err)
def express_task(opc, bowtie2_index, assembly_path, out_dir, out_name,
                 bam_input, tasks):
    '''Build the Task that runs express and keeps only its results.xprs file.

    The command creates ``<out_dir>/<out_name>``, runs express with that as
    its output directory, moves ``results.xprs`` to
    ``<out_dir>/<out_name>.xprs`` (the target) and removes the directory.

    Params :
        opc - object whose path_logs attribute is handed to gen_logs
        bowtie2_index - only its basename is used, as part of the task name
        assembly_path - first positional argument given to express
        out_dir - directory holding the temporary directory and the target
        out_name - name of the temporary directory and stem of the target
        bam_input - second positional argument given to express
        tasks - the tasks that this task is dependent on
    '''
    xprs_path = '{0!s}/{1!s}.xprs'.format(out_dir, out_name)
    express_exe = tool_path_check(TOOLS_DICT['express'].full_exe[0])
    template = (
        'mkdir {1!s}/{2!s}; {0!s} --output-dir {1!s}/{2!s} {3!s} {4!s}; mv '
        '{1!s}/{2!s}/results.xprs {5!s}; rm -rf {1!s}/{2!s};')
    command = template.format(express_exe, out_dir, out_name, assembly_path,
                              bam_input, xprs_path)
    task_name = 'express_' + os.path.basename(bowtie2_index) + '_' + out_name
    log_out, log_err = gen_logs(opc.path_logs, task_name)
    return Task(command=command, dependencies=tasks, targets=[xprs_path],
                name=task_name, stdout=log_out, stderr=log_err)
def busco_task(assembly_path, assembly_name, out_dir, reference_name, cpu_cap, tasks):
    '''Build the Task that runs busco from inside out_dir.

    The script configured under 'busco_plant' is run with a hard-coded
    python interpreter path. The registered target is
    ``<out_dir>/run_busco_<assembly_name>_<reference_name>``.

    Params :
        assembly_path - value passed to -in
        assembly_name - used in the output, target and task names
        out_dir - directory the command changes into before running busco
        reference_name - name of the reference to be used by busco; located
                         under PATH_BUSCO_REFERENCE
        cpu_cap - the cpu limit given to busco (-c) and recorded on the Task
        tasks - a list of tasks that this task is dependent on
    '''
    run_dir = '{0!s}/run_busco_{1!s}_{2!s}'.format(
        out_dir, assembly_name, reference_name)
    busco_script = fg.tool_path_check(TOOLS_DICT['busco_plant'].full_exe[0])
    template = (
        'cd {0!s}; /matta1/biotools/anaconda/envs/py3k/bin/python {1!s} '
        '-o busco_{3!s}_{2!s} -in {4!s} -l {5!s}/{2!s}_buscos/{2!s} -m trans -f -c {6!s}')
    command = template.format(out_dir, busco_script, reference_name,
                              assembly_name, assembly_path,
                              PATH_BUSCO_REFERENCE, cpu_cap)
    task_name = 'busco_' + reference_name + '_' + assembly_name
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=[run_dir],
                name=task_name,
                cpu=cpu_cap,
                stdout=log_out,
                stderr=log_err)
def rnaspades_task(path_assembly, out_dir, left, right, unpaired, cpu_cap, tasks):
    '''Build the Task that runs rnaSPAdes and copies its contigs to path_assembly.

    rnaSPAdes writes to ``<out_dir>/rna_spades_out_dir``; afterwards the
    command copies ``contigs.fasta`` from there to path_assembly, the only
    target. Only the first element of each read list is used.

    Params :
        path_assembly - destination of the copied contigs.fasta
        out_dir - parent directory of the rnaSPAdes output directory
        left - list of 1/left read files; left[0] is passed with -1
        right - list of 2/right read files; right[0] is passed with -2
                whenever left is non-empty
        unpaired - list of unpaired read files; unpaired[0] is passed with -s
        cpu_cap - value passed to --threads and recorded as the Task's cpu
        tasks - the tasks that this task is dependent on
    '''
    spades_out = '{0!s}/rna_spades_out_dir'.format(out_dir)
    targets = [path_assembly]

    read_opts = []
    if left != []:
        read_opts.extend(['-1 ' + left[0], '-2 ' + right[0]])
    if unpaired != []:
        read_opts.append('-s ' + unpaired[0])

    spades_exe = fg.tool_path_check(TOOLS_DICT['rnaspades'].full_exe[0])
    template = ('{0!s} {1!s} --threads {2!s} -o {3!s}; '
                'cp {3!s}/contigs.fasta {4!s};')
    command = template.format(spades_exe, ' '.join(read_opts), cpu_cap,
                              spades_out, targets[0])
    task_name = 'rnaSPAdes_assembly'
    log_out, log_err = fg.GEN_LOGS(task_name)
    return Task(command=command,
                dependencies=tasks,
                targets=targets,
                name=task_name,
                stdout=log_out,
                stderr=log_err,
                cpu=cpu_cap)
def intersect_bed_task(opc, out_dir, bam_file, bed_reference, output_name,
                       tasks):
    ''' Builds the Task that intersects a bam file with a bed reference.
        Params :
            opc - options object; only opc.path_logs is read here
            out_dir - directory the resulting .bed file is written to
            bam_file - bam input, passed through -abam
            bed_reference - bed input, passed through -b
            output_name - name (without extension) of the resulting bed file
            tasks - a list of tasks that this task is dependent on
        Returns :
            a Task whose only target is <out_dir>/<output_name>.bed
    '''
    bed_out = '{0!s}/{1!s}.bed'.format(out_dir, output_name)
    bedtools_exe = tool_path_check(TOOLS_DICT['bedtools'].full_exe[0])
    # NOTE(review): no 'intersect' subcommand is passed, so the configured
    # executable is presumably the intersect tool itself - confirm.
    cmd = '{0!s} -abam {1!s} -b {2!s} -wb -bed > {3!s}'.format(
        bedtools_exe, bam_file, bed_reference, bed_out)
    name = '_'.join(
        ('intersect_bed', os.path.basename(bed_reference), output_name))
    log_out, log_err = gen_logs(opc.path_logs, name)
    return Task(command=cmd, dependencies=tasks, targets=[bed_out],
                name=name, stdout=log_out, stderr=log_err)
def build_salmon_task(opc, path_assembly, assembly_name, out_dir, cpu_cap,
                      tasks):
    ''' Builds the Task that creates a salmon index for an assembly.
        Params :
            opc - options object; only opc.path_logs is read here
            path_assembly - fasta file passed through --transcripts
            assembly_name - prefix of the index name and suffix of the task name
            out_dir - directory the <assembly_name>_salmon index is created in
            cpu_cap - thread count passed through --threads and recorded on the Task
            tasks - a list of tasks that this task is dependent on
        Returns :
            a Task whose only target is <out_dir>/<assembly_name>_salmon
    '''
    index_path = '{0!s}/{1!s}_salmon'.format(out_dir, assembly_name)
    salmon_exe = tool_path_check(TOOLS_DICT['salmon'].full_exe[0])
    template = ('{0!s} index --transcripts {1!s} --index {2!s}/{3!s}_salmon '
                '--threads {4!s} --type quasi')
    cmd = template.format(salmon_exe, path_assembly, out_dir, assembly_name,
                          cpu_cap)
    name = 'build_salmon_' + assembly_name
    log_out, log_err = gen_logs(opc.path_logs, name)
    return Task(command=cmd, dependencies=tasks, targets=[index_path],
                name=name, stdout=log_out, stderr=log_err, cpu=cpu_cap)
def build_diamond_task(path_db_fasta, out_path, tasks):
    ''' Builds the Task that creates a diamond database from a fasta file.
        Params :
            path_db_fasta - fasta file passed through --in
            out_path - database path passed through --db; '.dmnd' is appended
                       to it to form the target
            tasks - a list of tasks that this task is dependent on
        Returns :
            a Task whose only target is <out_path>.dmnd
    '''
    dmnd_file = str(out_path + '.dmnd')
    diamond_exe = tool_path_check(TOOLS_DICT['diamond'].full_exe[0])
    cmd = '{0!s} makedb --in {1!s} --db {2!s}'.format(
        diamond_exe, path_db_fasta, out_path)
    name = 'build_diamond_' + os.path.basename(out_path)
    # Database builds take their logs from gen_db_logs rather than gen_logs.
    log_out, log_err = gen_db_logs(name)
    return Task(command=cmd, dependencies=tasks, targets=[dmnd_file],
                name=name, stdout=log_out, stderr=log_err)
def blast_task(blast_type, out_dir, path_query, path_db, cpu_cap, tasks):
    ''' Builds the Task that blasts a query file against a database.
        Params :
            blast_type - 'blastp' selects full_exe[2] of the blast tool entry;
                         every other value (including 'blastx') selects full_exe[1]
            out_dir - directory the tabular result is written to
            path_query - query file; its basename up to the first '.' names the output
            path_db - blast database; its basename up to the first '.' names the output
            cpu_cap - thread count passed through -num_threads and recorded on the Task
            tasks - a list of tasks that this task is dependent on
        Returns :
            a Task whose only target is <out_dir>/<query>_<db>.<blast_type>
    '''
    exe_index = 2 if blast_type == 'blastp' else 1
    query_label = os.path.basename(path_query).split('.')[0]
    db_label = os.path.basename(path_db).split('.')[0]
    result_file = "{0!s}/{1!s}_{2!s}.{3!s}".format(out_dir, query_label, db_label, blast_type)
    blast_exe = fg.tool_path_check(TOOLS_DICT['blast'].full_exe[exe_index])
    template = ('{0!s} -query {1!s} -db {2!s} -num_threads {3!s} -max_target_seqs 1 '
                '-outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart '
                'send evalue bitscore stitle slen" -evalue 0.0001 > {4!s}')
    cmd = template.format(blast_exe, path_query, path_db, cpu_cap, result_file)
    name = '{0!s}_{1!s}_{2!s}'.format(query_label, blast_type, db_label)
    log_out, log_err = fg.GEN_LOGS(name)
    return Task(command=cmd,
                dependencies=tasks,
                targets=[result_file],
                name=name,
                cpu=cpu_cap,
                stdout=log_out,
                stderr=log_err)
Ejemplo n.º 47
0
def blast_task(opc, blast_type, out_dir, path_query, path_db, cpu_cap, tasks):
    ''' Builds the Task that blasts a query file against a database.
        Params :
            opc - options object; only opc.path_logs is read here
            blast_type - 'blastp' selects full_exe[2] of the blast tool entry;
                         every other value (including 'blastx') selects full_exe[1]
            out_dir - directory the tabular result is written to
            path_query - query file; its basename up to the first '.' names the output
            path_db - blast database; its basename up to the first '.' names the output
            cpu_cap - thread count passed through -num_threads and recorded on the Task
            tasks - a list of tasks that this task is dependent on
        Returns :
            a Task whose only target is <out_dir>/<query>_<db>.<blast_type>
    '''
    exe_index = 2 if blast_type == 'blastp' else 1
    query_label = os.path.basename(path_query).split('.')[0]
    db_label = os.path.basename(path_db).split('.')[0]
    result_file = "{0!s}/{1!s}_{2!s}.{3!s}".format(out_dir, query_label, db_label, blast_type)
    blast_exe = tool_path_check(TOOLS_DICT['blast'].full_exe[exe_index])
    template = ('{0!s} -query {1!s} -db {2!s} -num_threads {3!s} -max_target_seqs 1 '
                '-outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart '
                'send evalue bitscore stitle slen" -evalue 0.0001 > {4!s}')
    cmd = template.format(blast_exe, path_query, path_db, cpu_cap, result_file)
    name = '{0!s}_{1!s}_{2!s}'.format(query_label, blast_type, db_label)
    log_out, log_err = gen_logs(opc.path_logs, name)
    return Task(command=cmd,
                dependencies=tasks,
                targets=[result_file],
                name=name,
                cpu=cpu_cap,
                stdout=log_out,
                stderr=log_err)
def bowtie2_unpaired_task(opc, bowtie2_index, out_dir, fastq, out_name, opt,
                          cpu_cap, tasks):
    ''' Builds the Task that aligns unpaired reads with bowtie2 and pipes the
        alignments through samtools into a bam file.
        Params :
            opc - options object; only opc.path_logs is read here
            bowtie2_index - index passed through -x; its basename is part of the task name
            out_dir - directory the bam file is written to
            fastq - unpaired reads passed through -U
            out_name - name (without extension) of the bam file
            opt - index (0, 1 or 2) into the list of alignment presets below
            cpu_cap - thread count passed through --threads and recorded on the Task
            tasks - a list of tasks that this task is dependent on
        Returns :
            a Task whose only target is <out_dir>/<out_name>.bam
    '''
    presets = ['-a -t --end-to-end', '-t --local', '-k 200 --end-to-end']
    seed_length = 22  # value handed to bowtie2 through -L
    bam_out = '{0!s}/{1!s}.bam'.format(out_dir, out_name)
    bowtie2_exe = tool_path_check(TOOLS_DICT['bowtie2'].full_exe[1])
    template = ('{0!s} {1!s} -L {2!s} -N 1 --threads {3!s} -x {4!s} -U '
                '{5!s} | samtools view -Sb - > {6!s} ')
    cmd = template.format(bowtie2_exe, presets[opt], seed_length, cpu_cap,
                          bowtie2_index, fastq, bam_out)
    name = '_'.join(('bowtie2', os.path.basename(bowtie2_index), out_name))
    log_out, log_err = gen_logs(opc.path_logs, name)
    return Task(command=cmd, dependencies=tasks, targets=[bam_out],
                name=name, stdout=log_out, stderr=log_err, cpu=cpu_cap)
def pfam_build_task(source, out_root_path, tasks):
    ''' Builds the Task that runs the hmmer executable stored at full_exe[1]
        with -f on a pfam source file.
        Params :
            source - file handed to the executable; its basename names the
                     target and the task
            out_root_path - accepted but not used by this function
            tasks - a list of tasks that this task is dependent on
        Returns :
            a Task whose only target is <statics.PATH_PFAM_DIR>/<basename(source)>.h3f
    '''
    source_name = os.path.basename(source)
    # The target lives under PATH_PFAM_DIR while the command itself is run
    # from PATH_DATABASES.
    h3f_file = os.path.join(statics.PATH_PFAM_DIR, source_name) + '.h3f'
    work_dir = statics.PATH_DATABASES
    hmmpress_exe = tool_path_check(TOOLS_DICT['hmmer'].full_exe[1])
    cmd = 'cd {0!s} ; {1!s} -f {2!s};'.format(work_dir, hmmpress_exe, source)
    name = 'hmmpress_' + os.path.basename(source)
    log_out, log_err = gen_db_logs(name)
    return Task(command=cmd, dependencies=tasks, targets=[h3f_file],
                name=name, stdout=log_out, stderr=log_err)
def rcorrector_task(opc, out_dir, left, right, cpu_cap, basename, tasks):
    ''' Builds the Task that runs Rcorrector on one pair of fastq files.
        Params :
            opc - options object; only opc.path_logs is read here
            out_dir - directory the corrected reads are expected in
            left - the 1/left fastq file, passed through -1
            right - the 2/right fastq file, passed through -2
            cpu_cap - thread count passed through -t and recorded on the Task
            basename - suffix of the task name
            tasks - a list of tasks that this task is dependent on
        Returns :
            a Task with two targets, <out_dir>/<basename(left)>.cor.fq and
            <out_dir>/<basename(right)>.cor.fq
    '''
    # Both mates share the '.cor.fq' suffix; the right-hand target was
    # previously misspelled '.corr.fq' and so disagreed with the left-hand one.
    trgs = [
        '{0!s}/{1!s}.cor.fq'.format(out_dir, os.path.basename(mate))
        for mate in (left, right)
    ]
    # NOTE(review): the command passes no output-directory option, so whether
    # the corrected files actually end up in out_dir depends on the working
    # directory the Task is run from - confirm against the caller.
    cmd = 'perl {0!s} -1 {1!s} -2 {2!s} -t {3!s}'.format(
        tool_path_check(TOOLS_DICT['rcorrector'].full_exe[0]), left, right,
        cpu_cap)
    name = 'Rcorrector_' + basename
    out, err = gen_logs(opc.path_logs, name)
    return Task(command=cmd,
                dependencies=tasks,
                name=name,
                stdout=out,
                stderr=err,
                targets=trgs,
                cpu=cpu_cap)
def trimmomatic_unpaired_task(opc, out_dir, input1, cpu_cap, basename, tasks):
    ''' Builds the Task that trims one unpaired fastq file with trimmomatic (SE mode).
        Params :
            opc - options object; only opc.path_logs is read here
            out_dir - directory the trimmed file is written to
            input1 - the fastq file to trim
            cpu_cap - thread count passed through -threads and recorded on the Task
            basename - prefix of the output file name; also used as the task name
            tasks - a list of tasks that this task is dependent on
        Returns :
            a Task whose only target is <out_dir>/<basename>_<basename(input1)>
    '''
    trimmed = '{0!s}/{1!s}_{2!s}'.format(out_dir, basename,
                                         os.path.basename(input1))
    jar_path = tool_path_check(TOOLS_DICT['trimmomatic'].full_exe[0])
    # full_exe[2] is substituted after ILLUMINACLIP: (the single-end adapters).
    adapters = TOOLS_DICT['trimmomatic'].full_exe[2]
    template = ('java -jar {0!s} SE -threads {3!s} {1!s} {2!s} ILLUMINACLIP:'
                '{4!s}:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:35')
    cmd = template.format(jar_path, input1, trimmed, cpu_cap, adapters)
    log_out, log_err = gen_logs(opc.path_logs, basename)
    return Task(command=cmd, dependencies=tasks, name=basename,
                stdout=log_out, stderr=log_err, targets=[trimmed],
                cpu=cpu_cap)
def prinseq_unpaired_task(opc, out_dir, input1, basename, opts, tasks):
    ''' Builds the Task that filters one unpaired fastq file with prinseq and
        then moves the result onto the target name.
        Params :
            opc - options object; only opc.path_logs is read here
            out_dir - directory prinseq writes to
            input1 - the fastq file to filter
            basename - prefix of the output file name; also used as the task name
            opts - extra text inserted into the prinseq command line
            tasks - a list of tasks that this task is dependent on
        Returns :
            a Task whose only target is <out_dir>/<basename>_<basename(input1)>
    '''
    final_path = '{0!s}/{1!s}_{2!s}'.format(out_dir, basename,
                                            os.path.basename(input1))
    prinseq_exe = tool_path_check(TOOLS_DICT['prinseq'].full_exe[0])
    # prinseq writes <out_dir>/<basename>.fastq; the trailing mv renames it.
    template = (
        'perl {0!s} -fastq {1!s} --out_format 3 --out_good {2!s}/{3!s} --out_bad null '
        '--trim_qual_left 20 --trim_qual_right 20 --trim_qual_type min --min_len 35 '
        '--trim_tail_left 8 --trim_tail_right 8 {4!s} -log; mv {2!s}/{3!s}.fastq {5!s}'
    )
    cmd = template.format(prinseq_exe, input1, out_dir, basename, opts,
                          final_path)
    log_out, log_err = gen_logs(opc.path_logs, basename)
    return Task(command=cmd, dependencies=tasks, name=basename,
                stdout=log_out, stderr=log_err, targets=[final_path])
def transrate_task(reads_dir, assembly_path, assembly_name, lefts, rights, singles, out_dir, transrate_dir, cpu_cap, tasks, reference=''):
    ''' Builds the Task that runs transrate on an assembly.
        Params :
            reads_dir - forwarded to transrate_dep_generator
            assembly_path - assembly passed through --assembly
            assembly_name - used in the target paths and the task name
            lefts - list of 1/left read files
            rights - list of 2/right read files
            singles - list of unpaired read files; appended to lefts for --left
            out_dir - accepted but not used by this function
            transrate_dir - transrate output directory (--output)
            cpu_cap - thread count passed through --threads and recorded on the Task
            tasks - forwarded to transrate_dep_generator
            reference - optional reference passed through --reference
        Returns :
            a Task whose dependencies list holds the single value returned by
            transrate_dep_generator
    '''
    trgs = ['{0!s}/assemblies.csv'.format(transrate_dir),
            '{0!s}/{1!s}/good.{1!s}.fasta'.format(transrate_dir, assembly_name),
            '{0!s}/{1!s}/{1!s}.fasta_quant.sf'.format(transrate_dir, assembly_name)]
    left_csv = ','.join(lefts + singles)
    right_csv = ','.join(rights)
    left_arg = '--left ' + left_csv if len(left_csv) > 0 else ''
    right_arg = '--right ' + right_csv if len(right_csv) > 0 else ''
    reference_arg = '--reference ' + reference if reference != '' else ''
    transrate_exe = fg.tool_path_check(TOOLS_DICT['transrate'].full_exe[0])
    cmd = '{0!s} --assembly {1!s} {2!s} {3!s} --threads {4!s} {5!s} --output {6!s}'.format(
        transrate_exe, assembly_path, left_arg, right_arg, cpu_cap,
        reference_arg, transrate_dir)
    name = 'transrate_' + assembly_name
    log_out, log_err = fg.GEN_LOGS(name)
    # The Task is created without dependencies because the dependency
    # generator needs the Task object itself.
    pending = Task(command=cmd,
                   dependencies=[],
                   targets=trgs,
                   name=name,
                   cpu=cpu_cap,
                   stdout=log_out,
                   stderr=log_err,
                   max_wall_time=720)
    # The generator receives the untouched input lists and the already
    # prefixed reference argument.
    pending.dependencies = [
        transrate_dep_generator(reads_dir, pending, lefts, rights, singles,
                                reference_arg, assembly_path, cpu_cap,
                                transrate_dir, tasks)
    ]
    return pending
def transdecoder_predict_orfs_task(path_assembly, path_transdecoder_output, tasks, pfam_input='', blastp_input=''):
    ''' Builds the Task that runs the transdecoder executable stored at
        full_exe[1] to predict ORFs from a fasta file.
        Params :
            path_assembly - fasta file passed through -t
            path_transdecoder_output - directory that is created and entered
                                       before the tool is run
            tasks - a list of tasks that this task is dependent on
            pfam_input - optional file passed through --retain_pfam_hits
            blastp_input - optional file passed through --retain_blastp_hits
        Returns :
            a Task with the <assembly>.fasta.transdecoder.pep and
            <assembly>.fasta.transdecoder.bed files as targets
    '''
    keep_pfam = len(pfam_input) > 0
    pfam_opt = '--retain_pfam_hits ' + pfam_input if keep_pfam else ''
    pfam_tag = '_retain_pfam' if keep_pfam else ''
    keep_blastp = len(blastp_input) > 0
    blastp_opt = '--retain_blastp_hits ' + blastp_input if keep_blastp else ''
    blastp_tag = '_retain_blastp' if keep_blastp else ''
    # Everything from the first '.fa' onwards is dropped from the file name.
    assembly_name = os.path.basename(path_assembly).split('.fa')[0]
    trgs = ['{0!s}/{1!s}.fasta.transdecoder.pep'.format(path_transdecoder_output, assembly_name),
            '{0!s}/{1!s}.fasta.transdecoder.bed'.format(path_transdecoder_output, assembly_name)]
    predict_exe = fg.tool_path_check(TOOLS_DICT['transdecoder'].full_exe[1])
    cmd = "mkdir -p {0!s}; cd {0!s}; {1!s} -t {2!s} {3!s} {4!s}".format(
        path_transdecoder_output, predict_exe, path_assembly, pfam_opt, blastp_opt)
    name = ''.join(('TransDecoder_Predict', assembly_name, pfam_tag, blastp_tag))
    log_out, log_err = fg.GEN_LOGS(name)
    return Task(command=cmd,
                dependencies=tasks,
                targets=trgs,
                name=name,
                stdout=log_out,
                stderr=log_err)
def transrate_task(opc, reads_dir, assembly_path, assembly_name, lefts,
                   rights, out_dir, transrate_dir, cpu_cap, tasks,
                   reference=''):
    ''' Builds the Task that runs transrate on an assembly.
        Params :
            opc - options object; only opc.path_logs is read here
            reads_dir - forwarded to transrate_dep_generator
            assembly_path - assembly passed through --assembly
            assembly_name - used in the target paths and the task name
            lefts - list of 1/left read files
            rights - list of 2/right read files
            out_dir - accepted but not used by this function
            transrate_dir - transrate output directory (--output)
            cpu_cap - thread count passed through --threads and recorded on the Task
            tasks - forwarded to transrate_dep_generator
            reference - optional reference passed through --reference
        Returns :
            a Task whose dependencies list holds the single value returned by
            transrate_dep_generator
    '''
    trgs = [
        '{0!s}/assemblies.csv'.format(transrate_dir),
        '{0!s}/{1!s}/good.{1!s}.fasta'.format(transrate_dir, assembly_name),
        '{0!s}/{1!s}/{1!s}.fasta_quant.sf'.format(transrate_dir, assembly_name)
    ]
    left_csv = ','.join(lefts)
    right_csv = ','.join(rights)
    left_arg = '--left ' + left_csv if len(left_csv) > 0 else ''
    right_arg = '--right ' + right_csv if len(right_csv) > 0 else ''
    reference_arg = '--reference ' + reference if reference != '' else ''
    transrate_exe = tool_path_check(TOOLS_DICT['transrate'].full_exe[0])
    cmd = '{0!s} --assembly {1!s} {2!s} {3!s} --threads {4!s} {5!s} --output {6!s}'.format(
        transrate_exe, assembly_path, left_arg, right_arg, cpu_cap,
        reference_arg, transrate_dir)
    name = 'transrate_' + assembly_name
    log_out, log_err = gen_logs(opc.path_logs, name)
    # The Task is created without dependencies because the dependency
    # generator needs the Task object itself.
    pending = Task(command=cmd, dependencies=[], targets=trgs, name=name,
                   cpu=cpu_cap, stdout=log_out, stderr=log_err,
                   max_wall_time=7200000)
    # The generator receives the untouched input lists and the already
    # prefixed reference argument.
    pending.dependencies = [
        transrate_dep_generator(reads_dir, pending, lefts, rights,
                                reference_arg, assembly_path, cpu_cap,
                                transrate_dir, tasks)
    ]
    return pending
def prinseq_task(out_dir, input_1, input_2, basename, opts, tasks):
    ''' Builds the Task that filters a pair of fastq files with prinseq and
        then moves both results onto the target names.
        Params :
            out_dir - directory prinseq writes to
            input_1 - path of the 1/left fastq file
            input_2 - path of the 2/right fastq file
            basename - prefix of all output files; also used as the task name
            opts - extra text inserted into the prinseq command line
            tasks - the tasks that this task is dependent on
        Returns :
            a Task with <out_dir>/<basename>_1_<basename(input_1)> and
            <out_dir>/<basename>_2_<basename(input_2)> as targets
    '''
    final_1 = '{0!s}/{1!s}_1_{2!s}'.format(out_dir, basename, os.path.basename(input_1))
    final_2 = '{0!s}/{1!s}_2_{2!s}'.format(out_dir, basename, os.path.basename(input_2))
    # Files prinseq itself produces; they are renamed by the two mv commands.
    raw_1, raw_2 = ['{0!s}/{1!s}_{2!s}.fastq'.format(out_dir, basename, mate)
                    for mate in (1, 2)]
    prinseq_exe = fg.tool_path_check(TOOLS_DICT['prinseq'].full_exe[0])
    template = ('perl {0!s} -fastq {1!s} -fastq2 {2!s} --out_format 3 --out_good {3!s}/{4!s} '
                '--out_bad null --trim_qual_left 20 --trim_qual_right 20 --trim_qual_type min '
                '--min_len 55 --trim_tail_left 8 --trim_tail_right 8 {5!s} -log; mv {6!s} {7!s};'
                ' mv {8!s} {9!s};')
    cmd = template.format(prinseq_exe, input_1, input_2, out_dir, basename,
                          opts, raw_1, final_1, raw_2, final_2)
    log_out, log_err = fg.GEN_LOGS(basename)
    return Task(command=cmd,
                dependencies=tasks,
                name=basename,
                stdout=log_out,
                stderr=log_err,
                targets=[final_1, final_2])
def cegma_task(opc, out_dir, assembly, cpu_cap, tasks):
    ''' Builds the Task that runs cegma on an assembly.
        Params :
            opc - options object; only opc.path_logs is read here
            out_dir - directory the cegma output prefix is placed in
            assembly - assembly file passed through -g
            cpu_cap - number of threads to be used by cegma (-T)
            tasks - a list of tasks that this task is dependent on
        Returns :
            a Task whose only target is <out_dir>/<assembly_name>.completeness_report
    '''
    # Everything from the first '.fa' onwards is dropped from the file name.
    label = os.path.basename(assembly).split('.fa')[0]
    report = '{0!s}/{1!s}.completeness_report'.format(out_dir, label)
    cegma_exe = tool_path_check(TOOLS_DICT['cegma'].full_exe[0])
    cmd = '{0!s} -g {1!s} -v -o {3!s}/{2!s} -T {4!s}'.format(
        cegma_exe, assembly, label, out_dir, cpu_cap)
    name = 'cegma'
    log_out, log_err = gen_logs(opc.path_logs, name)
    return Task(command=cmd, dependencies=tasks, targets=[report], name=name,
                cpu=cpu_cap, stdout=log_out, stderr=log_err)
 def ret():
     ''' Readiness check built as a closure; every name it reads
         (other_dependencies, reads_dir, lefts, rights, assembly_path,
         cpu_cap, reference, transrate_dir, transrate_task) comes from an
         enclosing scope that is not part of this function.
         Returns False while any task in other_dependencies is unfinished
         or raises Task.ExitCodeException, otherwise True. Before returning
         True it tries to point transrate_task.command at the files found
         in reads_dir; if not every input file can be matched it only warns
         and leaves the command as it was.
     '''
     # Not ready until every other dependency reports finished.
     for t in other_dependencies:
         try:
             if (not t.finished()):
                 return False
         except Task.ExitCodeException:
             return False
     assembly_files = sorted(os.listdir(reads_dir))
     assembly_files = [os.path.join(reads_dir, f) for f in assembly_files]
     # For each input file, collect the reads_dir entries whose path contains
     # the input's basename, then keep the first (sorted) match per input.
     new_lefts = [[g for g in assembly_files if (os.path.basename(f) in g)]
                  for f in lefts]
     new_lefts = [k[0] for k in new_lefts if (len(k) > 0)]
     new_rights = [[
         g for g in assembly_files if (os.path.basename(f) in g)
     ] for f in rights]
     new_rights = [k[0] for k in new_rights if (len(k) > 0)]
     # new_singles = [[g for g in assembly_files if(os.path.basename(f) in g)] for f in singles]
     # new_singles = [k[0] for k in new_singles if(len(k) > 0)]
     # Only rewrite the command when every left and right input was matched.
     if (len(new_lefts) == len(lefts) and len(new_rights)
             == len(rights)):  # and len(new_singles) == len(singles)
         # and len(new_lefts)+len(new_singles) != 0):
         new_lefts = ','.join(new_lefts)  # +new_singles)
         new_rights = ','.join(new_rights)
         new_lefts = '--left ' + new_lefts if (len(new_lefts) > 0) else ''
         new_rights = '--right ' + new_rights if (
             len(new_rights) > 0) else ''
         cmd = '{0!s} --assembly {1!s} {2!s} {3!s} --threads {4!s} {5!s} --output {6!s}'.format(
             #cmd = '{0!s} --assembly {1!s} --threads {4!s} {5!s} --output {6!s}'.format( #no reads
             tool_path_check(TOOLS_DICT['transrate'].full_exe[0]),
             assembly_path,
             new_lefts,
             new_rights,
             cpu_cap,
             reference,
             transrate_dir)
         # NOTE(review): transrate_task is presumably the Task object handed
         # to the enclosing function, not the module-level builder - confirm.
         transrate_task.command = cmd
     else:
         warnings.warn(
             'Unable to match input files with trimmed output. Continuing transrate using input files instead.'
         )
     return True
def trinity_task(path_assembly, out_dir, fastq, fastq2, unpaired, cpu_cap_trin, cpu_cap_bfly, mem_trin, mem_bfly, normalize_flag, tasks):
    ''' Builds the Task that runs trinity and copies Trinity.fasta to path_assembly.
        Params :
            path_assembly - where Trinity.fasta is copied to; the only target
            out_dir - parent of the trinity output directory
            fastq - list of 1/left fastq files
            fastq2 - list of 2/right fastq files
            unpaired - list of unpaired fastq files; appended to --left when
                       paired reads are given, passed as --single otherwise
            cpu_cap_trin - thread count passed through --CPU
            cpu_cap_bfly - only used when computing the cpu value of the Task
            mem_trin - memory in G passed through --max_memory
            mem_bfly - handed to the formatter but not referenced by the command
            normalize_flag - when truthy, --normalize_reads is added
            tasks - a list of tasks that this task is dependent on
    '''
    normalize_opt = '--normalize_reads' if normalize_flag else ''
    if fastq != []:
        read_args = ('--left ' + ','.join(fastq + unpaired)
                     + ' --right ' + ','.join(fastq2))
    elif unpaired != []:
        read_args = '--single ' + ','.join(unpaired)
    else:
        read_args = ''
    trinity_exe = fg.tool_path_check(TOOLS_DICT['trinity'].full_exe[0])
    # Slot {4} directly after --bflyCalculateCPU receives the normalize option;
    # slot {5} (mem_bfly) does not occur in the template.
    template = ('{0!s} --seqType fq {1!s} --CPU {2!s} --max_memory {3!s}G --bflyCalculateCPU {4!s} '
                '--output {6!s}/trinity; cp {6!s}/trinity/Trinity.fasta {7!s};')
    cmd = template.format(trinity_exe, read_args, cpu_cap_trin, mem_trin,
                          normalize_opt, mem_bfly, out_dir, path_assembly)
    name = 'trinity_assembly'
    log_out, log_err = fg.GEN_LOGS(name)
    return Task(command=cmd,
                dependencies=tasks,
                targets=[path_assembly],
                name=name,
                cpu=max(cpu_cap_trin, cpu_cap_bfly),
                stdout=log_out,
                stderr=log_err)
def pfam_build_task(source, tasks, log_flag=True):
    ''' Builds the Task that runs the hmmer executable stored at full_exe[1]
        with -f on a pfam source file.
        Params :
            source - file handed to the executable
            tasks - a list of tasks that this task is dependent on
            log_flag - when falsy, the Task gets None for stdout and stderr
                       instead of the values from fg.GEN_LOGS
        Returns :
            a Task whose only target is PATH_PFAM_DATABASE + '.h3f'
    '''
    h3f_file = PATH_PFAM_DATABASE + '.h3f'
    hmmpress_exe = fg.tool_path_check(TOOLS_DICT['hmmer'].full_exe[1])
    cmd = 'cd {0!s} ; {1!s} -f {2!s};'.format(fg.PATH_DATABASES, hmmpress_exe, source)
    name = 'hmmpress'
    if log_flag:
        log_out, log_err = fg.GEN_LOGS(name)
    else:
        log_out, log_err = None, None
    return Task(command=cmd,
                dependencies=tasks,
                targets=[h3f_file],
                name=name,
                stdout=log_out,
                stderr=log_err)