def run_trinity(assemblydir):
    """Hand one Trinity 2.2.0 assembly job per entry of ``assemblydir`` to the cluster.

    Every name returned by ``os.listdir(assemblydir)`` is treated as a sample
    whose reads live at ``<assemblydir><name>/<name>.left.fq`` and
    ``<name>.right.fq``. Paths are built by plain concatenation, so
    ``assemblydir`` must already end with a path separator.
    """
    for sample in os.listdir(assemblydir):
        print(sample)
        sample_dir = assemblydir + sample + "/"
        print(os.listdir(sample_dir))
        # NOTE(review): the second guard tests for ``trinity_out`` but moves
        # ``trinity_out_dir``; reproduced verbatim -- confirm the intended name.
        script = """
set -x
# stops execution if there is an error
set -e
if [ -f {d}trinity_out/Trinity.fasta ]; then exit 0 ; fi
if [ -d {d}trinity_out ]; then mv {d}trinity_out_dir {d}trinity_out_dir0 || true ; fi
Trinity --left {d}{s}.left.fq \\
--right {d}{s}.right.fq \\
--output {d}trinity_out --seqType fq --monitoring --bflyCalculateCPU --max_memory 150G \\
--CPU 32
""".format(d=sample_dir, s=sample)
        print(script)
        clusterfunc.qsub_file(sample_dir, "trinity", ["trinity/2.2.0"], sample, [script])
def run_move_files(trimdir, sra):
    """Submit a "move" job made of the orphan command and the two move commands.

    The command strings are produced by ``make_orphans`` and ``move_files`` and
    are passed, in that order, to ``clusterfunc.qsub_file`` with no modules.
    """
    orphan_cmd = make_orphans(trimdir, sra)
    first_move, second_move = move_files(trimdir, sra)
    clusterfunc.qsub_file(trimdir, "move", "", sra, [orphan_cmd, first_move, second_move])
def rename_files(trinitydir, diginormdir, mmetsp):
    """Submit a "rename" job that builds the Trinity input reads for ``mmetsp``.

    The job concatenates ``<diginormdir>*.1`` into ``<mmetsp>.left.fq`` and
    ``<diginormdir>*.2`` into ``<mmetsp>.right.fq`` inside ``trinitydir``, then
    appends the decompressed ``orphans.keep.abundfilt.fq.gz`` to the left file.

    Args:
        trinitydir: directory receiving the ``.left.fq`` / ``.right.fq`` files.
        diginormdir: directory prefix (concatenated as-is) holding the inputs.
        mmetsp: sample identifier used for the output names and the job name.
    """
    # Build each target path once so the orphan append goes to exactly the file
    # the first ``cat`` creates. A doubled "/" is harmless in a POSIX path,
    # whereas a missing one silently points at a different file.
    left_fq = trinitydir + "/" + mmetsp + ".left.fq"
    right_fq = trinitydir + "/" + mmetsp + ".right.fq"
    commands = [
        "cat " + diginormdir + "*.1 > " + left_fq,
        "cat " + diginormdir + "*.2 > " + right_fq,
        "gunzip -c " + diginormdir + "orphans.keep.abundfilt.fq.gz >> " + left_fq,
    ]
    clusterfunc.qsub_file(diginormdir, "rename", ["GNU/4.8.3", "khmer/2.0"], mmetsp, commands)
def send_to_cluster(newdir, command_list, sra, names):
    """Submit ``command_list`` as a single job through ``clusterfunc.qsub_file``.

    ``names`` is passed as the process name and ``sra`` as the file name; no
    modules are requested. The commands are copied into a new list first.
    """
    clusterfunc.qsub_file(newdir, names, "", sra, list(command_list))
def run_trimmomatic_TruSeq(missing, trimmed, remaining, trimdir, file1, file2, sra):
    """Submit a Trimmomatic paired-end job for ``sra`` unless it already succeeded.

    Success is detected by finding the text
    "TrimmomaticPE: Completed successfully" in
    ``<trimdir>qsub_files/trim.<sra>.log``.

    Args:
        missing: list; ``trimdir`` is appended when a matching log entry exists
            but does not report success (the job is submitted again).
        trimmed: list; ``sra`` is appended when the log reports success.
        remaining: list; ``trimdir`` is appended when no matching log exists
            (the job is submitted).
        trimdir: directory prefix, concatenated as-is.
        file1: first paired read file given to Trimmomatic.
        file2: second paired read file given to Trimmomatic.
        sra: sample identifier used for output, log and job names.

    Returns:
        The same ``(missing, trimmed, remaining)`` lists, mutated in place.
    """
    qsub_dir = trimdir + "qsub_files/"
    clusterfunc.check_dir(qsub_dir)
    log_name = "trim." + sra + ".log"
    trim_file = qsub_dir + log_name
    # Substring match, so names such as "trim.<sra>.log.old" also count.
    matching = [s for s in os.listdir(qsub_dir) if log_name in s]
    matching_string = "TrimmomaticPE: Completed successfully"

    # Start from "no log lines": a substring match without the exact log file
    # must be treated as an unfinished run instead of raising NameError.
    content = []
    if os.path.isfile(trim_file):
        with open(trim_file) as f:
            content = f.readlines()

    if matching:
        if any(matching_string in line for line in content):
            print("Already trimmed: %s" % (matching,))
            trimmed.append(sra)
            return missing, trimmed, remaining
        missing.append(trimdir)
    else:
        remaining.append(trimdir)

    # Both the "log without success" and the "no log" cases submit the same job.
    j = """
java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \\
-baseout {}.trim.fq \\
{} {} \\
ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \\
SLIDINGWINDOW:4:2 \\
LEADING:2 \\
TRAILING:2 \\
MINLEN:25 &> trim.{}.log
""".format(sra, file1, file2, sra)
    orphan_string = make_orphans(trimdir, sra)
    clusterfunc.qsub_file(trimdir, "trim", "", sra, [j, orphan_string])
    return missing, trimmed, remaining
def send_to_cluster(newdir, command_list, sra, names):
    """Forward a copy of ``command_list`` to ``clusterfunc.qsub_file``.

    The job uses ``names`` as its process name, ``sra`` as its file name and an
    empty module specification.
    """
    queued = [command for command in command_list]
    clusterfunc.qsub_file(newdir, names, "", sra, queued)
def rename_files(trinitydir, diginormdir, mmetsp):
    """Submit a "rename" job that builds the Trinity input reads for ``mmetsp``.

    Three shell commands are queued: ``cat <diginormdir>*.1`` into the left
    read file, ``cat <diginormdir>*.2`` into the right read file, and
    ``gunzip -c`` of ``orphans.keep.abundfilt.fq.gz`` appended to the left file.

    Args:
        trinitydir: directory receiving ``<mmetsp>.left.fq`` / ``.right.fq``.
        diginormdir: directory prefix (concatenated as-is) holding the inputs.
        mmetsp: sample identifier used for the output names and the job name.
    """
    # The append must use the same path as the ``cat`` that creates the file;
    # without the "/" separator it lands in a different file whenever
    # ``trinitydir`` lacks a trailing slash (a doubled slash is harmless).
    out_prefix = trinitydir + "/" + mmetsp
    rename_string1 = "cat " + diginormdir + "*.1 > " + out_prefix + ".left.fq"
    rename_string2 = "cat " + diginormdir + "*.2 > " + out_prefix + ".right.fq"
    rename_string3 = (
        "gunzip -c " + diginormdir + "orphans.keep.abundfilt.fq.gz >> "
        + out_prefix + ".left.fq"
    )
    commands = [rename_string1, rename_string2, rename_string3]
    clusterfunc.qsub_file(diginormdir, "rename", ["GNU/4.8.3", "khmer/2.0"], mmetsp, commands)
def run_trimmomatic_TruSeq(missing, trimmed, remaining, trimdir, file1, file2, sra):
    """Queue Trimmomatic (paired-end) for ``sra`` unless its log reports success.

    The log inspected is ``<trimdir>qsub_files/trim.<sra>.log`` and the success
    marker is the text "TrimmomaticPE: Completed successfully".

    Args:
        missing: list; gets ``trimdir`` when a matching log entry exists without
            the success marker (the job is resubmitted).
        trimmed: list; gets ``sra`` when the success marker is found.
        remaining: list; gets ``trimdir`` when no matching log entry exists
            (the job is submitted).
        trimdir: directory prefix, concatenated as-is.
        file1: first paired read file given to Trimmomatic.
        file2: second paired read file given to Trimmomatic.
        sra: sample identifier used for output, log and job names.

    Returns:
        The same ``(missing, trimmed, remaining)`` lists, mutated in place.
    """
    logs_dir = trimdir + "qsub_files/"
    clusterfunc.check_dir(logs_dir)
    listoffile = os.listdir(logs_dir)
    wanted = "trim." + sra + ".log"
    trim_file = logs_dir + wanted
    matching = [s for s in listoffile if wanted in s]
    matching_string = "TrimmomaticPE: Completed successfully"

    # An empty default keeps the success test well defined when ``matching``
    # is non-empty only because of a substring hit on some other file name.
    content = []
    if os.path.isfile(trim_file):
        with open(trim_file) as f:
            content = f.readlines()

    finished = bool(matching) and any(matching_string in m for m in content)
    if finished:
        print("Already trimmed: %s" % (matching,))
        trimmed.append(sra)
        return missing, trimmed, remaining

    # Not finished: record why, then submit the one shared job definition.
    if matching:
        missing.append(trimdir)
    else:
        remaining.append(trimdir)
    j = """
java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \\
-baseout {}.trim.fq \\
{} {} \\
ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \\
SLIDINGWINDOW:4:2 \\
LEADING:2 \\
TRAILING:2 \\
MINLEN:25 &> trim.{}.log
""".format(sra, file1, file2, sra)
    orphan_string = make_orphans(trimdir, sra)
    clusterfunc.qsub_file(trimdir, "trim", "", sra, [j, orphan_string])
    return missing, trimmed, remaining
def run_busco(busco_dir, trinity_fasta, sample):
    """Submit a BUSCO transcriptome job for ``trinity_fasta`` using the metazoa set.

    The command passes ``--cpu 16`` and names its output ``<sample>.metazoa``.
    """
    # The backslash-newline inside this non-raw literal is consumed by Python,
    # so the two source lines form a single shell line.
    template = """
busco -m trans -in {fasta} \
--cpu 16 -l /mnt/research/ged/lisa/busco/metazoa -o {name}.metazoa
"""
    busco_command = template.format(fasta=trinity_fasta, name=sample)
    print(busco_command)
    clusterfunc.qsub_file(busco_dir, "busco", "", sample, [busco_command])
def split_paired_reads(trinitydir, diginormdir, mmetsp):
    """Submit one "split" job that runs split-paired-reads.py on every ``.pe`` file.

    Files are looked up in ``<diginormdir>qsub_files/`` and the split output is
    directed to ``diginormdir`` via ``-d``.

    Args:
        trinitydir: unused; kept for interface compatibility.
        diginormdir: directory prefix (concatenated as-is).
        mmetsp: sample identifier used as the job file name.

    Returns:
        None. When no ``.pe`` file exists nothing is submitted.
    """
    diginorm_files_dir = diginormdir + "qsub_files/"
    # Collect a command for every paired file so that none is dropped and the
    # job is well defined even when the directory holds no ``.pe`` file.
    commands = [
        "split-paired-reads.py -d " + diginormdir + " " + diginorm_files_dir + digi_filename
        for digi_filename in sorted(os.listdir(diginorm_files_dir))
        if digi_filename.endswith(".pe")
    ]
    if not commands:
        print("no .pe files to split in " + diginorm_files_dir)
        return
    clusterfunc.qsub_file(diginormdir, "split", ["GNU/4.8.3", "khmer/2.0"], mmetsp, commands)
def run_busco(busco_dir, trinity_fasta, sample, sra):
    """Submit a BUSCO transcriptome job for ``trinity_fasta`` using the eukaryota set.

    The command passes ``--cpu 30`` and names its output ``<sample>.euk``; the
    job file itself is named after ``sra``.
    """
    # The backslash-newline inside this non-raw literal is consumed by Python,
    # so the two source lines form a single shell line.
    template = """
busco -m trans -in {fasta} \
--cpu 30 -l /mnt/research/ged/lisa/busco/eukaryota -o {name}.euk
"""
    busco_command = template.format(fasta=trinity_fasta, name=sample)
    print(busco_command)
    clusterfunc.qsub_file(busco_dir, "busco", "", sra, [busco_command])
def run_rap_clust(salmondir, rapclustdir, sra):
    """Submit a RapClust job that reads ``<rapclustdir><sra>_config.yaml``.

    ``get_quant_file`` and ``get_config_file`` are called first, in that order.
    """
    quant_file = get_quant_file(salmondir, sra)
    # The return value is not used below; the call is kept because it
    # presumably produces the YAML config -- TODO confirm against its definition.
    get_config_file(quant_file, rapclustdir, sra)
    rapclust_string = "RapClust --config " + str(rapclustdir + sra + "_config.yaml")
    print(rapclust_string)
    clusterfunc.qsub_file(rapclustdir, "rapclust", "", sra, [rapclust_string])
def run_filter_abund(diginormdir, sra):
    """Submit a "filtabund" job: filter-abund.py followed by the extract-paired step.

    filter-abund.py is given ``<diginormdir>norm.C20k20.ct`` and every ``*.keep``
    file under ``<diginormdir>qsub_files/``. The second command string comes
    from ``extract_paired()``.
    """
    filter_string = """
filter-abund.py -V -Z 18 {graph_dir}norm.C20k20.ct {keep_dir}*.keep
""".format(graph_dir=diginormdir, keep_dir=diginormdir + "qsub_files/")
    commands = [filter_string, extract_paired()]
    clusterfunc.qsub_file(diginormdir, "filtabund", ["GNU/4.8.3", "khmer/2.0"], sra, commands)
def run_rap_clust(salmondir, rapclustdir, sra):
    """Queue ``RapClust --config <rapclustdir><sra>_config.yaml`` as a cluster job.

    Before building the command, ``get_quant_file`` and then ``get_config_file``
    are invoked with the salmon and RapClust directories.
    """
    config_filename = rapclustdir + sra + "_config.yaml"
    # Result intentionally discarded, as in the code this replaces; the call
    # itself is preserved -- NOTE(review): confirm it writes ``config_filename``.
    get_config_file(get_quant_file(salmondir, sra), rapclustdir, sra)
    rapclust_string = "RapClust --config " + str(config_filename)
    print(rapclust_string)
    clusterfunc.qsub_file(rapclustdir, "rapclust", "", sra, [rapclust_string])
def rename_files(trinitydir,diginormdir,diginormfile,SRA):
    """Submit a "rename" job that splits ``diginormfile`` and builds Trinity inputs.

    Commands, in order: the value returned by ``combine_orphans(diginormdir)``,
    split-paired-reads.py on ``diginormfile``, ``cat`` of ``*.1`` and ``*.2``
    into ``<trinitydir><SRA>.left.fq`` / ``.right.fq``, and ``gunzip -c`` of the
    orphan archive appended to the left file.
    """
    rename_orphans = combine_orphans(diginormdir)
    left_fq = trinitydir + SRA + ".left.fq"
    right_fq = trinitydir + SRA + ".right.fq"
    commands = [
        rename_orphans,
        "split-paired-reads.py -d " + diginormdir + " " + diginormfile,
        "cat " + diginormdir + "*.1 > " + left_fq,
        "cat " + diginormdir + "*.2 > " + right_fq,
        "gunzip -c " + diginormdir + "orphans.keep.abundfilt.fq.gz >> " + left_fq,
    ]
    clusterfunc.qsub_file(diginormdir, "rename", ["GNU/4.8.3", "khmer/2.0"], SRA, commands)
def split_paired_reads(trinitydir, diginormdir, mmetsp):
    """Queue split-paired-reads.py for all ``.pe`` files of one sample.

    The ``.pe`` files are taken from ``<diginormdir>qsub_files/``; each command
    uses ``-d <diginormdir>`` as its output directory. All commands go into a
    single "split" job named after ``mmetsp``.

    Args:
        trinitydir: unused; kept for interface compatibility.
        diginormdir: directory prefix (concatenated as-is).
        mmetsp: sample identifier used as the job file name.

    Returns:
        None. Nothing is submitted when there is no ``.pe`` file.
    """
    diginorm_files_dir = diginormdir + "qsub_files/"
    commands = []
    for digi_filename in sorted(os.listdir(diginorm_files_dir)):
        if not digi_filename.endswith(".pe"):
            continue
        commands.append(
            "split-paired-reads.py -d " + diginormdir + " "
            + diginorm_files_dir + digi_filename
        )
    # Without this guard an empty directory would reach the submit call with
    # no command defined.
    if not commands:
        print("no .pe files to split in " + diginorm_files_dir)
        return
    clusterfunc.qsub_file(diginormdir, "split", ["GNU/4.8.3", "khmer/2.0"], mmetsp, commands)
def run_normalize_by_median(diginormdir, mmetsp):
    """Submit a "diginorm" job running normalize-by-median.py on ``<diginormdir>*.fq``.

    The count graph is saved to ``<diginormdir>norm.C20k20.ct``.
    """
    script = """
normalize-by-median.py -p -k 20 -C 20 -M 4e9 \\
--savegraph {d}norm.C20k20.ct \\
{d}*.fq
""".format(d=diginormdir)
    clusterfunc.qsub_file(diginormdir, "diginorm", ["GNU/4.8.3", "khmer/2.0"], mmetsp, [script])
def run_diginorm(diginormdir, interleavedir, trimdir, sra):
    """Submit a "diginorm" job running normalize-by-median.py for ``sra``.

    Inputs are ``<interleavedir>*.fq`` plus ``<trimdir>orphans.fq.gz`` (``-u``);
    the count graph is saved to ``<diginormdir>norm.C20k20.ct``.
    """
    script = """
normalize-by-median.py -p -k 20 -C 20 -M 4e9 \\
--savegraph {graph}norm.C20k20.ct \\
-u {orphans}orphans.fq.gz \\
{reads}*.fq
""".format(graph=diginormdir, orphans=trimdir, reads=interleavedir)
    clusterfunc.qsub_file(diginormdir, "diginorm", ["GNU/4.8.3", "khmer/2.0"], sra, [script])
def interleave_reads(trimdir, sra, interleavedir):
    """Submit an "interleave" job for ``sra`` unless its output already exists.

    The output is ``<interleavedir><sra>.trimmed.interleaved.fq``; the inputs
    are ``<trimdir><sra>.trim_1P.fq`` and ``<trimdir><sra>.trim_2P.fq``.
    """
    interleavefile = interleavedir + sra + ".trimmed.interleaved.fq"
    if os.path.isfile(interleavefile):
        print("already interleaved")
        return
    reads_prefix = trimdir + sra
    interleave_string = (
        "interleave-reads.py " + reads_prefix + ".trim_1P.fq "
        + reads_prefix + ".trim_2P.fq > " + interleavefile
    )
    print(interleave_string)
    clusterfunc.qsub_file(
        interleavedir, "interleave", ["GNU/4.8.3", "khmer/2.0"], sra, [interleave_string]
    )
def interleave_reads(trimdir, sra, interleavedir):
    """Queue interleave-reads.py for one sample, skipping work already done.

    Nothing is submitted when ``<interleavedir><sra>.trimmed.interleaved.fq``
    is already a file; otherwise the ``.trim_1P.fq`` / ``.trim_2P.fq`` pair
    under ``trimdir`` is interleaved into that file.
    """
    target = interleavedir + sra + ".trimmed.interleaved.fq"
    if os.path.isfile(target):
        print("already interleaved")
        return
    interleave_string = "interleave-reads.py {0}{1}.trim_1P.fq {0}{1}.trim_2P.fq > {2}".format(
        trimdir, sra, target
    )
    print(interleave_string)
    clusterfunc.qsub_file(
        interleavedir, "interleave", ["GNU/4.8.3", "khmer/2.0"], sra, [interleave_string]
    )
def combine_orphans(diginormdir,mmetsp):
    """Submit an "orphans" job that gzips every ``.se`` file into one archive.

    The shell loop reads ``<diginormdir>qsub_files/*.se`` and appends each
    compressed file to ``<diginormdir>orphans.keep.abundfilt.fq.gz``, which is
    touched first.
    """
    # ``${{file}}`` is brace-escaped so that str.format emits ``${file}``.
    script = """
touch {out}orphans.keep.abundfilt.fq.gz
for file in {src}*.se
do
gzip -9c ${{file}} >> {out}orphans.keep.abundfilt.fq.gz
done
""".format(out=diginormdir, src=diginormdir + "qsub_files/")
    clusterfunc.qsub_file(diginormdir, "orphans", ["GNU/4.8.3", "khmer/2.0"], mmetsp, [script])
def combine_orphans(diginormdir, mmetsp):
    """Queue a shell loop that concatenates gzipped ``.se`` files for ``mmetsp``.

    Source files: ``<diginormdir>qsub_files/*.se``. Destination (touched, then
    appended to): ``<diginormdir>orphans.keep.abundfilt.fq.gz``.
    """
    se_dir = diginormdir + "qsub_files/"
    # Doubled braces survive str.format as the literal shell ``${file}``.
    template = """
touch {0}orphans.keep.abundfilt.fq.gz
for file in {1}*.se
do
gzip -9c ${{file}} >> {0}orphans.keep.abundfilt.fq.gz
done
"""
    commands = [template.format(diginormdir, se_dir)]
    clusterfunc.qsub_file(diginormdir, "orphans", ["GNU/4.8.3", "khmer/2.0"], mmetsp, commands)
def run_diginorm(mmetsp_dir,mmetsp):
    """Submit a "diginorm" job running normalize-by-median.py inside ``mmetsp_dir``.

    Inputs are ``<mmetsp_dir>*.interleaved.fq`` plus ``<mmetsp_dir>orphans.fq.gz``
    (``-u``); the count graph is saved to ``<mmetsp_dir>norm.C20k20.ct``.
    """
    script = """
normalize-by-median.py -p -k 20 -C 20 -M 4e9 \\
--savegraph {d}norm.C20k20.ct \\
-u {d}orphans.fq.gz \\
{d}*.interleaved.fq
""".format(d=mmetsp_dir)
    clusterfunc.qsub_file(mmetsp_dir, "diginorm", ["GNU/4.8.3", "khmer/2.0"], mmetsp, [script])
def get_trinity(trinitydir, left, right, SRA):
    """Submit a Trinity job (module ``trinity/20140413p1``) for one read pair.

    The script exits early when ``<trinitydir>trinity_out/Trinity.fasta``
    already exists; otherwise Trinity writes to ``<trinitydir>trinity_out``.
    """
    # The "#if" line is a shell comment but still carries format fields, so it
    # is kept so the generated script text stays the same.
    script = """
set -x
# stops execution if there is an error
set -e
if [ -f {d}trinity_out/Trinity.fasta ]; then exit 0 ; fi
#if [ -d {d}trinity_out ]; then mv {d}trinity_out_dir {d}trinity_out_dir0 || true ; fi
Trinity --left {left} \\
--right {right} --output {d}trinity_out --seqType fq --JM 20G --CPU 16
""".format(d=trinitydir, left=left, right=right)
    clusterfunc.qsub_file(trinitydir, "trinity", ["trinity/20140413p1"], SRA, [script])
def interleave_reads(fastq_list,mmetsp_dir, mmetsp, diginormdir):
    """Submit one "interleave" job per consecutive pair in ``fastq_list``.

    Elements ``i`` and ``i + 1`` (for even ``i``) are taken as the left and
    right read files under ``mmetsp_dir``. The output goes to
    ``<diginormdir><stem>.interleaved.fq`` where ``stem`` is the left file name
    up to its first dot; ``stem`` is also the job file name.
    """
    for i in range(0, len(fastq_list), 2):
        print(i)
        left = fastq_list[i]
        print(left)
        right = fastq_list[i + 1]
        print(right)
        stem = left.split(".")[0]
        interleave_string = "interleave-reads.py {0}{1} {0}{2} > {3}{4}.interleaved.fq".format(
            mmetsp_dir, left, right, diginormdir, stem
        )
        print(interleave_string)
        clusterfunc.qsub_file(
            diginormdir, "interleave", ["GNU/4.8.3", "khmer/2.0"], stem, [interleave_string]
        )
def interleave_reads(fastq_list, mmetsp_dir, mmetsp, diginormdir):
    """Queue interleave-reads.py for each (left, right) pair in ``fastq_list``.

    The list is walked two entries at a time. Inputs are read from
    ``mmetsp_dir``; the interleaved output is written under ``diginormdir`` and
    named after the part of the left file name before its first dot.
    """
    position = 0
    while position < len(fastq_list):
        print(position)
        left = fastq_list[position]
        print(left)
        right = fastq_list[position + 1]
        print(right)
        job_name = left.split(".")[0]
        interleave_file = diginormdir + job_name + ".interleaved.fq"
        interleave_string = " ".join(
            ["interleave-reads.py", mmetsp_dir + left, mmetsp_dir + right, ">", interleave_file]
        )
        print(interleave_string)
        clusterfunc.qsub_file(
            diginormdir, "interleave", ["GNU/4.8.3", "khmer/2.0"], job_name, [interleave_string]
        )
        position += 2
def rename_files(trinitydir, diginormdir, diginormfile, SRA):
    """Queue the split-and-rename step that prepares Trinity inputs for ``SRA``.

    ``combine_orphans(diginormdir)`` is called first and its return value
    becomes the first command. It is followed by split-paired-reads.py on
    ``diginormfile``, two ``cat`` commands producing the ``.left.fq`` and
    ``.right.fq`` files under ``trinitydir``, and a ``gunzip -c`` that appends
    the orphan archive to the left file.
    """
    commands = [combine_orphans(diginormdir)]
    commands.append("split-paired-reads.py -d " + diginormdir + " " + diginormfile)
    for suffix, side in ((".1", ".left.fq"), (".2", ".right.fq")):
        commands.append("cat " + diginormdir + "*" + suffix + " > " + trinitydir + SRA + side)
    commands.append(
        "gunzip -c " + diginormdir + "orphans.keep.abundfilt.fq.gz >> "
        + trinitydir + SRA + ".left.fq"
    )
    clusterfunc.qsub_file(diginormdir, "rename", ["GNU/4.8.3", "khmer/2.0"], SRA, commands)
def run_trinity(trinitydir, left, right, SRA):
    """Queue a Trinity assembly of ``left`` / ``right`` into ``<trinitydir>trinity_out``.

    The generated script returns immediately when
    ``<trinitydir>trinity_out/Trinity.fasta`` is present. The job loads the
    ``trinity/20140413p1`` module and is named after ``SRA``.
    """
    # Indexed fields replace the repeated positional arguments; the commented
    # "#if" shell line is retained so the script text is unchanged.
    template = """
set -x
# stops execution if there is an error
set -e
if [ -f {0}trinity_out/Trinity.fasta ]; then exit 0 ; fi
#if [ -d {0}trinity_out ]; then mv {0}trinity_out_dir {0}trinity_out_dir0 || true ; fi
Trinity --left {1} \\
--right {2} --output {0}trinity_out --seqType fq --JM 20G --CPU 16
"""
    commands = [template.format(trinitydir, left, right)]
    clusterfunc.qsub_file(trinitydir, "trinity", ["trinity/20140413p1"], SRA, commands)
def run_trinity(trinitydir, left, right, mmetsp, output_dir, file_extension):
    """Submit a Trinity 2.2.0 job that assembles in ``/tmp`` and copies the result out.

    The script runs Trinity with ``--output /tmp/<mmetsp><file_extension>``,
    copies ``/tmp/<mmetsp>*.fasta`` to ``output_dir`` and finally removes
    ``/tmp/<mmetsp>*``.

    Args:
        trinitydir: directory handed to ``clusterfunc.qsub_file`` for the job.
        left: left read file.
        right: right read file.
        mmetsp: sample identifier; prefixes every ``/tmp`` path and names the job.
        output_dir: destination of the copied FASTA file(s).
        file_extension: suffix appended to ``mmetsp`` for the Trinity output.

    Raises:
        ValueError: if ``mmetsp`` is empty or blank. The cleanup line would
            otherwise expand to ``rm -rf /tmp/*``.
    """
    if not str(mmetsp).strip():
        raise ValueError("mmetsp must be a non-empty sample name")
    trinity_command = """
set -x
# stops execution if there is an error
set -e
Trinity --left {left} \\
--right {right} --output /tmp/{name}{ext} --full_cleanup --seqType fq --max_memory 20G --CPU 16
cp /tmp/{name}*.fasta {dest}
rm -rf /tmp/{name}*
""".format(left=left, right=right, name=mmetsp, ext=file_extension, dest=output_dir)
    clusterfunc.qsub_file(trinitydir, "trinity_2.2.0", ["trinity/2.2.0"], mmetsp, [trinity_command])
def quant_salmon(salmondir, sra, newdir, trinity_fasta):
    """Submit a salmon index + quant job for the trimmed read pair of ``sra``.

    The reads are expected at ``<newdir>trim/<sra>.trim_1P.fq`` and
    ``.trim_2P.fq``; their presence is only reported, never enforced. The index
    name and the index command come from ``salmon_index``; quantification
    output goes to ``<salmondir><sra>.quant``.

    Args:
        salmondir: directory prefix for the salmon output and the job.
        sra: sample identifier.
        newdir: directory prefix containing the ``trim/`` folder.
        trinity_fasta: assembly passed through to ``salmon_index``.
    """
    file1 = newdir + "trim/" + sra + ".trim_1P.fq"
    file2 = newdir + "trim/" + sra + ".trim_2P.fq"
    # Report both mates the same way, so a missing second mate is as visible
    # as a missing first one.
    for reads in (file1, file2):
        if os.path.isfile(reads):
            print("file exists: %s" % reads)
        else:
            print("missing: %s" % reads)
    index, salmon_index_string = salmon_index(salmondir, sra, trinity_fasta)
    salmon_string = (
        "salmon quant -i " + index + " --libType IU -1 " + file1
        + " -2 " + file2 + " -o " + salmondir + sra + ".quant --dumpEq --auxDir aux"
    )
    clusterfunc.qsub_file(salmondir, "salmon", "", sra, [salmon_index_string, salmon_string])
def transrate(trinitydir, transrate_dir, transrate_out, trinity_fasta, sample, trimdir, sra):
    """Submit a transrate job scoring ``<trinitydir><sample>.Trinity.fixed.fa``.

    The trimmed pair ``<trimdir><sra>.trim_1P.fq`` / ``.trim_2P.fq`` is given as
    ``--left`` / ``--right`` and results go to ``transrate_out``.
    ``trinity_fasta`` is accepted but not used by the command.
    """
    # Each backslash-newline in this non-raw literal is consumed by Python, so
    # the command is emitted as a single shell line.
    template = """
transrate --assembly={asm}{sample}.Trinity.fixed.fa --threads=27 \
--left={trim}{sra}.trim_1P.fq \
--right={trim}{sra}.trim_2P.fq \
--output={out}
"""
    transrate_command = template.format(
        asm=trinitydir, sample=sample, trim=trimdir, sra=sra, out=transrate_out
    )
    print(transrate_command)
    clusterfunc.qsub_file(transrate_dir, "transrate", "", sra, [transrate_command])
def run_trinity(trinitydir,left,right,mmetsp):
    """Submit a Trinity 2.2.0 job that assembles in ``/tmp`` and copies the FASTA out.

    The script exits early when ``<trinitydir>trinity_out_2.2.0.Trinity.fasta``
    exists. Otherwise Trinity writes to ``/tmp/<mmetsp>.trinity_out_2.2.0``, the
    resulting FASTA is copied to the hard-coded
    ``/mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/`` directory and the
    ``/tmp`` files are removed.
    """
    script = """
set -x
# stops execution if there is an error
set -e
if [ -f {d}trinity_out_2.2.0.Trinity.fasta ]; then exit 0 ; fi
Trinity --left {left} \\
--right {right} --output /tmp/{name}.trinity_out_2.2.0 --full_cleanup --seqType fq --max_memory 20G --CPU 16
cp /tmp/{name}.trinity_out_2.2.0.Trinity.fasta /mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/
rm -rf /tmp/{name}.trinity_out_2.2.0*
""".format(d=trinitydir, left=left, right=right, name=mmetsp)
    clusterfunc.qsub_file(trinitydir, "trinity_2.2.0", ["trinity/2.2.0"], mmetsp, [script])
def fastqc_report(fastq_file_list, newdir, fastqcdir, filename):
    """Submit a FastQC job for ``fastq_file_list`` unless reports already exist.

    Reports count as existing when any path matches
    ``<fastqcdir><filename>_*_fastqc.zip``. Otherwise a single ``fastqc -o``
    command covering every listed file is queued. ``newdir`` is not used.
    """
    print(fastq_file_list)
    print(fastqcdir + filename)
    if glob.glob(fastqcdir + filename + "_*_fastqc.zip"):
        print("fastqc already complete: %s" % filename)
        return
    # One fastqc invocation handles all files, space separated.
    fastqc_string = "fastqc -o " + fastqcdir + " " + " ".join(fastq_file_list)
    print("fastqc reports being generated for: " + str(fastq_file_list))
    clusterfunc.qsub_file(fastqcdir, "fastqc", "", filename, [fastqc_string])
def get_sourmash_command(SRA,trinitydir):
    """Submit a sourmash compute job for ``SRA`` when its left read file has data.

    ``<trinitydir><SRA>.left.fq`` must exist and be non-empty. A missing file
    is skipped silently and an empty file is reported. The command itself runs
    on ``/mnt/scratch/ljcohen/mmetsp_tmp/<SRA>.left.fq.head``.
    """
    filename = SRA + ".left.fq"
    full_filename = trinitydir + filename
    if not os.path.isfile(full_filename):
        return
    if os.stat(full_filename).st_size == 0:
        print("File is empty: %s" % filename)
        return
    # NOTE(review): nothing in this function creates the ".head" file the
    # command reads -- confirm it is produced elsewhere.
    sourmash_command = """
sourmash compute --protein -k 18,21 -f /mnt/scratch/ljcohen/mmetsp_tmp/{}.head
""".format(filename)
    clusterfunc.qsub_file(
        "/mnt/scratch/ljcohen/mmetsp_tmp/", "sourmash", [""], SRA, [sourmash_command]
    )
def interleave_reads(mmetsp_dir, mmetsp):
    """Submit one job that interleaves every ``*.trim_1P.fq`` pair in ``mmetsp_dir``.

    The shell loop derives the ``_2P`` mate name from each ``_1P`` file, pipes
    interleave-reads.py through gzip and writes ``<base>.interleaved.fq`` (the
    base name with ``_1P`` removed) in the same directory.
    """
    # Doubled braces are str.format escapes for the shell's ``${...}``.
    template = """
cd {}
for filename in *.trim_1P.fq
do
base=$(basename $filename .fq)
echo $base
base2=${{base/_1P/_2P}}
echo $base2
output=${{base/_1P/}}.interleaved.fq
#echo $output
(interleave-reads.py ${{base}}.fq ${{base2}}.fq | gzip > $output)
done
"""
    interleave_string = template.format(mmetsp_dir)
    print(interleave_string)
    clusterfunc.qsub_file(
        mmetsp_dir, "interleave", ["GNU/4.8.3", "khmer/2.0"], mmetsp, [interleave_string]
    )
def run_streaming_diginorm(trimdir, SRA, diginormdir):
    """Submit a streaming digital-normalization pipeline for ``SRA``.

    The script pipes interleave-reads.py (plus ``zcat`` of the orphan reads)
    into trim-low-abund.py and then extract-paired-reads.py, producing
    ``<diginormdir><SRA>.paired.gz`` and ``<diginormdir><SRA>.single.gz``.
    """
    # from Jessica's streaming protocol:
    # Script path from the earlier write-to-file approach; not used below.
    diginormfile = diginormdir + SRA + ".stream.diginorm.sh"
    stream_string = """#!/bin/bash
(interleave-reads.py {t}{s}.trim_1P.fq {t}{s}.trim_2P.fq && zcat {t}orphans.fq.gz)| \\
(trim-low-abund.py -V -k 20 -Z 18 -C 2 - -o - -M 4e9 --diginorm --diginorm-coverage=20) | \\
(extract-paired-reads.py --gzip -p {d}{s}.paired.gz -s {d}{s}.single.gz) > /dev/null
""".format(t=trimdir, s=SRA, d=diginormdir)
    print(stream_string)
    clusterfunc.qsub_file(diginormdir, "diginorm_stream", [], SRA, [stream_string])
def run_streaming_diginorm(trimdir, SRA, diginormdir):
    """Queue the streaming diginorm shell pipeline for one sample.

    Stages: interleave the trimmed pair under ``trimdir`` and append the
    orphan reads, run trim-low-abund.py with ``--diginorm``, then split the
    stream with extract-paired-reads.py into ``.paired.gz`` and ``.single.gz``
    files under ``diginormdir``.
    """
    # from Jessica's streaming protocol:
    # Computed as before but unused: the script is queued, not written here.
    diginormfile = diginormdir + SRA + ".stream.diginorm.sh"
    template = """#!/bin/bash
(interleave-reads.py {0}{1}.trim_1P.fq {0}{1}.trim_2P.fq && zcat {0}orphans.fq.gz)| \\
(trim-low-abund.py -V -k 20 -Z 18 -C 2 - -o - -M 4e9 --diginorm --diginorm-coverage=20) | \\
(extract-paired-reads.py --gzip -p {2}{1}.paired.gz -s {2}{1}.single.gz) > /dev/null
"""
    stream_string = template.format(trimdir, SRA, diginormdir)
    print(stream_string)
    streaming_diginorm_command = [stream_string]
    clusterfunc.qsub_file(diginormdir, "diginorm_stream", [], SRA, streaming_diginorm_command)
def get_assemblies(assemblydir):
    """Submit a transrate job for every sample whose Trinity assembly exists.

    For each entry of ``assemblydir`` the left/right read files are checked
    and reported. When ``<sample>/trinity_out/Trinity.fasta`` exists it is
    passed through ``fix_fasta`` and the command returned by ``transrate`` is
    queued; otherwise the sample is reported as not completed.
    """
    for genus_species in os.listdir(assemblydir):
        sample_dir = assemblydir + genus_species + "/"
        left = sample_dir + genus_species + ".left.fq"
        right = sample_dir + genus_species + ".right.fq"
        for reads in (left, right):
            if os.path.isfile(reads):
                print(reads)
            else:
                print("there's a problem: %s" % reads)
        trinity_out_dir = sample_dir + "trinity_out/"
        trinity_fasta = trinity_out_dir + "Trinity.fasta"
        if not os.path.isfile(trinity_fasta):
            print("Assembly not completed: %s" % genus_species)
            continue
        fixed_trinity_fasta = fix_fasta(trinity_fasta, trinity_out_dir, genus_species)
        transrate_command = transrate(fixed_trinity_fasta, genus_species, left, right)
        clusterfunc.qsub_file(
            trinity_out_dir, "transrate", ["BLAST+/2.2.31"], genus_species, [transrate_command]
        )
def send_to_cluster(basedir,commands,name):
    """Submit ``commands`` as a job whose process name is fixed to "delete".

    ``name`` is used as the job file name and no modules are requested.
    """
    clusterfunc.qsub_file(basedir, "delete", "", name, commands)
#6. Create symbolic link from data files to working directory def sym_link(newdir): listoffiles=os.listdir(newdir) for i in listoffiles: if i.endswith(".subset100k.fastq"): symlink_string="ln -fs "+newdir+i+" /mnt/mmetsp/"+i print symlink_string ======= print "fastqc reports being generated for: "+str(fastq_file_list) fastqc_command=[fastqc_string] process_name="fastqc" module_name_list="" filename=filename clusterfunc.qsub_file(fastqcdir,process_name,module_name_list,filename,fastqc_command) >>>>>>> .merge_file_eM2D9z # this is the main function to execute def execute(basedir,url_data): for item in url_data.keys(): #Creates directory for each file to be downloaded #Directory will be located according to organism and read type (single or paired) organism=item[0] seqtype=item[1] org_seq_dir=basedir+organism+"/" print org_seq_dir clusterfunc.check_dir(org_seq_dir) url_list=url_data[item] for url in url_list:
def send_to_cluster(basedir, commands, name):
    """Hand ``commands`` to ``clusterfunc.qsub_file`` under the "delete" process name.

    The job file is named after ``name``; the module specification is empty.
    """
    job_settings = ("delete", "", name)
    clusterfunc.qsub_file(basedir, job_settings[0], job_settings[1], job_settings[2], commands)
print "file exists:",file2
# NOTE(review): the statement above is the tail of a definition that starts
# outside this view, and everything down to ">>>>>>> .merge_file_hxAl3W" is an
# unresolved merge conflict; the file cannot be parsed until one side is
# chosen. The first side runs salmon directly through subprocess and changes
# directory afterwards; the second side builds a command list and hands it to
# clusterfunc.qsub_file. The original indentation of this fragment is not
# recoverable from here.
<<<<<<< .merge_file_ylVy1d
else:
    print "missing:",file2
salmon_string="salmon quant -i "+index+" --libType IU -1 "+file1+" -2 "+file2+" -o "+salmondir+sra+".quant"
s=subprocess.Popen(salmon_string,shell=True)
s.wait()
os.chdir("/home/ubuntu/MMETSP/")
=======
index,salmon_index_string = salmon_index(salmondir,sra,trinity_fasta)
salmon_string="salmon quant -i "+index+" --libType IU -1 "+file1+" -2 "+file2+" -o "+salmondir+sra+".quant --dumpEq --auxDir aux"
commands = [salmon_index_string,salmon_string]
process_name = "salmon"
module_name_list = ""
filename = sra
clusterfunc.qsub_file(salmondir,process_name,module_name_list,filename,commands)
# Conflict-side variant: returns the gather-counts command string without running it.
def gather_counts():
    gather_counts_string="python /home/ubuntu/MMETSP/gather-counts.py"
    return gather_counts_string
>>>>>>> .merge_file_hxAl3W
# NOTE(review): this second definition reuses the name gather_counts, so if
# both definitions survive conflict resolution this one replaces the zero-argument one.
def gather_counts(salmondir):
    # Changes the working directory to salmondir and prints it together with
    # the gather-counts command; the command is not executed (the subprocess
    # call below is commented out).
    os.chdir(salmondir)
    gather_counts="python /home/ubuntu/MMETSP/gather-counts.py"
    print os.getcwd()
    print gather_counts
    #s=subprocess.Popen(gather_counts,shell=True)
    #s.wait()