def execute(basedir,url_data): trinity_scripts=[] for item in url_data.keys(): #Creates directory for each file to be downloaded #Directory will be located according to organism and read type (single or paired) organism=item[0] seqtype=item[1] org_seq_dir=basedir+organism+"/" # from here, split paired reads # then go do assembly clusterfunc.check_dir(org_seq_dir) url_list=url_data[item] for url in url_list: SRA=basename(urlparse(url).path) newdir=org_seq_dir+SRA+"/" diginormdir=newdir+"diginorm/" diginormfile=diginormdir+SRA+".trimmed.interleaved.keep.abundfilt.fq.gz" trinitydir=newdir+"trinity/" clusterfunc.check_dir(trinitydir) if os.path.isfile(diginormfile): print "file exists:",diginormfile trinity_script=get_trinity_script(trinitydir,SRA) trinity_scripts.append(trinity_script) #build_files(trinitydir,diginormfile,SRA) run_trinity(trinity_scripts)
def move_files(url_data, basedir, newdir): for item in url_data: organism = item[0].replace("'", "") sra = item[1] mmetsp = item[2] if mmetsp.endswith("_2"): mmetsp = mmetsp.split("_")[0] org_seq_dir = basedir + organism + "/" + sra + "/" mmetsp_dir = newdir + mmetsp + "/" print mmetsp_dir clusterfunc.check_dir(mmetsp_dir) file1_old = org_seq_dir + "trinity/" + sra + ".left.fq" file2_old = org_seq_dir + "trinity/" + sra + ".right.fq" file1_new = mmetsp_dir + sra + ".left.fq" file2_new = mmetsp_dir + sra + ".right.fq" if os.path.isfile(file1_new): if os.path.isfile(file2_new): print file1_new print file2_new else: cp_string1 = copy_fastq_filesdir(mmetsp_dir, file1_old) cp_string2 = copy_fastq_filesdir(mmetsp_dir, file2_old) commands = [cp_string1, cp_string2] id = sra + "_" + mmetsp send_to_cluster(basedir, commands, id) print cp_string1 print cp_string2
def sim_link(salmondir,sra): counts_files_dir="/home/ubuntu/MMETSP/counts/" clusterfunc.check_dir(counts_files_dir) link_command="cp "+salmondir+sra+".quant.counts "+counts_files_dir+sra+".counts" print link_command s=subprocess.Popen(link_command,shell=True) s.wait()
def execute(url_data,datadir): for item in url_data.keys(): organism=item[0] org_seq_dir=datadir+organism+"/" url_list=url=url_data[item] for url in url_list: sra=basename(urlparse(url).path) newdir=org_seq_dir+sra+"/" trimdir=newdir+"trim/" interleavedir=newdir+"interleave/" clusterfunc.check_dir(trimdir) interleavedir=newdir+"interleave/" clusterfunc.check_dir(interleavedir) file1=newdir+sra+"_1.fastq" file2=newdir+sra+"_2.fastq" if os.path.isfile(file1) and os.path.isfile(file2): print file1 print file2 #fastqc_report(datadir,fastqcdir) ### need to fix so the following steps run themselves: run_trimmomatic_TruSeq(trimdir,file1,file2,sra) interleave_reads(trimdir,sra,interleavedir) #run_jellyfish(trimdir,sra) make_orphans(trimdir) else: print "Files do not exist:",file1,file2
def move_files(url_data,basedir,newdir): for item in url_data: organism = item[0].replace("'","") sra = item[1] mmetsp = item[2] if mmetsp.endswith("_2"): mmetsp = mmetsp.split("_")[0] org_seq_dir = basedir + organism + "/" + sra + "/" mmetsp_dir = newdir + mmetsp + "/" print mmetsp_dir clusterfunc.check_dir(mmetsp_dir) file1_old = org_seq_dir + "trinity/" + sra + ".left.fq" file2_old = org_seq_dir + "trinity/" + sra + ".right.fq" file1_new = mmetsp_dir + sra + ".left.fq" file2_new = mmetsp_dir + sra + ".right.fq" if os.path.isfile(file1_new): if os.path.isfile(file2_new): print file1_new print file2_new else: cp_string1 = copy_fastq_filesdir(mmetsp_dir,file1_old) cp_string2 = copy_fastq_filesdir(mmetsp_dir,file2_old) commands = [cp_string1,cp_string2] id = sra + "_" + mmetsp send_to_cluster(basedir,commands,id) print cp_string1 print cp_string2
def check_assemblies(url_data, assemblies, mmetsp_data):
    """Report which finished assemblies have a fixed fasta ready to copy.

    url_data: dict keyed by (organism, seqtype, mmetsp_id) tuples -> URL lists.
    assemblies: SRA ids whose Trinity run completed.
    mmetsp_data: strain lookup consumed by get_strain().
    NOTE(review): reads a global `basedir`; the actual cp/Popen calls are
    commented out, so this only prints the command it would run.
    """
    different = []
    for item in url_data:
        organism = item[0].replace("'", "")
        seqtype = item[1]
        mmetsp_id = item[2].replace("'", "")
        # get_strain also rebinds `different` (accumulated mismatches)
        strain, organism_mmetsp, different, alt = get_strain(
            different, mmetsp_id, organism, mmetsp_data)
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            command_list = []
            sra = basename(urlparse(url).path)
            # alt == "blank" means no alternate strain label in the sample name
            if alt == "blank":
                sample = organism + "_" + strain + "_" + sra + "_" + mmetsp_id
            else:
                sample = organism + "_" + strain + "_" + sra + "_" + mmetsp_id + "_alt_" + alt
            newdir = org_seq_dir + sra + "/"
            if sra in assemblies:
                trinitydir = newdir + "trinity/"
                #trinity_fasta = trinitydir+"trinity_out/"+"Trinity.fasta"
                trinity_fasta_new = trinitydir + sample + ".Trinity.fixed.fasta"
                #trinity_fixed_fasta = fix_fasta(trinity_fasta,trinitydir,sra)
                trinity_fasta_old = trinitydir + organism + "_" + sra + ".Trinity.fixed.fasta"
                assemblydir = "/mnt/scratch/ljcohen/mmetsp_assemblies/"
                # old-style (organism_sra) file is the marker that fixing ran
                if os.path.isfile(trinity_fasta_old) == True:
                    #trinity_fixed_fasta = fix_fasta(trinity_fasta,trinitydir,sra)
                    #copy_string="cp "+trinity_fasta_old+" "+trinity_fasta_new
                    copy_string = "cp " + trinity_fasta_new + " " + assemblydir
                    print copy_string
                    #s=subprocess.Popen(copy_string,shell=True)
                    #s.wait()
                else:
                    print "Trinity finished but don't have fixed version to copy."
def execute(url_data,datadir): missing = [] trimmed = [] remaining = [] for item in url_data.keys(): organism=item[0].replace("'","") org_seq_dir=datadir+organism+"/" url_list=url=url_data[item] for url in url_list: sra=basename(urlparse(url).path) newdir=org_seq_dir+sra+"/" trimdir=newdir+"trim/" interleavedir=newdir+"interleave/" clusterfunc.check_dir(trimdir) clusterfunc.check_dir(interleavedir) file1=newdir+sra+"_1.fastq" file2=newdir+sra+"_2.fastq" #if os.path.isfile(file1) and os.path.isfile(file2): # print file1 # print file2 missing,trimmed,remaining= run_trimmomatic_TruSeq(missing,trimmed,remaining,trimdir,file1,file2,sra) #run_move_files(trimdir,sra) # check_files(trimdir,sra) # else: # print "Files do not exist:",file1,file2 print "Missing trimmed:",len(missing) print missing print "Trimmed:",len(trimmed) print "remaining:",len(remaining) print remaining
def check_assemblies(url_data, assemblies, mmetsp_data):
    """Print the cp command for each assembly whose fixed fasta exists.

    Unformatted twin of the other check_assemblies in this file.
    NOTE(review): reads a global `basedir`; the Popen copy is commented out,
    so nothing is actually copied.
    """
    different = []
    for item in url_data:
        organism = item[0].replace("'", "")
        seqtype = item[1]
        mmetsp_id = item[2].replace("'", "")
        # get_strain rebinds `different` with the accumulated mismatches
        strain, organism_mmetsp, different, alt = get_strain(
            different, mmetsp_id, organism, mmetsp_data)
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            command_list = []
            sra = basename(urlparse(url).path)
            # "blank" alt: no alternate strain suffix in the sample name
            if alt == "blank":
                sample = organism + "_" + strain + "_" + sra + "_" + mmetsp_id
            else:
                sample = organism + "_" + strain + "_" + sra + "_" + mmetsp_id + "_alt_" + alt
            newdir = org_seq_dir + sra + "/"
            if sra in assemblies:
                trinitydir = newdir + "trinity/"
                #trinity_fasta = trinitydir+"trinity_out/"+"Trinity.fasta"
                trinity_fasta_new = trinitydir + sample + ".Trinity.fixed.fasta"
                #trinity_fixed_fasta = fix_fasta(trinity_fasta,trinitydir,sra)
                trinity_fasta_old = trinitydir + organism + "_" + sra + ".Trinity.fixed.fasta"
                assemblydir = "/mnt/scratch/ljcohen/mmetsp_assemblies/"
                # the old-style filename signals that fix_fasta already ran
                if os.path.isfile(trinity_fasta_old) == True:
                    #trinity_fixed_fasta = fix_fasta(trinity_fasta,trinitydir,sra)
                    #copy_string="cp "+trinity_fasta_old+" "+trinity_fasta_new
                    copy_string = "cp " + trinity_fasta_new + " " + assemblydir
                    print copy_string
                    #s=subprocess.Popen(copy_string,shell=True)
                    #s.wait()
                else:
                    print "Trinity finished but don't have fixed version to copy."
def execute(data_frame,url_data,basedir): trinity_fail=[] count = 0 # construct an empty pandas dataframe to add on each assembly.csv to for item in url_data.keys(): #print item organism=item[0] sample="_".join(item) org_seq_dir=basedir+organism+"/" url_list=url_data[item] for url in url_list: sra=basename(urlparse(url).path) newdir=org_seq_dir+sra+"/" trimdir=newdir+"trim/" trinitydir=newdir+"trinity/" busco_dir=newdir+"busco/qsub_files/" clusterfunc.check_dir(busco_dir) trinity_fasta=trinitydir+sample+".Trinity.fixed.fasta" busco_file=busco_dir+"run_"+sample+".euk/short_summary_"+sample+".euk" print busco_file if os.path.isfile(busco_file): count+=1 #run_busco(busco_dir,trinity_fasta,sample,sra) data=parse_busco_stats(busco_file,sample) data_frame=build_DataFrame(data_frame,data) else: print "Trinity failed:",trinity_fasta trinity_fail.append(newdir) print "This is the number of Trinity de novo transcriptome assemblies:" print count print "This is the number of times Trinity failed:" print len(trinity_fail) print trinity_fail return data_frame
def run_trimmomatic_TruSeq(missing, trimmed, remaining, trimdir, file1, file2, sra): bash_filename = trimdir + sra + ".trim.TruSeq.sh" clusterfunc.check_dir(trimdir + "qsub_files/") listoffile = os.listdir(trimdir + "qsub_files/") # print listoffile trim_file = trimdir + "qsub_files/" "trim." + sra + ".log" # print trim_file matching = [s for s in listoffile if "trim." + sra + ".log" in s] matching_string = "TrimmomaticPE: Completed successfully" if os.path.isfile(trim_file): with open(trim_file) as f: content = f.readlines() if len(matching) != 0: trim_complete = [m for m in content if matching_string in m] if len(trim_complete) != 0: print "Already trimmed:", matching trimmed.append(sra) else: missing.append(trimdir) j = """ java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \\ -baseout {}.trim.fq \\ {} {} \\ ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \\ SLIDINGWINDOW:4:2 \\ LEADING:2 \\ TRAILING:2 \\ MINLEN:25 &> trim.{}.log """.format(sra, file1, file2, sra) orphan_string = make_orphans(trimdir, sra) commands = [j, orphan_string] process_name = "trim" module_name_list = "" filename = sra clusterfunc.qsub_file(trimdir, process_name, module_name_list, filename, commands) else: remaining.append(trimdir) j = """ java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \\ -baseout {}.trim.fq \\ {} {} \\ ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \\ SLIDINGWINDOW:4:2 \\ LEADING:2 \\ TRAILING:2 \\ MINLEN:25 &> trim.{}.log """.format(sra, file1, file2, sra) orphan_string = make_orphans(trimdir, sra) commands = [j, orphan_string] process_name = "trim" module_name_list = "" filename = sra clusterfunc.qsub_file(trimdir, process_name, module_name_list, filename, commands) return missing, trimmed, remaining
def execute(data_frame1,data_frame2,mmetsp_data,basedir,mmetsp_assemblies): trinity_fail=[] reference_filename = "blank" # construct an empty pandas dataframe to add on each assembly.csv to for item in mmetsp_data.keys(): #print item organism=item[0] sample="_".join(item) org_seq_dir=basedir+organism+"/" mmetsp_list=mmetsp_data[item] for mmetsp in mmetsp_list: print mmetsp assemblyfileslist=os.listdir(mmetsp_assemblies) for filename in assemblyfileslist: if filename.startswith(mmetsp): if filename.endswith(".fixed.fa"): print "This is not the one you want." else: print "MMETSP assembly found:",filename reference_filename=filename if reference_filename == "blank": print "No MMETSP file found:",mmetsp break else: sra=item[1] newdir=org_seq_dir+sra+"/" trinitydir=newdir+"trinity/" transrate_dir=newdir+"transrate/" transrate_reference_dir=newdir+"transrate_dib_v_ncgr_cds/" clusterfunc.check_dir(transrate_reference_dir) transrate_reverse_dir=newdir+"transrate_ncgr_cds_v_dib/" clusterfunc.check_dir(transrate_reverse_dir) trinity_fasta=trinitydir+sample+".Trinity.fixed.fasta" if os.path.isfile(trinity_fasta): print trinity_fasta fixed_mmetsp_ref=fix_fasta_reference(reference_filename,mmetsp_assemblies) transrate(transrate_reference_dir,sample,trinity_fasta,mmetsp_assemblies_dir,fixed_mmetsp_ref) transrate_reverse(transrate_reverse_dir,sample,trinity_fasta,mmetsp_assemblies_dir,fixed_mmetsp_ref) else: print "Trinity failed:",newdir trinity_fail.append(newdir) transrate_assemblies_ref=transrate_reference_dir+sample+"/assemblies.csv" transrate_reverse_assemblies=transrate_reverse_dir+sample+"/assemblies.csv" print transrate_assemblies_ref print transrate_reverse_assemblies if os.path.isfile(transrate_assemblies_ref): data1=parse_transrate_stats(transrate_assemblies_ref) data_frame1=build_DataFrame(data_frame1,data1) if os.path.isfile(transrate_reverse_assemblies): data2=parse_transrate_stats(transrate_reverse_assemblies) data_frame2=build_DataFrame(data_frame2,data2) print 
"This is the number of times Trinity failed:" print len(trinity_fail) print trinity_fail return data_frame1,data_frame2
def execute(url_data):
    """Ensure a salmon/ output directory exists for every SRA sample.

    NOTE(review): reads a global `basedir`; `trinitydir` is computed but not
    used beyond the path string.
    """
    for key, urls in url_data.items():
        org_seq_dir = basedir + key[0] + "/"
        for url in urls:
            sra = basename(urlparse(url).path)
            sample_dir = org_seq_dir + sra + "/"
            trinitydir = sample_dir + "trinity/trinity_out/"
            salmondir = sample_dir + "salmon/"
            clusterfunc.check_dir(salmondir)
def run_trimmomatic_TruSeq(missing, trimmed, remaining, trimdir, file1, file2, sra): bash_filename=trimdir+sra+".trim.TruSeq.sh" clusterfunc.check_dir(trimdir+"qsub_files/") listoffile = os.listdir(trimdir+"qsub_files/") # print listoffile trim_file = trimdir+"qsub_files/""trim."+sra+".log" # print trim_file matching = [s for s in listoffile if "trim."+sra+".log" in s] matching_string = "TrimmomaticPE: Completed successfully" if os.path.isfile(trim_file): with open(trim_file) as f: content = f.readlines() if len(matching)!=0: trim_complete = [m for m in content if matching_string in m] if len(trim_complete)!=0: print "Already trimmed:",matching trimmed.append(sra) else: missing.append(trimdir) j=""" java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \\ -baseout {}.trim.fq \\ {} {} \\ ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \\ SLIDINGWINDOW:4:2 \\ LEADING:2 \\ TRAILING:2 \\ MINLEN:25 &> trim.{}.log """.format(sra,file1,file2,sra) orphan_string=make_orphans(trimdir,sra) commands = [j,orphan_string] process_name="trim" module_name_list="" filename=sra clusterfunc.qsub_file(trimdir,process_name,module_name_list,filename,commands) else: remaining.append(trimdir) j=""" java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \\ -baseout {}.trim.fq \\ {} {} \\ ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \\ SLIDINGWINDOW:4:2 \\ LEADING:2 \\ TRAILING:2 \\ MINLEN:25 &> trim.{}.log """.format(sra,file1,file2,sra) orphan_string=make_orphans(trimdir,sra) commands = [j,orphan_string] process_name="trim" module_name_list="" filename=sra clusterfunc.qsub_file(trimdir,process_name,module_name_list,filename,commands) return missing,trimmed,remaining
def execute(assemblydirs,salmondir,assemblydir,basedir,trimdir): for genus_species_names in assemblydirs: genus_species = genus_species_names.split(".")[0] species = genus_species+genus_species_names.split(".")[1] print genus_species print species dirname=trimdir+genus_species+"/" newdir=salmondir+genus_species_names+"/" clusterfunc.check_dir(newdir) trinity_fasta=assemblydir+genus_species_names+"/"+genus_species_names+".Trinity.fixed.fa" quant_salmon(newdir,dirname,genus_species_names,trinity_fasta,species)
def execute(url_data):
    """Fix Trinity fasta headers for each assembled SRA sample.

    NOTE(review): reads a global `basedir`.
    """
    for key, urls in url_data.items():
        org_seq_dir = basedir + key[0] + "/"
        for url in urls:
            sra = basename(urlparse(url).path)
            sample_dir = org_seq_dir + sra + "/"
            trinity_out = sample_dir + "trinity/trinity_out/"
            clusterfunc.check_dir(trinity_out)
            fixed_trinity = fix_fasta(trinity_out + "Trinity.fasta",
                                      trinity_out, sra)
def execute(url_data):
    """Queue salmon quantification against each sample's Trinity assembly.

    NOTE(review): reads a global `basedir`.
    """
    for key, urls in url_data.items():
        org_seq_dir = basedir + key[0] + "/"
        for url in urls:
            sra = basename(urlparse(url).path)
            sample_dir = org_seq_dir + sra + "/"
            trinity_fasta = sample_dir + "trinity/trinity_out/" + "Trinity.fasta"
            salmon_out = sample_dir + "salmon/"
            clusterfunc.check_dir(salmon_out)
            quant_salmon(salmon_out, sra, sample_dir, trinity_fasta)
def execute(url_data):
    """Run RapClust on the salmon output of every SRA sample.

    NOTE(review): reads a global `basedir`.
    """
    for key, urls in url_data.items():
        org_seq_dir = basedir + key[0] + "/"
        for url in urls:
            sra = basename(urlparse(url).path)
            sample_dir = org_seq_dir + sra + "/"
            salmondir = sample_dir + "salmon/"
            rapclustdir = sample_dir + "rapclust/"
            clusterfunc.check_dir(rapclustdir)
            clusterfunc.check_dir(salmondir)
            run_rap_clust(salmondir, rapclustdir, sra)
def check_sra(url_data, no_files):
    """Download/extract missing SRA files, then fastqc and trim when done.

    url_data: dict keyed by (organism, seqtype) tuples -> list of SRA URLs.
    no_files: SRA ids flagged upstream as missing pipeline output.
    NOTE(review): reads a global `basedir`.
    """
    for item in url_data:
        organism = item[0].replace("'", "")
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            command_list = []
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            if sra in no_files:
                if os.path.isdir(newdir):
                    print "Directory exists:", sra
                    # NOTE(review): checks bare `sra` relative to the CWD —
                    # probably meant newdir + sra; confirm before changing.
                    if os.path.isfile(sra):
                        print "Exists:", sra
                else:
                    print "Missing:", newdir
                    clusterfunc.check_dir(newdir)
                print url
                filestring = newdir + sra
                if os.path.isfile(filestring):
                    print "file exists:", filestring
                else:
                    urlstring = download(url, newdir, sra)
                    command_list.append(urlstring)
                if glob.glob(newdir + "*.fastq"):
                    print "SRA has already been extracted", filestring
                else:
                    sra_string = sra_extract(newdir, sra)
                    command_list.append(sra_string)
                names = "download_extract"
                print command_list
                # submit only when something is pending; otherwise the sample
                # is ready for fastqc + trimming
                if len(command_list) >= 1:
                    send_to_cluster(newdir, command_list, sra, names)
                else:
                    print "Pipeline already run."
                    fastqcdir = newdir + "fastqc/"
                    clusterfunc.check_dir(fastqcdir)
                    fastqc(newdir, fastqcdir, sra)
                    trimdir = newdir + "trim/"
                    interleavedir = newdir + "interleave/"
                    clusterfunc.check_dir(trimdir)
                    clusterfunc.check_dir(interleavedir)
                    file1 = newdir + sra + "_1.fastq"
                    file2 = newdir + sra + "_2.fastq"
                    if os.path.isfile(file1) and os.path.isfile(file2):
                        print file1
                        print file2
                        run_trimmomatic_TruSeq(trimdir, file1, file2, sra)
def check_sra(url_data, no_files):
    """Download/extract missing SRA files, then fastqc and trim when done.

    Unformatted twin of the other check_sra in this file.
    NOTE(review): reads a global `basedir`.
    """
    for item in url_data:
        organism = item[0].replace("'", "")
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            command_list = []
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            if sra in no_files:
                if os.path.isdir(newdir):
                    print "Directory exists:", sra
                    # NOTE(review): tests bare `sra` relative to the CWD —
                    # likely meant newdir + sra; confirm before changing.
                    if os.path.isfile(sra):
                        print "Exists:", sra
                else:
                    print "Missing:", newdir
                    clusterfunc.check_dir(newdir)
                print url
                filestring = newdir + sra
                if os.path.isfile(filestring):
                    print "file exists:", filestring
                else:
                    urlstring = download(url, newdir, sra)
                    command_list.append(urlstring)
                if glob.glob(newdir + "*.fastq"):
                    print "SRA has already been extracted", filestring
                else:
                    sra_string = sra_extract(newdir, sra)
                    command_list.append(sra_string)
                names = "download_extract"
                print command_list
                # pending commands go to the cluster; otherwise run QC/trim
                if len(command_list) >= 1:
                    send_to_cluster(newdir, command_list, sra, names)
                else:
                    print "Pipeline already run."
                    fastqcdir = newdir + "fastqc/"
                    clusterfunc.check_dir(fastqcdir)
                    fastqc(newdir, fastqcdir, sra)
                    trimdir = newdir + "trim/"
                    interleavedir = newdir + "interleave/"
                    clusterfunc.check_dir(trimdir)
                    clusterfunc.check_dir(interleavedir)
                    file1 = newdir + sra + "_1.fastq"
                    file2 = newdir + sra + "_2.fastq"
                    if os.path.isfile(file1) and os.path.isfile(file2):
                        print file1
                        print file2
                        run_trimmomatic_TruSeq(trimdir, file1, file2, sra)
def execute(trinity_fail, count, basedir): id_list = os.listdir(basedir) for mmetsp in id_list: if mmetsp != "qsub_files": mmetspdir = basedir + mmetsp + "/" trinitydir = basedir + mmetsp + "/" + "trinity/" trinity_files = os.listdir(mmetspdir) trinity_fasta=trinitydir+"trinity_out_2.2.0.Trinity.fasta" #trinity_fasta = trinitydir + sample + ".Trinity.fixed.fasta" clusterfunc.check_dir(trinitydir) if os.path.isfile(trinity_fasta) == False: if os.path.isfile("/mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/"+mmetsp+".trinity_out_2.2.0.Trinity.fasta"): print "Trinity finished." count +=1 else: print mmetspdir right = [s for s in trinity_files if s.endswith(".right.fq")][0] left = [s for s in trinity_files if s.endswith(".left.fq")][0] right = mmetspdir + right left = mmetspdir + left if os.path.isfile(left) and os.path.isfile(right): right = [s for s in trinity_files if s.endswith(".right.fq")][0] left = [s for s in trinity_files if s.endswith(".left.fq")][0] right = mmetspdir + right left = mmetspdir + left run_trinity(trinitydir,left,right,mmetsp) #print "Trinity failed:", trinity_fasta #trinity_fail.append(trinitydir) else: print "No files:",left else: print "Trinity completed successfully.", trinity_fasta count += 1 assemblydir = "/mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/" copy_string = "cp " + trinity_fasta + " " + assemblydir + mmetsp + ".trinity_out_2.2.0.Trinity.fasta" print copy_string s = subprocess.Popen(copy_string, shell=True) s.wait() # trinity_out=fix_fasta(trinity_fasta,trinitydir,sample) # print "Needs to be fixed:",trinity_fasta # print trinity_out #"Re-run diginorm:",diginormfile #count = check_trinity(newdir,SRA,count) print "Number of Trinity de novo transcriptome assemblies:" print count print "Number of times Trinity failed:" print len(trinity_fail) print trinity_fail return trinity_fail, count
def get_no_files(url_data):
    """Classify every SRA sample by how far the pipeline has progressed.

    Returns (trinity_fail, assemblies): samples needing a Trinity re-run,
    and samples with a finished fixed assembly. Also prints empty-file and
    missing-read diagnostics.
    NOTE(review): reads a global `basedir`.
    """
    assemblies = []
    trinity_fail = []
    empty_files = []
    no_files = []
    for item in url_data.keys():
        organism = item[0].replace("'", "")
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            filename = newdir + sra
            # check if trinity exists
            trinitydir = newdir + "trinity/"
            left = trinitydir + sra + ".left.fq"
            right = trinitydir + sra + ".right.fq"
            if os.path.isfile(left):
                empty_files = check_empty(empty_files, left, sra)
                if os.path.isfile(right):
                    empty_files = check_empty(empty_files, right, sra)
                    trinity_outputdir = trinitydir + "trinity_out/"
                    #trinity_file = trinity_outputdir + "Trinity.fasta"
                    trinity_file = trinitydir + organism + "_" + sra + ".Trinity.fixed.fasta"
                    trinity_fail, assemblies = check_trinity(
                        assemblies, trinity_fail, trinity_file, sra)
                else:
                    print "Missing right:", right
                    if sra not in trinity_fail:
                        no_files.append(sra)
            else:
                print "Missing left:", left
                if sra not in no_files:
                    if sra not in trinity_fail:
                        no_files.append(sra)
    print "Empty files:"
    print empty_files
    print len(empty_files)
    print "Trinity needs to be run again:"
    print trinity_fail
    print len(trinity_fail)
    print "Pipeline needs to be run again:"
    print no_files
    print len(no_files)
    print "Assemblies:"
    print len(assemblies)
    return trinity_fail, assemblies
def get_no_files(url_data):
    """Classify SRA samples by pipeline progress (unformatted twin).

    Returns (trinity_fail, assemblies); prints empty/missing diagnostics.
    NOTE(review): reads a global `basedir`.
    """
    assemblies = []
    trinity_fail = []
    empty_files = []
    no_files = []
    for item in url_data.keys():
        organism = item[0].replace("'", "")
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            filename = newdir + sra
            # check if trinity exists
            trinitydir = newdir + "trinity/"
            left = trinitydir + sra + ".left.fq"
            right = trinitydir + sra + ".right.fq"
            if os.path.isfile(left):
                empty_files = check_empty(empty_files, left, sra)
                if os.path.isfile(right):
                    empty_files = check_empty(empty_files, right, sra)
                    trinity_outputdir = trinitydir + "trinity_out/"
                    #trinity_file = trinity_outputdir + "Trinity.fasta"
                    trinity_file = trinitydir + organism + "_" + sra + ".Trinity.fixed.fasta"
                    trinity_fail, assemblies = check_trinity(
                        assemblies, trinity_fail, trinity_file, sra)
                else:
                    print "Missing right:", right
                    if sra not in trinity_fail:
                        no_files.append(sra)
            else:
                print "Missing left:", left
                if sra not in no_files:
                    if sra not in trinity_fail:
                        no_files.append(sra)
    print "Empty files:"
    print empty_files
    print len(empty_files)
    print "Trinity needs to be run again:"
    print trinity_fail
    print len(trinity_fail)
    print "Pipeline needs to be run again:"
    print no_files
    print len(no_files)
    print "Assemblies:"
    print len(assemblies)
    return trinity_fail, assemblies
def execute(basedir, newdir, url_data):
    """Mirror per-SRA outputs into an MMETSP-keyed layout and copy trimmed reads.

    NOTE(review): the transrate/busco copy steps are still TODO (commented);
    only the trimmed _1P/_2P fastq files are actually copied, to ged_trim.
    """
    for item in url_data:
        organism = item[0].replace("'", "")
        org_seq_dir = basedir + organism + "/"
        mmetsp = item[2]
        # collapse "_2"-suffixed ids onto the base MMETSP id
        if mmetsp.endswith("_2"):
            mmetsp = mmetsp.split("_")[0]
        sra = item[1]
        newdir_sra = org_seq_dir + sra + "/"
        # source locations (per-SRA layout)
        sra_transrate_1 = newdir_sra + "transrate_dib_v_ncgr_cds/"
        sra_transrate_2 = newdir_sra + "transrate_ncgr_cds_v_dib/"
        sra_transrate = newdir_sra + "transrate/"
        sra_trim = newdir_sra + "trim/"
        sra_trim_1P = sra_trim + sra + ".trim_1P.fq"
        sra_trim_2P = sra_trim + sra + ".trim_2P.fq"
        sra_busco = newdir_sra + "busco/"
        # destination locations (per-MMETSP layout)
        newdir_mmetsp = newdir + mmetsp + "/"
        newdir_mmetsp_sra = newdir_mmetsp + "sra/"
        newdir_mmetsp_sra_transrate = newdir_mmetsp_sra + "transrate/"
        newdir_mmetsp_sra_trim = newdir_mmetsp_sra + "trim/"
        newdir_mmetsp_sra_busco = newdir_mmetsp_sra + "busco/"
        newdir_mmetsp_sra_fastqc = newdir_mmetsp_sra + "fastqc_raw/"
        clusterfunc.check_dir(newdir_mmetsp_sra)
        clusterfunc.check_dir(newdir_mmetsp_sra_transrate)
        clusterfunc.check_dir(newdir_mmetsp_sra_trim)
        clusterfunc.check_dir(newdir_mmetsp_sra_busco)
        clusterfunc.check_dir(newdir_mmetsp_sra_fastqc)
        if os.path.isdir(newdir_sra):
            print "Exists:", newdir_sra
        else:
            print "Missing:", newdir_sra
        if os.path.isdir(newdir_mmetsp):
            print "Exists:", newdir_mmetsp
        else:
            print "Missing:", newdir_mmetsp
        # copy_transrate1 =
        #copy_transrate2 =
        #copy_transrate_scores =
        #copy_trim_reads =
        ged_trim = "/mnt/research/ged/data/mmetsp/trimmed_reads/"
        copy_file(sra_trim_1P, ged_trim)
        copy_file(sra_trim_2P, ged_trim)
def execute(listoffiles, trimdir, interleavedir, diginormdir, assemblydir):
    """Combine orphan reads per sample (diginorm steps currently disabled).

    NOTE(review): the first loop computes fileslist/matching but never uses
    them — looks like dead code retained from an earlier version.
    """
    files_dictionary = get_files(listoffiles, trimdir)
    for sample in files_dictionary.keys():
        fileslist = sorted(files_dictionary[sample])
        matching = [s for s in fileslist if "orphans" in s]
    interleavefileslist = os.listdir(interleavedir)
    interleave_files = get_samples(interleavefileslist, interleavedir)
    print interleave_files
    for sample in interleave_files:
        orphansfile = combine_orphans(trimdir, sample)
        sample_diginormdir = diginormdir + sample + "/"
        clusterfunc.check_dir(sample_diginormdir)
        #run_diginorm(trimdir,sample_diginormdir,orphansfile,interleavedir,sample)
        #run_filt_abund(sample_diginormdir,sample)
        #run_extract_paired(sample_diginormdir,sample)
        combine_orphans_after_diginorm(sample_diginormdir, sample)
def execute(url_data,basedir): for item in url_data.keys(): organism=item[0] org_seq_dir=basedir+organism+"/" url_list=url_data[item] for url in url_list: sra=basename(urlparse(url).path) newdir=org_seq_dir+sra+"/" trinitydir=newdir+"trinity/trinity_out/" dammitdir=newdir+"dammit_dir/" clusterfunc.check_dir(dammitdir) trinity_fasta=trinitydir+"Trinity.fasta" print trinity_fasta if os.path.isfile(trinity_fasta): print "File exists:",trinity_fasta dammit_string(basedir,dammitdir,sra,trinity_fasta)
def execute(data_frame,species,basedir): # construct an empty pandas dataframe to add on each assembly.csv to newdir=basedir+species+"/" trinitydir=newdir+"trinity_out/" busco_dir=newdir+"busco/" clusterfunc.check_dir(busco_dir) trinity_fasta=trinitydir+"Trinity.fasta" busco_file=busco_dir+"qsub_files/"+"run_"+species+".metazoa/short_summary_"+species+".metazoa" if os.path.isfile(trinity_fasta): fixed_trinity_fasta=fix_fasta(trinity_fasta,trinitydir,species) #run_busco(busco_dir,fixed_trinity_fasta,species) data=parse_busco_stats(busco_file,species) data_frame=build_DataFrame(data_frame,data) else: print "Trinity failed:",trinity_fasta return data_frame
def execute(url_data, basedir): for item in url_data.keys(): organism = item[0] org_seq_dir = basedir + organism + "/" url_list = url_data[item] for url in url_list: sra = basename(urlparse(url).path) newdir = org_seq_dir + sra + "/" trinitydir = newdir + "trinity/trinity_out/" dammitdir = newdir + "dammit_dir/" clusterfunc.check_dir(dammitdir) trinity_fasta = trinitydir + "Trinity.fasta" print trinity_fasta if os.path.isfile(trinity_fasta): print "File exists:", trinity_fasta dammit_string(basedir, dammitdir, sra, trinity_fasta)
def group_assembly_files(diginormdir, assemblydir):
    """Stage abundance-filtered files into per-genus_species assembly dirs.

    NOTE(review): rename commands and process metadata are built but nothing
    is submitted here — the qsub call appears to live elsewhere or was removed.
    """
    abund_filt_files = get_abund_filt_files(diginormdir)
    genus_species_files = get_genus_species(abund_filt_files)
    for genus_species in genus_species_files:
        genus_species_dir = assemblydir + genus_species + "/"
        clusterfunc.check_dir(genus_species_dir)
        for filename in genus_species_files[genus_species]:
            abund_filt_filename = diginormdir + filename
            sample_info = filename.split("_")
            extension = sample_info[-1].split(".")
            # sample name: everything before the last "_" chunk, plus pieces
            # of the dotted extension
            sample = "_".join(sample_info[:-1])
            sample = sample + "_" + extension[0] + "_" + extension[2]
            rename_command = get_rename(genus_species_dir, sample,
                                        abund_filt_filename)
            process_name = "split"
            rename_command = [rename_command]
            module_name_list = ""
            filename = sample
def execute(trinity_fail, count, basedir):
    """Run or account for Trinity assemblies per MMETSP dir (dib_conf paths).

    trinity_fail: accumulator of failed directories.
    count: running total of finished assemblies.
    Returns the updated (trinity_fail, count).
    """
    id_list = os.listdir(basedir)
    for mmetsp in id_list:
        if mmetsp != "qsub_files":
            mmetspdir = basedir + mmetsp + "/"
            trinitydir = basedir + mmetsp + "/" + "trinity/"
            trinity_files = os.listdir(mmetspdir)
            trinity_fasta = trinitydir + dib_conf.output_extension
            clusterfunc.check_dir(trinitydir)
            if os.path.isfile(trinity_fasta) == False:
                if os.path.isfile(dib_conf.output_dir + mmetsp + dib_conf.output_extension):
                    print("Trinity finished.")
                    count += 1
                else:
                    print(mmetspdir)
                    # NOTE(review): IndexError if no .right.fq/.left.fq exists
                    right = [
                        s for s in trinity_files if s.endswith(".right.fq")
                    ][0]
                    left = [
                        s for s in trinity_files if s.endswith(".left.fq")
                    ][0]
                    right = mmetspdir + right
                    left = mmetspdir + left
                    if os.path.isfile(left) and os.path.isfile(right):
                        # (identical right/left recomputation removed here)
                        run_trinity(trinitydir, left, right, mmetsp,
                                    dib_conf.output_dir,
                                    dib_conf.output_extension)
                    else:
                        print("No files:", left)
            else:
                print("Trinity completed successfully.", trinity_fasta)
                count += 1
                assemblydir = dib_conf.output_dir
    print("Number of Trinity de novo transcriptome assemblies:", count)
    print("Number of times Trinity failed:", len(trinity_fail), trinity_fail)
    return trinity_fail, count
def execute(basedir, url_data):
    """Run abundance filtering on the diginorm output of every SRA sample."""
    for key, urls in url_data.items():
        organism = key[0]
        seqtype = key[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        for url in urls:
            SRA = basename(urlparse(url).path)
            sample_dir = org_seq_dir + SRA + "/"
            interleavedir = sample_dir + "interleave/"
            diginormdir = sample_dir + "diginorm/"
            clusterfunc.check_dir(diginormdir)
            trimdir = sample_dir + "trim/"
            #run_streaming_diginorm(trimdir,SRA,diginormdir)
            #interleave_reads(trimdir,SRA,interleavedir)
            #run_diginorm(diginormdir,interleavedir,trimdir,SRA)
            run_filter_abund(diginormdir, SRA)
def execute(basedir, url_data):
    """Run abundance filtering on the diginorm output of every SRA sample.

    Twin of the previous filter-abund execute(); earlier pipeline stages
    (streaming diginorm, interleave, diginorm) are commented out.
    """
    for item in url_data.keys():
        organism = item[0]
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            SRA = basename(urlparse(url).path)
            newdir = org_seq_dir + SRA + "/"
            interleavedir = newdir + "interleave/"
            diginormdir = newdir + "diginorm/"
            clusterfunc.check_dir(diginormdir)
            trimdir = newdir + "trim/"
            # run_streaming_diginorm(trimdir,SRA,diginormdir)
            # interleave_reads(trimdir,SRA,interleavedir)
            # run_diginorm(diginormdir,interleavedir,trimdir,SRA)
            run_filter_abund(diginormdir, SRA)
def get_num_files(filesdir, workingdir):
    """Build diamond all-vs-all command batches for each Species*.fa file.

    NOTE(review): submission (qsub_file) is commented out, so this only
    assembles `process_string` batches; `split_num`/block_list are leftovers.
    """
    listoffiles = os.listdir(workingdir)
    diamond_dir = workingdir + "diamond/"
    clusterfunc.check_dir(diamond_dir)
    species_filenames = []
    for filename in listoffiles:
        if filename.startswith("Species"):
            if filename.endswith(".fa"):
                species_filenames.append(filename)
    print species_filenames
    species_num = len(species_filenames)
    print "This is the num of species:", species_num
    split_num = 4000
    print split_num
    i = 0
    count = 0
    #block_list = [species_filenames[x:x+split_num] for x in range(0,len(species_filenames),split_num)]
    process_string = []
    db_files = os.listdir(diamond_dir)
    print len(db_files)
    db_out_dir = workingdir + "diamond_out/"
    clusterfunc.check_dir(db_out_dir)
    for species_filename in species_filenames:
        print species_filename
        for db in db_files:
            if db.endswith(".dmnd"):
                # print i
                # db_filename=os.path.splitext(species_filename)[0]
                db_filename = db
                ortho_command = run_diamond_loop(diamond_dir, workingdir,
                                                 db_out_dir, species_filename,
                                                 db_filename)
                process_string.append(ortho_command)
                i += 1
        else:
            # NOTE(review): read as a for-else — runs once after all dbs for
            # this species are queued; confirm against the original layout.
            i = 0
            basedir = db_out_dir
            process_name = "OrthoFinder"
            module_name_list = ["GNU/4.4.5", "diamond/0.7.9"]
            filename = "Group_" + str(count)
            # clusterfunc.qsub_file(basedir,process_name,module_name_list,filename,process_string)
            process_string = []
            count += 1
def get_num_files(filesdir, workingdir): listoffiles = os.listdir(workingdir) diamond_dir = workingdir + "diamond/" clusterfunc.check_dir(diamond_dir) species_filenames = [] for filename in listoffiles: if filename.startswith("Species"): if filename.endswith(".fa"): species_filenames.append(filename) print species_filenames species_num = len(species_filenames) print "This is the num of species:", species_num split_num = 4000 print split_num i = 0 count = 0 #block_list = [species_filenames[x:x+split_num] for x in range(0,len(species_filenames),split_num)] process_string = [] db_files = os.listdir(diamond_dir) print len(db_files) db_out_dir = workingdir + "diamond_out/" clusterfunc.check_dir(db_out_dir) for species_filename in species_filenames: print species_filename for db in db_files: if db.endswith(".dmnd"): # print i # db_filename=os.path.splitext(species_filename)[0] db_filename = db ortho_command = run_diamond_loop( diamond_dir, workingdir, db_out_dir, species_filename, db_filename) process_string.append(ortho_command) i += 1 else: i = 0 basedir = db_out_dir process_name = "OrthoFinder" module_name_list = ["GNU/4.4.5", "diamond/0.7.9"] filename = "Group_" + str(count) # clusterfunc.qsub_file(basedir,process_name,module_name_list,filename,process_string) process_string = [] count += 1
def execute(data_frame, url_data, basedir): trinity_fail = [] count = 0 # construct an empty pandas dataframe to add on each assembly.csv to for item in url_data.keys(): # print item organism = item[0].replace("'","") sample = "_".join(item).replace("'","") org_seq_dir = basedir + organism + "/" url_list = url_data[item] for url in url_list: sra = basename(urlparse(url).path) newdir = org_seq_dir + sra + "/" trimdir = newdir + "trim/" #trinitydir = newdir + "trinity/trinity_out/" trinitydir = newdir + "trinity/" transrate_dir = newdir + "transrate/" clusterfunc.check_dir(transrate_dir) trinity_fasta = trinitydir + organism + "_" + sra + ".Trinity.fixed.fasta" transrate_out = transrate_dir + "transrate_out." + sample + "/" transrate_assemblies = transrate_out + "assemblies.csv" if os.path.isfile(trinity_fasta): # print transrate_out count += 1 # fixed_trinity=fix_fasta(trinity_fasta,trinitydir,sample) if os.path.isfile(transrate_assemblies): data = parse_transrate_stats(transrate_assemblies) data_frame = build_DataFrame(data_frame, data) else: print "Transrate still needs to run:", transrate_assemblies transrate(trinitydir,transrate_dir,transrate_out,trinity_fasta,sample,trimdir,sra) transrate_assemblies = transrate_out + "assemblies.csv" else: print "Trinity failed:", trinity_fasta trinity_fail.append(newdir) print "This is the number of Trinity de novo transcriptome assemblies:" print count print "This is the number of times Trinity failed:" print len(trinity_fail) print trinity_fail return data_frame
def execute(url_data): sample_dictionary = {} missing = [] trimmed = [] for item in url_data: organism = item[0].replace("'","") seqtype = item[1] org_seq_dir = basedir + organism + "/" clusterfunc.check_dir(org_seq_dir) url_list = url_data[item] for url in url_list: command_list = [] sra = basename(urlparse(url).path) newdir = org_seq_dir + sra + "/" trimdir = newdir + "trim/qsub_files/" listoffile = os.listdir(trimdir) #print listoffile trim_file = trimdir+"trim."+sra+".log" #print trim_file matching = [s for s in listoffile if "trim."+sra+".log" in s] matching_string = "TrimmomaticPE: Completed successfully" if os.path.isfile(trim_file): with open(trim_file) as f: content = f.readlines() if len(matching)!=0: trim_complete = [m for m in content if matching_string in m] if len(trim_complete)!=0: print "Already trimmed:",matching sample_dictionary=get_sample_dictionary(sample_dictionary,trim_file,sra) trimmed.append(sra) else: missing.append(trimdir) print "Missing:",trimdir #print sample_dictionary trim_table(sample_dictionary) print "Missing trimmed:",len(missing) print missing print "Trimmed:",len(trimmed) print "Out of" print len(url_data.keys()) return missing
def combine_files(files,basedir,combine_dir): clusterfunc.check_dir(combine_dir) for species in files.keys(): fields=files[species][0].split("_") extension=fields[-1] parsed_extension1=extension.split(".") parsed_extension2=parsed_extension1[1:] new_extension=".".join(parsed_extension2) newfilename=get_newfilename(fields,new_extension) print species print files[species] #print newfilename newfilename=combine_dir+newfilename files_string=" ".join(files[species]) combine_string="cat "+files_string+" > "+newfilename print combine_string workingdir=os.getcwd() os.chdir(basedir) #s=subprocess.Popen(combine_string,shell=True) #s.wait() os.chdir(workingdir)
def execute(url_data): sample_dictionary = {} for item in url_data: organism = item[0].replace("'","") seqtype = item[1] org_seq_dir = basedir + organism + "/" clusterfunc.check_dir(org_seq_dir) url_list = url_data[item] for url in url_list: command_list = [] sra = basename(urlparse(url).path) newdir = org_seq_dir + sra + "/" trimdir = newdir + "trim/qsub_files/" trim_out_file = trimdir + "trim." + sra + ".log" if os.path.isfile(trim_out_file): print trim_out_file sample_dictionary=get_sample_dictionary(sample_dictionary,trim_out_file,sra) else: print "No trim out log available:",trim_out_file print sample_dictionary trim_table(sample_dictionary)
def check_sra(url_data,missing): num_download = [] for item in url_data: organism = item[0].replace("'","") seqtype = item[1] org_seq_dir = basedir + organism + "/" clusterfunc.check_dir(org_seq_dir) url_list = url_data[item] for url in url_list: command_list = [] sra = basename(urlparse(url).path) newdir = org_seq_dir + sra + "/" trimdir = newdir + "trim/qsub_files/" if trimdir in missing: if os.path.isdir(newdir): print "Directory exists:",sra if os.path.isfile(sra): print "Exists:",sra else: num_download.append(newdir) print "Missing:",newdir clusterfunc.check_dir(newdir) print url filestring = newdir + sra if os.path.isfile(filestring): print "file exists:", filestring else: urlstring = download(url,newdir,sra) command_list.append(urlstring) if glob.glob(newdir + "*.fastq"): print "SRA has already been extracted", filestring else: sra_string = sra_extract(newdir,sra) command_list.append(sra_string) names = "download_extract" print command_list if len(command_list) >=1: send_to_cluster(newdir,command_list,sra,names) print "Num to download:",len(num_download) print num_download
# Downloads each SRA file into its organism/run directory, removes zero-byte
# downloads so they are re-fetched, then extracts reads and runs fastqc.
# NOTE(review): this line is whitespace-collapsed; whether sra_extract/fastqc
# run unconditionally or only in the download branch cannot be determined from
# the flattened text — confirm against the original file before reformatting.
def execute(basedir, url_data): for item in url_data.keys(): # Creates directory for each file to be downloaded # Directory will be located according to organism and read type (single # or paired) organism = item[0] seqtype = item[1] org_seq_dir = basedir + organism + "/" print org_seq_dir clusterfunc.check_dir(org_seq_dir) url_list = url_data[item] for url in url_list: filename = basename(urlparse(url).path) print filename newdir = org_seq_dir + filename + "/" full_filename = newdir + filename clusterfunc.check_dir(newdir) fastqcdir = newdir + "fastqc/" clusterfunc.check_dir(fastqcdir) # check to see if filename exists in newdir if filename in os.listdir(newdir): print "sra exists:", filename if os.stat(full_filename).st_size == 0: print "SRA file is empty:", filename os.remove(full_filename) else: print "file will be downloaded:", filename download(url, newdir, filename) sra_extract(newdir, filename) fastqc(newdir, fastqcdir, filename)
def execute(data_frame, url_data, basedir): trinity_fail = [] count = 0 # construct an empty pandas dataframe to add on each assembly.csv to for item in url_data.keys(): #print item organism = item[0] sample = "_".join(item) org_seq_dir = basedir + organism + "/" url_list = url_data[item] for url in url_list: sra = basename(urlparse(url).path) newdir = org_seq_dir + sra + "/" trimdir = newdir + "trim/" trinitydir = newdir + "trinity/trinity_out/" transrate_dir = newdir + "transrate/" clusterfunc.check_dir(transrate_dir) trinity_fasta = trinitydir + "Trinity.fasta" transrate_out = transrate_dir + "transrate_out." + sample + "/" if os.path.isfile(trinity_fasta): #transrate(dammit_dir) #print transrate_out count += 1 #fixed_trinity=fix_fasta(trinity_fasta,trinitydir,sample) #transrate(trinitydir,transrate_dir,transrate_out,trinity_fasta,sample,trimdir,sra) transrate_assemblies = transrate_out + "assemblies.csv" if os.path.isfile(transrate_assemblies): data = parse_transrate_stats(transrate_assemblies) data_frame = build_DataFrame(data_frame, data) else: print "Transrate did not complete:", transrate_assemblies else: print "Trinity failed:", trinity_fasta trinity_fail.append(newdir) print "This is the number of Trinity de novo transcriptome assemblies:" print count print "This is the number of times Trinity failed:" print len(trinity_fail) print trinity_fail return data_frame
def execute(trinity_fail, count, basedir):
    """Check each MMETSP sample for a finished Trinity 2.2.0 assembly.

    Missing assemblies are (re)launched with run_trinity when the paired
    *.left.fq / *.right.fq inputs exist; finished assemblies are copied into
    the shared assembly directory.

    Fixes vs. original: the .right.fq/.left.fq directory scan was performed
    twice, and indexing `[0]` on the match lists raised IndexError whenever a
    sample directory contained no matching read files — now reported instead.
    """
    id_list = os.listdir(basedir)
    for mmetsp in id_list:
        if mmetsp == "qsub_files":
            continue
        mmetspdir = basedir + mmetsp + "/"
        trinitydir = basedir + mmetsp + "/" + "trinity/"
        trinity_files = os.listdir(mmetspdir)
        trinity_fasta = trinitydir + "trinity_out_2.2.0.Trinity.fasta"
        clusterfunc.check_dir(trinitydir)
        if os.path.isfile(trinity_fasta) == False:
            if os.path.isfile("/mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/" + mmetsp + ".trinity_out_2.2.0.Trinity.fasta"):
                print("Trinity finished.")
                count += 1
            else:
                print(mmetspdir)
                # Single scan for the paired read files (was duplicated).
                rights = [s for s in trinity_files if s.endswith(".right.fq")]
                lefts = [s for s in trinity_files if s.endswith(".left.fq")]
                if rights and lefts:
                    right = mmetspdir + rights[0]
                    left = mmetspdir + lefts[0]
                    if os.path.isfile(left) and os.path.isfile(right):
                        run_trinity(trinitydir, left, right, mmetsp)
                    else:
                        print("No files:", left)
                else:
                    # Previously crashed with IndexError on the empty list.
                    print("No files:", mmetspdir)
        else:
            print("Trinity completed successfully.", trinity_fasta)
            count += 1
            assemblydir = "/mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/"
            copy_string = "cp " + trinity_fasta + " " + assemblydir + mmetsp + ".trinity_out_2.2.0.Trinity.fasta"
            print(copy_string)
            s = subprocess.Popen(copy_string, shell=True)
            s.wait()
    print("Number of Trinity de novo transcriptome assemblies:", count)
    print("Number of times Trinity failed:", len(trinity_fail), trinity_fail)
def execute(url_data): trinity_fail=[] empty_files=[] for item in url_data.keys(): organism=item[0] seqtype=item[1] org_seq_dir=basedir+organism+"/" clusterfunc.check_dir(org_seq_dir) url_list=url_data[item] for url in url_list: sra=basename(urlparse(url).path) newdir=org_seq_dir+sra+"/" filename=newdir+sra ## check if trinity exists trinitydir=newdir+"trinity/" left=trinitydir+"left.fq" right=trinitydir+"right.fq" if os.stat(left).st_size == 0: print "File is empty:",left if sra not in empty_files: empty_files.append(sra) if os.stat(right).st_size == 0: print "File is empty:",right if sra not in empty_files: empty_files.append(sra) trinity_outputdir=trinitydir+"trinity_out/" trinity_file=trinity_outputdir+"Trinity.fasta" if os.path.isfile(trinity_file): print "Trinity completed successfully:",trinity_file else: print "Trinity needs to be run again:",filename trinity_fail.append(sra) diginormdir=newdir+"diginorm/" trimdir=newdir+"trim/" print "List of empty files:" print empty_files print "Trinity needs to be run again:" print trinity_fail
def execute(basedir,url_data): for item in url_data.keys(): organism=item[0] seqtype=item[1] org_seq_dir=basedir+organism+"/" print org_seq_dir clusterfunc.check_dir(org_seq_dir) url_list=url_data[item] for url in url_list: sra=basename(urlparse(url).path) newdir=org_seq_dir+sra+"/" sample="_".join(item) filename=newdir+sra print filename ## # run this to delete SRA file: ## if os.path.isfile(filename): delete_file(filename) else: print "Already deleted:",filename
def execute(basedir, url_data):
    """Fix the Trinity fasta headers for each run and copy the result into
    the transdecoder directory (all transdecoder steps are disabled)."""
    for key in url_data.keys():
        species = key[0]
        species_dir = basedir + species + "/"
        for link in url_data[key]:
            run_id = basename(urlparse(link).path)
            run_dir = species_dir + run_id + "/"
            trinity_dir = run_dir + "trinity/trinity_out/"
            transdecoder_dir = run_dir + "transdecoder/"
            clusterfunc.check_dir(transdecoder_dir)
            raw_assembly = trinity_dir + "Trinity.fasta"
            fixed_prefix = run_id + ".Trinity.fixed.fa"
            fixed_assembly = fix_fasta(raw_assembly, trinity_dir, run_id)
            # transdecoder_LongOrf / transdecoder_Predict / get_longest_ORF
            # calls were removed from the active path upstream.
            copy_files(fixed_prefix, fixed_assembly, transdecoder_dir, run_id)
def execute(url_data,basedir,mmetsp_assemblies): trinity_fail=[] # construct an empty pandas dataframe to add on each assembly.csv to for item in mmetsp_data.keys(): #print item organism=item[0] sample="_".join(item) org_seq_dir=basedir+organism+"/" mmetsp_list=mmetsp_data[item] for mmetsp in mmetsp_list: print mmetsp assemblyfileslist=os.listdir(mmetsp_assemblies) for filename in assemblyfileslist: if filename.startswith(mmetsp): if filename.endswith(".fixed.fa"): print "This is not the one you want." else: print "MMETSP assembly found:",filename reference_filename=filename sra=item[1] newdir=org_seq_dir+sra+"/" trinitydir=newdir+"trinity/trinity_out/" dammit_dir=trinitydir+"dammit_dir/" transrate_dir="/mnt/comparisons/" reverse_transrate_dir="/mnt/comparisons_reverse/" clusterfunc.check_dir(transrate_dir) clusterfunc.check_dir(dammit_dir) clusterfunc.check_dir(reverse_transrate_dir) #trinity_fasta=dammit_dir+"Trinity.fasta.dammit.fasta" trinity_fasta=trinitydir+"Trinity.fasta" if os.path.isfile(trinity_fasta): print trinity_fasta fixed_trinity=fix_fasta(trinity_fasta,trinitydir,sample) fixed_mmetsp_ref=fix_fasta_reference(reference_filename,mmetsp_assemblies) #transrate(transrate_dir,fixed_trinity,mmetsp_assemblies,fixed_mmetsp_ref) transrate_reverse(reverse_transrate_dir,sample,fixed_trinity,mmetsp_assemblies_dir,fixed_mmetsp_ref) else: print "Trinity failed:",newdir trinity_fail.append(newdir) print "This is the number of times Trinity failed:" print len(trinity_fail) print trinity_fail
def run_diginorm(fastq_list, mmetsp_dir, mmetsp):
    """Create the diginorm output directory for this sample.

    fastq_list and mmetsp are accepted for interface compatibility but are
    not used by this stub.
    """
    out_dir = mmetsp_dir + "diginorm/"
    clusterfunc.check_dir(out_dir)
# NOTE(review): fragment — the enclosing `def` is not visible in this chunk,
# and the text contains UNRESOLVED MERGE-CONFLICT residue ("<< << << <
# .merge_file_oAh5J2" ... "== == == ="). Resolve the conflict against the
# original file history before this code can be used; left verbatim here.
assemblyfileslist = os.listdir(mmetsp_assemblies) for filename in assemblyfileslist: if filename.startswith(mmetsp): if filename.endswith(".fixed.fa"): print "This is not the one you want." else: print "MMETSP assembly found:", filename reference_filename = filename sra = item[1] newdir = org_seq_dir + sra + "/" << << << < .merge_file_oAh5J2 trinitydir = newdir + "trinity/trinity_out/" dammit_dir = trinitydir + "dammit_dir/" transrate_dir = "/mnt/comparisons/" reverse_transrate_dir = "/mnt/comparisons_reverse/" clusterfunc.check_dir(transrate_dir) clusterfunc.check_dir(dammit_dir) clusterfunc.check_dir(reverse_transrate_dir) # trinity_fasta=dammit_dir+"Trinity.fasta.dammit.fasta" trinity_fasta = trinitydir + "Trinity.fasta" if os.path.isfile(trinity_fasta): print trinity_fasta fixed_trinity = fix_fasta(trinity_fasta, trinitydir, sample) fixed_mmetsp_ref = fix_fasta_reference( reference_filename, mmetsp_assemblies) # transrate(transrate_dir,fixed_trinity,mmetsp_assemblies,fixed_mmetsp_ref) transrate_reverse(reverse_transrate_dir, sample, fixed_trinity, mmetsp_assemblies_dir, fixed_mmetsp_ref) == == == = trinitydir = newdir + "trinity/" # dammit_dir=trinitydir+"dammit_dir/"
def check_sra(url_data, no_files, mmetsp_data): different = [] for item in url_data: organism = item[0].replace("'", "") seqtype = item[1] mmetsp_id = item[2].replace("'", "") strain, organism_mmetsp, different, alt = get_strain( different, mmetsp_id, organism, mmetsp_data) org_seq_dir = basedir + organism + "/" clusterfunc.check_dir(org_seq_dir) url_list = url_data[item] for url in url_list: command_list = [] sra = basename(urlparse(url).path) if alt == "blank": sample = organism + "_" + strain + "_" + sra + "_" + mmetsp_id else: sample = organism + "_" + strain + "_" + sra + "_" + mmetsp_id + "_alt_" + alt newdir = org_seq_dir + sra + "/" if sra in no_files: if os.path.isdir(newdir): print "Directory exists:", sra if os.path.isfile(sra): print "Exists:", sra else: print "Missing:", newdir clusterfunc.check_dir(newdir) print url filestring = newdir + sra if os.path.isfile(filestring): print "file exists:", filestring else: urlstring = download(url, newdir, sra) command_list.append(urlstring) if glob.glob(newdir + "*.fastq"): print "SRA has already been extracted", filestring else: sra_string = sra_extract(newdir, sra) command_list.append(sra_string) names = "download_extract" print command_list if len(command_list) >= 1: send_to_cluster(newdir, command_list, sra, names) else: print "Pipeline already run." 
# NOTE(review): fragment — interior of a larger per-run pipeline function
# (fastqc -> trimmomatic -> diginorm -> Trinity -> fix_fasta -> copy) whose
# `def` header is outside this chunk; left verbatim, do not reformat blind.
fastqcdir = newdir + "fastqc/" clusterfunc.check_dir(fastqcdir) fastqc(newdir, fastqcdir, sra) trimdir = newdir + "trim/" interleavedir = newdir + "interleave/" clusterfunc.check_dir(trimdir) clusterfunc.check_dir(interleavedir) diginormdir = newdir + "diginormdir/" clusterfunc.check_dir(diginormdir) trinitydir = newdir + "trinity/" clusterfunc.check_dir(trinitydir) diginormfile = diginormdir + "qsub_files/" + sra + ".trimmed.interleaved.fq.keep.abundfilt.pe" trinity_fasta = trinitydir + "trinity_out/" + "Trinity.fasta" #trinity_fasta_new =trinitydir+sample+".Trinity.fixed.fasta" trinity_fasta_new = trinitydir + organism + "_" + sra + ".Trinity.fixed.fasta" file1 = newdir + sra + "_1.fastq" file2 = newdir + sra + "_2.fastq" assemblydir = "/mnt/scratch/ljcohen/mmetsp_assemblies/" if os.path.isfile(file1) and os.path.isfile(file2): print file1 print file2 run_trimmomatic_TruSeq(trimdir, file1, file2, sra) file1_trim = trimdir + sra + ".trim_1P.fq" file2_trim = trimdir + sra + ".trim_2P.fq" if os.path.isfile(file1_trim) and os.path.isfile( file2): #interleave_reads(trimdir, sra, interleavedir) #run_diginorm(diginormdir,interleavedir,trimdir,sra) #run_filter_abund(diginormdir,interleavedir,trimdir,sra) #rename_files(trinitydir,diginormdir,diginormfile,sra) if os.path.isfile(trinity_fasta) == False: run_trinity(trinitydir, sra) else: print "Trinity completed!", trinity_fasta trinity_fixed_fasta = fix_fasta( trinity_fasta, trinitydir, sra) #copy_string="cp "+trinity_fixed_fasta+" "+trinity_fasta_new copy_string = "cp " + trinity_fasta_new + " " + assemblydir print copy_string
def run_Trinity(ncgr_dir, mmetsp_dir, mmetsp, data_frame1, data_frame2): trinitydir = mmetsp_dir + "trinity/" clusterfunc.check_dir(trinitydir) diginormdir = mmetsp_dir + "diginorm/" transratedir = mmetsp_dir + "transrate/" clusterfunc.check_dir(transratedir) #split_paired_reads(trinitydir, diginormdir, mmetsp) #combine_orphans(diginormdir,mmetsp) right = trinitydir + mmetsp + ".right.fq" left = trinitydir + mmetsp + ".left.fq" trinity_fasta = trinitydir + "trinity_out/" + "Trinity.fasta" #rename_files(trinitydir, diginormdir, mmetsp) if os.path.isfile(right) and os.path.isfile(left): if os.path.isfile(trinity_fasta): print trinity_fasta cp_string = "cp " + trinity_fasta + " " + mmetsp_dir + mmetsp + ".Trinity.fasta" fixed_fasta = fix_fasta(trinity_fasta, mmetsp_dir, mmetsp) #print cp_string old_assemblies = sorted([ s for s in os.listdir(mmetsp_dir) if s.endswith(".fixed.fasta") and s.split("_")[-1].startswith("SRR") ]) #print old_assemblies #for old_assembly in old_assemblies: #transrate(transratedir, mmetsp, fixed_fasta, mmetsp_dir, old_assembly) #transrate_reverse(transratedir, mmetsp, fixed_fasta, mmetsp_dir, old_assembly) # sra = old_assembly.split("_")[-1].split(".")[0] # sample = mmetsp + "_" + sra # reverse_sample = "reverse_" + mmetsp + "_" + sra # transrate_assemblies_ref = transratedir + sample + "/assemblies.csv" # transrate_reverse_assemblies = transratedir + reverse_sample + "/assemblies.csv" #if os.path.isfile(transrate_assemblies_ref): # data1 = parse_transrate_stats(transrate_assemblies_ref,sra,mmetsp) # data_frame1 = build_DataFrame(data_frame1,data1) #else: # "Transrate failed:",transrate_assemblies_ref #if os.path.isfile(transrate_reverse_assemblies): # data2 = parse_transrate_stats(transrate_reverse_assemblies,sra,mmetsp) # data_frame2 = build_DataFrame(data_frame2,data2) #else: # print "Reverse failed:",transrate_reverse_assemblies #s = subprocess.Popen(cp_string, shell = True) #s.wait() ncgr_assembly = mmetsp + ".nt.fa.fixed.fa" sample = 
mmetsp + "_" + mmetsp reverse_sample = "reverse_" + mmetsp + "_" + mmetsp transrate(transratedir, mmetsp, fixed_fasta, ncgr_dir, ncgr_assembly) transrate_reverse(transratedir, mmetsp, fixed_fasta, ncgr_dir, ncgr_assembly) transrate_assemblies_ref = transratedir + sample + "/assemblies.csv" transrate_reverse_assemblies = transratedir + reverse_sample + "/assemblies.csv" if os.path.isfile(transrate_assemblies_ref): data1 = parse_transrate_stats(transrate_assemblies_ref, mmetsp, mmetsp) data_frame1 = build_DataFrame(data_frame1, data1) else: print "Transrate failed:", transrate_assemblies_ref if os.path.isfile(transrate_reverse_assemblies): data2 = parse_transrate_stats(transrate_reverse_assemblies, mmetsp, mmetsp) data_frame2 = build_DataFrame(data_frame2, data2) else: print "Transrate failed:", transrate_reverse_assemblies else: get_trinity(trinitydir, left, right, mmetsp) #cp_string1 = "cp " + right + " " + mmetsp_dir #cp_string2 = "cp " + left + " " + mmetsp_dir #s = subprocess.Popen(cp_string1, shell=True) #print cp_string1 #s.wait() #t = subprocess.Popen(cp_string2, shell=True) #print cp_string2 #t.wait() return data_frame1, data_frame2
# NOTE(review): fragment plus module-level driver code — the loop body at the
# start belongs to a function whose `def` header is outside this chunk.
# The trailing statements are the script entry point (builds url_data from
# ../SraRunInfo_719.csv and calls get_trinity).  Left verbatim.
trinity_fasta = org_seq_dir + "trinity/" + organism + "_" + sra + ".Trinity.fixed.fasta" #if os.path.isfile(file1) and os.path.isfile(file2): # print file1 # print file2 # run_trinity(mmetsp_dir,file1,file2,mmetsp) # else: # print "missing:",file1 if os.path.isfile(trinity_fasta): print trinity_fasta count.append(trinity_fasta) cp_string = "cp " + trinity_fasta + " " + mmetsp_dir print cp_string s = subprocess.Popen(cp_string, shell=True) s.wait() else: print "Missing:", trinity_fasta missing.append(trinity_fasta) print len(count) print missing basedir = "/mnt/scratch/ljcohen/mmetsp_sra/" newdir = "/mnt/scratch/ljcohen/mmetsp/" clusterfunc.check_dir(newdir) datafile = "../SraRunInfo_719.csv" url_data = get_data(datafile) print url_data print len(url_data) #move_files(url_data,basedir,newdir) get_trinity(url_data, newdir, basedir)
def execute(data_frame1,data_frame2,ncgr_dir,trinity_fail, count, basedir): assemblydir = "/mnt/scratch/ljcohen/mmetsp_assemblies/" old_files = os.listdir(assemblydir) id_list = os.listdir(basedir) for mmetsp in id_list: if mmetsp != "qsub_files": alt_mmetsp = mmetsp + "_2" mmetspdir = basedir + mmetsp + "/" trinitydir = basedir + mmetsp + "/" + "trinity/" trinity_files = os.listdir(mmetspdir) transrate_dir = mmetspdir + "transrate/" clusterfunc.check_dir(transrate_dir) trinity_fasta=trinitydir+"trinity_out_2.2.0.Trinity.fasta" alt_trinity_fasta = "/mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/"+mmetsp+".trinity_out_2.2.0.Trinity.fasta" #trinity_fasta = trinitydir + sample + ".Trinity.fixed.fasta" clusterfunc.check_dir(trinitydir) if os.path.isfile(trinity_fasta) == False and os.path.isfile(alt_trinity_fasta) == False: right = [s for s in trinity_files if s.endswith(".right.fq")][0] left = [s for s in trinity_files if s.endswith(".left.fq")][0] right = mmetspdir + right left = mmetspdir + left if os.path.isfile(left) and os.path.isfile(right): #run_trinity(trinitydir,left,right,mmetsp) print "Trinity not finished:", trinity_fasta trinity_fail.append(trinitydir) else: print "No files:",left elif os.path.isfile(trinity_fasta) == True: or os.path.isfile(alt_trinity_fasta) == True: print "Trinity completed successfully.", trinity_fasta count += 1 old_assemblies = glob.glob(assemblydir+"*"+mmetsp+"*") if len(old_assemblies) >= 1: full_assembly = old_assemblies[0] else: print glob.glob(assemblydir + "*" + mmetsp + "*") #copy_string = "cp " + trinity_fasta + " " + assemblydir #print copy_string #s = subprocess.Popen(copy_string, shell=True) #s.wait() fixed_fasta = fix_fasta(trinity_fasta,trinitydir,mmetsp) #sra = old_assembly.split("_")[-1].split(".")[0] ncgr_assembly = ncgr_dir + mmetsp + ".nt.fa.fixed.fa" sample = "trinity2.2.0_"+mmetsp+"_trinity2014" reverse_sample = "reverse_trinity2014_" + mmetsp + "_trinity2.2.0" transrate_out = transrate_dir + sample + "/" + 
"assemblies.csv" transrate_reverse_assemblies = transrate_dir + reverse_sample + "/" + "assemblies.csv" if os.path.isfile(transrate_out): print "Transrate completed:",transrate_out data1 = parse_transrate_stats(transrate_out,mmetsp,mmetsp) data_frame1 = build_DataFrame(data_frame1,data1) else: transrate(transrate_dir, mmetsp, fixed_fasta, mmetspdir, full_assembly) if os.path.isfile(transrate_reverse_assemblies): print "Transrate complete:",transrate_reverse_assemblies data2 = parse_transrate_stats(transrate_reverse_assemblies,mmetsp,mmetsp) data_frame2 = build_DataFrame(data_frame2,data2) else: transrate_reverse(transrate_dir, mmetsp, fixed_fasta, mmetspdir, full_assembly)
# NOTE(review): fragment plus module-level driver code — the leading if/else
# chain is the tail of an `execute(...)` whose header is outside this chunk
# (it ends with `return data_frame`); the trailing statements configure the
# osmotic transrate run and call that execute.  Left verbatim.
if os.path.isfile(left) and os.path.isfile(right): print left print right else: print "Does not exist.",left,right else: print "Does not exist:",diginormdir trinity_fasta = assemblydir + sample + "/" + sample + ".Trinity.fixed.fa" transrate_out = transratedir + sample + "/" transrate_assemblies = transrate_out + "/" + "assemblies.csv" if os.path.isfile(trinity_fasta): print trinity_fasta else: print "Trinity failed:", trinity_fasta if os.path.isfile(transrate_assemblies): data = parse_transrate_stats(transrate_assemblies) data_frame = build_DataFrame(data_frame, data) else: print "Running transrate..." transrate(transratedir,transrate_out,trinity_fasta,sample,left,right) return data_frame assemblydir = "/home/ljcohen/msu_assemblies_finished/" basedir = "/home/ljcohen/osmotic_combined/" transratedir = "/home/ljcohen/osmotic_transrate_scores/" clusterfunc.check_dir(transratedir) listoffiles = os.listdir(basedir) data_frame = pd.DataFrame() data_frame = execute(data_frame,listoffiles, assemblydir, transratedir) #data_frame.to_csv("transrate_scores.csv")
# NOTE(review): mixed fragment — the leading if/else is the tail of a
# download `execute(...)` whose header is outside this chunk, followed by a
# complete `def fastqc` helper (collects *.fastq files and hands them to
# fastqc_report) and the script driver that loops over `datafiles`.
# `datafiles` is not defined in this chunk — presumably at file top; confirm.
# Left verbatim because the flattened text makes branch boundaries ambiguous.
# check to see if filename exists in newdir if filename in os.listdir(newdir): print "sra exists:", filename if os.stat(full_filename).st_size == 0: print "SRA file is empty:", filename os.remove(full_filename) else: print "file will be downloaded:", filename download(url, newdir, filename) sra_extract(newdir, filename) fastqc(newdir, fastqcdir, filename) def fastqc(newdir, fastqcdir, filename): listoffiles = os.listdir(newdir) print listoffiles fastq_file_list = [] for i in listoffiles: if i.endswith(".fastq"): fastq_file_list.append(newdir + i) fastqc_report(fastq_file_list, newdir, fastqcdir, filename) datafile = "SraRunInfo.csv" basedir = "~/" clusterfunc.check_dir(basedir) for datafile in datafiles: url_data = get_data(datafile) print url_data execute(basedir, url_data)