def execute(listoffiles,trimdir,interleavedir,diginormdir,assemblydir): #files_dictionary=get_files(listoffiles,trimdir) #print files_dictionary #for sample in files_dictionary: #fileslist=sorted(files_dictionary[sample]) #interleavefile=interleave_reads(trimdir,interleavedir,fileslist,sample) interleave_files=os.listdir(interleavedir) interleave_files_dictionary=get_files_population(interleave_files,interleavedir) print interleave_files_dictionary #diginorm_files=get_diginorm_files(diginormdir) for population in interleave_files_dictionary.keys(): #fileslist=sorted(files_dictionary[population]) #get_orphans(fileslist,trimdir,sample) #orphansfile=interleavedir+population+".trim.orphans.fq.gz" #graph_count_filename=run_diginorm(diginormdir,orphansfile,interleavedir,population) #diginorm_files=get_diginorm_files(diginormdir) #for diginormfile in diginorm_files[sample]: #run_filt_abund(diginormdir,graph_count_filename,population) genus_species_dir=assemblydir+population+"/" rename_command=get_rename(diginormdir,population) process_name="split" rename_command=[rename_command] module_name_list="" filename=population #clusterfunc.sbatch_file(diginormdir,process_name,module_name_list,filename,rename_command) split_command1=split_reads(diginormdir,population) process_name="split" module_name_list="" split_command=[split_command1] clusterfunc.sbatch_file(diginormdir,process_name,module_name_list,filename,split_command)
def combine_files(merge_dictionary, basedir, combine_dir): for sample in merge_dictionary: R1 = [] R2 = [] for i in merge_dictionary[sample]: file_fields = i.split("/") fields = file_fields[5].split("_") if fields[3] == "R1": R1.append(i) elif fields[3] == "R2": R2.append(i) else: print "Wrong field.", fields print sample, "R1 files:", R1 print sample, "R2 files:", R2 fields_read = R1[0].split("/")[5].split("_") sample = fields_read[0] extension = fields_read[4][3:] newfilename_R1 = combine_dir + sample + "_R1" + extension newfilename_R2 = combine_dir + sample + "_R2" + extension files_string_R1 = " ".join(R1) files_string_R2 = " ".join(R2) combine_string_R1 = "cat " + files_string_R1 + " > " + newfilename_R1 combine_string_R2 = "cat " + files_string_R2 + " > " + newfilename_R2 print combine_string_R1 print combine_string_R2 #s=subprocess.Popen(combine_string,shell=True) #s.wait() combine_command = [combine_string_R1, combine_string_R2] module_load_list = [""] process_name = "combine" clusterfunc.sbatch_file(basedir, process_name, module_load_list, sample, combine_command)
def run_filt_abund(sample_diginormdir,sample):
    """Queue an abundance-filter (filter-abund) job for *sample*.

    Bug fix: the original passed ``trimdir`` — a name not defined in this
    function — to sbatch_file; the job script belongs in
    ``sample_diginormdir`` alongside the diginorm outputs it filters.
    """
    abund_filt=get_filter_abund(sample_diginormdir,sample)
    abund_filt_command=[abund_filt]
    process_name="abundfilt"
    module_name_list=""
    filename=sample
    clusterfunc.sbatch_file(sample_diginormdir,process_name,module_name_list,filename,abund_filt_command)
def split_reads(assemblydir):
    # For every genus_species assembly directory: build a split-paired-reads
    # command for each *.pe.keep.abundfilt.fq file (submission currently
    # disabled), then queue one "combine" job that cats the split halves into
    # {genus_species}.left.fq / .right.fq and appends the gzipped orphan reads
    # onto the left file.
    # NOTE(review): original indentation was lost; the combine step is
    # reconstructed at the per-genus_species level (one job per species),
    # which matches its wildcard cat commands — confirm against run history.
    assemblydirs=os.listdir(assemblydir)
    for genus_species in assemblydirs:
        genus_species_dir=assemblydir+genus_species+"/"
        listoffiles=os.listdir(genus_species_dir)
        for filename in listoffiles:
            if filename.endswith("pe.keep.abundfilt.fq"):
                # next time you run this, specify output file,
                # otherwise output will be put in sbatch_files directory
                # moved manually 2/7/2016
                split_command="""
                split-paired-reads.py -1 {}{}.1 -2 {}{}.2 {}{}
                """.format(genus_species_dir,filename,genus_species_dir,filename,genus_species_dir,filename)
                process_name="split"
                module_name_list=""
                split_command=[split_command]
                #clusterfunc.sbatch_file(genus_species_dir,process_name,module_name_list,filename,split_command)
            #else:
            #    print "Not found:",filename
        combine="""
        cat {}*.1 > {}{}.left.fq
        cat {}*.2 > {}{}.right.fq
        gunzip -c {}*orphans.keep.abundfilt.fq.gz >> {}{}.left.fq
        """.format(genus_species_dir,genus_species_dir,genus_species,genus_species_dir,genus_species_dir,genus_species,genus_species_dir,genus_species_dir,genus_species)
        combine_command=[combine]
        process_name="combine"
        module_name_list=""
        clusterfunc.sbatch_file(genus_species_dir,process_name,module_name_list,genus_species,combine_command)
def combine_orphans_after_diginorm(genus_species_diginormdir,sample):
    """Queue a gzip/consolidate job for *sample*'s post-diginorm orphan reads."""
    commands = [consolidate(genus_species_diginormdir, sample)]
    clusterfunc.sbatch_file(genus_species_diginormdir, "gzip", "", sample, commands)
def run_extract_paired(genus_species_dir,sample):
    """Queue an extract-paired-reads job on *sample*'s .abundfilt file."""
    abundfilt_path = genus_species_dir + sample + ".abundfilt"
    commands = [get_extract_paired(genus_species_dir, sample, abundfilt_path)]
    clusterfunc.sbatch_file(genus_species_dir, "extract", "", sample, commands)
def interleave_reads(trimdir,interleavedir,files_list,sample):
    """Queue interleaving of *sample*'s paired reads.

    Returns the path of the interleaved output file that the job will write.
    """
    out_path = interleavedir + sample + ".interleaved.fq.gz"
    pairs = get_pairs(files_list, trimdir, sample)
    commands = [get_interleave_string(pairs, out_path)]
    clusterfunc.sbatch_file(interleavedir, "interleave", "", sample, commands)
    return out_path
def execute(assemblydirs,assemblydir):
    """Queue one 'cat' job inside each genus_species assembly subdirectory."""
    #files_dictionary=get_files(assemblydirs,assemblydir)
    for species_name in assemblydirs:
        species_dir = assemblydir + species_name + "/"
        commands = [get_cat_command(species_dir, species_name)]
        clusterfunc.sbatch_file(species_dir, "cat", [""], species_name, commands)
def combine_split(genus_species_dir=None, genus_species=None):
    """Queue a job that cats split read halves into .left.fq/.right.fq and
    appends the orphan reads onto the left file.

    Bug fix: the original took no parameters and read ``genus_species_dir``
    and ``genus_species`` as undefined module globals (NameError unless some
    other code had set them). They are now parameters; for backward
    compatibility a no-argument call still falls back to the module globals.

    NOTE(review): this variant gunzips ``*orphans.keep.abundfilt.fq`` (no
    .gz suffix) unlike the sibling in split_reads — confirm which is right.
    """
    if genus_species_dir is None:
        genus_species_dir = globals()["genus_species_dir"]
    if genus_species is None:
        genus_species = globals()["genus_species"]
    combine="""
    cat {}*.1 > {}{}.left.fq
    cat {}*.2 > {}{}.right.fq
    gunzip -c {}*orphans.keep.abundfilt.fq >> {}{}.left.fq
    """.format(genus_species_dir,genus_species_dir,genus_species,genus_species_dir,genus_species_dir,genus_species,genus_species_dir,genus_species_dir,genus_species)
    combine_command=[combine]
    process_name="combine"
    module_name_list=""
    clusterfunc.sbatch_file(genus_species_dir,process_name,module_name_list,genus_species,combine_command)
def run_filt_abund(diginormdir,graph_count_filename,diginorm_sample):
    """Queue an abundance-filter job for *diginorm_sample* in *diginormdir*.

    ``graph_count_filename`` is accepted for call-site compatibility but is
    not used here (the orphans special-casing it supported is retired).
    """
    commands = [get_filter_abund(diginormdir, diginorm_sample)]
    clusterfunc.sbatch_file(diginormdir, "abundfilt", "", diginorm_sample, commands)
def transrate(transratedir,transrate_out,trinity_fasta,sample,left,right): transrate_command = """ transrate --assembly={} --threads=4 \ --left={} \ --right={} \ --output={} """.format(trinity_fasta,left,right,transrate_out) print transrate_command commands = [transrate_command] process_name = "transrate" module_name_list = "" filename = sample clusterfunc.sbatch_file(transratedir, process_name,module_name_list, filename, commands)
def get_assemblies(assemblydir): #genus_species_dirs=os.listdir(assemblydir) genus_species_dirs=["F_heteroclitus.MDPP","F_heteroclitus.MDPL"] for genus_species in genus_species_dirs: print genus_species genus_species_dir=assemblydir+genus_species+"/" assemblyfile=genus_species_dir+"Trinity.fasta" bam_out=genus_species_dir+genus_species+".bam" flagstat_out=genus_species_dir+genus_species+".flagstat.txt" module_load_list=["bwa/0.7.9a","samtools/1.2"] bwa_command=[bwa_mem(assemblyfile,bam_out,flagstat_out)] process_name="bwa" clusterfunc.sbatch_file(genus_species_dir,process_name,module_load_list,genus_species,bwa_command)
def get_orphans(files_list,trimdir,sample):
    """Collect the unpaired trim outputs (1U/2U) and queue an orphan-merge job."""
    unpaired = [f for f in files_list if f.endswith(("1U.fq", "2U.fq"))]
    commands = [make_orphans(trimdir, unpaired, sample)]
    clusterfunc.sbatch_file(trimdir, "orphans", "", sample, commands)
def quant_salmon(newdir,dirname,genus_species,trinity_fasta,species): salmon_index_string,index=salmon_index(newdir,genus_species,trinity_fasta) print salmon_index_string salmon_string=""" for i in {}{}*.trim_1P.fq do BASE=$(basename $i .trim_1P.fq) salmon quant -i {}{} --libType IU -1 {}$BASE.trim_1P.fq -2 {}$BASE.trim_2P.fq -o {}$BASE.quant; done """.format(dirname,species,newdir,index,dirname,dirname,newdir) print salmon_string salmonstring=[salmon_index_string,salmon_string] process_name="salmon" module_name_list="" clusterfunc.sbatch_file(newdir,process_name,module_name_list,genus_species,salmonstring)
def run_trinity(assemblydir): assemblydirs=os.listdir(assemblydir) for genus_species in assemblydirs: genus_species_dir=assemblydir+genus_species+"/" listoffiles=os.listdir(genus_species_dir) trinity_command=""" set -x # stops execution if there is an error set -e if [ -f {}trinity_out/Trinity.fasta ]; then exit 0 ; fi if [ -d {}trinity_out ]; then mv {}trinity_out_dir {}trinity_out_dir0 || true ; fi Trinity --left {}{}.left.fq \\ --right {}{}.right.fq \\ --output {}trinity_out --seqType fq --max_memory 14G \\ --CPU ${{THREADS:-2}} """.format(genus_species_dir,genus_species_dir,genus_species_dir,genus_species_dir,genus_species_dir,genus_species,genus_species_dir,genus_species,genus_species_dir) print trinity_command trinity_command=[trinity_command] module_load_list=["rsem/1.2.23","trinity/2.0.5"] process_name="trinity" clusterfunc.sbatch_file(genus_species_dir,process_name,module_load_list,genus_species,trinity_command)
def fastqc_report(fastqcdir, fastq_file, sample_name):
    """Queue a FastQC job that writes its report into *fastqcdir*.

    Bug fix: sbatch_file was called with ``basedir``, a name not defined in
    this function (NameError unless set as a module global elsewhere). The
    job script now goes in ``fastqcdir``, the directory this function is
    actually given.
    """
    fastqc_string = "fastqc -o " + fastqcdir + " " + fastq_file
    process_string = [fastqc_string]
    process_name = "fastqc"
    module_load_list = ["fastqc/0.10.1"]
    clusterfunc.sbatch_file(fastqcdir, process_name, module_load_list, sample_name, process_string)
def run_trimmomatic(trimdir,file1,file2,sample):
    """Queue a Trimmomatic job for one paired-end sample."""
    commands = [get_trimmomatic(trimdir, file1, file2, sample)]
    clusterfunc.sbatch_file(trimdir, "trim", "", sample, commands)