Example #1
def execute(basedir, url_data):
    trinity_scripts = []
    for item in url_data.keys():
        # Creates directory for each file to be downloaded
        # Directory will be located according to organism and read type (single or paired)
        organism = item[0]
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        # from here, split paired reads
        # then go do assembly
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            SRA = basename(urlparse(url).path)
            newdir = org_seq_dir + SRA + "/"
            diginormdir = newdir + "diginorm/"
            diginormfile = diginormdir + SRA + ".trimmed.interleaved.keep.abundfilt.fq.gz"
            trinitydir = newdir + "trinity/"
            clusterfunc.check_dir(trinitydir)
            if os.path.isfile(diginormfile):
                print "file exists:", diginormfile
            trinity_script = get_trinity_script(trinitydir, SRA)
            trinity_scripts.append(trinity_script)
            #build_files(trinitydir,diginormfile,SRA)
    run_trinity(trinity_scripts)
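
Every example in this listing calls clusterfunc.check_dir(...) before writing into a directory. The project's own clusterfunc module is not shown here; a minimal sketch of what such a helper could look like (an assumption, not the actual implementation) is:

import os

def check_dir(path):
    # Hypothetical stand-in for clusterfunc.check_dir:
    # create the directory (and any missing parents) if it does not exist yet.
    if not os.path.isdir(path):
        os.makedirs(path)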
Example #2
def move_files(url_data, basedir, newdir):
    for item in url_data:
        organism = item[0].replace("'", "")
        sra = item[1]
        mmetsp = item[2]
        if mmetsp.endswith("_2"):
            mmetsp = mmetsp.split("_")[0]
        org_seq_dir = basedir + organism + "/" + sra + "/"
        mmetsp_dir = newdir + mmetsp + "/"
        print mmetsp_dir
        clusterfunc.check_dir(mmetsp_dir)
        file1_old = org_seq_dir + "trinity/" + sra + ".left.fq"
        file2_old = org_seq_dir + "trinity/" + sra + ".right.fq"
        file1_new = mmetsp_dir + sra + ".left.fq"
        file2_new = mmetsp_dir + sra + ".right.fq"
        if os.path.isfile(file1_new):
            if os.path.isfile(file2_new):
                print file1_new
                print file2_new
        else:
            cp_string1 = copy_fastq_filesdir(mmetsp_dir, file1_old)
            cp_string2 = copy_fastq_filesdir(mmetsp_dir, file2_old)
            commands = [cp_string1, cp_string2]
            id = sra + "_" + mmetsp
            send_to_cluster(basedir, commands, id)
            print cp_string1
            print cp_string2
Example #3
def sim_link(salmondir, sra):
    counts_files_dir = "/home/ubuntu/MMETSP/counts/"
    clusterfunc.check_dir(counts_files_dir)
    link_command = "cp " + salmondir + sra + ".quant.counts " + counts_files_dir + sra + ".counts"
    print link_command
    s = subprocess.Popen(link_command, shell=True)
    s.wait()
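
sim_link shells out to cp and waits for it to finish. If a plain local copy is all that is needed, the standard library can do the same work without spawning a shell; this is a sketch of that alternative under the same directory layout, not a change to the example above:

import shutil

def sim_link(salmondir, sra):
    counts_files_dir = "/home/ubuntu/MMETSP/counts/"
    clusterfunc.check_dir(counts_files_dir)
    # shutil.copy does the copy in-process, avoiding subprocess/shell=True.
    shutil.copy(salmondir + sra + ".quant.counts",
                counts_files_dir + sra + ".counts")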
Example #4
def execute(url_data, datadir):
    for item in url_data.keys():
        organism = item[0]
        org_seq_dir = datadir + organism + "/"
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            trimdir = newdir + "trim/"
            interleavedir = newdir + "interleave/"
            clusterfunc.check_dir(trimdir)
            clusterfunc.check_dir(interleavedir)
            file1 = newdir + sra + "_1.fastq"
            file2 = newdir + sra + "_2.fastq"
            if os.path.isfile(file1) and os.path.isfile(file2):
                print file1
                print file2
                #fastqc_report(datadir,fastqcdir)
                ### need to fix so the following steps run themselves:
                run_trimmomatic_TruSeq(trimdir, file1, file2, sra)
                interleave_reads(trimdir, sra, interleavedir)
                #run_jellyfish(trimdir,sra)
                make_orphans(trimdir)
            else:
                print "Files do not exist:", file1, file2
Example #5
def move_files(url_data, basedir, newdir):
    for item in url_data:
        organism = item[0].replace("'", "")
        sra = item[1]
        mmetsp = item[2]
        if mmetsp.endswith("_2"):
            mmetsp = mmetsp.split("_")[0]
        org_seq_dir = basedir + organism + "/" + sra + "/"
        mmetsp_dir = newdir + mmetsp + "/"
        print mmetsp_dir
        clusterfunc.check_dir(mmetsp_dir)
        file1_old = org_seq_dir + "trinity/" + sra + ".left.fq"
        file2_old = org_seq_dir + "trinity/" + sra + ".right.fq"
        file1_new = mmetsp_dir + sra + ".left.fq"
        file2_new = mmetsp_dir + sra + ".right.fq"
        if os.path.isfile(file1_new):
            if os.path.isfile(file2_new):
                print file1_new
                print file2_new
        else:
            cp_string1 = copy_fastq_filesdir(mmetsp_dir, file1_old)
            cp_string2 = copy_fastq_filesdir(mmetsp_dir, file2_old)
            commands = [cp_string1, cp_string2]
            id = sra + "_" + mmetsp
            send_to_cluster(basedir, commands, id)
            print cp_string1
            print cp_string2
Example #6
def check_assemblies(url_data, assemblies, mmetsp_data):
    different = []
    for item in url_data:
        organism = item[0].replace("'", "")
        seqtype = item[1]
        mmetsp_id = item[2].replace("'", "")
        strain, organism_mmetsp, different, alt = get_strain(
            different, mmetsp_id, organism, mmetsp_data)
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            command_list = []
            sra = basename(urlparse(url).path)
            if alt == "blank":
                sample = organism + "_" + strain + "_" + sra + "_" + mmetsp_id
            else:
                sample = organism + "_" + strain + "_" + sra + "_" + mmetsp_id + "_alt_" + alt
            newdir = org_seq_dir + sra + "/"
            if sra in assemblies:
                trinitydir = newdir + "trinity/"
                #trinity_fasta = trinitydir+"trinity_out/"+"Trinity.fasta"
                trinity_fasta_new = trinitydir + sample + ".Trinity.fixed.fasta"
                #trinity_fixed_fasta = fix_fasta(trinity_fasta,trinitydir,sra)
                trinity_fasta_old = trinitydir + organism + "_" + sra + ".Trinity.fixed.fasta"
                assemblydir = "/mnt/scratch/ljcohen/mmetsp_assemblies/"
                if os.path.isfile(trinity_fasta_old) == True:
                    #trinity_fixed_fasta = fix_fasta(trinity_fasta,trinitydir,sra)
                    #copy_string="cp "+trinity_fasta_old+" "+trinity_fasta_new
                    copy_string = "cp " + trinity_fasta_new + " " + assemblydir
                    print copy_string
                    #s=subprocess.Popen(copy_string,shell=True)
                    #s.wait()
                else:
                    print "Trinity finished but don't have fixed version to copy."
Example #7
def execute(url_data, datadir):
    missing = []
    trimmed = []
    remaining = []
    for item in url_data.keys():
        organism = item[0].replace("'", "")
        org_seq_dir = datadir + organism + "/"
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            trimdir = newdir + "trim/"
            interleavedir = newdir + "interleave/"
            clusterfunc.check_dir(trimdir)
            clusterfunc.check_dir(interleavedir)
            file1 = newdir + sra + "_1.fastq"
            file2 = newdir + sra + "_2.fastq"
            #if os.path.isfile(file1) and os.path.isfile(file2):
            #    print file1
            #    print file2
            missing, trimmed, remaining = run_trimmomatic_TruSeq(missing, trimmed, remaining, trimdir, file1, file2, sra)
            #run_move_files(trimdir,sra)
            #check_files(trimdir,sra)
            #else:
            #    print "Files do not exist:",file1,file2
    print "Missing trimmed:",len(missing)
    print missing
    print "Trimmed:",len(trimmed)
    print "remaining:",len(remaining)
    print remaining
Example #8
def check_assemblies(url_data, assemblies, mmetsp_data):
    different = []
    for item in url_data:
        organism = item[0].replace("'", "")
        seqtype = item[1]
        mmetsp_id = item[2].replace("'", "")
        strain, organism_mmetsp, different, alt = get_strain(
            different, mmetsp_id, organism, mmetsp_data)
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            command_list = []
            sra = basename(urlparse(url).path)
            if alt == "blank":
                sample = organism + "_" + strain + "_" + sra + "_" + mmetsp_id
            else:
                sample = organism + "_" + strain + "_" + sra + "_" + mmetsp_id + "_alt_" + alt
            newdir = org_seq_dir + sra + "/"
            if sra in assemblies:
                trinitydir = newdir + "trinity/"
                #trinity_fasta = trinitydir+"trinity_out/"+"Trinity.fasta"
                trinity_fasta_new = trinitydir + sample + ".Trinity.fixed.fasta"
                #trinity_fixed_fasta = fix_fasta(trinity_fasta,trinitydir,sra)
                trinity_fasta_old = trinitydir + organism + "_" + sra + ".Trinity.fixed.fasta"
                assemblydir = "/mnt/scratch/ljcohen/mmetsp_assemblies/"
                if os.path.isfile(trinity_fasta_old) == True:
                    #trinity_fixed_fasta = fix_fasta(trinity_fasta,trinitydir,sra)
                    #copy_string="cp "+trinity_fasta_old+" "+trinity_fasta_new
                    copy_string = "cp " + trinity_fasta_new + " " + assemblydir
                    print copy_string
                    #s=subprocess.Popen(copy_string,shell=True)
                    #s.wait()
                else:
                    print "Trinity finished but don't have fixed version to copy."
Example #9
def execute(url_data, datadir):
    missing = []
    trimmed = []
    remaining = []
    for item in url_data.keys():
        organism = item[0].replace("'", "")
        org_seq_dir = datadir + organism + "/"
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            trimdir = newdir + "trim/"
            interleavedir = newdir + "interleave/"
            clusterfunc.check_dir(trimdir)
            clusterfunc.check_dir(interleavedir)
            file1 = newdir + sra + "_1.fastq"
            file2 = newdir + sra + "_2.fastq"
            #if os.path.isfile(file1) and os.path.isfile(file2):
            #    print file1
            #    print file2
            missing, trimmed, remaining = run_trimmomatic_TruSeq(missing, trimmed, remaining, trimdir, file1, file2, sra)
            #run_move_files(trimdir,sra)
            #check_files(trimdir,sra)
            #else:
            #    print "Files do not exist:",file1,file2
    print "Missing trimmed:",len(missing)
    print missing
    print "Trimmed:",len(trimmed)
    print "remaining:",len(remaining)
    print remaining
Example #10
def execute(data_frame,url_data,basedir):
	trinity_fail=[]
	count = 0
	# construct an empty pandas dataframe to add on each assembly.csv to
	for item in url_data.keys():
		#print item
		organism=item[0]
		sample="_".join(item)
		org_seq_dir=basedir+organism+"/"
		url_list=url_data[item]
		for url in url_list:
			sra=basename(urlparse(url).path)
			newdir=org_seq_dir+sra+"/"
			trimdir=newdir+"trim/"
			trinitydir=newdir+"trinity/"
			busco_dir=newdir+"busco/qsub_files/"
			clusterfunc.check_dir(busco_dir)
			trinity_fasta=trinitydir+sample+".Trinity.fixed.fasta"
			busco_file=busco_dir+"run_"+sample+".euk/short_summary_"+sample+".euk"
			print busco_file
			if os.path.isfile(busco_file):
				count+=1
				#run_busco(busco_dir,trinity_fasta,sample,sra)
				data=parse_busco_stats(busco_file,sample)
				data_frame=build_DataFrame(data_frame,data)
			else:
				print "Trinity failed:",trinity_fasta
				trinity_fail.append(newdir)	
	print "This is the number of Trinity de novo transcriptome assemblies:"
	print count
	print "This is the number of times Trinity failed:"
	print len(trinity_fail)
	print trinity_fail
	return data_frame
Example #11
def run_trimmomatic_TruSeq(missing, trimmed, remaining, trimdir, file1, file2,
                           sra):
    bash_filename = trimdir + sra + ".trim.TruSeq.sh"
    clusterfunc.check_dir(trimdir + "qsub_files/")
    listoffile = os.listdir(trimdir + "qsub_files/")
    # print listoffile
    trim_file = trimdir + "qsub_files/" "trim." + sra + ".log"
    # print trim_file
    matching = [s for s in listoffile if "trim." + sra + ".log" in s]
    matching_string = "TrimmomaticPE: Completed successfully"
    if os.path.isfile(trim_file):
        with open(trim_file) as f:
            content = f.readlines()
    if len(matching) != 0:
        trim_complete = [m for m in content if matching_string in m]
        if len(trim_complete) != 0:
            print "Already trimmed:", matching
            trimmed.append(sra)
        else:
            missing.append(trimdir)
            j = """
java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \\
-baseout {}.trim.fq \\
{} {} \\
ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \\
SLIDINGWINDOW:4:2 \\
LEADING:2 \\
TRAILING:2 \\
MINLEN:25 &> trim.{}.log
""".format(sra, file1, file2, sra)
            orphan_string = make_orphans(trimdir, sra)
            commands = [j, orphan_string]
            process_name = "trim"
            module_name_list = ""
            filename = sra
            clusterfunc.qsub_file(trimdir, process_name, module_name_list,
                                  filename, commands)
    else:
        remaining.append(trimdir)
        j = """
java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \\
-baseout {}.trim.fq \\
{} {} \\
ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \\
SLIDINGWINDOW:4:2 \\
LEADING:2 \\
TRAILING:2 \\
MINLEN:25 &> trim.{}.log
""".format(sra, file1, file2, sra)
        orphan_string = make_orphans(trimdir, sra)
        commands = [j, orphan_string]
        process_name = "trim"
        module_name_list = ""
        filename = sra
        clusterfunc.qsub_file(trimdir, process_name, module_name_list,
                              filename, commands)
    return missing, trimmed, remaining
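
For reference, with a hypothetical accession sra = "SRR0000000" and read files SRR0000000_1.fastq / SRR0000000_2.fastq, the template above formats into a single Trimmomatic command along these lines (jar and adapter paths are taken verbatim from the example):

java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \
-baseout SRR0000000.trim.fq \
SRR0000000_1.fastq SRR0000000_2.fastq \
ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \
SLIDINGWINDOW:4:2 \
LEADING:2 \
TRAILING:2 \
MINLEN:25 &> trim.SRR0000000.log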
Example #12
def execute(data_frame1,data_frame2,mmetsp_data,basedir,mmetsp_assemblies):
	trinity_fail=[]
	reference_filename = "blank"
	# construct an empty pandas dataframe to add on each assembly.csv to
	for item in mmetsp_data.keys():
		#print item
		organism=item[0]
		sample="_".join(item)
		org_seq_dir=basedir+organism+"/"
		mmetsp_list=mmetsp_data[item]
		for mmetsp in mmetsp_list:
			print mmetsp
			assemblyfileslist=os.listdir(mmetsp_assemblies)
			for filename in assemblyfileslist:
				if filename.startswith(mmetsp):
					if filename.endswith(".fixed.fa"):
						print "This is not the one you want."	
					else:
						print "MMETSP assembly found:",filename
						reference_filename=filename
			if reference_filename == "blank":
				print "No MMETSP file found:",mmetsp
				break 
			else:
				sra=item[1]
				newdir=org_seq_dir+sra+"/"
				trinitydir=newdir+"trinity/"
				transrate_dir=newdir+"transrate/"
				transrate_reference_dir=newdir+"transrate_dib_v_ncgr_cds/"
				clusterfunc.check_dir(transrate_reference_dir)
				transrate_reverse_dir=newdir+"transrate_ncgr_cds_v_dib/"
				clusterfunc.check_dir(transrate_reverse_dir)
				trinity_fasta=trinitydir+sample+".Trinity.fixed.fasta"
				if os.path.isfile(trinity_fasta):	
					print trinity_fasta
					fixed_mmetsp_ref=fix_fasta_reference(reference_filename,mmetsp_assemblies)
					transrate(transrate_reference_dir,sample,trinity_fasta,mmetsp_assemblies_dir,fixed_mmetsp_ref)
					transrate_reverse(transrate_reverse_dir,sample,trinity_fasta,mmetsp_assemblies_dir,fixed_mmetsp_ref)
				else:
					print "Trinity failed:",newdir
					trinity_fail.append(newdir)	
				transrate_assemblies_ref=transrate_reference_dir+sample+"/assemblies.csv"
				transrate_reverse_assemblies=transrate_reverse_dir+sample+"/assemblies.csv"
				print transrate_assemblies_ref
				print transrate_reverse_assemblies
				if os.path.isfile(transrate_assemblies_ref):
					data1=parse_transrate_stats(transrate_assemblies_ref)
					data_frame1=build_DataFrame(data_frame1,data1)
				if os.path.isfile(transrate_reverse_assemblies):
					data2=parse_transrate_stats(transrate_reverse_assemblies)
					data_frame2=build_DataFrame(data_frame2,data2)
	print "This is the number of times Trinity failed:"
	print len(trinity_fail)
	print trinity_fail
	return data_frame1,data_frame2
Example #13
def execute(url_data):
	for item in url_data.keys():
		organism=item[0]
		org_seq_dir=basedir+organism+"/"
		url_list=url_data[item]
		for url in url_list:
			sra=basename(urlparse(url).path)
			newdir=org_seq_dir+sra+"/"
			trinitydir=newdir+"trinity/trinity_out/"
			salmondir=newdir+"salmon/"
			clusterfunc.check_dir(salmondir)
Example #14
def run_trimmomatic_TruSeq(missing, trimmed, remaining, trimdir, file1, file2, sra):
    bash_filename = trimdir + sra + ".trim.TruSeq.sh"
    clusterfunc.check_dir(trimdir + "qsub_files/")
    listoffile = os.listdir(trimdir + "qsub_files/")
    # print listoffile
    trim_file = trimdir + "qsub_files/" + "trim." + sra + ".log"
    # print trim_file
    matching = [s for s in listoffile if "trim." + sra + ".log" in s]
    matching_string = "TrimmomaticPE: Completed successfully"
    if os.path.isfile(trim_file):
        with open(trim_file) as f:
            content = f.readlines()
    if len(matching) != 0:
        trim_complete = [m for m in content if matching_string in m]
        if len(trim_complete) != 0:
            print "Already trimmed:", matching
            trimmed.append(sra)
        else:
            missing.append(trimdir)
            j = """
java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \\
-baseout {}.trim.fq \\
{} {} \\
ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \\
SLIDINGWINDOW:4:2 \\
LEADING:2 \\
TRAILING:2 \\
MINLEN:25 &> trim.{}.log
""".format(sra, file1, file2, sra)
            orphan_string = make_orphans(trimdir, sra)
            commands = [j, orphan_string]
            process_name = "trim"
            module_name_list = ""
            filename = sra
            clusterfunc.qsub_file(trimdir, process_name, module_name_list, filename, commands)
    else:
        remaining.append(trimdir)
        j = """
java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \\
-baseout {}.trim.fq \\
{} {} \\
ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \\
SLIDINGWINDOW:4:2 \\
LEADING:2 \\
TRAILING:2 \\
MINLEN:25 &> trim.{}.log
""".format(sra, file1, file2, sra)
        orphan_string = make_orphans(trimdir, sra)
        commands = [j, orphan_string]
        process_name = "trim"
        module_name_list = ""
        filename = sra
        clusterfunc.qsub_file(trimdir, process_name, module_name_list, filename, commands)
    return missing, trimmed, remaining
Example #15
def execute(assemblydirs,salmondir,assemblydir,basedir,trimdir):
	for genus_species_names in assemblydirs:
		genus_species = genus_species_names.split(".")[0]
		species = genus_species+genus_species_names.split(".")[1]
		print genus_species
		print species
		dirname=trimdir+genus_species+"/"
		newdir=salmondir+genus_species_names+"/"
		clusterfunc.check_dir(newdir)
		
		trinity_fasta=assemblydir+genus_species_names+"/"+genus_species_names+".Trinity.fixed.fa"
		quant_salmon(newdir,dirname,genus_species_names,trinity_fasta,species)
Example #16
def execute(url_data):
	for item in url_data.keys():
		organism=item[0]
		org_seq_dir=basedir+organism+"/"
		url_list=url_data[item]
		for url in url_list:
			sra=basename(urlparse(url).path)
			newdir=org_seq_dir+sra+"/"
			trinitydir=newdir+"trinity/trinity_out/"
			clusterfunc.check_dir(trinitydir)
			trinity_fasta=trinitydir+"Trinity.fasta"
			fixed_trinity=fix_fasta(trinity_fasta,trinitydir,sra)
Example #17
def execute(url_data):
        for item in url_data.keys():
                organism=item[0]
                org_seq_dir=basedir+organism+"/"
                url_list=url_data[item]
                for url in url_list:
                        sra=basename(urlparse(url).path)
                        newdir=org_seq_dir+sra+"/"
                        trinitydir=newdir+"trinity/trinity_out/"
                        salmondir=newdir+"salmon/"
                        clusterfunc.check_dir(salmondir)
                        trinity_fasta=trinitydir+"Trinity.fasta"
                        quant_salmon(salmondir,sra,newdir,trinity_fasta)
Example #18
def execute(url_data):
    for item in url_data.keys():
        organism = item[0]
        org_seq_dir = basedir + organism + "/"
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            salmondir = newdir + "salmon/"
            rapclustdir = newdir + "rapclust/"
            clusterfunc.check_dir(rapclustdir)
            clusterfunc.check_dir(salmondir)
            run_rap_clust(salmondir, rapclustdir, sra)
Example #19
def execute(url_data):
    for item in url_data.keys():
        organism = item[0]
        org_seq_dir = basedir + organism + "/"
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            salmondir = newdir + "salmon/"
            rapclustdir = newdir + "rapclust/"
            clusterfunc.check_dir(rapclustdir)
            clusterfunc.check_dir(salmondir)
            run_rap_clust(salmondir, rapclustdir, sra)
Example #20
def check_sra(url_data, no_files):
    for item in url_data:
        organism = item[0].replace("'", "")
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            command_list = []
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            if sra in no_files:
                if os.path.isdir(newdir):
                    print "Directory exists:", sra
                    if os.path.isfile(sra):
                        print "Exists:", sra
                    else:
                        print "Missing:", newdir
                        clusterfunc.check_dir(newdir)
                        print url
                        filestring = newdir + sra
                        if os.path.isfile(filestring):
                            print "file exists:", filestring
                        else:
                            urlstring = download(url, newdir, sra)
                            command_list.append(urlstring)
                        if glob.glob(newdir + "*.fastq"):
                            print "SRA has already been extracted", filestring
                        else:
                            sra_string = sra_extract(newdir, sra)
                            command_list.append(sra_string)
                        names = "download_extract"
                        print command_list
                        if len(command_list) >= 1:
                            send_to_cluster(newdir, command_list, sra, names)
                        else:
                            print "Pipeline already run."
                            fastqcdir = newdir + "fastqc/"
                            clusterfunc.check_dir(fastqcdir)
                            fastqc(newdir, fastqcdir, sra)
                            trimdir = newdir + "trim/"
                            interleavedir = newdir + "interleave/"
                            clusterfunc.check_dir(trimdir)
                            clusterfunc.check_dir(interleavedir)
                            file1 = newdir + sra + "_1.fastq"
                            file2 = newdir + sra + "_2.fastq"
                            if os.path.isfile(file1) and os.path.isfile(file2):
                                print file1
                                print file2
                                run_trimmomatic_TruSeq(trimdir, file1, file2,
                                                       sra)
Example #21
def check_sra(url_data, no_files):
    for item in url_data:
        organism = item[0].replace("'", "")
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            command_list = []
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            if sra in no_files:
                if os.path.isdir(newdir):
                    print "Directory exists:", sra
                    if os.path.isfile(sra):
                        print "Exists:", sra
                    else:
                        print "Missing:", newdir
                        clusterfunc.check_dir(newdir)
                        print url
                        filestring = newdir + sra
                        if os.path.isfile(filestring):
                            print "file exists:", filestring
                        else:
                            urlstring = download(url, newdir, sra)
                            command_list.append(urlstring)
                        if glob.glob(newdir + "*.fastq"):
                            print "SRA has already been extracted", filestring
                        else:
                            sra_string = sra_extract(newdir, sra)
                            command_list.append(sra_string)
                        names = "download_extract"
                        print command_list
                        if len(command_list) >= 1:
                            send_to_cluster(newdir, command_list, sra, names)
                        else:
                            print "Pipeline already run."
                            fastqcdir = newdir + "fastqc/"
                            clusterfunc.check_dir(fastqcdir)
                            fastqc(newdir, fastqcdir, sra)
                            trimdir = newdir + "trim/"
                            interleavedir = newdir + "interleave/"
                            clusterfunc.check_dir(trimdir)
                            clusterfunc.check_dir(interleavedir)
                            file1 = newdir + sra + "_1.fastq"
                            file2 = newdir + sra + "_2.fastq"
                            if os.path.isfile(file1) and os.path.isfile(file2):
                                print file1
                                print file2
                                run_trimmomatic_TruSeq(trimdir, file1, file2, sra)
Example #22
def execute(trinity_fail, count, basedir):
    id_list = os.listdir(basedir)
    for mmetsp in id_list:
        if mmetsp != "qsub_files":
            mmetspdir = basedir + mmetsp + "/"
            trinitydir = basedir + mmetsp + "/" + "trinity/"
            trinity_files = os.listdir(mmetspdir)
            trinity_fasta = trinitydir + "trinity_out_2.2.0.Trinity.fasta"
            #trinity_fasta = trinitydir + sample + ".Trinity.fixed.fasta"
            clusterfunc.check_dir(trinitydir)
            if os.path.isfile(trinity_fasta) == False:
                if os.path.isfile("/mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/" + mmetsp + ".trinity_out_2.2.0.Trinity.fasta"):
                    print "Trinity finished."
                    count += 1
                else:
                    print mmetspdir
                    right = [s for s in trinity_files if s.endswith(".right.fq")][0]
                    left = [s for s in trinity_files if s.endswith(".left.fq")][0]
                    right = mmetspdir + right
                    left = mmetspdir + left
                    if os.path.isfile(left) and os.path.isfile(right):
                        right = [s for s in trinity_files if s.endswith(".right.fq")][0]
                        left = [s for s in trinity_files if s.endswith(".left.fq")][0]
                        right = mmetspdir + right
                        left = mmetspdir + left
                        run_trinity(trinitydir, left, right, mmetsp)
                        #print "Trinity failed:", trinity_fasta
                        #trinity_fail.append(trinitydir)
                    else:
                        print "No files:", left
            else:
                print "Trinity completed successfully.", trinity_fasta
                count += 1
                assemblydir = "/mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/"
                copy_string = "cp " + trinity_fasta + " " + assemblydir + mmetsp + ".trinity_out_2.2.0.Trinity.fasta"
                print copy_string
                s = subprocess.Popen(copy_string, shell=True)
                s.wait()
                # trinity_out=fix_fasta(trinity_fasta,trinitydir,sample)
                # print "Needs to be fixed:",trinity_fasta
                # print trinity_out
                #"Re-run diginorm:",diginormfile
            #count = check_trinity(newdir,SRA,count)
    print "Number of Trinity de novo transcriptome assemblies:"
    print count
    print "Number of times Trinity failed:"
    print len(trinity_fail)
    print trinity_fail
    return trinity_fail, count
Example #23
def get_no_files(url_data):
    assemblies = []
    trinity_fail = []
    empty_files = []
    no_files = []
    for item in url_data.keys():
        organism = item[0].replace("'", "")
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            filename = newdir + sra
            # check if trinity exists
            trinitydir = newdir + "trinity/"
            left = trinitydir + sra + ".left.fq"
            right = trinitydir + sra + ".right.fq"
            if os.path.isfile(left):
                empty_files = check_empty(empty_files, left, sra)
                if os.path.isfile(right):
                    empty_files = check_empty(empty_files, right, sra)
                    trinity_outputdir = trinitydir + "trinity_out/"
                    #trinity_file = trinity_outputdir + "Trinity.fasta"
                    trinity_file = trinitydir + organism + "_" + sra + ".Trinity.fixed.fasta"
                    trinity_fail, assemblies = check_trinity(
                        assemblies, trinity_fail, trinity_file, sra)
                else:
                    print "Missing right:", right
                    if sra not in trinity_fail:
                        no_files.append(sra)
            else:
                print "Missing left:", left
                if sra not in no_files:
                    if sra not in trinity_fail:
                        no_files.append(sra)
    print "Empty files:"
    print empty_files
    print len(empty_files)
    print "Trinity needs to be run again:"
    print trinity_fail
    print len(trinity_fail)
    print "Pipeline needs to be run again:"
    print no_files
    print len(no_files)
    print "Assemblies:"
    print len(assemblies)
    return trinity_fail, assemblies
Example #24
def get_no_files(url_data):
    assemblies = []
    trinity_fail = []
    empty_files = []
    no_files = []
    for item in url_data.keys():
        organism = item[0].replace("'","")
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            filename = newdir + sra
            # check if trinity exists
            trinitydir = newdir + "trinity/"
            left = trinitydir + sra + ".left.fq"
            right = trinitydir + sra + ".right.fq"
            if os.path.isfile(left):
                empty_files = check_empty(empty_files, left, sra)
                if os.path.isfile(right):
                    empty_files = check_empty(empty_files, right, sra)
                    trinity_outputdir = trinitydir + "trinity_out/"
                    #trinity_file = trinity_outputdir + "Trinity.fasta"
                    trinity_file = trinitydir + organism + "_" + sra + ".Trinity.fixed.fasta"
                    trinity_fail, assemblies = check_trinity(assemblies, trinity_fail, trinity_file, sra)
                else:
                    print "Missing right:", right
                    if sra not in trinity_fail:
                        no_files.append(sra)
            else:
                print "Missing left:", left
                if sra not in no_files:
                    if sra not in trinity_fail:
                        no_files.append(sra)
    print "Empty files:"
    print empty_files
    print len(empty_files)
    print "Trinity needs to be run again:"
    print trinity_fail
    print len(trinity_fail)
    print "Pipeline needs to be run again:"
    print no_files
    print len(no_files)
    print "Assemblies:"
    print len(assemblies)
    return trinity_fail,assemblies
Example #25
def execute(basedir, newdir, url_data):
    for item in url_data:
        organism = item[0].replace("'", "")
        org_seq_dir = basedir + organism + "/"
        mmetsp = item[2]
        if mmetsp.endswith("_2"):
            mmetsp = mmetsp.split("_")[0]
        sra = item[1]
        newdir_sra = org_seq_dir + sra + "/"
        sra_transrate_1 = newdir_sra + "transrate_dib_v_ncgr_cds/"
        sra_transrate_2 = newdir_sra + "transrate_ncgr_cds_v_dib/"
        sra_transrate = newdir_sra + "transrate/"
        sra_trim = newdir_sra + "trim/"
        sra_trim_1P = sra_trim + sra + ".trim_1P.fq"
        sra_trim_2P = sra_trim + sra + ".trim_2P.fq"
        sra_busco = newdir_sra + "busco/"

        newdir_mmetsp = newdir + mmetsp + "/"
        newdir_mmetsp_sra = newdir_mmetsp + "sra/"
        newdir_mmetsp_sra_transrate = newdir_mmetsp_sra + "transrate/"
        newdir_mmetsp_sra_trim = newdir_mmetsp_sra + "trim/"
        newdir_mmetsp_sra_busco = newdir_mmetsp_sra + "busco/"
        newdir_mmetsp_sra_fastqc = newdir_mmetsp_sra + "fastqc_raw/"
        clusterfunc.check_dir(newdir_mmetsp_sra)
        clusterfunc.check_dir(newdir_mmetsp_sra_transrate)
        clusterfunc.check_dir(newdir_mmetsp_sra_trim)
        clusterfunc.check_dir(newdir_mmetsp_sra_busco)
        clusterfunc.check_dir(newdir_mmetsp_sra_fastqc)

        if os.path.isdir(newdir_sra):
            print "Exists:", newdir_sra
        else:
            print "Missing:", newdir_sra
        if os.path.isdir(newdir_mmetsp):
            print "Exists:", newdir_mmetsp
        else:
            print "Missing:", newdir_mmetsp

# copy_transrate1 =

#copy_transrate2 =

#copy_transrate_scores =

#copy_trim_reads =
        ged_trim = "/mnt/research/ged/data/mmetsp/trimmed_reads/"
        copy_file(sra_trim_1P, ged_trim)
        copy_file(sra_trim_2P, ged_trim)
Example #26
def execute(listoffiles, trimdir, interleavedir, diginormdir, assemblydir):
    files_dictionary = get_files(listoffiles, trimdir)
    for sample in files_dictionary.keys():
        fileslist = sorted(files_dictionary[sample])
        matching = [s for s in fileslist if "orphans" in s]
    interleavefileslist = os.listdir(interleavedir)
    interleave_files = get_samples(interleavefileslist, interleavedir)
    print interleave_files
    for sample in interleave_files:
        orphansfile = combine_orphans(trimdir, sample)
        sample_diginormdir = diginormdir + sample + "/"
        clusterfunc.check_dir(sample_diginormdir)
        #run_diginorm(trimdir,sample_diginormdir,orphansfile,interleavedir,sample)
        #run_filt_abund(sample_diginormdir,sample)
        #run_extract_paired(sample_diginormdir,sample)
        combine_orphans_after_diginorm(sample_diginormdir, sample)
Example #27
def execute(url_data,basedir):
	for item in url_data.keys():
		organism=item[0]
		org_seq_dir=basedir+organism+"/"
		url_list=url_data[item]
		for url in url_list:
			sra=basename(urlparse(url).path)
			newdir=org_seq_dir+sra+"/"
			trinitydir=newdir+"trinity/trinity_out/"
			dammitdir=newdir+"dammit_dir/"
			clusterfunc.check_dir(dammitdir)
			trinity_fasta=trinitydir+"Trinity.fasta"
			print trinity_fasta
			if os.path.isfile(trinity_fasta):
				print "File exists:",trinity_fasta
				dammit_string(basedir,dammitdir,sra,trinity_fasta)
Example #28
def execute(data_frame,species,basedir):
	# construct an empty pandas dataframe to add on each assembly.csv to
	newdir=basedir+species+"/"
	trinitydir=newdir+"trinity_out/"
	busco_dir=newdir+"busco/"
	clusterfunc.check_dir(busco_dir)
	trinity_fasta=trinitydir+"Trinity.fasta"
	busco_file=busco_dir+"qsub_files/"+"run_"+species+".metazoa/short_summary_"+species+".metazoa"
	if os.path.isfile(trinity_fasta):
		fixed_trinity_fasta=fix_fasta(trinity_fasta,trinitydir,species)
		#run_busco(busco_dir,fixed_trinity_fasta,species)
		data=parse_busco_stats(busco_file,species)
		data_frame=build_DataFrame(data_frame,data)
	else:
		print "Trinity failed:",trinity_fasta
	return data_frame
Example #29
def execute(url_data, basedir):
    for item in url_data.keys():
        organism = item[0]
        org_seq_dir = basedir + organism + "/"
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            trinitydir = newdir + "trinity/trinity_out/"
            dammitdir = newdir + "dammit_dir/"
            clusterfunc.check_dir(dammitdir)
            trinity_fasta = trinitydir + "Trinity.fasta"
            print trinity_fasta
            if os.path.isfile(trinity_fasta):
                print "File exists:", trinity_fasta
                dammit_string(basedir, dammitdir, sra, trinity_fasta)
Example #30
def group_assembly_files(diginormdir, assemblydir):
    abund_filt_files = get_abund_filt_files(diginormdir)
    genus_species_files = get_genus_species(abund_filt_files)
    for genus_species in genus_species_files:
        genus_species_dir = assemblydir + genus_species + "/"
        clusterfunc.check_dir(genus_species_dir)
        for filename in genus_species_files[genus_species]:
            abund_filt_filename = diginormdir + filename
            sample_info = filename.split("_")
            extension = sample_info[-1].split(".")
            sample = "_".join(sample_info[:-1])
            sample = sample + "_" + extension[0] + "_" + extension[2]
            rename_command = get_rename(genus_species_dir, sample, abund_filt_filename)
            process_name = "split"
            rename_command = [rename_command]
            module_name_list = ""
            filename = sample
Example #31
def execute(trinity_fail, count, basedir):
    id_list = os.listdir(basedir)
    for mmetsp in id_list:
        if mmetsp != "qsub_files":
            mmetspdir = basedir + mmetsp + "/"
            trinitydir = basedir + mmetsp + "/" + "trinity/"
            trinity_files = os.listdir(mmetspdir)
            trinity_fasta = trinitydir + dib_conf.output_extension
            clusterfunc.check_dir(trinitydir)
            if os.path.isfile(trinity_fasta) == False:
                if os.path.isfile(dib_conf.output_dir + mmetsp +
                                  dib_conf.output_extension):
                    print("Trinity finished.")
                    count += 1
                else:
                    print(mmetspdir)
                    right = [
                        s for s in trinity_files if s.endswith(".right.fq")
                    ][0]
                    left = [
                        s for s in trinity_files if s.endswith(".left.fq")
                    ][0]
                    right = mmetspdir + right
                    left = mmetspdir + left
                    if os.path.isfile(left) and os.path.isfile(right):
                        right = [
                            s for s in trinity_files if s.endswith(".right.fq")
                        ][0]
                        left = [
                            s for s in trinity_files if s.endswith(".left.fq")
                        ][0]
                        right = mmetspdir + right
                        left = mmetspdir + left
                        run_trinity(trinitydir, left, right, mmetsp,
                                    dib_conf.output_dir,
                                    dib_conf.output_extension)
                    else:
                        print("No files:", left)
            else:
                print("Trinity completed successfully.", trinity_fasta)
                count += 1
                assemblydir = dib_conf.output_dir
    print("Number of Trinity de novo transcriptome assemblies:", count)
    print("Number of times Trinity failed:", len(trinity_fail), trinity_fail)
    return trinity_fail, count
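
This version reads its output locations from a dib_conf module instead of the hard-coded paths used in the earlier examples. That module is not shown in this listing; a minimal stand-in, assuming only the two attributes the code actually touches (the real values are not given here), might look like:

# dib_conf.py -- hypothetical configuration stand-in, not the project's actual module
output_dir = "/path/to/assemblies/"        # directory where finished assemblies are collected
output_extension = "Trinity.fasta"         # suffix used to name/locate the Trinity output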
Example #32
def execute(basedir, url_data):
    for item in url_data.keys():
        organism = item[0]
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            SRA = basename(urlparse(url).path)
            newdir = org_seq_dir + SRA + "/"
            interleavedir = newdir + "interleave/"
            diginormdir = newdir + "diginorm/"
            clusterfunc.check_dir(diginormdir)
            trimdir = newdir + "trim/"
            #run_streaming_diginorm(trimdir,SRA,diginormdir)
            #interleave_reads(trimdir,SRA,interleavedir)
            #run_diginorm(diginormdir,interleavedir,trimdir,SRA)
            run_filter_abund(diginormdir, SRA)
Example #33
def execute(basedir, url_data):
    for item in url_data.keys():
        organism = item[0]
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            SRA = basename(urlparse(url).path)
            newdir = org_seq_dir + SRA + "/"
            interleavedir = newdir + "interleave/"
            diginormdir = newdir + "diginorm/"
            clusterfunc.check_dir(diginormdir)
            trimdir = newdir + "trim/"
            # run_streaming_diginorm(trimdir,SRA,diginormdir)
            # interleave_reads(trimdir,SRA,interleavedir)
            # run_diginorm(diginormdir,interleavedir,trimdir,SRA)
            run_filter_abund(diginormdir, SRA)
Example #34
def get_num_files(filesdir, workingdir):
    listoffiles = os.listdir(workingdir)
    diamond_dir = workingdir + "diamond/"
    clusterfunc.check_dir(diamond_dir)
    species_filenames = []
    for filename in listoffiles:
        if filename.startswith("Species"):
            if filename.endswith(".fa"):
                species_filenames.append(filename)
    print species_filenames
    species_num = len(species_filenames)
    print "This is the num of species:", species_num
    split_num = 4000
    print split_num
    i = 0
    count = 0
    #block_list = [species_filenames[x:x+split_num] for x in range(0,len(species_filenames),split_num)]
    process_string = []
    db_files = os.listdir(diamond_dir)
    print len(db_files)
    db_out_dir = workingdir + "diamond_out/"
    clusterfunc.check_dir(db_out_dir)
    for species_filename in species_filenames:
        print species_filename
        for db in db_files:
            if db.endswith(".dmnd"):
                # print i
                # db_filename=os.path.splitext(species_filename)[0]
                db_filename = db
                ortho_command = run_diamond_loop(diamond_dir, workingdir,
                                                 db_out_dir, species_filename,
                                                 db_filename)
                process_string.append(ortho_command)
                i += 1
            else:
                i = 0
                basedir = db_out_dir
                process_name = "OrthoFinder"
                module_name_list = ["GNU/4.4.5", "diamond/0.7.9"]
                filename = "Group_" + str(count)
                # clusterfunc.qsub_file(basedir,process_name,module_name_list,filename,process_string)
                process_string = []
                count += 1
Example #35
def get_num_files(filesdir, workingdir):
    listoffiles = os.listdir(workingdir)
    diamond_dir = workingdir + "diamond/"
    clusterfunc.check_dir(diamond_dir)
    species_filenames = []
    for filename in listoffiles:
        if filename.startswith("Species"):
            if filename.endswith(".fa"):
                species_filenames.append(filename)
    print species_filenames
    species_num = len(species_filenames)
    print "This is the num of species:", species_num
    split_num = 4000
    print split_num
    i = 0
    count = 0
    #block_list = [species_filenames[x:x+split_num] for x in range(0,len(species_filenames),split_num)]
    process_string = []
    db_files = os.listdir(diamond_dir)
    print len(db_files)
    db_out_dir = workingdir + "diamond_out/"
    clusterfunc.check_dir(db_out_dir)
    for species_filename in species_filenames:
        print species_filename
        for db in db_files:
            if db.endswith(".dmnd"):
                # print i
                # db_filename=os.path.splitext(species_filename)[0]
                db_filename = db
                ortho_command = run_diamond_loop(
                    diamond_dir, workingdir, db_out_dir, species_filename, db_filename)
                process_string.append(ortho_command)
                i += 1
            else:
                i = 0
                basedir = db_out_dir
                process_name = "OrthoFinder"
                module_name_list = ["GNU/4.4.5", "diamond/0.7.9"]
                filename = "Group_" + str(count)
                # clusterfunc.qsub_file(basedir,process_name,module_name_list,filename,process_string)
                process_string = []
                count += 1
Example #36
def execute(data_frame, url_data, basedir):
    trinity_fail = []
    count = 0
    # construct an empty pandas dataframe to add on each assembly.csv to
    for item in url_data.keys():
        # print item
        organism = item[0].replace("'","")
        sample = "_".join(item).replace("'","")
        org_seq_dir = basedir + organism + "/"
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            trimdir = newdir + "trim/"
            #trinitydir = newdir + "trinity/trinity_out/"
            trinitydir = newdir + "trinity/"
            transrate_dir = newdir + "transrate/"
            clusterfunc.check_dir(transrate_dir)
            trinity_fasta = trinitydir + organism + "_" + sra + ".Trinity.fixed.fasta"
            transrate_out = transrate_dir + "transrate_out." + sample + "/"
            transrate_assemblies = transrate_out + "assemblies.csv"
            if os.path.isfile(trinity_fasta):
                # print transrate_out
                count += 1
                # fixed_trinity=fix_fasta(trinity_fasta,trinitydir,sample)
                if os.path.isfile(transrate_assemblies):
                    data = parse_transrate_stats(transrate_assemblies)
                    data_frame = build_DataFrame(data_frame, data)
                else:
                    print "Transrate still needs to run:", transrate_assemblies
                    transrate(trinitydir, transrate_dir, transrate_out, trinity_fasta, sample, trimdir, sra)
                    transrate_assemblies = transrate_out + "assemblies.csv"
            else:
                print "Trinity failed:", trinity_fasta
                trinity_fail.append(newdir)
    print "This is the number of Trinity de novo transcriptome assemblies:"
    print count
    print "This is the number of times Trinity failed:"
    print len(trinity_fail)
    print trinity_fail
    return data_frame
Example #37
def execute(url_data):
    sample_dictionary = {}
    missing = []
    trimmed = []
    for item in url_data:
        organism = item[0].replace("'", "")
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            command_list = []
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            trimdir = newdir + "trim/qsub_files/"
            listoffile = os.listdir(trimdir)
            #print listoffile
            trim_file = trimdir + "trim." + sra + ".log"
            #print trim_file
            matching = [s for s in listoffile if "trim." + sra + ".log" in s]
            matching_string = "TrimmomaticPE: Completed successfully"
            if os.path.isfile(trim_file):
                with open(trim_file) as f:
                    content = f.readlines()
            if len(matching) != 0:
                trim_complete = [m for m in content if matching_string in m]
                if len(trim_complete) != 0:
                    print "Already trimmed:", matching
                    sample_dictionary = get_sample_dictionary(sample_dictionary, trim_file, sra)
                    trimmed.append(sra)
                else:
                    missing.append(trimdir)
                    print "Missing:", trimdir
    #print sample_dictionary
    trim_table(sample_dictionary)
    print "Missing trimmed:", len(missing)
    print missing
    print "Trimmed:", len(trimmed)
    print "Out of"
    print len(url_data.keys())
    return missing
Example #38
def combine_files(files,basedir,combine_dir):
	clusterfunc.check_dir(combine_dir)
	for species in files.keys():
		fields=files[species][0].split("_")
		extension=fields[-1]
		parsed_extension1=extension.split(".")
		parsed_extension2=parsed_extension1[1:]
		new_extension=".".join(parsed_extension2)
		newfilename=get_newfilename(fields,new_extension)
		print species
		print files[species]
		#print newfilename
		newfilename=combine_dir+newfilename
		files_string=" ".join(files[species])
		combine_string="cat "+files_string+" > "+newfilename
		print combine_string
		workingdir=os.getcwd()
		os.chdir(basedir)
		#s=subprocess.Popen(combine_string,shell=True)
		#s.wait()
		os.chdir(workingdir)
Example #39
def execute(url_data):
    sample_dictionary = {}
    for item in url_data:
        organism = item[0].replace("'", "")
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            command_list = []
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            trimdir = newdir + "trim/qsub_files/"
            trim_out_file = trimdir + "trim." + sra + ".log"
            if os.path.isfile(trim_out_file):
                print trim_out_file
                sample_dictionary = get_sample_dictionary(sample_dictionary, trim_out_file, sra)
            else:
                print "No trim out log available:", trim_out_file
    print sample_dictionary
    trim_table(sample_dictionary)
Example #40
def check_sra(url_data, missing):
    num_download = []
    for item in url_data:
        organism = item[0].replace("'", "")
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            command_list = []
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            trimdir = newdir + "trim/qsub_files/"
            if trimdir in missing:
                if os.path.isdir(newdir):
                    print "Directory exists:", sra
                    if os.path.isfile(sra):
                        print "Exists:", sra
                    else:
                        num_download.append(newdir)
                        print "Missing:", newdir
                        clusterfunc.check_dir(newdir)
                        print url
                        filestring = newdir + sra
                        if os.path.isfile(filestring):
                            print "file exists:", filestring
                        else:
                            urlstring = download(url, newdir, sra)
                            command_list.append(urlstring)
                        if glob.glob(newdir + "*.fastq"):
                            print "SRA has already been extracted", filestring
                        else:
                            sra_string = sra_extract(newdir, sra)
                            command_list.append(sra_string)
                        names = "download_extract"
                        print command_list
                        if len(command_list) >= 1:
                            send_to_cluster(newdir, command_list, sra, names)
    print "Num to download:", len(num_download)
    print num_download
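
download and sra_extract are used here only for the command strings they return, which are batched into command_list and submitted via send_to_cluster. Their bodies are not part of this listing; a plausible sketch of download under that assumption (the wget-based approach and the exact argument handling are guesses, not the project's code):

def download(url, newdir, sra):
    # Build, but do not run, a shell command that fetches the SRA file into newdir;
    # the caller submits it to the cluster alongside the extraction step.
    return "wget -O " + newdir + sra + " " + url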
Example #41
def execute(basedir, url_data):
    for item in url_data.keys():
        # Creates directory for each file to be downloaded
        # Directory will be located according to organism and read type (single
        # or paired)
        organism = item[0]
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        print org_seq_dir
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            filename = basename(urlparse(url).path)
            print filename
            newdir = org_seq_dir + filename + "/"
            full_filename = newdir + filename
            clusterfunc.check_dir(newdir)
            fastqcdir = newdir + "fastqc/"
            clusterfunc.check_dir(fastqcdir)
            # check to see if filename exists in newdir
            if filename in os.listdir(newdir):
                print "sra exists:", filename
                if os.stat(full_filename).st_size == 0:
                    print "SRA file is empty:", filename
                    os.remove(full_filename)
            else:
                print "file will be downloaded:", filename
                download(url, newdir, filename)
            sra_extract(newdir, filename)
    fastqc(newdir, fastqcdir, filename)
Example #42
def execute(basedir, url_data):
    for item in url_data.keys():
        # Creates directory for each file to be downloaded
        # Directory will be located according to organism and read type (single
        # or paired)
        organism = item[0]
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        print org_seq_dir
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            filename = basename(urlparse(url).path)
            print filename
            newdir = org_seq_dir + filename + "/"
            full_filename = newdir + filename
            clusterfunc.check_dir(newdir)
            fastqcdir = newdir + "fastqc/"
            clusterfunc.check_dir(fastqcdir)
            # check to see if filename exists in newdir
            if filename in os.listdir(newdir):
                print "sra exists:", filename
                if os.stat(full_filename).st_size == 0:
                    print "SRA file is empty:", filename
                    os.remove(full_filename)
            else:
                print "file will be downloaded:", filename
                download(url, newdir, filename)
            sra_extract(newdir, filename)
    fastqc(newdir, fastqcdir, filename)
Example #43
def execute(data_frame, url_data, basedir):
    trinity_fail = []
    count = 0
    # construct an empty pandas dataframe to add on each assembly.csv to
    for item in url_data.keys():
        #print item
        organism = item[0]
        sample = "_".join(item)
        org_seq_dir = basedir + organism + "/"
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            trimdir = newdir + "trim/"
            trinitydir = newdir + "trinity/trinity_out/"
            transrate_dir = newdir + "transrate/"
            clusterfunc.check_dir(transrate_dir)
            trinity_fasta = trinitydir + "Trinity.fasta"
            transrate_out = transrate_dir + "transrate_out." + sample + "/"
            if os.path.isfile(trinity_fasta):
                #transrate(dammit_dir)
                #print transrate_out
                count += 1
                #fixed_trinity=fix_fasta(trinity_fasta,trinitydir,sample)
                #transrate(trinitydir,transrate_dir,transrate_out,trinity_fasta,sample,trimdir,sra)
                transrate_assemblies = transrate_out + "assemblies.csv"
                if os.path.isfile(transrate_assemblies):
                    data = parse_transrate_stats(transrate_assemblies)
                    data_frame = build_DataFrame(data_frame, data)
                else:
                    print "Transrate did not complete:", transrate_assemblies
            else:
                print "Trinity failed:", trinity_fasta
                trinity_fail.append(newdir)
    print "This is the number of Trinity de novo transcriptome assemblies:"
    print count
    print "This is the number of times Trinity failed:"
    print len(trinity_fail)
    print trinity_fail
    return data_frame
Example #44
def execute(trinity_fail, count, basedir):
    id_list = os.listdir(basedir)
    for mmetsp in id_list:
        if mmetsp != "qsub_files":
            mmetspdir = basedir + mmetsp + "/"
            trinitydir = basedir + mmetsp + "/" + "trinity/"
            trinity_files = os.listdir(mmetspdir)
            trinity_fasta = trinitydir + "trinity_out_2.2.0.Trinity.fasta"
            #trinity_fasta = trinitydir + sample + ".Trinity.fixed.fasta"
            clusterfunc.check_dir(trinitydir)
            if os.path.isfile(trinity_fasta) == False:
                # if the assembly was already copied to the shared directory, count it as finished
                if os.path.isfile("/mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/" + mmetsp + ".trinity_out_2.2.0.Trinity.fasta"):
                    print("Trinity finished.")
                    count += 1
                else:
                    print(mmetspdir)
                    right = [s for s in trinity_files if s.endswith(".right.fq")][0]
                    left = [s for s in trinity_files if s.endswith(".left.fq")][0]
                    right = mmetspdir + right
                    left = mmetspdir + left
                    if os.path.isfile(left) and os.path.isfile(right):
                        run_trinity(trinitydir, left, right, mmetsp)
                    else:
                        print("No files:", left)
            else:
                print("Trinity completed successfully.", trinity_fasta)
                count += 1
                assemblydir = "/mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/"
                copy_string = "cp " + trinity_fasta + " " + assemblydir + mmetsp + ".trinity_out_2.2.0.Trinity.fasta"
                print(copy_string)
                s = subprocess.Popen(copy_string, shell=True)
                s.wait()
    print("Number of Trinity de novo transcriptome assemblies:", count)
    print("Number of times Trinity failed:", len(trinity_fail), trinity_fail)
Beispiel #45
0
def execute(url_data):
	trinity_fail=[]
	empty_files=[]
	for item in url_data.keys():
        	organism=item[0]
        	seqtype=item[1]
        	org_seq_dir=basedir+organism+"/"
		clusterfunc.check_dir(org_seq_dir)
        	url_list=url_data[item]
        	for url in url_list:
            		sra=basename(urlparse(url).path)
            		newdir=org_seq_dir+sra+"/"
			filename=newdir+sra
			## check if trinity exists
			trinitydir=newdir+"trinity/"
			left=trinitydir+"left.fq"
			right=trinitydir+"right.fq"
			if os.stat(left).st_size == 0:
				print "File is empty:",left
				if sra not in empty_files:
					empty_files.append(sra)
			if os.stat(right).st_size == 0:
				print "File is empty:",right
				if sra not in empty_files:
					empty_files.append(sra)
			trinity_outputdir=trinitydir+"trinity_out/"
			trinity_file=trinity_outputdir+"Trinity.fasta"
			if os.path.isfile(trinity_file):
				print "Trinity completed successfully:",trinity_file
			else:
				print "Trinity needs to be run again:",filename
				trinity_fail.append(sra)
			diginormdir=newdir+"diginorm/"
			trimdir=newdir+"trim/"
	print "List of empty files:"
	print empty_files
	print "Trinity needs to be run again:"
	print trinity_fail
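Note: clusterfunc.check_dir is called throughout these examples but never defined here; presumably it just creates a directory when it is missing. A minimal sketch under that assumption:

import os

def check_dir(path):
    # Create the directory (and any parents) if it does not already exist.
    if not os.path.isdir(path):
        os.makedirs(path)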
Beispiel #46
0
def execute(basedir, url_data):
    for item in url_data.keys():
        organism = item[0]
        seqtype = item[1]
        org_seq_dir = basedir + organism + "/"
        print org_seq_dir
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            sample = "_".join(item)
            filename = newdir + sra
            print filename

            ##
            # run this to delete SRA file:
            ##

            if os.path.isfile(filename):
                delete_file(filename)
            else:
                print "Already deleted:", filename
Beispiel #47
0
def execute(basedir, url_data):
    for item in url_data.keys():
        organism = item[0]
        org_seq_dir = basedir + organism + "/"
        url_list = url_data[item]
        for url in url_list:
            sra = basename(urlparse(url).path)
            newdir = org_seq_dir + sra + "/"
            trinitydir = newdir + "trinity/trinity_out/"
            transdecoderdir = newdir + "transdecoder/"
            clusterfunc.check_dir(transdecoderdir)
            trinity_in_fasta = trinitydir + "Trinity.fasta"
            #trinity_fasta_prefix=sra+".Trinity.fa"
            trinity_fasta_prefix = sra + ".Trinity.fixed.fa"
            trinity_fasta = fix_fasta(trinity_in_fasta, trinitydir, sra)
            #copy_files(trinity_fasta,trinity_fasta_prefix,transdecoderdir)
            #transdecoder_LongOrf(transdecoderdir,trinity_fasta_prefix)
            #transdecoder_Predict(transdecoderdir,trinity_fasta_prefix)
            #get_longest_ORF(transdecoderdir,trinity_fasta_prefix)
            #new_trinity_fasta="/mnt/mmetsp_trinity_finished/"
            #clusterfunc.check_dir(new_trinity_fasta)
            #fix(transdecoderdir,trinity_fasta_prefix,sra,new_trinity_fasta)
            copy_files(trinity_fasta_prefix, trinity_fasta, transdecoderdir, sra)
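Note: fix_fasta appears in Beispiel #47 and several later examples but is never defined here, and its exact behaviour is unknown. The sketch below is a hypothetical version that writes a copy of the assembly with simplified, sample-prefixed headers and returns the new path:

def fix_fasta(trinity_in_fasta, outdir, prefix):
    # Hypothetical sketch: rewrite FASTA headers as ">PREFIX_<n>" and
    # save the result next to the original assembly.
    fixed = outdir + prefix + ".Trinity.fixed.fa"
    with open(trinity_in_fasta) as infile, open(fixed, "w") as outfile:
        record = 0
        for line in infile:
            if line.startswith(">"):
                record += 1
                outfile.write(">" + prefix + "_" + str(record) + "\n")
            else:
                outfile.write(line)
    return fixed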
Beispiel #48
0
def execute(mmetsp_data, basedir, mmetsp_assemblies):
	trinity_fail=[]
	# construct an empty pandas dataframe to add on each assembly.csv to
	for item in mmetsp_data.keys():
		#print item
		organism=item[0]
		sample="_".join(item)
		org_seq_dir=basedir+organism+"/"
		mmetsp_list=mmetsp_data[item]
		for mmetsp in mmetsp_list:
			print mmetsp
			assemblyfileslist=os.listdir(mmetsp_assemblies)
			for filename in assemblyfileslist:
				if filename.startswith(mmetsp):
					if filename.endswith(".fixed.fa"):
						print "This is not the one you want."				
					else:
						print "MMETSP assembly found:",filename
						reference_filename=filename
			sra=item[1]
			newdir=org_seq_dir+sra+"/"
			trinitydir=newdir+"trinity/trinity_out/"
			dammit_dir=trinitydir+"dammit_dir/"
			transrate_dir="/mnt/comparisons/"
			reverse_transrate_dir="/mnt/comparisons_reverse/"
			clusterfunc.check_dir(transrate_dir)
			clusterfunc.check_dir(dammit_dir)
			clusterfunc.check_dir(reverse_transrate_dir)
			#trinity_fasta=dammit_dir+"Trinity.fasta.dammit.fasta"
			trinity_fasta=trinitydir+"Trinity.fasta"
			if os.path.isfile(trinity_fasta):
				print trinity_fasta
				fixed_trinity=fix_fasta(trinity_fasta,trinitydir,sample)
				fixed_mmetsp_ref=fix_fasta_reference(reference_filename,mmetsp_assemblies)
				#transrate(transrate_dir,fixed_trinity,mmetsp_assemblies,fixed_mmetsp_ref)
				transrate_reverse(reverse_transrate_dir,sample,fixed_trinity,mmetsp_assemblies,fixed_mmetsp_ref)
			else:
				print "Trinity failed:",newdir
				trinity_fail.append(newdir)	
	print "This is the number of times Trinity failed:"
	print len(trinity_fail)
	print trinity_fail
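Note: Beispiel #48 relies on transrate and transrate_reverse wrappers that are not shown. A hypothetical sketch of the reverse comparison, assuming the transrate CLI with --assembly/--reference/--output flags and taking "reverse" to mean the assembly and reference roles are swapped (both points are assumptions):

import subprocess

def transrate_reverse(out_dir, sample, trinity_fasta, reference_dir, reference_fasta):
    # Hypothetical sketch: score the NCGR MMETSP assembly with the new
    # Trinity assembly used as the reference (the "reverse" direction).
    cmd = ("transrate --assembly " + reference_dir + reference_fasta +
           " --reference " + trinity_fasta +
           " --output " + out_dir + "reverse_" + sample)
    print cmd
    proc = subprocess.Popen(cmd, shell=True)
    proc.wait()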
Beispiel #49
0
def run_diginorm(fastq_list, mmetsp_dir, mmetsp):
    diginormdir = mmetsp_dir + "diginorm/"
    clusterfunc.check_dir(diginormdir)
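Note: Beispiel #49 only sets up the diginorm output directory before the snippet is cut off. A hypothetical continuation, assuming digital normalization is done with khmer's normalize-by-median.py on interleaved, trimmed reads (the flags and file naming below are placeholders, not recovered from the original):

import os
import subprocess
import clusterfunc  # repo-local helper module used throughout these snippets

def run_diginorm(fastq_list, mmetsp_dir, mmetsp):
    diginormdir = mmetsp_dir + "diginorm/"
    clusterfunc.check_dir(diginormdir)
    # Hypothetical continuation: normalize each interleaved FASTQ file with
    # khmer's normalize-by-median.py (k-mer size, coverage cutoff and memory
    # cap are placeholder values).
    for fastq in fastq_list:
        keep = diginormdir + os.path.basename(fastq) + ".keep.fq"
        cmd = ("normalize-by-median.py -p -k 20 -C 20 -M 4e9 -o " + keep +
               " " + fastq)
        print cmd
        proc = subprocess.Popen(cmd, shell=True)
        proc.wait()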
Beispiel #50
0
			assemblyfileslist = os.listdir(mmetsp_assemblies)
			for filename in assemblyfileslist:
				if filename.startswith(mmetsp):
					if filename.endswith(".fixed.fa"):
						print "This is not the one you want."
					else:
						print "MMETSP assembly found:", filename
						reference_filename = filename
			sra = item[1]
			newdir = org_seq_dir + sra + "/"
			trinitydir = newdir + "trinity/trinity_out/"
			dammit_dir = trinitydir + "dammit_dir/"
			transrate_dir = "/mnt/comparisons/"
			reverse_transrate_dir = "/mnt/comparisons_reverse/"
			clusterfunc.check_dir(transrate_dir)
			clusterfunc.check_dir(dammit_dir)
			clusterfunc.check_dir(reverse_transrate_dir)
			# trinity_fasta=dammit_dir+"Trinity.fasta.dammit.fasta"
			trinity_fasta = trinitydir + "Trinity.fasta"
			if os.path.isfile(trinity_fasta):
				print trinity_fasta
				fixed_trinity = fix_fasta(trinity_fasta, trinitydir, sample)
				fixed_mmetsp_ref = fix_fasta_reference(
				    reference_filename, mmetsp_assemblies)
				# transrate(transrate_dir,fixed_trinity,mmetsp_assemblies,fixed_mmetsp_ref)
				transrate_reverse(reverse_transrate_dir, sample,
				                  fixed_trinity, mmetsp_assemblies_dir, fixed_mmetsp_ref)
Beispiel #51
0
def check_sra(url_data, no_files, mmetsp_data):
    different = []
    for item in url_data:
        organism = item[0].replace("'", "")
        seqtype = item[1]
        mmetsp_id = item[2].replace("'", "")
        strain, organism_mmetsp, different, alt = get_strain(
            different, mmetsp_id, organism, mmetsp_data)
        org_seq_dir = basedir + organism + "/"
        clusterfunc.check_dir(org_seq_dir)
        url_list = url_data[item]
        for url in url_list:
            command_list = []
            sra = basename(urlparse(url).path)
            if alt == "blank":
                sample = organism + "_" + strain + "_" + sra + "_" + mmetsp_id
            else:
                sample = organism + "_" + strain + "_" + sra + "_" + mmetsp_id + "_alt_" + alt
            newdir = org_seq_dir + sra + "/"
            if sra in no_files:
                if os.path.isdir(newdir):
                    print "Directory exists:", sra
                    if os.path.isfile(sra):
                        print "Exists:", sra
                    else:
                        print "Missing:", newdir
                        clusterfunc.check_dir(newdir)
                        print url
                        filestring = newdir + sra
                        if os.path.isfile(filestring):
                            print "file exists:", filestring
                        else:
                            urlstring = download(url, newdir, sra)
                            command_list.append(urlstring)
                        if glob.glob(newdir + "*.fastq"):
                            print "SRA has already been extracted", filestring
                        else:
                            sra_string = sra_extract(newdir, sra)
                            command_list.append(sra_string)
                        names = "download_extract"
                        print command_list
                        if len(command_list) >= 1:
                            send_to_cluster(newdir, command_list, sra, names)
                        else:
                            print "Pipeline already run."
                            fastqcdir = newdir + "fastqc/"
                            clusterfunc.check_dir(fastqcdir)
                            fastqc(newdir, fastqcdir, sra)
                            trimdir = newdir + "trim/"
                            interleavedir = newdir + "interleave/"
                            clusterfunc.check_dir(trimdir)
                            clusterfunc.check_dir(interleavedir)
                            diginormdir = newdir + "diginormdir/"
                            clusterfunc.check_dir(diginormdir)
                            trinitydir = newdir + "trinity/"
                            clusterfunc.check_dir(trinitydir)
                            diginormfile = diginormdir + "qsub_files/" + sra + ".trimmed.interleaved.fq.keep.abundfilt.pe"
                            trinity_fasta = trinitydir + "trinity_out/" + "Trinity.fasta"
                            #trinity_fasta_new =trinitydir+sample+".Trinity.fixed.fasta"
                            trinity_fasta_new = trinitydir + organism + "_" + sra + ".Trinity.fixed.fasta"
                            file1 = newdir + sra + "_1.fastq"
                            file2 = newdir + sra + "_2.fastq"
                            assemblydir = "/mnt/scratch/ljcohen/mmetsp_assemblies/"
                            if os.path.isfile(file1) and os.path.isfile(file2):
                                print file1
                                print file2
                                run_trimmomatic_TruSeq(trimdir, file1, file2,
                                                       sra)
                            file1_trim = trimdir + sra + ".trim_1P.fq"
                            file2_trim = trimdir + sra + ".trim_2P.fq"
                            if os.path.isfile(file1_trim) and os.path.isfile(file2_trim):
                                #interleave_reads(trimdir, sra, interleavedir)
                                #run_diginorm(diginormdir,interleavedir,trimdir,sra)
                                #run_filter_abund(diginormdir,interleavedir,trimdir,sra)
                                #rename_files(trinitydir,diginormdir,diginormfile,sra)
                                if os.path.isfile(trinity_fasta) == False:
                                    run_trinity(trinitydir, sra)
                                else:
                                    print "Trinity completed!", trinity_fasta
                                    trinity_fixed_fasta = fix_fasta(
                                        trinity_fasta, trinitydir, sra)
                                    #copy_string="cp "+trinity_fixed_fasta+" "+trinity_fasta_new
                                    copy_string = "cp " + trinity_fasta_new + " " + assemblydir
                                    print copy_string
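Note: send_to_cluster in Beispiel #51 collects the commands into a batch script and submits it, but the helper itself is not shown. A hypothetical sketch assuming a PBS/Torque cluster (the resource line and file layout are placeholders):

import os
import subprocess

def send_to_cluster(workdir, command_list, job_id, names):
    # Hypothetical sketch: write the commands into a qsub script and submit it.
    qsub_dir = workdir + "qsub_files/"
    if not os.path.isdir(qsub_dir):
        os.makedirs(qsub_dir)
    script = qsub_dir + job_id + "_" + names + ".qsub"
    with open(script, "w") as out:
        out.write("#!/bin/bash\n")
        out.write("#PBS -N " + job_id + "\n")
        out.write("#PBS -l walltime=24:00:00,nodes=1:ppn=8,mem=64gb\n")
        out.write("cd " + workdir + "\n")
        for command in command_list:
            out.write(command + "\n")
    proc = subprocess.Popen("qsub " + script, shell=True)
    proc.wait()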
Beispiel #52
0
def run_Trinity(ncgr_dir, mmetsp_dir, mmetsp, data_frame1, data_frame2):
    trinitydir = mmetsp_dir + "trinity/"
    clusterfunc.check_dir(trinitydir)
    diginormdir = mmetsp_dir + "diginorm/"
    transratedir = mmetsp_dir + "transrate/"
    clusterfunc.check_dir(transratedir)
    #split_paired_reads(trinitydir, diginormdir, mmetsp)
    #combine_orphans(diginormdir,mmetsp)
    right = trinitydir + mmetsp + ".right.fq"
    left = trinitydir + mmetsp + ".left.fq"
    trinity_fasta = trinitydir + "trinity_out/" + "Trinity.fasta"
    #rename_files(trinitydir, diginormdir, mmetsp)
    if os.path.isfile(right) and os.path.isfile(left):
        if os.path.isfile(trinity_fasta):
            print trinity_fasta
            cp_string = "cp " + trinity_fasta + " " + mmetsp_dir + mmetsp + ".Trinity.fasta"
            fixed_fasta = fix_fasta(trinity_fasta, mmetsp_dir, mmetsp)
            #print cp_string
            old_assemblies = sorted([
                s for s in os.listdir(mmetsp_dir) if s.endswith(".fixed.fasta")
                and s.split("_")[-1].startswith("SRR")
            ])
            #print old_assemblies
            #for old_assembly in old_assemblies:
            #transrate(transratedir, mmetsp, fixed_fasta, mmetsp_dir, old_assembly)
            #transrate_reverse(transratedir, mmetsp, fixed_fasta, mmetsp_dir, old_assembly)
            #	sra = old_assembly.split("_")[-1].split(".")[0]
            #	sample = mmetsp + "_" + sra
            #	reverse_sample = "reverse_" + mmetsp + "_" + sra
            #	transrate_assemblies_ref = transratedir + sample + "/assemblies.csv"
            #	transrate_reverse_assemblies = transratedir + reverse_sample + "/assemblies.csv"
            #if os.path.isfile(transrate_assemblies_ref):
            #	data1 = parse_transrate_stats(transrate_assemblies_ref,sra,mmetsp)
            #	data_frame1 = build_DataFrame(data_frame1,data1)
            #else:
            #	"Transrate failed:",transrate_assemblies_ref
            #if os.path.isfile(transrate_reverse_assemblies):
            #	data2 = parse_transrate_stats(transrate_reverse_assemblies,sra,mmetsp)
            #	data_frame2 = build_DataFrame(data_frame2,data2)

            #else:
            #	print "Reverse failed:",transrate_reverse_assemblies
            #s = subprocess.Popen(cp_string, shell = True)
            #s.wait()
            ncgr_assembly = mmetsp + ".nt.fa.fixed.fa"
            sample = mmetsp + "_" + mmetsp
            reverse_sample = "reverse_" + mmetsp + "_" + mmetsp
            transrate(transratedir, mmetsp, fixed_fasta, ncgr_dir,
                      ncgr_assembly)
            transrate_reverse(transratedir, mmetsp, fixed_fasta, ncgr_dir,
                              ncgr_assembly)

            transrate_assemblies_ref = transratedir + sample + "/assemblies.csv"
            transrate_reverse_assemblies = transratedir + reverse_sample + "/assemblies.csv"
            if os.path.isfile(transrate_assemblies_ref):
                data1 = parse_transrate_stats(transrate_assemblies_ref, mmetsp,
                                              mmetsp)
                data_frame1 = build_DataFrame(data_frame1, data1)
            else:
                print "Transrate failed:", transrate_assemblies_ref
            if os.path.isfile(transrate_reverse_assemblies):
                data2 = parse_transrate_stats(transrate_reverse_assemblies,
                                              mmetsp, mmetsp)
                data_frame2 = build_DataFrame(data_frame2, data2)
            else:
                print "Transrate failed:", transrate_reverse_assemblies
        else:
            get_trinity(trinitydir, left, right, mmetsp)
            #cp_string1 = "cp " + right + " " + mmetsp_dir
            #cp_string2 = "cp " + left + " " + mmetsp_dir
            #s = subprocess.Popen(cp_string1, shell=True)
            #print cp_string1
            #s.wait()
            #t = subprocess.Popen(cp_string2, shell=True)
            #print cp_string2
            #t.wait()
    return data_frame1, data_frame2
Beispiel #53
0
        trinity_fasta = org_seq_dir + "trinity/" + organism + "_" + sra + ".Trinity.fixed.fasta"
        #if os.path.isfile(file1) and os.path.isfile(file2):
        #		print file1
        #		print file2
        #		run_trinity(mmetsp_dir,file1,file2,mmetsp)
        #	else:
        #		print "missing:",file1
        if os.path.isfile(trinity_fasta):
            print trinity_fasta
            count.append(trinity_fasta)
            cp_string = "cp " + trinity_fasta + " " + mmetsp_dir
            print cp_string
            s = subprocess.Popen(cp_string, shell=True)
            s.wait()
        else:
            print "Missing:", trinity_fasta
            missing.append(trinity_fasta)
    print len(count)
    print missing


basedir = "/mnt/scratch/ljcohen/mmetsp_sra/"
newdir = "/mnt/scratch/ljcohen/mmetsp/"
clusterfunc.check_dir(newdir)
datafile = "../SraRunInfo_719.csv"
url_data = get_data(datafile)
print url_data
print len(url_data)
#move_files(url_data,basedir,newdir)
get_trinity(url_data, newdir, basedir)
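Note: the module-level code above (and most of these examples) starts from url_data = get_data(datafile), which is not shown. A hypothetical sketch that groups SRA download URLs from SraRunInfo.csv by organism and library layout; the column names come from the standard SRA RunInfo export, and the key layout is an assumption (several examples also expect an MMETSP ID in the key):

import csv
from collections import defaultdict

def get_data(datafile):
    # Hypothetical sketch: map (organism, library layout) -> list of .sra URLs.
    url_data = defaultdict(list)
    with open(datafile) as csvfile:
        for row in csv.DictReader(csvfile):
            key = (row["ScientificName"].replace(" ", "_"), row["LibraryLayout"])
            url_data[key].append(row["download_path"])
    return dict(url_data)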
Beispiel #54
0
def execute(data_frame1, data_frame2, ncgr_dir, trinity_fail, count, basedir):
    assemblydir = "/mnt/scratch/ljcohen/mmetsp_assemblies/"
    old_files = os.listdir(assemblydir)
    id_list = os.listdir(basedir)
    for mmetsp in id_list:
        if mmetsp != "qsub_files":
            alt_mmetsp = mmetsp + "_2"
            mmetspdir = basedir + mmetsp + "/"
            trinitydir = basedir + mmetsp + "/" + "trinity/"
            trinity_files = os.listdir(mmetspdir)
            transrate_dir = mmetspdir + "transrate/"
            clusterfunc.check_dir(transrate_dir)
            trinity_fasta = trinitydir + "trinity_out_2.2.0.Trinity.fasta"
            alt_trinity_fasta = "/mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/" + mmetsp + ".trinity_out_2.2.0.Trinity.fasta"
            #trinity_fasta = trinitydir + sample + ".Trinity.fixed.fasta"
            clusterfunc.check_dir(trinitydir)
            if os.path.isfile(trinity_fasta) == False and os.path.isfile(alt_trinity_fasta) == False:
                right = [s for s in trinity_files if s.endswith(".right.fq")][0]
                left = [s for s in trinity_files if s.endswith(".left.fq")][0]
                right = mmetspdir + right
                left = mmetspdir + left
                if os.path.isfile(left) and os.path.isfile(right):
                    #run_trinity(trinitydir,left,right,mmetsp)
                    print "Trinity not finished:", trinity_fasta
                    trinity_fail.append(trinitydir)
                else:
                    print "No files:", left
            elif os.path.isfile(trinity_fasta) == True or os.path.isfile(alt_trinity_fasta) == True:
                print "Trinity completed successfully.", trinity_fasta
                count += 1
                # find a previously generated assembly for this MMETSP ID in assemblydir
                old_assemblies = glob.glob(assemblydir + "*" + mmetsp + "*")
                if len(old_assemblies) >= 1:
                    full_assembly = old_assemblies[0]
                else:
                    print glob.glob(assemblydir + "*" + mmetsp + "*")
                #copy_string = "cp " + trinity_fasta + " " + assemblydir
                #print copy_string
                #s = subprocess.Popen(copy_string, shell=True)
                #s.wait()
                fixed_fasta = fix_fasta(trinity_fasta, trinitydir, mmetsp)
                #sra = old_assembly.split("_")[-1].split(".")[0]
                ncgr_assembly = ncgr_dir + mmetsp + ".nt.fa.fixed.fa"
                sample = "trinity2.2.0_" + mmetsp + "_trinity2014"
                reverse_sample = "reverse_trinity2014_" + mmetsp + "_trinity2.2.0"
                transrate_out = transrate_dir + sample + "/" + "assemblies.csv"
                transrate_reverse_assemblies = transrate_dir + reverse_sample + "/" + "assemblies.csv"
                if os.path.isfile(transrate_out):
                    print "Transrate completed:", transrate_out
                    data1 = parse_transrate_stats(transrate_out, mmetsp, mmetsp)
                    data_frame1 = build_DataFrame(data_frame1, data1)
                else:
                    transrate(transrate_dir, mmetsp, fixed_fasta, mmetspdir, full_assembly)
                if os.path.isfile(transrate_reverse_assemblies):
                    print "Transrate complete:", transrate_reverse_assemblies
                    data2 = parse_transrate_stats(transrate_reverse_assemblies, mmetsp, mmetsp)
                    data_frame2 = build_DataFrame(data_frame2, data2)
                else:
                    transrate_reverse(transrate_dir, mmetsp, fixed_fasta, mmetspdir, full_assembly)
Beispiel #55
0
            if os.path.isfile(left) and os.path.isfile(right):
                print left
                print right
            else:
                print "Does not exist.", left, right
        else:
            print "Does not exist:", diginormdir
        trinity_fasta = assemblydir + sample + "/" + sample + ".Trinity.fixed.fa"
        transrate_out = transratedir + sample + "/"
        transrate_assemblies = transrate_out + "/" + "assemblies.csv"
        if os.path.isfile(trinity_fasta):
            print trinity_fasta
        else:
            print "Trinity failed:", trinity_fasta
        if os.path.isfile(transrate_assemblies):
            data = parse_transrate_stats(transrate_assemblies)
            data_frame = build_DataFrame(data_frame, data)
        else:
            print "Running transrate..."
            transrate(transratedir, transrate_out, trinity_fasta, sample, left, right)
    return data_frame

assemblydir = "/home/ljcohen/msu_assemblies_finished/"
basedir = "/home/ljcohen/osmotic_combined/"
transratedir = "/home/ljcohen/osmotic_transrate_scores/"
clusterfunc.check_dir(transratedir)
listoffiles = os.listdir(basedir)
data_frame = pd.DataFrame()
data_frame = execute(data_frame,listoffiles, assemblydir, transratedir)
#data_frame.to_csv("transrate_scores.csv")
Beispiel #56
0
            # check to see if filename exists in newdir
            if filename in os.listdir(newdir):
                print "sra exists:", filename
                if os.stat(full_filename).st_size == 0:
                    print "SRA file is empty:", filename
                    os.remove(full_filename)
            else:
                print "file will be downloaded:", filename
                download(url, newdir, filename)
            sra_extract(newdir, filename)
    fastqc(newdir, fastqcdir, filename)


def fastqc(newdir, fastqcdir, filename):
    listoffiles = os.listdir(newdir)
    print listoffiles
    fastq_file_list = []
    for i in listoffiles:
        if i.endswith(".fastq"):
            fastq_file_list.append(newdir + i)
    fastqc_report(fastq_file_list, newdir, fastqcdir, filename)


datafile = "SraRunInfo.csv"
basedir = "~/"
clusterfunc.check_dir(basedir)
for datafile in datafiles:
    url_data = get_data(datafile)
    print url_data
    execute(basedir, url_data)
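Note: download, sra_extract, and fastqc_report are used in Beispiel #56 but not shown. A minimal sketch of the extraction and QC steps, assuming the SRA toolkit's fastq-dump and FastQC are on the PATH (the output layout is an assumption):

import subprocess

def sra_extract(newdir, filename):
    # Hypothetical sketch: unpack the .sra archive into FASTQ files
    # (paired reads come out as *_1.fastq / *_2.fastq).
    cmd = "fastq-dump --split-3 -O " + newdir + " " + newdir + filename
    print cmd
    proc = subprocess.Popen(cmd, shell=True)
    proc.wait()

def fastqc_report(fastq_file_list, newdir, fastqcdir, filename):
    # Hypothetical sketch: run FastQC on every extracted FASTQ file,
    # writing the reports into fastqcdir.
    cmd = "fastqc -o " + fastqcdir + " " + " ".join(fastq_file_list)
    print cmd
    proc = subprocess.Popen(cmd, shell=True)
    proc.wait()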