forked from johnsolk/MMETSP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
report.py
312 lines (273 loc) · 11.2 KB
/
report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
import os
import os.path
from os.path import basename
from urllib import urlopen
from urlparse import urlparse
import subprocess
from subprocess import Popen, PIPE
import urllib
import shutil
import glob
# custom Lisa module
import clusterfunc
# Python plotting libraries
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats, integrate
import seaborn as sns
#sns.set(color_codes=True)
def get_data(thefile):
    """Group MMETSP sample IDs by (scientific name, run) from a run-info CSV.

    The first non-blank line of *thefile* is treated as the header and must
    contain the columns ``ScientificName``, ``Run`` and ``SampleName``.
    For every following row the scientific name has its whitespace runs
    replaced by ``_``, and a sample name containing ``_`` is truncated to
    the part before the first underscore.

    Args:
        thefile: path of the comma-separated run-info file.

    Returns:
        dict mapping ``(name, run)`` tuples to the list of distinct sample
        IDs seen for that pair, in first-seen order.  An empty file yields
        an empty dict.

    Raises:
        ValueError: a required column is missing from the header.
        IndexError: a data row has fewer fields than the header requires.
    """
    mmetsp_data = {}
    with open(thefile) as inputfile:
        # splitlines() removes "\n", "\r\n" and "\r" terminators, so the last
        # column of the header and of every row carries no stray newline.
        # Blank lines (e.g. a trailing empty line) are dropped instead of
        # crashing the field lookup below.
        rows = [row for row in inputfile.read().splitlines() if row.strip()]
    if not rows:
        return mmetsp_data

    # NOTE: fields are split on bare commas; quoted fields that themselves
    # contain commas are not supported.
    headerline = rows[0].split(",")
    position_name = headerline.index("ScientificName")
    position_reads = headerline.index("Run")
    position_mmetsp = headerline.index("SampleName")

    for row in rows[1:]:
        line_data = row.split(",")
        name = "_".join(line_data[position_name].split())
        read_type = line_data[position_reads]
        mmetsp = line_data[position_mmetsp]
        id_parts = mmetsp.split("_")
        if len(id_parts) > 1:
            # Keep only the part of the sample name before the first "_".
            print(id_parts)
            mmetsp = id_parts[0]
        known_ids = mmetsp_data.setdefault((name, read_type), [])
        if mmetsp in known_ids:
            print("mmetsp ID already exists: " + mmetsp)
        else:
            known_ids.append(mmetsp)
    return mmetsp_data
def fix_fasta(trinity_fasta, trinity_dir, sample):
    """Print the sed command that rewrites '|' as '-' in a Trinity FASTA.

    This is a dry run: the command is only printed, never executed.

    Args:
        trinity_fasta: path of the input FASTA named in the command.
        trinity_dir: directory prefix for the output file; it is joined by
            plain string concatenation, so it must end with a separator.
        sample: sample label used as the output file's base name.

    Returns:
        The output path ``trinity_dir + sample + ".Trinity.fixed.fa"``.
    """
    fixed_path = "".join([trinity_dir, sample, ".Trinity.fixed.fa"])
    sed_template = "\nsed 's_|_-_g' {} > {}\n"
    print(sed_template.format(trinity_fasta, fixed_path))
    return fixed_path
def fix_fasta_reference(mmetsp_assembly, mmetsp_assembly_dir):
    """Print the sed command that rewrites '|' as '-' in a reference FASTA.

    This is a dry run: the command is only printed, never executed.

    Args:
        mmetsp_assembly: file name of the reference assembly.
        mmetsp_assembly_dir: directory holding it; joined by plain string
            concatenation, so it must end with a separator.

    Returns:
        ``mmetsp_assembly_dir + mmetsp_assembly + ".fixed.fa"``.
    """
    fixed_path = "".join([mmetsp_assembly_dir, mmetsp_assembly, ".fixed.fa"])
    sed_template = "\nsed 's_|_-_g' {}{} > {}\n"
    print(sed_template.format(mmetsp_assembly_dir, mmetsp_assembly, fixed_path))
    return fixed_path
def transrate(transrate_dir, sample, trinity_fasta, mmetsp_assemblies_dir, filename):
    """Print a transrate command with the Trinity FASTA as the assembly.

    This is a dry run: the command is only printed, never executed.

    Args:
        transrate_dir: output directory prefix (concatenated with *sample*).
        sample: sample label appended to *transrate_dir* for ``-o``.
        trinity_fasta: value passed to ``--assembly``.
        mmetsp_assemblies_dir: accepted but not used by this function.
        filename: value passed to ``--reference``.
    """
    command_lines = [
        "",
        "transrate -o {}{} \\",
        "--assembly {} \\",
        "--reference {} \\",
        "--threads 4",
        "",
    ]
    template = "\n".join(command_lines)
    print(template.format(transrate_dir, sample, trinity_fasta, filename))
def transrate_reverse(transrate_dir, sample, trinity_fasta, mmetsp_assemblies_dir, filename):
    """Print a transrate command with assembly and reference swapped.

    Compared with ``transrate``, *filename* is passed to ``--assembly`` and
    *trinity_fasta* to ``--reference``.  This is a dry run: the command is
    only printed, never executed.

    Args:
        transrate_dir: output directory prefix (concatenated with *sample*).
        sample: sample label appended to *transrate_dir* for ``-o``.
        trinity_fasta: value passed to ``--reference``.
        mmetsp_assemblies_dir: accepted but not used by this function.
        filename: value passed to ``--assembly``.
    """
    command_lines = [
        "",
        "transrate -o {}{} \\",
        "--assembly {} \\",
        "--reference {} \\",
        "--threads 4",
        "",
    ]
    template = "\n".join(command_lines)
    print("This is the reverse transrate command:")
    print(template.format(transrate_dir, sample, filename, trinity_fasta))
def parse_transrate_stats(transrate_assemblies):
    """Load a transrate CSV report into a pandas DataFrame.

    ``DataFrame.from_csv`` was deprecated in pandas 0.21 and later removed.
    ``read_csv`` with ``index_col=0, parse_dates=True`` is the replacement
    the pandas documentation gives for its defaults, so the first column
    still becomes the index.

    Args:
        transrate_assemblies: path of the comma-separated file to read; the
            first row is used as the header.

    Returns:
        pandas.DataFrame indexed by the file's first column.
    """
    return pd.read_csv(
        transrate_assemblies,
        header=0,
        sep=",",
        index_col=0,
        parse_dates=True,
    )
def build_DataFrame(data_frame, transrate_data):
    """Return *data_frame* with the rows of *transrate_data* appended.

    Neither input is modified; the result is a new frame produced by
    ``pd.concat`` with its default arguments.
    """
    return pd.concat([data_frame, transrate_data])
def execute(url_data, basedir, mmetsp_assemblies):
    """Print the reverse transrate command for every sample in *url_data*.

    For each ``(organism, run)`` key, the matching MMETSP reference
    assembly is looked up by file-name prefix in *mmetsp_assemblies*.  If
    ``<basedir><organism>/<run>/trinity/trinity_out/Trinity.fasta`` exists,
    the sed and reverse transrate commands are printed.  Samples without a
    Trinity assembly are collected and reported at the end.

    Args:
        url_data: dict mapping ``(organism, run)`` tuples to lists of MMETSP
            sample IDs, as produced by ``get_data``.
        basedir: root directory of the per-organism data; must end with "/".
        mmetsp_assemblies: directory holding the reference assemblies; must
            end with "/".
    """
    trinity_fail = []
    transrate_dir = "/mnt/comparisons/"
    reverse_transrate_dir = "/mnt/comparisons_reverse/"
    # The reference directory is not modified here, so list it only once.
    assemblyfileslist = os.listdir(mmetsp_assemblies)

    # Iterate the argument rather than a module-level global, so the function
    # works for whatever mapping the caller passes in.
    for item in url_data:
        organism = item[0]
        sra = item[1]
        sample = "_".join(item)
        newdir = basedir + organism + "/" + sra + "/"
        trinitydir = newdir + "trinity/trinity_out/"
        dammit_dir = trinitydir + "dammit_dir/"

        # Reset per sample so a match from a previous sample is never reused.
        reference_filename = None
        for mmetsp in url_data[item]:
            print(mmetsp)
            for filename in assemblyfileslist:
                if not filename.startswith(mmetsp):
                    continue
                if filename.endswith(".fixed.fa"):
                    # Already-rewritten copy, not the original reference.
                    print("This is not the one you want.")
                else:
                    print("MMETSP assembly found: " + filename)
                    reference_filename = filename

        # NOTE(review): check_dir presumably creates the directory when it is
        # missing -- confirm against clusterfunc.
        clusterfunc.check_dir(transrate_dir)
        clusterfunc.check_dir(dammit_dir)
        clusterfunc.check_dir(reverse_transrate_dir)

        trinity_fasta = trinitydir + "Trinity.fasta"
        if not os.path.isfile(trinity_fasta):
            print("Trinity failed: " + newdir)
            trinity_fail.append(newdir)
            continue
        print(trinity_fasta)
        if reference_filename is None:
            # Without a reference there is nothing to compare against.
            print("No MMETSP reference assembly found for: " + sample)
            continue
        fixed_trinity = fix_fasta(trinity_fasta, trinitydir, sample)
        fixed_mmetsp_ref = fix_fasta_reference(reference_filename, mmetsp_assemblies)
        transrate_reverse(reverse_transrate_dir, sample, fixed_trinity,
                          mmetsp_assemblies, fixed_mmetsp_ref)

    print("This is the number of times Trinity failed:")
    print(len(trinity_fail))
    print(trinity_fail)
def get_extra_assemblies(extra_dir, data_frame):
    """Append transrate statistics for every assembly found in *extra_dir*.

    Each file name is split on "_"; the sample label is every part except
    the last, followed by the third part truncated at its first ".".

    NOTE(review): ``transrate`` in this file takes five positional
    parameters but is called here with three, so the first file processed
    raises TypeError.  No reference file is available in this function, so
    the intended arguments need to be confirmed before it can be used.

    Args:
        extra_dir: directory to scan; must end with "/".
        data_frame: pandas DataFrame to which the statistics are appended.

    Returns:
        The accumulated DataFrame (*data_frame* itself if the directory is
        empty).
    """
    for assembly_fasta in os.listdir(extra_dir):
        name_parts = assembly_fasta.split("_")
        sra = name_parts[2].split(".")[0]
        sample = "_".join(name_parts[:-1] + [sra])
        print(sample)
        transrate_out = extra_dir + sample + "/"
        fixed_trinity = fix_fasta(assembly_fasta, extra_dir, sample)
        transrate(extra_dir, transrate_out, fixed_trinity)
        stats = parse_transrate_stats(transrate_out + "assemblies.csv")
        data_frame = build_DataFrame(data_frame, stats)
    return data_frame
def get_extra_assemblies_transrate(extradir, transrate_dir, mmetsp_assemblies,
                                   datafile="/home/ubuntu/MMETSP/MMETSP_SRA_Run_Info_subset2.csv"):
    """Print reverse transrate commands for the dammit assemblies in *extradir*.

    Only files ending in ``Trinity.dammit.fasta`` are considered.  Their
    names are split on "_": the first two parts form the organism and the
    third part, truncated at its first ".", is the run accession.  The first
    MMETSP ID recorded for that (organism, run) pair in *datafile* selects
    the reference assembly by file-name prefix.

    Args:
        extradir: directory holding the extra assemblies; must end with "/".
        transrate_dir: output directory prefix passed to ``transrate_reverse``.
        mmetsp_assemblies: directory holding the reference assemblies; must
            end with "/".
        datafile: run-info CSV read with ``get_data``.

    Raises:
        KeyError: an assembly's (organism, run) pair is not in *datafile*.
        IndexError: an assembly file name has fewer than three "_" parts.
    """
    mmetsp_data = get_data(datafile)
    print(mmetsp_data)
    # The reference directory is not modified here, so list it only once.
    assemblyfileslist = os.listdir(mmetsp_assemblies)

    # Use the extradir argument; the body previously read a module-level
    # global with a similar name and ignored the parameter.
    for filename in os.listdir(extradir):
        if not filename.endswith("Trinity.dammit.fasta"):
            continue
        print(filename)
        assembly_filename = extradir + filename
        file_info = filename.split("_")
        sra = file_info[2].split(".")[0]
        organism = file_info[0] + "_" + file_info[1]
        sample = organism + "_" + sra + ".dammit"
        print(sample)
        fixed_trinity = fix_fasta(assembly_filename, extradir, sample)
        mmetsp = mmetsp_data[(organism, sra)][0]

        # Reset per assembly so a match from a previous file is never reused.
        reference_filename = None
        for ref_filename in assemblyfileslist:
            if not ref_filename.startswith(mmetsp):
                continue
            if ref_filename.endswith(".fixed.fa"):
                # Already-rewritten copy, not the original reference.
                print("This is not the one you want.")
            else:
                print("MMETSP assembly found: " + ref_filename)
                reference_filename = fix_fasta_reference(ref_filename, mmetsp_assemblies)

        if reference_filename is None:
            # Without a reference there is nothing to compare against.
            print("No MMETSP reference assembly found for: " + sample)
            continue
        transrate_reverse(transrate_dir, sample, fixed_trinity,
                          mmetsp_assemblies, reference_filename)
def get_contigs_data(data_frame, transrate_dir):
    """Append every ``contigs.csv`` found under *transrate_dir* to *data_frame*.

    Looks two levels down: for each entry of *transrate_dir*, every child
    whose name ends in ``.fixed`` is expected to contain ``contigs.csv``.
    Missing files are reported on stdout and skipped.

    Args:
        data_frame: pandas DataFrame to extend.
        transrate_dir: directory to scan; must end with "/".

    Returns:
        The accumulated DataFrame (*data_frame* itself if nothing was found).
    """
    for sample_dir in os.listdir(transrate_dir):
        sample_path = transrate_dir + sample_dir + "/"
        for entry in os.listdir(sample_path):
            if not entry.endswith(".fixed"):
                continue
            contigs_csv = sample_path + entry + "/contigs.csv"
            if not os.path.isfile(contigs_csv):
                print("File missing: " + contigs_csv)
                continue
            print(contigs_csv)
            contigs = parse_transrate_stats(contigs_csv)
            data_frame = build_DataFrame(data_frame, contigs)
    return data_frame
def get_assemblies_data(data_frame, transrate_dir):
    """Append every ``assemblies.csv`` under *transrate_dir* to *data_frame*.

    Each entry of *transrate_dir* is expected to hold an ``assemblies.csv``.
    Every candidate path is printed; missing files are also reported and
    skipped.

    Args:
        data_frame: pandas DataFrame to extend.
        transrate_dir: directory to scan; must end with "/".

    Returns:
        The accumulated DataFrame (*data_frame* itself if nothing was found).
    """
    for sample_dir in os.listdir(transrate_dir):
        stats_csv = transrate_dir + sample_dir + "/assemblies.csv"
        print(stats_csv)
        if not os.path.isfile(stats_csv):
            print("File missing: " + stats_csv)
            continue
        stats = parse_transrate_stats(stats_csv)
        data_frame = build_DataFrame(data_frame, stats)
    return data_frame
def get_ref_transrate(transrate_dir):
    """Report which entries of *transrate_dir* contain an ``assemblies.csv``.

    Prints the directory listing, then for each entry its path and whether
    ``assemblies.csv`` exists inside it.  Nothing is returned.

    Args:
        transrate_dir: directory to scan; must end with "/".
    """
    entries = os.listdir(transrate_dir)
    print(entries)
    for entry in entries:
        entry_path = transrate_dir + entry + "/"
        print(entry_path)
        stats_csv = entry_path + "assemblies.csv"
        status = "Exists:" if os.path.isfile(stats_csv) else "Does not exist:"
        print(status + " " + stats_csv)
# Run configuration.
# file_locations maps each data root (basedir) to the run-info CSV (datafile)
# that describes the samples stored under it.
file_locations = {
    "/mnt2/mmetsp/": "MMETSP_SRA_Run_Info_subset_d.csv",
    "/mnt3/mmetsp/": "MMETSP_SRA_Run_Info_subset_a.csv",
    "/mnt4/mmetsp/": "MMETSP_SRA_Run_Info_subset_b.csv",
}

# Directories used by the driver code; each ends with "/" because paths are
# built by plain string concatenation.
extra_dir = "/mnt2/mmetsp3/"
mmetsp_assemblies_dir = "/mnt/MMETSP_assemblies/"
transrate_dir = "/mnt/comparisons/"
reverse_transrate_dir = "/mnt/comparisons_reverse/"

# Empty accumulators that the collected transrate tables are appended to.
data_frame_assemblies = pd.DataFrame()
data_frame_contigs = pd.DataFrame()
#for basedir in file_locations.keys():
# datafile=file_locations[basedir]
# mmetsp_data=get_data(datafile)
# print mmetsp_data
# data_frame=execute(data_frame,mmetsp_data,basedir,extra_dir,mmetsp_assemblies_dir)
#get_extra_assemblies_transrate(extra_dir,transrate_dir,mmetsp_assemblies_dir)
# check if transrate data exists:
#get_ref_transrate(transrate_dir)
# Grab the transrate data after it has completed:
#data_frame_assemblies=get_assemblies_data(data_frame_assemblies,transrate_dir)
#data_frame_contigs=get_contigs_data(data_frame_contigs,transrate_dir)
#print data_frame_contigs
# print the transrate data to files:
#data_frame_assemblies.to_csv("/home/ubuntu/MMETSP/MMETSP_transrate_data.csv")
#if os.path.isfile("/home/ubuntu/MMETSP/MMETSP_transrate_data.csv"):
# print "File written: /home/ubuntu/MMETSP/MMETSP_transrate_data.csv"
#data_frame_contigs.to_csv("/home/ubuntu/MMETSP/MMETSP_transrate_reference_comparisons.csv")
#if os.path.isfile("/home/ubuntu/MMETSP/MMETSP_transrate_reference_comparisons.csv"):
# print "File written: /home/ubuntu/MMETSP/MMETSP_transrate_reference_comparisons.csv"
# Reverse comparison run: print the commands for every sample listed in each
# run-info file, then for the extra dammit assemblies.
for basedir, datafile in file_locations.items():
    mmetsp_data = get_data(datafile)
    print(mmetsp_data)
    execute(mmetsp_data, basedir, mmetsp_assemblies_dir)
get_extra_assemblies_transrate(extra_dir, reverse_transrate_dir, mmetsp_assemblies_dir)

# Report which comparison directories already hold transrate output.
get_ref_transrate(reverse_transrate_dir)

# Collect the transrate tables that have been produced so far.
data_frame_assemblies = get_assemblies_data(data_frame_assemblies, reverse_transrate_dir)
data_frame_contigs = get_contigs_data(data_frame_contigs, reverse_transrate_dir)

# Save the per-assembly statistics and confirm the file landed on disk.
_reverse_stats_csv = "/home/ubuntu/MMETSP/MMETSP_reverse_transrate_data.csv"
data_frame_assemblies.to_csv(_reverse_stats_csv)
if os.path.isfile(_reverse_stats_csv):
    print("File written: " + _reverse_stats_csv)
#data_frame_contigs.to_csv("/home/ubuntu/MMETSP/MMETSP_transrate_reference_comparisons.csv")
#if os.path.isfile("/home/ubuntu/MMETSP/MMETSP_transrate_reference_comparisons.csv"):
# print "File written: /home/ubuntu/MMETSP/MMETSP_transrate_reference_comparisons.csv"