forked from Farhat/gephcort
/
reanimate.py
executable file
·426 lines (345 loc) · 15.9 KB
/
reanimate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
#!/usr/bin/python
'''
####################################################################################
# This short script is a part of gephcort package v1.0
#
# For any queries related to script, contact: Amol Kolte (amolkolte1989@gmail.com)
####################################################################################
Created on 29-Mar-2012
'''
####################################################################################
# Scanning Inputs
####################################################################################
import sys, getopt
def main(argv):
    """Parse the command line into the module-level run settings.

    Every recognised option is stored in a module-level global that the rest
    of the script reads: ``seq``, ``intree``, ``seq_format``, ``iterations``,
    ``phen``, ``out``, ``ressurect_file`` and ``log_file``.  An option that is
    not supplied leaves its global untouched.

    argv -- argument list without the program name (e.g. ``sys.argv[1:]``).

    Exits the interpreter (SystemExit) in three cases:
      * ``-h``: after printing the short help line;
      * an unparsable command line: after printing the usage text (status 2);
      * an unsupported ``-f`` value: after printing the error (status 1).
    """
    global seq, intree, seq_format, iterations, phen, out, ressurect_file, log_file
    usage = (
        "\n USAGE: python reanimate.py -s <seq_file> -t <tree_file> "
        "-f <format(fasta/phylip)> -i <phen_iterations> -p <phen_file> "
        "-r <ressurect_output_file> -o <output_file> \n \n"
        " \t --seq, -s : SNP sequence file \n"
        " \t --tree, -t : Newick tree \n"
        " \t --seq_format, -f : SNP sequence file format (phylip/fasta) \n"
        " \t --iter, -i : Phenotype shuffling iterations \n"
        " \t --phen, -p : Custom format phenotype file \n"
        " \t --out, -o : Output filename \n"
        " \t --ressurect_file, -r : Output file obtained from ressurect.R for a given seq and tree file \n"
        " \t --log_file, -l : Log file (Optional)\n"
    )
    # User-facing format name -> alignment format name stored in seq_format.
    supported_formats = {"phylip": "iphylip", "fasta": "fasta"}
    # 'tree=' is registered next to 'intree=' because the usage text documents
    # the option as --tree; '--iter' already resolves to '--iterations'
    # through getopt's unique-prefix matching of long options.
    long_options = ['seq=', 'intree=', 'tree=', 'seq_format=', 'iterations=',
                    'phen=', 'out=', 'ressurect_file=', 'log_file=']
    try:
        opts, args = getopt.getopt(argv, "hs:t:f:i:p:o:r:l:", long_options)
    except getopt.GetoptError:
        print(usage)
        # Stop here: without this, the loop below would run with `opts` unbound.
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('\n\n python reanimate.py -s <seq_file> -t <tree_file> -f <format(fasta/phylip)> -i <phen_iterations> -p <phen_file> -r <ressurect_output_file> -o <output_file> -l <log_file> (Optional)\n\n')
            sys.exit()
        elif opt in ("-s", "--seq"):
            seq = arg
        elif opt in ("-t", "--intree", "--tree"):
            intree = arg
        elif opt in ("-f", "--seq_format"):
            if arg not in supported_formats:
                print("\n** Fatal Error : Sequence format not supported !!! Kindly consider converting it into 'fasta' or 'phylip' format **")
                # The message says "fatal": do not continue with seq_format unset.
                sys.exit(1)
            seq_format = supported_formats[arg]
        elif opt in ("-i", "--iterations"):
            iterations = arg
        elif opt in ("-p", "--phen"):
            phen = arg
        elif opt in ("-o", "--out"):
            out = arg
        elif opt in ("-r", "--ressurect_file"):
            ressurect_file = arg
        elif opt in ("-l", "--log_file"):
            log_file = arg
if __name__ == "__main__":
    # Guard clause: with no arguments at all there is nothing to parse, so
    # show the full usage text and stop before any heavy work starts.
    if not sys.argv[1:]:
        print("\n USAGE: python reanimate.py -s <seq_file> -t <tree_file> -f <format(fasta/phylip)> -i <phen_iterations> -p <phen_file> -r <ressurect_output_file> -o <output_file> \n \n \t --seq, -s : SNP sequence file \n \t --tree, -t : Newick tree \n \t --seq_format, -f : SNP sequence file format (phylip/fasta) \n \t --iter, -i : Phenotype shuffling iterations \n \t --phen, -p : Custom format phenotype file \n \t --out, -o : Output filename \n \t --ressurect_file, -r : Output file obtained from ressurect.R for a given seq and tree file \n \t --log_file, -l : Log file (Optional) \n")
        sys.exit()
    # Otherwise hand everything after the program name to the option parser.
    main(sys.argv[1:])
###############################################################################
# Loading required modules
###############################################################################
# Progress banner printed before the third-party modules below are imported.
print "Initializing .. Loading modules"
import time, scipy, math, numpy as np # system commands
from ete2 import PhyloTree # To create a phylogenetic tree
from random import shuffle # Shuffle phenotype
from scipy import stats # To calculate p-value from z-score
from rpy2.robjects.packages import importr
from sets import Set
# Opening log file
# Fall back to a default log name when -l was not given (log_file is then
# never bound, hence NameError) or when the requested path cannot be opened.
try:
    log = open(log_file, "w")
except (NameError, TypeError, IOError, OSError):
    log = open("gephcort_run.log", "w")
# Logging start time.  The clock is read once so that every field comes from
# the same instant; the field order (year-day-month, then h:m:s, no zero
# padding) is the one this log has always used.
start_stamp = time.localtime()
log.write("Start time: %d-%d-%d\t%d:%d:%d\n" % (
    start_stamp[0], start_stamp[2], start_stamp[1],
    start_stamp[3], start_stamp[4], start_stamp[5]))
# R object names in "ape" that get an explicit Python-side name: each dotted
# R name is paired with the same name using underscores.  The mapping is
# handed to importr() as its robject_translations argument.
ape_objects = dict(
    (r_name, r_name.replace(".", "_"))
    for r_name in ("delta.plot", "dist.dna", "dist.nodes", "node.depth",
                   "node.depth.edgelength", "node.height", "node.height.clado",
                   "prop.part")
)
ape = importr("ape", robject_translations=ape_objects)  # Required for phangorn
ph = importr("phangorn")  # Phylogenetic operations in R
print("All modules imported successfully")
# Two independent trees are built from the same Newick file:
#   t   - main tree, with the sequence alignment attached
#   dtp - dummy tree used for phenotype shuffling
t = PhyloTree(intree, alignment=seq, alg_format=seq_format)
dtp = PhyloTree(intree)
print("Tree file read successfully")
# Phenotype file: one taxon per line, fields separated by tabs.
#
#   dog1    3.2
#   dog2    4.4
#   cat2    4.5
#
# Two columns (taxon, value) give the point range [value, value].
# Three columns (taxon, value, error) give [value - error, value + error].
# The layout is decided once, from the first data line.
phenlist = []
with open(phen, "r") as phenfile:  # Phenotype file
    for line in phenfile:
        if not line.strip():
            continue  # tolerate blank / trailing empty lines
        # Keep every column: the three-column layout needs the third one.
        phenlist.append([field.strip() for field in line.split("\t")])

phenotype = {}  # Dictionary containing species names and their phenotype [low, high] range
column_count = len(phenlist[0]) if phenlist else 0
if column_count < 2:
    print("Phenotype file format not supported, please go through the instructions")
    sys.exit(1)
for fields in phenlist:
    try:
        value = float(fields[1])
        if column_count == 3:
            spread = float(fields[2])
            phenotype[fields[0]] = [value - spread, value + spread]
        else:
            phenotype[fields[0]] = [value, value]
    except (ValueError, IndexError):
        # float() raises ValueError on a non-numeric field; IndexError means
        # the line has fewer columns than the first line announced.
        print("Error while readling phenotype file")
        sys.exit(1)

missing = 18  # phangorn value for missing genotype
# shphenotype[0] is the list of phenotype ranges that is shuffled in place
# for the permutation test.
shphenotype = []
shphenotype.append(list(phenotype.values()))
#########################################################################################
# Reading complete ancestral sequence data generated through R (In the form of pseudo-patterns)
#########################################################################################
print("Reading complete ancestral sequence data generated through R")
rtree = PhyloTree(intree)  # Tree for "R" generated patterns
tree = ape.read_tree(intree)  # The same Newick file read through R's ape; queried below for parent node numbers
rlist = []
# ressurect_file is the R-generated table: one space-separated row per node,
# the first field being the row label and the rest the per-site states.
with open(ressurect_file, "r") as ropf:
    for tab in ropf:
        rlist.append(tab.rstrip().split(" "))
ori = np.array(rlist)

# Row label -> 1-based row number, built once instead of rescanning the first
# column for every leaf.  A repeated label keeps its last row, which is what
# the former full scan ended up with as well.
row_of_label = {}
for position, label in enumerate(ori[:, 0]):
    row_of_label[str(label)] = position + 1

site_count = len(rlist[0]) - 1  # minus one: each row begins with the name of the species
for node in rtree.traverse("postorder"):  # Patterns are being linked to their corresponding nodes
    node.add_features(data=[None] * site_count)
    if node.is_leaf():
        # Labels in the R table are wrapped in double quotes.
        quoted_name = '"' + node.name + '"'
        if quoted_name not in row_of_label:
            raise ValueError("Taxon %s of the tree has no row in %s"
                             % (node.name, ressurect_file))
        node.add_features(rtoken=row_of_label[quoted_name])
    else:
        node.add_features(rtoken=None)

# Postorder guarantees that a node's row number (rtoken) was already set by
# one of its children before the node itself is visited.
for node in rtree.traverse("postorder"):
    node.data = list(rlist[node.rtoken - 1][1:])  # the sequence after the name
    if not node.is_leaf():
        node.name = node.rtoken
    if node.up is None:
        # The root has no parent to label.  Testing for it explicitly (rather
        # than catching whatever the parent assignment raises) lets genuine
        # errors on other nodes surface instead of being reported as "root".
        print("Root node is encountered, ancestral node mapping complete")
    else:
        node.up.rtoken = int(ph.Ancestors(tree, node.rtoken, "parent")[0])
###############################################################################################
# Phenotype manipulations
###############################################################################################
print("Starting Phenotype manipulations")
def phrange(node):
    """Fill in ``phenrange`` and ``phenvalue`` for ``node`` from its two children.

    A child whose ``phenrange`` is still None is resolved first by recursion.
    The two child ranges are ordered, then combined:
      * if they overlap, the node gets the overlapping part;
      * if they are disjoint, the node gets the gap between them;
      * if they merely touch, the node gets that single point.
    ``phenvalue`` (the mean of the two range bounds) is then written on the
    node and on both children.  Nothing is returned.

    node -- a tree node with exactly two entries in ``children``.
    """
    first, second = node.children
    for child in (first, second):
        if child.phenrange is None:
            phrange(child)

    # lower is the range that sorts first, upper the one that sorts second.
    lower, upper = sorted([first.phenrange, second.phenrange])
    if upper[0] < lower[1]:
        merged = [upper[0], min(lower[1], upper[1])]
    elif upper[0] > lower[1]:
        merged = [lower[1], upper[0]]
    else:
        merged = [upper[0], lower[1]]
    node.phenrange = merged

    for member in (node, first, second):
        member.phenvalue = np.average(member.phenrange)
# Number every node of the phenotype tree in preorder.  Leaves start with the
# phenotype range read for their taxon; internal nodes start unresolved.
counter = 0
for node in dtp.traverse("preorder"):
    if node.is_leaf():
        initial_range = phenotype[node.name]
    else:
        initial_range = None
    node.add_features(counter=counter, phenrange=initial_range, phenvalue=None)
    counter += 1

# sflphen[k] lists the phenvalue of every node in preorder.  Row 0 comes from
# the phenotypes as read from file; each later row comes from the shuffle done
# in the previous pass of the loop below.
sflphen = []
phrange(dtp)
for p in xrange(int(iterations)):
    shuffle(shphenotype[0])
    snapshot = []
    sflphen.append(snapshot)
    index = 0
    for node in dtp.traverse("preorder"):
        # Record the value computed for the current assignment first ...
        snapshot.append(node.phenvalue)
        # ... then reset the tree for the freshly shuffled assignment.
        if not node.is_leaf():
            node.phenrange = None
            continue
        node.phenrange = shphenotype[0][index]
        index += 1
    phrange(dtp)
##########################################################################################
# Generating patterns from R output data
##########################################################################################
# Pattern scoring on the R-derived data held on rtree: for every position of
# the per-node `data` lists, the leaf states are reduced to a pattern tuple
# and a p-value is looked up or computed for that pattern.
# NOTE(review): leading indentation is not visible in this copy of the file,
# so the nesting of the statements below is inferred from their order only -
# confirm against the original before restructuring anything here.
print "Generating patterns from R output data"
# Pick a leaf of rtree as reference; only len(ref.data) is used further down.
single=True
for node in rtree.traverse("postorder"):
if single==True:
if node.is_leaf():
ref=node
single=False
dt=rtree
pattern={} # pattern tuple -> p-value (-1 when no p-value could be computed)
sfldict={} # tuple of (parent counter, child counter) pairs -> p-value
bchanges={} # pattern tuple -> number of (parent, child) pairs recorded for it
#mafreq={} # Stores minor allele frequency for individual SNP
# Number the nodes in preorder.  node.counter is later used as an index into
# the rows of sflphen, which were filled in preorder over dtp; dtp and rtree
# are both built from the same Newick file.
counter=0
for node in dt.traverse('preorder'):
node.add_features(counter=counter)
counter += 1
# Number of statistical tests being performed
tests=0
for var in xrange(len(ref.data)):
currentdata=[]
# State "18" always maps to code 18; every other state gets 0, 1, 2, ... in
# order of first appearance among the leaves (postorder).
currentdict={"18" : 18}
token=0
for node in dt.traverse("postorder"):
if node.is_leaf():
try:
currentdata.append(currentdict[node.data[var]]) # Pattern recognition
except:
currentdata.append(token)
currentdict[node.data[var]]=token
token+=1
else:
pass
tdata=tuple(currentdata)
# if tdata.count(0) < tdata.count(1):
# af=tdata.count(0)*100/(tdata.count(0)+tdata.count(1))
# elif tdata.count(0) > tdata.count(1):
# af=tdata.count(1)*100/(tdata.count(0)+tdata.count(1))
# else:
# af=50
# Reuse the p-value if this leaf pattern was already scored.
try:
pvalue=pattern[tdata]
except:
# Collect [parent counter, child counter] for every child whose state at this
# position differs from its parent's state.
# NOTE(review): node.data holds strings (fields split from the R table) while
# `missing` is the int 18, so the `!= missing` tests below look like they are
# always true - confirm whether the string "18" was meant.
temp=[]
for node in dt.traverse('preorder'):
if not node.is_leaf():
left, right = node.children
if left.data[var] != node.data[var]: # Nucleotide substitution
if left.data[var] != missing :
temp.append([node.counter, left.counter])
if right.data[var] != node.data[var]: # Nucleotide substitution
if right.data[var] != missing :
temp.append([node.counter, right.counter])
temp.sort()
tpp=map(tuple, temp)
tp=tuple(tpp)
dtemp=[]
# Reuse the p-value if this exact set of changing branches was already scored.
try:
pvalue=sfldict[tp]
except:
# One entry per row of sflphen: the mean absolute phenvalue difference over
# the collected (parent, child) pairs.  math.pow(x, 1) leaves the mean as is.
for index in xrange(len(sflphen)):
tsum=0.0
if not len(temp) == 0:
for n in xrange(len(temp)):
tsum += abs(sflphen[index][temp[n][0]]-sflphen[index][temp[n][1]])
dtemp.append(math.pow((tsum/len(temp)),1))
else :
dtemp.append('NA') # if all alleles are identical
if 'NA' in dtemp:
pvalue=-1
else:
# Two-sided normal tail probability of the first entry (dtemp[0]) measured
# against the mean and standard deviation of the remaining entries.
pvalue=scipy.stats.norm.sf(abs(dtemp[0] - np.average(dtemp[1:])) / np.std(dtemp[1:]))*2
tests += 1
sfldict[tp]=pvalue
pattern[tdata]=pvalue
bchanges[tdata]=len(tp)
# mafreq[tdata]=af
###############################################################################################
# Reading original sequence, evaluating by breaking into patterns
###############################################################################################
print("Reading orignal sequence, evaluating by breaking into patterns")
# The leaves of the alignment tree in postorder, collected once: the per-site
# loop below only ever looks at leaves, in this order.
alignment_leaves = [node for node in t.traverse("postorder") if node.is_leaf()]
oriref = alignment_leaves[0]  # first leaf; supplies the alignment length

site_total = len(oriref.sequence)
plist = [None] * site_total  # per site: p-value of its pattern (None if the pattern is unknown)
blist = [None] * site_total  # per site: value stored in bchanges for its pattern
#maflist=[None for var in xrange(len(oriref.sequence))]
for var in xrange(site_total):
    # Gap "-" always maps to code 18; every other character gets 0, 1, 2, ...
    # in order of first appearance among the leaves.
    currentdict = {"-": 18}
    currentdata = []
    token = 0
    for node in alignment_leaves:
        symbol = node.sequence[var]
        if symbol not in currentdict:  # Pattern recognition
            currentdict[symbol] = token
            token += 1
        currentdata.append(currentdict[symbol])
    tdata = tuple(currentdata)
    try:
        plist[var] = pattern[tdata]
        blist[var] = bchanges[tdata]
        # maflist[var]=mafreq[tdata]
    except KeyError:
        # Only a pattern that was never scored is expected here; any other
        # error should surface rather than be reported as a missing pattern.
        print("Patterns missing")
#############################################################################################
# Performing multiple correction
#############################################################################################
# Distinct p-values that take part in the correction.  Two placeholders are
# dropped first: -1 (stored when no p-value could be computed for a pattern)
# and None (left in plist for a site whose pattern was not found).
uniq_plist = set(plist)
uniq_plist.discard(-1)
uniq_plist.discard(None)
uniq_plist = list(uniq_plist)
sorted_uniq_plist = sorted(uniq_plist)
corrected = {}  # corrected[raw_pvalue]=[fdr_pvalue, bonferroni_pvalue]
# Benjamini-Hochberg step-up adjustment over the distinct p-values:
#     adjusted(p_k) = min over j >= k of (p_j * m / j), capped at 1
# where k is the 1-based rank in ascending order and m the number of distinct
# p-values.  Walking from the largest p-value down lets a running minimum
# enforce that adjusted values never decrease with rank.  The smallest
# p-value is therefore scaled by m and the largest by 1.
distinct_total = len(sorted_uniq_plist)
fdr_floor = 1
for rank in range(distinct_total, 0, -1):
    entry = sorted_uniq_plist[rank - 1]
    fdr_floor = min(fdr_floor, entry * distinct_total / float(rank))
    corrected[entry] = [fdr_floor, min(1, entry * distinct_total)]
#############################################################################################
# Writing output
#############################################################################################
# Open the requested result file; on any failure (including `out` never
# having been set) fall back to a fixed path under /tmp.
try:
    opf = open(out, "w")
except:
    opf = open("/tmp/gpresult.dat", "w")

# Header, then one tab-separated row per site: index, raw p-value,
# FDR-adjusted p-value, Bonferroni-adjusted p-value.  Sites whose p-value has
# no entry in `corrected` get NA in all three value columns.
opf.write('\t'.join(['#SNP_index', 'p-value', 'p.adjusted_FDR', 'p.adjusted_Bonferroni']) + '\n')
for i, raw_pvalue in enumerate(plist):
    if raw_pvalue in corrected:
        adjusted = corrected[raw_pvalue]
        columns = [str(i), str(raw_pvalue), str(adjusted[0]), str(adjusted[1])]
    else:
        columns = [str(i), 'NA', 'NA', 'NA']
    opf.write('\t'.join(columns) + '\n')
opf.close()
#############################################################################################
# Writing Summary Log
#############################################################################################
# End time: the clock is read once so that all six fields describe the same
# instant.  Field order (year-day-month, then h:m:s, no zero padding) matches
# the "Start time" line written when the log was opened.
end_stamp = time.localtime()
log.write("End time: %d-%d-%d\t%d:%d:%d\n\n" % (
    end_stamp[0], end_stamp[2], end_stamp[1],
    end_stamp[3], end_stamp[4], end_stamp[5]))
# Run summary: the inputs used, followed by the counts gathered during the run.
log.write("Genotype file\t" + seq + "\n")
log.write("Newick file\t" + intree + "\n")
log.write("Phenotype file\t" + phen + "\n")
log.write("Iterations for permutation test\t" + str(iterations) + "\n")
log.write("Number of SNPs\t" + str(len(oriref.sequence)) + "\n")
log.write("Total number of observed patters\t" + str(len(pattern)) + "\n")
log.write("Number of statistical tests performed\t" + str(tests))
log.close()
print("Operation complete")
#################################################################################################