-
Notifications
You must be signed in to change notification settings - Fork 0
/
mpi_nstat.py
executable file
·376 lines (313 loc) · 12.4 KB
/
mpi_nstat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
#!/usr/bin/env python
#from __future__ import with_statement
import pypar
import corrcperm
import sys
import time
import cPickle
#import cProfile
import random
import getopt
import math
# This script uses MPI (via pypar) to calculate p-values from the N-statistic.
# A vector of N-statistics is generated for each permutation of the genetic data
# by distributing the work for each permutation to different nodes.
# The N-statistics are then collected at the server and combined into a vector of p-values.
# Parameters handed to makerandomdata() when simulated data is requested (-t).
# presumably the mean and standard deviation of the generated values -- TODO confirm
mu = 0.0
std = 2.0
# Number of MPI processes in this run as reported by pypar (the server plus all workers).
numnodes = pypar.size()
class task:
    """Run parameters plus the data set used by the server and by every worker.

    The class attributes below are the defaults; __init__ overrides them from
    the command line and then builds self.data via initdata().
    """
    # These are the default run parameters.
    # 'Hyperdip_n.pck' and 'Tel_n.pck': The Hyperdip and Tel childhood leukemia
    # data provided by St. Jude Children's research hospital.
    simulatedata = False
    file1 = 'Hyperdip_n.pck'
    file2 = 'Tel_n.pck'
    outfilename = 'pvals'
    numthreads = 1
    kern = 2
    seed = 12345
    logprefix = ''
    genes = 1000
    columns = 80
    perms = 1000
    groupsize = columns // 8  # derived from the *default* column count only
    fisher = True  # Fisher's transformation
    pvaluethreshold = -1  # -1 means "not given"; replaced by perms in __init__

    def __init__(self, argv):
        """Parse the command line and load (or simulate) the data.

        argv -- full argument list. argv[0] and everything from the first
                literal 'last' onward are discarded; 'last' is mandatory.

        Any parse error ends the run through abnormalexit().
        """
        # HACK: the arguments after 'last' belong to the MPI launcher, not to us.
        # Only the first 'last' counts (the old scan re-sliced argv for every
        # later occurrence and threw away real options).
        try:
            end = argv.index('last')
        except ValueError:
            abnormalexit("please end the arguments to the script with 'last'")
        argv = argv[1:end]

        try:
            opts, args = getopt.getopt(argv, 't' ,['genes=', 'columns=', 'groupsize=', 'permutations=', 'kernel=', 'numthreads=','seed=', 'file1=', 'file2=', 'outfile=', 'logprefix=', 'threshold=', 'fisher', 'nofisher'])
        except getopt.GetoptError:
            # sys.exc_info() keeps this clause valid on every Python 2.x and 3.x
            err = sys.exc_info()[1]
            abnormalexit(str(err))

        # option -> attribute, for options whose value is evaluated as an expression
        evaluated = {
            '--genes': 'genes',
            '--columns': 'columns',
            '--groupsize': 'groupsize',
            '--permutations': 'perms',
            '--kernel': 'kern',  # was "self.kern == eval(a)": a no-op comparison
            '--numthreads': 'numthreads',
            '--seed': 'seed',
            '--threshold': 'pvaluethreshold',
        }
        # option -> attribute, for options whose value is stored as the raw string
        verbatim = {
            '--file1': 'file1',
            '--file2': 'file2',
            '--outfile': 'outfilename',
            '--logprefix': 'logprefix',
        }
        # option -> (attribute, value), for options that take no argument
        switches = {
            '-t': ('simulatedata', True),
            '--fisher': ('fisher', True),
            '--nofisher': ('fisher', False),
        }
        for o, a in opts:
            if o in evaluated:
                # NOTE(security): eval() runs arbitrary expressions from the
                # command line; only use with trusted invocations.
                setattr(self, evaluated[o], eval(a))
            elif o in verbatim:
                setattr(self, verbatim[o], a)
            elif o in switches:
                name, value = switches[o]
                setattr(self, name, value)
            else:
                abnormalexit('unrecognized argument: '+o)

        if self.pvaluethreshold == -1:
            self.pvaluethreshold = self.perms
        if self.groupsize > self.columns:
            abnormalexit('groupsize greater than number of columns')
        self.initdata()

    def _loadpickle(self, filename):
        """Unpickle and return the object stored in filename, always closing the file."""
        # NOTE(security): unpickling can execute arbitrary code; file1/file2
        # must only name trusted files.
        handle = open(filename, 'rb')
        try:
            return cPickle.load(handle)
        finally:
            handle.close()

    def initdata(self):
        """Build self.data from simulated values (-t) or from file1 and file2."""
        # If this isn't done, python seeds from the system time or the OS random
        # source. Since each MPI node generates simulated data separately, the
        # seed must be fixed so that they all generate the same data.
        random.seed(self.seed)
        if self.simulatedata:
            # set up the program with generated data
            self.data = corrcperm.corrcperm(self.genes, self.columns, self.groupsize, self.fisher)
            self.data.makerandomdata(self.seed, mu, std)
            return

        # use real data
        condition1 = self._loadpickle(self.file1)
        condition2 = self._loadpickle(self.file2)
        if condition1.shape != condition2.shape:
            sys.exit('Error: conditions are not the same size')
        # the loaded data overrides any --genes / --columns given on the command line
        self.genes = condition1.shape[0]
        self.columns = condition1.shape[1]
        self.data = corrcperm.corrcperm(self.genes, self.columns, self.groupsize, self.fisher)
        for rownum, row in enumerate(condition1):
            for col, value in enumerate(row):
                self.data.data_set(rownum, col, value)
        # the second condition is stored to the right of the first one
        for rownum, row in enumerate(condition2):
            for col, value in enumerate(row):
                self.data.data_set(rownum, col+self.columns, value)

    def dojob(self, job):
        """This is where the actual computation in the MPI nodes occurs.

        job -- (vec, invalidated): a permutation vector and the gene indices
               to pass to ignoregene() before computing.

        Returns the list of getNstat(gene) values for every gene in self.data.
        """
        vec, invalidated = job
        for i in invalidated:
            self.data.ignoregene(i)
        setpermutevec(self.data, vec)
        self.data.rearrangeall()
        self.data.threadedallNstats(self.kern, self.numthreads)
        # Return the data to its original arrangement.
        # This is not strictly necessary, but it makes the program deterministic
        # (and otherwise, the data will be completely scrambled at the end).
        undorearrange(self.data, vec)
        return [self.data.getNstat(gene) for gene in xrange(self.data.genes)]

    def printruninfo(self):
        """Print the effective run parameters, one 'label value' pair per line."""
        summary = (
            ('genes', self.genes),
            ('columns', self.columns),
            ('perms', self.perms),
            ('kern', self.kern),
            ('numthreads', self.numthreads),
            ('groupsize', self.groupsize),
            ('numnodes', numnodes),
            ('threshold', self.pvaluethreshold),
            ('fisher', self.fisher),
        )
        for label, value in summary:
            # single-argument print: same output under Python 2 and 3
            print('%s %s' % (label, value))
def setpermutevec(data, vec):
    """Copy the permutation vector vec into data, slot by slot.

    Calls data.permutevecset(position, value) for each value of vec in order.
    """
    position = 0
    for target in vec:
        data.permutevecset(position, target)
        position += 1
def undorearrange(data,vec):
    """Load the inverse of vec into data and apply it.

    For each (position, value) of vec this calls data.permutevecset(value,
    position), i.e. the arguments swapped relative to setpermutevec, and then
    calls data.rearrangeall() once.
    """
    position = 0
    for target in vec:
        data.permutevecset(target, position)
        position += 1
    data.rearrangeall()
def sendtoall(msg):
    """Send msg, tagged OUT, to every process with rank 1 .. numnodes-1."""
    proc = 1
    while proc < numnodes:
        pypar.send(msg, proc, tag=OUT)
        proc += 1
def abnormalexit(reason):
    """Tell each worker node to exit, then end the server process.

    Prints 'abnormal exit' and reason, sends ('Die', 0) to all workers, goes
    through the pypar barrier/finalize pair and exits with status 2.
    This should only be called by the server node.
    """
    print('abnormal exit')
    print(reason)
    sendtoall(('Die', 0))
    pypar.barrier()
    pypar.finalize()
    # same effect as sys.exit(2)
    raise SystemExit(2)
############
# This process's MPI rank; main() runs the server on rank 0 and a worker on every other rank.
rank = pypar.rank()
# Processor name reported by pypar; only used in the status/log output.
procname = pypar.get_processor_name()
# These are message tags.
# "OUT" messages go from the server to the workers.
# "RETURN" messages go from the workers to the server.
# The values of these variables should remain constant!
OUT = 0
RETURN = 1
class invalidatedgenes:
    """Ordered record of invalidated genes with a per-worker read cursor.

    invalidatedlist holds gene numbers in the order they were invalidated;
    workerindices[w] is how far into that list worker w has already been told.
    """

    def __init__(self, numworkers, numgenes):
        self.invalidatedlist = []
        # gene number -> has it been invalidated yet?
        self.invalidated = dict.fromkeys(range(numgenes), False)
        # one cursor per slot 0..numworkers, each starting at zero
        self.workerindices = [0] * (numworkers + 1)

    def invalidategene(self, genenum):
        """Mark genenum as invalidated; repeated calls for the same gene are no-ops."""
        if self.invalidated[genenum]:
            return
        self.invalidated[genenum] = True
        self.invalidatedlist.append(genenum)

    def getnewinvalidatedgenes(self, workernum):
        """Return the genes invalidated since workernum last asked, and advance its cursor."""
        cursor = self.workerindices[workernum]
        fresh = self.invalidatedlist[cursor:]
        self.workerindices[workernum] = len(self.invalidatedlist)
        return fresh
class serverdata:
    """Server-side state: the run options and the invalidated-gene record."""

    def __init__(self, t):
        # t is of type task
        self.opts = t
        workers = numnodes - 1  # every process except the server
        self.invalidatedgeneset = invalidatedgenes(workers, t.genes)
class jobgenerator:
    """Generates jobs for dojob, at most server.opts.perms of them."""
    jobindex = 0  # number of jobs handed out so far

    def __init__(self, server):
        self.server = server
        self.opts = server.opts
        self.numjobs = self.opts.perms

    def __iter__(self):
        return self

    def hasnext(self, workernum):
        """Are there new jobs available? (workernum is not consulted.)"""
        return self.jobindex < self.numjobs

    def next(self, workernum):
        """Get the next job for workernum.

        Returns (jobindex, (permutation, invalidated)): a 1-based job number, a
        shuffled list of the 2*columns column indices, and the genes invalidated
        since this worker's previous job. Raises StopIteration when all jobs
        have been handed out.
        """
        if not self.jobindex < self.numjobs:
            raise StopIteration
        self.jobindex += 1
        permutation = list(range(self.opts.columns * 2))
        random.shuffle(permutation)
        print(self.jobindex)
        fresh = self.server.invalidatedgeneset.getnewinvalidatedgenes(workernum)
        return (self.jobindex, (permutation, fresh))
class resultcollector:
    """Collects the responses from the worker nodes and combines them at the server."""

    def __init__(self, server):
        # server is of type serverdata
        self.server = server
        self.t = server.opts
        # N-statistics of the unpermuted data: an identity "permutation" and no ignored genes
        self.nstats = server.opts.dojob((range(server.opts.columns*2), []))
        # per-gene counters; floats so the division in finish() is a true division
        self.pvals = [0.0] * server.opts.genes

    def collect(self, response):
        """Fold one worker response (jobnum, result) into the per-gene counters.

        A gene's counter goes up by one when its permuted statistic is >= the
        unpermuted one; once the counter exceeds pvaluethreshold the gene is
        handed to the server's invalidated-gene record.
        """
        _jobnum, result = response
        baseline = self.nstats
        for gene, stat in enumerate(result):
            if not stat >= baseline[gene]:
                continue
            self.pvals[gene] += 1
            if self.pvals[gene] > self.t.pvaluethreshold:
                self.server.invalidatedgeneset.invalidategene(gene)

    def finish(self):
        """Call this when all the results are in.

        Turns the counters into fractions of perms and pickles the resulting
        list to the configured output file.
        """
        total = self.t.perms
        self.pvals = [count/total for count in self.pvals]
        # write the results out to a file
        outfile = open(self.t.outfilename, 'w')
        cPickle.dump(self.pvals, outfile)
        outfile.close()
######################################################
def main():
    """Run the server on rank 0 and a worker on every other rank, then shut MPI down."""
    if rank == 0:
        _runserver()
    else:
        _runworker()
    # end of parallel code
    pypar.barrier()
    pypar.finalize()


def _runserver():
    """Server side of main(): hand out permutation jobs and combine the results."""
    print('server running on  %s' % (procname,))
    opts = task(sys.argv)
    opts.printruninfo()
    sendtoall(('Start', sys.argv))
    server = serverdata(opts)

    # set up the collector and generator; constructing the collector computes
    # the unpermuted statistics, so that step is timed and reported
    began = time.time()
    collector = resultcollector(server)
    print(time.time() - began)
    jobs = jobgenerator(server)
    received = 0

    # begin distributing work: one job per worker, or fewer if jobs run out first
    firstidle = min(numnodes, jobs.numjobs+1)
    for proc in xrange(1, firstidle):
        pypar.send(('job', jobs.next(proc)), proc, tag=OUT)

    # while any job is still running
    while received < jobs.jobindex:
        # wait for any node to send a result
        msg, status = pypar.receive(pypar.any_source, return_status=True, tag=RETURN)
        received += 1
        proc, response = msg
        if jobs.hasnext(proc):
            # more work to be done: send it to the node that just completed
            pypar.send(('job', jobs.next(proc)), proc, tag=OUT)
        # combine the results *after* sending the new job
        # (this way the worker can proceed while the results are being combined)
        collector.collect(response)

    # all jobs collected, release the workers
    sendtoall(('Done', 0))
    # finish up the computation
    collector.finish()


def _runworker():
    """Worker side of main(): obey ('Start'|'job'|'Done'|'Die', payload) messages from rank 0."""
    while True:
        waitstart = time.time()
        (code, msg), status = pypar.receive(0, return_status=True, tag=OUT)
        print('waiting %s' % (time.time() - waitstart,))

        if code == 'Done':
            # all work is done
            opts.printruninfo()
            break
        if code == 'Die':
            # abnormal exit
            break
        if code == 'Start':
            # msg is the server's argv; from here on output goes to a per-rank logfile
            opts = task(msg)
            sys.stdout = open(opts.logprefix + '%02d.log' % rank, 'w')
            print('client %s running on %s' % (rank, procname))
            continue

        # any other code carries a job
        jobstart = time.time()
        jobnum, job = msg
        print(jobnum)
        result = opts.dojob(job)
        print('working %s %s' % (jobnum, time.time() - jobstart))

        # return the result to the server
        sendstart = time.time()
        pypar.send((rank, (jobnum, result)), 0, tag=RETURN)
        print('sending %s' % (time.time() - sendstart,))
# Run the program and print the total wall-clock time in seconds.
# To profile instead, replace the main() call with:
#cProfile.run('main()', 'nstatprof')
start = time.time()
main()
end = time.time()
print(end - start)