/
tryRrun2.py
executable file
·151 lines (123 loc) · 5.84 KB
/
tryRrun2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/python
import os
import time
import signal
import shutil
import sys
import socket
import counterApplications
import random
import random
## Command-line arguments:
##   argv[1]  directory of this run; the script cd's into it to start R and
##            writes its marker and result files there.
##   argv[2]  requested number of launch attempts (overridden just below).
##   argv[3]  application name, passed to the counterApplications log calls.
tmpDir = sys.argv[1]
numtries = sys.argv[2]
application = sys.argv[3]
numtries = 50 ## I redefine it here, for really stubborn cases.
## NOTE(review): after this line numtries is an int, not the str read from
## argv; anything that concatenates it into a string must convert it first.
MIN_LAM_NODES = 5 ## Highly deployment dependent. But in our clusters
## fewer than 5 nodes means something is seriously wrong.
## Beware that a partially working cluster, if it has fewer than MIN_LAM_NODES
## nodes, will behave weirdly, as tryRrun2.py will start, but then kill, lots
## of running jobs.
# def tryRrun(Rcommand, tmpDir, numtries = 10, application = "SignS")
# """ Try to launch R via os.system, verifying MPI got initialized.
# We try to initiate R up to numtries times; we verify MPI (or Snow)
# got initialized correctly (that requires that R produces an mpiOK file).
# If it doesn't, we call the mpi sanitization scripts to do their job
# and try again.
# """
def collectZombies(k=10):
    """ Make sure there are no zombies in the process tables.

    Calls os.waitpid(-1, os.WNOHANG) up to k times, so up to k children
    that have already exited are reaped. The call never blocks: children
    that are still running are left alone.
    This is probably an overkill, but works.

    k: maximum number of non-blocking waitpid attempts.
    Returns None.
    """
    for _ in range(k):
        try:
            os.waitpid(-1, os.WNOHANG)
        except OSError:
            ## waitpid fails when this process has no children left to wait
            ## for, so further attempts would be pointless. Only OSError is
            ## caught so that KeyboardInterrupt/SystemExit still propagate.
            break
## The following does not work. We would need to capture the output
## from ps, and then get the substring with -sessionsuffix and the
## number = lamSuffix.
## But killing lam kills all slaves and the main process.
## lamdpid = os.popen('ps --ppid ' + str(lampid) + ' -o "%p" --no-headers').readline()
## time.sleep(0.5)

## general cleaning
# buried = os.system("/http/mpi.log/buryThem2.py")
## Run the site clean-up script (presumably kills stale LAM sessions on all
## machines -- going by its name). Its exit status is stored but not checked.
killedlamandr = os.system('/http/mpi.log/killOldLamAllMachines.py')
# cleaned_dirs = os.system('/http/mpi.log/delete_old_dirs.py')

## Usage logging is best effort: a failure here must not prevent the launch.
## Catch Exception (not a bare except) so that KeyboardInterrupt and
## SystemExit are not swallowed.
try:
    counterApplications.add_to_log(application, tmpDir, socket.gethostname())
except Exception:
    pass

## Set to True by the launch loop once R/MPI is known to have started.
startedOK = False
time.sleep(random.uniform(0, 8)) ## to prevent truly simultaneous launches from crashing MPI
def _touch(path):
    """Create path if it does not exist and refresh its timestamps."""
    with open(path, 'a'):
        os.utime(path, None)


def _count_lam_nodes():
    """Return the first field printed by 'lamnodes | wc' (its line count)."""
    with os.popen('lamnodes | wc') as pipe:
        return int(pipe.readline().split()[0])


## Try to launch R up to numtries times. Each attempt boots LAM under a fresh
## session suffix, starts R in the background, waits, and then looks for the
## marker files R is expected to leave in tmpDir. If none is found, the LAM
## session is torn down, the failure is logged, and we try again.
for i in range(int(numtries)):
    _touch(tmpDir + '/numtries_' + str(i)) ## debug

    ## Session suffix built from the time, our pid and a random number.
    lamSuffix = str(int(time.time())) + str(os.getpid()) + \
                str(random.randint(10, 999999))
    with open(tmpDir + '/lamSuffix', 'w') as lamenvfile:
        lamenvfile.write(lamSuffix)
    ## Exported so that the lamnodes/lamhalt/lamwipe commands run below
    ## inherit the same suffix as the lamboot in fullRcommand.
    os.putenv('LAM_MPI_SESSION_SUFFIX', lamSuffix)

    ## (An earlier version used /http/R-custom/bin/R as the R binary.)
    ## NOTE(review): tmpDir is interpolated into a shell command unquoted;
    ## this relies on the caller passing a path without shell metacharacters.
    fullRcommand = ('export LAM_MPI_SESSION_SUFFIX="' + lamSuffix + '";'
                    + '/usr/bin/lamboot -b -H /http/mpi.defs/lamb-host.'
                    + socket.gethostname() + '.def; cd ' + tmpDir
                    + '; sleep 40;'
                    + '/var/www/bin/R-local-7-LAM-MPI/bin/R --no-restore'
                    + ' --no-readline --no-save --slave'
                    + ' <f1.R >>f1.Rout 2> error.msg &')
    counterApplications.add_to_LAM_SUFFIX_LOG(lamSuffix, application, tmpDir,
                                              socket.gethostname())
    ## R is started in the background ('&'), so this returns once lamboot
    ## and the 40 s sleep are done, not when R finishes.
    Rrun = os.system(fullRcommand)
    _touch(tmpDir + '/first_Rrun') ## debug
    time.sleep(100 + random.uniform(1, 12))
    collectZombies()

    ## R already ran to completion: nothing else to verify.
    if os.path.exists(tmpDir + "/RterminatedOK"):
        startedOK = True
        break

    ## R reports MPI as initialized; also require enough LAM nodes.
    if os.path.exists(tmpDir + "/mpiOK"):
        ## Count once and reuse the value for both the check and the debug
        ## file, instead of running lamnodes twice.
        lamNodeCount = _count_lam_nodes()
        ## NOTE(review): the comment on MIN_LAM_NODES says fewer than 5 nodes
        ## is the problem case, yet this requires strictly more than
        ## MIN_LAM_NODES lines of lamnodes output -- confirm whether '>='
        ## was intended.
        if lamNodeCount > MIN_LAM_NODES:
            ## debug
            with open(tmpDir + '/MIN_LAM_NODES_CHECK', 'w') as checkfile:
                checkfile.write(str(lamNodeCount) + '\n')
            startedOK = True
            break

    ## This attempt failed: tear the LAM session down before retrying.
    ## Both steps are best effort and must not abort the retry loop.
    try:
        lamkill = os.system('lamhalt; lamwipe')
    except Exception:
        pass
    ## (Formerly done inline by appending an 'MPI fails on <time>' line to
    ## /http/mpi.log/<application>ErrorLog.)
    try:
        counterApplications.add_to_MPIErrorLog(application, tmpDir,
                                               socket.gethostname())
    except Exception:
        pass
## All attempts failed: log it, stop the polling CGI, and leave an error page
## as the results of this run.
if not startedOK:
    ## Logging (best effort; must not prevent writing the error page).
    ## (Formerly done inline by appending an 'MPI fails on <time>' line to
    ## /http/mpi.log/<application>ErrorLog.)
    try:
        counterApplications.add_to_MPIErrorLog(application, tmpDir,
                                               socket.gethostname(),
                                               'MPI max num crashes')
    except Exception:
        pass

    ## Make sure the checkdone.cgi will stop; we create the two files here
    ## either of which will lead to loading results.html
    for pidfile in ("/natural.death.pid.txt", "/kill.pid.txt"):
        with open(tmpDir + pidfile, "w") as out:
            out.write('MPI initialization error!!')

    with open(tmpDir + "/pre-results.html", "w") as outf:
        outf.write("<html><head><title> MPI initialization problem.</title></head><body>\n")
        outf.write("<h1> MPI initialization problem.</h1>")
        ## numtries is an int at this point (it is rebound to 50 after being
        ## read from argv), so it must be converted: str + int raises
        ## TypeError, which used to abort the script before results.html
        ## was ever written.
        outf.write("<p> After " + str(numtries) + " attempts we have been unable to ")
        outf.write(" initialize MPI.</p>")
        outf.write("<p> We will be notified of this error, but we would also ")
        outf.write("appreciate if you can let us know of any circumstances or problems ")
        outf.write("so we can diagnose the error.</p>")
        outf.write("</body></html>")
    shutil.copyfile(tmpDir + "/pre-results.html", tmpDir + "/results.html")