This repository has been archived by the owner on May 21, 2021. It is now read-only.
/
changed_harness2.py
executable file
·644 lines (584 loc) · 23.5 KB
/
changed_harness2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
#!/usr/bin/python2.6
import os,sys,time,thread,select,re,threading,socket
from opts import opts
from ssh import SSH
from readnwrite import unblocked_read
from threading import Thread
from Queue import Queue
import logging,exceptions
from adv_auto_start import check_for_new_build,get_latest_build,get_current_build
from conf import readConf,BadConf
#from test import Build_KILL
'''FIX ME
1) Remove all the global variables and make them static.
2) Bugs during interaction with the menu are highly probable because the menu functions call each other in a merry-go-round; watch out for it.
3) Temporary fix for the build detection.
Newly added code includes a controlled and timed number of Ctrl-C attempts, making the slave exit when the master is dead during the master_wait sleep, and handling of the race between the master thread and the slave thread during restarts.
'''
# ---------------------------------------------------------------------------
# Shared module state. Everything below is read and written by the main
# thread and by every Job thread without any locking.
# ---------------------------------------------------------------------------
done = False # only ever named in `global` statements in the visible code
kill = False # global kill switch: set by kill_all(), polled by the Job threads
JOB_HASH = [] # list indexed by job id; True while that job is flagged as up (command launched / port reachable)
KILL_DICT = {} # conf file name -> True once that single job has been asked to stop
DONE_HASH = [] # list indexed by job id; True once that job has finished (or its conf was bad)
cid=[] #list of all currently running jobs (conf file names)
pending_jobs= [] #list of all queued jobs (delayed, or waiting on their master)
#pending_jobs= 0
DYN_RESTART = {} #dict of the conf file name and its corresponding arguments to the Job object
dead_jobs = set() #set of all dead jobs
ALL_JOBS = {} #dict of all the conf files and corresponding job ids
DEPENDENCY_DICT = {} #dict of dependent conf files, keyed by their master's job id
BUILD_KILL=False # set by external code / the build-check thread to stop every running app; do not touch elsewhere
WAIT_TILL_CLEAR = False # raised during build detection while the running jobs are being killed before a relaunch
BUILD_RESTART = 0 # number of times a new build has been detected and the jobs were relaunched
BAD_CONF = [] # conf files that failed to parse
class BuildRestart(Exception):
    """Raised by signal_handle() after BUILD_KILL stopped every job.

    The script entry point catches it to decide between relaunching all the
    jobs and exiting.
    """

    MESSAGE = 'somebody called the BUILD_KILL'

    def __init__(self):
        # Initialise the base class so str()/repr() and tracebacks carry the
        # message; the constructor still takes no arguments.
        Exception.__init__(self, self.MESSAGE)

    def __str__(self):
        # The original defined `__string__`, which Python never calls, and
        # printed instead of returning the text.
        return self.MESSAGE

    def __string__(self):
        # Kept for backward compatibility with any explicit caller: prints
        # the message exactly as before.
        sys.stdout.write(self.MESSAGE + '\n')
class Job(Thread):
    """Thread that runs one configured command in a remote shell over SSH.

    A job is described by a conf file (parsed by ``readConf``). The thread
    optionally waits for a delay and/or for its master job, opens a shell
    through ``SSH(...).fork()``, changes directory, launches the command and
    then reads the shell output until the prompt (``user@host``) shows up
    again, logging the output on the way. Progress is published through the
    module-level JOB_HASH / DONE_HASH / KILL_DICT / cid / pending_jobs /
    dead_jobs containers, which are shared without locking.

    NOTE(review): the indentation of this file was lost; the nesting below
    was reconstructed from the control flow and should be confirmed against
    the original.
    """

    def __init__(self, items):
        """Build the job from ``(conf_path, job_id, master_id, build)``.

        ``Thread.__init__`` is only called when the conf file parsed; a job
        with a bad conf is recorded in BAD_CONF, flagged as done and must not
        be started (check ``CONF_GOOD``).
        """
        global KILL_DICT, DONE_HASH, BAD_CONF
        self.CONF_GOOD = True
        self.tdata = ''
        path, self.JOB_ID, self.MASTER_ID, self.build = items
        DONE_HASH[self.JOB_ID] = False
        self.path = path
        try:
            self.conf = readConf(path)
        except BadConf:
            self.CONF_GOOD = False
            print('Bad configuration file %s' % (self.path,))
        # A fresh job always starts with its individual kill flag cleared.
        # (The original also assigned a *local* `kill = False` here, which
        # had no effect and was dropped.)
        KILL_DICT[self.path] = False
        if self.CONF_GOOD:
            self.initialize(path)
            Thread.__init__(self)
        else:
            BAD_CONF.append(path)
            DONE_HASH[self.JOB_ID] = True

    def initialize(self, path):
        """Copy the parsed conf values onto the instance and reset run state."""
        conf = self.conf
        self.user = conf['user'].rstrip('\n')
        self.host = conf['host'].rstrip('\n')
        # Shell prompt fragment that marks "command finished".
        self.EXPECT = self.user + '@' + self.host
        self.port = int(conf['port'].rstrip('\n'))
        self.BUILD_IDENTIFIER = ''
        # A negative port marks a sequential job: its dependent waits for
        # this job to be DONE rather than to be up.
        self.SEQ = self.port < 0
        if self.build != '':
            # Directory name right above ".../sandbox" in the build path.
            # NOTE(review): raises AttributeError when the build path does
            # not contain "/<name>/sandbox" - unchanged from the original.
            self.BUILD_IDENTIFIER = str(
                re.search('.*/([^/]+)/+sandbox.*', self.build).group(1))
        log_base = conf['logs'].rstrip('\n')
        if BUILD_RESTART > 0:
            # After a build restart use a distinct log name per build.
            if self.build != '':
                self.log = log_base + self.BUILD_IDENTIFIER
            else:
                self.log = log_base + '.bd.' + str(BUILD_RESTART)
        else:
            self.log = log_base
        self.duration = float(conf['dur'].rstrip('\n'))
        self.sleep_time = float(conf['delay'].rstrip('\n'))
        # A positive duration (minutes) arms the kill timer.
        self.time_handling = self.duration > 0
        self.arg = self.user + '@' + self.host
        self.process_alive = True
        self.process_killed = False
        self.PORT_OPEN = False
        self.allIsWell = True
        self.is_slave = False
        self.is_restart = True
        self.restart_attempts = int(conf['restart'].rstrip('\n'))

    def wait_till_master(self):
        """Block (polling every 3 s) until the master job allows us to start.

        Non-sequential masters release the dependent once they are flagged
        up in JOB_HASH (or are done); sequential masters release it once they
        are done. A kill request ends the wait as well.
        """
        self.is_slave = True
        if not self.SEQ:
            while (not JOB_HASH[self.MASTER_ID]
                   and not (kill or KILL_DICT[self.path])
                   and not DONE_HASH[self.MASTER_ID]):
                time.sleep(3)
        if self.SEQ:
            while (not DONE_HASH[self.MASTER_ID]
                   and not (kill or KILL_DICT[self.path])):
                time.sleep(3)

    def run(self):
        """Thread body: delay, wait for the master, run, record the outcome."""
        global cid
        global pending_jobs  # highly thread unsafe, but for now living with it
        if self.sleep_time > 0:
            self.delay()
        if self.MASTER_ID != -1:
            pending_jobs.append(self.path)
            self.wait_till_master()
            pending_jobs.remove(self.path)
        if not (kill or KILL_DICT[self.path]):
            cid.append(self.path)
            sshOb = SSH(self.host, self.user, self.conf['pass'])
            self.pid, fd, success = sshOb.fork()
            if success:
                self.parent_processing(fd)
            else:
                cid.remove(self.path)
        # Reached on every path (including "killed before start") so that
        # kill_all() never waits forever on this job.
        DONE_HASH[self.JOB_ID] = True
        dead_jobs.add(self.path)

    def parent_processing(self, fd):
        """Drive the remote shell on ``fd`` until the command has finished."""
        global cid
        global DONE_HASH
        if self.time_handling:
            self.spawn_timer()
        if self.build != '':
            # Export the build location to the command's shell environment.
            self.conf['command'] = ('SANDBOX=' + self.build
                                    + '/sandbox; BUILD_IDENTIFIER='
                                    + self.BUILD_IDENTIFIER + ' ; '
                                    + self.conf['command'])  # Kedar
        if self.allIsWell:
            self.write(fd, 'cd ' + self.conf['path'])
            self.readtillexpect(fd, self.EXPECT)
            # Drain whatever is left so the prompt printed by `cd` is not
            # mistaken for the end of the real command.
            unblocked_read(fd, 1024, 3)
            self.write(fd, self.conf['command'])
            if int(self.port) > 0:
                # Flag the job as up only once its port accepts connections.
                self.port_monitor()
            else:
                JOB_HASH[self.JOB_ID] = True
            self.readtillexpect(fd, self.EXPECT, True)
        self.process_killed = True
        cid.remove(self.path)
        JOB_HASH[self.JOB_ID] = False
        DONE_HASH[self.JOB_ID] = True

    def backup_read(self, fd):
        """Return up to 1024 bytes from ``fd``, or '' if none arrive in 5 s."""
        # The original unpacked into `rd, rd, wt`, which overwrote the
        # readable list with the (empty) writable list, so this method could
        # never return data.
        rd, wt, ex = select.select([fd], [], [], 5)
        if fd in rd:
            return os.read(fd, 1024)
        return ''

    def write(self, fd, cont):
        """Send ``cont`` (newline-terminated) to ``fd``; return bytes written.

        Sleeps one second afterwards to give the remote shell time to react.
        """
        if not cont.endswith('\n'):
            cont += '\n'
        chars = os.write(fd, cont)
        time.sleep(1)
        return chars

    def readtillexpect(self, fd, expect, log=False):
        """Read ``fd`` until the regex ``expect`` matches the output so far.

        This is the key method that reads until the job is done. While
        reading it

        * optionally copies the output to ``self.log`` (``log=True``) and
          truncates that file through ``file_ops`` once it grows too large;
        * sends Ctrl-C every ~4 s once a kill was requested (or, for a
          non-sequential dependent, once its master is no longer up), and
          escalates to ``hard_kill`` after 20 attempts;
        * relaunches the command when ``restart`` allows it, logging each
          relaunch to a new file name.

        NOTE: ``reply`` keeps all output received since the last (re)start
        and is searched again after every chunk, so a very chatty long-lived
        job costs memory and CPU here.
        """
        assert isinstance(expect, str)
        fi = None  # keeps the `finally` safe when openlog itself fails
        try:
            fi = self.openlog(self.log, 'w', 1024, log)
            reply = ''
            self.restarted_times = 0
            has_restarted = False
            self.file_size = 0
            max_kill_attempts = 20
            kill_attempts = 0
            self.max_file_size = 100 ** 4
            while True:
                if has_restarted:
                    self.write(fd, self.conf['command'])
                    reply = ''
                    has_restarted = False
                    self.log = self.log + '.' + str(self.restarted_times)
                    # Close the previous log before switching files; the
                    # original leaked the old handle.
                    self.closelog(fi)
                    fi = self.openlog(self.log, 'w', 1024, log)
                # The master check only applies to non-sequential dependents.
                if (kill or KILL_DICT[self.path]
                        or (self.is_slave and (not self.SEQ)
                            and not JOB_HASH[self.MASTER_ID])):
                    print('I reached here  %s   %s' % (self.SEQ, self.path))
                    self.write(fd, '\x03')  # Ctrl-C
                    time.sleep(4)
                    kill_attempts += 1
                    if kill_attempts > max_kill_attempts:
                        print('reached the point of hard kill')
                        self.hard_kill(fd, fi, log)
                rd, wt, ex = select.select([fd], [], [], .1)
                if fd not in rd:
                    continue
                try:
                    rep = os.read(fd, 1024)
                except OSError:
                    # Presumably fd is a pty master, which reports a closed
                    # child side as an error rather than as EOF - verify
                    # against SSH.fork().
                    rep = ''
                if not rep:
                    # EOF: the remote side is gone. Stop instead of spinning
                    # on a descriptor that stays "readable" forever.
                    break
                # str.replace returns a new string; the original discarded
                # the result, so carriage returns were never stripped.
                rep = rep.replace('\r', '')
                self.writelog(fi, rep, log)
                if log and self.file_size > self.max_file_size:
                    fi = self.file_ops(fi)
                reply += rep
                self.file_size += len(rep)
                if re.search(expect, reply):
                    # A more cautious read to make sure the buffer is empty.
                    rep = unblocked_read(fd, 1024, 3)
                    self.writelog(fi, rep, log)
                    kill_attempts = 0
                    if self.restart(log):
                        has_restarted = True
                        continue
                    break
        finally:
            self.closelog(fi)

    def hard_kill(self, fd, fi, log):
        """Suspend the foreground command (Ctrl-Z) and ``kill -9`` job %1."""
        self.write(fd, '\x1A')
        rep = self.readtillexpect2(fd, [self.EXPECT])
        self.writelog(fi, rep, log)
        self.write(fd, 'kill -9 %1')

    def readtillexpect2(self, fd, expect):
        """Read ``fd`` until any regex in the list ``expect`` matches.

        Echoes everything read to stdout and returns the accumulated text.
        Loops until a pattern matches; there is no timeout.
        """
        assert isinstance(expect, list)
        reply = ''
        while True:
            rd, wt, ex = select.select([fd], [], [], .1)
            if fd in rd:
                rep = os.read(fd, 1024)
                rep = rep.replace('\r', '')  # result was discarded before
                sys.stdout.write(rep)
                reply += rep
                for pattern in expect:
                    if re.search(pattern, reply):
                        return reply

    def restart(self, log):
        """Return True (and count the attempt) if the command may be rerun."""
        master_down = self.is_slave and not JOB_HASH[self.MASTER_ID]
        attempts_left = (self.is_restart and self.restart_attempts > 0
                         and self.restarted_times < self.restart_attempts)
        if (kill or KILL_DICT[self.path] or master_down
                or not attempts_left or not log):
            return False
        self.restarted_times += 1
        return True

    def openlog(self, fname, mode, bsize, log):
        """Open the log file when logging is enabled, else return None."""
        if log:
            return open(fname, mode, bsize)
        return None

    def closelog(self, fi):
        """Close ``fi`` if it is an open log; failures are ignored."""
        if fi is None:
            return
        try:
            fi.close()
        except (IOError, OSError, ValueError):
            # Best effort, as before - closing the log must never take the
            # job thread down.
            pass

    def writelog(self, fi, wt, log):
        """Append ``wt`` to the log handle when logging is enabled."""
        if log:
            fi.write(wt)

    def export_sandbox(self, fd):
        """Export SANDBOX (taken from ``sys.argv[1]``) in the remote shell.

        NOTE(review): ``blocked_read_write`` is not defined on this class in
        the visible code, so calling this raises AttributeError; nothing
        visible calls it.
        """
        self.blocked_read_write(fd, 1024,
                                'export SANDBOX=' + sys.argv[1] + '\n')

    def port_monitor(self):
        """Start a daemon thread that flags the job as up once its port opens."""
        monitor = Thread(target=self.pmonitor)
        monitor.daemon = True
        monitor.start()

    def pmonitor(self):
        """Poll the job's port every 2 s; set JOB_HASH once it accepts."""
        global JOB_HASH
        # Stop polling when the job has ended: the original kept looping and
        # could flag a finished job as up again.
        while not self.process_killed:
            if self.port_scan(self.host, self.port):
                break
            time.sleep(2)
        if not self.process_killed:
            JOB_HASH[self.JOB_ID] = True

    def port_scan(self, host, port):
        """Return True if a TCP connection to ``host:port`` succeeds."""
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            s.connect((host, int(port)))
            s.shutdown(2)
            return True
        except socket.error:
            return False
        finally:
            # The original never closed the socket, leaking one descriptor
            # per probe.
            s.close()

    def spawn_timer(self):
        """Start the thread that enforces the configured run duration."""
        k = Thread(target=self.handle_time_dur,
                   args=(self.duration, self.pid))
        k.start()

    def handle_time_dur(self, dur, pid):
        """Request a kill of this job after ``dur`` minutes.

        ``pid`` is accepted for interface compatibility and is unused.
        """
        max_sleep = float(dur) * 60
        sleep_count = 0
        while (not self.process_killed and sleep_count < max_sleep
               and not (kill or KILL_DICT[self.path])):
            time.sleep(1)
            sleep_count += 1
        if not self.process_killed:
            # Only flag a job that is still running. Flagging an already
            # finished job (as the original did) could kill a fresh restart
            # that reuses the same conf path.
            KILL_DICT[self.path] = True

    def delay(self):
        """Sleep ``sleep_time`` minutes (kill-aware) while listed as pending."""
        global pending_jobs
        max_sleep = float(self.sleep_time) * 60
        sleep_count = 0
        pending_jobs.append(self.path)
        while sleep_count < max_sleep and not (kill or KILL_DICT[self.path]):
            time.sleep(1)
            sleep_count += 1
        pending_jobs.remove(self.path)

    def kill_child(self, child_pid):
        """Send signal 9 to ``child_pid``."""
        os.kill(child_pid, 9)

    def file_ops(self, fh):
        """Truncate the oversized log and return a fresh append handle."""
        fh.close()
        self.rotate_file(fh)  # implemented new file rotate method
        fh = open(self.log, 'a', 1024)
        self.file_size = 0
        return fh

    def rotate_file(self, fh):
        """Rewrite ``self.log`` keeping only its last 300 lines.

        ``fh`` is unused; the file is reopened by name.
        """
        with open(self.log) as source:
            full = source.readlines()
        with open(self.log, 'w') as target:
            target.writelines(full[-300:])

    def check_process_alive(self):
        """Set ``process_alive`` by probing ``self.pid`` with signal 0."""
        try:
            os.kill(self.pid, 0)
            self.process_alive = True
        except (OSError, TypeError, AttributeError):
            # The process is gone or not signalable, or pid is missing or
            # invalid: treat all of them as "not alive".
            self.process_alive = False
def kill_menu():
    """Show the interactive menu and dispatch the chosen action.

    Keeps prompting until one of the known commands is typed, then hands it
    to clean_up(). The original re-entered itself recursively on every
    invalid entry; a loop gives the same prompts without growing the stack.
    """
    choices = ('all', 'resume', 'kill', 'list', 'restart', 'reconf')
    while True:
        print('=================================')
        print('1) To kill all running apps type "all" and exit\n2) To kill an app type "kill"\n3) To resume type "resume"\n4) To see the list of running and queued process type "list"\n5) To restart a dead job enter "restart"\n6) To dynamically reconfigure the conf file type "reconf"')
        # Needs to be fixed: on a rare occasion all apps may be dead while
        # the main thread is still blocked in raw_input.
        val = str(raw_input('-> ')).rstrip('\n')
        if val in choices:
            break
    clean_up(val)
def current_jobs():
    """Print the running, pending and dead jobs on a single line."""
    # Joined with single spaces to match what the multi-argument print
    # statement produced.
    # If found buggy, filter empty names out of the lists.
    pieces = (
        'Current jobs: ', list(cid),
        '\tPending jobs: ', list(pending_jobs),
        '\tDead Jobs: ', list(dead_jobs),
    )
    print(' '.join(str(piece) for piece in pieces))
def clean_up(val):
    """Run the menu action named by ``val``, then resume monitoring.

    'all' kills every job and exits the interpreter. Every other action is
    followed by signal_handle(); an unknown value prints a message and
    re-opens the menu first.
    """
    if val == 'all':
        kill_all()
        sys.exit()

    actions = {
        'resume': signal_handle,
        'list': current_jobs,
        'restart': handle_dynrestart,
        'kill': kill_display,
        'reconf': handle_reconf,
    }
    action = actions.get(val)
    if action is None:
        print('invalid selection')
        kill_menu()
    else:
        action()
    signal_handle()
def kill_all():
    """Raise the global kill flag and wait until every job reports done."""
    global kill
    print("cleaning up.........")
    kill = True
    # For a rare race condition: after a restart the job thread may not have
    # updated kill / KILL_DICT yet while the main thread would already leave
    # because of stale values (probably only seen with a single conf file).
    time.sleep(2)
    while True:
        if False not in DONE_HASH:
            break
        try:
            time.sleep(1)
        except KeyboardInterrupt:
            # Ctrl-C while waiting must not abort the clean-up.
            print("please be patient")
    kill_all_procs(cid)
def handle_reconf():
    """Stop one job, wait for it to finish and relaunch it from its conf.

    Prompts for a conf file name. 'exit' or an unknown name goes back to the
    menu; otherwise the job's kill flag is raised, the function waits until
    the job is marked done and then restarts it. Monitoring resumes
    afterwards in every case.
    """
    global ALL_JOBS, KILL_DICT, DONE_HASH, dead_jobs
    reconf_val = raw_input('Enter the conf file name to be reconf or exit to exit to main menu\n-> ')
    if reconf_val == 'exit':
        kill_menu()
    elif reconf_val not in ALL_JOBS:
        # `elif`: the original fell through after the 'exit' branch and then
        # reported 'exit' as a wrong conf file name.
        print('wrong conf file name')
        kill_menu()
    else:
        KILL_DICT[reconf_val] = True
        while not DONE_HASH[ALL_JOBS[reconf_val]]:
            time.sleep(2)
        dynamic_restart(reconf_val)
        dead_jobs.discard(reconf_val)
    signal_handle()
def kill_display():
    """Ask for a conf file name and raise that job's kill flag.

    'exit' or an unknown name returns to the menu. Monitoring resumes
    afterwards in every case.
    """
    kill_val = raw_input('Enter the conf file name to be killed or exit to exit to main menu\n-> ')
    if kill_val == 'exit':
        kill_menu()
    elif kill_val not in KILL_DICT:
        # `elif`: the original fell through after the 'exit' branch and then
        # reported 'exit' as an invalid conf file name.
        print('invalid conf file name')
        kill_menu()
    else:
        KILL_DICT[kill_val] = True
    signal_handle()
def kill_all_procs(cid):
    """Best-effort SIGKILL (signal 9) of every entry in ``cid``.

    Entries that cannot be signalled are skipped silently.

    NOTE(review): the visible callers pass the module-level ``cid`` list,
    which Job fills with conf file *names*, not pids - for those entries
    os.kill raises TypeError and nothing is killed.
    """
    for process in cid:
        try:
            os.kill(process, 9)
        except (OSError, TypeError, ValueError, OverflowError):
            # Narrowed from a bare `except`, which also swallowed
            # KeyboardInterrupt and SystemExit.
            pass
def check_num_jobs(li):
    """Split a master-conf line on '->' and return ``(names, count)``."""
    chain = li.split('->')
    return chain, len(chain)
def signal_handle():
    """Main monitoring loop: print job counts every 4 s until interrupted.

    When BUILD_KILL is set, every job is stopped, BUILD_RESTART is
    incremented and BuildRestart is raised. Ctrl-C opens the interactive
    menu.
    """
    global WAIT_TILL_CLEAR, BUILD_RESTART
    try:
        while True:
            if BUILD_KILL:
                WAIT_TILL_CLEAR = True
                kill_all()
                BUILD_RESTART += 1
                raise BuildRestart
            worker_threads = threading.enumerate()[1:]  # currently unused
            status = (
                'number of current jobs: ', len(cid),
                '\tnumber of queued jobs: ', len(pending_jobs),
                '\tnumber of dead jobs: ', len(dead_jobs),
            )
            # Joined with single spaces to match what the multi-argument
            # print statement produced.
            print(' '.join(str(part) for part in status))
            time.sleep(4)
    except KeyboardInterrupt:
        kill_menu()
def getconf(path, build):
    """Parse the master conf file into a queue of Job argument tuples.

    Each non-blank line is a chain ``a->b->c`` of conf file names: the first
    entry has no master (-1) and every following entry depends on the entry
    right before it. Job ids are assigned sequentially across the file.

    Fills the module-level ALL_JOBS, DYN_RESTART and DEPENDENCY_DICT.

    Returns ``(queue, kill_dict, job_count)`` where ``queue`` holds
    ``(conf_name, job_id, master_id, build)`` tuples and ``kill_dict`` maps
    every conf name to False. Exits the process with status 1 if ``path``
    is not a file.
    """
    global DYN_RESTART
    kill_flags = {}
    q = Queue()
    if not os.path.isfile(path):
        print('Master conf file not found')
        sys.exit(1)
    with open(path) as fi:
        lines = fi.readlines()
    job_id = 0
    for raw_line in lines:
        line = raw_line.rstrip('\n')
        if not line.strip():
            # A blank line used to create a bogus job with an empty conf path.
            continue
        chain, _count = check_num_jobs(line)
        # enumerate() instead of chain.index(name): index() is quadratic and
        # returns the first occurrence, which picked the wrong master when a
        # name appeared twice in one chain.
        for position, conf_name in enumerate(chain):
            master_id = -1 if position == 0 else job_id - 1
            pu = (conf_name, job_id, master_id, build)
            ALL_JOBS[conf_name] = job_id
            DYN_RESTART[conf_name] = pu
            kill_flags[conf_name] = False
            if position != 0:
                # Keyed by the master's job id, value is the dependent's conf.
                DEPENDENCY_DICT[job_id - 1] = conf_name
            q.put(pu)
            job_id += 1
    return q, kill_flags, job_id
def fireJobs(q):
    """Create and start a daemon Job thread for every tuple queued in ``q``.

    Jobs whose conf file failed to parse are skipped. Ctrl-C while launching
    exits the process with status 1.
    """
    while not q.empty():
        try:
            job_args = q.get()
            th = Job(job_args)
            if not th.CONF_GOOD:
                # Job skips Thread.__init__ for a bad conf. The original
                # detected that through the exception raised by setting
                # `daemon`, behind a bare `except`; test the flag instead.
                continue
            th.daemon = True
            th.start()
        except KeyboardInterrupt:
            sys.exit(1)
def handle_dynrestart():
    """Interactively restart a dead job and, if it has one, its dependent.

    If the job has a dependent that is not queued, wait for that dependent
    to die (or to be known as a bad conf) before restarting the master; the
    dependent is restarted one second after the master. Monitoring resumes
    afterwards in every case.
    """
    if len(dead_jobs) == 0:
        print('No dead jobs currently')
        signal_handle()
    value = raw_input('enter the dead job name to restart of exit to exit from the submenu\n->')
    if value not in dead_jobs or value == 'exit':
        print(' '.join(('Failed to restart the job', str(value))))
    else:
        try:
            master_id = ALL_JOBS[value]
            # Wait until the dependent child is dead before restarting; the
            # second lookup may raise KeyError.
            if master_id in DEPENDENCY_DICT and DEPENDENCY_DICT[master_id] not in pending_jobs:
                print('has a dependent')
                # Still to handle: the slave thread starting up even before
                # the master has reset its values.
                while DEPENDENCY_DICT[master_id] not in dead_jobs:
                    if DEPENDENCY_DICT[master_id] in BAD_CONF:
                        print('the dependent job has a bad conf')
                        break
                    print('waiting for the child to die')
                    time.sleep(2)
        except KeyError:
            pass
        dynamic_restart(value)
        dead_jobs.discard(value)
        # Try to make sure the master thread resets its values before the
        # slave reads them.
        time.sleep(1)
        master_id = ALL_JOBS[value]
        if master_id in DEPENDENCY_DICT:
            # Restarting the dependent here, after the master, matters.
            dependent = DEPENDENCY_DICT[master_id]
            dynamic_restart(dependent)
            dead_jobs.discard(dependent)
    signal_handle()
def dynamic_restart(valu):
    """Relaunch the job for conf file ``valu`` with its recorded arguments."""
    single_job = Queue()
    single_job.put(DYN_RESTART[valu])
    fireJobs(single_job)
def implementation(conf, build=''):
    """Load the master conf, launch every job and enter the monitor loop.

    ``conf`` is the master conf file path and ``build`` the build path
    handed to every Job ('' for none). Does not return normally:
    signal_handle() loops until it raises or the process exits.
    """
    global DONE_HASH, JOB_HASH, KILL_DICT, job_id, cid
    q, KILL_DICT, job_id = getconf(conf, build)
    # One "up" flag and one "done" flag per job id. This replaces a list
    # comprehension that was evaluated only for its side effects (and a
    # Queue() that was created and immediately overwritten).
    JOB_HASH.extend([False] * job_id)
    DONE_HASH.extend([False] * job_id)
    fireJobs(q)
    time.sleep(.5)  # to handle the race between main thread and child threads
    signal_handle()
def file_check():
    """Poll for the marker file 'abc.txt' and raise BUILD_KILL when it exists.

    NOTE(review): the sleep is placed at loop level (one poll every 10 s);
    the original nesting was lost with the file's indentation - confirm.
    """
    global BUILD_KILL
    marker = 'abc.txt'
    while True:
        marker_present = os.path.isfile(marker)
        if marker_present:
            BUILD_KILL = True
        time.sleep(10)
def reset_vals():
    """Reset the shared module state before relaunching after a new build.

    BUILD_RESTART is deliberately left alone: it counts the relaunches and
    is used for log naming.
    """
    global done, kill, JOB_HASH, KILL_DICT, DONE_HASH, cid, pending_jobs
    global DYN_RESTART, dead_jobs, ALL_JOBS, DEPENDENCY_DICT
    global BUILD_KILL, WAIT_TILL_CLEAR, BAD_CONF
    done = False
    kill = False
    JOB_HASH = []
    KILL_DICT = {}
    DONE_HASH = []
    cid = []  # list of all currently running jobs
    pending_jobs = []  # list of all queued jobs
    DYN_RESTART = {}  # conf file name -> arguments for its Job object
    dead_jobs = set()  # set of all dead jobs
    ALL_JOBS = {}  # conf file name -> job id
    DEPENDENCY_DICT = {}  # master's job id -> dependent conf file
    BUILD_KILL = False  # set by external code to stop all running apps
    WAIT_TILL_CLEAR = False  # set while jobs are being killed for a relaunch
    # Previously never cleared: entries from earlier builds piled up, and a
    # conf file fixed between builds stayed listed as bad. Job.__init__
    # records bad confs again on every launch.
    BAD_CONF = []
def build_check():
    """Background loop: when a new build is reported, raise BUILD_KILL.

    After raising the flag it waits while WAIT_TILL_CLEAR is set and asserts
    that this takes less than 500 seconds.

    NOTE(review): get_current_build / check_for_new_build are called here
    without the build path that the script entry point passes to them -
    confirm that their defaults are intended.
    """
    global BUILD_KILL
    while True:
        # Pending: make the build-check thread wait until all the runs are
        # safely shut down.
        _new_build = check_for_new_build(get_current_build())
        BUILD_KILL = True
        time.sleep(4)
        wait_started = time.time()
        while WAIT_TILL_CLEAR:
            assert (time.time() - wait_started) < 500
            print('waiting to clear all the apps')
            time.sleep(2)
if __name__ == '__main__':
    # Script entry point. NOTE: `opts` is rebound from the imported parser
    # function to the parsed options object on the next line.
    opts, args = opts()
    master_conf = opts.conf
    smokes_mode = opts.build
    simple_harness = opts.plain
    path_to_build = opts.path
    curr = opts.current

    if not simple_harness:
        # Either take the build that is already there or obtain one through
        # check_for_new_build, then start the background build watcher.
        if curr:
            build = get_current_build(path_to_build)
        else:
            build = check_for_new_build(get_current_build(path_to_build), path_to_build)
        th = threading.Thread(target=build_check)
        th.daemon = True
        th.start()

    while True:
        try:
            build = '' if simple_harness else get_current_build(path_to_build)
            implementation(master_conf, build)
        except BuildRestart:
            if not smokes_mode:
                print('----------------exiting---------------')
                break
            reset_vals()
            print('----------------restarting----------------')