def get(self, timeout=0):  # @UnusedVariable
    """Load and return the result of a job that was reported as ready.

    The method may be called only once, and only after ``told_you_ready``
    has been set. It reads the return-code file, then the stderr/stdout
    files, and finally loads ``self.out_results`` with
    ``safe_pickle_load()``. The loaded object is passed to
    ``result_dict_raise_if_error()`` before being returned.

    :param timeout: accepted for interface compatibility; not used.
    :returns: the object loaded from ``self.out_results``.
    :raises CompmakeBug: if the job was not reported ready, if the method
        is called twice, or if ``self.out_results`` does not exist.
    :raises HostFailed: if the return-code file does not contain an integer.

    Any exception raised by ``result_dict_raise_if_error()`` propagates.
    Once the return code has been parsed, the stderr, stdout, results and
    return-code files are removed whether or not loading succeeds.
    """
    if not self.told_you_ready:
        raise CompmakeBug("I didnt tell you it was ready.")
    if self.already_read:
        msg = 'Compmake BUG: should not call twice.'
        raise CompmakeBug(msg)
    self.already_read = True

    assert os.path.exists(self.retcode)
    # Context managers make sure the handles are closed deterministically.
    with open(self.retcode, 'r') as f:
        ret_str = f.read()
    try:
        # The value is only validated here; handling that depended on the
        # specific return code is currently disabled.
        int(ret_str)
    except ValueError:
        msg = 'Could not interpret file %r: %r.' % (self.retcode, ret_str)
        raise HostFailed(host='localhost', job_id=self.job_id,
                         reason=msg, bt='')

    try:
        with open(self.stderr, 'r') as f:
            stderr = f.read()
        with open(self.stdout, 'r') as f:
            stdout = f.read()

        stderr = 'Contents of %s:\n' % self.stderr + stderr
        stdout = 'Contents of %s:\n' % self.stdout + stdout

        if not os.path.exists(self.out_results):
            msg = 'job succeeded but no %r found' % self.out_results
            msg += '\n' + indent(stderr, 'stderr')
            msg += '\n' + indent(stdout, 'stdout')
            raise CompmakeBug(msg)

        res = safe_pickle_load(self.out_results)
        result_dict_raise_if_error(res)
        return res
    finally:
        # Clean up the job's files regardless of the outcome above.
        fs = [self.stderr, self.stdout, self.out_results, self.retcode]
        for filename in fs:
            if os.path.exists(filename):
                os.unlink(filename)
def result_dict_raise_if_error(res):
    """Raise the exception described by a result dictionary, if any.

    ``res`` is first passed to ``result_dict_check()``. Its keys are then
    examined in a fixed order, and the first key found decides which
    exception is built with ``from_dict(res)`` and raised:

    * ``'fail'``        -> ``JobFailed``
    * ``'abort'``       -> ``HostFailed``
    * ``'bug'``         -> ``CompmakeBug``
    * ``'interrupted'`` -> ``JobInterrupted``

    Returns ``None`` when none of these keys is present.
    """
    from compmake.exceptions import (CompmakeBug, HostFailed, JobFailed,
                                     JobInterrupted)

    result_dict_check(res)

    # Order matters: it is the precedence used when several keys are present.
    error_kinds = (
        ('fail', JobFailed),
        ('abort', HostFailed),
        ('bug', CompmakeBug),
        ('interrupted', JobInterrupted),
    )
    for key, exception_class in error_kinds:
        if key in res:
            raise exception_class.from_dict(res)
def ready(self):
    """Report whether the job's return-code file has appeared.

    When ``self.npolls % 20 == 1`` the accounting record is also fetched
    with ``get_qacct(self.sge_id)``; a record whose ``'failed'`` entry is
    present and different from ``'0'`` raises ``HostFailed``.

    :returns: ``True`` (and sets ``told_you_ready``) if ``self.retcode``
        exists, ``False`` otherwise.
    :raises CompmakeBug: if called again after having returned ``True``.
    :raises HostFailed: if the accounting record reports a failure.
    """
    if self.told_you_ready:
        raise CompmakeBug('should not call ready() twice')

    accounting = None
    if self.npolls % 20 == 1:
        try:
            accounting = get_qacct(self.sge_id)
            schedule_failed = ('failed' in accounting
                               and accounting['failed'] != '0')
            if schedule_failed:
                reason = 'Job schedule failed: %s\n%s' % (
                    accounting['failed'], accounting)
                raise HostFailed(host="xxx", job_id=self.job_id,
                                 reason=reason, bt="")  # XXX
        except JobNotRunYet:
            accounting = None

    self.npolls += 1

    if os.path.exists(self.retcode):
        self.told_you_ready = True
        return True

    if accounting is not None:
        # An accounting record exists but the file does not: this is simply
        # NFS that is not updated yet, so the message is built but not raised.
        msg = 'The file %r does not exist but it looks like the job ' \
              'is done' % self.retcode
        msg += '\n %s ' % accounting
    return False
def ready(self):
    """Poll the asynchronous result and report whether it is ready.

    Every call increments ``self.count``. Once more than 100000 polls have
    been made the job is considered timed out.

    :returns: the value of ``self.async_result.ready()``.
    :raises HostFailed: with reason ``'Timeout'`` when ``self.count``
        exceeds 100000.
    """
    self.count += 1
    finished = self.async_result.ready()

    # Periodic status diagnostics used to live here; they were disabled
    # and had no effect, so only the timeout check remains.
    timed_out = self.count > 100000
    if timed_out:
        raise HostFailed(host='localhost', job_id=self.job_id,
                         reason='Timeout', bt='')
    return finished
def pmake_worker(name, job_queue, result_queue, signal_queue, signal_token,
                 write_log=None):
    """Worker loop: take jobs from a queue, run them, and put back results.

    Each job is a pair ``(function, arguments)`` and is executed as
    ``function(arguments)``. The loop ends when ``PmakeSub.EXIT_TOKEN`` is
    received. SIGINT is ignored for the duration of the worker.

    :param name: label prefixed to every log line.
    :param job_queue: queue from which jobs are read (polled with a
        5-second timeout).
    :param result_queue: queue on which one result is put per job.
    :param signal_queue: optional queue; when not ``None``,
        ``signal_token`` is put on it after every result.
    :param signal_token: value put on ``signal_queue``.
    :param write_log: optional filename; when given, log lines are written
        there, otherwise they are printed.

    ``JobFailed`` and ``CompmakeBug`` raised by a job are reported through
    their ``get_result_dict()``; ``JobInterrupted`` is reported as
    ``dict(abort=...)``. Any other exception stops the loop and is reported
    as a ``HostFailed`` result dictionary.
    """
    log_file = open(write_log, 'w') if write_log else None

    if log_file is not None:
        def log(s):
            log_file.write('%s: ' % name)
            log_file.write(s)
            log_file.write('\n')
            log_file.flush()
    else:
        def log(s):
            print('%s: %s' % (name, s))

    def put_result(x):
        log('putting result in result_queue..')
        result_queue.put(x, block=True)
        if signal_queue is not None:
            log('putting result in signal_queue..')
            signal_queue.put(signal_token, block=True)
        log('(done)')

    try:
        log('started pmake_worker()')
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        try:
            while True:
                log('Listening for job')
                try:
                    job = job_queue.get(block=True, timeout=5)
                except Empty:
                    log('Could not receive anything.')
                    continue
                if job == PmakeSub.EXIT_TOKEN:
                    log('Received EXIT_TOKEN.')
                    break

                log('got job: %s' % str(job))

                function, arguments = job
                try:
                    result = function(arguments)
                except JobFailed as e:
                    log('Job failed, putting notice.')
                    log('result: %s' % str(e))  # debug
                    put_result(e.get_result_dict())
                except JobInterrupted as e:
                    log('Job interrupted, putting notice.')
                    put_result(dict(abort=str(e)))  # XXX
                except CompmakeBug as e:  # XXX :to finish
                    log('CompmakeBug')
                    put_result(e.get_result_dict())
                else:
                    log('result: %s' % str(result))
                    put_result(result)

                log('...done.')
        except BaseException:
            # Last-resort handler: report anything unexpected as HostFailed.
            bt = traceback.format_exc()
            reason = 'aborted because of uncaptured:\n' + indent(bt, '| ')
            mye = HostFailed(host="???", job_id="???",
                             reason=reason, bt=bt)
            log(str(mye))
            put_result(mye.get_result_dict())
        except:
            # Kept for interpreters where an exception may not derive from
            # BaseException.
            mye = HostFailed(host="???", job_id="???",
                             reason='Uknown exception (not BaseException)',
                             bt="not available")
            log(str(mye))
            put_result(mye.get_result_dict())
            log('(put)')

        if signal_queue is not None:
            signal_queue.close()
        result_queue.close()
        log('clean exit.')
    finally:
        # The log file used to be left open; close it on every exit path.
        if log_file is not None:
            log_file.close()
def pmake_worker(name, job_queue, result_queue, signal_queue, signal_token,
                 write_log=None):
    """Worker loop: take jobs from a queue, run them, and put back results.

    Each job is a pair ``(function, arguments)`` and is executed as
    ``function(arguments)``. The loop ends when ``PmakeSub.EXIT_TOKEN`` is
    received. SIGINT is ignored for the duration of the worker.

    :param name: label prefixed to every log line.
    :param job_queue: queue from which jobs are read (blocking, no timeout).
    :param result_queue: queue on which one result is put per job.
    :param signal_queue: optional queue; when not ``None``,
        ``signal_token`` is put on it after every result.
    :param signal_token: value put on ``signal_queue``.
    :param write_log: optional filename; when given, log lines are written
        there, otherwise logging is a no-op.

    ``JobFailed`` and ``CompmakeBug`` raised by a job are reported through
    their ``get_result_dict()``; ``JobInterrupted`` is reported as
    ``dict(abort=...)``. Any other exception stops the loop and is reported
    as a ``HostFailed`` result dictionary.
    """
    log_file = open(write_log, "w") if write_log else None

    if log_file is not None:
        def log(s):
            log_file.write("%s: " % name)
            log_file.write(s)
            log_file.write("\n")
            log_file.flush()
    else:
        def log(s):
            pass

    def put_result(x):
        log("putting result in result_queue..")
        result_queue.put(x, block=True)
        if signal_queue is not None:
            log("putting result in signal_queue..")
            signal_queue.put(signal_token, block=True)
        log("(done)")

    try:
        log("started pmake_worker()")
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        try:
            while True:
                log("Listening for job")
                job = job_queue.get(block=True)
                log("got job: %s" % str(job))
                if job == PmakeSub.EXIT_TOKEN:
                    break

                function, arguments = job
                try:
                    result = function(arguments)
                except JobFailed as e:
                    log("Job failed, putting notice.")
                    log("result: %s" % str(e))  # debug
                    put_result(e.get_result_dict())
                except JobInterrupted as e:
                    log("Job interrupted, putting notice.")
                    put_result(dict(abort=str(e)))  # XXX
                except CompmakeBug as e:  # XXX :to finish
                    log("CompmakeBug")
                    put_result(e.get_result_dict())
                else:
                    log("result: %s" % str(result))
                    put_result(result)

                log("...done.")
        except BaseException:
            # format_exc() takes a frame limit, not an exception: passing
            # the exception made this handler itself fail on Python 3, so
            # the failure notice was never delivered.
            bt = traceback.format_exc()
            reason = "aborted because of uncaptured:\n" + indent(bt, "| ")
            mye = HostFailed(host="???", job_id="???",
                             reason=reason, bt=bt)
            log(str(mye))
            put_result(mye.get_result_dict())
        except:
            # Kept for interpreters where an exception may not derive from
            # BaseException.
            mye = HostFailed(host="???", job_id="???",
                             reason="Uknown exception (not BaseException)",
                             bt="not available")
            log(str(mye))
            put_result(mye.get_result_dict())
            log("(put)")

        if signal_queue is not None:
            signal_queue.close()
        result_queue.close()
        log("clean exit.")
    finally:
        # The log file used to be left open; close it on every exit path.
        if log_file is not None:
            log_file.close()