import logging
import os
import time
import traceback
from logging import Formatter

import six

from compmake.exceptions import CompmakeBug, JobFailed, JobInterrupted

# The other helpers used below (which, all_disabled, system_cmd_result,
# safe_pickle_load, indent, result_dict_check, check_isinstance,
# IntervalTimer, OutputCapture, Cache, the job/cache db accessors,
# CompmakeConstants, CompmakeException, setproctitle, logger, etc.) are
# defined in other compmake modules; their import paths are not shown in
# this excerpt.


def parmake_job2_new_process(args):
    """ Starts the job in a new compmake process. """
    (job_id, context) = args
    compmake_bin = which('compmake')

    db = context.get_compmake_db()
    storage = db.basepath  # XXX:
    where = os.path.join(storage, 'parmake_job2_new_process')
    # Create the directory that out_result is written into (note that it
    # is 'where', not 'storage', that must exist for the child's write).
    if not os.path.exists(where):
        try:
            os.makedirs(where)
        except OSError:
            # Another worker may have created it concurrently.
            pass

    out_result = os.path.join(where, '%s.results.pickle' % job_id)
    out_result = os.path.abspath(out_result)
    cmd = [compmake_bin, storage]

    if not all_disabled():
        cmd += ['--contracts']

    cmd += [
        '--status_line_enabled', '0',
        '--colorize', '0',
        '-c',
        'make_single out_result=%s %s' % (out_result, job_id),
    ]

    cwd = os.getcwd()
    cmd_res = system_cmd_result(cwd, cmd,
                                display_stdout=False,
                                display_stderr=False,
                                raise_on_error=False,
                                capture_keyboard_interrupt=False)
    ret = cmd_res.ret

    if ret == CompmakeConstants.RET_CODE_JOB_FAILED:  # XXX:
        # msg is assembled for debugging; the raised exception is built
        # from the pickled result dict, which carries the real details.
        msg = 'Job %r failed in external process' % job_id
        msg += indent(cmd_res.stdout, 'stdout| ')
        msg += indent(cmd_res.stderr, 'stderr| ')

        res = safe_pickle_load(out_result)
        os.unlink(out_result)
        result_dict_check(res)
        raise JobFailed.from_dict(res)
    elif ret != 0:
        msg = 'Host failed while doing %r' % job_id
        msg += '\n cmd: %s' % ' '.join(cmd)
        msg += '\n' + indent(cmd_res.stdout, 'stdout| ')
        msg += '\n' + indent(cmd_res.stderr, 'stderr| ')
        raise CompmakeBug(msg)  # XXX:

    res = safe_pickle_load(out_result)
    os.unlink(out_result)
    result_dict_check(res)
    return res
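# A minimal sketch of the pickle round trip performed above, under the
# assumption that safe_pickle_load is essentially a guarded pickle.load:
# the child compmake process runs "make_single out_result=<path> <job_id>"
# and serializes its result dict to <path>; the parent loads the file once,
# deletes it, and validates the contract. Plain pickle is used here purely
# for illustration; _load_and_consume is not compmake API.

def _load_and_consume(out_result):
    import pickle
    with open(out_result, 'rb') as f:
        res = pickle.load(f)   # roughly what safe_pickle_load does
    os.unlink(out_result)      # the result file is single-use
    result_dict_check(res)     # validate the keys before handing it back
    return res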
def result_dict_raise_if_error(res):
    from compmake.exceptions import JobFailed
    from compmake.exceptions import HostFailed
    from compmake.exceptions import CompmakeBug
    from compmake.exceptions import JobInterrupted

    result_dict_check(res)

    if 'fail' in res:
        raise JobFailed.from_dict(res)

    if 'abort' in res:
        raise HostFailed.from_dict(res)

    if 'bug' in res:
        raise CompmakeBug.from_dict(res)

    if 'interrupted' in res:
        raise JobInterrupted.from_dict(res)
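# Sketch of how a scheduler might consume a worker's result dict via
# result_dict_raise_if_error. The dispatch keys ('fail', 'abort', 'bug',
# 'interrupted') are taken from the function above; the payload passed to
# the from_dict constructors is whatever the worker packed, and
# _on_worker_result is illustrative, not compmake API.

def _on_worker_result(res):
    from compmake.exceptions import JobFailed, HostFailed
    try:
        result_dict_raise_if_error(res)
    except JobFailed:
        # the job's own code raised: report it as a failure
        raise
    except HostFailed:
        # the host died: the job itself might still be retried elsewhere
        raise
    return res['user_object']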
def mvac_job(args):
    """
    args = tuple (job_id, context, event_queue_name, show_output,
                  volumes, cwd)

    Returns a dictionary with fields "user_object", "user_object_deps",
    "new_jobs", "deleted_jobs".
    """
    job_id, context, event_queue_name, show_output, volumes, cwd = args  # @UnusedVariable
    check_isinstance(job_id, str)
    check_isinstance(event_queue_name, str)

    # Disable multyvac logging
    disable_logging_if_config(context)

    db = context.get_compmake_db()
    job = get_job(job_id=job_id, db=db)

    if job.needs_context:
        msg = 'Cannot use multyvac for dynamic job.'
        raise CompmakeException(msg)

    time_start = time.time()

    multyvac_job = mvac_instance(db, job_id, volumes, cwd)
    multyvac_job.wait()

    errors = [multyvac_job.status_error, multyvac_job.status_killed]
    if multyvac_job.status in errors:
        e = 'Multyvac error (status: %r)' % multyvac_job.status
        bt = str(multyvac_job.stderr)

        cache = Cache(Cache.FAILED)
        cache.exception = e
        cache.backtrace = bt
        cache.timestamp = time.time()
        cache.captured_stderr = str(multyvac_job.stderr)
        cache.captured_stdout = str(multyvac_job.stdout)
        set_job_cache(job_id, cache, db=db)

        raise JobFailed(job_id=job_id, reason=str(e), bt=bt)

    user_object = multyvac_job.result
    user_object_deps = collect_dependencies(user_object)
    set_job_userobject(job_id, user_object, db=db)

    cache = get_job_cache(job_id, db=db)
    cache.captured_stderr = str(multyvac_job.stderr)
    cache.captured_stdout = str(multyvac_job.stdout)
    cache.state = Cache.DONE
    cache.timestamp = time.time()
    walltime = cache.timestamp - time_start
    cache.walltime_used = walltime
    cache.cputime_used = multyvac_job.cputime_system
    cache.host = 'multyvac'
    cache.jobs_defined = set()
    set_job_cache(job_id, cache, db=db)

    result_dict = dict(user_object=user_object,
                       user_object_deps=user_object_deps,
                       new_jobs=[], deleted_jobs=[])
    result_dict_check(result_dict)
    return result_dict
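# Sketch of how a dispatcher could pack the 6-tuple that mvac_job expects;
# the field order is taken from the unpacking at the top of the function.
# All values below are illustrative placeholders, not compmake API.

def _submit_to_multyvac(job_id, context):
    args = (job_id,          # job to run
            context,         # compmake context (provides get_compmake_db())
            'events',        # event_queue_name (type-checked, unused above)
            False,           # show_output
            [],              # multyvac volumes to mount
            os.getcwd())     # working directory for the remote job
    return mvac_job(args)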
def make(job_id, context, echo=False):  # @UnusedVariable
    """
    Makes a single job.

    Returns a dictionary with fields:
        "user_object"
        "user_object_deps" = set of Promises
        "new_jobs" -> new jobs defined
        "deleted_jobs" -> jobs that were defined but not anymore

    Raises JobFailed or JobInterrupted. Also SystemExit,
    KeyboardInterrupt, MemoryError are captured.
    """
    db = context.get_compmake_db()

    int_make = IntervalTimer()

    host = 'hostname'  # XXX

    if get_compmake_config('set_proc_title'):
        setproctitle('cm-%s' % job_id)

    # TODO: should we make sure we are up to date???
    # up, reason = up_to_date(job_id, db=db)  # @UnusedVariable
    # if up:
    #     msg = 'Job %r appears already done.' % job_id
    #     msg += 'This can only happen if another compmake process uses the ' \
    #            'same DB.'
    #     logger.error(msg)
    #     user_object = get_job_userobject(job_id, db=db)
    #     # XXX: this is not right anyway
    #     return dict(user_object=user_object,
    #                 user_object_deps=collect_dependencies(user_object),
    #                 deleted_jobs=[],
    #                 new_jobs=[])

    job = get_job(job_id, db=db)
    cache = get_job_cache(job_id, db=db)

    if cache.state == Cache.DONE:
        prev_defined_jobs = set(cache.jobs_defined)
        # print('%s had previously defined %s' % (job_id, prev_defined_jobs))
    else:
        # print('%s was not DONE' % job_id)
        prev_defined_jobs = None

    # Note that at this point we save important information in the Cache,
    # so setting the state here would destroy it:
    # cache.state = Cache.IN_PROGRESS
    # set_job_cache(job_id, cache, db=db)

    # TODO: delete previous user object

    def progress_callback(stack):
        publish(context, 'job-progress-plus', job_id=job_id, host=host,
                stack=stack)

    init_progress_tracking(progress_callback)

    disable_capture = False
    if disable_capture:
        capture = None
    else:
        echo = False
        capture = OutputCapture(context=context, prefix=job_id,
                                # This is instantaneous echo and should be
                                # False; they will generate events anyway.
                                echo_stdout=echo,
                                echo_stderr=echo)

    # TODO: add whether we should just capture and not echo
    old_emit = logging.StreamHandler.emit

    from compmake.ui.coloredlog import colorize_loglevel

    FORMAT = "%(name)10s|%(filename)15s:%(lineno)-4s - %(funcName)-15s| %(message)s"
    formatter = Formatter(FORMAT)

    class Store(object):
        nhidden = 0

    def my_emit(_, log_record):
        # note that log_record.msg might be an exception
        try:
            try:
                s = str(log_record.msg)
            except UnicodeEncodeError:
                s = six.text_type(log_record.msg)
            except:
                s = 'Could not print log_record %s' % id(log_record)
            log_record.msg = colorize_loglevel(log_record.levelno, s)
            res = formatter.format(log_record)
            print(res)  # this will be captured by OutputCapture anyway
        except:
            Store.nhidden += 1

    logging.StreamHandler.emit = my_emit

    already = set(context.get_jobs_defined_in_this_session())

    def get_deleted_jobs():
        generated = set(context.get_jobs_defined_in_this_session()) - already
        # print('failure: rolling back %s' % generated)

        from compmake.ui.ui import delete_jobs_recurse_definition

        todelete_ = set()
        # delete the jobs that were previously defined
        if prev_defined_jobs:
            todelete_.update(prev_defined_jobs)
        # and also the ones that were generated
        todelete_.update(generated)

        deleted_jobs_ = delete_jobs_recurse_definition(jobs=todelete_, db=db)
        # now we failed, so we need to roll back other changes to the db
        return deleted_jobs_

    try:
        result = job_compute(job=job, context=context)

        assert isinstance(result, dict) and len(result) == 5
        user_object = result['user_object']
        new_jobs = result['new_jobs']
        int_load_results = result['int_load_results']
        int_compute = result['int_compute']
        int_gc = result['int_gc']
        int_gc.stop()

    except KeyboardInterrupt as e:
        bt = traceback.format_exc()
        deleted_jobs = get_deleted_jobs()
        mark_as_failed(job_id, 'KeyboardInterrupt: ' + str(e),
                       backtrace=bt, db=db)

        cache = get_job_cache(job_id, db=db)
        if capture is not None:
            cache.captured_stderr = capture.get_logged_stderr()
            cache.captured_stdout = capture.get_logged_stdout()
        else:
            msg = '(Capture turned off.)'
            cache.captured_stderr = msg
            cache.captured_stdout = msg
        set_job_cache(job_id, cache, db=db)
        raise JobInterrupted(job_id=job_id, deleted_jobs=deleted_jobs)

    except (BaseException, ArithmeticError, BufferError, LookupError,
            Exception, SystemExit, MemoryError) as e:
        bt = traceback.format_exc()
        s = '%s: %s' % (type(e).__name__, e)

        mark_as_failed(job_id, s, backtrace=bt, db=db)
        deleted_jobs = get_deleted_jobs()

        cache = get_job_cache(job_id, db=db)
        if capture is not None:
            cache.captured_stderr = capture.get_logged_stderr()
            cache.captured_stdout = capture.get_logged_stdout()
        else:
            msg = '(Capture turned off.)'
            cache.captured_stderr = msg
            cache.captured_stdout = msg
        set_job_cache(job_id, cache, db=db)

        raise JobFailed(job_id=job_id, reason=s, bt=bt,
                        deleted_jobs=deleted_jobs)

    finally:
        int_finally = IntervalTimer()
        if capture is not None:
            capture.deactivate()
        # even if we send an error, let's save the output of the process
        logging.StreamHandler.emit = old_emit
        if Store.nhidden > 0:
            msg = ('compmake: There were %d messages hidden due to '
                   'bugs in logging.' % Store.nhidden)
            print(msg)
        int_finally.stop()
        # print('finally: %s' % int_finally)

    int_save_results = IntervalTimer()

    # print('Now %s has defined %s' % (job_id, new_jobs))
    if prev_defined_jobs is not None:
        # Did we define fewer jobs this time around?
        # Then we need to delete the extra ones.
        todelete = set()
        for x in prev_defined_jobs:
            if x not in new_jobs:
                todelete.add(x)
        from compmake.ui.ui import delete_jobs_recurse_definition
        deleted_jobs = delete_jobs_recurse_definition(jobs=todelete, db=db)
    else:
        deleted_jobs = set()
    # print('Now %s has deleted %s' % (job_id, deleted_jobs))

    set_job_userobject(job_id, user_object, db=db)
    int_save_results.stop()

    # logger.debug('Save time for %s: %s s' % (job_id, walltime_save_result))

    int_make.stop()
    end_time = time.time()

    cache = Cache(Cache.DONE)

    # print('int_make: %s' % int_make)
    # print('int_load_results: %s' % int_load_results)
    # print('int_compute: %s' % int_compute)
    # print('int_save_results: %s' % int_save_results)

    if int_gc.get_walltime_used() > 1.0:
        logger.warning(
            'Expensive garbage collection detected at the end of %s: %s'
            % (job_id, int_gc))

    cache.int_make = int_make
    cache.int_load_results = int_load_results
    cache.int_compute = int_compute
    cache.int_gc = int_gc
    cache.int_save_results = int_save_results

    cache.timestamp = end_time
    cache.walltime_used = int_make.get_walltime_used()
    cache.cputime_used = int_make.get_cputime_used()
    cache.host = host
    cache.jobs_defined = new_jobs
    set_job_cache(job_id, cache, db=db)

    return dict(user_object=user_object,
                user_object_deps=collect_dependencies(user_object),
                new_jobs=new_jobs,
                deleted_jobs=deleted_jobs)
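# Sketch of the result-dict contract shared by make() above and mvac_job():
# the four keys are taken from their return statements. compmake's own
# validator is result_dict_check; _check_result_shape below is only an
# illustrative standalone equivalent, not compmake API.

def _check_result_shape(res):
    required = {'user_object', 'user_object_deps', 'new_jobs', 'deleted_jobs'}
    missing = required - set(res)
    if missing:
        raise ValueError('result dict is missing keys: %s' % sorted(missing))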