def test_actions_after_job_process_crash(self):
    # Scenario: the job process is *not* running although the database
    # record still reports the job as 'running'. The test checks that the
    # supervisor records the stop time, triggers the cleanup and updates
    # the job record with an error message.
    job_id = self.job.id
    self.is_pid_running.return_value = False
    self.get_job_status.return_value = 'running'

    supervisor.supervise(1, job_id, timeout=0.1)

    # call signature shared by the hooks that only receive the job id
    job_id_only = ((job_id,), {})

    # stop time is recorded exactly once
    stop_time = self.record_job_stop_time
    self.assertEqual(1, stop_time.call_count)
    self.assertEqual(job_id_only, stop_time.call_args)

    # the cleanup is triggered exactly once
    cleanup = self.cleanup_after_job
    self.assertEqual(1, cleanup.call_count)
    self.assertEqual(job_id_only, cleanup.call_args)

    # the status in the job record is updated, carrying the error message
    status_update = self.update_job_status_and_error_msg
    expected_kwargs = {'error_msg': 'job process 1 crashed or terminated'}
    self.assertEqual(1, status_update.call_count)
    self.assertEqual(((job_id,), expected_kwargs), status_update.call_args)
def test_actions_after_job_process_failures(self):
    # Scenario: the job process is running but has some failure counters
    # above zero. The test checks that the supervisor terminates the job
    # process, records the stop time and triggers the cleanup.
    consumer_cls = supervisor.SupervisorLogMessageConsumer
    saved_delay = consumer_cls.FCC_DELAY
    # shorten the delay to checking failure counters
    consumer_cls.FCC_DELAY = 2
    try:
        self.is_pid_running.return_value = True
        self.get_job_status.return_value = 'running'

        # start from a clean slate, then register three failures
        stats.delete_job_counters(self.job.id)
        stats.incr_counter(self.job.id, "h", "a:failed")
        stats.incr_counter(self.job.id, "r", "b:failed")
        stats.incr_counter(self.job.id, "r", "b:failed")

        supervisor.supervise(1, self.job.id, timeout=0.1)
    finally:
        # FCC_DELAY is a class attribute: put the previous value back so
        # the shortened delay does not leak into tests that run later
        consumer_cls.FCC_DELAY = saved_delay

    # the job process is terminated
    self.assertEqual(1, self.terminate_job.call_count)
    self.assertEqual(((1,), {}), self.terminate_job.call_args)

    # stop time is recorded
    self.assertEqual(1, self.record_job_stop_time.call_count)
    self.assertEqual(
        ((self.job.id,), {}), self.record_job_stop_time.call_args)

    # the cleanup is triggered
    # NOTE(review): `terminate` is not defined in this method; it is
    # presumably a module-level name -- confirm it is in scope.
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(
        ((self.job.id, terminate), {}), self.cleanup_after_job.call_args)
def test_actions_after_job_process_failures(self):
    # Scenario: the job process is running but has some failure counters
    # above zero. The test checks that the supervisor terminates the job
    # process, records the stop time and triggers the cleanup.
    consumer_cls = supervisor.SupervisorLogMessageConsumer
    saved_delay = consumer_cls.FCC_DELAY
    # shorten the delay to checking failure counters
    consumer_cls.FCC_DELAY = 2
    try:
        self.is_pid_running.return_value = True
        self.get_job_status.return_value = 'running'

        # start from a clean slate, then register three failures
        stats.delete_job_counters(self.job.id)
        stats.incr_counter(self.job.id, "h", "a:failed")
        stats.incr_counter(self.job.id, "r", "b:failed")
        stats.incr_counter(self.job.id, "r", "b:failed")

        supervisor.supervise(1, self.job.id, timeout=0.1)
    finally:
        # FCC_DELAY is a class attribute: put the previous value back so
        # the shortened delay does not leak into tests that run later
        consumer_cls.FCC_DELAY = saved_delay

    # the job process is terminated
    self.assertEqual(1, self.terminate_job.call_count)
    self.assertEqual(((1,), {}), self.terminate_job.call_args)

    # stop time is recorded
    self.assertEqual(1, self.record_job_stop_time.call_count)
    self.assertEqual(
        ((self.job.id,), {}), self.record_job_stop_time.call_args)

    # the cleanup is triggered
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(
        ((self.job.id,), {}), self.cleanup_after_job.call_args)
def test_actions_after_job_process_termination(self):
    # Scenario: the job process is *not* running and the job status is
    # reported as 'succeeded'. The test checks that the supervisor records
    # the stop time and triggers the cleanup.
    job_id = self.job.id
    self.is_pid_running.return_value = False
    self.get_job_status.return_value = 'succeeded'

    supervisor.supervise(1, job_id, timeout=0.1)

    # stop time is recorded exactly once, for this job
    stop_time = self.record_job_stop_time
    self.assertEqual(1, stop_time.call_count)
    self.assertEqual(((job_id,), {}), stop_time.call_args)

    # the cleanup is triggered exactly once
    # NOTE(review): `terminate` is not defined in this method; it is
    # presumably a module-level name -- confirm it is in scope.
    cleanup = self.cleanup_after_job
    self.assertEqual(1, cleanup.call_count)
    self.assertEqual(((job_id, terminate), {}), cleanup.call_args)
def test_actions_after_a_critical_message(self):
    # Scenario: the job process is running and the supervisor's message
    # consumer is handed a CRITICAL log record for the job. The test checks
    # that the job process is terminated, the stop time is recorded, the
    # cleanup is triggered and the job record is updated with the message.
    job_id = self.job.id
    self.is_pid_running.return_value = True

    def run_(mc):
        # stand-in for SupervisorLogMessageConsumer.run: deliver a single
        # CRITICAL record to the consumer's log callback
        critical_record = logging.LogRecord(
            'oq.job.%s' % job_id, logging.CRITICAL, 'path', 42, 'a msg',
            (), None)
        mc.log_callback(critical_record)
        assert mc._stopped

    run_path = ('openquake.engine.supervising.'
                'supervisor.SupervisorLogMessageConsumer.run')
    with patch(run_path) as run:
        # the supervisor will receive a msg
        run.side_effect = run_
        supervisor.supervise(1, job_id, timeout=0.1)

    # call signature shared by the hooks that only receive the job id
    job_id_only = ((job_id,), {})

    # the job process is terminated
    terminate_job = self.terminate_job
    self.assertEqual(1, terminate_job.call_count)
    self.assertEqual(((1,), {}), terminate_job.call_args)

    # stop time is recorded
    stop_time = self.record_job_stop_time
    self.assertEqual(1, stop_time.call_count)
    self.assertEqual(job_id_only, stop_time.call_args)

    # the cleanup is triggered
    cleanup = self.cleanup_after_job
    self.assertEqual(1, cleanup.call_count)
    self.assertEqual(job_id_only, cleanup.call_args)

    # the status in the job record is updated with the critical message
    status_update = self.update_job_status_and_error_msg
    self.assertEqual(1, status_update.call_count)
    self.assertEqual(((job_id, 'a msg'), {}), status_update.call_args)
def test_actions_after_job_process_termination(self):
    # Scenario: the job process is *not* running and the job status is
    # reported as 'succeeded'. The test checks that the supervisor records
    # the stop time and triggers the cleanup.
    job_id = self.job.id
    self.is_pid_running.return_value = False
    self.get_job_status.return_value = 'succeeded'

    supervisor.supervise(1, job_id, timeout=0.1)

    # both hooks are expected to receive nothing but the job id
    job_id_only = ((job_id,), {})

    # stop time is recorded exactly once
    stop_time = self.record_job_stop_time
    self.assertEqual(1, stop_time.call_count)
    self.assertEqual(job_id_only, stop_time.call_args)

    # the cleanup is triggered exactly once
    cleanup = self.cleanup_after_job
    self.assertEqual(1, cleanup.call_count)
    self.assertEqual(job_id_only, cleanup.call_args)
def test_actions_after_job_process_crash(self):
    # Scenario: the job process is *not* running although the database
    # record still reports the job as 'running'. The test checks that the
    # supervisor records the stop time, triggers the cleanup and updates
    # the status in the job record.
    job_id = self.job.id
    self.is_pid_running.return_value = False
    self.get_job_status.return_value = 'running'

    supervisor.supervise(1, job_id, timeout=0.1)

    # call signature shared by the hooks that only receive the job id
    job_id_only = ((job_id,), {})

    # stop time is recorded exactly once
    stop_time = self.record_job_stop_time
    self.assertEqual(1, stop_time.call_count)
    self.assertEqual(job_id_only, stop_time.call_args)

    # the cleanup is triggered exactly once
    # NOTE(review): `terminate` is not defined in this method; it is
    # presumably a module-level name -- confirm it is in scope.
    cleanup = self.cleanup_after_job
    self.assertEqual(1, cleanup.call_count)
    self.assertEqual(((job_id, terminate), {}), cleanup.call_args)

    # the status in the job record is updated
    status_update = self.update_job_status
    self.assertEqual(1, status_update.call_count)
    self.assertEqual(job_id_only, status_update.call_args)
def test_actions_after_a_critical_message(self):
    # Scenario: the job process is running and the supervisor's message
    # consumer is handed a CRITICAL log record for the job. The test checks
    # that the job process is terminated, the stop time is recorded, the
    # cleanup is triggered and the status in the job record is updated.
    job_id = self.job.id
    self.is_pid_running.return_value = True

    def run_(mc):
        # stand-in for SupervisorLogMessageConsumer.run: deliver a single
        # CRITICAL record to the consumer's log callback
        critical_record = logging.LogRecord(
            'oq.job.%s' % job_id, logging.CRITICAL, 'path', 42, 'a msg',
            (), None)
        mc.log_callback(critical_record)
        assert mc._stopped

    run_path = ('openquake.engine.supervising.'
                'supervisor.SupervisorLogMessageConsumer.run')
    with patch(run_path) as run:
        # the supervisor will receive a msg
        run.side_effect = run_
        supervisor.supervise(1, job_id, timeout=0.1)

    # call signature shared by the hooks that only receive the job id
    job_id_only = ((job_id,), {})

    # the job process is terminated
    terminate_job = self.terminate_job
    self.assertEqual(1, terminate_job.call_count)
    self.assertEqual(((1,), {}), terminate_job.call_args)

    # stop time is recorded
    stop_time = self.record_job_stop_time
    self.assertEqual(1, stop_time.call_count)
    self.assertEqual(job_id_only, stop_time.call_args)

    # the cleanup is triggered
    # NOTE(review): `terminate` is not defined in this method; it is
    # presumably a module-level name -- confirm it is in scope.
    cleanup = self.cleanup_after_job
    self.assertEqual(1, cleanup.call_count)
    self.assertEqual(((job_id, terminate), {}), cleanup.call_args)

    # the status in the job record is updated
    status_update = self.update_job_status
    self.assertEqual(1, status_update.call_count)
    self.assertEqual(job_id_only, status_update.call_args)
_do_run_calc(job, exports, calc, job_type) except Exception, ex: logs.LOG.critical("Calculation failed with exception: '%s'" % str(ex)) raise finally: job.is_running = False job.save() return supervisor_pid = os.fork() if not supervisor_pid: # supervisor process logs.set_logger_level(logs.logging.root, log_level) # TODO: deal with KVS garbage collection supervisor.supervise(job_pid, job.id, log_file=log_file) return # parent process # ignore Ctrl-C as well as supervisor process does. thus only # job executor terminates on SIGINT supervisor.ignore_sigint() # wait till both child processes are done os.waitpid(job_pid, 0) os.waitpid(supervisor_pid, 0) # Refresh the job record, since the forked processes are going to modify # job state. return models.OqJob.objects.get(id=job.id)
_job_exec(job, log_level, exports, job_type, calc) except Exception, ex: logs.LOG.critical("Calculation failed with exception: '%s'" % str(ex)) raise finally: job.is_running = False job.save() return supervisor_pid = os.fork() if not supervisor_pid: # supervisor process logs.set_logger_level(logs.logging.root, log_level) # TODO: deal with KVS garbage collection supervisor.supervise(job_pid, job.id, log_file=log_file) return # parent process # ignore Ctrl-C as well as supervisor process does. thus only # job executor terminates on SIGINT supervisor.ignore_sigint() # wait till both child processes are done os.waitpid(job_pid, 0) os.waitpid(supervisor_pid, 0) else: try: _job_exec(job, log_level, exports, job_type, calc) except Exception, ex: logs.LOG.critical("Calculation failed with exception: '%s'" %