def test_actions_after_a_critical_message(self):
    """A CRITICAL log record must make the supervisor kill and fail the job."""
    # Pretend the job process is alive.
    self.is_pid_running.return_value = True
    with patch("openquake.supervising."
               "supervisor.SupervisorLogMessageConsumer.run") as run:
        def feed_critical_record(consumer):
            # Push one CRITICAL record through the consumer's callback;
            # the consumer is expected to stop itself afterwards.
            rec = logging.LogRecord("oq.job.123", logging.CRITICAL,
                                    "path", 42, "a msg", (), None)
            consumer.log_callback(rec)
            assert consumer._stopped

        # The supervisor will "receive" the message via the fake run().
        run.side_effect = feed_critical_record
        supervisor.supervise(1, 123, timeout=0.1)

    # The job process was terminated ...
    self.assertEqual(1, self.terminate_job.call_count)
    self.assertEqual(((1,), {}), self.terminate_job.call_args)
    # ... the stop time was recorded ...
    self.assertEqual(1, self.record_job_stop_time.call_count)
    self.assertEqual(((123,), {}), self.record_job_stop_time.call_args)
    # ... the cleanup was triggered ...
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(((123,), {}), self.cleanup_after_job.call_args)
    # ... and the job record was marked failed with the logged message.
    self.assertEqual(1, self.update_job_status_and_error_msg.call_count)
    self.assertEqual(((123, "failed", "a msg"), {}),
                     self.update_job_status_and_error_msg.call_args)
def test_actions_after_job_process_failures(self):
    """Non-zero failure counters must make the supervisor stop the job."""
    # Shorten the delay before the failure counters are checked.
    supervisor.SupervisorLogMessageConsumer.FCC_DELAY = 2
    # The job process is alive and the db record agrees.
    self.is_pid_running.return_value = True
    self.get_job_status.return_value = 'running'
    # Start from a clean slate, then record one hazard failure and
    # two risk failures.
    stats.delete_job_counters(self.job.id)
    stats.incr_counter(self.job.id, "h", "a-failures")
    for _ in range(2):
        stats.incr_counter(self.job.id, "r", "b-failures")

    supervisor.supervise(1, self.job.id, timeout=0.1)

    job_call = ((self.job.id,), {})
    # The job process was terminated ...
    self.assertEqual(1, self.terminate_job.call_count)
    self.assertEqual(((1,), {}), self.terminate_job.call_args)
    # ... the stop time was recorded ...
    self.assertEqual(1, self.record_job_stop_time.call_count)
    self.assertEqual(job_call, self.record_job_stop_time.call_args)
    # ... and the cleanup was triggered.
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(job_call, self.cleanup_after_job.call_args)
def test_actions_after_job_process_crash(self):
    """A dead process still marked 'running' in the db counts as a crash."""
    # The job process is *not* running ...
    self.is_pid_running.return_value = False
    # ... but the database record says it is.
    self.get_job_status.return_value = 'running'

    supervisor.supervise(1, self.job.id, timeout=0.1)

    # The stop time was recorded ...
    self.assertEqual(1, self.record_job_stop_time.call_count)
    self.assertEqual(((self.job.id,), {}),
                     self.record_job_stop_time.call_args)
    # ... the cleanup was triggered ...
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(((self.job.id,), {}),
                     self.cleanup_after_job.call_args)
    # ... and the job record was updated with a crash error message.
    self.assertEqual(1, self.update_job_status_and_error_msg.call_count)
    self.assertEqual(
        ((self.job.id,),
         {'error_msg': 'job process 1 crashed or terminated'}),
        self.update_job_status_and_error_msg.call_args)
def test_actions_after_a_critical_message(self):
    """On a CRITICAL record the job is terminated and marked failed."""
    # Simulate a live job process.
    self.is_pid_running.return_value = True
    target = ('openquake.supervising.'
              'supervisor.SupervisorLogMessageConsumer.run')
    with patch(target) as run:
        def deliver(consumer):
            # Hand a single CRITICAL record to the consumer; it must
            # have stopped itself by the time the callback returns.
            record = logging.LogRecord('oq.job.123', logging.CRITICAL,
                                       'path', 42, 'a msg', (), None)
            consumer.log_callback(record)
            assert consumer._stopped

        # The supervisor will receive the message through the stub.
        run.side_effect = deliver
        supervisor.supervise(1, 123, timeout=0.1)

    # Job process terminated.
    self.assertEqual(1, self.terminate_job.call_count)
    self.assertEqual(((1,), {}), self.terminate_job.call_args)
    # Stop time recorded.
    self.assertEqual(1, self.record_job_stop_time.call_count)
    self.assertEqual(((123,), {}), self.record_job_stop_time.call_args)
    # Cleanup triggered.
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(((123,), {}), self.cleanup_after_job.call_args)
    # Job record updated with the failure and its message.
    self.assertEqual(1, self.update_job_status_and_error_msg.call_count)
    self.assertEqual(((123, 'failed', 'a msg'), {}),
                     self.update_job_status_and_error_msg.call_args)
def main():  # pylint: disable=C0111
    """Entry point: supervise the job whose id/pid are given on argv."""
    # Django settings must be configured before the supervisor module
    # (which touches the ORM) is imported.
    os.environ['DJANGO_SETTINGS_MODULE'] = 'openquake.settings'
    from openquake.supervising import supervisor

    job_id, pid = int(sys.argv[1]), int(sys.argv[2])
    supervisor.supervise(pid, job_id)
def test_actions_after_job_process_termination(self):
    """A finished job triggers stop-time recording and cleanup only."""
    # The job process is *not* running and the db says it succeeded.
    self.is_pid_running.return_value = False
    self.get_job_status.return_value = 'succeeded'

    supervisor.supervise(1, 123, timeout=0.1)

    expected = ((123,), {})
    # Stop time recorded ...
    self.assertEqual(1, self.record_job_stop_time.call_count)
    self.assertEqual(expected, self.record_job_stop_time.call_args)
    # ... and cleanup triggered.
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(expected, self.cleanup_after_job.call_args)
def test_actions_after_job_process_termination(self):
    """When the process has exited successfully, only bookkeeping runs."""
    # No live process; the database reports success.
    self.is_pid_running.return_value = False
    self.get_job_status.return_value = "succeeded"

    supervisor.supervise(1, 123, timeout=0.1)

    # The stop time was recorded.
    self.assertEqual(1, self.record_job_stop_time.call_count)
    self.assertEqual(((123,), {}), self.record_job_stop_time.call_args)
    # The cleanup was triggered.
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(((123,), {}), self.cleanup_after_job.call_args)
def test_actions_after_job_process_termination(self):
    """A successfully finished job is cleaned up and its outcome signalled."""
    # The job process is *not* running; the db reports success.
    self.is_pid_running.return_value = False
    self.get_job_status.return_value = 'succeeded'

    supervisor.supervise(1, 123)

    # The cleanup was triggered ...
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(((123,), {}), self.cleanup_after_job.call_args)
    # ... and the 'succeeded' outcome was signalled.
    self.assertEqual(1, self.signal_job_outcome.call_count)
    self.assertEqual(((123, 'succeeded'), {}),
                     self.signal_job_outcome.call_args)
def test_actions_after_job_process_crash(self):
    """A vanished process with a 'running' db status is handled as a crash."""
    # The job process is *not* running ...
    self.is_pid_running.return_value = False
    # ... but the database record says it is.
    self.get_job_status.return_value = 'running'

    supervisor.supervise(1, 123)

    # The cleanup was triggered ...
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(((123,), {}), self.cleanup_after_job.call_args)
    # ... the 'failed' outcome was signalled ...
    self.assertEqual(1, self.signal_job_outcome.call_count)
    self.assertEqual(((123, 'failed'), {}),
                     self.signal_job_outcome.call_args)
    # ... and the job record was marked failed with a 'crash' message.
    self.assertEqual(1, self.update_job_status_and_error_msg.call_count)
    self.assertEqual(((123, 'failed', 'crash'), {}),
                     self.update_job_status_and_error_msg.call_args)
def test_actions_after_job_process_crash(self):
    """A crashed job is failed with an explanatory error message."""
    # No live process ...
    self.is_pid_running.return_value = False
    # ... while the database still says 'running'.
    self.get_job_status.return_value = 'running'

    supervisor.supervise(1, 123, timeout=0.1)

    job_call = ((123,), {})
    # Stop time recorded.
    self.assertEqual(1, self.record_job_stop_time.call_count)
    self.assertEqual(job_call, self.record_job_stop_time.call_args)
    # Cleanup triggered.
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(job_call, self.cleanup_after_job.call_args)
    # Job record updated with the crash message.
    self.assertEqual(1, self.update_job_status_and_error_msg.call_count)
    self.assertEqual(
        ((123, 'failed', 'job process 1 crashed or terminated'), {}),
        self.update_job_status_and_error_msg.call_args)
def test_actions_after_a_critical_message(self):
    """A critical AMQP message must terminate and fail the job."""
    # Pretend the job process is alive.
    self.is_pid_running.return_value = True
    with patch('openquake.supervising.'
               'supervisor.SupervisorLogMessageConsumer.run') as run:
        def pump_messages(consumer):
            # Keep feeding the same message until the consumer raises
            # StopIteration to signal it is done.
            try:
                while True:
                    consumer.message_callback(amqp.Message(body='a msg'))
            except StopIteration:
                pass

        # The supervisor will receive the message via the stubbed run().
        run.side_effect = pump_messages
        supervisor.supervise(1, 123)

    # The job process was terminated ...
    self.assertEqual(1, self.terminate_job.call_count)
    self.assertEqual(((1,), {}), self.terminate_job.call_args)
    # ... the cleanup was triggered ...
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(((123,), {}), self.cleanup_after_job.call_args)
    # ... the 'failed' outcome was signalled ...
    self.assertEqual(1, self.signal_job_outcome.call_count)
    self.assertEqual(((123, 'failed'), {}),
                     self.signal_job_outcome.call_args)
    # ... and the job record was failed with the message text.
    self.assertEqual(1, self.update_job_status_and_error_msg.call_count)
    self.assertEqual(((123, 'failed', 'a msg'), {}),
                     self.update_job_status_and_error_msg.call_args)
def test_actions_after_job_process_failures(self):
    """Failure counters above zero cause termination and cleanup."""
    # Check the failure counters almost immediately.
    supervisor.SupervisorLogMessageConsumer.FCC_DELAY = 2
    # The job process is alive and the db record agrees.
    self.is_pid_running.return_value = True
    self.get_job_status.return_value = 'running'
    # Reset the counters, then record one hazard and two risk failures.
    stats.delete_job_counters(123)
    stats.incr_counter(123, "h", "a:failed")
    for _ in range(2):
        stats.incr_counter(123, "r", "b:failed")

    supervisor.supervise(1, 123, timeout=0.1)

    # The job process was terminated ...
    self.assertEqual(1, self.terminate_job.call_count)
    self.assertEqual(((1,), {}), self.terminate_job.call_args)
    # ... the stop time was recorded ...
    self.assertEqual(1, self.record_job_stop_time.call_count)
    self.assertEqual(((123,), {}), self.record_job_stop_time.call_args)
    # ... and the cleanup was triggered.
    self.assertEqual(1, self.cleanup_after_job.call_count)
    self.assertEqual(((123,), {}), self.cleanup_after_job.call_args)
_do_run_hazard(job, exports) except Exception, ex: logs.LOG.critical("Calculation failed with exception: '%s'" % str(ex)) raise finally: job.is_running = False job.save() return supervisor_pid = os.fork() if not supervisor_pid: # supervisor process logs.set_logger_level(logs.logging.root, log_level) # TODO: deal with KVS garbage collection supervisor.supervise(job_pid, job.id, log_file=log_file) return # parent process # ignore Ctrl-C as well as supervisor process does. thus only # job executor terminates on SIGINT supervisor.ignore_sigint() # wait till both child processes are done os.waitpid(job_pid, 0) os.waitpid(supervisor_pid, 0) # Refresh the job record, since the forked processes are going to modify # job state. return models.OqJob.objects.get(id=job.id)
job.save() raise else: job.status = 'succeeded' job.save() return supervisor_pid = os.fork() if not supervisor_pid: # supervisor process logs.set_logger_level(logs.logging.root, log_level) supervisor_pid = os.getpid() job.supervisor_pid = supervisor_pid job.job_pid = job_pid job.save() supervisor.supervise(job_pid, job.id, log_file=log_file) return # parent process # ignore Ctrl-C as well as supervisor process does. thus only # job executor terminates on SIGINT supervisor.ignore_sigint() # wait till both child processes are done os.waitpid(job_pid, 0) os.waitpid(supervisor_pid, 0) return job def _launch_job(job_ctxt, sections):
LOG.critical("Job failed with exception: '%s'" % str(ex)) a_job.set_status('failed') raise else: a_job.set_status('succeeded') return supervisor_pid = os.fork() if not supervisor_pid: # supervisor process supervisor_pid = os.getpid() job = OqJob.objects.get(id=a_job.job_id) job.supervisor_pid = supervisor_pid job.job_pid = job_pid job.save() supervisor.supervise(job_pid, a_job.job_id) return # parent process # ignore Ctrl-C as well as supervisor process does. thus only # job executor terminates on SIGINT supervisor.ignore_sigint() # wait till both child processes are done os.waitpid(job_pid, 0) os.waitpid(supervisor_pid, 0) def parse_config_file(config_file): """ We have a single configuration file which may contain a risk section and
job.save() raise else: job.status = 'succeeded' job.save() return supervisor_pid = os.fork() if not supervisor_pid: # supervisor process logs.set_logger_level(logs.logging.root, log_level) supervisor_pid = os.getpid() job.supervisor_pid = supervisor_pid job.job_pid = job_pid job.save() supervisor.supervise(job_pid, job.id) return # parent process # ignore Ctrl-C as well as supervisor process does. thus only # job executor terminates on SIGINT supervisor.ignore_sigint() # wait till both child processes are done os.waitpid(job_pid, 0) os.waitpid(supervisor_pid, 0) return job def _launch_job(job_ctxt, sections):
calculation.status = 'failed' calculation.save() raise else: calculation.status = 'succeeded' calculation.save() return supervisor_pid = os.fork() if not supervisor_pid: # supervisor process supervisor_pid = os.getpid() calculation.supervisor_pid = supervisor_pid calculation.job_pid = calc_pid calculation.save() supervisor.supervise(calc_pid, calculation.id) return # parent process # ignore Ctrl-C as well as supervisor process does. thus only # job executor terminates on SIGINT supervisor.ignore_sigint() # wait till both child processes are done os.waitpid(calc_pid, 0) os.waitpid(supervisor_pid, 0) return calculation def _launch_calculation(calc_proxy, sections):