def timeout_callback(self):
    """
    On timeout expiration check if the job process is still running,
    and act accordingly if not.
    """
    if not supervising.is_pid_running(self.pid):
        logging.info('Process %s not running', self.pid)

        # see what status was left in the database by the exited job
        job_status = get_job_status(self.job_id)

        if job_status == 'succeeded':
            signalling.signal_job_outcome(self.job_id, 'succeeded')
        else:
            signalling.signal_job_outcome(self.job_id, 'failed')

            if job_status == 'running':
                # The job crashed without having a chance to update the
                # status in the database. We do it here.
                update_job_status_and_error_msg(self.job_id, 'failed',
                                                'crash')

        cleanup_after_job(self.job_id)

        raise StopIteration
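# The callback above ends supervision by raising StopIteration, so it is meant
# to be invoked repeatedly on a timer. A minimal sketch of such a polling
# driver follows; ``run_periodically`` and its 5-second interval are
# illustrative assumptions, not the actual OpenQuake scheduling mechanism.
import time


def run_periodically(callback, interval=5.0):
    """Call ``callback`` every ``interval`` seconds until it raises
    StopIteration (illustrative driver, not the real supervisor loop)."""
    while True:
        try:
            callback()
        except StopIteration:
            # The callback detected that the job process exited and has
            # already performed its cleanup; stop polling.
            break
        time.sleep(interval)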
def main():
    """
    Look through all jobs with status "running" and check the status of
    their supervisors: if one is missing, start a new supervisor process
    for it.
    """
    qs = OqJob.objects.filter(status="running").values_list(
        "id", "job_pid", "supervisor_pid")
    for job_id, job_pid, supervisor_pid in qs:
        if not supervising.is_pid_running(supervisor_pid):
            proc = multiprocessing.Process(target=supervise,
                                           args=(job_id, job_pid))
            proc.start()
def main():
    """
    Look through all jobs with status "running" and check the status of
    their supervisors: if one is missing -- do
    :meth:`openquake.job.spawn_job_supervisor` for it.
    """
    qs = OqJob.objects.filter(status='running') \
                      .values_list('id', 'job_pid', 'supervisor_pid')
    for job_id, job_pid, supervisor_pid in qs:
        if not supervising.is_pid_running(supervisor_pid):
            job.spawn_job_supervisor(job_id, job_pid)
def main():
    """
    Look through all jobs with status "running" and check the status of
    their supervisors: if one is missing, start a new supervisor process
    for it.
    """
    qs = OqJob.objects.filter(status='running') \
                      .values_list('id', 'job_pid', 'supervisor_pid')
    for job_id, job_pid, supervisor_pid in qs:
        if not supervising.is_pid_running(supervisor_pid):
            proc = multiprocessing.Process(target=supervise,
                                           args=(job_id, job_pid))
            proc.start()
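# Each variant of ``main`` above relies on ``supervising.is_pid_running`` to
# decide whether a supervisor process is still alive. That helper is not shown
# in this excerpt; a common POSIX approach is to send signal 0 with
# ``os.kill``. The sketch below is an assumption about how it could look, not
# the actual OpenQuake implementation.
import errno
import os


def is_pid_running(pid):
    """Return True if a process with ``pid`` appears to be running."""
    try:
        os.kill(pid, 0)  # signal 0 performs error checking only, no delivery
    except OSError as exc:
        # ESRCH: no such process; EPERM: process exists but is not ours.
        return exc.errno == errno.EPERM
    return True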
def timeout_callback(self):
    """
    On timeout expiration check if the job process is still running
    and whether it experienced any failures.

    Terminate the job process in the latter case.
    """
    def failure_counters_need_check():
        """Return `True` if failure counters should be checked."""
        self.fcc_delay_value += 1
        result = self.fcc_delay_value >= self.FCC_DELAY
        if result:
            self.fcc_delay_value = 0
        return result

    process_stopped = job_failed = False
    message = None

    if not supervising.is_pid_running(self.job_pid):
        message = ('job process %s crashed or terminated' % self.job_pid)
        process_stopped = True
    elif failure_counters_need_check():
        # Job process is still running.
        failures = stats.failure_counters(self.job_id)
        # Initialise so the check below is safe even when there are failures.
        failed_nodes = None
        if failures:
            message = "job terminated with failures: %s" % failures
        else:
            failed_nodes = abort_due_to_failed_nodes(self.job_id)
            if failed_nodes:
                message = ("job terminated due to %s failed nodes"
                           % failed_nodes)
        if failures or failed_nodes:
            terminate_job(self.job_pid)
            job_failed = True

    if job_failed or process_stopped:
        job_status = get_job_status(self.job_id)
        if process_stopped and job_status == 'complete':
            message = 'job process %s succeeded' % self.job_pid
            self.selflogger.debug(message)
        elif job_status != 'complete':
            # The job crashed without having a chance to update the
            # status in the database, or it has been running even though
            # there were failures. We update the job status here.
            self.selflogger.error(message)
            update_job_status_and_error_msg(self.job_id, error_msg=message)

        record_job_stop_time(self.job_id)
        cleanup_after_job(self.job_id)
        raise StopIteration()
def timeout_callback(self):
    """
    On timeout expiration check if the job process is still running
    and whether it experienced any failures.

    Terminate the job process in the latter case.
    """
    def failure_counters_need_check():
        """Return `True` if failure counters should be checked."""
        self.fcc_delay_value += 1
        result = self.fcc_delay_value >= self.FCC_DELAY
        if result:
            self.fcc_delay_value = 0
        return result

    process_stopped = job_failed = False
    message = None

    if not supervising.is_pid_running(self.job_pid):
        message = ('job process %s crashed or terminated' % self.job_pid)
        process_stopped = True
    elif failure_counters_need_check():
        # Job process is still running.
        failures = stats.failure_counters(self.job_id)
        # Initialise so the check below is safe even when there are failures.
        failed_nodes = None
        if failures:
            message = "job terminated with failures: %s" % failures
        else:
            failed_nodes = abort_due_to_failed_nodes(self.job_id)
            if failed_nodes:
                message = ("job terminated due to %s failed nodes"
                           % failed_nodes)
        if failures or failed_nodes:
            terminate_job(self.job_pid)
            job_failed = True

    if job_failed or process_stopped:
        job_status = get_job_status(self.job_id)
        if process_stopped and job_status == 'succeeded':
            message = 'job process %s succeeded' % self.job_pid
            self.selflogger.info(message)
        elif job_status == 'running':
            # The job crashed without having a chance to update the
            # status in the database, or it has been running even though
            # there were failures. We update the job status here.
            self.selflogger.error(message)
            update_job_status_and_error_msg(self.job_id, 'failed', message)

        record_job_stop_time(self.job_id)
        cleanup_after_job(self.job_id)
        raise StopIteration()
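# ``terminate_job`` is referenced above but not defined in this excerpt. On
# POSIX it would typically send a signal to the job process; the sketch below
# is an assumption (the real helper may target a whole process group or try a
# gentler signal before SIGKILL).
import os
import signal


def terminate_job(pid):
    """Forcibly stop the job process (illustrative sketch only)."""
    try:
        os.kill(pid, signal.SIGKILL)
    except OSError:
        # The process may already have exited; nothing left to do.
        pass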