def test_count_failed_nodes_with_a_node_that_went_offline(self):
    # Expected result: 1 failed node. The db mock reports a single node
    # ("N1") with status "up" while the live mock reports no live nodes
    # at all.
    known_node = models.CNodeStats(
        oq_job=self.job, node="N1", current_status="up")
    self.db_mock.return_value = {"N1": known_node}
    self.live_mock.return_value = set()

    self.assertEqual(1, monitor.count_failed_nodes(self.job))
def abort_due_to_failed_nodes(job_id):
    """Decide whether failed compute nodes warrant aborting the job.

    An abort is called for only when both of these hold:
        - at least one compute node failure was observed
        - the job has gone without progress for longer than the timeout

    :param int job_id: the id of the job in question
    :returns: the number of failed compute nodes if the job should be
        aborted, zero otherwise.
    """
    logging.debug("> check for failed nodes")

    job = OqJob.objects.get(id=job_id)
    node_failures = monitor.count_failed_nodes(job)
    abort_count = 0

    if not node_failures:
        logging.debug('>> no failures')
    else:
        logging.debug(">> failed_nodes: %s", node_failures)
        idle_period, idle_limit = stats.get_progress_timing_data(job)
        logging.debug(">> no_progress_period: %s", idle_period)
        logging.debug(">> timeout: %s", idle_limit)
        # Failures alone are not enough; the job must also have stalled
        # for longer than the permitted period.
        abort_count = node_failures if idle_period > idle_limit else 0

    logging.debug("< check for failed nodes")
    return abort_count
def abort_due_to_failed_nodes(job_id):
    """Should the job be aborted due to failed compute nodes?

    The job should be aborted when the following conditions coincide:
        - we observed failed compute nodes
        - the "no progress" timeout has been exceeded

    :param int job_id: the id of the job in question
    :returns: the number of failed compute nodes if the job should be
        aborted, zero otherwise.
    """
    logging.debug("> abort_due_to_failed_nodes")
    result = 0

    job = OqJob.objects.get(id=job_id)
    failed_nodes = monitor.count_failed_nodes(job)
    # Values are handed to the logger as arguments so that the message is
    # only formatted when debug logging is actually enabled.
    logging.debug(">> failed_nodes: %s", failed_nodes)

    # NOTE(review): monitor.count_failed_nodes() appears to use -1 to signal
    # "no live compute nodes"; that value is truthy and would be returned
    # unchanged below -- confirm callers only test the result for truthiness.
    if failed_nodes:
        no_progress_period, timeout = stats.get_progress_timing_data(job)
        logging.debug(">> no_progress_period: %s", no_progress_period)
        logging.debug(">> timeout: %s", timeout)
        # Only report the failures once the job has also stopped making
        # progress for longer than the timeout.
        if no_progress_period > timeout:
            result = failed_nodes

    logging.debug("< abort_due_to_failed_nodes")
    return result
def test_count_failed_nodes_with_failures_before_calculation(self):
    # Expected result: 1 node failure. "N7" stands for a node that was
    # down from the very start and never recovered, i.e. it never took on
    # any tasks. Only nodes that were functioning at some point during the
    # calculation and failed *afterwards* are counted.
    went_down = models.CNodeStats(
        oq_job=self.job, node="N6", current_status="up")
    went_down.save(using="job_superv")
    never_up = models.CNodeStats(
        oq_job=self.job, node="N7", current_status="down")
    self.db_mock.return_value = {"N6": went_down, "N7": never_up}
    self.live_mock.return_value = set()

    self.assertEqual(1, monitor.count_failed_nodes(self.job))

    # Re-read the node that failed: its record must reflect the failure.
    reloaded = models.CNodeStats.objects.get(id=went_down.id)
    self.assertEqual("down", reloaded.current_status)
    self.assertEqual(1, reloaded.failures)
def test_count_failed_nodes_with_failures_during_calculation(self):
    # Expected result: 2 node failures. The function under test reports
    # the total number of node failures that occurred during a
    # calculation, *not* the number of nodes that are currently down.
    up_node = models.CNodeStats(
        oq_job=self.job, node="N3", current_status="up")
    down_node = models.CNodeStats(
        oq_job=self.job, node="N4", current_status="down", failures=1)
    self.db_mock.return_value = {"N3": up_node, "N4": down_node}
    self.live_mock.return_value = {"N5"}

    self.assertEqual(2, monitor.count_failed_nodes(self.job))

    # The previously unknown live node ("N5") must have been written to
    # the database; unpacking insists on exactly one matching row.
    (new_node,) = models.CNodeStats.objects.filter(
        oq_job=self.job, node="N5")
    self.assertEqual("up", new_node.current_status)
    self.assertEqual(0, new_node.failures)
def test_count_failed_nodes_with_failed_and_recovered_node(self):
    # Expected result: 1 node failure. The node failed and then came
    # back; the recovery must leave its failures counter untouched.
    node = models.CNodeStats(
        oq_job=self.job, node="N8", current_status="up")
    node.save(using="job_superv")
    self.assertEqual(0, node.failures)

    # Take the node down and check that one failure is on record after
    # re-reading it.
    node.current_status = "down"
    node.save(using="job_superv")
    node = models.CNodeStats.objects.get(id=node.id)
    self.assertEqual(1, node.failures)

    # The node shows up as live again.
    self.db_mock.return_value = {"N8": node}
    self.live_mock.return_value = {"N8"}
    self.assertEqual(1, monitor.count_failed_nodes(self.job))

    # The stored record shows the recovery and the unchanged counter.
    recovered = models.CNodeStats.objects.get(id=node.id)
    self.assertEqual("up", recovered.current_status)
    self.assertEqual(1, recovered.failures)
def _switch_to_job_phase(job_ctxt, ctype, status):
    """Move a job into the given phase of execution.

    A `job_phase_stats` record is created for the phase and the new status
    is logged. When the phase is "executing" the compute nodes are checked
    as well; if there are no live nodes the process exits with status 1.

    :param job_ctxt: An :class:`~openquake.engine.JobContext` instance.
    :param str ctype: calculation type (hazard|risk)
    :param str status: one of the following: pre_executing, executing,
        post_executing, post_processing, export, clean_up, complete
    """
    job = OqJob.objects.get(id=job_ctxt.job_id)
    JobPhaseStats.objects.create(oq_job=job, ctype=ctype, job_status=status)
    phase_label = "%s (%s)" % (status, ctype)
    logs.log_progress(phase_label, 1)

    if status != "executing":
        return

    # Calling this at the start of the execute phase records the compute
    # nodes that are available, so failed nodes can be detected later on.
    # A result of -1 means there are no live nodes at all.
    if monitor.count_failed_nodes(job) == -1:
        logs.LOG.critical("No live compute nodes, aborting calculation")
        sys.exit(1)
def test_count_failed_nodes_with_zero_nodes(self):
    # With no nodes reported by either the db mock or the live mock, i.e.
    # zero live nodes at the start of the calculation, the function is
    # expected to signal this by returning -1.
    self.live_mock.return_value = set()
    self.db_mock.return_value = {}

    self.assertEqual(-1, monitor.count_failed_nodes(self.job))