Beispiel #1
0
 def test__db_cnode_status_and_two_jobs(self):
     # Only the node statistics of the job that is asked for are expected;
     # the records stored for the other job must not show up in the result.
     other_job = engine.prepare_job()
     for name, state in (("P1", "up"), ("P2", "down"), ("P3", "down")):
         stats = models.CNodeStats(
             oq_job=other_job, node=name, current_status=state)
         stats.save(using="job_superv")

     queried_job = engine.prepare_job()
     expected = {}
     for name, state in (("Q2", "down"), ("Q3", "down")):
         stats = models.CNodeStats(
             oq_job=queried_job, node=name, current_status=state)
         stats.save(using="job_superv")
         expected[name] = stats

     actual = monitor._db_cnode_status(queried_job.id)
     self.assertEqual(expected, actual)
Beispiel #2
0
 def test_count_failed_nodes_with_a_node_that_went_offline(self):
     # Expected result: 1 failed node. The database record says "N1" is
     # "up" but the set of live nodes is empty.
     self.live_mock.return_value = set()
     stats = models.CNodeStats(
         oq_job=self.job, node="N1", current_status="up")
     self.db_mock.return_value = {"N1": stats}

     self.assertEqual(1, monitor.count_failed_nodes(self.job))
Beispiel #3
0
 def test__db_cnode_status_and_wrong_job_id(self):
     # Records are stored for `job`, but the query uses a different job id
     # (-1); the expected result is an empty dictionary.
     job = engine.prepare_job()
     node_states = (("O1", "up"), ("O2", "down"), ("O3", "down"))
     for name, state in node_states:
         stats = models.CNodeStats(
             oq_job=job, node=name, current_status=state)
         stats.save(using="job_superv")

     self.assertEqual({}, monitor._db_cnode_status(-1))
Beispiel #4
0
 def test_count_failed_nodes_with_failures_before_calculation(self):
     # Expected result: 1 node failure. "N7" simulates a node that has
     # been down from the very beginning and never recovered, i.e. it
     # never took on any tasks. Only nodes that were functioning at some
     # time during the calculation and *then* failed are counted, which
     # applies to "N6" only.
     was_up = models.CNodeStats(
         oq_job=self.job, node="N6", current_status="up")
     was_up.save(using="job_superv")
     never_up = models.CNodeStats(
         oq_job=self.job, node="N7", current_status="down")
     self.db_mock.return_value = {"N6": was_up, "N7": never_up}
     self.live_mock.return_value = set()

     self.assertEqual(1, monitor.count_failed_nodes(self.job))

     # The failed node has been updated in the database to capture that.
     reloaded = models.CNodeStats.objects.get(id=was_up.id)
     self.assertEqual("down", reloaded.current_status)
     self.assertEqual(1, reloaded.failures)
Beispiel #5
0
 def test_count_failed_nodes_with_failures_during_calculation(self):
     # Expected result: 2 node failures. Please note that the function
     # under test counts the total number of node failures that occurred
     # during a calculation and *not* the number of currently failed nodes.
     up_in_db = models.CNodeStats(
         oq_job=self.job, node="N3", current_status="up")
     failed_before = models.CNodeStats(
         oq_job=self.job, node="N4", current_status="down", failures=1)
     self.db_mock.return_value = {"N3": up_in_db, "N4": failed_before}
     self.live_mock.return_value = set(["N5"])

     self.assertEqual(2, monitor.count_failed_nodes(self.job))

     # The node seen for the first time ("N5") was written to the database
     # as well; exactly one record is expected for it.
     matches = models.CNodeStats.objects.filter(oq_job=self.job, node="N5")
     [newcomer] = matches
     self.assertEqual("up", newcomer.current_status)
     self.assertEqual(0, newcomer.failures)
Beispiel #6
0
    def test_cnode_stats_failure_counter_with_up_down_transition(self):
        # An up -> down transition is expected to increment the failures
        # counter of the record.
        stats = models.CNodeStats(
            oq_job=self.job, node="N3", current_status="up")
        stats.save(using="job_superv")

        stats.current_status = "down"
        stats.save(using="job_superv")

        reloaded = models.CNodeStats.objects.get(id=stats.id)
        self.assertEqual(1, reloaded.failures)
Beispiel #7
0
 def test__db_cnode_status(self):
     # Every record stored for the job is expected in the result, keyed by
     # its node name.
     job = engine.prepare_job()
     node_states = (("N1", "up"), ("N2", "down"), ("N3", "down"))
     expected = {}
     for name, state in node_states:
         stats = models.CNodeStats(
             oq_job=job, node=name, current_status=state)
         stats.save(using="job_superv")
         expected[name] = stats

     actual = monitor._db_cnode_status(job.id)
     self.assertEqual(expected, actual)
Beispiel #8
0
    def test_cnode_stats_failure_counter_with_down_up_transition(self):
        # The failures counter is only stepped in case of an
        #   up -> down transition
        # so the down -> up transition performed here must leave it at 0.
        stats = models.CNodeStats(
            oq_job=self.job, node="N6", current_status="down")
        stats.save(using="job_superv")

        stats.current_status = "up"
        stats.save(using="job_superv")

        reloaded = models.CNodeStats.objects.get(id=stats.id)
        self.assertEqual(0, reloaded.failures)
Beispiel #9
0
def count_failed_nodes(job):
    """Check compute nodes and return the total number of failures.

    Please note that this function counts the nodes that failed *at some
    time* during a calculation and *not* the number of currently failed
    nodes: a node whose `failures` counter is greater than zero is counted
    even if it is live again.

    Besides counting, the database records are brought up to date:

    - nodes stored as "up" (with no failures so far) that are no longer
      live are saved with status "down"
    - live nodes without a database record get a new record with
      status "up"
    - live nodes that failed before are saved with status "up"

    :param job: The :class:`openquake.db.models.OqJob` instance to use
    :return: the number of failures i.e. how many nodes went from "up" to
        "down" *at some time* during the calculation, or -1 if there are
        neither live compute nodes nor any node records in the database
    """
    live_nodes = _live_cnode_status()
    # The database lookup is performed by job id (other callers pass plain
    # ids such as `job.id`), hence the id is handed over here instead of the
    # model instance.
    # NOTE(review): confirm against the definition of `_db_cnode_status()`.
    db_stats = _db_cnode_status(job.id)

    if not live_nodes and not db_stats:
        # No live compute nodes and nothing stored in the database; this will
        # never work -> indicate failure
        return -1

    def set_status(node, status):
        """Update the status of the given node in the database."""
        cs = db_stats[node]
        cs.current_status = status
        cs.save(using="job_superv")

    # working nodes according to the database
    dworking_nodes = set(cs.node for cs in db_stats.values()
                         if cs.current_status == "up" and cs.failures == 0)
    # nodes that have failed at least once at some time during the calculation
    dfailed_nodes = set(cs.node for cs in db_stats.values() if cs.failures > 0)

    # Which working nodes stored in the db have gone bad/down?
    new_failed = dworking_nodes - live_nodes
    for node in new_failed:
        set_status(node, "down")

    # Any entirely new nodes?
    for node in live_nodes - set(db_stats):
        cs = models.CNodeStats(oq_job=job, node=node, current_status="up")
        cs.save(using="job_superv")

    # Any nodes that came back after a failure?
    for node in live_nodes.intersection(dfailed_nodes):
        set_status(node, "up")

    # Failures known from the database plus the ones detected just now.
    return len(dfailed_nodes) + len(new_failed)
Beispiel #10
0
    def test_cnode_stats_without_state_transition_and_same_timestamps(self):
        # Saving a record whose compute node status did not change is
        # expected to leave the `previous_ts` and `current_ts` time stamps
        # untouched.
        stats = models.CNodeStats(
            oq_job=self.job, node="N8", current_status="down")
        stats.save(using="job_superv")
        ts_before = stats.current_ts

        # Modify something other than the status and save again.
        stats.node = "N8+1"
        stats.save(using="job_superv")

        reloaded = models.CNodeStats.objects.get(id=stats.id)
        self.assertIs(None, reloaded.previous_ts)
        self.assertEqual(ts_before, reloaded.current_ts)
Beispiel #11
0
    def test_cnode_stats_with_state_transition_and_managed_timestamps(self):
        # In case of a state transition the former `current_ts` value is
        # expected to end up in `previous_ts`.
        stats = models.CNodeStats(
            oq_job=self.job, node="N7", current_status="down")
        stats.save(using="job_superv")
        ts_before = stats.current_ts

        stats.current_status = "up"
        stats.save(using="job_superv")

        reloaded = models.CNodeStats.objects.get(id=stats.id)
        self.assertIsNot(None, reloaded.previous_ts)
        self.assertEqual(ts_before, reloaded.previous_ts)
Beispiel #12
0
    def test_count_failed_nodes_with_failed_and_recovered_node(self):
        # Expected result: 1 node failure. The node failed and recovered;
        # the recovery must not affect its failures counter.
        node = models.CNodeStats(
            oq_job=self.job, node="N8", current_status="up")
        node.save(using="job_superv")
        self.assertEqual(0, node.failures)

        # Let the node fail and pick up the stored record.
        node.current_status = "down"
        node.save(using="job_superv")
        node = models.CNodeStats.objects.get(id=node.id)
        self.assertEqual(1, node.failures)

        # The node is reported as live again.
        self.db_mock.return_value = {"N8": node}
        self.live_mock.return_value = set(["N8"])
        self.assertEqual(1, monitor.count_failed_nodes(self.job))

        # The stored record has been updated to capture the recovery.
        node = models.CNodeStats.objects.get(id=node.id)
        self.assertEqual("up", node.current_status)
        self.assertEqual(1, node.failures)
Beispiel #13
0
 def test_cnode_stats_with_correct_data(self):
     # Saving the db record is expected to complete w/o raising an
     # exception.
     stats = models.CNodeStats(
         oq_job=self.job, node="N1", current_status="up")
     stats.save(using="job_superv")