Python count_failed_nodesの例、openquake.utils.monitor.count_failed_nodes Pythonの例

コード例 #1

0

ファイルを表示

ファイル: utils_monitor_test.py プロジェクト: PseudononymousEpistle/oq-engine

 def test_count_failed_nodes_with_a_node_that_went_offline(self):
     # The db mock returns node N1 with status "up" while the live-nodes
     # mock returns an empty set; the expected count is 1 failed node.
     known_node = models.CNodeStats(
         oq_job=self.job, node="N1", current_status="up")
     self.live_mock.return_value = set()
     self.db_mock.return_value = {"N1": known_node}
     self.assertEqual(1, monitor.count_failed_nodes(self.job))

コード例 #2

0

ファイルを表示

ファイル: utils_monitor_test.py プロジェクト: bwyss/oq-engine

 def test_count_failed_nodes_with_a_node_that_went_offline(self):
     # Expected outcome: exactly 1 failed node. N1 is handed back by the
     # db mock as "up", but the live-nodes mock no longer contains it.
     stats_by_node = {
         "N1": models.CNodeStats(
             oq_job=self.job, node="N1", current_status="up"),
     }
     self.db_mock.return_value = stats_by_node
     self.live_mock.return_value = set()
     failure_count = monitor.count_failed_nodes(self.job)
     self.assertEqual(1, failure_count)

コード例 #3

0

ファイルを表示

ファイル: supervisor.py プロジェクト: matley/oq-engine

def abort_due_to_failed_nodes(job_id):
    """Decide whether failed compute nodes warrant aborting the job.

    An abort is indicated only when both of these hold:
        - `monitor.count_failed_nodes` reports a non-zero value
        - the "no progress" period is greater than the timeout

    :param int job_id: the id of the job in question
    :returns: the value reported by `monitor.count_failed_nodes` if the job
        should be aborted, zero otherwise.
    """
    logging.debug("> check for failed nodes")

    job = OqJob.objects.get(id=job_id)
    failure_count = monitor.count_failed_nodes(job)
    abort_count = 0

    if not failure_count:
        logging.debug('>> no failures')
    else:
        logging.debug(">> failed_nodes: %s", failure_count)
        # Timing data is only fetched once failures have been observed.
        idle_period, limit = stats.get_progress_timing_data(job)
        logging.debug(">> no_progress_period: %s", idle_period)
        logging.debug(">> timeout: %s", limit)
        if idle_period > limit:
            abort_count = failure_count

    logging.debug("< check for failed nodes")
    return abort_count

コード例 #4

0

ファイルを表示

def abort_due_to_failed_nodes(job_id):
    """Should the job be aborted due to failed compute nodes?

    The job should be aborted when the following conditions coincide:
        - we observed failed compute nodes
        - the "no progress" timeout has been exceeded

    :param int job_id: the id of the job in question
    :returns: the number of failed compute nodes if the job should be
        aborted, zero otherwise.
    """
    logging.debug("> abort_due_to_failed_nodes")
    result = 0

    job = OqJob.objects.get(id=job_id)
    failed_nodes = monitor.count_failed_nodes(job)
    # Values are passed as logging arguments (not pre-formatted with `%`) so
    # the message is only built when DEBUG logging is actually enabled.
    logging.debug(">> failed_nodes: %s", failed_nodes)

    if failed_nodes:
        # Timing data is only needed once at least one failure was observed.
        no_progress_period, timeout = stats.get_progress_timing_data(job)
        logging.debug(">> no_progress_period: %s", no_progress_period)
        logging.debug(">> timeout: %s", timeout)
        if no_progress_period > timeout:
            result = failed_nodes

    logging.debug("< abort_due_to_failed_nodes")
    return result

コード例 #5

0

ファイルを表示

ファイル: utils_monitor_test.py プロジェクト: PseudononymousEpistle/oq-engine

 def test_count_failed_nodes_with_failures_before_calculation(self):
     # Expected outcome: 1 node failure. N7 models a node that was already
     # "down" from the very beginning and never recovered (it never took on
     # any tasks), so it is not counted. Only a node that was functioning
     # at some point and *then* failed (N6 here) contributes to the count.
     up_node = models.CNodeStats(
         oq_job=self.job, node="N6", current_status="up")
     up_node.save(using="job_superv")
     down_node = models.CNodeStats(
         oq_job=self.job, node="N7", current_status="down")
     self.db_mock.return_value = {"N6": up_node, "N7": down_node}
     self.live_mock.return_value = set()
     self.assertEqual(1, monitor.count_failed_nodes(self.job))
     # Re-read N6 from the database to check that its failure was recorded.
     reloaded = models.CNodeStats.objects.get(id=up_node.id)
     self.assertEqual("down", reloaded.current_status)
     self.assertEqual(1, reloaded.failures)

コード例 #6

0

ファイルを表示

ファイル: utils_monitor_test.py プロジェクト: PseudononymousEpistle/oq-engine

 def test_count_failed_nodes_with_failures_during_calculation(self):
     # Expected outcome: 2 node failures. The function under test reports
     # the total number of node failures that occurred during a
     # calculation, *not* the number of nodes that are currently down.
     up_node = models.CNodeStats(
         oq_job=self.job, node="N3", current_status="up")
     down_node = models.CNodeStats(
         oq_job=self.job, node="N4", current_status="down", failures=1)
     self.db_mock.return_value = {"N3": up_node, "N4": down_node}
     self.live_mock.return_value = {"N5"}
     self.assertEqual(2, monitor.count_failed_nodes(self.job))
     # The previously unknown live node ("N5") must now exist in the
     # database; the unpacking fails unless exactly one record matches.
     (new_node,) = models.CNodeStats.objects.filter(
         oq_job=self.job, node="N5")
     self.assertEqual("up", new_node.current_status)
     self.assertEqual(0, new_node.failures)

コード例 #7

0

ファイルを表示

ファイル: utils_monitor_test.py プロジェクト: bwyss/oq-engine

 def test_count_failed_nodes_with_failures_before_calculation(self):
     # One node failure is expected. N7 stands for a node that was "down"
     # from the start and never recovered, i.e. it never took on any tasks;
     # such nodes are ignored. Only nodes that were working at some time
     # during the calculation and failed afterwards (N6) are counted.
     working = models.CNodeStats(
         oq_job=self.job, node="N6", current_status="up")
     working.save(using="job_superv")
     never_up = models.CNodeStats(
         oq_job=self.job, node="N7", current_status="down")
     self.db_mock.return_value = {"N6": working, "N7": never_up}
     self.live_mock.return_value = set()
     failure_count = monitor.count_failed_nodes(self.job)
     self.assertEqual(1, failure_count)
     # Fetch N6 again: the stored record should now reflect the failure.
     stored = models.CNodeStats.objects.get(id=working.id)
     self.assertEqual("down", stored.current_status)
     self.assertEqual(1, stored.failures)

コード例 #8

0

ファイルを表示

ファイル: utils_monitor_test.py プロジェクト: bwyss/oq-engine

 def test_count_failed_nodes_with_failures_during_calculation(self):
     # Two node failures are expected. Note that the function under test
     # counts every node failure that happened during a calculation and
     # *not* merely the nodes that are down right now.
     node_stats = {
         "N3": models.CNodeStats(
             oq_job=self.job, node="N3", current_status="up"),
         "N4": models.CNodeStats(
             oq_job=self.job, node="N4", current_status="down",
             failures=1),
     }
     self.db_mock.return_value = node_stats
     self.live_mock.return_value = {"N5"}
     failure_count = monitor.count_failed_nodes(self.job)
     self.assertEqual(2, failure_count)
     # The new live node ("N5") should have been written to the database;
     # unpacking into a single name requires exactly one matching record.
     (added,) = models.CNodeStats.objects.filter(
         oq_job=self.job, node="N5")
     self.assertEqual("up", added.current_status)
     self.assertEqual(0, added.failures)

コード例 #9

0

ファイルを表示

ファイル: utils_monitor_test.py プロジェクト: bwyss/oq-engine

    def test_count_failed_nodes_with_failed_and_recovered_node(self):
        # Expected outcome: 1 node failure. The node went down and came
        # back; recovering must leave its failures counter untouched.
        node = models.CNodeStats(
            oq_job=self.job, node="N8", current_status="up")
        node.save(using="job_superv")
        self.assertEqual(0, node.failures)

        # Persist the "down" transition, then reload the record so the
        # failures value is read from the database.
        node.current_status = "down"
        node.save(using="job_superv")
        failed_node = models.CNodeStats.objects.get(id=node.id)
        self.assertEqual(1, failed_node.failures)

        # N8 is reported as live again.
        self.db_mock.return_value = {"N8": failed_node}
        self.live_mock.return_value = {"N8"}
        self.assertEqual(1, monitor.count_failed_nodes(self.job))

        # The stored record is "up" again with the failure still counted.
        recovered = models.CNodeStats.objects.get(id=failed_node.id)
        self.assertEqual("up", recovered.current_status)
        self.assertEqual(1, recovered.failures)

コード例 #10

0

ファイルを表示

ファイル: utils_monitor_test.py プロジェクト: PseudononymousEpistle/oq-engine

    def test_count_failed_nodes_with_failed_and_recovered_node(self):
        # One node failure is expected: N8 failed once and then recovered,
        # and the recovery does not change its failures counter.
        stats_record = models.CNodeStats(
            oq_job=self.job, node="N8", current_status="up")
        stats_record.save(using="job_superv")
        self.assertEqual(0, stats_record.failures)

        # Save the node as "down" and read it back from the database
        # before checking the failures value.
        stats_record.current_status = "down"
        stats_record.save(using="job_superv")
        after_failure = models.CNodeStats.objects.get(id=stats_record.id)
        self.assertEqual(1, after_failure.failures)

        # The live-nodes mock now lists N8 again.
        self.db_mock.return_value = {"N8": after_failure}
        self.live_mock.return_value = {"N8"}
        failure_count = monitor.count_failed_nodes(self.job)
        self.assertEqual(1, failure_count)

        # Reload once more: status is back to "up", failures still 1.
        after_recovery = models.CNodeStats.objects.get(id=after_failure.id)
        self.assertEqual("up", after_recovery.current_status)
        self.assertEqual(1, after_recovery.failures)

コード例 #11

0

ファイルを表示

ファイル: engine.py プロジェクト: bwyss/oq-engine

def _switch_to_job_phase(job_ctxt, ctype, status):
    """Move the job into the given execution phase.

    A `job_phase_stats` record is created for the phase and the new status
    is logged. When the phase is "executing" the compute nodes are checked
    as well, and the process exits with status 1 if
    `monitor.count_failed_nodes` returns -1.

    :param job_ctxt:
        An :class:`~openquake.engine.JobContext` instance.
    :param str ctype: calculation type (hazard|risk)
    :param str status: one of the following: pre_executing, executing,
        post_executing, post_processing, export, clean_up, complete
    """
    job = OqJob.objects.get(id=job_ctxt.job_id)
    JobPhaseStats.objects.create(oq_job=job, ctype=ctype, job_status=status)
    progress_message = "%s (%s)" % (status, ctype)
    logs.log_progress(progress_message, 1)

    if status != "executing":
        return

    # Record the compute nodes that were available at the beginning of the
    # execute phase so we can detect failed nodes later.
    if monitor.count_failed_nodes(job) == -1:
        logs.LOG.critical("No live compute nodes, aborting calculation")
        sys.exit(1)

コード例 #12

0

ファイルを表示

def _switch_to_job_phase(job_ctxt, ctype, status):
    """Enter a particular phase of job execution.

    Creates a `job_phase_stats` record and logs the new status. For the
    "executing" phase it also calls `monitor.count_failed_nodes`; a return
    value of -1 is logged as critical and terminates the process via
    `sys.exit(1)`.

    :param job_ctxt:
        An :class:`~openquake.engine.JobContext` instance.
    :param str ctype: calculation type (hazard|risk)
    :param str status: one of the following: pre_executing, executing,
        post_executing, post_processing, export, clean_up, complete
    """
    current_job = OqJob.objects.get(id=job_ctxt.job_id)
    JobPhaseStats.objects.create(
        oq_job=current_job, ctype=ctype, job_status=status)
    logs.log_progress("%s (%s)" % (status, ctype), 1)

    entering_execution = status == "executing"
    if not entering_execution:
        return

    # Record the compute nodes that were available at the beginning of the
    # execute phase so we can detect failed nodes later.
    node_check = monitor.count_failed_nodes(current_job)
    if node_check == -1:
        logs.LOG.critical("No live compute nodes, aborting calculation")
        sys.exit(1)

コード例 #13

0

ファイルを表示

ファイル: utils_monitor_test.py プロジェクト: bwyss/oq-engine

 def test_count_failed_nodes_with_zero_nodes(self):
     # Zero live nodes at the start of the calculation (nothing returned by
     # the db mock, empty live set) must be signalled with -1.
     self.live_mock.return_value = set()
     self.db_mock.return_value = {}
     self.assertEqual(-1, monitor.count_failed_nodes(self.job))

コード例 #14

0

ファイルを表示

ファイル: utils_monitor_test.py プロジェクト: PseudononymousEpistle/oq-engine

 def test_count_failed_nodes_with_zero_nodes(self):
     # When both the db mock and the live-nodes mock are empty, i.e. there
     # are no live nodes as the calculation starts, -1 is the expected
     # signal value.
     no_known_nodes = {}
     no_live_nodes = set()
     self.db_mock.return_value = no_known_nodes
     self.live_mock.return_value = no_live_nodes
     signal = monitor.count_failed_nodes(self.job)
     self.assertEqual(-1, signal)