def test_take_action_pause_failed(
        self,
        mock_load_rule_config,
        mock_get_requests,
        mock_email_load_config,
        mock_load_ecc_config,
        mock_list_pods,
        mock_create_email_for_issue_with_pause_resume_job):
        """A failing pause request produces exactly one issue email.

        The mocked GET endpoint answers with an error result, one pod is
        listed on ``mock-worker-one`` and that node is marked ready for
        action.  After ``take_action`` the pause/resume issue-email mock
        must have been called once.

        The ``mock_*`` arguments are presumably injected by ``mock.patch``
        decorators that are not visible in this excerpt;
        ``mock_email_load_config`` is accepted but not used in the body.
        """
        config = test_util.mock_rule_config()
        mock_load_rule_config.return_value = config

        # Build the ECC config first, then hand the finished dict to the mock.
        ecc_config = test_util.mock_ecc_config()
        ecc_config["alert_job_owners"] = True
        mock_load_ecc_config.return_value = ecc_config

        # Every GET made by the rule reports a failure.
        failure_payload = {"result": "Sorry, something went wrong."}
        mock_get_requests.return_value.json.return_value = failure_payload

        handler = rule_alert_handler.RuleAlertHandler()
        handler.rule_cache["ecc_rule"] = {
            "mock-worker-one": {"instance": "192.168.0.1:9090"}
        }

        running_pods = [{
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "mock-worker-one"
        }]
        mock_list_pods.return_value = test_util.mock_v1_pod_list(running_pods)

        reboot_rule = ecc_reboot_node_rule.EccRebootNodeRule(handler, config)
        reboot_rule.nodes_ready_for_action = ["mock-worker-one"]

        reboot_rule.take_action()

        email_mock = mock_create_email_for_issue_with_pause_resume_job
        self.assertEqual(1, email_mock.call_count)
    def test_check_status_ecc_error_detected(
            self, mock_load_ecc_config, mock_request_get, mock_list_node,
            mock_rule_alert_handler_load_config, mock_email_handler):
        """``check_status`` reports both listed nodes as new bad nodes.

        The Prometheus response comes from ``_mock_prometheus_ecc_data()``
        and the node list contains ``mock-worker-one`` and
        ``mock-worker-two``.  The rule must return a truthy status and
        record both nodes in ``new_bad_nodes``.

        The ``mock_*`` arguments are presumably supplied by ``mock.patch``
        decorators outside this excerpt; ``mock_email_handler`` is unused.
        """
        rule_config = test_util.mock_rule_config()
        mock_rule_alert_handler_load_config.return_value = rule_config
        mock_load_ecc_config.return_value = test_util.mock_ecc_config()

        # This is a real handler built on mocked config, not a mock itself.
        alert_handler = rule_alert_handler.RuleAlertHandler()

        mock_request_get.return_value.json.return_value = (
            _mock_prometheus_ecc_data())
        mock_list_node.return_value = test_util.mock_v1_node_list([
            {"instance": "192.168.0.1", "node_name": "mock-worker-one"},
            {"instance": "192.168.0.2", "node_name": "mock-worker-two"},
        ])

        ecc_rule_instance = EccDetectErrorRule(alert_handler, rule_config)
        check_status_response = ecc_rule_instance.check_status()

        self.assertTrue(check_status_response)
        self.assertEqual(len(ecc_rule_instance.new_bad_nodes), 2)
        # assertIn shows the container contents when the check fails.
        self.assertIn("mock-worker-one", ecc_rule_instance.new_bad_nodes)
        self.assertIn("mock-worker-two", ecc_rule_instance.new_bad_nodes)
    def test_check_status_large_latency_detected(
            self, mock_load_latency_config, mock_request_get, mock_list_node,
            mock_rule_alert_handler_load_config, mock_email_handler,
            mock_create_email_for_dris):
        """``check_status`` flags only ``mock-worker-one`` as impacted.

        The Prometheus response comes from
        ``_mock_prometheus_latency_data()`` and two nodes are listed.  The
        rule must return a truthy status with exactly one impacted node,
        ``mock-worker-one``.

        The ``mock_*`` arguments are presumably supplied by ``mock.patch``
        decorators outside this excerpt; ``mock_email_handler`` and
        ``mock_create_email_for_dris`` are not used in the body.
        """
        rule_config = test_util.mock_rule_config()
        mock_rule_alert_handler_load_config.return_value = rule_config
        mock_load_latency_config.return_value = test_util.mock_latency_config()

        # This is a real handler built on mocked config, not a mock itself.
        alert_handler = rule_alert_handler.RuleAlertHandler()

        mock_request_get.return_value.json.return_value = (
            _mock_prometheus_latency_data())
        mock_list_node.return_value = test_util.mock_v1_node_list([
            {"instance": "192.168.0.1", "node_name": "mock-worker-one"},
            {"instance": "192.168.0.2", "node_name": "mock-worker-two"},
        ])

        latency_rule_instance = NvidiaSmiLatencyRule(alert_handler,
                                                     rule_config)
        check_status_response = latency_rule_instance.check_status()

        self.assertTrue(check_status_response)
        self.assertEqual(len(latency_rule_instance.impacted_nodes), 1)
        # assertIn shows the container contents when the check fails.
        self.assertIn("mock-worker-one", latency_rule_instance.impacted_nodes)
    def test_take_action(self, mock_load_rule_config, mock_load_ecc_config,
                         mock_email_handler, mock_create_email_for_dris):
        """``take_action`` emails once and caches every impacted node.

        Two impacted nodes are seeded on the latency rule.  After
        ``take_action`` the DRI email mock must have been called once and
        each node must appear under ``rule_cache["smi_latency_rule"]``
        with its ``instance`` value.

        NOTE(review): ``mock_load_ecc_config`` is fed the *latency* config
        here; the parameter name presumably just mirrors the patch order.
        """
        mock_rule_config = test_util.mock_rule_config()
        mock_load_rule_config.return_value = mock_rule_config
        mock_load_ecc_config.return_value = test_util.mock_latency_config()

        expected_instances = {
            "mock-worker-one": "192.168.0.1",
            "mock-worker-two": "192.168.0.2",
        }

        alert = rule_alert_handler.RuleAlertHandler()
        latency_rule_instance = NvidiaSmiLatencyRule(alert, mock_rule_config)
        # Pass a copy so the expectations cannot be mutated by the rule.
        latency_rule_instance.impacted_nodes = dict(expected_instances)

        latency_rule_instance.take_action()

        self.assertEqual(1, mock_create_email_for_dris.call_count)

        self.assertIn("smi_latency_rule", alert.rule_cache)
        latency_cache = alert.rule_cache["smi_latency_rule"]
        for node_name, instance in expected_instances.items():
            self.assertIn(node_name, latency_cache)
            self.assertEqual(instance, latency_cache[node_name]["instance"])
    def test_check_status_no_action_needed(
            self,
            mock_load_rule_config,
            mock_email_handler,
            mock_load_etcd_config,
            mock_load_ecc_config,
            mock_request_get,
            mock_pod_list):
        """``check_status`` is falsy when no cached node needs a reboot.

        ``node1`` was detected two days ago (reboot deadline configured as
        five days) and has a job running; ``node2`` already carries a
        ``reboot_requested`` entry.  Neither node may end up in
        ``nodes_ready_for_action``.

        The ``mock_*`` arguments are presumably supplied by ``mock.patch``
        decorators outside this excerpt.
        """
        config = test_util.mock_rule_config()
        mock_load_rule_config.return_value = config

        mock_load_etcd_config.return_value = test_util.mock_etcd_config()

        ecc_config = test_util.mock_ecc_config()
        ecc_config["days_until_node_reboot"] = 5
        mock_load_ecc_config.return_value = ecc_config

        two_days_back = datetime.utcnow() - timedelta(days=2)
        three_days_back = datetime.utcnow() - timedelta(days=3)
        six_days_back = datetime.utcnow() - timedelta(days=6)
        time_format = config['date_time_format']

        # The ECC errors were recorded by an earlier iteration.
        handler = rule_alert_handler.RuleAlertHandler()
        handler.rule_cache["ecc_rule"] = {
            "node1": {
                "time_found": two_days_back.strftime(time_format),
                "instance": "192.168.0.1"
            },
            # A reboot was already requested for this node, so it must not
            # trigger another action.
            "node2": {
                "time_found": three_days_back.strftime(time_format),
                "instance": "192.168.0.2",
                "reboot_requested": two_days_back.strftime(time_format)
            }
        }

        # Both boot times predate the initial detection: no reboot happened.
        boot_times = {
            "192.168.0.1": str(
                three_days_back.replace(tzinfo=timezone.utc).timestamp()),
            "192.168.0.2": str(
                six_days_back.replace(tzinfo=timezone.utc).timestamp())
        }
        mock_request_get.return_value.json.return_value = (
            _mock_prometheus_node_boot_time_response(boot_times))

        # One job is still running on node1.
        running_pods = [{
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "node1"
        }]
        mock_pod_list.return_value = test_util.mock_v1_pod_list(running_pods)

        reboot_rule = ecc_reboot_node_rule.EccRebootNodeRule(handler, config)
        status = reboot_rule.check_status()

        self.assertFalse(status)
        self.assertEqual(0, len(reboot_rule.nodes_ready_for_action))
    def test_check_status_node_due_for_reboot(self,
            mock_load_rule_config,
            mock_email_handler,
            mock_load_ecc_config,
            mock_load_etcd_config,
            mock_request_get,
            mock_pod_list):
        """A node past the reboot deadline is queued together with its job.

        ``node1`` was detected five days and one minute ago with a
        five-day deadline, last booted six days ago, and runs one job.
        ``check_status`` must be truthy, ``node1`` must be the only node
        ready for action and job ``87654321-wxyz`` the only job ready for
        migration.

        The ``mock_*`` arguments are presumably supplied by ``mock.patch``
        decorators outside this excerpt.
        """
        rule_config = test_util.mock_rule_config()
        mock_load_rule_config.return_value = rule_config

        etcd_config = test_util.mock_etcd_config()
        mock_load_etcd_config.return_value = etcd_config

        mock_load_ecc_config.return_value = test_util.mock_ecc_config()
        mock_load_ecc_config.return_value["days_until_node_reboot"] = 5

        # Derive both timestamps from a single clock reading.
        now = datetime.utcnow()
        time_six_days_ago = now - timedelta(days=6)
        time_five_days_ago = now - timedelta(days=5, minutes=1)

        # ecc error detection occurred in a previous iteration
        rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
        rule_alert_handler_instance.rule_cache["ecc_rule"] = {
            "node1": {
                "time_found": time_five_days_ago.strftime(rule_config['date_time_format']),
                "instance": "192.168.0.1"
            }
        }

        # node is due to be rebooted (exceeded configured deadline), should trigger take action
        node_boot_times = {
            "192.168.0.1": str(time_six_days_ago.replace(tzinfo=timezone.utc).timestamp())
        }
        mock_request_get.return_value.json.return_value = _mock_prometheus_node_boot_time_response(node_boot_times)

        # at least one job running on the node
        mock_pod_list.return_value = test_util.mock_v1_pod_list([
            {
                "job_name": "87654321-wxyz",
                "user_name": "user1",
                "vc_name": "vc1",
                "node_name": "node1"
            }
        ])

        ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(rule_alert_handler_instance, rule_config)
        response = ecc_reboot_node_rule_instance.check_status()

        self.assertTrue(response)
        self.assertEqual(1, len(ecc_reboot_node_rule_instance.nodes_ready_for_action))
        self.assertIn("node1", ecc_reboot_node_rule_instance.nodes_ready_for_action)
        self.assertEqual(1, len(ecc_reboot_node_rule_instance.jobs_ready_for_migration))
        self.assertIn("87654321-wxyz", ecc_reboot_node_rule_instance.jobs_ready_for_migration)
    def test_remove_from_rule_cache(self, mock_email_handler, mock_config):
        """``remove_from_rule_cache`` drops the key but keeps the rule entry.

        The cache is seeded with one key under ``TestRule``.  After the
        removal the rule must still be present with an empty mapping.
        """
        mock_config.return_value = _mock_rule_config()

        rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()

        rule = "TestRule"
        cache_key = "test_key"
        cache_value = "test_value"
        rule_alert_handler_instance.rule_cache[rule] = {cache_key: cache_value}

        rule_alert_handler_instance.remove_from_rule_cache(rule, cache_key)

        # assertIn shows the cache contents when the check fails.
        self.assertIn(rule, rule_alert_handler_instance.rule_cache)
        self.assertEqual(0, len(rule_alert_handler_instance.rule_cache[rule]))
    # --- Example #8 ---
    def test_update_rule_cache(self, mock_email_handler, mock_config):
        """``update_rule_cache`` stores the value under ``rule`` / ``key``.

        Starting from a fresh handler, one key/value pair is written for
        ``TestRule`` and then read back directly from ``rule_cache``.
        """
        mock_config.return_value = _mock_rule_config()

        rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()

        rule = "TestRule"
        cache_key = "test_key"
        cache_value = "test_value"

        rule_alert_handler_instance.update_rule_cache(rule, cache_key, cache_value)

        # assertIn shows the cache contents when the check fails.
        self.assertIn(rule, rule_alert_handler_instance.rule_cache)
        self.assertIn(cache_key, rule_alert_handler_instance.rule_cache[rule])
        self.assertEqual(cache_value, rule_alert_handler_instance.rule_cache[rule][cache_key])
    # --- Example #9 ---
    def test_check_rule_cache(self, mock_email_handler, mock_config):
        """``check_rule_cache`` is truthy for a cached key, falsy otherwise.

        One key is seeded under ``TestRule``; the lookup is exercised once
        with that key and once with a key that was never stored.
        """
        mock_config.return_value = _mock_rule_config()

        handler = rule_alert_handler.RuleAlertHandler()

        rule, known_key = "TestRule", "test_key"
        handler.rule_cache[rule] = {known_key: "test_value"}

        # Present key.
        self.assertTrue(handler.check_rule_cache(rule, known_key))

        # Absent key.
        self.assertFalse(handler.check_rule_cache(rule, "should not exist"))
    def test_check_status_node_rebooted_after_detection(self,
            mock_load_rule_config,
            mock_email_handler,
            mock_load_ecc_config,
            mock_load_etcd_config,
            mock_request_get,
            mock_pod_list,
            mock_uncordon_node):
        """A node rebooted after detection is dropped from the rule cache.

        ``node1`` was detected one day ago and its reported boot time is
        "now".  ``check_status`` must be falsy, no node may be ready for
        action, and ``node1`` must no longer be in
        ``rule_cache["ecc_rule"]``.

        The ``mock_*`` arguments are presumably supplied by ``mock.patch``
        decorators outside this excerpt; ``mock_uncordon_node`` is accepted
        but not inspected here.
        """
        rule_config = test_util.mock_rule_config()
        mock_load_rule_config.return_value = rule_config

        etcd_config = test_util.mock_etcd_config()
        mock_load_etcd_config.return_value = etcd_config

        mock_load_ecc_config.return_value = test_util.mock_ecc_config()
        mock_load_ecc_config.return_value["days_until_node_reboot"] = 5

        # Derive both timestamps from a single clock reading.
        now = datetime.utcnow()
        time_one_days_ago = now - timedelta(days=1)

        # ecc error detection occurred in a previous iteration
        rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
        rule_alert_handler_instance.rule_cache["ecc_rule"] = {
            "node1": {
                "time_found": time_one_days_ago.strftime(rule_config['date_time_format']),
                "instance": "192.168.0.1"
            }
        }

        # node rebooted *after* initial ecc error detection
        node_boot_times = {
            "192.168.0.1": str(now.replace(tzinfo=timezone.utc).timestamp())
        }
        mock_request_get.return_value.json.return_value = _mock_prometheus_node_boot_time_response(node_boot_times)

        mock_pod_list.return_value = test_util.mock_v1_pod_list([])

        ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(rule_alert_handler_instance, rule_config)
        response = ecc_reboot_node_rule_instance.check_status()

        self.assertFalse(response)
        self.assertEqual(0, len(ecc_reboot_node_rule_instance.nodes_ready_for_action))
        # assertNotIn shows the cache contents when the check fails.
        self.assertNotIn("node1", rule_alert_handler_instance.rule_cache["ecc_rule"])
    def test_check_status_no_jobs_running(self,
            mock_load_rule_config,
            mock_email_handler,
            mock_load_ecc_config,
            mock_load_etcd_config,
            mock_request_get,
            mock_pod_list):
        """A node with no running pods is ready for action before deadline.

        ``node1`` was detected two days ago (deadline five days) and the
        pod list is empty.  ``check_status`` must be truthy with ``node1``
        as the only node ready for action and no jobs to migrate.

        The ``mock_*`` arguments are presumably supplied by ``mock.patch``
        decorators outside this excerpt.
        """
        rule_config = test_util.mock_rule_config()
        mock_load_rule_config.return_value = rule_config

        etcd_config = test_util.mock_etcd_config()
        mock_load_etcd_config.return_value = etcd_config

        mock_load_ecc_config.return_value = test_util.mock_ecc_config()
        mock_load_ecc_config.return_value["days_until_node_reboot"] = 5

        time_two_days_ago = datetime.utcnow() - timedelta(days=2)

        rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
        rule_alert_handler_instance.rule_cache["ecc_rule"] = {
            "node1": {
                "time_found": time_two_days_ago.strftime(rule_config['date_time_format']),
                "instance": "192.168.0.1"
            }
        }

        # node not due to be rebooted
        node_boot_times = {
            "192.168.0.1": str(time_two_days_ago.replace(tzinfo=timezone.utc).timestamp())
        }
        mock_request_get.return_value.json.return_value = _mock_prometheus_node_boot_time_response(node_boot_times)

        # no pods running on node, should trigger take action
        mock_pod_list.return_value = test_util.mock_v1_pod_list([])

        ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(rule_alert_handler_instance, rule_config)
        response = ecc_reboot_node_rule_instance.check_status()

        self.assertTrue(response)
        self.assertEqual(1, len(ecc_reboot_node_rule_instance.nodes_ready_for_action))
        # assertIn shows the container contents when the check fails.
        self.assertIn("node1", ecc_reboot_node_rule_instance.nodes_ready_for_action)
        self.assertEqual(0, len(ecc_reboot_node_rule_instance.jobs_ready_for_migration))
    def test_get_rule_cache_keys(self, mock_email_handler, mock_config):
        """``get_rule_cache_keys`` reflects what ``update_rule_cache`` stored.

        A fresh handler must report no keys for ``TestRule``; after three
        updates it must report exactly those three keys.
        """
        mock_config.return_value = _mock_rule_config()
        rule = "TestRule"

        rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()

        keys = rule_alert_handler_instance.get_rule_cache_keys(rule)
        self.assertEqual(len(keys), 0)

        entries = [
            ("test_key1", "test_value1"),
            ("test_key2", "test_value2"),
            ("test_key3", "test_value3"),
        ]
        for cache_key, cache_value in entries:
            rule_alert_handler_instance.update_rule_cache(
                rule, cache_key, cache_value)

        keys = rule_alert_handler_instance.get_rule_cache_keys(rule)
        self.assertEqual(len(keys), 3)
        for cache_key, _ in entries:
            # assertIn shows the returned keys when the check fails.
            self.assertIn(cache_key, keys)
    def test_check_status_ecc_error_node_already_detected(
            self, mock_load_ecc_config, mock_request_get, mock_list_node,
            mock_rule_alert_handler_load_config, mock_email_handler):
        """A node already in the rule cache is not reported as new again.

        ``mock-worker-one`` is pre-seeded in ``rule_cache["ecc_rule"]``;
        the node list contains it plus ``mock-worker-two``.
        ``check_status`` must be truthy and ``new_bad_nodes`` must contain
        exactly one entry, ``mock-worker-two``.

        The ``mock_*`` arguments are presumably supplied by ``mock.patch``
        decorators outside this excerpt; ``mock_email_handler`` is unused.
        """
        rule_config = test_util.mock_rule_config()
        mock_rule_alert_handler_load_config.return_value = rule_config
        mock_load_ecc_config.return_value = test_util.mock_ecc_config()

        # This is a real handler built on mocked config, not a mock itself.
        alert_handler = rule_alert_handler.RuleAlertHandler()
        # nodes already detected in previous run
        alert_handler.rule_cache = {
            "ecc_rule": {
                "mock-worker-one": {
                    "time_found": "2020-02-18 21:14:20.351019",
                    "instance": "192.168.0.1"
                }
            }
        }
        mock_request_get.return_value.json.return_value = (
            _mock_prometheus_ecc_data())
        mock_list_node.return_value = test_util.mock_v1_node_list([
            {"instance": "192.168.0.1", "node_name": "mock-worker-one"},
            {"instance": "192.168.0.2", "node_name": "mock-worker-two"},
        ])

        ecc_rule_instance = EccDetectErrorRule(alert_handler, rule_config)
        check_status_response = ecc_rule_instance.check_status()

        self.assertTrue(check_status_response)
        self.assertEqual(len(ecc_rule_instance.new_bad_nodes), 1)
        # assertIn shows the container contents when the check fails.
        self.assertIn("mock-worker-two", ecc_rule_instance.new_bad_nodes)
    def test_check_status_time_to_take_action(
            self,
            mock_load_rule_config,
            mock_email_handler,
            mock_ecc_config,
            mock_request_get):
        """``check_status`` queues a node whose reboot deadline has passed.

        ``node1`` was detected five days and one minute ago with a
        five-day deadline and last booted six days ago.  The rule must
        return a truthy status and list ``node1`` as the single node ready
        for action.

        The ``mock_*`` arguments are presumably supplied by ``mock.patch``
        decorators outside this excerpt.
        """
        config = test_util.mock_rule_config()
        mock_load_rule_config.return_value = config

        ecc_config = test_util.mock_ecc_config()
        ecc_config["days_until_node_reboot"] = 5
        mock_ecc_config.return_value = ecc_config

        booted_at = datetime.utcnow() - timedelta(days=6)
        detected_at = datetime.utcnow() - timedelta(days=5, minutes=1)

        # The ECC error was recorded by an earlier iteration.
        handler = rule_alert_handler.RuleAlertHandler()
        handler.rule_cache["ecc_rule"] = {
            "node1": {
                "time_found": detected_at.strftime(config['date_time_format']),
                "instance": "192.168.0.1"
            }
        }

        boot_times = {
            "192.168.0.1": str(
                booted_at.replace(tzinfo=timezone.utc).timestamp())
        }
        mock_request_get.return_value.json.return_value = (
            _mock_prometheus_node_boot_time_response(boot_times))

        reboot_rule = ecc_reboot_node_rule.EccRebootNodeRule(handler, config)
        status = reboot_rule.check_status()

        self.assertTrue(status)
        self.assertEqual(1, len(reboot_rule.nodes_ready_for_action))
        self.assertEqual("node1", reboot_rule.nodes_ready_for_action[0])
    def test_clean_expired_items_in_rule_cache(self, mock_load_rule_config,
                                               mock_email_handler,
                                               mock_ecc_config,
                                               mock_request_get):
        """Expired latency alerts are evicted; fresh ones are kept.

        With ``hours_until_alert_expiration`` set to 4, ``node1`` (found
        four hours and one minute ago) is expected to be removed from
        ``rule_cache["smi_latency_rule"]`` while ``node2`` (found one hour
        ago) stays.

        NOTE(review): ``mock_ecc_config`` is fed the *latency* config here;
        the parameter name presumably just mirrors the patch order.
        """
        rule_config = test_util.mock_rule_config()
        mock_load_rule_config.return_value = rule_config

        mock_ecc_config.return_value = test_util.mock_latency_config()
        mock_ecc_config.return_value["hours_until_alert_expiration"] = 4

        # Derive both timestamps from a single clock reading.
        now = datetime.utcnow()
        time_one_hours_ago = now - timedelta(hours=1)
        time_four_hours_ago = now - timedelta(hours=4, minutes=1)

        # large latency alert detected previously
        alert = rule_alert_handler.RuleAlertHandler()
        alert.rule_cache["smi_latency_rule"] = {
            "node1": {
                "time_found":
                time_four_hours_ago.strftime(rule_config['date_time_format']),
                "instance":
                "192.168.0.1"
            },
            "node2": {
                "time_found":
                time_one_hours_ago.strftime(rule_config['date_time_format']),
                "instance":
                "192.168.0.2"
            }
        }

        smi_latency_rule_instance = nvidia_smi_latency_rule.NvidiaSmiLatencyRule(
            alert, rule_config)
        smi_latency_rule_instance.clean_expired_items_in_rule_cache()

        # Still exactly one rule in the cache...
        self.assertEqual(1, len(alert.rule_cache))
        # ...and the eviction itself is checked on that rule's node entries.
        # Counting rules alone passes even when nothing was cleaned.
        latency_cache = alert.rule_cache["smi_latency_rule"]
        self.assertEqual(1, len(latency_cache))
        self.assertNotIn("node1", latency_cache)
        self.assertIn("node2", latency_cache)
    def test_take_action_pause_failed(self,
        mock_load_rule_config,
        mock_put_requests,
        mock_get_requests,
        mock_email_load_config,
        mock_load_ecc_config,
        mock_load_etcd_config):
        """Nodes whose job migration failed are skipped; others get rebooted.

        Every GET reports a failure, so the single-node job on
        ``mock-worker-one`` and the distributed job on ``mock-worker-two``
        / ``mock-worker-three`` cannot be paused.  Those three nodes must
        stay cached without ``reboot_requested`` and must no longer be in
        ``nodes_ready_for_action``.  ``mock-worker-four`` has no job and
        must be marked ``reboot_requested`` and stay ready for action.

        The ``mock_*`` arguments are presumably supplied by ``mock.patch``
        decorators outside this excerpt; ``mock_email_load_config`` is
        not used in the body.
        """
        rule_config = test_util.mock_rule_config()
        mock_load_rule_config.return_value = rule_config

        etcd_config = test_util.mock_etcd_config()
        mock_load_etcd_config.return_value = etcd_config

        mock_load_ecc_config.return_value = test_util.mock_ecc_config()
        mock_load_ecc_config.return_value["alert_job_owners"] = True

        mock_get_requests.return_value.json.return_value = {"result": "Sorry, something went wrong."}

        rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
        rule_alert_handler_instance.rule_cache["ecc_rule"] = {
            "mock-worker-one": {"instance": "192.168.0.1:9090"},
            "mock-worker-two": {"instance": "192.168.0.2:9090"},
            "mock-worker-three": {"instance": "192.168.0.3:9090"},
            "mock-worker-four": {"instance": "192.168.0.4:9090"}
        }

        ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(rule_alert_handler_instance, rule_config)
        ecc_reboot_node_rule_instance.nodes_ready_for_action = [
            "mock-worker-one",
            "mock-worker-two",
            "mock-worker-three",
            "mock-worker-four"
        ]
        ecc_reboot_node_rule_instance.jobs_ready_for_migration = {
            "87654321-wxyz": {
                "user_name": "user1",
                "vc_name": "vc1",
                "node_names": ["mock-worker-one"],
                "job_link": "/job-link-1"
            },
            # distributed job
            "12345678-abcd": {
                "user_name": "user2",
                "vc_name": "vc1",
                "node_names": ["mock-worker-two", "mock-worker-three"],
                "job_link": "/job-link-2"
            }
        }

        mock_put_requests.return_value.json.return_value = {
            "action": "set",
            "node": {
                "key": "/mock-worker-four/reboot",
                "value": "True",
                "modifiedIndex": 39,
                "createdIndex": 39
            }
        }

        ecc_reboot_node_rule_instance.take_action()

        # Look the cache up only after take_action so the final state is read.
        ecc_cache = rule_alert_handler_instance.rule_cache["ecc_rule"]
        ready_nodes = ecc_reboot_node_rule_instance.nodes_ready_for_action

        # nodes should be skipped since job migration failed
        # (mock-worker-two / -three host the distributed job)
        for skipped in ("mock-worker-one", "mock-worker-two", "mock-worker-three"):
            self.assertIn(skipped, ecc_cache)
            self.assertNotIn("reboot_requested", ecc_cache[skipped])
            self.assertNotIn(skipped, ready_nodes)

        # node should be successfully rebooted (had no jobs to migrate)
        self.assertIn("mock-worker-four", ecc_cache)
        self.assertIn("reboot_requested", ecc_cache["mock-worker-four"])
        self.assertIn("mock-worker-four", ready_nodes)
    def test_take_action_reboot_failed(self,
        mock_load_rule_config,
        mock_create_email_for_pause_resume_job,
        mock_put_requests,
        mock_get_requests,
        mock_email_handler,
        mock_load_etcd_config,
        mock_load_ecc_config):
        """A failed reboot request leaves the node without ``reboot_requested``.

        All three jobs pause and resume successfully (three GET responses
        per job).  The first PUT succeeds and the second returns an error
        payload.  Three pause/resume emails are expected, both nodes stay
        cached, ``mock-worker-one`` carries ``reboot_requested`` and
        ``mock-worker-three`` does not.

        The ``mock_*`` arguments are presumably supplied by ``mock.patch``
        decorators outside this excerpt.
        """
        rule_config = test_util.mock_rule_config()
        mock_load_rule_config.return_value = rule_config

        etcd_config = test_util.mock_etcd_config()
        mock_load_etcd_config.return_value = etcd_config

        mock_load_ecc_config.return_value = test_util.mock_ecc_config()
        mock_load_ecc_config.return_value["alert_job_owners"] = True

        # Each job consumes three responses: pause, status poll, resume.
        job_responses = []
        for _ in range(3):
            job_responses.extend([
                {"result": "Success, job paused."},
                {"errorMsg": None,
                 "jobStatus": "paused",
                 "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT"},
                {"result": "Success, job resumed."},
            ])
        mock_get_requests.return_value.json.side_effect = job_responses

        mock_put_requests.return_value.json.side_effect = [
            {
                "action": "set",
                "node": {
                    "key": "/mock-worker-one/reboot",
                    "value": "True",
                    "modifiedIndex": 39,
                    "createdIndex": 39
                }
            },
            # reboot failed for one of the nodes
            {
                "error_code": 100,
                "message": "Something went wrong",
                "cause": "Unable to open connection"
            }]

        rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
        rule_alert_handler_instance.rule_cache["ecc_rule"] = {
            "mock-worker-one": {"instance": "192.168.0.1:9090"},
            "mock-worker-three": {"instance": "192.168.0.3:9090"}
        }

        ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
            rule_alert_handler_instance, rule_config)
        ecc_reboot_node_rule_instance.nodes_ready_for_action = [
            "mock-worker-one", "mock-worker-three"]
        ecc_reboot_node_rule_instance.jobs_ready_for_migration = {
            "87654321-wxyz": {
                "user_name": "user1",
                "vc_name": "vc1",
                "node_names": ["mock-worker-one"],
                "job_link": "/job-link-1"
            },
            "12345678-abcd": {
                "user_name": "user2",
                "vc_name": "vc2",
                "node_names": ["mock-worker-one"],
                "job_link": "/job-link-2"
            },
            "99999999-efgh": {
                "user_name": "user3",
                "vc_name": "vc3",
                "node_names": ["mock-worker-three"],
                "job_link": "/job-link-3"
            }
        }

        ecc_reboot_node_rule_instance.take_action()

        self.assertEqual(3, mock_create_email_for_pause_resume_job.call_count)

        # Look the cache up only after take_action so the final state is read.
        ecc_cache = rule_alert_handler_instance.rule_cache["ecc_rule"]
        self.assertEqual(2, len(ecc_cache))
        # reboot successful for this node
        self.assertIn("mock-worker-one", ecc_cache)
        self.assertIn("reboot_requested", ecc_cache["mock-worker-one"])
        # reboot failed for this node
        self.assertIn("mock-worker-three", ecc_cache)
        self.assertNotIn("reboot_requested", ecc_cache["mock-worker-three"])
    def test_take_action(self, mock_load_rule_config, mock_load_ecc_config,
                         mock_email_handler, mock_pod_list, mock_cordon_node,
                         mock_create_email_for_dris,
                         mock_create_email_for_job_owner):
        """``take_action`` cordons, emails and caches each new bad node.

        Two new bad nodes are seeded.  The pod list has four pods: two jobs
        on ``mock-worker-one``, one of which also runs on
        ``mock-worker-two``, and a third job on ``mock-worker-three``,
        which is not a bad node.  Expected: two cordon calls, one DRI
        email, two job-owner emails, and both bad nodes cached under
        ``rule_cache["ecc_rule"]`` with their ``instance`` value.

        The ``mock_*`` arguments are presumably supplied by ``mock.patch``
        decorators outside this excerpt.
        """
        mock_rule_config = test_util.mock_rule_config()
        mock_load_rule_config.return_value = mock_rule_config
        mock_load_ecc_config.return_value = test_util.mock_ecc_config()

        expected_instances = {
            "mock-worker-one": "192.168.0.1",
            "mock-worker-two": "192.168.0.2",
        }

        alert = rule_alert_handler.RuleAlertHandler()
        ecc_rule_instance = EccDetectErrorRule(alert, mock_rule_config)
        # Pass a copy so the expectations cannot be mutated by the rule.
        ecc_rule_instance.new_bad_nodes = dict(expected_instances)

        mock_pod_list.return_value = test_util.mock_v1_pod_list([
            {
                "job_name": "87654321-wxyz",
                "user_name": "user1",
                "vc_name": "vc1",
                "node_name": "mock-worker-one"
            },
            {
                "job_name": "12345678-abcd",
                "user_name": "user2",
                "vc_name": "vc2",
                "node_name": "mock-worker-one"
            },
            {
                "job_name": "12345678-abcd",
                "user_name": "user2",
                "vc_name": "vc2",
                "node_name": "mock-worker-two"
            },
            {
                "job_name": "99999999-efgh",
                "user_name": "user3",
                "vc_name": "vc3",
                "node_name": "mock-worker-three"
            },
        ])

        ecc_rule_instance.take_action()

        self.assertEqual(2, mock_cordon_node.call_count)
        self.assertEqual(1, mock_create_email_for_dris.call_count)
        self.assertEqual(2, mock_create_email_for_job_owner.call_count)

        self.assertIn("ecc_rule", alert.rule_cache)
        ecc_cache = alert.rule_cache["ecc_rule"]
        for node_name, instance in expected_instances.items():
            self.assertIn(node_name, ecc_cache)
            self.assertEqual(instance, ecc_cache[node_name]["instance"])
# --- Example #19 ---
import logging
import logging.config
import importlib
import traceback
from pathlib import Path
from utils import rule_alert_handler

import rules

# Read the logging configuration from the working directory and apply it
# before any logger is created.
# NOTE(review): `yaml` is not imported in the visible import block;
# confirm it is imported above this excerpt.
with Path('./logging.yaml').open('r') as log_stream:
    log_config = yaml.safe_load(log_stream)
logging.config.dictConfig(log_config)

logger = logging.getLogger(__name__)

# Single module-level handler instance, created at import time.
alert = rule_alert_handler.RuleAlertHandler()

def Run():
    try:        
        while True:
            with open('/etc/RepairManager/config/rule-config.yaml', 'r') as config_file:
                config = yaml.safe_load(config_file)

            # execute all rules listed in config
            rules_config = config['rules']
            for r_key in rules_config.keys():
                try:
                    # retrieve module and class for given rule
                    module_name = rules_config[r_key]['module_name']
                    class_name = rules_config[r_key]['class_name']
                    rule_module = importlib.import_module(module_name)
    def test_take_action(
            self,
            mock_load_rule_config,
            mock_create_email_for_pause_resume_job,
            mock_get_requests,
            mock_email_handler,
            mock_load_ecc_config,
            mock_pod_list):
        """``take_action`` sends one pause/resume email per migrated job.

        Three pods (three distinct jobs) are listed on the two nodes that
        are ready for action, and every GET call succeeds (three responses
        per job).  The pause/resume email mock must be called three times.

        The ``mock_*`` arguments are presumably supplied by ``mock.patch``
        decorators outside this excerpt; ``mock_email_handler`` is unused.
        """
        config = test_util.mock_rule_config()
        mock_load_rule_config.return_value = config

        ecc_config = test_util.mock_ecc_config()
        ecc_config["alert_job_owners"] = True
        mock_load_ecc_config.return_value = ecc_config

        # Each job consumes three responses: pause, status poll, resume.
        job_responses = []
        for _ in range(3):
            job_responses.append({"result": "Success, job paused."})
            job_responses.append({
                "errorMsg": None,
                "jobStatus": "paused",
                "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT"
            })
            job_responses.append({"result": "Success, job resumed."})
        mock_get_requests.return_value.json.side_effect = job_responses

        handler = rule_alert_handler.RuleAlertHandler()
        handler.rule_cache["ecc_rule"] = {
            "mock-worker-one": {"instance": "192.168.0.1:9090"},
            "mock-worker-two": {"instance": "192.168.0.2:9090"},
            "mock-worker-three": {"instance": "192.168.0.3:9090"}
        }

        running_pods = [
            {
                "job_name": "87654321-wxyz",
                "user_name": "user1",
                "vc_name": "vc1",
                "node_name": "mock-worker-one"
            },
            {
                "job_name": "12345678-abcd",
                "user_name": "user2",
                "vc_name": "vc2",
                "node_name": "mock-worker-one"
            },
            {
                "job_name": "99999999-efgh",
                "user_name": "user3",
                "vc_name": "vc3",
                "node_name": "mock-worker-three"
            }
        ]
        mock_pod_list.return_value = test_util.mock_v1_pod_list(running_pods)

        reboot_rule = ecc_reboot_node_rule.EccRebootNodeRule(handler, config)
        reboot_rule.nodes_ready_for_action = [
            "mock-worker-one", "mock-worker-three"]

        reboot_rule.take_action()

        self.assertEqual(3, mock_create_email_for_pause_resume_job.call_count)