def test_take_action_pause_failed(self,
                                  mock_load_rule_config,
                                  mock_get_requests,
                                  mock_email_load_config,
                                  mock_load_ecc_config,
                                  mock_list_pods,
                                  mock_create_email_for_issue_with_pause_resume_job):
    """take_action creates one issue email when the GET response reports a failure.

    The mocked GET response body is a generic failure message and a single
    pod is listed on the only node queued for action.
    """
    config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = config

    ecc_config = test_util.mock_ecc_config()
    ecc_config["alert_job_owners"] = True
    mock_load_ecc_config.return_value = ecc_config

    # Every GET issued through the mock yields this failure payload.
    failure_payload = {"result": "Sorry, something went wrong."}
    mock_get_requests.return_value.json.return_value = failure_payload

    handler = rule_alert_handler.RuleAlertHandler()
    handler.rule_cache["ecc_rule"] = {
        "mock-worker-one": {"instance": "192.168.0.1:9090"},
    }

    pods_on_node = [
        {
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "mock-worker-one",
        },
    ]
    mock_list_pods.return_value = test_util.mock_v1_pod_list(pods_on_node)

    reboot_rule = ecc_reboot_node_rule.EccRebootNodeRule(handler, config)
    reboot_rule.nodes_ready_for_action = ["mock-worker-one"]

    reboot_rule.take_action()

    issue_email_calls = mock_create_email_for_issue_with_pause_resume_job.call_count
    self.assertEqual(1, issue_email_calls)
def test_check_status_ecc_error_detected(
        self, mock_load_ecc_config, mock_request_get, mock_list_node,
        mock_rule_alert_handler_load_config, mock_email_handler):
    """check_status reports both mocked nodes as new bad nodes.

    The rule cache starts without an "ecc_rule" entry set by this test, the
    Prometheus response comes from _mock_prometheus_ecc_data(), and the
    node list contains mock-worker-one and mock-worker-two.
    """
    mock_rule_config = test_util.mock_rule_config()
    mock_rule_alert_handler_load_config.return_value = mock_rule_config
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()

    mock_rule_alert_handler = rule_alert_handler.RuleAlertHandler()

    mock_request_get.return_value.json.return_value = _mock_prometheus_ecc_data()
    mock_list_node.return_value = test_util.mock_v1_node_list([
        {"instance": "192.168.0.1", "node_name": "mock-worker-one"},
        {"instance": "192.168.0.2", "node_name": "mock-worker-two"},
    ])

    ecc_rule_instance = EccDetectErrorRule(mock_rule_alert_handler,
                                           mock_rule_config)
    check_status_response = ecc_rule_instance.check_status()

    self.assertTrue(check_status_response)
    self.assertEqual(len(ecc_rule_instance.new_bad_nodes), 2)
    # assertIn reports the container contents on failure, unlike
    # assertTrue(x in y) which only reports "False is not true".
    self.assertIn("mock-worker-one", ecc_rule_instance.new_bad_nodes)
    self.assertIn("mock-worker-two", ecc_rule_instance.new_bad_nodes)
def test_check_status_large_latency_detected(
        self, mock_load_latency_config, mock_request_get, mock_list_node,
        mock_rule_alert_handler_load_config, mock_email_handler,
        mock_create_email_for_dris):
    """check_status flags exactly one impacted node (mock-worker-one).

    The Prometheus response comes from _mock_prometheus_latency_data() and
    the node list contains mock-worker-one and mock-worker-two.
    """
    mock_rule_config = test_util.mock_rule_config()
    mock_rule_alert_handler_load_config.return_value = mock_rule_config
    mock_load_latency_config.return_value = test_util.mock_latency_config()

    mock_rule_alert_handler = rule_alert_handler.RuleAlertHandler()

    mock_request_get.return_value.json.return_value = (
        _mock_prometheus_latency_data())
    mock_list_node.return_value = test_util.mock_v1_node_list([
        {"instance": "192.168.0.1", "node_name": "mock-worker-one"},
        {"instance": "192.168.0.2", "node_name": "mock-worker-two"},
    ])

    latency_rule_instance = NvidiaSmiLatencyRule(mock_rule_alert_handler,
                                                 mock_rule_config)
    check_status_response = latency_rule_instance.check_status()

    self.assertTrue(check_status_response)
    self.assertEqual(len(latency_rule_instance.impacted_nodes), 1)
    # assertIn gives a failure message that shows the actual impacted nodes.
    self.assertIn("mock-worker-one", latency_rule_instance.impacted_nodes)
def test_take_action(self, mock_load_rule_config, mock_load_ecc_config,
                     mock_email_handler, mock_create_email_for_dris):
    """take_action emails once and caches both impacted nodes.

    Two nodes are preloaded into impacted_nodes. The test expects a single
    call to the DRI email mock and a "smi_latency_rule" cache entry holding
    each node's instance address.
    """
    mock_rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = mock_rule_config
    # NOTE: despite the parameter name, this mock is fed the latency config.
    mock_load_ecc_config.return_value = test_util.mock_latency_config()

    alert = rule_alert_handler.RuleAlertHandler()
    latency_rule_instance = NvidiaSmiLatencyRule(alert, mock_rule_config)
    latency_rule_instance.impacted_nodes = {
        "mock-worker-one": "192.168.0.1",
        "mock-worker-two": "192.168.0.2",
    }

    latency_rule_instance.take_action()

    self.assertEqual(1, mock_create_email_for_dris.call_count)

    # assertIn shows the cache contents when a key is missing.
    self.assertIn("smi_latency_rule", alert.rule_cache)
    self.assertIn("mock-worker-one", alert.rule_cache["smi_latency_rule"])
    self.assertEqual(
        "192.168.0.1",
        alert.rule_cache["smi_latency_rule"]["mock-worker-one"]["instance"])
    self.assertIn("mock-worker-two", alert.rule_cache["smi_latency_rule"])
    self.assertEqual(
        "192.168.0.2",
        alert.rule_cache["smi_latency_rule"]["mock-worker-two"]["instance"])
def test_check_status_no_action_needed(self,
                                       mock_load_rule_config,
                                       mock_email_handler,
                                       mock_load_etcd_config,
                                       mock_load_ecc_config,
                                       mock_request_get,
                                       mock_pod_list):
    """check_status returns False and queues no node for action.

    node1 was found two days ago (reboot window is five days) and has a pod
    running on it; node2 already has a "reboot_requested" timestamp.
    """
    config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = config

    mock_load_etcd_config.return_value = test_util.mock_etcd_config()

    ecc_config = test_util.mock_ecc_config()
    ecc_config["days_until_node_reboot"] = 5
    mock_load_ecc_config.return_value = ecc_config

    two_days_ago = datetime.utcnow() - timedelta(days=2)
    three_days_ago = datetime.utcnow() - timedelta(days=3)
    six_days_ago = datetime.utcnow() - timedelta(days=6)

    # ECC error detection occurred in a previous iteration.
    handler = rule_alert_handler.RuleAlertHandler()
    node1_entry = {
        "time_found": two_days_ago.strftime(config['date_time_format']),
        "instance": "192.168.0.1",
    }
    # This node already has a reboot attempt, so it should not trigger
    # take action.
    node2_entry = {
        "time_found": three_days_ago.strftime(config['date_time_format']),
        "instance": "192.168.0.2",
        "reboot_requested": two_days_ago.strftime(config['date_time_format']),
    }
    handler.rule_cache["ecc_rule"] = {"node1": node1_entry, "node2": node2_entry}

    # Neither node has been rebooted since the initial detection.
    boot_times = {
        "192.168.0.1": str(three_days_ago.replace(tzinfo=timezone.utc).timestamp()),
        "192.168.0.2": str(six_days_ago.replace(tzinfo=timezone.utc).timestamp()),
    }
    mock_request_get.return_value.json.return_value = (
        _mock_prometheus_node_boot_time_response(boot_times))

    # At least one job is running on the node.
    running_pods = [{
        "job_name": "87654321-wxyz",
        "user_name": "user1",
        "vc_name": "vc1",
        "node_name": "node1",
    }]
    mock_pod_list.return_value = test_util.mock_v1_pod_list(running_pods)

    reboot_rule = ecc_reboot_node_rule.EccRebootNodeRule(handler, config)
    outcome = reboot_rule.check_status()

    self.assertFalse(outcome)
    self.assertEqual(0, len(reboot_rule.nodes_ready_for_action))
def test_check_status_node_due_for_reboot(self, mock_load_rule_config,
                                          mock_email_handler,
                                          mock_load_ecc_config,
                                          mock_load_etcd_config,
                                          mock_request_get, mock_pod_list):
    """check_status queues a node whose detection is older than the deadline.

    node1 was found five days and one minute ago with a five-day reboot
    window, and one job is running on it. The test expects the node in
    nodes_ready_for_action and the job in jobs_ready_for_migration.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config

    etcd_config = test_util.mock_etcd_config()
    mock_load_etcd_config.return_value = etcd_config

    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["days_until_node_reboot"] = 5

    time_six_days_ago = datetime.utcnow() - timedelta(days=6)
    time_five_days_ago = datetime.utcnow() - timedelta(days=5, minutes=1)

    # ECC error detection occurred in a previous iteration.
    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "node1": {
            "time_found": time_five_days_ago.strftime(
                rule_config['date_time_format']),
            "instance": "192.168.0.1",
        }
    }

    # Node is due to be rebooted (exceeded configured deadline), so it
    # should trigger take action.
    node_boot_times = {
        "192.168.0.1": str(
            time_six_days_ago.replace(tzinfo=timezone.utc).timestamp())
    }
    mock_request_get.return_value.json.return_value = (
        _mock_prometheus_node_boot_time_response(node_boot_times))

    # At least one job is running on the node.
    mock_pod_list.return_value = test_util.mock_v1_pod_list([
        {
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "node1",
        }
    ])

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    response = ecc_reboot_node_rule_instance.check_status()

    self.assertTrue(response)
    self.assertEqual(
        1, len(ecc_reboot_node_rule_instance.nodes_ready_for_action))
    # assertIn reports the actual collection on failure.
    self.assertIn("node1",
                  ecc_reboot_node_rule_instance.nodes_ready_for_action)
    self.assertEqual(
        1, len(ecc_reboot_node_rule_instance.jobs_ready_for_migration))
    self.assertIn("87654321-wxyz",
                  ecc_reboot_node_rule_instance.jobs_ready_for_migration)
def test_remove_from_rule_cache(self, mock_email_handler, mock_config):
    """remove_from_rule_cache drops the key but keeps the rule's entry."""
    mock_config.return_value = _mock_rule_config()
    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()

    rule = "TestRule"
    cache_key = "test_key"
    cache_value = "test_value"
    rule_alert_handler_instance.rule_cache[rule] = {cache_key: cache_value}

    rule_alert_handler_instance.remove_from_rule_cache(rule, cache_key)

    # The rule itself must remain, with an empty mapping.
    # assertIn shows the cache contents if the rule went missing.
    self.assertIn(rule, rule_alert_handler_instance.rule_cache)
    self.assertEqual(0, len(rule_alert_handler_instance.rule_cache[rule]))
def test_update_rule_cache(self, mock_email_handler, mock_config):
    """update_rule_cache stores the value under rule_cache[rule][key]."""
    mock_config.return_value = _mock_rule_config()
    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()

    rule = "TestRule"
    cache_key = "test_key"
    cache_value = "test_value"

    rule_alert_handler_instance.update_rule_cache(rule, cache_key,
                                                  cache_value)

    # assertIn reports the container on failure instead of a bare
    # "False is not true".
    self.assertIn(rule, rule_alert_handler_instance.rule_cache)
    self.assertIn(cache_key, rule_alert_handler_instance.rule_cache[rule])
    self.assertEqual(
        cache_value, rule_alert_handler_instance.rule_cache[rule][cache_key])
def test_check_rule_cache(self, mock_email_handler, mock_config):
    """check_rule_cache is truthy for a cached key and falsy for an unknown one."""
    mock_config.return_value = _mock_rule_config()
    handler = rule_alert_handler.RuleAlertHandler()

    rule_name, known_key = "TestRule", "test_key"
    handler.rule_cache[rule_name] = {known_key: "test_value"}

    # Key that was seeded above.
    self.assertTrue(handler.check_rule_cache(rule_name, known_key))

    # Key that was never stored.
    self.assertFalse(handler.check_rule_cache(rule_name, "should not exist"))
def test_check_status_node_rebooted_after_detection(
        self, mock_load_rule_config, mock_email_handler,
        mock_load_ecc_config, mock_load_etcd_config, mock_request_get,
        mock_pod_list, mock_uncordon_node):
    """check_status drops a node from the cache once it has rebooted.

    node1 was found one day ago and its mocked boot time is "now", i.e.
    after detection. The test expects no action and that node1 has been
    removed from the "ecc_rule" cache.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config

    etcd_config = test_util.mock_etcd_config()
    mock_load_etcd_config.return_value = etcd_config

    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["days_until_node_reboot"] = 5

    time_one_days_ago = datetime.utcnow() - timedelta(days=1)
    now = datetime.utcnow()

    # ECC error detection occurred in a previous iteration.
    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "node1": {
            "time_found": time_one_days_ago.strftime(
                rule_config['date_time_format']),
            "instance": "192.168.0.1",
        }
    }

    # Node rebooted *after* the initial ECC error detection.
    node_boot_times = {
        "192.168.0.1": str(now.replace(tzinfo=timezone.utc).timestamp())
    }
    mock_request_get.return_value.json.return_value = (
        _mock_prometheus_node_boot_time_response(node_boot_times))

    mock_pod_list.return_value = test_util.mock_v1_pod_list([])

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    response = ecc_reboot_node_rule_instance.check_status()

    self.assertFalse(response)
    self.assertEqual(
        0, len(ecc_reboot_node_rule_instance.nodes_ready_for_action))
    # assertNotIn shows the remaining cache entries if node1 is still there.
    self.assertNotIn("node1",
                     rule_alert_handler_instance.rule_cache["ecc_rule"])
def test_check_status_no_jobs_running(self, mock_load_rule_config,
                                      mock_email_handler,
                                      mock_load_ecc_config,
                                      mock_load_etcd_config,
                                      mock_request_get, mock_pod_list):
    """check_status queues a node early when no pods are running on it.

    node1 was found two days ago (reboot window is five days) and the pod
    list is empty. The test expects node1 in nodes_ready_for_action and no
    jobs queued for migration.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config

    etcd_config = test_util.mock_etcd_config()
    mock_load_etcd_config.return_value = etcd_config

    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["days_until_node_reboot"] = 5

    time_two_days_ago = datetime.utcnow() - timedelta(days=2)

    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "node1": {
            "time_found": time_two_days_ago.strftime(
                rule_config['date_time_format']),
            "instance": "192.168.0.1",
        }
    }

    # Node is not yet due to be rebooted.
    node_boot_times = {
        "192.168.0.1": str(
            time_two_days_ago.replace(tzinfo=timezone.utc).timestamp())
    }
    mock_request_get.return_value.json.return_value = (
        _mock_prometheus_node_boot_time_response(node_boot_times))

    # No pods running on the node, so it should trigger take action.
    mock_pod_list.return_value = test_util.mock_v1_pod_list([])

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    response = ecc_reboot_node_rule_instance.check_status()

    self.assertTrue(response)
    self.assertEqual(
        1, len(ecc_reboot_node_rule_instance.nodes_ready_for_action))
    # assertIn reports the actual list contents on failure.
    self.assertIn("node1",
                  ecc_reboot_node_rule_instance.nodes_ready_for_action)
    self.assertEqual(
        0, len(ecc_reboot_node_rule_instance.jobs_ready_for_migration))
def test_get_rule_cache_keys(self, mock_email_handler, mock_config):
    """get_rule_cache_keys is empty for an unknown rule and lists added keys."""
    mock_config.return_value = _mock_rule_config()
    rule = "TestRule"
    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()

    # Nothing has been cached for the rule yet.
    keys = rule_alert_handler_instance.get_rule_cache_keys(rule)
    self.assertEqual(len(keys), 0)

    rule_alert_handler_instance.update_rule_cache(rule, "test_key1",
                                                  "test_value1")
    rule_alert_handler_instance.update_rule_cache(rule, "test_key2",
                                                  "test_value2")
    rule_alert_handler_instance.update_rule_cache(rule, "test_key3",
                                                  "test_value3")

    keys = rule_alert_handler_instance.get_rule_cache_keys(rule)
    self.assertEqual(len(keys), 3)
    # assertIn shows the returned keys if one is missing.
    self.assertIn("test_key1", keys)
    self.assertIn("test_key2", keys)
    self.assertIn("test_key3", keys)
def test_check_status_ecc_error_node_already_detected(
        self, mock_load_ecc_config, mock_request_get, mock_list_node,
        mock_rule_alert_handler_load_config, mock_email_handler):
    """check_status reports only the node that is not already cached.

    mock-worker-one is preloaded in the "ecc_rule" cache, so the test
    expects only mock-worker-two in new_bad_nodes.
    """
    mock_rule_config = test_util.mock_rule_config()
    mock_rule_alert_handler_load_config.return_value = mock_rule_config
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()

    mock_rule_alert_handler = rule_alert_handler.RuleAlertHandler()
    # Node already detected in a previous run.
    mock_rule_alert_handler.rule_cache = {
        "ecc_rule": {
            "mock-worker-one": {
                "time_found": "2020-02-18 21:14:20.351019",
                "instance": "192.168.0.1",
            }
        }
    }

    mock_request_get.return_value.json.return_value = _mock_prometheus_ecc_data()
    mock_list_node.return_value = test_util.mock_v1_node_list([
        {"instance": "192.168.0.1", "node_name": "mock-worker-one"},
        {"instance": "192.168.0.2", "node_name": "mock-worker-two"},
    ])

    ecc_rule_instance = EccDetectErrorRule(mock_rule_alert_handler,
                                           mock_rule_config)
    check_status_response = ecc_rule_instance.check_status()

    self.assertTrue(check_status_response)
    self.assertEqual(len(ecc_rule_instance.new_bad_nodes), 1)
    # assertIn shows which nodes were actually reported on failure.
    self.assertIn("mock-worker-two", ecc_rule_instance.new_bad_nodes)
def test_check_status_time_to_take_action(self,
                                          mock_load_rule_config,
                                          mock_email_handler,
                                          mock_ecc_config,
                                          mock_request_get):
    """check_status queues node1 once its detection is past the deadline.

    node1 was found five days and one minute ago with a five-day reboot
    window; its mocked boot time is six days ago.
    """
    config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = config

    ecc_config = test_util.mock_ecc_config()
    ecc_config["days_until_node_reboot"] = 5
    mock_ecc_config.return_value = ecc_config

    six_days_ago = datetime.utcnow() - timedelta(days=6)
    just_over_five_days_ago = datetime.utcnow() - timedelta(days=5, minutes=1)

    # ECC error detection occurred in a previous iteration.
    handler = rule_alert_handler.RuleAlertHandler()
    found_at = just_over_five_days_ago.strftime(config['date_time_format'])
    handler.rule_cache["ecc_rule"] = {
        "node1": {"time_found": found_at, "instance": "192.168.0.1"},
    }

    boot_epoch = six_days_ago.replace(tzinfo=timezone.utc).timestamp()
    boot_times = {"192.168.0.1": str(boot_epoch)}
    mock_request_get.return_value.json.return_value = (
        _mock_prometheus_node_boot_time_response(boot_times))

    reboot_rule = ecc_reboot_node_rule.EccRebootNodeRule(handler, config)
    outcome = reboot_rule.check_status()

    self.assertTrue(outcome)
    self.assertEqual(1, len(reboot_rule.nodes_ready_for_action))
    self.assertEqual("node1", reboot_rule.nodes_ready_for_action[0])
def test_clean_expired_items_in_rule_cache(self, mock_load_rule_config,
                                           mock_email_handler,
                                           mock_ecc_config,
                                           mock_request_get):
    """clean_expired_items_in_rule_cache removes only the expired node.

    With "hours_until_alert_expiration" set to 4, node1 (found four hours
    and one minute ago) should be removed while node2 (found one hour ago)
    should stay.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config

    # NOTE: despite the parameter name, this mock is fed the latency config.
    mock_ecc_config.return_value = test_util.mock_latency_config()
    mock_ecc_config.return_value["hours_until_alert_expiration"] = 4

    time_one_hours_ago = datetime.utcnow() - timedelta(hours=1)
    time_four_hours_ago = datetime.utcnow() - timedelta(hours=4, minutes=1)

    # Large latency alert detected previously.
    alert = rule_alert_handler.RuleAlertHandler()
    alert.rule_cache["smi_latency_rule"] = {
        "node1": {
            "time_found": time_four_hours_ago.strftime(
                rule_config['date_time_format']),
            "instance": "192.168.0.1",
        },
        "node2": {
            "time_found": time_one_hours_ago.strftime(
                rule_config['date_time_format']),
            "instance": "192.168.0.2",
        },
    }

    smi_latency_rule_instance = nvidia_smi_latency_rule.NvidiaSmiLatencyRule(
        alert, rule_config)
    smi_latency_rule_instance.clean_expired_items_in_rule_cache()

    # Only the latency rule should be present at the top level.
    self.assertEqual(1, len(alert.rule_cache))

    # The top-level length above counts rules, not nodes, so on its own it
    # would pass even if nothing had been cleaned. Check the per-node
    # entries explicitly.
    remaining_nodes = alert.rule_cache["smi_latency_rule"]
    self.assertEqual(1, len(remaining_nodes))
    self.assertNotIn("node1", remaining_nodes)
    self.assertIn("node2", remaining_nodes)
def test_take_action_pause_failed(self, mock_load_rule_config,
                                  mock_put_requests, mock_get_requests,
                                  mock_email_load_config,
                                  mock_load_ecc_config,
                                  mock_load_etcd_config):
    """take_action skips nodes whose jobs could not be paused.

    Every mocked GET returns a failure message. Workers one, two and three
    host jobs queued for migration (two and three share a distributed job),
    while worker four hosts none. The test expects a "reboot_requested"
    entry only for worker four.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config

    etcd_config = test_util.mock_etcd_config()
    mock_load_etcd_config.return_value = etcd_config

    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["alert_job_owners"] = True

    mock_get_requests.return_value.json.return_value = {
        "result": "Sorry, something went wrong."
    }

    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "mock-worker-one": {"instance": "192.168.0.1:9090"},
        "mock-worker-two": {"instance": "192.168.0.2:9090"},
        "mock-worker-three": {"instance": "192.168.0.3:9090"},
        "mock-worker-four": {"instance": "192.168.0.4:9090"},
    }

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    ecc_reboot_node_rule_instance.nodes_ready_for_action = [
        "mock-worker-one",
        "mock-worker-two",
        "mock-worker-three",
        "mock-worker-four",
    ]
    ecc_reboot_node_rule_instance.jobs_ready_for_migration = {
        "87654321-wxyz": {
            "user_name": "user1",
            "vc_name": "vc1",
            "node_names": ["mock-worker-one"],
            "job_link": "/job-link-1",
        },
        # distributed job
        "12345678-abcd": {
            "user_name": "user2",
            "vc_name": "vc1",
            "node_names": ["mock-worker-two", "mock-worker-three"],
            "job_link": "/job-link-2",
        },
    }

    mock_put_requests.return_value.json.return_value = {
        "action": "set",
        "node": {
            "key": "/mock-worker-four/reboot",
            "value": "True",
            "modifiedIndex": 39,
            "createdIndex": 39,
        },
    }

    ecc_reboot_node_rule_instance.take_action()

    ecc_cache = rule_alert_handler_instance.rule_cache["ecc_rule"]
    ready_nodes = ecc_reboot_node_rule_instance.nodes_ready_for_action

    # Nodes should be skipped since job migration failed (workers two and
    # three belong to the distributed job). assertIn/assertNotIn report
    # the offending member and container on failure.
    for skipped_node in ("mock-worker-one", "mock-worker-two",
                         "mock-worker-three"):
        self.assertIn(skipped_node, ecc_cache)
        self.assertNotIn("reboot_requested", ecc_cache[skipped_node])
        self.assertNotIn(skipped_node, ready_nodes)

    # Node should be successfully rebooted (had no jobs to migrate).
    self.assertIn("mock-worker-four", ecc_cache)
    self.assertIn("reboot_requested", ecc_cache["mock-worker-four"])
    self.assertIn("mock-worker-four", ready_nodes)
def test_take_action_reboot_failed(self, mock_load_rule_config,
                                   mock_create_email_for_pause_resume_job,
                                   mock_put_requests, mock_get_requests,
                                   mock_email_handler,
                                   mock_load_etcd_config,
                                   mock_load_ecc_config):
    """take_action records a reboot request only when the PUT succeeds.

    All three jobs get pause/status/resume success responses. The first
    mocked PUT returns a "set" action and the second returns an error
    payload. The test expects three pause/resume emails, with
    "reboot_requested" set for mock-worker-one but not mock-worker-three.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config

    etcd_config = test_util.mock_etcd_config()
    mock_load_etcd_config.return_value = etcd_config

    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["alert_job_owners"] = True

    mock_get_requests.return_value.json.side_effect = [
        # job 1
        {"result": "Success, job paused."},
        {"errorMsg": None, "jobStatus": "paused",
         "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT"},
        {"result": "Success, job resumed."},
        # job 2
        {"result": "Success, job paused."},
        {"errorMsg": None, "jobStatus": "paused",
         "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT"},
        {"result": "Success, job resumed."},
        # job 3
        {"result": "Success, job paused."},
        {"errorMsg": None, "jobStatus": "paused",
         "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT"},
        {"result": "Success, job resumed."},
    ]

    mock_put_requests.return_value.json.side_effect = [
        {
            "action": "set",
            "node": {
                "key": "/mock-worker-one/reboot",
                "value": "True",
                "modifiedIndex": 39,
                "createdIndex": 39,
            },
        },
        # reboot failed for one of the nodes
        {
            "error_code": 100,
            "message": "Something went wrong",
            "cause": "Unable to open connection",
        },
    ]

    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "mock-worker-one": {"instance": "192.168.0.1:9090"},
        "mock-worker-three": {"instance": "192.168.0.3:9090"},
    }

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    ecc_reboot_node_rule_instance.nodes_ready_for_action = [
        "mock-worker-one", "mock-worker-three"]
    ecc_reboot_node_rule_instance.jobs_ready_for_migration = {
        "87654321-wxyz": {
            "user_name": "user1",
            "vc_name": "vc1",
            "node_names": ["mock-worker-one"],
            "job_link": "/job-link-1",
        },
        "12345678-abcd": {
            "user_name": "user2",
            "vc_name": "vc2",
            "node_names": ["mock-worker-one"],
            "job_link": "/job-link-2",
        },
        "99999999-efgh": {
            "user_name": "user3",
            "vc_name": "vc3",
            "node_names": ["mock-worker-three"],
            "job_link": "/job-link-3",
        },
    }

    ecc_reboot_node_rule_instance.take_action()

    self.assertEqual(3, mock_create_email_for_pause_resume_job.call_count)

    ecc_cache = rule_alert_handler_instance.rule_cache["ecc_rule"]
    self.assertEqual(2, len(ecc_cache))

    # Reboot successful for this node. assertIn/assertNotIn report the
    # cache entry contents on failure.
    self.assertIn("mock-worker-one", ecc_cache)
    self.assertIn("reboot_requested", ecc_cache["mock-worker-one"])

    # Reboot failed for this node.
    self.assertIn("mock-worker-three", ecc_cache)
    self.assertNotIn("reboot_requested", ecc_cache["mock-worker-three"])
def test_take_action(self, mock_load_rule_config, mock_load_ecc_config,
                     mock_email_handler, mock_pod_list, mock_cordon_node,
                     mock_create_email_for_dris,
                     mock_create_email_for_job_owner):
    """take_action cordons both bad nodes, sends emails and caches them.

    Two nodes are preloaded into new_bad_nodes. The pod list has two jobs
    on mock-worker-one, one of which also runs on mock-worker-two, plus a
    job on the unaffected mock-worker-three. The test expects two cordon
    calls, one DRI email, two job-owner emails, and both nodes cached under
    "ecc_rule" with their instance addresses.
    """
    mock_rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = mock_rule_config
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()

    alert = rule_alert_handler.RuleAlertHandler()
    ecc_rule_instance = EccDetectErrorRule(alert, mock_rule_config)
    ecc_rule_instance.new_bad_nodes = {
        "mock-worker-one": "192.168.0.1",
        "mock-worker-two": "192.168.0.2",
    }

    mock_pod_list.return_value = test_util.mock_v1_pod_list([
        {
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "mock-worker-one",
        },
        {
            "job_name": "12345678-abcd",
            "user_name": "user2",
            "vc_name": "vc2",
            "node_name": "mock-worker-one",
        },
        {
            "job_name": "12345678-abcd",
            "user_name": "user2",
            "vc_name": "vc2",
            "node_name": "mock-worker-two",
        },
        {
            "job_name": "99999999-efgh",
            "user_name": "user3",
            "vc_name": "vc3",
            "node_name": "mock-worker-three",
        },
    ])

    ecc_rule_instance.take_action()

    self.assertEqual(2, mock_cordon_node.call_count)
    self.assertEqual(1, mock_create_email_for_dris.call_count)
    self.assertEqual(2, mock_create_email_for_job_owner.call_count)

    # assertIn shows the cache contents when an expected key is missing.
    self.assertIn("ecc_rule", alert.rule_cache)
    self.assertIn("mock-worker-one", alert.rule_cache["ecc_rule"])
    self.assertEqual(
        "192.168.0.1",
        alert.rule_cache["ecc_rule"]["mock-worker-one"]["instance"])
    self.assertIn("mock-worker-two", alert.rule_cache["ecc_rule"])
    self.assertEqual(
        "192.168.0.2",
        alert.rule_cache["ecc_rule"]["mock-worker-two"]["instance"])
import logging import logging.config import importlib import traceback from pathlib import Path from utils import rule_alert_handler import rules with open('./logging.yaml', 'r') as log_file: log_config = yaml.safe_load(log_file) logging.config.dictConfig(log_config) logger = logging.getLogger(__name__) alert = rule_alert_handler.RuleAlertHandler() def Run(): try: while True: with open('/etc/RepairManager/config/rule-config.yaml', 'r') as config_file: config = yaml.safe_load(config_file) # execute all rules listed in config rules_config = config['rules'] for r_key in rules_config.keys(): try: # retrieve module and class for given rule module_name = rules_config[r_key]['module_name'] class_name = rules_config[r_key]['class_name'] rule_module = importlib.import_module(module_name)
def test_take_action(self,
                     mock_load_rule_config,
                     mock_create_email_for_pause_resume_job,
                     mock_get_requests,
                     mock_email_handler,
                     mock_load_ecc_config,
                     mock_pod_list):
    """take_action sends one pause/resume email per job on the queued nodes.

    Three jobs are listed across mock-worker-one and mock-worker-three, and
    the mocked GET yields a pause / status / resume success triple for each
    of them.
    """
    config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = config

    ecc_config = test_util.mock_ecc_config()
    ecc_config["alert_job_owners"] = True
    mock_load_ecc_config.return_value = ecc_config

    # One pause/status/resume response triple for each of the three jobs;
    # fresh dicts are built per job so no response object is shared.
    get_responses = []
    for _job in range(3):
        get_responses.extend([
            {"result": "Success, job paused."},
            {"errorMsg": None, "jobStatus": "paused",
             "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT"},
            {"result": "Success, job resumed."},
        ])
    mock_get_requests.return_value.json.side_effect = get_responses

    handler = rule_alert_handler.RuleAlertHandler()
    handler.rule_cache["ecc_rule"] = {
        "mock-worker-one": {"instance": "192.168.0.1:9090"},
        "mock-worker-two": {"instance": "192.168.0.2:9090"},
        "mock-worker-three": {"instance": "192.168.0.3:9090"},
    }

    pods = [
        {
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "mock-worker-one",
        },
        {
            "job_name": "12345678-abcd",
            "user_name": "user2",
            "vc_name": "vc2",
            "node_name": "mock-worker-one",
        },
        {
            "job_name": "99999999-efgh",
            "user_name": "user3",
            "vc_name": "vc3",
            "node_name": "mock-worker-three",
        },
    ]
    mock_pod_list.return_value = test_util.mock_v1_pod_list(pods)

    reboot_rule = ecc_reboot_node_rule.EccRebootNodeRule(handler, config)
    reboot_rule.nodes_ready_for_action = ["mock-worker-one", "mock-worker-three"]

    reboot_rule.take_action()

    emails_sent = mock_create_email_for_pause_resume_job.call_count
    self.assertEqual(3, emails_sent)