def test_take_action(self, mock_load_rule_config, mock_load_ecc_config,
                     mock_email_handler, mock_create_email_for_dris):
    """Check that take_action emails the DRIs once and caches impacted nodes.

    Two nodes are assigned to ``impacted_nodes`` by hand. After
    ``take_action`` the DRI email mock must have been called exactly once,
    and both nodes must be present under ``"smi_latency_rule"`` in the
    handler's rule cache together with their instance address.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` is accepted but not referenced
    in the body.
    """
    mock_rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = mock_rule_config
    mock_load_ecc_config.return_value = test_util.mock_latency_config()

    alert = rule_alert_handler.RuleAlertHandler()
    latency_rule_instance = NvidiaSmiLatencyRule(alert, mock_rule_config)
    latency_rule_instance.impacted_nodes = {
        "mock-worker-one": "192.168.0.1",
        "mock-worker-two": "192.168.0.2",
    }

    latency_rule_instance.take_action()

    self.assertEqual(1, mock_create_email_for_dris.call_count)

    # assertIn shows the container on failure, unlike assertTrue(x in y).
    self.assertIn("smi_latency_rule", alert.rule_cache)
    cached_nodes = alert.rule_cache["smi_latency_rule"]

    self.assertIn("mock-worker-one", cached_nodes)
    self.assertEqual("192.168.0.1", cached_nodes["mock-worker-one"]["instance"])

    self.assertIn("mock-worker-two", cached_nodes)
    self.assertEqual("192.168.0.2", cached_nodes["mock-worker-two"]["instance"])
def test_check_status_ecc_error_detected(
        self, mock_load_ecc_config, mock_request_get, mock_list_node,
        mock_rule_alert_handler_load_config, mock_email_handler):
    """Check that check_status reports both nodes as newly bad.

    The HTTP mock answers with ``_mock_prometheus_ecc_data()`` and the node
    listing contains ``mock-worker-one`` and ``mock-worker-two``. The test
    expects a truthy status and both node names in ``new_bad_nodes``.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` is not referenced in the body.
    """
    mock_rule_config = test_util.mock_rule_config()
    mock_rule_alert_handler_load_config.return_value = mock_rule_config
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_rule_alert_handler = rule_alert_handler.RuleAlertHandler()

    mock_request_get.return_value.json.return_value = _mock_prometheus_ecc_data()
    mock_list_node.return_value = test_util.mock_v1_node_list([
        {"instance": "192.168.0.1", "node_name": "mock-worker-one"},
        {"instance": "192.168.0.2", "node_name": "mock-worker-two"},
    ])

    ecc_rule_instance = EccDetectErrorRule(mock_rule_alert_handler,
                                           mock_rule_config)
    check_status_response = ecc_rule_instance.check_status()

    self.assertTrue(check_status_response)
    self.assertEqual(len(ecc_rule_instance.new_bad_nodes), 2)
    # assertIn shows the container on failure, unlike assertTrue(x in y).
    self.assertIn("mock-worker-one", ecc_rule_instance.new_bad_nodes)
    self.assertIn("mock-worker-two", ecc_rule_instance.new_bad_nodes)
def test_check_status_large_latency_detected(
        self, mock_load_latency_config, mock_request_get, mock_list_node,
        mock_rule_alert_handler_load_config, mock_email_handler,
        mock_create_email_for_dris):
    """Check that check_status flags only the node with large latency.

    The HTTP mock answers with ``_mock_prometheus_latency_data()`` and the
    node listing contains two workers. The test expects a truthy status and
    exactly one impacted node, ``mock-worker-one``.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` and
    ``mock_create_email_for_dris`` are not referenced in the body.
    """
    mock_rule_config = test_util.mock_rule_config()
    mock_rule_alert_handler_load_config.return_value = mock_rule_config
    mock_load_latency_config.return_value = test_util.mock_latency_config()
    mock_rule_alert_handler = rule_alert_handler.RuleAlertHandler()

    mock_request_get.return_value.json.return_value = _mock_prometheus_latency_data()
    mock_list_node.return_value = test_util.mock_v1_node_list([
        {"instance": "192.168.0.1", "node_name": "mock-worker-one"},
        {"instance": "192.168.0.2", "node_name": "mock-worker-two"},
    ])

    latency_rule_instance = NvidiaSmiLatencyRule(mock_rule_alert_handler,
                                                 mock_rule_config)
    check_status_response = latency_rule_instance.check_status()

    self.assertTrue(check_status_response)
    self.assertEqual(len(latency_rule_instance.impacted_nodes), 1)
    # assertIn shows the container on failure, unlike assertTrue(x in y).
    self.assertIn("mock-worker-one", latency_rule_instance.impacted_nodes)
def test_take_action_pause_failed(self, mock_load_rule_config, mock_get_requests,
                                  mock_email_load_config, mock_load_ecc_config,
                                  mock_list_pods,
                                  mock_create_email_for_issue_with_pause_resume_job):
    """Check that one "issue with pause/resume" email is created.

    Every mocked GET returns a non-success ``result`` payload. One pod is
    listed on ``mock-worker-one``, which is the only node marked ready for
    action. After ``take_action`` the issue-email mock must have been called
    exactly once.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_load_config`` is not referenced in the
    body.
    """
    config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = config

    # Build the ECC config fully before handing it to the mock.
    ecc_config = test_util.mock_ecc_config()
    ecc_config["alert_job_owners"] = True
    mock_load_ecc_config.return_value = ecc_config

    failure_payload = {"result": "Sorry, something went wrong."}
    mock_get_requests.return_value.json.return_value = failure_payload

    alert_handler = rule_alert_handler.RuleAlertHandler()
    alert_handler.rule_cache["ecc_rule"] = {
        "mock-worker-one": {"instance": "192.168.0.1:9090"},
    }

    running_pods = [
        {
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "mock-worker-one",
        },
    ]
    mock_list_pods.return_value = test_util.mock_v1_pod_list(running_pods)

    reboot_rule = ecc_reboot_node_rule.EccRebootNodeRule(alert_handler, config)
    reboot_rule.nodes_ready_for_action = ["mock-worker-one"]
    reboot_rule.take_action()

    self.assertEqual(
        1, mock_create_email_for_issue_with_pause_resume_job.call_count)
def test_check_status_no_action_needed(self, mock_load_rule_config, mock_email_handler,
                                       mock_load_etcd_config, mock_load_ecc_config,
                                       mock_request_get, mock_pod_list):
    """Check that check_status is falsy when no cached node needs action.

    With ``days_until_node_reboot`` set to 5, the cache holds ``node1``
    (found two days ago) and ``node2`` (found three days ago, with a
    ``reboot_requested`` entry). Mocked boot times for both predate their
    detection and a job is running on ``node1``. The test expects a falsy
    status and an empty ``nodes_ready_for_action``.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` is not referenced in the body.
    """
    config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = config
    mock_load_etcd_config.return_value = test_util.mock_etcd_config()

    ecc_config = test_util.mock_ecc_config()
    ecc_config["days_until_node_reboot"] = 5
    mock_load_ecc_config.return_value = ecc_config

    two_days_ago = datetime.utcnow() - timedelta(days=2)
    three_days_ago = datetime.utcnow() - timedelta(days=3)
    six_days_ago = datetime.utcnow() - timedelta(days=6)

    # ECC error detection occurred in a previous iteration.
    alert_handler = rule_alert_handler.RuleAlertHandler()
    fmt = config['date_time_format']
    alert_handler.rule_cache["ecc_rule"] = {
        "node1": {
            "time_found": two_days_ago.strftime(fmt),
            "instance": "192.168.0.1",
        },
        # This node already has a reboot attempt, so it should not trigger
        # take action.
        "node2": {
            "time_found": three_days_ago.strftime(fmt),
            "instance": "192.168.0.2",
            "reboot_requested": two_days_ago.strftime(fmt),
        },
    }

    # Neither node has been rebooted since its initial detection.
    boot_time_node1 = three_days_ago.replace(tzinfo=timezone.utc).timestamp()
    boot_time_node2 = six_days_ago.replace(tzinfo=timezone.utc).timestamp()
    node_boot_times = {
        "192.168.0.1": str(boot_time_node1),
        "192.168.0.2": str(boot_time_node2),
    }
    mock_request_get.return_value.json.return_value = \
        _mock_prometheus_node_boot_time_response(node_boot_times)

    # At least one job is running on the node.
    mock_pod_list.return_value = test_util.mock_v1_pod_list([
        {
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "node1",
        },
    ])

    reboot_rule = ecc_reboot_node_rule.EccRebootNodeRule(alert_handler, config)
    status = reboot_rule.check_status()

    self.assertFalse(status)
    self.assertEqual(0, len(reboot_rule.nodes_ready_for_action))
def test_check_status_node_due_for_reboot(self, mock_load_rule_config, mock_email_handler,
                                          mock_load_ecc_config, mock_load_etcd_config,
                                          mock_request_get, mock_pod_list):
    """Check that a node past the reboot deadline is marked ready for action.

    With ``days_until_node_reboot`` set to 5, ``node1`` was found five days
    and one minute ago, and its mocked boot time is six days ago. One job is
    running on the node. The test expects a truthy status, ``node1`` as the
    only entry in ``nodes_ready_for_action`` and the job as the only entry
    in ``jobs_ready_for_migration``.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` is not referenced in the body.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config
    etcd_config = test_util.mock_etcd_config()
    mock_load_etcd_config.return_value = etcd_config
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["days_until_node_reboot"] = 5

    time_six_days_ago = datetime.utcnow() - timedelta(days=6)
    time_five_days_ago = datetime.utcnow() - timedelta(days=5, minutes=1)

    # ECC error detection occurred in a previous iteration.
    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "node1": {
            "time_found": time_five_days_ago.strftime(rule_config['date_time_format']),
            "instance": "192.168.0.1",
        },
    }

    # The node is due to be rebooted (exceeded the configured deadline), so
    # it should trigger take action.
    node_boot_times = {
        "192.168.0.1": str(time_six_days_ago.replace(tzinfo=timezone.utc).timestamp()),
    }
    mock_request_get.return_value.json.return_value = \
        _mock_prometheus_node_boot_time_response(node_boot_times)

    # At least one job is running on the node.
    mock_pod_list.return_value = test_util.mock_v1_pod_list([
        {
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "node1",
        },
    ])

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    response = ecc_reboot_node_rule_instance.check_status()

    self.assertTrue(response)
    # assertIn shows the container on failure, unlike assertTrue(x in y).
    self.assertEqual(1, len(ecc_reboot_node_rule_instance.nodes_ready_for_action))
    self.assertIn("node1", ecc_reboot_node_rule_instance.nodes_ready_for_action)
    self.assertEqual(1, len(ecc_reboot_node_rule_instance.jobs_ready_for_migration))
    self.assertIn("87654321-wxyz",
                  ecc_reboot_node_rule_instance.jobs_ready_for_migration)
def test_check_status_ecc_error_not_found(self, mock_load_ecc_config, mock_request_get,
                                          mock_list_node, mock_rule_alert_handler,
                                          mock_get_node_address_info):
    """Check that check_status is falsy when the metric response is empty.

    The HTTP mock answers with ``test_util.mock_empty_prometheus_metric_data()``.
    The test expects a falsy status and no entries in ``new_bad_nodes``.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view. ``mock_rule_alert_handler`` is passed straight to the
    rule as its handler; ``mock_list_node`` and
    ``mock_get_node_address_info`` are not referenced in the body.
    """
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()

    empty_metrics = test_util.mock_empty_prometheus_metric_data()
    mock_request_get.return_value.json.return_value = empty_metrics

    rule = EccDetectErrorRule(mock_rule_alert_handler,
                              test_util.mock_rule_config())
    status = rule.check_status()

    self.assertFalse(status)
    self.assertEqual(len(rule.new_bad_nodes), 0)
def test_check_status_node_rebooted_after_detection(self, mock_load_rule_config,
                                                    mock_email_handler,
                                                    mock_load_ecc_config,
                                                    mock_load_etcd_config,
                                                    mock_request_get, mock_pod_list,
                                                    mock_uncordon_node):
    """Check that a node rebooted after detection is dropped from the cache.

    ``node1`` was found one day ago and its mocked boot time is "now", i.e.
    later than the detection. No pods are listed. The test expects a falsy
    status, an empty ``nodes_ready_for_action`` and ``node1`` no longer
    present under ``"ecc_rule"`` in the rule cache.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` and ``mock_uncordon_node`` are
    not referenced in the body.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config
    etcd_config = test_util.mock_etcd_config()
    mock_load_etcd_config.return_value = etcd_config
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["days_until_node_reboot"] = 5

    time_one_days_ago = datetime.utcnow() - timedelta(days=1)
    now = datetime.utcnow()

    # ECC error detection occurred in a previous iteration.
    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "node1": {
            "time_found": time_one_days_ago.strftime(rule_config['date_time_format']),
            "instance": "192.168.0.1",
        },
    }

    # The node rebooted *after* the initial ECC error detection.
    node_boot_times = {
        "192.168.0.1": str(now.replace(tzinfo=timezone.utc).timestamp()),
    }
    mock_request_get.return_value.json.return_value = \
        _mock_prometheus_node_boot_time_response(node_boot_times)

    mock_pod_list.return_value = test_util.mock_v1_pod_list([])

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    response = ecc_reboot_node_rule_instance.check_status()

    self.assertFalse(response)
    self.assertEqual(0, len(ecc_reboot_node_rule_instance.nodes_ready_for_action))
    # assertNotIn shows the container on failure, unlike assertTrue(x not in y).
    self.assertNotIn("node1", rule_alert_handler_instance.rule_cache["ecc_rule"])
def test_check_status_no_jobs_running(self, mock_load_rule_config, mock_email_handler,
                                      mock_load_ecc_config, mock_load_etcd_config,
                                      mock_request_get, mock_pod_list):
    """Check that a node with no pods is ready for action before the deadline.

    With ``days_until_node_reboot`` set to 5, ``node1`` was found two days
    ago and its mocked boot time is the same moment. The pod listing is
    empty. The test expects a truthy status, ``node1`` as the only entry in
    ``nodes_ready_for_action`` and no jobs in ``jobs_ready_for_migration``.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` is not referenced in the body.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config
    etcd_config = test_util.mock_etcd_config()
    mock_load_etcd_config.return_value = etcd_config
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["days_until_node_reboot"] = 5

    time_two_days_ago = datetime.utcnow() - timedelta(days=2)

    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "node1": {
            "time_found": time_two_days_ago.strftime(rule_config['date_time_format']),
            "instance": "192.168.0.1",
        },
    }

    # The node is not yet due to be rebooted.
    node_boot_times = {
        "192.168.0.1": str(time_two_days_ago.replace(tzinfo=timezone.utc).timestamp()),
    }
    mock_request_get.return_value.json.return_value = \
        _mock_prometheus_node_boot_time_response(node_boot_times)

    # No pods are running on the node, so it should trigger take action.
    mock_pod_list.return_value = test_util.mock_v1_pod_list([])

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    response = ecc_reboot_node_rule_instance.check_status()

    self.assertTrue(response)
    self.assertEqual(1, len(ecc_reboot_node_rule_instance.nodes_ready_for_action))
    # assertIn shows the container on failure, unlike assertTrue(x in y).
    self.assertIn("node1", ecc_reboot_node_rule_instance.nodes_ready_for_action)
    self.assertEqual(0, len(ecc_reboot_node_rule_instance.jobs_ready_for_migration))
def test_check_status_ecc_error_node_already_detected(
        self, mock_load_ecc_config, mock_request_get, mock_list_node,
        mock_rule_alert_handler_load_config, mock_email_handler):
    """Check that a node already in the rule cache is not reported again.

    The rule cache is pre-populated with ``mock-worker-one`` under
    ``"ecc_rule"``. The HTTP mock answers with
    ``_mock_prometheus_ecc_data()`` and the node listing contains both
    workers. The test expects a truthy status and ``mock-worker-two`` as the
    only entry in ``new_bad_nodes``.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` is not referenced in the body.
    """
    mock_rule_config = test_util.mock_rule_config()
    mock_rule_alert_handler_load_config.return_value = mock_rule_config
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_rule_alert_handler = rule_alert_handler.RuleAlertHandler()

    # Nodes already detected in a previous run.
    mock_rule_alert_handler.rule_cache = {
        "ecc_rule": {
            "mock-worker-one": {
                "time_found": "2020-02-18 21:14:20.351019",
                "instance": "192.168.0.1",
            },
        },
    }

    mock_request_get.return_value.json.return_value = _mock_prometheus_ecc_data()
    mock_list_node.return_value = test_util.mock_v1_node_list([
        {"instance": "192.168.0.1", "node_name": "mock-worker-one"},
        {"instance": "192.168.0.2", "node_name": "mock-worker-two"},
    ])

    ecc_rule_instance = EccDetectErrorRule(mock_rule_alert_handler,
                                           mock_rule_config)
    check_status_response = ecc_rule_instance.check_status()

    self.assertTrue(check_status_response)
    self.assertEqual(len(ecc_rule_instance.new_bad_nodes), 1)
    # assertIn shows the container on failure, unlike assertTrue(x in y).
    self.assertIn("mock-worker-two", ecc_rule_instance.new_bad_nodes)
def test_clean_expired_items_in_rule_cache(self, mock_load_rule_config, mock_email_handler,
                                           mock_ecc_config, mock_request_get):
    """Check that only the expired node is removed from the latency cache.

    With ``hours_until_alert_expiration`` set to 4, ``node1`` was found four
    hours and one minute ago and ``node2`` one hour ago. After
    ``clean_expired_items_in_rule_cache`` the test expects ``node1`` to be
    gone and ``node2`` to remain under ``"smi_latency_rule"``.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` and ``mock_request_get`` are
    not referenced in the body.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config
    mock_ecc_config.return_value = test_util.mock_latency_config()
    mock_ecc_config.return_value["hours_until_alert_expiration"] = 4

    time_one_hours_ago = datetime.utcnow() - timedelta(hours=1)
    time_four_hours_ago = datetime.utcnow() - timedelta(hours=4, minutes=1)

    # A large latency alert was detected previously.
    alert = rule_alert_handler.RuleAlertHandler()
    alert.rule_cache["smi_latency_rule"] = {
        "node1": {
            "time_found": time_four_hours_ago.strftime(rule_config['date_time_format']),
            "instance": "192.168.0.1",
        },
        "node2": {
            "time_found": time_one_hours_ago.strftime(rule_config['date_time_format']),
            "instance": "192.168.0.2",
        },
    }

    smi_latency_rule_instance = nvidia_smi_latency_rule.NvidiaSmiLatencyRule(
        alert, rule_config)
    smi_latency_rule_instance.clean_expired_items_in_rule_cache()

    # This counts rules in the cache, not nodes, so on its own it cannot
    # detect whether the expired node was actually removed.
    self.assertEqual(1, len(alert.rule_cache))

    cached_nodes = alert.rule_cache["smi_latency_rule"]
    # The expired entry must be gone and the recent one must survive.
    self.assertNotIn("node1", cached_nodes)
    self.assertIn("node2", cached_nodes)
def test_check_status_time_to_take_action(self, mock_load_rule_config, mock_email_handler,
                                          mock_ecc_config, mock_request_get):
    """Check that a node past the reboot deadline is selected for action.

    With ``days_until_node_reboot`` set to 5, ``node1`` was found five days
    and one minute ago and its mocked boot time is six days ago. The test
    expects a truthy status and ``node1`` as the single element of
    ``nodes_ready_for_action``.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` is not referenced in the body.
    """
    config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = config

    ecc_config = test_util.mock_ecc_config()
    ecc_config["days_until_node_reboot"] = 5
    mock_ecc_config.return_value = ecc_config

    six_days_ago = datetime.utcnow() - timedelta(days=6)
    just_over_five_days_ago = datetime.utcnow() - timedelta(days=5, minutes=1)

    # ECC error detection occurred in a previous iteration.
    alert_handler = rule_alert_handler.RuleAlertHandler()
    found_at = just_over_five_days_ago.strftime(config['date_time_format'])
    alert_handler.rule_cache["ecc_rule"] = {
        "node1": {"time_found": found_at, "instance": "192.168.0.1"},
    }

    boot_timestamp = six_days_ago.replace(tzinfo=timezone.utc).timestamp()
    node_boot_times = {"192.168.0.1": str(boot_timestamp)}
    mock_request_get.return_value.json.return_value = \
        _mock_prometheus_node_boot_time_response(node_boot_times)

    reboot_rule = ecc_reboot_node_rule.EccRebootNodeRule(alert_handler, config)
    status = reboot_rule.check_status()

    self.assertTrue(status)
    self.assertEqual(1, len(reboot_rule.nodes_ready_for_action))
    self.assertEqual("node1", reboot_rule.nodes_ready_for_action[0])
def test_take_action(self, mock_load_rule_config, mock_load_ecc_config,
                     mock_email_handler, mock_pod_list, mock_cordon_node,
                     mock_create_email_for_dris,
                     mock_create_email_for_job_owner):
    """Check that take_action cordons, notifies and caches the new bad nodes.

    Two nodes are assigned to ``new_bad_nodes`` by hand. The pod listing has
    two distinct jobs on those nodes (one spanning both) plus one job on an
    unrelated node. After ``take_action`` the test expects two cordon calls,
    one DRI email, two job-owner emails, and both nodes cached under
    ``"ecc_rule"`` with their instance address.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` is not referenced in the body.
    """
    mock_rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = mock_rule_config
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()

    alert = rule_alert_handler.RuleAlertHandler()
    ecc_rule_instance = EccDetectErrorRule(alert, mock_rule_config)
    ecc_rule_instance.new_bad_nodes = {
        "mock-worker-one": "192.168.0.1",
        "mock-worker-two": "192.168.0.2",
    }

    mock_pod_list.return_value = test_util.mock_v1_pod_list([
        {
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "mock-worker-one",
        },
        {
            "job_name": "12345678-abcd",
            "user_name": "user2",
            "vc_name": "vc2",
            "node_name": "mock-worker-one",
        },
        {
            "job_name": "12345678-abcd",
            "user_name": "user2",
            "vc_name": "vc2",
            "node_name": "mock-worker-two",
        },
        {
            "job_name": "99999999-efgh",
            "user_name": "user3",
            "vc_name": "vc3",
            "node_name": "mock-worker-three",
        },
    ])

    ecc_rule_instance.take_action()

    self.assertEqual(2, mock_cordon_node.call_count)
    self.assertEqual(1, mock_create_email_for_dris.call_count)
    self.assertEqual(2, mock_create_email_for_job_owner.call_count)

    # assertIn shows the container on failure, unlike assertTrue(x in y).
    self.assertIn("ecc_rule", alert.rule_cache)
    cached_nodes = alert.rule_cache["ecc_rule"]

    self.assertIn("mock-worker-one", cached_nodes)
    self.assertEqual("192.168.0.1", cached_nodes["mock-worker-one"]["instance"])

    self.assertIn("mock-worker-two", cached_nodes)
    self.assertEqual("192.168.0.2", cached_nodes["mock-worker-two"]["instance"])
def test_take_action(self, mock_load_rule_config, mock_create_email_for_pause_resume_job,
                     mock_get_requests, mock_email_handler, mock_load_ecc_config,
                     mock_pod_list):
    """Check that one pause/resume email is created per job.

    The mocked GET endpoint yields, for each of three jobs in turn, a
    "paused" result, a ``jobStatus`` of ``"paused"`` and a "resumed" result.
    Three pods are listed on ``mock-worker-one`` and ``mock-worker-three``,
    which are the nodes marked ready for action. After ``take_action`` the
    pause/resume email mock must have been called three times.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` is not referenced in the body.
    """
    config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = config

    ecc_config = test_util.mock_ecc_config()
    ecc_config["alert_job_owners"] = True
    mock_load_ecc_config.return_value = ecc_config

    # Each job consumes three responses: pause, status poll, resume.
    # Fresh dicts are built per job so no response object is shared.
    job_responses = []
    for _ in range(3):
        job_responses.append({"result": "Success, job paused."})
        job_responses.append({
            "errorMsg": None,
            "jobStatus": "paused",
            "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT",
        })
        job_responses.append({"result": "Success, job resumed."})
    mock_get_requests.return_value.json.side_effect = job_responses

    alert_handler = rule_alert_handler.RuleAlertHandler()
    alert_handler.rule_cache["ecc_rule"] = {
        "mock-worker-one": {"instance": "192.168.0.1:9090"},
        "mock-worker-two": {"instance": "192.168.0.2:9090"},
        "mock-worker-three": {"instance": "192.168.0.3:9090"},
    }

    running_pods = [
        {
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "mock-worker-one",
        },
        {
            "job_name": "12345678-abcd",
            "user_name": "user2",
            "vc_name": "vc2",
            "node_name": "mock-worker-one",
        },
        {
            "job_name": "99999999-efgh",
            "user_name": "user3",
            "vc_name": "vc3",
            "node_name": "mock-worker-three",
        },
    ]
    mock_pod_list.return_value = test_util.mock_v1_pod_list(running_pods)

    reboot_rule = ecc_reboot_node_rule.EccRebootNodeRule(alert_handler, config)
    reboot_rule.nodes_ready_for_action = ["mock-worker-one", "mock-worker-three"]
    reboot_rule.take_action()

    self.assertEqual(3, mock_create_email_for_pause_resume_job.call_count)
def test_take_action_reboot_failed(self, mock_load_rule_config,
                                   mock_create_email_for_pause_resume_job,
                                   mock_put_requests, mock_get_requests,
                                   mock_email_handler, mock_load_etcd_config,
                                   mock_load_ecc_config):
    """Check that only the successfully rebooted node gets ``reboot_requested``.

    Pause/status/resume GET responses succeed for all three jobs. The first
    mocked PUT response is a successful ``"set"`` action and the second is
    an error payload. After ``take_action`` the test expects three
    pause/resume emails, both nodes still cached under ``"ecc_rule"``, and
    ``reboot_requested`` present for ``mock-worker-one`` but absent for
    ``mock-worker-three``.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_handler`` is not referenced in the body.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config
    etcd_config = test_util.mock_etcd_config()
    mock_load_etcd_config.return_value = etcd_config
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["alert_job_owners"] = True

    mock_get_requests.return_value.json.side_effect = [
        # job 1
        {"result": "Success, job paused."},
        {"errorMsg": None, "jobStatus": "paused", "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT"},
        {"result": "Success, job resumed."},
        # job 2
        {"result": "Success, job paused."},
        {"errorMsg": None, "jobStatus": "paused", "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT"},
        {"result": "Success, job resumed."},
        # job 3
        {"result": "Success, job paused."},
        {"errorMsg": None, "jobStatus": "paused", "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT"},
        {"result": "Success, job resumed."},
    ]

    mock_put_requests.return_value.json.side_effect = [
        {
            "action": "set",
            "node": {
                "key": "/mock-worker-one/reboot",
                "value": "True",
                "modifiedIndex": 39,
                "createdIndex": 39,
            },
        },
        # reboot failed for one of the nodes
        {
            "error_code": 100,
            "message": "Something went wrong",
            "cause": "Unable to open connection",
        },
    ]

    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "mock-worker-one": {"instance": "192.168.0.1:9090"},
        "mock-worker-three": {"instance": "192.168.0.3:9090"},
    }

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    ecc_reboot_node_rule_instance.nodes_ready_for_action = [
        "mock-worker-one", "mock-worker-three"]
    ecc_reboot_node_rule_instance.jobs_ready_for_migration = {
        "87654321-wxyz": {
            "user_name": "user1",
            "vc_name": "vc1",
            "node_names": ["mock-worker-one"],
            "job_link": "/job-link-1",
        },
        "12345678-abcd": {
            "user_name": "user2",
            "vc_name": "vc2",
            "node_names": ["mock-worker-one"],
            "job_link": "/job-link-2",
        },
        "99999999-efgh": {
            "user_name": "user3",
            "vc_name": "vc3",
            "node_names": ["mock-worker-three"],
            "job_link": "/job-link-3",
        },
    }

    ecc_reboot_node_rule_instance.take_action()

    self.assertEqual(3, mock_create_email_for_pause_resume_job.call_count)

    ecc_cache = rule_alert_handler_instance.rule_cache["ecc_rule"]
    self.assertEqual(2, len(ecc_cache))

    # assertIn/assertNotIn show the container on failure, unlike
    # assertTrue/assertFalse(x in y).
    # reboot successful for this node
    self.assertIn("mock-worker-one", ecc_cache)
    self.assertIn("reboot_requested", ecc_cache["mock-worker-one"])

    # reboot failed for this node
    self.assertIn("mock-worker-three", ecc_cache)
    self.assertNotIn("reboot_requested", ecc_cache["mock-worker-three"])
def test_take_action_pause_failed(self, mock_load_rule_config, mock_put_requests,
                                  mock_get_requests, mock_email_load_config,
                                  mock_load_ecc_config, mock_load_etcd_config):
    """Check that nodes whose job migration failed are skipped for reboot.

    Every mocked GET returns a non-success ``result`` payload. Workers one,
    two and three host jobs listed in ``jobs_ready_for_migration`` (two and
    three share a distributed job); worker four hosts none. The mocked PUT
    returns a successful ``"set"`` action. After ``take_action`` the test
    expects workers one to three to stay cached without ``reboot_requested``
    and to be gone from ``nodes_ready_for_action``. Worker four is expected
    to have ``reboot_requested`` set and remain in ``nodes_ready_for_action``.

    The ``mock_*`` arguments are presumably injected by patch decorators
    outside this view; ``mock_email_load_config`` is not referenced in the
    body.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config
    etcd_config = test_util.mock_etcd_config()
    mock_load_etcd_config.return_value = etcd_config
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["alert_job_owners"] = True

    mock_get_requests.return_value.json.return_value = {
        "result": "Sorry, something went wrong."}

    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "mock-worker-one": {"instance": "192.168.0.1:9090"},
        "mock-worker-two": {"instance": "192.168.0.2:9090"},
        "mock-worker-three": {"instance": "192.168.0.3:9090"},
        "mock-worker-four": {"instance": "192.168.0.4:9090"},
    }

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    ecc_reboot_node_rule_instance.nodes_ready_for_action = [
        "mock-worker-one",
        "mock-worker-two",
        "mock-worker-three",
        "mock-worker-four",
    ]
    ecc_reboot_node_rule_instance.jobs_ready_for_migration = {
        "87654321-wxyz": {
            "user_name": "user1",
            "vc_name": "vc1",
            "node_names": ["mock-worker-one"],
            "job_link": "/job-link-1",
        },
        # distributed job
        "12345678-abcd": {
            "user_name": "user2",
            "vc_name": "vc1",
            "node_names": ["mock-worker-two", "mock-worker-three"],
            "job_link": "/job-link-2",
        },
    }

    mock_put_requests.return_value.json.return_value = {
        "action": "set",
        "node": {
            "key": "/mock-worker-four/reboot",
            "value": "True",
            "modifiedIndex": 39,
            "createdIndex": 39,
        },
    }

    ecc_reboot_node_rule_instance.take_action()

    ecc_cache = rule_alert_handler_instance.rule_cache["ecc_rule"]
    remaining_nodes = ecc_reboot_node_rule_instance.nodes_ready_for_action

    # assertIn/assertNotIn show the container on failure, unlike
    # assertTrue/assertFalse(x in y).
    # node should be skipped since job migration failed
    self.assertIn("mock-worker-one", ecc_cache)
    self.assertNotIn("reboot_requested", ecc_cache["mock-worker-one"])
    self.assertNotIn("mock-worker-one", remaining_nodes)

    # node should be skipped since job migration failed (distributed job)
    self.assertIn("mock-worker-two", ecc_cache)
    self.assertNotIn("reboot_requested", ecc_cache["mock-worker-two"])
    self.assertNotIn("mock-worker-two", remaining_nodes)

    # node should be skipped since job migration failed (distributed job)
    self.assertIn("mock-worker-three", ecc_cache)
    self.assertNotIn("reboot_requested", ecc_cache["mock-worker-three"])
    self.assertNotIn("mock-worker-three", remaining_nodes)

    # node should be successfully rebooted (had no jobs to migrate)
    self.assertIn("mock-worker-four", ecc_cache)
    self.assertIn("reboot_requested", ecc_cache["mock-worker-four"])
    self.assertIn("mock-worker-four", remaining_nodes)