def test_check_status_no_action_needed(self,
                                       mock_load_rule_config,
                                       mock_email_handler,
                                       mock_load_etcd_config,
                                       mock_load_ecc_config,
                                       mock_request_get,
                                       mock_pod_list):
    """check_status reports nothing to do when no cached node qualifies.

    node1 was flagged two days ago (the reboot window is set to five days)
    and still hosts a job; node2 already carries a ``reboot_requested``
    stamp. Both mocked boot times predate the detection times, and the rule
    is expected to return a falsy status with an empty
    ``nodes_ready_for_action``.

    The ``mock_*`` arguments are mock objects (their ``return_value`` is
    configured below); presumably they are injected by patch decorators that
    are outside this view.
    """
    def as_epoch_string(moment):
        # Boot times are handed to the mocked Prometheus response as
        # stringified UTC epoch seconds.
        return str(moment.replace(tzinfo=timezone.utc).timestamp())

    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config
    mock_load_etcd_config.return_value = test_util.mock_etcd_config()
    ecc_config = test_util.mock_ecc_config()
    ecc_config["days_until_node_reboot"] = 5
    mock_load_ecc_config.return_value = ecc_config

    two_days_back = datetime.utcnow() - timedelta(days=2)
    three_days_back = datetime.utcnow() - timedelta(days=3)
    six_days_back = datetime.utcnow() - timedelta(days=6)

    # The ECC errors were recorded by an earlier iteration of the ECC rule.
    alert_handler = rule_alert_handler.RuleAlertHandler()
    stamp_format = rule_config['date_time_format']
    pending_node = {
        "time_found": two_days_back.strftime(stamp_format),
        "instance": "192.168.0.1",
    }
    # A reboot was already requested for this node, so it must not be
    # selected for action a second time.
    already_requested_node = {
        "time_found": three_days_back.strftime(stamp_format),
        "instance": "192.168.0.2",
        "reboot_requested": two_days_back.strftime(stamp_format),
    }
    alert_handler.rule_cache["ecc_rule"] = {
        "node1": pending_node,
        "node2": already_requested_node,
    }

    # Neither node has booted since its error was first detected.
    boot_times_by_instance = {
        "192.168.0.1": as_epoch_string(three_days_back),
        "192.168.0.2": as_epoch_string(six_days_back),
    }
    prometheus_payload = _mock_prometheus_node_boot_time_response(boot_times_by_instance)
    mock_request_get.return_value.json.return_value = prometheus_payload

    # node1 still has a job running on it.
    running_pods = [
        {
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "node1",
        },
    ]
    mock_pod_list.return_value = test_util.mock_v1_pod_list(running_pods)

    rule = ecc_reboot_node_rule.EccRebootNodeRule(alert_handler, rule_config)
    action_needed = rule.check_status()

    self.assertFalse(action_needed)
    self.assertEqual(0, len(rule.nodes_ready_for_action))
def test_check_status_node_due_for_reboot(self,
                                          mock_load_rule_config,
                                          mock_email_handler,
                                          mock_load_ecc_config,
                                          mock_load_etcd_config,
                                          mock_request_get,
                                          mock_pod_list):
    """check_status flags a node whose reboot deadline has passed.

    node1's ECC error was found five days and one minute ago while
    ``days_until_node_reboot`` is 5, and the mocked boot time (six days ago)
    predates the detection. Even though a job is still running on the node,
    the rule is expected to return a truthy status, list node1 in
    ``nodes_ready_for_action`` and list the job in
    ``jobs_ready_for_migration``.

    The ``mock_*`` arguments are mock objects (their ``return_value`` is
    configured below); presumably they are injected by patch decorators that
    are outside this view.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config
    mock_load_etcd_config.return_value = test_util.mock_etcd_config()
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["days_until_node_reboot"] = 5

    time_six_days_ago = datetime.utcnow() - timedelta(days=6)
    # One extra minute pushes the detection just past the 5-day deadline.
    time_five_days_ago = datetime.utcnow() - timedelta(days=5, minutes=1)

    # ecc error detection occurred in a previous iteration
    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "node1": {
            "time_found": time_five_days_ago.strftime(rule_config['date_time_format']),
            "instance": "192.168.0.1",
        },
    }

    # node is due to be rebooted (exceeded configured deadline) and has not
    # booted since detection, so it should trigger take action
    node_boot_times = {
        "192.168.0.1": str(time_six_days_ago.replace(tzinfo=timezone.utc).timestamp()),
    }
    mock_request_get.return_value.json.return_value = \
        _mock_prometheus_node_boot_time_response(node_boot_times)

    # at least one job running on the node
    mock_pod_list.return_value = test_util.mock_v1_pod_list([
        {
            "job_name": "87654321-wxyz",
            "user_name": "user1",
            "vc_name": "vc1",
            "node_name": "node1",
        },
    ])

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    response = ecc_reboot_node_rule_instance.check_status()

    self.assertTrue(response)
    # assertIn reports the container contents on failure, unlike
    # assertTrue(x in y) which only reports "False is not true".
    self.assertEqual(1, len(ecc_reboot_node_rule_instance.nodes_ready_for_action))
    self.assertIn("node1", ecc_reboot_node_rule_instance.nodes_ready_for_action)
    self.assertEqual(1, len(ecc_reboot_node_rule_instance.jobs_ready_for_migration))
    self.assertIn("87654321-wxyz", ecc_reboot_node_rule_instance.jobs_ready_for_migration)
def test_check_status_node_rebooted_after_detection(self,
                                                    mock_load_rule_config,
                                                    mock_email_handler,
                                                    mock_load_ecc_config,
                                                    mock_load_etcd_config,
                                                    mock_request_get,
                                                    mock_pod_list,
                                                    mock_uncordon_node):
    """check_status drops a node that has booted since its ECC error was found.

    node1's error was found one day ago and the mocked boot time is "now",
    i.e. after the detection. The rule is expected to return a falsy status,
    leave ``nodes_ready_for_action`` empty and remove node1 from the
    ``ecc_rule`` cache.

    The ``mock_*`` arguments are mock objects; presumably they are injected
    by patch decorators that are outside this view. ``mock_uncordon_node`` is
    not referenced in the body.
    NOTE(review): it presumably stubs out the real uncordon call - confirm.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config
    mock_load_etcd_config.return_value = test_util.mock_etcd_config()
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["days_until_node_reboot"] = 5

    time_one_days_ago = datetime.utcnow() - timedelta(days=1)
    now = datetime.utcnow()

    # ecc error detection occurred in a previous iteration
    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "node1": {
            "time_found": time_one_days_ago.strftime(rule_config['date_time_format']),
            "instance": "192.168.0.1",
        },
    }

    # node rebooted *after* initial ecc error detection
    node_boot_times = {
        "192.168.0.1": str(now.replace(tzinfo=timezone.utc).timestamp()),
    }
    mock_request_get.return_value.json.return_value = \
        _mock_prometheus_node_boot_time_response(node_boot_times)

    # no pods are running on the node
    mock_pod_list.return_value = test_util.mock_v1_pod_list([])

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    response = ecc_reboot_node_rule_instance.check_status()

    self.assertFalse(response)
    self.assertEqual(0, len(ecc_reboot_node_rule_instance.nodes_ready_for_action))
    # assertNotIn shows the cache contents if the node was left behind.
    self.assertNotIn("node1", rule_alert_handler_instance.rule_cache["ecc_rule"])
def test_check_status_no_jobs_running(self,
                                      mock_load_rule_config,
                                      mock_email_handler,
                                      mock_load_ecc_config,
                                      mock_load_etcd_config,
                                      mock_request_get,
                                      mock_pod_list):
    """check_status flags an idle node even before its reboot deadline.

    node1's ECC error was found two days ago (``days_until_node_reboot`` is
    5) and the mocked pod list is empty. The rule is expected to return a
    truthy status, list node1 in ``nodes_ready_for_action`` and leave
    ``jobs_ready_for_migration`` empty.

    The ``mock_*`` arguments are mock objects (their ``return_value`` is
    configured below); presumably they are injected by patch decorators that
    are outside this view.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config
    mock_load_etcd_config.return_value = test_util.mock_etcd_config()
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["days_until_node_reboot"] = 5

    time_two_days_ago = datetime.utcnow() - timedelta(days=2)

    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "node1": {
            "time_found": time_two_days_ago.strftime(rule_config['date_time_format']),
            "instance": "192.168.0.1",
        },
    }

    # node not due to be rebooted
    node_boot_times = {
        "192.168.0.1": str(time_two_days_ago.replace(tzinfo=timezone.utc).timestamp()),
    }
    mock_request_get.return_value.json.return_value = \
        _mock_prometheus_node_boot_time_response(node_boot_times)

    # no pods running on node, should trigger take action
    mock_pod_list.return_value = test_util.mock_v1_pod_list([])

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    response = ecc_reboot_node_rule_instance.check_status()

    self.assertTrue(response)
    self.assertEqual(1, len(ecc_reboot_node_rule_instance.nodes_ready_for_action))
    # assertIn reports the container contents on failure, unlike
    # assertTrue(x in y) which only reports "False is not true".
    self.assertIn("node1", ecc_reboot_node_rule_instance.nodes_ready_for_action)
    self.assertEqual(0, len(ecc_reboot_node_rule_instance.jobs_ready_for_migration))
def test_take_action_reboot_failed(self,
                                   mock_load_rule_config,
                                   mock_create_email_for_pause_resume_job,
                                   mock_put_requests,
                                   mock_get_requests,
                                   mock_email_handler,
                                   mock_load_etcd_config,
                                   mock_load_ecc_config):
    """take_action stamps only the node whose reboot request succeeded.

    Three jobs across two nodes are queued for migration and every mocked
    GET (pause / status / resume) reports success. The first mocked PUT
    response is a successful "set" and the second is an error payload. After
    ``take_action`` the test expects three pause/resume e-mails, both nodes
    still in the ``ecc_rule`` cache, ``reboot_requested`` on mock-worker-one
    and no ``reboot_requested`` on mock-worker-three.

    The ``mock_*`` arguments are mock objects; presumably they are injected
    by patch decorators that are outside this view.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config
    mock_load_etcd_config.return_value = test_util.mock_etcd_config()
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["alert_job_owners"] = True

    # Each job consumes three GET responses: pause, status poll, resume.
    mock_get_requests.return_value.json.side_effect = [
        # job 1
        {"result": "Success, job paused."},
        {"errorMsg": None, "jobStatus": "paused", "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT"},
        {"result": "Success, job resumed."},
        # job 2
        {"result": "Success, job paused."},
        {"errorMsg": None, "jobStatus": "paused", "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT"},
        {"result": "Success, job resumed."},
        # job 3
        {"result": "Success, job paused."},
        {"errorMsg": None, "jobStatus": "paused", "jobTime": "Thu, 30 Jan 2020 23:43:00 GMT"},
        {"result": "Success, job resumed."},
    ]

    mock_put_requests.return_value.json.side_effect = [
        {
            "action": "set",
            "node": {
                "key": "/mock-worker-one/reboot",
                "value": "True",
                "modifiedIndex": 39,
                "createdIndex": 39,
            },
        },
        # reboot failed for one of the nodes
        {
            "error_code": 100,
            "message": "Something went wrong",
            "cause": "Unable to open connection",
        },
    ]

    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "mock-worker-one": {"instance": "192.168.0.1:9090"},
        "mock-worker-three": {"instance": "192.168.0.3:9090"},
    }

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    ecc_reboot_node_rule_instance.nodes_ready_for_action = [
        "mock-worker-one", "mock-worker-three"]
    ecc_reboot_node_rule_instance.jobs_ready_for_migration = {
        "87654321-wxyz": {
            "user_name": "user1",
            "vc_name": "vc1",
            "node_names": ["mock-worker-one"],
            "job_link": "/job-link-1",
        },
        "12345678-abcd": {
            "user_name": "user2",
            "vc_name": "vc2",
            "node_names": ["mock-worker-one"],
            "job_link": "/job-link-2",
        },
        "99999999-efgh": {
            "user_name": "user3",
            "vc_name": "vc3",
            "node_names": ["mock-worker-three"],
            "job_link": "/job-link-3",
        },
    }

    ecc_reboot_node_rule_instance.take_action()

    # Read the cache only after take_action so the assertions see its final state.
    ecc_cache = rule_alert_handler_instance.rule_cache["ecc_rule"]

    self.assertEqual(3, mock_create_email_for_pause_resume_job.call_count)
    self.assertEqual(2, len(ecc_cache))

    # reboot successful for this node
    self.assertIn("mock-worker-one", ecc_cache)
    self.assertIn("reboot_requested", ecc_cache["mock-worker-one"])

    # reboot failed for this node
    self.assertIn("mock-worker-three", ecc_cache)
    self.assertNotIn("reboot_requested", ecc_cache["mock-worker-three"])
def test_take_action_pause_failed(self,
                                  mock_load_rule_config,
                                  mock_put_requests,
                                  mock_get_requests,
                                  mock_email_load_config,
                                  mock_load_ecc_config,
                                  mock_load_etcd_config):
    """take_action skips nodes whose jobs could not be paused.

    Every mocked GET returns a failure message, so neither the single-node
    job on mock-worker-one nor the distributed job spanning mock-worker-two
    and mock-worker-three is migrated. The test expects those three nodes to
    stay in the ``ecc_rule`` cache without ``reboot_requested`` and to be
    removed from ``nodes_ready_for_action``. mock-worker-four has no jobs and
    is expected to get ``reboot_requested`` and stay in the list.

    The ``mock_*`` arguments are mock objects; presumably they are injected
    by patch decorators that are outside this view.
    """
    rule_config = test_util.mock_rule_config()
    mock_load_rule_config.return_value = rule_config
    mock_load_etcd_config.return_value = test_util.mock_etcd_config()
    mock_load_ecc_config.return_value = test_util.mock_ecc_config()
    mock_load_ecc_config.return_value["alert_job_owners"] = True

    # Every pause request fails.
    mock_get_requests.return_value.json.return_value = {
        "result": "Sorry, something went wrong."}

    rule_alert_handler_instance = rule_alert_handler.RuleAlertHandler()
    rule_alert_handler_instance.rule_cache["ecc_rule"] = {
        "mock-worker-one": {"instance": "192.168.0.1:9090"},
        "mock-worker-two": {"instance": "192.168.0.2:9090"},
        "mock-worker-three": {"instance": "192.168.0.3:9090"},
        "mock-worker-four": {"instance": "192.168.0.4:9090"},
    }

    ecc_reboot_node_rule_instance = ecc_reboot_node_rule.EccRebootNodeRule(
        rule_alert_handler_instance, rule_config)
    ecc_reboot_node_rule_instance.nodes_ready_for_action = [
        "mock-worker-one",
        "mock-worker-two",
        "mock-worker-three",
        "mock-worker-four",
    ]
    ecc_reboot_node_rule_instance.jobs_ready_for_migration = {
        "87654321-wxyz": {
            "user_name": "user1",
            "vc_name": "vc1",
            "node_names": ["mock-worker-one"],
            "job_link": "/job-link-1",
        },
        # distributed job
        "12345678-abcd": {
            "user_name": "user2",
            "vc_name": "vc1",
            "node_names": ["mock-worker-two", "mock-worker-three"],
            "job_link": "/job-link-2",
        },
    }

    mock_put_requests.return_value.json.return_value = {
        "action": "set",
        "node": {
            "key": "/mock-worker-four/reboot",
            "value": "True",
            "modifiedIndex": 39,
            "createdIndex": 39,
        },
    }

    ecc_reboot_node_rule_instance.take_action()

    # Read both collections only after take_action so the assertions see
    # their final state.
    ecc_cache = rule_alert_handler_instance.rule_cache["ecc_rule"]
    remaining_nodes = ecc_reboot_node_rule_instance.nodes_ready_for_action

    # node should be skipped since job migration failed
    self.assertIn("mock-worker-one", ecc_cache)
    self.assertNotIn("reboot_requested", ecc_cache["mock-worker-one"])
    self.assertNotIn("mock-worker-one", remaining_nodes)

    # node should be skipped since job migration failed (distributed job)
    self.assertIn("mock-worker-two", ecc_cache)
    self.assertNotIn("reboot_requested", ecc_cache["mock-worker-two"])
    self.assertNotIn("mock-worker-two", remaining_nodes)

    # node should be skipped since job migration failed (distributed job)
    self.assertIn("mock-worker-three", ecc_cache)
    self.assertNotIn("reboot_requested", ecc_cache["mock-worker-three"])
    self.assertNotIn("mock-worker-three", remaining_nodes)

    # node should be successfully rebooted (had no jobs to migrate)
    self.assertIn("mock-worker-four", ecc_cache)
    self.assertIn("reboot_requested", ecc_cache["mock-worker-four"])
    self.assertIn("mock-worker-four", remaining_nodes)