def test_is_action_info_stale(self, time_mock):
    """Exercise RecoveryManager.is_action_info_stale at several mocked times.

    The manager is configured with a 60-minute window. The test expects the
    info to be reported as not stale when nothing is recorded, and also at
    t=3600 with lastAttempt=0 and t=3601 with lastAttempt=1; it expects
    stale at t=3602 with lastAttempt=1.
    """
    component = "COMPONENT_NAME"

    def action_info(attempts):
        # lastAttempt, count and lifetimeCount move together in this test.
        return {
            "lastAttempt": attempts,
            "count": attempts,
            "lastReset": 0,
            "lifetimeCount": attempts,
            "warnedLastAttempt": False,
            "warnedLastReset": False,
            "warnedThresholdReached": False,
        }

    rm = RecoveryManager(tempfile.mktemp(), True)
    rm.update_config(5, 60, 5, 16, True, False, "")

    # Nothing recorded for the component yet.
    time_mock.return_value = 0
    self.assertFalse(rm.is_action_info_stale(component))

    rm.actions[component] = action_info(0)
    time_mock.return_value = 3600
    self.assertFalse(rm.is_action_info_stale(component))

    rm.actions[component] = action_info(1)
    time_mock.return_value = 3601
    self.assertFalse(rm.is_action_info_stale(component))

    time_mock.return_value = 3602
    self.assertTrue(rm.is_action_info_stale(component))
def test_defaults(self):
    """A RecoveryManager built with no arguments is disabled and stores no commands."""
    component = "NODEMANAGER"
    rm = RecoveryManager()

    self.assertFalse(rm.enabled())
    for lookup in (rm.get_install_command, rm.get_start_command):
        self.assertEqual(None, lookup(component))

    # Even with mismatched current/desired states no recovery is requested.
    rm.update_current_status(component, "INSTALLED")
    rm.update_desired_status(component, "STARTED")
    self.assertFalse(rm.requires_recovery(component))
def test_is_action_info_stale(self, time_mock):
    """Exercise is_action_info_stale with the nine-argument update_config form.

    The expectations are: not stale when nothing is recorded, not stale at
    t=3600 (lastAttempt=0) and t=3601 (lastAttempt=1), stale at t=3602
    (lastAttempt=1).
    """
    name = "COMPONENT_NAME"
    template = {
        "lastAttempt": 0,
        "count": 0,
        "lastReset": 0,
        "lifetimeCount": 0,
        "warnedLastAttempt": False,
        "warnedLastReset": False,
        "warnedThresholdReached": False,
    }

    rm = RecoveryManager(tempfile.mktemp(), True)
    rm.update_config(5, 60, 5, 16, True, False, False, "", -1)

    # No entry for the component yet.
    time_mock.return_value = 0
    self.assertFalse(rm.is_action_info_stale(name))

    # Entry with zero attempts.
    rm.actions[name] = dict(template)
    time_mock.return_value = 3600
    self.assertFalse(rm.is_action_info_stale(name))

    # Entry with a single attempt made at t=1.
    one_attempt = dict(template)
    one_attempt["lastAttempt"] = 1
    one_attempt["count"] = 1
    one_attempt["lifetimeCount"] = 1
    rm.actions[name] = one_attempt

    time_mock.return_value = 3601
    self.assertFalse(rm.is_action_info_stale(name))

    time_mock.return_value = 3602
    self.assertTrue(rm.is_action_info_stale(name))
def test_defaults(self):
    """A RecoveryManager given only a cache path is disabled and stores no commands."""
    component = "NODEMANAGER"
    rm = RecoveryManager(tempfile.mktemp())

    self.assertFalse(rm.enabled())
    for lookup in (rm.get_install_command, rm.get_start_command):
        self.assertEqual(None, lookup(component))

    # Mismatched current/desired states still do not trigger recovery.
    rm.update_current_status(component, "INSTALLED")
    rm.update_desired_status(component, "STARTED")
    self.assertFalse(rm.requires_recovery(component))
def test_execute_status_command(self, CustomServiceOrchestrator_mock,
                                build_mock, execute_command_mock,
                                requestComponentSecurityState_mock,
                                requestComponentStatus_mock,
                                status_update_callback):
    """Run a status command and check the single component-status report.

    The report is expected to be the mocked build result extended with the
    mocked security state, and the component status must have been requested.
    """
    CustomServiceOrchestrator_mock.return_value = None
    controller = MagicMock()
    queue = ActionQueue(AmbariConfig(), controller)

    build_mock.return_value = {'dummy report': ''}
    controller.recovery_manager = RecoveryManager(tempfile.mktemp())

    requestComponentStatus_mock.reset_mock()
    requestComponentStatus_mock.return_value = {'exitcode': 0}

    requestComponentSecurityState_mock.reset_mock()
    requestComponentSecurityState_mock.return_value = 'UNKNOWN'

    queue.execute_status_command(self.status_command)
    result = queue.result()
    statuses = result['componentStatus']

    self.assertEqual(len(statuses), 1)
    self.assertEqual(statuses[0],
                     {'dummy report': '', 'securityState': 'UNKNOWN'})
    self.assertTrue(requestComponentStatus_mock.called)
def init(self):
    """
    Initialize properties: resolve the agent configuration, then build the
    caches and helper objects that are stored on this instance.
    """
    self.config = AmbariConfig.get_resolved_config()
    self.is_registered = False

    # Every cluster cache is rooted at the same directory.
    cache_dir = self.config.cluster_cache_dir
    self.metadata_cache = ClusterMetadataCache(cache_dir)
    self.topology_cache = ClusterTopologyCache(cache_dir, self.config)
    self.host_level_params_cache = ClusterHostLevelParamsCache(cache_dir)
    self.configurations_cache = ClusterConfigurationCache(cache_dir)
    self.alert_definitions_cache = ClusterAlertDefinitionsCache(cache_dir)

    # The helpers below receive this object (or its config); keep the
    # construction order unchanged.
    self.configuration_builder = ConfigurationBuilder(self)
    self.stale_alerts_monitor = StaleAlertsMonitor(self)
    self.file_cache = FileCache(self.config)
    self.customServiceOrchestrator = CustomServiceOrchestrator(self)
    self.recovery_manager = RecoveryManager(self.config.recovery_cache_dir)
    self.commandStatuses = CommandStatusDict(self)
    self.action_queue = ActionQueue(self)
    self.alert_scheduler_handler = AlertSchedulerHandler(self)
def init(self):
    """
    Initialize properties: build the caches and helper objects stored on this
    instance, then start the threads via init_threads().
    """
    self.is_registered = False

    # Every cluster cache is rooted at the same directory.
    cache_dir = self.config.cluster_cache_dir
    self.metadata_cache = ClusterMetadataCache(cache_dir)
    self.topology_cache = ClusterTopologyCache(cache_dir, self.config)
    self.host_level_params_cache = ClusterHostLevelParamsCache(cache_dir)
    self.configurations_cache = ClusterConfigurationCache(cache_dir)
    self.alert_definitions_cache = ClusterAlertDefinitionsCache(cache_dir)

    # The helpers below receive this object (or its config); keep the
    # construction order unchanged.
    self.configuration_builder = ConfigurationBuilder(self)
    self.stale_alerts_monitor = StaleAlertsMonitor(self)
    self.server_responses_listener = ServerResponsesListener()
    self.file_cache = FileCache(self.config)
    self.customServiceOrchestrator = CustomServiceOrchestrator(self)
    self.recovery_manager = RecoveryManager()
    self.commandStatuses = CommandStatusDict(self)

    self.init_threads()
def test_reset_queue(self, CustomServiceOrchestrator_mock, get_mock,
                     process_command_mock, gpeo_mock):
    """Queue two install commands, reset the queue and verify it is drained.

    After reset() the command queue must be empty with nothing pending, and
    the queue thread must report stopped once stop()/join() have returned.
    """
    CustomServiceOrchestrator_mock.return_value = None

    controller = MagicMock()
    controller.recovery_manager = RecoveryManager(tempfile.mktemp())

    # Parallel execution is reported as 0 through the patched option getter.
    agent_config = MagicMock()
    gpeo_mock.return_value = 0
    agent_config.get_parallel_exec_option = gpeo_mock

    queue = ActionQueue(agent_config, controller)
    queue.start()
    queue.put([self.datanode_install_command, self.hbase_install_command])
    self.assertEqual(2, queue.commandQueue.qsize())
    self.assertTrue(queue.tasks_in_progress_or_pending())

    queue.reset()
    self.assertTrue(queue.commandQueue.empty())
    self.assertFalse(queue.tasks_in_progress_or_pending())

    time.sleep(0.1)
    queue.stop()
    queue.join()
    self.assertEqual(queue.stopped(), True, 'Action queue is not stopped.')
def test_process_commands(self, mock_uds):
    """Check which desired-status updates status/execution commands trigger.

    ``mock_uds`` is the injected mock whose calls are inspected; each step
    feeds commands to the RecoveryManager and verifies the recorded calls.
    """
    started = call("NODEMANAGER", "STARTED")
    installed = call("NODEMANAGER", "INSTALLED")

    rm = RecoveryManager(tempfile.mktemp(), True)

    # Empty inputs must not touch the desired status.
    rm.process_status_commands(None)
    self.assertFalse(mock_uds.called)

    rm.process_status_commands([])
    self.assertFalse(mock_uds.called)

    rm.process_status_commands([self.command])
    mock_uds.assert_has_calls([started])

    # assert_has_calls takes ONE list of expected calls; its second
    # positional parameter is ``any_order``. Passing a second list there
    # silently dropped the second expectation, so both calls now live in a
    # single list.
    mock_uds.reset_mock()
    rm.process_status_commands([self.command, self.exec_command1, self.command])
    mock_uds.assert_has_calls([started, started])

    mock_uds.reset_mock()
    rm.process_execution_commands(
        [self.exec_command1, self.exec_command2, self.exec_command3])
    mock_uds.assert_has_calls([installed, started])

    mock_uds.reset_mock()
    rm.process_execution_commands([self.exec_command1, self.command])
    mock_uds.assert_has_calls([installed])

    # No reset here: the call recorded above is still present.
    rm.process_execution_commands([self.exec_command4])
    mock_uds.assert_has_calls([started])
def test_recovery_required2(self):
    """Check requires_recovery against the last two update_config arguments.

    Each probe sets a component to current=INSTALLED / desired=STARTED and
    asserts whether recovery is required under the component lists given.
    """
    def needs_recovery(manager, component):
        manager.update_current_status(component, "INSTALLED")
        manager.update_desired_status(component, "STARTED")
        return manager.requires_recovery(component)

    def configure(manager, first_list, second_list):
        manager.update_config(15, 5, 1, 16, True, False, first_list, second_list)

    # Both lists empty.
    rm = RecoveryManager(tempfile.mktemp(), True, True)
    configure(rm, "", "")
    self.assertTrue(needs_recovery(rm, "NODEMANAGER"))

    # NODEMANAGER in the first list only.
    rm = RecoveryManager(tempfile.mktemp(), True, True)
    configure(rm, "NODEMANAGER", "")
    self.assertTrue(needs_recovery(rm, "NODEMANAGER"))
    self.assertFalse(needs_recovery(rm, "DATANODE"))

    # NODEMANAGER in the second list only.
    rm = RecoveryManager(tempfile.mktemp(), True, True)
    configure(rm, "", "NODEMANAGER")
    self.assertFalse(needs_recovery(rm, "NODEMANAGER"))
    self.assertTrue(needs_recovery(rm, "DATANODE"))

    # Reconfiguring the same manager: the latest update_config wins.
    configure(rm, "", "NODEMANAGER")
    configure(rm, "NODEMANAGER", "")
    self.assertTrue(needs_recovery(rm, "NODEMANAGER"))
    self.assertFalse(needs_recovery(rm, "DATANODE"))

    # NODEMANAGER in both lists.
    configure(rm, "NODEMANAGER", "NODEMANAGER")
    self.assertTrue(needs_recovery(rm, "NODEMANAGER"))
    self.assertFalse(needs_recovery(rm, "DATANODE"))
def test_configured_for_recovery(self):
    """Verify configured_for_recovery against several component lists.

    Each scenario applies one update_config call and then checks, in order,
    which component names are reported as configured for recovery.
    """
    scenarios = (
        ((12, 5, 1, 15, True, False, "A,B"),
         (("A", True), ("B", True))),
        ((5, 5, 1, 11, True, False, ""),
         (("A", False), ("B", False))),
        ((5, 5, 1, 11, True, False, "A"),
         (("A", True), ("B", False))),
        ((5, 5, 1, 11, True, False, "A"),
         (("A", True), ("B", False), ("C", False))),
        # Whitespace around the names in the list is expected to be ignored.
        ((5, 5, 1, 11, True, False, "A, D, F "),
         (("A", True), ("B", False), ("C", False),
          ("D", True), ("E", False), ("F", True))),
    )

    rm = RecoveryManager(tempfile.mktemp(), True)
    for config_args, expectations in scenarios:
        rm.update_config(*config_args)
        for component, configured in expectations:
            verify = self.assertTrue if configured else self.assertFalse
            verify(rm.configured_for_recovery(component))
def test_recovery_required(self):
    """Walk NODEMANAGER through state pairs and check requires_recovery.

    Each step is (current, desired, expected); a current of None leaves the
    current status untouched for that step. The first manager is created
    with (True, False), the second with (True, True).
    """
    component = "NODEMANAGER"

    def run_steps(manager, steps):
        for current, desired, expected in steps:
            if current is not None:
                manager.update_current_status(component, current)
            manager.update_desired_status(component, desired)
            verify = self.assertTrue if expected else self.assertFalse
            verify(manager.requires_recovery(component))

    rm = RecoveryManager(tempfile.mktemp(), True, False)
    run_steps(rm, (
        ("INSTALLED", "INSTALLED", False),
        (None, "STARTED", True),
        ("STARTED", "INSTALLED", True),
        (None, "STARTED", False),
        # Unknown or empty desired states never require recovery.
        ("INSTALLED", "XYS", False),
        (None, "", False),
        ("INIT", "INSTALLED", True),
        (None, "STARTED", True),
    ))

    rm = RecoveryManager(tempfile.mktemp(), True, True)
    run_steps(rm, (
        ("INIT", "INSTALLED", False),
        ("INIT", "START", False),
        ("INSTALLED", "START", False),
    ))
def test_get_recovery_commands(self, time_mock):
    """Check which recovery command is produced for each state combination.

    The mocked clock values are consumed in order by the manager, so the
    sequence of calls below must not be rearranged.
    """
    node = "NODEMANAGER"
    time_mock.side_effect = [
        1000, 1001, 1002, 1003,
        1100, 1101, 1102,
        1200, 1201, 1203,
        4000, 4001, 4002, 4003,
        4100, 4101, 4102, 4103,
        4200, 4201, 4202,
        4300, 4301, 4302,
    ]

    rm = RecoveryManager(tempfile.mktemp(), True)

    def set_states(current, desired):
        rm.update_current_status(node, current)
        rm.update_desired_status(node, desired)

    def expect_single(role_command):
        found = rm.get_recovery_commands()
        self.assertEqual(1, len(found))
        self.assertEqual(role_command, found[0]["roleCommand"])
        return found

    def expect_none():
        found = rm.get_recovery_commands()
        self.assertEqual(0, len(found))

    rm.update_config(15, 5, 1, 16, True, False, "", "")

    status_command = copy.deepcopy(self.command)
    rm.store_or_update_command(status_command)

    set_states("INSTALLED", "STARTED")
    self.assertEqual("INSTALLED", rm.get_current_status(node))
    self.assertEqual("STARTED", rm.get_desired_status(node))
    expect_single("START")

    set_states("INIT", "STARTED")
    # Starts at 1100
    expect_single("INSTALL")

    set_states("INIT", "INSTALLED")
    # Starts at 1200
    expect_single("INSTALL")

    rm.update_config(2, 5, 1, 5, True, True, "", "")
    set_states("INIT", "INSTALLED")
    expect_none()

    rm.update_config(12, 5, 1, 15, True, False, "", "")
    set_states("INIT", "INSTALLED")
    rm.store_or_update_command(status_command)
    expect_single("INSTALL")

    rm.update_config_staleness(node, False)
    set_states("INSTALLED", "INSTALLED")
    expect_none()

    # Stale configuration with a stored command whose desired state is INSTALLED.
    install_variant = copy.deepcopy(self.command)
    install_variant["desiredState"] = "INSTALLED"
    rm.store_or_update_command(install_variant)
    rm.update_config_staleness(node, True)
    expect_single("INSTALL")

    set_states("STARTED", "STARTED")
    restart = expect_single("CUSTOM_COMMAND")
    self.assertEqual("RESTART", restart[0]["hostLevelParams"]["custom_command"])

    set_states("STARTED", "INSTALLED")
    expect_single("STOP")
def test_auto_execute_command(self, status_update_callback_mock, open_mock):
    """Run an auto-start command on a worker thread and inspect the reports.

    The command is processed twice, once with a 'COMPLETE' result and once
    with a 'FAILED' result; in both cases the test expects
    ``actionQueue.result()['reports']`` to be empty.
    """
    # Make file read calls visible
    def open_side_effect(file, mode):
        if mode == 'r':
            file_mock = MagicMock()
            file_mock.read.return_value = "Read from " + str(file)
            return file_mock
        else:
            # Anything other than a read goes to the real open() saved on the test case.
            return self.original_open(file, mode)
    open_mock.side_effect = open_side_effect

    config = AmbariConfig()
    tempdir = tempfile.gettempdir()
    config.set('agent', 'prefix', tempdir)
    config.set('agent', 'cache_dir', "/var/lib/ambari-agent/cache")
    config.set('agent', 'tolerate_download_failures', "true")
    dummy_controller = MagicMock()
    dummy_controller.recovery_manager = RecoveryManager(tempfile.mktemp())
    dummy_controller.recovery_manager.update_config(5, 5, 1, 11, True, False, "", -1)

    actionQueue = ActionQueue(config, dummy_controller)
    # Event that keeps the mocked runCommand blocked until the test releases it.
    unfreeze_flag = threading.Event()
    # Shared result dict returned by the mocked runCommand; mutated between runs.
    python_execution_result_dict = {
        'stdout': 'out',
        'stderr': 'stderr',
        'structuredOut' : ''
    }

    def side_effect(command, tmpoutfile, tmperrfile, override_output_files=True, retry=False):
        # Block until the main thread sets the event, then hand back the shared dict.
        unfreeze_flag.wait()
        return python_execution_result_dict

    def patched_aq_execute_command(command):
        # We have to perform patching for separate thread in the same thread
        with patch.object(CustomServiceOrchestrator, "runCommand") as runCommand_mock:
            runCommand_mock.side_effect = side_effect
            actionQueue.process_command(command)

    python_execution_result_dict['status'] = 'COMPLETE'
    python_execution_result_dict['exitcode'] = 0
    self.assertFalse(actionQueue.tasks_in_progress_or_pending())
    # We call method in a separate thread
    execution_thread = Thread(target = patched_aq_execute_command ,
                              args = (self.datanode_auto_start_command, ))
    execution_thread.start()
    # check in progress report
    # wait until ready
    while True:
        time.sleep(0.1)
        if actionQueue.tasks_in_progress_or_pending():
            break
    # Continue command execution
    unfreeze_flag.set()
    # wait until ready
    # NOTE(review): check_queue is never set to False; both polling loops
    # below exit only through their break statements.
    check_queue = True
    while check_queue:
        report = actionQueue.result()
        if not actionQueue.tasks_in_progress_or_pending():
            break
        time.sleep(0.1)

    self.assertEqual(len(report['reports']), 0)

    ## Test failed execution
    python_execution_result_dict['status'] = 'FAILED'
    python_execution_result_dict['exitcode'] = 13
    # We call method in a separate thread
    execution_thread = Thread(target = patched_aq_execute_command ,
                              args = (self.datanode_auto_start_command, ))
    execution_thread.start()
    # The event is already set from the first run, so this call is a no-op
    # and the mocked runCommand does not block this time.
    unfreeze_flag.set()
    # check in progress report
    # wait until ready
    while check_queue:
        report = actionQueue.result()
        if not actionQueue.tasks_in_progress_or_pending():
            break
        time.sleep(0.1)

    self.assertEqual(len(report['reports']), 0)
def test_process_commands(self, mock_uds):
    """Check which desired-status updates status/execution commands trigger.

    ``mock_uds`` is the injected mock whose calls are inspected; each step
    feeds commands to the RecoveryManager and verifies the recorded calls.
    """
    started = call("NODEMANAGER", "STARTED")
    installed = call("NODEMANAGER", "INSTALLED")

    rm = RecoveryManager(True)

    # Empty inputs must not touch the desired status.
    rm.process_status_commands(None)
    self.assertFalse(mock_uds.called)

    rm.process_status_commands([])
    self.assertFalse(mock_uds.called)

    rm.process_status_commands([self.command])
    mock_uds.assert_has_calls([started])

    # assert_has_calls takes ONE list of expected calls; its second
    # positional parameter is ``any_order``. Passing a second list there
    # silently dropped the second expectation, so both calls now live in a
    # single list.
    mock_uds.reset_mock()
    rm.process_status_commands(
        [self.command, self.exec_command1, self.command])
    mock_uds.assert_has_calls([started, started])

    mock_uds.reset_mock()
    rm.process_execution_commands(
        [self.exec_command1, self.exec_command2, self.exec_command3])
    mock_uds.assert_has_calls([installed, started])

    mock_uds.reset_mock()
    rm.process_execution_commands([self.exec_command1, self.command])
    mock_uds.assert_has_calls([installed])
def test_configured_for_recovery(self):
    """Verify configured_for_recovery for several pairs of component lists.

    The first scenario runs before any update_config call; each later one
    applies the given arguments first. Expectations are checked in order.
    """
    scenarios = (
        (None,
         (("A", True), ("B", True))),
        ((5, 5, 1, 11, True, False, "", ""),
         (("A", True), ("B", True))),
        ((5, 5, 1, 11, True, False, "A", ""),
         (("A", True), ("B", False))),
        ((5, 5, 1, 11, True, False, "", "B,C"),
         (("A", True), ("B", False), ("C", False))),
        ((5, 5, 1, 11, True, False, "A, D, F ", "B,C"),
         (("A", True), ("B", False), ("C", False),
          ("D", True), ("E", False), ("F", True))),
    )

    rm = RecoveryManager(True)
    for config_args, expectations in scenarios:
        if config_args is not None:
            rm.update_config(*config_args)
        for component, configured in expectations:
            verify = self.assertTrue if configured else self.assertFalse
            verify(rm.configured_for_recovery(component))
def test_reset_if_window_passed_since_last_attempt(self, time_mock):
    """lastReset must move only once the window has elapsed since the last attempt.

    The mocked clock yields 1000, 1071 and 1372 for the three execute()
    calls, and the manager is configured with a 5-minute window. The second
    attempt is expected to keep lastReset at 1000; the third, 301 seconds
    after the second, is expected to move it to 1372.
    """
    time_mock.side_effect = [1000, 1071, 1372]

    rm = RecoveryManager(tempfile.mktemp(), True)
    rm.update_config(2, 5, 1, 4, True, True, "", "")

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    rm.execute("COMPONENT")
    actions = rm.get_actions_copy()["COMPONENT"]
    self.assertEqual(actions['lastReset'], 1000)

    rm.execute("COMPONENT")
    actions = rm.get_actions_copy()["COMPONENT"]
    self.assertEqual(actions['lastReset'], 1000)

    # reset if window_in_sec seconds passed since last attempt
    rm.execute("COMPONENT")
    actions = rm.get_actions_copy()["COMPONENT"]
    self.assertEqual(actions['lastReset'], 1372)
def test_configured_for_recovery(self):
    """Verify configured_for_recovery for several pairs of component lists.

    The defaults are checked first; after that each update_config call is
    followed by in-order checks of which names are configured for recovery.
    """
    rm = RecoveryManager(tempfile.mktemp(), True)

    def check(*expectations):
        for component, configured in expectations:
            verify = self.assertTrue if configured else self.assertFalse
            verify(rm.configured_for_recovery(component))

    def reconfigure(first_list, second_list):
        rm.update_config(5, 5, 1, 11, True, False, first_list, second_list)

    # Defaults, before any update_config call.
    check(("A", True), ("B", True))

    reconfigure("", "")
    check(("A", True), ("B", True))

    reconfigure("A", "")
    check(("A", True), ("B", False))

    reconfigure("", "B,C")
    check(("A", True), ("B", False), ("C", False))

    reconfigure("A, D, F ", "B,C")
    check(("A", True), ("B", False), ("C", False),
          ("D", True), ("E", False), ("F", True))
def test_command_count(self):
    """has_active_command stays True until every started command is stopped."""
    rm = RecoveryManager(tempfile.mktemp(), True)
    self.assertFalse(rm.has_active_command())

    # (action, expected has_active_command() afterwards)
    transitions = (
        (rm.start_execution_command, True),
        (rm.start_execution_command, True),
        (rm.stop_execution_command, True),
        (rm.stop_execution_command, False),
    )
    for action, still_active in transitions:
        action()
        verify = self.assertTrue if still_active else self.assertFalse
        verify(rm.has_active_command())
def test_command_expiry(self, time_mock):
    """A stored command stops yielding recovery commands once it is stale.

    The mocked clock values are consumed in order, so the call sequence
    below must not be rearranged.
    """
    node = "NODEMANAGER"
    time_mock.side_effect = [
        1000, 1001, 1002, 1003, 1104, 1105, 1106,
        1807, 1808, 1809, 1810, 1811, 1812,
    ]

    rm = RecoveryManager(tempfile.mktemp(), True)
    rm.update_config(5, 5, 1, 11, True, False, "", "")

    def expect_single_start():
        found = rm.get_recovery_commands()
        self.assertEqual(1, len(found))
        self.assertEqual("START", found[0]["roleCommand"])

    stored = copy.deepcopy(self.command)
    rm.store_or_update_command(stored)

    rm.update_current_status(node, "INSTALLED")
    rm.update_desired_status(node, "STARTED")

    expect_single_start()
    expect_single_start()

    #1807 command is stale
    found = rm.get_recovery_commands()
    self.assertEqual(0, len(found))

    # Storing the command again makes recovery commands available once more.
    rm.store_or_update_command(stored)
    expect_single_start()
def test_recovery_report(self, time_mock):
    """Check the summary and per-component reports from get_recovery_status.

    The expected summary moves DISABLED -> RECOVERABLE ->
    PARTIALLY_RECOVERABLE -> UNRECOVERABLE as PUMA and then LION reach four
    attempts. The mocked clock values are consumed in order by execute().
    """
    time_mock.side_effect = \
        [1000, 1071, 1072, 1470, 1471, 1472, 1543, 1644, 1815]

    rm = RecoveryManager(tempfile.mktemp())

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    rec_st = rm.get_recovery_status()
    self.assertEqual(rec_st, {"summary": "DISABLED"})

    rm.update_config(2, 5, 1, 4, True, True, "", "")
    rec_st = rm.get_recovery_status()
    self.assertEqual(rec_st, {"summary": "RECOVERABLE", "componentReports": []})

    rm.execute("PUMA")
    rec_st = rm.get_recovery_status()
    self.assertEqual(rec_st, {"summary": "RECOVERABLE",
                              "componentReports": [{"name": "PUMA", "numAttempts": 1, "limitReached": False}]})

    rm.execute("PUMA")
    rm.execute("LION")

    rec_st = rm.get_recovery_status()
    self.assertEqual(rec_st, {"summary": "RECOVERABLE",
                              "componentReports": [
                                  {"name": "LION", "numAttempts": 1, "limitReached": False},
                                  {"name": "PUMA", "numAttempts": 2, "limitReached": False}
                              ]})

    rm.execute("PUMA")
    rm.execute("LION")
    rm.execute("PUMA")
    rm.execute("PUMA")
    rm.execute("LION")
    rec_st = rm.get_recovery_status()
    self.assertEqual(rec_st, {"summary": "PARTIALLY_RECOVERABLE",
                              "componentReports": [
                                  {"name": "LION", "numAttempts": 3, "limitReached": False},
                                  {"name": "PUMA", "numAttempts": 4, "limitReached": True}
                              ]})

    rm.execute("LION")
    rec_st = rm.get_recovery_status()
    self.assertEqual(rec_st, {"summary": "UNRECOVERABLE",
                              "componentReports": [
                                  {"name": "LION", "numAttempts": 4, "limitReached": True},
                                  {"name": "PUMA", "numAttempts": 4, "limitReached": True}
                              ]})
def test_update_rm_config(self, mock_uc):
    """Map registration payloads to the update_config call they must produce.

    Each case pairs a registration response with the expected call on
    ``mock_uc``; the mock is reset between cases.
    """
    cases = (
        (None,
         call(6, 60, 5, 12, False, True, "", "")),
        ({},
         call(6, 60, 5, 12, False, True, "", "")),
        ({"recoveryConfig": {"type": "DEFAULT"}},
         call(6, 60, 5, 12, False, True, "", "")),
        ({"recoveryConfig": {"type": "FULL"}},
         call(6, 60, 5, 12, True, False, "", "")),
        # "max_count" is not the key used for the count ("maxCount" is), so
        # the defaults are expected here.
        ({"recoveryConfig": {"type": "AUTO_START", "max_count": "med"}},
         call(6, 60, 5, 12, True, True, "", "")),
        ({"recoveryConfig": {"type": "AUTO_START",
                             "maxCount": "5",
                             "windowInMinutes": 20,
                             "retryGap": 2,
                             "maxLifetimeCount": 5,
                             "enabledComponents": " A,B",
                             "disabledComponents": "C"}},
         call(5, 20, 2, 5, True, True, " A,B", "C")),
    )

    rm = RecoveryManager(tempfile.mktemp())
    for position, (registration, expected_call) in enumerate(cases):
        if position:
            mock_uc.reset_mock()
        rm.update_configuration_from_registration(registration)
        mock_uc.assert_has_calls([expected_call])
def test_get_recovery_commands(self, time_mock):
    """Check the recovery command produced for each current/desired state pair.

    The mocked clock values are consumed in order, so the call sequence
    below must not be rearranged.
    """
    node = "NODEMANAGER"
    time_mock.side_effect = [1000, 2000, 3000, 4000, 5000, 6000]

    rm = RecoveryManager(True)
    rm.update_config(10, 5, 1, 11, True, False)

    def set_states(current, desired):
        rm.update_current_status(node, current)
        rm.update_desired_status(node, desired)

    def expect_single(role_command):
        found = rm.get_recovery_commands()
        self.assertEqual(1, len(found))
        self.assertEqual(role_command, found[0]["roleCommand"])

    stored = copy.deepcopy(self.command)
    rm.store_or_update_command(stored)

    set_states("INSTALLED", "STARTED")
    expect_single("START")

    set_states("INIT", "STARTED")
    expect_single("INSTALL")

    set_states("INIT", "INSTALLED")
    expect_single("INSTALL")

    # With the reconfigured manager no command is expected for INIT -> INSTALLED.
    rm.update_config(2, 5, 1, 5, True, True)
    set_states("INIT", "INSTALLED")
    found = rm.get_recovery_commands()
    self.assertEqual(0, len(found))
def test_sliding_window(self, time_mock):
    """Exercise enablement rules and the attempt window of RecoveryManager.

    The first part checks which update_config argument combinations leave
    the manager enabled. The second part drives may_execute()/execute()
    against a mocked clock; the "T = ..." comments name the clock values
    each group of calls is expected to consume, so the call order must not
    change.
    """
    time_mock.side_effect = \
        [1000, 1001, 1002, 1003, 1004, 1071, 1150, 1151, 1152, 1153, 1400, 1401,
         1500, 1571, 1572, 1653, 1900, 1971, 2300, 2301]

    rm = RecoveryManager(tempfile.mktemp(), True, False)
    self.assertTrue(rm.enabled())

    # The result of update_config was bound to an unused local here; the
    # call is kept and the dead variable dropped.
    rm.update_config(0, 60, 5, 12, True, False, "")
    self.assertFalse(rm.enabled())

    rm.update_config(6, 60, 5, 12, True, False, "")
    self.assertTrue(rm.enabled())

    rm.update_config(6, 0, 5, 12, True, False, "")
    self.assertFalse(rm.enabled())

    rm.update_config(6, 60, 0, 12, True, False, "")
    self.assertFalse(rm.enabled())

    rm.update_config(6, 60, 1, 12, True, False, None)
    self.assertTrue(rm.enabled())

    # retry gap (61) larger than the window (60)
    rm.update_config(6, 60, 61, 12, True, False, None)
    self.assertFalse(rm.enabled())

    # lifetime count (4) smaller than the per-window count (6)
    rm.update_config(6, 60, 5, 4, True, False, "")
    self.assertFalse(rm.enabled())

    # maximum 2 attempts per 5-minute window, at least 1 minute between
    # attempts, lifetime maximum 4
    rm.update_config(2, 5, 1, 4, True, False, "")
    self.assertTrue(rm.enabled())

    # T = 1000-2
    self.assertTrue(rm.may_execute("NODEMANAGER"))
    self.assertTrue(rm.may_execute("NODEMANAGER"))
    self.assertTrue(rm.may_execute("NODEMANAGER"))

    # T = 1003-4
    self.assertTrue(rm.execute("NODEMANAGER"))
    self.assertFalse(rm.execute("NODEMANAGER"))  # too soon

    # T = 1071
    self.assertTrue(rm.execute("NODEMANAGER"))  # 60+ seconds passed

    # T = 1150-3
    self.assertFalse(rm.execute("NODEMANAGER"))  # limit 2 exceeded
    self.assertFalse(rm.may_execute("NODEMANAGER"))
    self.assertTrue(rm.execute("DATANODE"))
    self.assertTrue(rm.may_execute("NAMENODE"))

    # T = 1400-1
    self.assertTrue(rm.execute("NODEMANAGER"))  # windows reset
    self.assertFalse(rm.may_execute("NODEMANAGER"))  # too soon

    # same 2-per-5-minutes window and 1-minute gap, lifetime maximum raised
    # to 5 and the sixth argument switched to True
    # NOTE(review): the sixth argument is presumably the auto-start-only
    # flag -- confirm against RecoveryManager.update_config.
    rm.update_config(2, 5, 1, 5, True, True, "")

    # T = 1500-3
    self.assertTrue(rm.execute("NODEMANAGER2"))
    self.assertTrue(rm.may_execute("NODEMANAGER2"))
    self.assertTrue(rm.execute("NODEMANAGER2"))
    self.assertFalse(rm.execute("NODEMANAGER2"))  # max limit

    # T = 1900-2
    self.assertTrue(rm.execute("NODEMANAGER2"))
    self.assertTrue(rm.execute("NODEMANAGER2"))

    # T = 2300-2
    # lifetime max reached
    self.assertTrue(rm.execute("NODEMANAGER2"))
    self.assertFalse(rm.execute("NODEMANAGER2"))
def test_recovery_required(self):
    """Walk NODEMANAGER through state pairs and check requires_recovery.

    The first manager is created with (True, False), the second with
    (True, True); for the second one no state pair is expected to require
    recovery.
    """
    node = "NODEMANAGER"

    rm = RecoveryManager(True, False)

    rm.update_current_status(node, "INSTALLED")
    rm.update_desired_status(node, "INSTALLED")
    self.assertFalse(rm.requires_recovery(node))

    rm.update_desired_status(node, "STARTED")
    self.assertTrue(rm.requires_recovery(node))

    rm.update_current_status(node, "STARTED")
    rm.update_desired_status(node, "INSTALLED")
    self.assertTrue(rm.requires_recovery(node))

    rm.update_desired_status(node, "STARTED")
    self.assertFalse(rm.requires_recovery(node))

    # Unknown or empty desired states never require recovery.
    rm.update_current_status(node, "INSTALLED")
    for unknown_state in ("XYS", ""):
        rm.update_desired_status(node, unknown_state)
        self.assertFalse(rm.requires_recovery(node))

    # From INIT both INSTALLED and STARTED require recovery.
    rm.update_current_status(node, "INIT")
    for target_state in ("INSTALLED", "STARTED"):
        rm.update_desired_status(node, target_state)
        self.assertTrue(rm.requires_recovery(node))

    rm = RecoveryManager(True, True)
    state_pairs = (
        ("INIT", "INSTALLED"),
        ("INIT", "START"),
        ("INSTALLED", "START"),
    )
    for current, desired in state_pairs:
        rm.update_current_status(node, current)
        rm.update_desired_status(node, desired)
        self.assertFalse(rm.requires_recovery(node))
def test_store_from_status_and_use(self):
    """Store a status command and check the install/start commands derived from it.

    Also covers removal of the stored command and that no commands are
    handed out while the manager is paused.
    """
    rm = RecoveryManager(tempfile.mktemp(), True)

    command1 = copy.deepcopy(self.command)

    rm.store_or_update_command(command1)
    self.assertTrue(rm.command_exists("NODEMANAGER", "EXECUTION_COMMAND"))

    install_command = rm.get_install_command("NODEMANAGER")
    start_command = rm.get_start_command("NODEMANAGER")

    self.assertEqual("INSTALL", install_command["roleCommand"])
    self.assertEqual("START", start_command["roleCommand"])
    self.assertEqual("AUTO_EXECUTION_COMMAND", install_command["commandType"])
    self.assertEqual("AUTO_EXECUTION_COMMAND", start_command["commandType"])
    self.assertEqual("NODEMANAGER", install_command["role"])
    self.assertEqual("NODEMANAGER", start_command["role"])
    # assertEqual replaces the deprecated assertEquals alias (removed in
    # Python 3.12), matching every other assertion in this test.
    self.assertEqual(install_command["configurations"], start_command["configurations"])

    self.assertEqual(2, install_command["taskId"])
    self.assertEqual(3, start_command["taskId"])

    # Nothing was stored for an unrelated component.
    self.assertEqual(None, rm.get_install_command("component2"))
    self.assertEqual(None, rm.get_start_command("component2"))

    # The second removal reports that nothing was left to remove.
    self.assertTrue(rm.remove_command("NODEMANAGER"))
    self.assertFalse(rm.remove_command("NODEMANAGER"))

    self.assertEqual(None, rm.get_install_command("NODEMANAGER"))
    self.assertEqual(None, rm.get_start_command("NODEMANAGER"))

    self.assertEqual(None, rm.get_install_command("component2"))
    self.assertEqual(None, rm.get_start_command("component2"))

    rm.store_or_update_command(command1)
    self.assertTrue(rm.command_exists("NODEMANAGER", "EXECUTION_COMMAND"))
    # While paused, stored commands are not handed out.
    rm.set_paused(True)

    self.assertEqual(None, rm.get_install_command("NODEMANAGER"))
    self.assertEqual(None, rm.get_start_command("NODEMANAGER"))
def test_recovery_required(self):
    """Walk NODEMANAGER through state pairs and check requires_recovery.

    Each step is (current, desired, expected); a current of None leaves the
    current status untouched. The first manager is configured via
    update_config and update_recovery_config; the second is created with
    True as its second argument and is used unconfigured.
    """
    component = "NODEMANAGER"

    def run_steps(manager, steps):
        for current, desired, expected in steps:
            if current is not None:
                manager.update_current_status(component, current)
            manager.update_desired_status(component, desired)
            verify = self.assertTrue if expected else self.assertFalse
            verify(manager.requires_recovery(component))

    recovery_config = {
        'recoveryConfig': {
            'components': [{
                'component_name': 'NODEMANAGER',
                'service_name': 'YARN',
                'desired_state': 'INSTALLED'
            }]
        }
    }

    rm = RecoveryManager(MagicMock(), False)
    rm.update_config(12, 5, 1, 15, True, False, False)
    rm.update_recovery_config(recovery_config)
    run_steps(rm, (
        ("INSTALLED", "INSTALLED", False),
        (None, "STARTED", True),
        ("STARTED", "INSTALLED", True),
        (None, "STARTED", False),
        # Unknown or empty desired states never require recovery.
        ("INSTALLED", "XYS", False),
        (None, "", False),
        ("INIT", "INSTALLED", True),
        (None, "STARTED", True),
    ))

    rm = RecoveryManager(MagicMock(), True)
    run_steps(rm, (
        ("INIT", "INSTALLED", False),
        ("INIT", "START", False),
        ("INSTALLED", "START", False),
    ))
def test_process_commands(self, mock_uds):
    """Check which desired-status updates status and execution commands cause.

    Args:
        mock_uds: mock that records the ``(component, state)`` updates;
            presumably the patched ``update_desired_status`` (the decorator
            is outside this block -- confirm).
    """
    rm = RecoveryManager(tempfile.mktemp(), True)

    # Empty input must not produce any update.
    rm.process_status_commands(None)
    self.assertFalse(mock_uds.called)

    rm.process_status_commands([])
    self.assertFalse(mock_uds.called)

    rm.process_status_commands([self.command])
    mock_uds.assert_has_calls([call("NODEMANAGER", "STARTED")])

    mock_uds.reset_mock()

    rm.process_status_commands(
        [self.command, self.exec_command1, self.command])
    # All expected calls go in ONE list: the second positional parameter of
    # assert_has_calls is ``any_order``, so a second list passed there is
    # read as a truthy flag and its calls are never checked.
    mock_uds.assert_has_calls([
        call("NODEMANAGER", "STARTED"),
        call("NODEMANAGER", "STARTED"),
    ])

    mock_uds.reset_mock()

    rm.update_config(12, 5, 1, 15, True, False, "NODEMANAGER")
    rm.process_execution_commands(
        [self.exec_command1, self.exec_command2, self.exec_command3])
    mock_uds.assert_has_calls([
        call("NODEMANAGER", "INSTALLED"),
        call("NODEMANAGER", "STARTED"),
    ])

    mock_uds.reset_mock()

    rm.process_execution_commands([self.exec_command1, self.command])
    mock_uds.assert_has_calls([call("NODEMANAGER", "INSTALLED")])

    rm.process_execution_commands([self.exec_command4])
    mock_uds.assert_has_calls([call("NODEMANAGER", "STARTED")])
def test_update_rm_config(self, mock_uc):
    """Check the arguments forwarded for various registration payloads.

    Each case feeds one payload to ``update_configuration_from_registration``
    and asserts the call recorded on ``mock_uc``.

    Args:
        mock_uc: mock that records the forwarded configuration call;
            presumably the patched ``update_config`` (the decorator is
            outside this block -- confirm).
    """
    manager = RecoveryManager(tempfile.mktemp())

    defaults = call(6, 60, 5, 12, False, True, "")
    cases = [
        # A missing, empty or DEFAULT-typed payload gives the same call.
        (None, defaults),
        ({}, defaults),
        ({"recoveryConfig": {"type": "DEFAULT"}}, defaults),
        ({"recoveryConfig": {"type": "FULL"}},
         call(6, 60, 5, 12, True, False, "")),
        # "max_count" is not one of the keys that changes the call (the last
        # case uses "maxCount"), so the numeric values stay as above.
        ({"recoveryConfig": {"type": "AUTO_START", "max_count": "med"}},
         call(6, 60, 5, 12, True, True, "")),
        ({
            "recoveryConfig": {
                "type": "AUTO_START",
                "maxCount": "5",
                "windowInMinutes": 20,
                "retryGap": 2,
                "maxLifetimeCount": 5,
                "components": " A,B"
            }
        }, call(5, 20, 2, 5, True, True, " A,B")),
    ]

    for position, (registration, expected_call) in enumerate(cases):
        if position:
            # Forget the previous case's call before the next one.
            mock_uc.reset_mock()
        manager.update_configuration_from_registration(registration)
        mock_uc.assert_has_calls([expected_call])
def test_recovery_required2(self):
    """Check that only components named in the config require recovery.

    Every probe reports current status INSTALLED and desired status STARTED.
    The expected result depends on whether the component name was passed as
    the last ``update_config`` argument.
    """

    def needs_recovery(manager, component):
        # Report the INSTALLED -> STARTED mismatch, then ask the manager.
        manager.update_current_status(component, "INSTALLED")
        manager.update_desired_status(component, "STARTED")
        return manager.requires_recovery(component)

    def new_manager(components):
        manager = RecoveryManager(tempfile.mktemp(), True, True)
        manager.update_config(15, 5, 1, 16, True, False, components)
        return manager

    manager = new_manager("NODEMANAGER")
    self.assertTrue(needs_recovery(manager, "NODEMANAGER"))

    manager = new_manager("NODEMANAGER")
    self.assertTrue(needs_recovery(manager, "NODEMANAGER"))
    self.assertFalse(needs_recovery(manager, "DATANODE"))

    # With an empty component string nothing requires recovery.
    manager = new_manager("")
    self.assertFalse(needs_recovery(manager, "NODEMANAGER"))
    self.assertFalse(needs_recovery(manager, "DATANODE"))

    # Reconfiguring the same manager turns recovery on for NODEMANAGER only.
    manager.update_config(15, 5, 1, 16, True, False, "NODEMANAGER")
    self.assertTrue(needs_recovery(manager, "NODEMANAGER"))
    self.assertFalse(needs_recovery(manager, "DATANODE"))
def test_command_expiry(self, time_mock):
    """Check that a stored command stops being offered, then returns.

    Two polls return a START command, the third returns nothing, and after
    the command is stored again a START command is returned once more.

    Args:
        time_mock: mock whose ``side_effect`` supplies the successive
            timestamps; presumably a patched clock (the decorator is outside
            this block -- confirm).
    """
    time_mock.side_effect = [
        1000, 1001, 1002, 1003, 1104, 1105, 1106,
        1807, 1808, 1809, 1810, 1811, 1812,
    ]

    manager = RecoveryManager(tempfile.mktemp(), True)
    manager.update_config(5, 5, 1, 11, True, False, "")

    stored_command = copy.deepcopy(self.command)
    manager.store_or_update_command(stored_command)

    manager.update_config(12, 5, 1, 15, True, False, "NODEMANAGER")
    manager.update_current_status("NODEMANAGER", "INSTALLED")
    manager.update_desired_status("NODEMANAGER", "STARTED")

    def expect_single_start():
        recovery = manager.get_recovery_commands()
        self.assertEqual(1, len(recovery))
        self.assertEqual("START", recovery[0]["roleCommand"])

    expect_single_start()
    expect_single_start()

    # At 1807 the stored command is stale, so nothing is returned.
    self.assertEqual(0, len(manager.get_recovery_commands()))

    # Storing the command again makes it available once more.
    manager.store_or_update_command(stored_command)
    expect_single_start()
def test_get_recovery_commands(self, time_mock):
    """Check which recovery command is produced for each state combination.

    Covers START, INSTALL, a RESTART custom command and STOP. Also covers the
    two cases where no command is expected: a config with an empty component
    string, and current status equal to desired status with no stale config.

    Args:
        time_mock: mock whose ``side_effect`` supplies the successive
            timestamps; presumably a patched clock (the decorator is outside
            this block -- confirm).
    """
    time_mock.side_effect = [
        1000, 1001, 1002, 1003,
        1100, 1101, 1102,
        1200, 1201, 1203,
        4000, 4001, 4002, 4003,
        4100, 4101, 4102, 4103,
        4200, 4201, 4202,
        4300, 4301, 4302,
    ]

    manager = RecoveryManager(tempfile.mktemp(), True)
    manager.update_config(15, 5, 1, 16, True, False, "")

    stored_command = copy.deepcopy(self.command)
    manager.store_or_update_command(stored_command)

    def set_states(current, desired):
        manager.update_current_status("NODEMANAGER", current)
        manager.update_desired_status("NODEMANAGER", desired)

    def single_command(role_command):
        # Exactly one recovery command with the given role command.
        recovery = manager.get_recovery_commands()
        self.assertEqual(1, len(recovery))
        self.assertEqual(role_command, recovery[0]["roleCommand"])
        return recovery[0]

    def expect_no_commands():
        self.assertEqual(0, len(manager.get_recovery_commands()))

    manager.update_config(12, 5, 1, 15, True, False, "NODEMANAGER")
    set_states("INSTALLED", "STARTED")
    self.assertEqual("INSTALLED", manager.get_current_status("NODEMANAGER"))
    self.assertEqual("STARTED", manager.get_desired_status("NODEMANAGER"))
    single_command("START")

    set_states("INIT", "STARTED")
    # Starts at 1100
    single_command("INSTALL")

    set_states("INIT", "INSTALLED")
    # Starts at 1200
    single_command("INSTALL")

    # Empty component string: no command is produced.
    manager.update_config(2, 5, 1, 5, True, True, "")
    set_states("INIT", "INSTALLED")
    expect_no_commands()

    manager.update_config(12, 5, 1, 15, True, False, "NODEMANAGER")
    set_states("INIT", "INSTALLED")
    manager.store_or_update_command(stored_command)
    single_command("INSTALL")

    # Current equals desired and the config is not stale: nothing to do.
    manager.update_config_staleness("NODEMANAGER", False)
    set_states("INSTALLED", "INSTALLED")
    expect_no_commands()

    # Same states, but the config is now flagged stale.
    install_variant = copy.deepcopy(self.command)
    install_variant["desiredState"] = "INSTALLED"
    manager.store_or_update_command(install_variant)
    manager.update_config_staleness("NODEMANAGER", True)
    single_command("INSTALL")

    set_states("STARTED", "STARTED")
    restart = single_command("CUSTOM_COMMAND")
    self.assertEqual("RESTART", restart["hostLevelParams"]["custom_command"])

    set_states("STARTED", "INSTALLED")
    single_command("STOP")
def test_sliding_window(self, time_mock):
    """Check config validation and the attempt throttling over time.

    The first part feeds configurations to ``update_config`` and checks
    whether the manager reports itself enabled. The second part walks a
    mocked timeline and checks ``may_execute`` / ``execute`` against the
    retry gap, the per-window limit and the lifetime limit.

    Args:
        time_mock: mock whose ``side_effect`` supplies the successive
            timestamps; presumably a patched clock (the decorator is outside
            this block -- confirm).
    """
    time_mock.side_effect = [
        1000, 1001, 1002, 1003, 1004, 1071, 1150, 1151, 1152, 1153,
        1400, 1401, 1500, 1571, 1572, 1653, 1900, 1971, 2300, 2301,
    ]

    manager = RecoveryManager(tempfile.mktemp(), True, False)
    self.assertTrue(manager.enabled())

    # (update_config arguments, expected enabled() result)
    config_checks = [
        ((0, 60, 5, 12, True, False, "", ""), False),
        ((6, 60, 5, 12, True, False, "", ""), True),
        ((6, 0, 5, 12, True, False, "", ""), False),
        ((6, 60, 0, 12, True, False, "", ""), False),
        ((6, 60, 1, 12, True, False, None, None), True),
        ((6, 60, 61, 12, True, False, "", None), False),
        ((6, 60, 5, 0, True, False, None, ""), False),
        ((6, 60, 5, 4, True, False, "", ""), False),
        # Config used for the first timeline below: limit of 2 attempts and
        # at least 1 minute between attempts.
        ((2, 5, 1, 4, True, False, "", ""), True),
    ]
    for settings, should_enable in config_checks:
        manager.update_config(*settings)
        verdict = self.assertTrue if should_enable else self.assertFalse
        verdict(manager.enabled())

    # T = 1000-2
    self.assertTrue(manager.may_execute("NODEMANAGER"))
    self.assertTrue(manager.may_execute("NODEMANAGER"))
    self.assertTrue(manager.may_execute("NODEMANAGER"))

    # T = 1003-4
    self.assertTrue(manager.execute("NODEMANAGER"))
    self.assertFalse(manager.execute("NODEMANAGER"))  # too soon

    # T = 1071
    self.assertTrue(manager.execute("NODEMANAGER"))  # 60+ seconds passed

    # T = 1150-3
    self.assertFalse(manager.execute("NODEMANAGER"))  # limit 2 exceeded
    self.assertFalse(manager.may_execute("NODEMANAGER"))
    # Other components are not affected by NODEMANAGER's limit.
    self.assertTrue(manager.execute("DATANODE"))
    self.assertTrue(manager.may_execute("NAMENODE"))

    # T = 1400-1
    self.assertTrue(manager.execute("NODEMANAGER"))  # windows reset
    self.assertFalse(manager.may_execute("NODEMANAGER"))  # too soon

    # Second config: limit of 2 attempts again, larger lifetime count.
    manager.update_config(2, 5, 1, 5, True, True, "", "")

    # T = 1500-3
    self.assertTrue(manager.execute("NODEMANAGER2"))
    self.assertTrue(manager.may_execute("NODEMANAGER2"))
    self.assertTrue(manager.execute("NODEMANAGER2"))
    self.assertFalse(manager.execute("NODEMANAGER2"))  # max limit

    # T = 1900-2
    self.assertTrue(manager.execute("NODEMANAGER2"))
    self.assertTrue(manager.execute("NODEMANAGER2"))

    # T = 2300-2
    # lifetime max reached
    self.assertTrue(manager.execute("NODEMANAGER2"))
    self.assertFalse(manager.execute("NODEMANAGER2"))
def test_recovery_report(self, time_mock):
    """Check the recovery status report as attempts accumulate.

    The summary is expected to go DISABLED -> RECOVERABLE ->
    PARTIALLY_RECOVERABLE -> UNRECOVERABLE while PUMA and then LION reach
    four attempts. Component reports are compared as a list with LION
    before PUMA.

    Args:
        time_mock: mock whose ``side_effect`` supplies the successive
            timestamps; presumably a patched clock (the decorator is outside
            this block -- confirm).
    """
    time_mock.side_effect = \
        [1000, 1071, 1072, 1470, 1471, 1472, 1543, 1644, 1815]

    rm = RecoveryManager(tempfile.mktemp())

    # No config has been applied yet.
    rec_st = rm.get_recovery_status()
    self.assertEqual(rec_st, {"summary": "DISABLED"})

    rm.update_config(2, 5, 1, 4, True, True, "")
    rec_st = rm.get_recovery_status()
    self.assertEqual(rec_st, {"summary": "RECOVERABLE",
                              "componentReports": []})

    rm.execute("PUMA")
    rec_st = rm.get_recovery_status()
    self.assertEqual(rec_st, {
        "summary": "RECOVERABLE",
        "componentReports": [
            {"name": "PUMA", "numAttempts": 1, "limitReached": False},
        ],
    })

    rm.execute("PUMA")
    rm.execute("LION")

    rec_st = rm.get_recovery_status()
    self.assertEqual(rec_st, {
        "summary": "RECOVERABLE",
        "componentReports": [
            {"name": "LION", "numAttempts": 1, "limitReached": False},
            {"name": "PUMA", "numAttempts": 2, "limitReached": False},
        ],
    })

    rm.execute("PUMA")
    rm.execute("LION")
    rm.execute("PUMA")
    rm.execute("PUMA")
    rm.execute("LION")

    # PUMA has reached its limit; LION has not.
    rec_st = rm.get_recovery_status()
    self.assertEqual(rec_st, {
        "summary": "PARTIALLY_RECOVERABLE",
        "componentReports": [
            {"name": "LION", "numAttempts": 3, "limitReached": False},
            {"name": "PUMA", "numAttempts": 4, "limitReached": True},
        ],
    })

    rm.execute("LION")

    # Every reported component has now reached its limit.
    rec_st = rm.get_recovery_status()
    self.assertEqual(rec_st, {
        "summary": "UNRECOVERABLE",
        "componentReports": [
            {"name": "LION", "numAttempts": 4, "limitReached": True},
            {"name": "PUMA", "numAttempts": 4, "limitReached": True},
        ],
    })
def test_recovery_required2(self):
    """Check that only components listed in the config require recovery.

    Every probe reports current status INSTALLED and desired status STARTED.
    The expected result depends on whether the component appears in the
    component list passed as the last ``update_config`` argument.
    """

    def nodemanager_only():
        # Build a fresh list per call so no list object is shared between
        # separate update_config invocations.
        return [
            {'component_name': 'NODEMANAGER', 'service_name': 'YARN',
             'desired_state': 'INSTALLED'}
        ]

    def needs_recovery(manager, component):
        manager.update_current_status(component, "INSTALLED")
        manager.update_desired_status(component, "STARTED")
        return manager.requires_recovery(component)

    manager = RecoveryManager(True, True)
    manager.update_config(15, 5, 1, 16, True, False, False, nodemanager_only())
    self.assertTrue(needs_recovery(manager, "NODEMANAGER"))

    manager = RecoveryManager(True, True)
    manager.update_config(15, 5, 1, 16, True, False, False, nodemanager_only())
    self.assertTrue(needs_recovery(manager, "NODEMANAGER"))
    self.assertFalse(needs_recovery(manager, "DATANODE"))

    # With "" instead of a component list nothing requires recovery.
    manager = RecoveryManager(True, True)
    manager.update_config(15, 5, 1, 16, True, False, False, "")
    self.assertFalse(needs_recovery(manager, "NODEMANAGER"))
    self.assertFalse(needs_recovery(manager, "DATANODE"))

    # Reconfiguring the same manager turns recovery on for NODEMANAGER only.
    manager.update_config(15, 5, 1, 16, True, False, False, nodemanager_only())
    self.assertTrue(needs_recovery(manager, "NODEMANAGER"))
    self.assertFalse(needs_recovery(manager, "DATANODE"))
def test_update_rm_config(self, mock_uc):
    """Check the arguments forwarded for various recovery-config payloads.

    Each case feeds one payload to ``update_recovery_config`` and asserts
    the call recorded on ``mock_uc``.

    Args:
        mock_uc: mock that records the forwarded configuration call;
            presumably the patched ``update_config`` (the decorator is
            outside this block -- confirm).
    """
    manager = RecoveryManager()

    cases = [
        # A missing, empty or DEFAULT-typed payload gives the same call.
        (None,
         call(6, 60, 5, 12, False, False, False, [])),
        ({},
         call(6, 60, 5, 12, False, False, False, [])),
        ({"recoveryConfig": {"type": "DEFAULT"}},
         call(6, 60, 5, 12, False, False, False, [])),
        ({"recoveryConfig": {"type": "FULL"}},
         call(6, 60, 5, 12, True, False, False, [])),
        # "max_count" is not one of the keys that changes the call (the last
        # case uses "maxCount"), so the numeric values stay as above.
        ({"recoveryConfig": {"type": "AUTO_START", "max_count": "med"}},
         call(6, 60, 5, 12, True, True, False, [])),
        ({"recoveryConfig": {"type": "AUTO_INSTALL_START",
                             "max_count": "med"}},
         call(6, 60, 5, 12, True, False, True, [])),
        ({"recoveryConfig": {
            "type": "AUTO_START",
            "maxCount": "5",
            "windowInMinutes": 20,
            "retryGap": 2,
            "maxLifetimeCount": 5,
            "components": [
                {"service_name": "A", "component_name": "A",
                 "desired_state": "INSTALLED"},
                {"service_name": "B", "component_name": "B",
                 "desired_state": "INSTALLED"},
            ],
            "recoveryTimestamp": 1}},
         call(5, 20, 2, 5, True, True, False, [
             {'component_name': 'A', 'service_name': 'A',
              'desired_state': 'INSTALLED'},
             {'component_name': 'B', 'service_name': 'B',
              'desired_state': 'INSTALLED'},
         ])),
    ]

    for position, (recovery_config, expected_call) in enumerate(cases):
        if position:
            # Forget the previous case's call before the next one.
            mock_uc.reset_mock()
        manager.update_recovery_config(recovery_config)
        mock_uc.assert_has_calls([expected_call])
def test_reset_if_window_passed_since_last_attempt(self, time_mock):
    """Check when the ``lastReset`` value of a component's action record moves.

    With timestamps 1000, 1071 and 1372, ``lastReset`` is expected to stay
    at 1000 after the first two attempts and become 1372 after the third.

    Args:
        time_mock: mock whose ``side_effect`` supplies the successive
            timestamps; presumably a patched clock (the decorator is outside
            this block -- confirm).
    """
    time_mock.side_effect = [1000, 1071, 1372]

    rm = RecoveryManager(tempfile.mktemp(), True)
    rm.update_config(2, 5, 1, 4, True, True, "")

    rm.execute("COMPONENT")
    actions = rm.get_actions_copy()["COMPONENT"]
    # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(actions['lastReset'], 1000)

    rm.execute("COMPONENT")
    actions = rm.get_actions_copy()["COMPONENT"]
    self.assertEqual(actions['lastReset'], 1000)

    # reset if window_in_sec seconds passed since last attempt
    rm.execute("COMPONENT")
    actions = rm.get_actions_copy()["COMPONENT"]
    self.assertEqual(actions['lastReset'], 1372)
def test_command_expiry(self, time_mock):
    """Check that recovery commands are withheld once, then offered again.

    Two polls return a START command. After ``retry_gap_in_sec`` is raised
    to 60, the next poll returns nothing and the poll after that returns
    START again.

    Args:
        time_mock: mock whose ``side_effect`` supplies the successive
            timestamps; presumably a patched clock (the decorator is outside
            this block -- confirm).
    """
    time_mock.side_effect = \
        [1000, 1001, 1104, 1105, 1106, 1807, 1808, 1809, 1810, 1811, 1812]

    rm = RecoveryManager(True)
    rm.update_config(5, 5, 0, 11, True, False, False, "")

    # The former ``command1`` deep copy was only used by a commented-out
    # store_or_update_command call, so both have been removed.
    rm.update_config(12, 5, 1, 15, True, False, False, [
        {'component_name': 'NODEMANAGER', 'service_name': 'YARN',
         'desired_state': 'INSTALLED'}
    ])
    rm.update_current_status("NODEMANAGER", "INSTALLED")
    rm.update_desired_status("NODEMANAGER", "STARTED")

    commands = rm.get_recovery_commands()
    self.assertEqual(1, len(commands))
    self.assertEqual("START", commands[0]["roleCommand"])

    commands = rm.get_recovery_commands()
    self.assertEqual(1, len(commands))
    self.assertEqual("START", commands[0]["roleCommand"])

    # Set the attribute directly rather than through update_config.
    rm.retry_gap_in_sec = 60

    # 1807: command is stale
    commands = rm.get_recovery_commands()
    self.assertEqual(0, len(commands))

    commands = rm.get_recovery_commands()
    self.assertEqual(1, len(commands))
    self.assertEqual("START", commands[0]["roleCommand"])
def test_build_long_result(self, result_mock):
    """Build a heartbeat from a canned result and compare the whole payload.

    ``result_mock`` returns four command reports and two component statuses.
    The heartbeat built with response id 10 must carry them unchanged, plus
    a HEALTHY node status, a DISABLED recovery report and a recovery
    timestamp of -1.

    Args:
        result_mock: mock whose return value is the canned result;
            presumably the patched ``ActionQueue.result`` (the decorator is
            outside this block -- confirm).
    """
    config = AmbariConfig.AmbariConfig()
    config.set('agent', 'prefix', 'tmp')
    config.set('agent', 'cache_dir', "/var/lib/ambari-agent/cache")
    config.set('agent', 'tolerate_download_failures', "true")

    dummy_controller = MagicMock()
    dummy_controller.recovery_manager = RecoveryManager(tempfile.mktemp())
    actionQueue = ActionQueue(config, dummy_controller)

    result_mock.return_value = {
        'reports': [
            {'status': 'IN_PROGRESS',
             'stderr': 'Read from /tmp/errors-3.txt',
             'stdout': 'Read from /tmp/output-3.txt',
             'clusterName': u'cc',
             'roleCommand': u'INSTALL',
             'serviceName': u'HDFS',
             'role': u'DATANODE',
             'actionId': '1-1',
             'taskId': 3,
             'exitCode': 777},
            {'status': 'COMPLETED',
             'stderr': 'stderr',
             'stdout': 'out',
             'clusterName': 'clusterName',
             'roleCommand': 'UPGRADE',
             'serviceName': 'serviceName',
             'role': 'role',
             'actionId': 17,
             'taskId': 'taskId',
             'exitCode': 0},
            {'status': 'FAILED',
             'stderr': 'stderr',
             'stdout': 'out',
             'clusterName': u'cc',
             'roleCommand': u'INSTALL',
             'serviceName': u'HDFS',
             'role': u'DATANODE',
             'actionId': '1-1',
             'taskId': 3,
             'exitCode': 13},
            {'status': 'COMPLETED',
             'stderr': 'stderr',
             'stdout': 'out',
             'clusterName': u'cc',
             'configurationTags': {'global': {'tag': 'v1'}},
             'roleCommand': u'INSTALL',
             'serviceName': u'HDFS',
             'role': u'DATANODE',
             'actionId': '1-1',
             'taskId': 3,
             'exitCode': 0},
        ],
        'componentStatus': [
            {'status': 'HEALTHY', 'componentName': 'DATANODE'},
            {'status': 'UNHEALTHY', 'componentName': 'NAMENODE'},
        ],
    }

    heartbeat = Heartbeat(actionQueue)
    hb = heartbeat.build(10)
    # Overwrite the two fields with fixed values so the full comparison
    # below does not depend on what build() put there.
    hb['hostname'] = 'hostname'
    hb['timestamp'] = 'timestamp'

    expected = {
        'nodeStatus': {'status': 'HEALTHY', 'cause': 'NONE'},
        'recoveryReport': {'summary': 'DISABLED'},
        'recoveryTimestamp': -1,
        'timestamp': 'timestamp',
        'hostname': 'hostname',
        'responseId': 10,
        'reports': [
            {'status': 'IN_PROGRESS',
             'roleCommand': u'INSTALL',
             'serviceName': u'HDFS',
             'role': u'DATANODE',
             'actionId': '1-1',
             'stderr': 'Read from /tmp/errors-3.txt',
             'stdout': 'Read from /tmp/output-3.txt',
             'clusterName': u'cc',
             'taskId': 3,
             'exitCode': 777},
            {'status': 'COMPLETED',
             'roleCommand': 'UPGRADE',
             'serviceName': 'serviceName',
             'role': 'role',
             'actionId': 17,
             'stderr': 'stderr',
             'stdout': 'out',
             'clusterName': 'clusterName',
             'taskId': 'taskId',
             'exitCode': 0},
            {'status': 'FAILED',
             'roleCommand': u'INSTALL',
             'serviceName': u'HDFS',
             'role': u'DATANODE',
             'actionId': '1-1',
             'stderr': 'stderr',
             'stdout': 'out',
             'clusterName': u'cc',
             'taskId': 3,
             'exitCode': 13},
            {'status': 'COMPLETED',
             'stdout': 'out',
             'configurationTags': {'global': {'tag': 'v1'}},
             'taskId': 3,
             'exitCode': 0,
             'roleCommand': u'INSTALL',
             'clusterName': u'cc',
             'serviceName': u'HDFS',
             'role': u'DATANODE',
             'actionId': '1-1',
             'stderr': 'stderr'},
        ],
        'componentStatus': [
            {'status': 'HEALTHY', 'componentName': 'DATANODE'},
            {'status': 'UNHEALTHY', 'componentName': 'NAMENODE'},
        ],
    }

    # Show the complete diff on failure. ``self.assertEqual.__self__`` is
    # just ``self``, so the attribute is set directly.
    self.maxDiff = None
    self.assertEqual(hb, expected)
def test_configured_for_recovery(self):
    """Check ``configured_for_recovery`` against the configured component list.

    After each ``update_config`` call, only the components named in the list
    passed as its last argument are expected to be reported as configured.
    """

    def entry(name):
        # Component descriptor; service name equals the component name.
        return {'component_name': name, 'service_name': name,
                'desired_state': 'INSTALLED'}

    manager = RecoveryManager(True)

    # (leading numeric arguments, configured names, [(probe, expected), ...])
    scenarios = [
        ((12, 5, 1, 15), ["A", "B"],
         [("A", True), ("B", True)]),
        ((5, 5, 1, 11), [],
         [("A", False), ("B", False)]),
        ((5, 5, 1, 11), ["A"],
         [("A", True), ("B", False)]),
        ((5, 5, 1, 11), ["A"],
         [("A", True), ("B", False), ("C", False)]),
        ((5, 5, 1, 11), ["A", "D", "F"],
         [("A", True), ("B", False), ("C", False),
          ("D", True), ("E", False), ("F", True)]),
    ]

    for limits, configured, probes in scenarios:
        arguments = limits + (True, False, False,
                              [entry(name) for name in configured])
        manager.update_config(*arguments)
        for name, expected in probes:
            check = self.assertTrue if expected else self.assertFalse
            check(manager.configured_for_recovery(name))