def _collect_ramdisk_logs_storage_fail(
        self, expected_exception, mock_collect, mock_store):
    mock_store.side_effect = expected_exception
    logs = 'Gary the Snail'
    mock_collect.return_value = {'command_result': {'system_logs': logs}}
    driver_utils.collect_ramdisk_logs(self.node)
    mock_store.assert_called_once_with(self.node, logs)

def test_collect_ramdisk_logs_IPA_command_fail(
        self, mock_collect, mock_store, mock_log):
    error_str = 'MR. KRABS! I WANNA GO TO BED!'
    mock_collect.return_value = {'faultstring': error_str}
    driver_utils.collect_ramdisk_logs(self.node)
    # assert store was never invoked
    self.assertFalse(mock_store.called)
    mock_log.assert_called_once_with(
        mock.ANY, {'node': self.node.uuid, 'error': error_str})

def set_failed_state(task, msg, collect_logs=True):
    """Sets the deploy status as failed with relevant messages.

    This method sets the deployment as failed with the given message.
    It sets the node's provision_state to DEPLOYFAIL and updates
    last_error with the given error message. It also powers off the
    baremetal node.

    :param task: a TaskManager instance containing the node to act on.
    :param msg: the message to set in last_error of the node.
    :param collect_logs: boolean indicating whether to attempt to collect
                         logs from the IPA-based ramdisk. Defaults to True.
                         Actual log collection is also affected by the
                         CONF.agent.deploy_logs_collect config option.
    """
    node = task.node

    if (collect_logs
            and CONF.agent.deploy_logs_collect in ('on_failure', 'always')):
        driver_utils.collect_ramdisk_logs(node)

    try:
        task.process_event('fail')
    except exception.InvalidState:
        msg2 = (_LE('Internal error. Node %(node)s in provision state '
                    '"%(state)s" could not transition to a failed state.')
                % {'node': node.uuid, 'state': node.provision_state})
        LOG.exception(msg2)

    if CONF.deploy.power_off_after_deploy_failure:
        try:
            manager_utils.node_power_action(task, states.POWER_OFF)
        except Exception:
            msg2 = (_LE('Node %s failed to power off while handling deploy '
                        'failure. This may be a serious condition. Node '
                        'should be removed from Ironic or put in maintenance '
                        'mode until the problem is resolved.') % node.uuid)
            LOG.exception(msg2)
    # NOTE(deva): node_power_action() erases node.last_error
    # so we need to set it here.
    node.last_error = msg
    node.save()

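# NOTE: The helper below is a hypothetical illustration, not part of the
# source, of how a deploy driver's error handler might call
# set_failed_state(); the function name and message text are assumptions.
def _handle_deploy_failure(task, exc):
    msg = ('Deploy failed for node %(node)s: %(err)s'
           % {'node': task.node.uuid, 'err': exc})
    # collect_logs defaults to True, so ramdisk logs are gathered whenever
    # CONF.agent.deploy_logs_collect is 'on_failure' or 'always'.
    set_failed_state(task, msg)
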
def test_collect_ramdisk_logs_storage_command_fail(
        self, mock_collect, mock_store):
    mock_collect.side_effect = exception.IronicException('boom')
    self.assertIsNone(driver_utils.collect_ramdisk_logs(self.node))
    self.assertFalse(mock_store.called)

def test_collect_ramdisk_logs(self, mock_collect, mock_store):
    logs = 'Gary the Snail'
    mock_collect.return_value = {'command_result': {'system_logs': logs}}
    driver_utils.collect_ramdisk_logs(self.node)
    mock_store.assert_called_once_with(self.node, logs)

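# A minimal sketch, not from the source, of how the mock_collect and
# mock_store arguments used by the tests above could be injected; the
# patch targets (agent_client.AgentClient.collect_system_logs and
# driver_utils.store_ramdisk_logs) are assumptions about the internals
# of collect_ramdisk_logs().
@mock.patch.object(driver_utils, 'store_ramdisk_logs', autospec=True)
@mock.patch.object(agent_client.AgentClient, 'collect_system_logs',
                   autospec=True)
def test_collect_ramdisk_logs_sketch(self, mock_collect, mock_store):
    mock_collect.return_value = {'command_result': {'system_logs': 'logs'}}
    driver_utils.collect_ramdisk_logs(self.node)
    mock_store.assert_called_once_with(self.node, 'logs')
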
def do_next_clean_step(task, step_index):
    """Do cleaning, starting from the specified clean step.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first clean step in the list to execute. This
        is the index (from 0) into the list of clean steps in the node's
        driver_internal_info['clean_steps']. Is None if there are no steps
        to execute.
    """
    node = task.node
    # For manual cleaning, the target provision state is MANAGEABLE,
    # whereas for automated cleaning, it is AVAILABLE.
    manual_clean = node.target_provision_state == states.MANAGEABLE
    if step_index is None:
        steps = []
    else:
        steps = node.driver_internal_info['clean_steps'][step_index:]

    LOG.info('Executing %(state)s on node %(node)s, remaining steps: '
             '%(steps)s', {'node': node.uuid, 'steps': steps,
                           'state': node.provision_state})

    # Execute each step until we hit an async step or run out of steps
    for ind, step in enumerate(steps):
        # Save which step we're about to start so we can restart
        # if necessary
        node.clean_step = step
        driver_internal_info = node.driver_internal_info
        driver_internal_info['clean_step_index'] = step_index + ind
        node.driver_internal_info = driver_internal_info
        node.save()
        interface = getattr(task.driver, step.get('interface'))
        LOG.info('Executing %(step)s on node %(node)s',
                 {'step': step, 'node': node.uuid})
        try:
            result = interface.execute_clean_step(task, step)
        except Exception as e:
            if isinstance(e, exception.AgentConnectionFailed):
                if task.node.driver_internal_info.get('cleaning_reboot'):
                    LOG.info('Agent is not yet running on node %(node)s '
                             'after cleaning reboot, waiting for agent to '
                             'come up to run next clean step %(step)s.',
                             {'node': node.uuid, 'step': step})
                    driver_internal_info['skip_current_clean_step'] = False
                    node.driver_internal_info = driver_internal_info
                    target_state = (states.MANAGEABLE if manual_clean
                                    else None)
                    task.process_event('wait', target_state=target_state)
                    return

            msg = (_('Node %(node)s failed step %(step)s: '
                     '%(exc)s') %
                   {'node': node.uuid, 'exc': e,
                    'step': node.clean_step})
            LOG.exception(msg)
            driver_utils.collect_ramdisk_logs(task.node, label='cleaning')
            utils.cleaning_error_handler(task, msg)
            return

        # Check if the step is done or not. The step should return
        # states.CLEANWAIT if the step is still being executed, or
        # None if the step is done.
        if result == states.CLEANWAIT:
            # Kill this worker, the async step will make an RPC call to
            # continue_node_clean to continue cleaning
            LOG.info('Clean step %(step)s on node %(node)s being '
                     'executed asynchronously, waiting for driver.',
                     {'node': node.uuid, 'step': step})
            target_state = states.MANAGEABLE if manual_clean else None
            task.process_event('wait', target_state=target_state)
            return
        elif result is not None:
            msg = (_('While executing step %(step)s on node '
                     '%(node)s, step returned invalid value: %(val)s')
                   % {'step': step, 'node': node.uuid, 'val': result})
            LOG.error(msg)
            return utils.cleaning_error_handler(task, msg)
        LOG.info('Node %(node)s finished clean step %(step)s',
                 {'node': node.uuid, 'step': step})

    if CONF.agent.deploy_logs_collect == 'always':
        driver_utils.collect_ramdisk_logs(task.node, label='cleaning')

    # Clear clean_step
    node.clean_step = None
    driver_internal_info = node.driver_internal_info
    driver_internal_info['clean_steps'] = None
    driver_internal_info.pop('clean_step_index', None)
    driver_internal_info.pop('cleaning_reboot', None)
    driver_internal_info.pop('cleaning_polling', None)
    driver_internal_info.pop('agent_secret_token', None)
    driver_internal_info.pop('agent_secret_token_pregenerated', None)
    # Remove agent_url
    if not utils.fast_track_able(task):
        driver_internal_info.pop('agent_url', None)
    node.driver_internal_info = driver_internal_info
    node.save()
    try:
        task.driver.deploy.tear_down_cleaning(task)
    except Exception as e:
        msg = (_('Failed to tear down from cleaning for node %(node)s, '
                 'reason: %(err)s')
               % {'node': node.uuid, 'err': e})
        LOG.exception(msg)
        return utils.cleaning_error_handler(task, msg,
                                            tear_down_cleaning=False)

    LOG.info('Node %s cleaning complete', node.uuid)
    event = 'manage' if manual_clean or node.retired else 'done'
    # NOTE(rloo): No need to specify target prov. state; we're done
    task.process_event(event)

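# An illustrative sketch, not from the source, of the clean-step
# dictionaries that do_next_clean_step() iterates over; the exact keys
# kept in driver_internal_info['clean_steps'] and the values shown are
# assumptions.
example_clean_steps = [
    {'interface': 'deploy', 'step': 'erase_devices', 'priority': 10},
    {'interface': 'raid', 'step': 'delete_configuration', 'priority': 0},
]
# With step_index=0, do_next_clean_step() would look up task.driver.deploy
# via getattr() for the first entry and call
# interface.execute_clean_step(task, step) on it.
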
def reboot_and_finish_deploy(self, task):
    """Helper method to trigger reboot on the node and finish deploy.

    This method initiates a reboot on the node. On success, it marks
    the deploy as complete. On failure, it logs the error and marks
    the deploy as failed.

    :param task: a TaskManager object containing the node
    :raises: InstanceDeployFailure, if node reboot failed.
    """
    wait = CONF.agent.post_deploy_get_power_state_retry_interval * 1000
    attempts = CONF.agent.post_deploy_get_power_state_retries + 1

    @retrying.retry(
        stop_max_attempt_number=attempts,
        retry_on_result=lambda state: state != states.POWER_OFF,
        wait_fixed=wait)
    def _wait_until_powered_off(task):
        return task.driver.power.get_power_state(task)

    node = task.node

    if CONF.agent.deploy_logs_collect == 'always':
        driver_utils.collect_ramdisk_logs(node)

    # Whether ironic should power off the node via out-of-band or
    # in-band methods
    oob_power_off = strutils.bool_from_string(
        node.driver_info.get('deploy_forces_oob_reboot', False))
    try:
        if not oob_power_off:
            try:
                self._client.power_off(node)
                _wait_until_powered_off(task)
            except Exception as e:
                LOG.warning('Failed to soft power off node %(node_uuid)s '
                            'in at least %(timeout)d seconds. '
                            '%(cls)s: %(error)s',
                            {'node_uuid': node.uuid,
                             'timeout': (wait * (attempts - 1)) / 1000,
                             'cls': e.__class__.__name__, 'error': e},
                            exc_info=not isinstance(
                                e, exception.IronicException))
                manager_utils.node_power_action(task, states.POWER_OFF)
        else:
            # Flush the file system prior to hard rebooting the node
            result = self._client.sync(node)
            error = result.get('faultstring')
            if error:
                if 'Unknown command' in error:
                    error = _('The version of the IPA ramdisk used in '
                              'the deployment does not support the '
                              'command "sync"')
                LOG.warning(
                    'Failed to flush the file system prior to hard '
                    'rebooting the node %(node)s. Error: %(error)s',
                    {'node': node.uuid, 'error': error})

            manager_utils.node_power_action(task, states.POWER_OFF)
    except Exception as e:
        msg = (_('Error rebooting node %(node)s after deploy. '
                 '%(cls)s: %(error)s') %
               {'node': node.uuid, 'cls': e.__class__.__name__,
                'error': e})
        log_and_raise_deployment_error(task, msg, exc=e)

    try:
        power_state_to_restore = (
            manager_utils.power_on_node_if_needed(task))
        task.driver.network.remove_provisioning_network(task)
        task.driver.network.configure_tenant_networks(task)
        manager_utils.restore_power_state_if_needed(
            task, power_state_to_restore)
        manager_utils.node_power_action(task, states.POWER_ON)
    except Exception as e:
        msg = (_('Error rebooting node %(node)s after deploy. '
                 '%(cls)s: %(error)s') %
               {'node': node.uuid, 'cls': e.__class__.__name__,
                'error': e})
        # NOTE(mgoddard): Don't collect logs since the node has been
        # powered off.
        log_and_raise_deployment_error(task, msg, collect_logs=False,
                                       exc=e)

    if not node.deploy_step:
        # TODO(rloo): delete this 'if' part after deprecation period, when
        # we expect all (out-of-tree) drivers to support deploy steps.
        # After which we will always notify_conductor_resume_deploy().
        task.process_event('done')
        LOG.info('Deployment to node %s done', task.node.uuid)
    else:
        manager_utils.notify_conductor_resume_deploy(task)

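# A hypothetical test sketch, not from the source, asserting that
# reboot_and_finish_deploy() collects ramdisk logs when
# [agent]deploy_logs_collect is 'always'; the fixtures used here
# (self.config, task_manager.acquire) and the absence of mocks for the
# power, network, and agent-client collaborators are simplifying
# assumptions.
@mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True)
def test_reboot_and_finish_deploy_collects_logs(self, mock_collect):
    self.config(deploy_logs_collect='always', group='agent')
    with task_manager.acquire(self.context, self.node.uuid) as task:
        task.driver.deploy.reboot_and_finish_deploy(task)
    mock_collect.assert_called_once_with(task.node)
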
def reboot_and_finish_deploy(self, task):
    """Helper method to trigger reboot on the node and finish deploy.

    This method initiates a reboot on the node. On success, it marks
    the deploy as complete. On failure, it logs the error and marks
    the deploy as failed.

    :param task: a TaskManager object containing the node
    :raises: InstanceDeployFailure, if node reboot failed.
    """
    wait = CONF.agent.post_deploy_get_power_state_retry_interval * 1000
    attempts = CONF.agent.post_deploy_get_power_state_retries + 1

    @retrying.retry(
        stop_max_attempt_number=attempts,
        retry_on_result=lambda state: state != states.POWER_OFF,
        wait_fixed=wait)
    def _wait_until_powered_off(task):
        return task.driver.power.get_power_state(task)

    node = task.node

    if CONF.agent.deploy_logs_collect == 'always':
        driver_utils.collect_ramdisk_logs(node)

    # Whether ironic should power off the node via out-of-band or
    # in-band methods
    oob_power_off = strutils.bool_from_string(
        node.driver_info.get('deploy_forces_oob_reboot', False))
    try:
        if not oob_power_off:
            try:
                self._client.power_off(node)
                _wait_until_powered_off(task)
            except Exception as e:
                LOG.warning(
                    _LW('Failed to soft power off node %(node_uuid)s '
                        'in at least %(timeout)d seconds. '
                        'Error: %(error)s'),
                    {'node_uuid': node.uuid,
                     'timeout': (wait * (attempts - 1)) / 1000,
                     'error': e})
                manager_utils.node_power_action(task, states.POWER_OFF)
        else:
            # Flush the file system prior to hard rebooting the node
            result = self._client.sync(node)
            error = result.get('faultstring')
            if error:
                if 'Unknown command' in error:
                    error = _('The version of the IPA ramdisk used in '
                              'the deployment does not support the '
                              'command "sync"')
                LOG.warning(
                    _LW('Failed to flush the file system prior to hard '
                        'rebooting the node %(node)s. Error: %(error)s'),
                    {'node': node.uuid, 'error': error})

            manager_utils.node_power_action(task, states.POWER_OFF)

        task.driver.network.remove_provisioning_network(task)
        task.driver.network.configure_tenant_networks(task)
        manager_utils.node_power_action(task, states.POWER_ON)
    except Exception as e:
        msg = (_('Error rebooting node %(node)s after deploy. '
                 'Error: %(error)s') %
               {'node': node.uuid, 'error': e})
        log_and_raise_deployment_error(task, msg)

    task.process_event('done')
    LOG.info(_LI('Deployment to node %s done'), task.node.uuid)
