def process_next_step(self, task, step_type):
    """Start the next clean/deploy step if the previous one is complete.

    :param task: a TaskManager instance
    :param step_type: "clean" or "deploy"
    """
    # Only the deploy path needs action here; cleaning resumes elsewhere.
    if step_type != 'deploy':
        return
    # Resume as soon as the agent heartbeats while in deploy.deploy.
    if self.in_core_deploy_step(task):
        manager_utils.notify_conductor_resume_deploy(task)
def reboot_to_instance(self, task):
    """Set the node to boot from disk, reboot it and resume deployment.

    :param task: a TaskManager instance with the node to act on
    """
    uuid = task.node.uuid
    LOG.info('Ansible complete deploy on node %s', uuid)
    LOG.debug('Rebooting node %s to instance', uuid)

    manager_utils.node_set_boot_device(task, 'disk', persistent=True)
    self.reboot_and_finish_deploy(task)
    task.driver.boot.clean_up_ramdisk(task)

    # TODO(dtantsur): remove these two calls when this function becomes a
    # real deploy step.
    task.process_event('wait')
    manager_utils.notify_conductor_resume_deploy(task)
def _resume_current_operation(self, task):
    """Continue cleaning/deployment of the node.

    For asynchronous operations, it is necessary to notify the
    conductor manager to continue the cleaning/deployment operation
    after a job has finished. This is done through an RPC call. The
    notify_conductor_resume_* wrapper methods provide that.

    :param task: a TaskManager instance with node to act on
    """
    # A populated clean_step means we are mid-cleaning, otherwise we
    # are deploying; pick the matching conductor notification.
    if task.node.clean_step:
        resume = manager_utils.notify_conductor_resume_clean
    else:
        resume = manager_utils.notify_conductor_resume_deploy
    resume(task)
def reboot_to_instance(self, task):
    """Switch the node to boot from disk and finish the deployment.

    :param task: a TaskManager instance with the node to act on
    """
    node = task.node
    LOG.info('Ansible complete deploy on node %s', node.uuid)
    LOG.debug('Rebooting node %s to instance', node.uuid)

    manager_utils.node_set_boot_device(task, 'disk', persistent=True)
    self.reboot_and_finish_deploy(task)
    task.driver.boot.clean_up_ramdisk(task)

    if node.deploy_step:
        # Deploy steps are in use: hand control back to the conductor.
        manager_utils.notify_conductor_resume_deploy(task)
    else:
        # TODO(rloo): delete this branch after deprecation period, when
        # we expect all (out-of-tree) drivers to support deploy steps.
        # After which we will always notify_conductor_resume_deploy().
        task.process_event('done')
        LOG.info('Deployment to node %s done', task.node.uuid)
def reboot_to_instance(self, task):
    """Reboot the node into the freshly deployed instance.

    Sets the boot device to disk, performs the final reboot, cleans up
    the deploy ramdisk and then hands control back to the conductor.

    :param task: a TaskManager instance with the node to act on
    """
    uuid = task.node.uuid
    LOG.info('Ansible complete deploy on node %s', uuid)
    LOG.debug('Rebooting node %s to instance', uuid)

    manager_utils.node_set_boot_device(task, 'disk', persistent=True)
    self.reboot_and_finish_deploy(task)
    task.driver.boot.clean_up_ramdisk(task)

    if not task.node.deploy_step:
        # TODO(rloo): delete this 'if' part after deprecation period, when
        # we expect all (out-of-tree) drivers to support deploy steps.
        # After which we will always notify_conductor_resume_deploy().
        task.process_event('done')
        LOG.info('Deployment to node %s done', task.node.uuid)
    else:
        manager_utils.notify_conductor_resume_deploy(task)
def reboot_and_finish_deploy(self, task):
    """Helper method to trigger reboot on the node and finish deploy.

    This method initiates a reboot on the node. On success, it
    marks the deploy as complete. On failure, it logs the error
    and marks deploy as failure.

    :param task: a TaskManager object containing the node
    :raises: InstanceDeployFailure, if node reboot failed.
    """
    # Retry interval is configured in seconds; retrying wants milliseconds.
    wait = CONF.agent.post_deploy_get_power_state_retry_interval * 1000
    attempts = CONF.agent.post_deploy_get_power_state_retries + 1

    @retrying.retry(
        stop_max_attempt_number=attempts,
        retry_on_result=lambda state: state != states.POWER_OFF,
        wait_fixed=wait
    )
    def _wait_until_powered_off(task):
        # Polls the power driver until POWER_OFF is observed (or the
        # retry budget above is exhausted, which raises from retrying).
        return task.driver.power.get_power_state(task)

    node = task.node

    if CONF.agent.deploy_logs_collect == 'always':
        driver_utils.collect_ramdisk_logs(node)

    # Whether ironic should power off the node via out-of-band or
    # in-band methods
    oob_power_off = strutils.bool_from_string(
        node.driver_info.get('deploy_forces_oob_reboot', False))
    try:
        if not oob_power_off:
            try:
                # In-band: ask the agent to power off, then wait for the
                # power driver to confirm it.
                self._client.power_off(node)
                _wait_until_powered_off(task)
            except Exception as e:
                # Soft power-off failed or timed out; fall back to an
                # out-of-band power off below.
                LOG.warning('Failed to soft power off node %(node_uuid)s '
                            'in at least %(timeout)d seconds. '
                            '%(cls)s: %(error)s',
                            {'node_uuid': node.uuid,
                             'timeout': (wait * (attempts - 1)) / 1000,
                             'cls': e.__class__.__name__,
                             'error': e},
                            exc_info=not isinstance(
                                e, exception.IronicException))
                manager_utils.node_power_action(task, states.POWER_OFF)
        else:
            # Flush the file system prior to hard rebooting the node
            result = self._client.sync(node)
            error = result.get('faultstring')
            if error:
                if 'Unknown command' in error:
                    # Older IPA ramdisks do not implement "sync".
                    error = _('The version of the IPA ramdisk used in '
                              'the deployment do not support the '
                              'command "sync"')
                LOG.warning(
                    'Failed to flush the file system prior to hard '
                    'rebooting the node %(node)s. Error: %(error)s',
                    {'node': node.uuid, 'error': error})

            manager_utils.node_power_action(task, states.POWER_OFF)
    except Exception as e:
        msg = (_('Error rebooting node %(node)s after deploy. '
                 '%(cls)s: %(error)s') %
               {'node': node.uuid, 'cls': e.__class__.__name__,
                'error': e})
        log_and_raise_deployment_error(task, msg, exc=e)

    try:
        # Move the node from the provisioning network to the tenant
        # networks while powered off, then power it back on.
        power_state_to_restore = (
            manager_utils.power_on_node_if_needed(task))
        task.driver.network.remove_provisioning_network(task)
        task.driver.network.configure_tenant_networks(task)
        manager_utils.restore_power_state_if_needed(
            task, power_state_to_restore)
        manager_utils.node_power_action(task, states.POWER_ON)
    except Exception as e:
        msg = (_('Error rebooting node %(node)s after deploy. '
                 '%(cls)s: %(error)s') %
               {'node': node.uuid, 'cls': e.__class__.__name__,
                'error': e})
        # NOTE(mgoddard): Don't collect logs since the node has been
        # powered off.
        log_and_raise_deployment_error(task, msg, collect_logs=False,
                                       exc=e)

    if not node.deploy_step:
        # TODO(rloo): delete this 'if' part after deprecation period, when
        # we expect all (out-of-tree) drivers to support deploy steps.
        # After which we will always notify_conductor_resume_deploy().
        task.process_event('done')
        LOG.info('Deployment to node %s done', task.node.uuid)
    else:
        manager_utils.notify_conductor_resume_deploy(task)
def heartbeat(self, task, callback_url, agent_version):
    """Process a heartbeat.

    Records the agent's URL, version and last-seen time, then dispatches
    on the node's provision state: continue deployment, cleaning or
    rescue as appropriate, or simply record the heartbeat.

    :param task: task to work with.
    :param callback_url: agent HTTP API URL.
    :param agent_version: The version of the agent that is heartbeating
    """
    # NOTE(pas-ha) immediately skip the rest if nothing to do
    if (task.node.provision_state not in self.heartbeat_allowed_states
            and not manager_utils.fast_track_able(task)):
        LOG.error('Heartbeat from node %(node)s in unsupported '
                  'provision state %(state)s, not taking any action.',
                  {'node': task.node.uuid,
                   'state': task.node.provision_state})
        return

    try:
        # We need an exclusive lock to mutate the node below; another
        # holder just means we retry on the next heartbeat.
        task.upgrade_lock()
    except exception.NodeLocked:
        LOG.warning('Node %s is currently locked, skipping heartbeat '
                    'processing (will retry on the next heartbeat)',
                    task.node.uuid)
        return

    node = task.node
    LOG.debug('Heartbeat from node %s', node.uuid)
    driver_internal_info = node.driver_internal_info
    driver_internal_info['agent_url'] = callback_url
    driver_internal_info['agent_version'] = agent_version
    # Record the last heartbeat event time in UTC, so we can make
    # decisions about it later. Can be decoded to datetime object with:
    # datetime.datetime.strptime(var, "%Y-%m-%d %H:%M:%S.%f")
    driver_internal_info['agent_last_heartbeat'] = str(
        timeutils.utcnow().isoformat())
    node.driver_internal_info = driver_internal_info
    node.save()

    if node.provision_state in _HEARTBEAT_RECORD_ONLY:
        # We shouldn't take any additional action. The agent will
        # silently continue to heartbeat to ironic until user initiated
        # state change occurs causing it to match a state below.
        LOG.debug('Heartbeat from %(node)s recorded to identify the '
                  'node as on-line.', {'node': task.node.uuid})
        return

    if node.maintenance:
        return self._heartbeat_in_maintenance(task)

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    # 'msg' is reassigned before each risky call so the except block
    # below can report which operation was in flight when it failed.
    msg = _('Failed checking if deploy is done.')
    try:
        # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
        # are currently in the core deploy.deploy step. Other deploy steps
        # may cause the agent to boot, but we should not trigger deployment
        # at that point if the driver is polling for completion of a step.
        if node.provision_state == states.DEPLOYWAIT:
            if self.in_core_deploy_step(task):
                if not self.deploy_has_started(task):
                    msg = _('Node failed to deploy.')
                    self.continue_deploy(task)
                elif self.deploy_is_done(task):
                    msg = _('Node failed to move to active state.')
                    self.reboot_to_instance(task)
                else:
                    # Deploy is in flight: just keep the node from
                    # timing out.
                    node.touch_provisioning()
            else:
                # The exceptions from RPC are not possible as we using cast
                # here
                # Check if the driver is polling for completion of a step,
                # via the 'deployment_polling' flag.
                polling = node.driver_internal_info.get(
                    'deployment_polling', False)
                if not polling:
                    manager_utils.notify_conductor_resume_deploy(task)
                node.touch_provisioning()
        elif node.provision_state == states.CLEANWAIT:
            node.touch_provisioning()
            if not node.clean_step:
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step.')
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                conductor_steps.set_node_cleaning_steps(task)
                # The exceptions from RPC are not possible as we using cast
                # here
                manager_utils.notify_conductor_resume_clean(task)
            else:
                msg = _('Node failed to check cleaning progress.')
                # Check if the driver is polling for completion of a step,
                # via the 'cleaning_polling' flag.
                polling = node.driver_internal_info.get(
                    'cleaning_polling', False)
                if not polling:
                    self.continue_cleaning(task)
        elif (node.provision_state == states.RESCUEWAIT):
            msg = _('Node failed to perform rescue operation.')
            self._finalize_rescue(task)
    except Exception as e:
        # Route the failure to the error handler matching the node's
        # current operation (cleaning / deploying / rescuing).
        err_info = {'msg': msg, 'e': e}
        last_error = _('Asynchronous exception: %(msg)s '
                       'Exception: %(e)s for node') % err_info
        errmsg = last_error + ' %(node)s'
        LOG.exception(errmsg, {'node': node.uuid})
        if node.provision_state in (states.CLEANING, states.CLEANWAIT):
            manager_utils.cleaning_error_handler(task, last_error)
        elif node.provision_state in (states.DEPLOYING,
                                      states.DEPLOYWAIT):
            deploy_utils.set_failed_state(task, last_error,
                                          collect_logs=bool(self._client))
        elif node.provision_state in (states.RESCUING,
                                      states.RESCUEWAIT):
            manager_utils.rescuing_error_handler(task, last_error)
def reboot_and_finish_deploy(self, task):
    """Helper method to trigger reboot on the node and finish deploy.

    This method initiates a reboot on the node. On success, it
    marks the deploy as complete. On failure, it logs the error
    and marks deploy as failure.

    :param task: a TaskManager object containing the node
    :raises: InstanceDeployFailure, if node reboot failed.
    """
    # Config interval is in seconds; the retrying decorator expects ms.
    wait = CONF.agent.post_deploy_get_power_state_retry_interval * 1000
    attempts = CONF.agent.post_deploy_get_power_state_retries + 1

    @retrying.retry(
        stop_max_attempt_number=attempts,
        retry_on_result=lambda state: state != states.POWER_OFF,
        wait_fixed=wait
    )
    def _wait_until_powered_off(task):
        # Re-queries the power state until POWER_OFF (raises via
        # retrying once 'attempts' checks are used up).
        return task.driver.power.get_power_state(task)

    node = task.node

    if CONF.agent.deploy_logs_collect == 'always':
        driver_utils.collect_ramdisk_logs(node)

    # Whether ironic should power off the node via out-of-band or
    # in-band methods
    oob_power_off = strutils.bool_from_string(
        node.driver_info.get('deploy_forces_oob_reboot', False))
    try:
        if not oob_power_off:
            try:
                # In-band: request a soft power-off through the agent
                # and wait for the power driver to report POWER_OFF.
                self._client.power_off(node)
                _wait_until_powered_off(task)
            except Exception as e:
                # Soft power-off did not complete in time; fall back to
                # a driver-level (out-of-band) power off.
                LOG.warning('Failed to soft power off node %(node_uuid)s '
                            'in at least %(timeout)d seconds. '
                            '%(cls)s: %(error)s',
                            {'node_uuid': node.uuid,
                             'timeout': (wait * (attempts - 1)) / 1000,
                             'cls': e.__class__.__name__,
                             'error': e},
                            exc_info=not isinstance(
                                e, exception.IronicException))
                manager_utils.node_power_action(task, states.POWER_OFF)
        else:
            # Flush the file system prior to hard rebooting the node
            result = self._client.sync(node)
            error = result.get('faultstring')
            if error:
                if 'Unknown command' in error:
                    # Ramdisk predates the "sync" agent command.
                    error = _('The version of the IPA ramdisk used in '
                              'the deployment do not support the '
                              'command "sync"')
                LOG.warning(
                    'Failed to flush the file system prior to hard '
                    'rebooting the node %(node)s. Error: %(error)s',
                    {'node': node.uuid, 'error': error})

            manager_utils.node_power_action(task, states.POWER_OFF)
    except Exception as e:
        msg = (_('Error rebooting node %(node)s after deploy. '
                 '%(cls)s: %(error)s') %
               {'node': node.uuid, 'cls': e.__class__.__name__,
                'error': e})
        log_and_raise_deployment_error(task, msg, exc=e)

    try:
        # While powered off, swap the node from the provisioning
        # network onto the tenant networks, then power it back on.
        power_state_to_restore = (
            manager_utils.power_on_node_if_needed(task))
        task.driver.network.remove_provisioning_network(task)
        task.driver.network.configure_tenant_networks(task)
        manager_utils.restore_power_state_if_needed(
            task, power_state_to_restore)
        manager_utils.node_power_action(task, states.POWER_ON)
    except Exception as e:
        msg = (_('Error rebooting node %(node)s after deploy. '
                 '%(cls)s: %(error)s') %
               {'node': node.uuid, 'cls': e.__class__.__name__,
                'error': e})
        # NOTE(mgoddard): Don't collect logs since the node has been
        # powered off.
        log_and_raise_deployment_error(task, msg, collect_logs=False,
                                       exc=e)

    if not node.deploy_step:
        # TODO(rloo): delete this 'if' part after deprecation period, when
        # we expect all (out-of-tree) drivers to support deploy steps.
        # After which we will always notify_conductor_resume_deploy().
        task.process_event('done')
        LOG.info('Deployment to node %s done', task.node.uuid)
    else:
        manager_utils.notify_conductor_resume_deploy(task)
def _set_success(self, task):
    """Notify the conductor that the current async step succeeded.

    :param task: a TaskManager instance with the node being worked on
    """
    # No clean_step set means we are in a deployment, not a cleaning.
    if not task.node.clean_step:
        manager_utils.notify_conductor_resume_deploy(task)
    else:
        manager_utils.notify_conductor_resume_clean(task)