def unrescue(self, task):
    """Bring a rescued node back to its normal, active state.

    Powers the node off, removes the rescue environment through
    ``self.clean_up``, configures the tenant networks again, lets the boot
    interface prepare the instance and finally powers the node on.

    :param task: a TaskManager instance.
    :raises: NetworkError if the rescue ports cannot be removed.
    :raises: InvalidParameterValue when the wrong power state is specified
        or the wrong driver info is specified for power management.
    :raises: other exceptions by the node's power driver if something
        wrong occurred during the power action.
    :raises: any boot interface's prepare_instance exceptions.
    :returns: states.ACTIVE
    """
    manager_utils.node_power_action(task, states.POWER_OFF)

    # NOTE(TheJulia): Revealing that the power is off at any time can
    # cause external power sync to decide that the node must be off.
    # This may result in a post-rescued instance being turned off
    # unexpectedly after unrescue.
    # TODO(TheJulia): Once we have power/state callbacks to nova,
    # the reset of the power_state can be removed.
    rescued_node = task.node
    rescued_node.power_state = states.POWER_ON
    rescued_node.save()

    self.clean_up(task)

    # The tenant network change is bracketed by the conditional power-on
    # helper and its matching restore call.
    saved_power_state = manager_utils.power_on_node_if_needed(task)
    task.driver.network.configure_tenant_networks(task)
    manager_utils.restore_power_state_if_needed(task, saved_power_state)

    task.driver.boot.prepare_instance(task)
    manager_utils.node_power_action(task, states.POWER_ON)
    return states.ACTIVE
def deploy(self, task):
    """Set up the node to boot its instance and power it on.

    A ``configdrive`` entry in the node's instance_info is not used by
    this method; its presence is only reported with a warning. The node
    is powered off, tenant networks are configured, the boot interface
    prepares the instance and the node is powered on.

    :param task: a TaskManager instance containing the node to act on.
    :returns: None
    """
    if 'configdrive' in task.node.instance_info:
        # Log the node UUID, as the LOG.info call below does, instead of
        # formatting the whole node object into the message.
        LOG.warning('A configuration drive is present with '
                    'in the deployment request of node %(node)s. '
                    'The configuration drive will be ignored for '
                    'this deployment.',
                    {'node': task.node.uuid})
    manager_utils.node_power_action(task, states.POWER_OFF)
    # Tenant networks must enable connectivity to the boot
    # location, as reboot() can otherwise be very problematic.
    # IDEA(TheJulia): Maybe a "trusted environment" mode flag
    # that we otherwise fail validation on for drivers that
    # require explicit security postures.
    power_state_to_restore = manager_utils.power_on_node_if_needed(task)
    task.driver.network.configure_tenant_networks(task)
    manager_utils.restore_power_state_if_needed(
        task, power_state_to_restore)

    # calling boot.prepare_instance will also set the node
    # to PXE boot, and update PXE templates accordingly
    task.driver.boot.prepare_instance(task)

    # Power-on the instance, with PXE prepared, we're done.
    manager_utils.node_power_action(task, states.POWER_ON)
    LOG.info('Deployment setup for node %s done', task.node.uuid)
    return None
def prepare(self, task):
    """Prepare the deployment environment for this task's node.

    Nodes that are ACTIVE or ADOPTING only get their instance boot
    prepared. For a node in DEPLOYING the provisioning network and
    storage volumes are set up first; the deploy ramdisk is then prepared
    unless the image is not written by this driver or the deployment is
    fast tracked.

    :param task: a TaskManager instance containing the node to act on.
    :raises: NetworkError: if the previous cleaning ports cannot be
        removed or if new cleaning ports cannot be created.
    :raises: InvalidParameterValue when the wrong power state is specified
        or the wrong driver info is specified for power management.
    :raises: StorageError If the storage driver is unable to attach the
        configured volumes.
    :raises: other exceptions by the node's power driver if something
        wrong occurred during the power action.
    :raises: any boot interface's prepare_ramdisk exceptions.
    """
    node = task.node
    deploy_utils.populate_storage_driver_internal_info(task)

    if node.provision_state in (states.ACTIVE, states.ADOPTING):
        task.driver.boot.prepare_instance(task)
        return

    if node.provision_state == states.DEPLOYING:
        fast_tracked = manager_utils.is_fast_track(task)
        if not fast_tracked:
            # Adding the node to provisioning network so that the dhcp
            # options get added for the provisioning port.
            manager_utils.node_power_action(task, states.POWER_OFF)
        else:
            # The agent has already recently checked in and we are
            # configured to take that as an indicator that we can
            # skip ahead.
            LOG.debug('The agent for node %(node)s has recently '
                      'checked in, and the node power will remain '
                      'unmodified.',
                      {'node': task.node.uuid})

        # NOTE(vdrok): in case of rebuild, we have tenant network
        # already configured, unbind tenant ports if present
        if task.driver.storage.should_write_image(task):
            if not fast_tracked:
                saved_power_state = (
                    manager_utils.power_on_node_if_needed(task))
            task.driver.network.unconfigure_tenant_networks(task)
            task.driver.network.add_provisioning_network(task)
            if not fast_tracked:
                manager_utils.restore_power_state_if_needed(
                    task, saved_power_state)

        task.driver.storage.attach_volumes(task)

        if (not task.driver.storage.should_write_image(task)
                or fast_tracked):
            # We have nothing else to do as this is handled in the
            # backend storage system, and we can return to the caller
            # as we do not need to boot the agent to deploy.
            # Alternatively, we are in a fast track deployment
            # and have nothing else to do.
            return

    ramdisk_options = deploy_utils.build_agent_options(node)
    task.driver.boot.prepare_ramdisk(task, ramdisk_options)
def tear_down_inband_cleaning(task, manage_boot=True):
    """Tear down the environment that was set up for in-band cleaning.

    Steps performed:

    1. Power off the bare metal node, unless the node is fast tracked or
       its fault is a cleaning failure.
    2. When ``manage_boot`` is true, call the boot interface's
       ``clean_up_ramdisk`` to remove the agent ramdisk boot environment.
    3. Remove the cleaning network that was set up for cleaning.

    :param task: a TaskManager object containing the node
    :param manage_boot: If True, the boot interface's 'clean_up_ramdisk'
        method is called. If False, that step is skipped.
    :raises: NetworkError, NodeCleaningFailure if the cleaning ports
        cannot be removed.
    """
    fast_track = manager_utils.is_fast_track(task)
    cleaning_failure = task.node.fault == faults.CLEAN_FAILURE
    # In either case the node's power state is left alone.
    leave_power_untouched = fast_track or cleaning_failure

    if not leave_power_untouched:
        manager_utils.node_power_action(task, states.POWER_OFF)

    if manage_boot:
        task.driver.boot.clean_up_ramdisk(task)

    saved_power_state = manager_utils.power_on_node_if_needed(task)
    task.driver.network.remove_cleaning_network(task)
    if not leave_power_untouched:
        manager_utils.restore_power_state_if_needed(
            task, saved_power_state)
def deploy(self, task):
    """Set up the node to boot its instance and power it on.

    A ``configdrive`` entry in the node's instance_info is not used by
    this method; its presence is only reported with a warning. The node
    is powered off, tenant networks are configured, the boot interface
    prepares the instance and the node is powered on.

    :param task: a TaskManager instance containing the node to act on.
    :returns: None
    """
    if 'configdrive' in task.node.instance_info:
        # Log the node UUID, as the LOG.info call below does, instead of
        # formatting the whole node object into the message.
        LOG.warning(
            'A configuration drive is present with '
            'in the deployment request of node %(node)s. '
            'The configuration drive will be ignored for '
            'this deployment.', {'node': task.node.uuid})
    manager_utils.node_power_action(task, states.POWER_OFF)
    # Tenant networks must enable connectivity to the boot
    # location, as reboot() can otherwise be very problematic.
    # IDEA(TheJulia): Maybe a "trusted environment" mode flag
    # that we otherwise fail validation on for drivers that
    # require explicit security postures.
    power_state_to_restore = manager_utils.power_on_node_if_needed(task)
    task.driver.network.configure_tenant_networks(task)
    manager_utils.restore_power_state_if_needed(task,
                                                power_state_to_restore)

    # calling boot.prepare_instance will also set the node
    # to PXE boot, and update PXE templates accordingly
    task.driver.boot.prepare_instance(task)

    # Power-on the instance, with PXE prepared, we're done.
    manager_utils.node_power_action(task, states.POWER_ON)
    LOG.info('Deployment setup for node %s done', task.node.uuid)
    return None
def prepare_cleaning(self, task):
    """Boot into the ramdisk to prepare for cleaning.

    Returns early, with None, when no clean steps end up configured for
    the node. Otherwise the cleaning network and agent ramdisk are set up
    and, unless the node is fast tracked, the node is rebooted.

    :param task: a TaskManager object containing the node
    :raises NodeCleaningFailure: if the previous cleaning ports cannot be
        removed or if new cleaning ports cannot be created
    :returns: None or states.CLEANWAIT for async prepare.
    """
    node = task.node
    conductor_steps.set_node_cleaning_steps(task)
    if not node.driver_internal_info['clean_steps']:
        # no clean steps configured, nothing to do.
        return

    fast_track = manager_utils.is_fast_track(task)
    # Stays None for a fast tracked node; the restore helper is still
    # called with it below.
    saved_power_state = (
        None if fast_track
        else manager_utils.power_on_node_if_needed(task))
    task.driver.network.add_cleaning_network(task)
    manager_utils.restore_power_state_if_needed(task, saved_power_state)

    agent_options = deploy_utils.build_agent_options(node)
    task.driver.boot.prepare_ramdisk(task, agent_options)
    if not fast_track:
        manager_utils.node_power_action(task, states.REBOOT)
    return states.CLEANWAIT
def tear_down(self, task):
    """Tear down a previous deployment on the task's node.

    Powers the node off, detaches storage volumes, tears down the storage
    configuration and removes the node from the tenant and provisioning
    networks. All remaining clean-up is done in the clean_up() method,
    which should be called separately.

    :param task: a TaskManager instance containing the node to act on.
    :returns: deploy state DELETED.
    :raises: NetworkError if the cleaning ports cannot be removed.
    :raises: InvalidParameterValue when the wrong state is specified
        or the wrong driver info is specified.
    :raises: StorageError when volume detachment fails.
    :raises: other exceptions by the node's power driver if something
        wrong occurred during the power action.
    """
    manager_utils.node_power_action(task, states.POWER_OFF)

    task.driver.storage.detach_volumes(task)
    deploy_utils.tear_down_storage_configuration(task)

    saved_power_state = manager_utils.power_on_node_if_needed(task)
    network = task.driver.network
    network.unconfigure_tenant_networks(task)
    # NOTE(mgoddard): If the deployment was unsuccessful the node may
    # have ports on the provisioning network which were not deleted.
    network.remove_provisioning_network(task)
    manager_utils.restore_power_state_if_needed(task, saved_power_state)

    return states.DELETED
def tear_down(self, task):
    """Tear down a previous deployment on the task's node.

    Powers the node off, detaches storage volumes, tears down the storage
    configuration and removes the node from the tenant and provisioning
    networks. All remaining clean-up is done in the clean_up() method,
    which should be called separately.

    :param task: a TaskManager instance containing the node to act on.
    :returns: deploy state DELETED.
    :raises: NetworkError if the cleaning ports cannot be removed.
    :raises: InvalidParameterValue when the wrong state is specified
        or the wrong driver info is specified.
    :raises: StorageError when volume detachment fails.
    :raises: other exceptions by the node's power driver if something
        wrong occurred during the power action.
    """
    manager_utils.node_power_action(task, states.POWER_OFF)

    task.driver.storage.detach_volumes(task)
    deploy_utils.tear_down_storage_configuration(task)

    prior_power_state = manager_utils.power_on_node_if_needed(task)
    net_driver = task.driver.network
    net_driver.unconfigure_tenant_networks(task)
    # NOTE(mgoddard): If the deployment was unsuccessful the node may
    # have ports on the provisioning network which were not deleted.
    net_driver.remove_provisioning_network(task)
    manager_utils.restore_power_state_if_needed(task, prior_power_state)

    return states.DELETED
def prepare(self, task):
    """Prepare the deployment environment for this task's node.

    Nodes that are ACTIVE or ADOPTING only get their instance boot
    prepared. For a node in DEPLOYING the provisioning network and
    storage volumes are set up first; the deploy ramdisk is then prepared
    unless the image is not written by this driver or the deployment is
    fast tracked.

    :param task: a TaskManager instance containing the node to act on.
    :raises: NetworkError: if the previous cleaning ports cannot be
        removed or if new cleaning ports cannot be created.
    :raises: InvalidParameterValue when the wrong power state is specified
        or the wrong driver info is specified for power management.
    :raises: StorageError If the storage driver is unable to attach the
        configured volumes.
    :raises: other exceptions by the node's power driver if something
        wrong occurred during the power action.
    :raises: any boot interface's prepare_ramdisk exceptions.
    """
    node = task.node
    deploy_utils.populate_storage_driver_internal_info(task)

    if node.provision_state in (states.ACTIVE, states.ADOPTING):
        task.driver.boot.prepare_instance(task)
        return

    if node.provision_state == states.DEPLOYING:
        is_fast_tracked = manager_utils.is_fast_track(task)
        if not is_fast_tracked:
            # Adding the node to provisioning network so that the dhcp
            # options get added for the provisioning port.
            manager_utils.node_power_action(task, states.POWER_OFF)
        else:
            # The agent has already recently checked in and we are
            # configured to take that as an indicator that we can
            # skip ahead.
            LOG.debug(
                'The agent for node %(node)s has recently '
                'checked in, and the node power will remain '
                'unmodified.', {'node': task.node.uuid})

        # NOTE(vdrok): in case of rebuild, we have tenant network
        # already configured, unbind tenant ports if present
        if task.driver.storage.should_write_image(task):
            if not is_fast_tracked:
                prior_power_state = (
                    manager_utils.power_on_node_if_needed(task))
            task.driver.network.unconfigure_tenant_networks(task)
            task.driver.network.add_provisioning_network(task)
            if not is_fast_tracked:
                manager_utils.restore_power_state_if_needed(
                    task, prior_power_state)

        task.driver.storage.attach_volumes(task)

        if (not task.driver.storage.should_write_image(task)
                or is_fast_tracked):
            # We have nothing else to do as this is handled in the
            # backend storage system, and we can return to the caller
            # as we do not need to boot the agent to deploy.
            # Alternatively, we are in a fast track deployment
            # and have nothing else to do.
            return

    agent_options = deploy_utils.build_agent_options(node)
    task.driver.boot.prepare_ramdisk(task, agent_options)
def _finalize_rescue(self, task):
    """Ask the ramdisk to finalize rescue mode and verify the result.

    On success the rescue environment is cleaned up, tenant networks are
    configured and the 'resume' and 'done' events are processed.

    :param task: A TaskManager instance
    :raises: InstanceRescueFailure, if rescuing failed
    """
    node = task.node
    try:
        result = self._client.finalize_rescue(node)
    except exception.IronicException as exc:
        raise exception.InstanceRescueFailure(
            node=node.uuid, instance=node.instance_uuid, reason=exc)

    # A missing or empty status is treated the same as a non-successful one.
    if result.get('command_status') != 'SUCCEEDED':
        # NOTE(mariojv) Caller will clean up failed rescue in exception
        # handler.
        error_text = (_('Agent returned bad result for command '
                        'finalize_rescue: %(result)s') %
                      {'result': result.get('command_error')})
        raise exception.InstanceRescueFailure(
            node=node.uuid, instance=node.instance_uuid, reason=error_text)

    task.process_event('resume')
    task.driver.rescue.clean_up(task)

    saved_power_state = manager_utils.power_on_node_if_needed(task)
    task.driver.network.configure_tenant_networks(task)
    manager_utils.restore_power_state_if_needed(task, saved_power_state)

    task.process_event('done')
def _finalize_rescue(self, task):
    """Ask the ramdisk to finalize rescue mode and verify the result.

    On success the rescue environment is cleaned up, tenant networks are
    configured and the 'resume' and 'done' events are processed.

    :param task: A TaskManager instance
    :raises: InstanceRescueFailure, if rescuing failed
    """
    node = task.node
    try:
        agent_result = self._client.finalize_rescue(node)
    except exception.IronicException as err:
        raise exception.InstanceRescueFailure(
            node=node.uuid, instance=node.instance_uuid, reason=err)

    # A missing or empty status is treated the same as a non-successful one.
    if agent_result.get('command_status') != 'SUCCEEDED':
        # NOTE(mariojv) Caller will clean up failed rescue in exception
        # handler.
        failure_text = (_('Agent returned bad result for command '
                          'finalize_rescue: %(result)s') %
                        {'result': agent_result.get('command_error')})
        raise exception.InstanceRescueFailure(
            node=node.uuid, instance=node.instance_uuid,
            reason=failure_text)

    task.process_event('resume')
    task.driver.rescue.clean_up(task)

    prior_power_state = manager_utils.power_on_node_if_needed(task)
    task.driver.network.configure_tenant_networks(task)
    manager_utils.restore_power_state_if_needed(task, prior_power_state)

    task.process_event('done')
def deploy(self, task):
    """Perform a deployment to a node.

    When the storage interface reports that an image should be written,
    the node is rebooted and the deploy continues asynchronously.
    Otherwise the node is moved from the provisioning network to the
    tenant networks, its instance boot is prepared and it is powered on.

    This method will be called after prepare(), which may have already
    performed any preparatory steps, such as pre-caching some data for
    the node.

    :param task: a TaskManager instance.
    :returns: status of the deploy. One of ironic.common.states.
    """
    if task.driver.storage.should_write_image(task):
        manager_utils.node_power_action(task, states.REBOOT)
        return states.DEPLOYWAIT

    # TODO(TheJulia): At some point, we should de-dupe this code
    # as it is nearly identical to the iscsi deploy interface.
    # This is not being done now as it is expected to be
    # refactored in the near future.
    manager_utils.node_power_action(task, states.POWER_OFF)

    saved_power_state = manager_utils.power_on_node_if_needed(task)
    network = task.driver.network
    network.remove_provisioning_network(task)
    network.configure_tenant_networks(task)
    manager_utils.restore_power_state_if_needed(task, saved_power_state)

    task.driver.boot.prepare_instance(task)
    manager_utils.node_power_action(task, states.POWER_ON)
    LOG.info('Deployment to node %s done', task.node.uuid)
    return None
def rescue(self, task):
    """Boot a rescue ramdisk on the node.

    Powers the node off, cleans up the instance boot environment, swaps
    the tenant networks for the rescuing network, prepares the ramdisk
    when ``CONF.agent.manage_agent_boot`` is set, and powers the node on.

    :param task: a TaskManager instance.
    :raises: NetworkError if the tenant ports cannot be removed.
    :raises: InvalidParameterValue when the wrong power state is specified
        or the wrong driver info is specified for power management.
    :raises: other exceptions by the node's power driver if something
        wrong occurred during the power action.
    :raises: any boot interface's prepare_ramdisk exceptions.
    :returns: states.RESCUEWAIT
    """
    manager_utils.node_power_action(task, states.POWER_OFF)

    # NOTE(TheJulia): Revealing that the power is off at any time can
    # cause external power sync to decide that the node must be off.
    # This may result in a post-rescued instance being turned off
    # unexpectedly after rescue has started.
    # TODO(TheJulia): Once we have power/state callbacks to nova,
    # the reset of the power_state can be removed.
    node = task.node
    node.power_state = states.POWER_ON
    node.save()

    task.driver.boot.clean_up_instance(task)

    saved_power_state = manager_utils.power_on_node_if_needed(task)
    network = task.driver.network
    network.unconfigure_tenant_networks(task)
    network.add_rescuing_network(task)
    manager_utils.restore_power_state_if_needed(task, saved_power_state)

    if CONF.agent.manage_agent_boot:
        # prepare_ramdisk will set the boot device
        agent_options = deploy_utils.build_agent_options(task.node)
        task.driver.boot.prepare_ramdisk(task, agent_options)

    manager_utils.node_power_action(task, states.POWER_ON)
    return states.RESCUEWAIT
def deploy(self, task):
    """Start deployment of the task's node.

    When an image must be written, the instance image is cached, its size
    is checked and a reboot is requested from the power driver; the
    deployment then continues via agent heartbeats. Otherwise the node is
    switched to the tenant networks, its instance boot is prepared and it
    is powered on.

    :param task: a TaskManager instance containing the node to act on.
    :returns: deploy state DEPLOYWAIT when an image is being written,
        otherwise None.
    """
    node = task.node
    if task.driver.storage.should_write_image(task):
        deploy_utils.cache_instance_image(task.context, node)
        check_image_size(task)
        manager_utils.node_power_action(task, states.REBOOT)
        return states.DEPLOYWAIT

    # TODO(TheJulia): At some point, we should de-dupe this code
    # as it is nearly identical to the agent deploy interface.
    # This is not being done now as it is expected to be
    # refactored in the near future.
    manager_utils.node_power_action(task, states.POWER_OFF)

    saved_power_state = manager_utils.power_on_node_if_needed(task)
    network = task.driver.network
    network.remove_provisioning_network(task)
    network.configure_tenant_networks(task)
    manager_utils.restore_power_state_if_needed(task, saved_power_state)

    task.driver.boot.prepare_instance(task)
    manager_utils.node_power_action(task, states.POWER_ON)
    return None
def tear_down(self, task):
    """Tear down a previous deployment on the task's node.

    Powers the node off and unconfigures its tenant networks.

    :param task: a TaskManager instance containing the node to act on.
    :returns: states.DELETED
    """
    manager_utils.node_power_action(task, states.POWER_OFF)

    saved_power_state = manager_utils.power_on_node_if_needed(task)
    task.driver.network.unconfigure_tenant_networks(task)
    manager_utils.restore_power_state_if_needed(task, saved_power_state)

    return states.DELETED
def tear_down(self, task):
    """Tear down a previous deployment on the task's node.

    Powers the node off and unconfigures its tenant networks.

    :param task: a TaskManager instance containing the node to act on.
    :returns: states.DELETED
    """
    manager_utils.node_power_action(task, states.POWER_OFF)

    prior_power_state = manager_utils.power_on_node_if_needed(task)
    task.driver.network.unconfigure_tenant_networks(task)
    manager_utils.restore_power_state_if_needed(task, prior_power_state)

    return states.DELETED
def deploy(self, task):
    """Start deployment of the task's node.

    Three paths exist:

    * fast track: cache and check the image, process the 'wait' event and
      continue the deploy immediately;
    * image write: cache and check the image, reboot the node unless the
      'deployment_reboot' flag in driver_internal_info says a reboot was
      already performed, then clear that flag;
    * boot from volume: switch to tenant networks, prepare the instance
      boot and power the node on.

    :param task: a TaskManager instance containing the node to act on.
    :returns: deploy state DEPLOYWAIT for the image-write path, otherwise
        None.
    """
    node = task.node

    if manager_utils.is_fast_track(task):
        LOG.debug('Performing a fast track deployment for %(node)s.',
                  {'node': task.node.uuid})
        deploy_utils.cache_instance_image(task.context, node)
        check_image_size(task)
        # Update the database for the API and the task tracking resumes
        # the state machine state going from DEPLOYWAIT -> DEPLOYING
        task.process_event('wait')
        self.continue_deploy(task)
        return None

    if task.driver.storage.should_write_image(task):
        # Standard deploy process
        deploy_utils.cache_instance_image(task.context, node)
        check_image_size(task)
        # Check if the driver has already performed a reboot in a previous
        # deploy step.
        already_rebooted = task.node.driver_internal_info.get(
            'deployment_reboot', False)
        if not already_rebooted:
            manager_utils.node_power_action(task, states.REBOOT)
        # Re-read the field after the power action before clearing the
        # flag and saving.
        internal_info = task.node.driver_internal_info
        internal_info.pop('deployment_reboot', None)
        task.node.driver_internal_info = internal_info
        task.node.save()
        return states.DEPLOYWAIT

    # Boot to an Storage Volume
    # TODO(TheJulia): At some point, we should de-dupe this code
    # as it is nearly identical to the agent deploy interface.
    # This is not being done now as it is expected to be
    # refactored in the near future.
    manager_utils.node_power_action(task, states.POWER_OFF)

    saved_power_state = manager_utils.power_on_node_if_needed(task)
    network = task.driver.network
    network.remove_provisioning_network(task)
    network.configure_tenant_networks(task)
    manager_utils.restore_power_state_if_needed(task, saved_power_state)

    task.driver.boot.prepare_instance(task)
    manager_utils.node_power_action(task, states.POWER_ON)
    return None
def reboot_and_finish_deploy(self, task):
    """Power the node off, switch it to tenant networks and power it on.

    Unless the node's driver_info sets ``deploy_forces_oob_reboot``, a
    soft power off is attempted by running the 'shutdown' playbook and
    polling the power state; if that fails, a warning is logged and the
    node is powered off through the power interface. Any error in the
    whole sequence is handed to ``agent_base.log_and_raise_deployment_error``.

    :param task: a TaskManager instance containing the node to act on.
    """
    retry_wait = CONF.ansible.post_deploy_get_power_state_retry_interval * 1000
    max_attempts = CONF.ansible.post_deploy_get_power_state_retries + 1

    # Polls the power state until it reads POWER_OFF or attempts run out.
    @retrying.retry(
        stop_max_attempt_number=max_attempts,
        retry_on_result=lambda state: state != states.POWER_OFF,
        wait_fixed=retry_wait)
    def _poll_until_off(task):
        return task.driver.power.get_power_state(task)

    node = task.node
    force_oob = strutils.bool_from_string(
        node.driver_info.get('deploy_forces_oob_reboot', False))

    try:
        if force_oob:
            manager_utils.node_power_action(task, states.POWER_OFF)
        else:
            try:
                node_address = _get_node_ip(task)
                playbook, user, key = _parse_ansible_driver_info(
                    node, action='shutdown')
                node_list = [(node.uuid, node_address, user, node.extra)]
                extra_vars = _prepare_extra_vars(node_list)
                _run_playbook(node, playbook, extra_vars, key)
                _poll_until_off(task)
            except Exception as soft_off_error:
                LOG.warning(
                    'Failed to soft power off node %(node_uuid)s '
                    'in at least %(timeout)d seconds. '
                    'Error: %(error)s',
                    {'node_uuid': node.uuid,
                     'timeout': (retry_wait * (max_attempts - 1)) / 1000,
                     'error': soft_off_error})
                # NOTE(pas-ha) flush is a part of deploy playbook
                # so if it finished successfully we can safely
                # power off the node out-of-band
                manager_utils.node_power_action(task, states.POWER_OFF)

        saved_power_state = manager_utils.power_on_node_if_needed(task)
        network = task.driver.network
        network.remove_provisioning_network(task)
        network.configure_tenant_networks(task)
        manager_utils.restore_power_state_if_needed(
            task, saved_power_state)
        manager_utils.node_power_action(task, states.POWER_ON)
    except Exception as reboot_error:
        msg = (_('Error rebooting node %(node)s after deploy. '
                 'Error: %(error)s') %
               {'node': node.uuid, 'error': reboot_error})
        agent_base.log_and_raise_deployment_error(task, msg)
def prepare_inband_cleaning(task, manage_boot=True):
    """Prepare the node to boot into the agent for in-band cleaning.

    Steps performed:

    1. Add the cleaning network for the bare metal node and append the
       clean parameters to the node's driver_internal_info.
    2. When ``manage_boot`` is true, call the boot interface's
       ``prepare_ramdisk`` to set up booting the agent ramdisk.
    3. Reboot the bare metal node, unless the node is fast tracked.

    :param task: a TaskManager object containing the node
    :param manage_boot: If True, the boot interface's 'prepare_ramdisk'
        method is called. If False, that step is skipped and the
        environment is assumed to boot the agent ramdisk automatically
        every time the bare metal node is rebooted.
    :returns: states.CLEANWAIT to signify an asynchronous prepare.
    :raises: NetworkError, NodeCleaningFailure if the previous cleaning
        ports cannot be removed or if new cleaning ports cannot be created.
    :raises: InvalidParameterValue if cleaning network UUID config option
        has an invalid value.
    """
    fast_track = manager_utils.is_fast_track(task)
    needs_power_handling = not fast_track

    if needs_power_handling:
        saved_power_state = manager_utils.power_on_node_if_needed(task)

    # WARNING(TheJulia): When fast track is available, trying to plug the
    # cleaning network is problematic and in practice this may fail if
    # cleaning/provisioning/discovery all take place on different
    # networks.
    # Translation: here be a realistically unavoidable footgun with
    # fast track support.
    # TODO(TheJulia): Lets improve this somehow such that the agent host
    # gracefully handles these sorts of changes.
    task.driver.network.add_cleaning_network(task)

    if needs_power_handling:
        manager_utils.restore_power_state_if_needed(
            task, saved_power_state)

    # Append required config parameters to node's driver_internal_info
    # to pass to IPA.
    agent_add_clean_params(task)

    if manage_boot:
        agent_options = build_agent_options(task.node)
        task.driver.boot.prepare_ramdisk(task, agent_options)

    if needs_power_handling:
        manager_utils.node_power_action(task, states.REBOOT)

    # Tell the conductor we are waiting for the agent to boot.
    return states.CLEANWAIT
def tear_down_cleaning(self, task):
    """Clean up the PXE and DHCP files after cleaning.

    Powers the node off, cleans up the ramdisk boot environment and
    removes the cleaning network.

    :param task: a TaskManager object containing the node
    :raises NodeCleaningFailure: if the cleaning ports cannot be removed
    """
    manager_utils.node_power_action(task, states.POWER_OFF)
    task.driver.boot.clean_up_ramdisk(task)

    saved_power_state = manager_utils.power_on_node_if_needed(task)
    task.driver.network.remove_cleaning_network(task)
    manager_utils.restore_power_state_if_needed(task, saved_power_state)
def tear_down_cleaning(self, task):
    """Clean up the PXE and DHCP files after cleaning.

    Powers the node off, cleans up the ramdisk boot environment and
    removes the cleaning network.

    :param task: a TaskManager object containing the node
    :raises NodeCleaningFailure: if the cleaning ports cannot be removed
    """
    manager_utils.node_power_action(task, states.POWER_OFF)
    task.driver.boot.clean_up_ramdisk(task)

    prior_power_state = manager_utils.power_on_node_if_needed(task)
    task.driver.network.remove_cleaning_network(task)
    manager_utils.restore_power_state_if_needed(task, prior_power_state)
def reboot_and_finish_deploy(self, task):
    """Power the node off, switch it to tenant networks and power it on.

    Unless the node's driver_info sets ``deploy_forces_oob_reboot``, a
    soft power off is attempted by running the 'shutdown' playbook and
    polling the power state; if that fails, a warning is logged and the
    node is powered off through the power interface. Any error in the
    whole sequence is handed to ``agent_base.log_and_raise_deployment_error``.

    :param task: a TaskManager instance containing the node to act on.
    """
    wait_ms = CONF.ansible.post_deploy_get_power_state_retry_interval * 1000
    attempt_limit = CONF.ansible.post_deploy_get_power_state_retries + 1

    # Polls the power state until it reads POWER_OFF or attempts run out.
    @retrying.retry(
        stop_max_attempt_number=attempt_limit,
        retry_on_result=lambda state: state != states.POWER_OFF,
        wait_fixed=wait_ms)
    def _await_power_off(task):
        return task.driver.power.get_power_state(task)

    node = task.node
    oob_only = strutils.bool_from_string(
        node.driver_info.get('deploy_forces_oob_reboot', False))

    try:
        if oob_only:
            manager_utils.node_power_action(task, states.POWER_OFF)
        else:
            try:
                node_address = _get_node_ip(task)
                playbook, user, key = _parse_ansible_driver_info(
                    node, action='shutdown')
                node_list = [(node.uuid, node_address, user, node.extra)]
                extra_vars = _prepare_extra_vars(node_list)
                _run_playbook(node, playbook, extra_vars, key)
                _await_power_off(task)
            except Exception as soft_err:
                LOG.warning('Failed to soft power off node %(node_uuid)s '
                            'in at least %(timeout)d seconds. '
                            'Error: %(error)s',
                            {'node_uuid': node.uuid,
                             'timeout': (wait_ms * (attempt_limit - 1)) / 1000,
                             'error': soft_err})
                # NOTE(pas-ha) flush is a part of deploy playbook
                # so if it finished successfully we can safely
                # power off the node out-of-band
                manager_utils.node_power_action(task, states.POWER_OFF)

        prior_power_state = manager_utils.power_on_node_if_needed(task)
        net_driver = task.driver.network
        net_driver.remove_provisioning_network(task)
        net_driver.configure_tenant_networks(task)
        manager_utils.restore_power_state_if_needed(
            task, prior_power_state)
        manager_utils.node_power_action(task, states.POWER_ON)
    except Exception as err:
        msg = (_('Error rebooting node %(node)s after deploy. '
                 'Error: %(error)s') %
               {'node': node.uuid, 'error': err})
        agent_base.log_and_raise_deployment_error(task, msg)
def clean_up(self, task):
    """Clean up after RESCUEWAIT timeout/failure or finishing rescue.

    Removes the rescue password from the node (saving the node), cleans
    up the ramdisk boot environment when ``CONF.agent.manage_agent_boot``
    is set, and removes the rescuing network.

    :param task: a TaskManager instance with the node.
    :raises: NetworkError if the rescue ports cannot be removed.
    """
    manager_utils.remove_node_rescue_password(task.node, save=True)

    if CONF.agent.manage_agent_boot:
        task.driver.boot.clean_up_ramdisk(task)

    saved_power_state = manager_utils.power_on_node_if_needed(task)
    task.driver.network.remove_rescuing_network(task)
    manager_utils.restore_power_state_if_needed(task, saved_power_state)
def deploy(self, task):
    """Start deployment of the task's node.

    Three paths exist:

    * fast track: cache and check the image, process the 'wait' event and
      continue the deploy immediately;
    * image write: cache and check the image, then request a reboot from
      the power driver so deployment continues via agent heartbeats;
    * boot from volume: switch to tenant networks, prepare the instance
      boot and power the node on.

    :param task: a TaskManager instance containing the node to act on.
    :returns: deploy state DEPLOYWAIT for the image-write path, otherwise
        None.
    """
    node = task.node

    if manager_utils.is_fast_track(task):
        LOG.debug('Performing a fast track deployment for %(node)s.',
                  {'node': task.node.uuid})
        deploy_utils.cache_instance_image(task.context, node)
        check_image_size(task)
        # Update the database for the API and the task tracking resumes
        # the state machine state going from DEPLOYWAIT -> DEPLOYING
        task.process_event('wait')
        self.continue_deploy(task)
        return None

    if task.driver.storage.should_write_image(task):
        # Standard deploy process
        deploy_utils.cache_instance_image(task.context, node)
        check_image_size(task)
        manager_utils.node_power_action(task, states.REBOOT)
        return states.DEPLOYWAIT

    # Boot to an Storage Volume
    # TODO(TheJulia): At some point, we should de-dupe this code
    # as it is nearly identical to the agent deploy interface.
    # This is not being done now as it is expected to be
    # refactored in the near future.
    manager_utils.node_power_action(task, states.POWER_OFF)

    saved_power_state = manager_utils.power_on_node_if_needed(task)
    network = task.driver.network
    network.remove_provisioning_network(task)
    network.configure_tenant_networks(task)
    manager_utils.restore_power_state_if_needed(task, saved_power_state)

    task.driver.boot.prepare_instance(task)
    manager_utils.node_power_action(task, states.POWER_ON)
    return None
def prepare(self, task):
    """Prepare the deployment environment for this node.

    For a node in DEPLOYING the node is powered off and the provisioning
    network is added. For any node that is not ACTIVE or ADOPTING the
    instance_info is rebuilt and saved, and the agent ramdisk is prepared.

    :param task: a TaskManager instance containing the node to act on.
    """
    node = task.node
    current_state = node.provision_state

    # TODO(pas-ha) investigate takeover scenario
    if current_state == states.DEPLOYING:
        # adding network-driver dependent provisioning ports
        manager_utils.node_power_action(task, states.POWER_OFF)
        saved_power_state = manager_utils.power_on_node_if_needed(task)
        task.driver.network.add_provisioning_network(task)
        manager_utils.restore_power_state_if_needed(
            task, saved_power_state)

    if node.provision_state in (states.ACTIVE, states.ADOPTING):
        return

    node.instance_info = deploy_utils.build_instance_info_for_deploy(task)
    node.save()
    agent_options = deploy_utils.build_agent_options(node)
    task.driver.boot.prepare_ramdisk(task, agent_options)
def tear_down_cleaning(self, task):
    """Clean up the PXE and DHCP files after cleaning.

    The node is powered off first unless it is fast tracked or its fault
    is a cleaning failure; in those two cases the power state is also not
    restored after the cleaning network has been removed.

    :param task: a TaskManager object containing the node
    :raises NodeCleaningFailure: if the cleaning ports cannot be removed
    """
    fast_track = manager_utils.is_fast_track(task)
    cleaning_failure = task.node.fault == faults.CLEAN_FAILURE
    leave_power_untouched = fast_track or cleaning_failure

    if not leave_power_untouched:
        manager_utils.node_power_action(task, states.POWER_OFF)

    task.driver.boot.clean_up_ramdisk(task)

    saved_power_state = manager_utils.power_on_node_if_needed(task)
    task.driver.network.remove_cleaning_network(task)
    if not leave_power_untouched:
        manager_utils.restore_power_state_if_needed(
            task, saved_power_state)
def prepare(self, task):
    """Prepare the deployment environment for this node.

    While the node is in the DEPLOYING state it is powered off and the
    provisioning network is added. Unless the node is ACTIVE or
    ADOPTING, its instance_info is then rebuilt and saved, and the boot
    interface prepares the ramdisk using the agent options.

    :param task: a TaskManager instance containing the node to act on.
    """
    node = task.node

    # TODO(pas-ha) investigate takeover scenario
    if node.provision_state == states.DEPLOYING:
        # Add the network-driver dependent provisioning ports.
        manager_utils.node_power_action(task, states.POWER_OFF)
        previous_power = manager_utils.power_on_node_if_needed(task)
        task.driver.network.add_provisioning_network(task)
        manager_utils.restore_power_state_if_needed(task, previous_power)

    if node.provision_state in (states.ACTIVE, states.ADOPTING):
        # Nothing to rebuild and no ramdisk to prepare in these states.
        return

    node.instance_info = deploy_utils.build_instance_info_for_deploy(task)
    node.save()
    agent_options = deploy_utils.build_agent_options(node)
    task.driver.boot.prepare_ramdisk(task, agent_options)
def deploy(self, task):
    """Perform a deployment to a node.

    Perform the necessary work to deploy an image onto the specified
    node. This method will be called after prepare(), which may have
    already performed any preparatory steps, such as pre-caching some
    data for the node.

    One of three paths is taken:

    * Fast track: the ``wait`` event is processed and
      ``continue_deploy`` is called directly.
    * Image write: the node is rebooted unless ``deployment_reboot`` is
      set in driver_internal_info; that flag is then cleared and the
      node saved.
    * Otherwise: the node is powered off, moved from the provisioning
      network to the tenant networks, the boot interface prepares the
      instance and the node is powered on.

    :param task: a TaskManager instance.
    :returns: states.DEPLOYWAIT on the image-write path, None on the
        other two paths.
    """
    if manager_utils.is_fast_track(task):
        LOG.debug('Performing a fast track deployment for %(node)s.',
                  {'node': task.node.uuid})
        # Update the database for the API; the task tracking resumes
        # the state machine going from DEPLOYWAIT -> DEPLOYING.
        task.process_event('wait')
        self.continue_deploy(task)
        return None

    if task.driver.storage.should_write_image(task):
        # A previous deploy step may already have rebooted the node, in
        # which case it left the 'deployment_reboot' marker behind.
        already_rebooted = task.node.driver_internal_info.get(
            'deployment_reboot')
        if not already_rebooted:
            manager_utils.node_power_action(task, states.REBOOT)
        internal_info = task.node.driver_internal_info
        internal_info.pop('deployment_reboot', None)
        task.node.driver_internal_info = internal_info
        task.node.save()
        return states.DEPLOYWAIT

    # TODO(TheJulia): At some point, we should de-dupe this code
    # as it is nearly identical to the iscsi deploy interface.
    # This is not being done now as it is expected to be
    # refactored in the near future.
    manager_utils.node_power_action(task, states.POWER_OFF)
    previous_power = manager_utils.power_on_node_if_needed(task)
    task.driver.network.remove_provisioning_network(task)
    task.driver.network.configure_tenant_networks(task)
    manager_utils.restore_power_state_if_needed(task, previous_power)
    task.driver.boot.prepare_instance(task)
    manager_utils.node_power_action(task, states.POWER_ON)
    LOG.info('Deployment to node %s done', task.node.uuid)
    return None
def prepare_cleaning(self, task):
    """Boot into the ramdisk to prepare for cleaning.

    The node's cleaning steps are set first; when none are configured
    the method returns immediately. Otherwise the cleaning network is
    added, the boot interface prepares the ramdisk with the agent
    options and the node is rebooted.

    :param task: a TaskManager object containing the node
    :raises NodeCleaningFailure: if the previous cleaning ports cannot
        be removed or if new cleaning ports cannot be created
    :returns: None or states.CLEANWAIT for async prepare.
    """
    node = task.node
    conductor_steps.set_node_cleaning_steps(task)

    if not node.driver_internal_info['clean_steps']:
        # No clean steps configured, nothing to do.
        return None

    previous_power = manager_utils.power_on_node_if_needed(task)
    task.driver.network.add_cleaning_network(task)
    manager_utils.restore_power_state_if_needed(task, previous_power)

    agent_options = deploy_utils.build_agent_options(node)
    task.driver.boot.prepare_ramdisk(task, agent_options)
    manager_utils.node_power_action(task, states.REBOOT)
    return states.CLEANWAIT
def tear_down(self, task):
    """Tear down a previous deployment on the task's node.

    The node is powered off, its volumes are detached and the storage
    configuration torn down. The tenant networks are then unconfigured
    and the provisioning network removed, with the power state being
    restored afterwards.

    :param task: a TaskManager instance.
    :returns: status of the deploy; this method returns states.DELETED.
    :raises: NetworkError if the cleaning ports cannot be removed.
    :raises: InvalidParameterValue when the wrong power state is
        specified or the wrong driver info is specified for power
        management.
    :raises: StorageError when the storage interface attached volumes
        fail to detach.
    :raises: other exceptions by the node's power driver if something
        wrong occurred during the power action.
    """
    manager_utils.node_power_action(task, states.POWER_OFF)

    # Storage first: detach volumes, then drop the stored configuration.
    task.driver.storage.detach_volumes(task)
    deploy_utils.tear_down_storage_configuration(task)

    previous_power = manager_utils.power_on_node_if_needed(task)
    task.driver.network.unconfigure_tenant_networks(task)
    # NOTE(mgoddard): If the deployment was unsuccessful the node may
    # have ports on the provisioning network which were not deleted.
    task.driver.network.remove_provisioning_network(task)
    manager_utils.restore_power_state_if_needed(task, previous_power)

    return states.DELETED
def reboot_and_finish_deploy(self, task):
    """Helper method to trigger reboot on the node and finish deploy.

    The node is first powered off: in-band through the agent client
    (falling back to a power action if that fails), or, when
    ``deploy_forces_oob_reboot`` is set in driver_info, through a power
    action preceded by a file system sync request. The node is then
    moved from the provisioning network to the tenant networks and
    powered on. On success the deploy is marked as complete, or the
    conductor is notified to resume the deploy when a deploy step is
    in progress. On failure the error is logged and the deploy marked
    as failed.

    :param task: a TaskManager object containing the node
    :raises: InstanceDeployFailure, if node reboot failed.
    """
    # The retry interval is multiplied by 1000 for use as wait_fixed.
    retry_wait_ms = (
        CONF.agent.post_deploy_get_power_state_retry_interval * 1000)
    max_attempts = CONF.agent.post_deploy_get_power_state_retries + 1

    @retrying.retry(
        stop_max_attempt_number=max_attempts,
        retry_on_result=lambda state: state != states.POWER_OFF,
        wait_fixed=retry_wait_ms
    )
    def _wait_until_powered_off(task):
        return task.driver.power.get_power_state(task)

    node = task.node

    def _reboot_error_message(exc):
        # Both failure paths below report the same message.
        return (_('Error rebooting node %(node)s after deploy. '
                  '%(cls)s: %(error)s') %
                {'node': node.uuid, 'cls': exc.__class__.__name__,
                 'error': exc})

    if CONF.agent.deploy_logs_collect == 'always':
        driver_utils.collect_ramdisk_logs(node)

    # Whether ironic should power off the node via out-of-band or
    # in-band methods
    oob_power_off = strutils.bool_from_string(
        node.driver_info.get('deploy_forces_oob_reboot', False))

    try:
        if oob_power_off:
            # Flush the file system prior to hard rebooting the node
            sync_result = self._client.sync(node)
            fault = sync_result.get('faultstring')
            if fault:
                if 'Unknown command' in fault:
                    fault = _('The version of the IPA ramdisk used in '
                              'the deployment do not support the '
                              'command "sync"')
                LOG.warning(
                    'Failed to flush the file system prior to hard '
                    'rebooting the node %(node)s. Error: %(error)s',
                    {'node': node.uuid, 'error': fault})
            manager_utils.node_power_action(task, states.POWER_OFF)
        else:
            try:
                self._client.power_off(node)
                _wait_until_powered_off(task)
            except Exception as e:
                # The traceback is only logged for exceptions that are
                # not IronicException instances.
                LOG.warning('Failed to soft power off node %(node_uuid)s '
                            'in at least %(timeout)d seconds. '
                            '%(cls)s: %(error)s',
                            {'node_uuid': node.uuid,
                             'timeout':
                                 (retry_wait_ms * (max_attempts - 1)) / 1000,
                             'cls': e.__class__.__name__,
                             'error': e},
                            exc_info=not isinstance(
                                e, exception.IronicException))
                manager_utils.node_power_action(task, states.POWER_OFF)
    except Exception as e:
        log_and_raise_deployment_error(
            task, _reboot_error_message(e), exc=e)

    try:
        previous_power = manager_utils.power_on_node_if_needed(task)
        task.driver.network.remove_provisioning_network(task)
        task.driver.network.configure_tenant_networks(task)
        manager_utils.restore_power_state_if_needed(task, previous_power)
        manager_utils.node_power_action(task, states.POWER_ON)
    except Exception as e:
        # NOTE(mgoddard): Don't collect logs since the node has been
        # powered off.
        log_and_raise_deployment_error(
            task, _reboot_error_message(e), collect_logs=False, exc=e)

    if node.deploy_step:
        manager_utils.notify_conductor_resume_deploy(task)
    else:
        # TODO(rloo): delete this 'else' part after deprecation period,
        # when we expect all (out-of-tree) drivers to support deploy
        # steps. After which we will always
        # notify_conductor_resume_deploy().
        task.process_event('done')
        LOG.info('Deployment to node %s done', task.node.uuid)
def prepare(self, task):
    """Prepare the deployment environment for this node.

    For a node in DEPLOYING state the network interface is validated
    (defaulting the ``boot_option`` capability to ``local`` when
    validation fails and no boot option was set), the node is powered
    off, placed on the provisioning network if an image has to be
    written, and its volumes are attached. When no image has to be
    written the method returns at that point. Afterwards, depending on
    the provision state, either the instance or the agent ramdisk is
    prepared through the boot interface.

    :param task: a TaskManager instance.
    :raises: NetworkError: if the previous cleaning ports cannot be
        removed or if new cleaning ports cannot be created.
    :raises: InvalidParameterValue when the wrong power state is
        specified or the wrong driver info is specified for power
        management.
    :raises: StorageError If the storage driver is unable to attach the
        configured volumes.
    :raises: other exceptions by the node's power driver if something
        wrong occurred during the power action.
    :raises: exception.ImageRefValidationFailed if image_source is not
        Glance href and is not HTTP(S) URL.
    :raises: exception.InvalidParameterValue if network validation
        fails.
    :raises: any boot interface's prepare_ramdisk exceptions.
    """
    node = task.node
    deploy_utils.populate_storage_driver_internal_info(task)

    if node.provision_state == states.DEPLOYING:
        # Validate network interface to ensure that it supports boot
        # options configured on the node.
        try:
            task.driver.network.validate(task)
        except exception.InvalidParameterValue:
            # For 'neutron' network interface validation will fail
            # if node is using 'netboot' boot option while provisioning
            # a whole disk image. Updating 'boot_option' in node's
            # 'instance_info' to 'local' for backward compatibility.
            # TODO(stendulker): Fail here once the default boot
            # option is local.
            with excutils.save_and_reraise_exception(reraise=False) as ctx:
                info = node.instance_info
                # NOTE(review): 'capabilities' is treated as a dict
                # here; a string-encoded value would not be handled.
                # Confirm against callers.
                caps = info.get('capabilities', {})
                if 'boot_option' in caps:
                    # An explicit boot option was rejected: re-raise.
                    ctx.reraise = True
                else:
                    caps['boot_option'] = 'local'
                    info['capabilities'] = caps
                    node.instance_info = info
                    node.save()
                    # Re-validate the network interface
                    task.driver.network.validate(task)

        # Adding the node to provisioning network so that the dhcp
        # options get added for the provisioning port.
        manager_utils.node_power_action(task, states.POWER_OFF)
        if task.driver.storage.should_write_image(task):
            # NOTE(vdrok): in case of rebuild, we have tenant network
            # already configured, unbind tenant ports if present
            previous_power = manager_utils.power_on_node_if_needed(task)
            task.driver.network.unconfigure_tenant_networks(task)
            task.driver.network.add_provisioning_network(task)
            manager_utils.restore_power_state_if_needed(
                task, previous_power)

        # Signal to storage driver to attach volumes
        task.driver.storage.attach_volumes(task)
        if not task.driver.storage.should_write_image(task):
            # We have nothing else to do as this is handled in the
            # backend storage system, and we can return to the caller
            # as we do not need to boot the agent to deploy.
            return

    if node.provision_state in (states.ACTIVE, states.UNRESCUING):
        # Call is due to conductor takeover
        task.driver.boot.prepare_instance(task)
        return

    if node.provision_state == states.ADOPTING:
        return

    rescue_states = (states.RESCUING, states.RESCUEWAIT,
                     states.RESCUE, states.RESCUEFAIL)
    if node.provision_state not in rescue_states:
        node.instance_info = (
            deploy_utils.build_instance_info_for_deploy(task))
        node.save()

    if CONF.agent.manage_agent_boot:
        agent_options = deploy_utils.build_agent_options(node)
        task.driver.boot.prepare_ramdisk(task, agent_options)
def reboot_and_finish_deploy(self, task):
    """Helper method to trigger reboot on the node and finish deploy.

    The node is first powered off: in-band through the agent client
    (falling back to a power action if that fails), or, when
    ``deploy_forces_oob_reboot`` is set in driver_info, through a power
    action preceded by a file system sync request. The node is then
    moved from the provisioning network to the tenant networks and
    powered on. On success the deploy is marked as complete, or the
    conductor is notified to resume the deploy when a deploy step is
    in progress. On failure the error is logged and the deploy marked
    as failed.

    :param task: a TaskManager object containing the node
    :raises: InstanceDeployFailure, if node reboot failed.
    """
    # The retry interval is multiplied by 1000 for use as wait_fixed.
    retry_wait_ms = (
        CONF.agent.post_deploy_get_power_state_retry_interval * 1000)
    max_attempts = CONF.agent.post_deploy_get_power_state_retries + 1

    @retrying.retry(
        stop_max_attempt_number=max_attempts,
        retry_on_result=lambda state: state != states.POWER_OFF,
        wait_fixed=retry_wait_ms
    )
    def _wait_until_powered_off(task):
        return task.driver.power.get_power_state(task)

    node = task.node

    def _reboot_error_message(exc):
        # Both failure paths below report the same message.
        return (_('Error rebooting node %(node)s after deploy. '
                  '%(cls)s: %(error)s') %
                {'node': node.uuid, 'cls': exc.__class__.__name__,
                 'error': exc})

    if CONF.agent.deploy_logs_collect == 'always':
        driver_utils.collect_ramdisk_logs(node)

    # Whether ironic should power off the node via out-of-band or
    # in-band methods
    oob_power_off = strutils.bool_from_string(
        node.driver_info.get('deploy_forces_oob_reboot', False))

    try:
        if oob_power_off:
            # Flush the file system prior to hard rebooting the node
            sync_result = self._client.sync(node)
            fault = sync_result.get('faultstring')
            if fault:
                if 'Unknown command' in fault:
                    fault = _('The version of the IPA ramdisk used in '
                              'the deployment do not support the '
                              'command "sync"')
                LOG.warning(
                    'Failed to flush the file system prior to hard '
                    'rebooting the node %(node)s. Error: %(error)s',
                    {'node': node.uuid, 'error': fault})
            manager_utils.node_power_action(task, states.POWER_OFF)
        else:
            try:
                self._client.power_off(node)
                _wait_until_powered_off(task)
            except Exception as e:
                # The traceback is only logged for exceptions that are
                # not IronicException instances.
                LOG.warning('Failed to soft power off node %(node_uuid)s '
                            'in at least %(timeout)d seconds. '
                            '%(cls)s: %(error)s',
                            {'node_uuid': node.uuid,
                             'timeout':
                                 (retry_wait_ms * (max_attempts - 1)) / 1000,
                             'cls': e.__class__.__name__,
                             'error': e},
                            exc_info=not isinstance(
                                e, exception.IronicException))
                manager_utils.node_power_action(task, states.POWER_OFF)
    except Exception as e:
        log_and_raise_deployment_error(
            task, _reboot_error_message(e), exc=e)

    try:
        previous_power = manager_utils.power_on_node_if_needed(task)
        task.driver.network.remove_provisioning_network(task)
        task.driver.network.configure_tenant_networks(task)
        manager_utils.restore_power_state_if_needed(task, previous_power)
        manager_utils.node_power_action(task, states.POWER_ON)
    except Exception as e:
        # NOTE(mgoddard): Don't collect logs since the node has been
        # powered off.
        log_and_raise_deployment_error(
            task, _reboot_error_message(e), collect_logs=False, exc=e)

    if node.deploy_step:
        manager_utils.notify_conductor_resume_deploy(task)
    else:
        # TODO(rloo): delete this 'else' part after deprecation period,
        # when we expect all (out-of-tree) drivers to support deploy
        # steps. After which we will always
        # notify_conductor_resume_deploy().
        task.process_event('done')
        LOG.info('Deployment to node %s done', task.node.uuid)
def prepare(self, task):
    """Prepare the deployment environment for this node.

    For a node in DEPLOYING state the network interface is validated
    (defaulting the ``boot_option`` capability to ``local`` when
    validation fails and no boot option was set). Unless this is a
    fast track sequence the node is powered off. If an image has to be
    written the node is placed on the provisioning network, and the
    volumes are attached. The method returns at that point when no
    image has to be written or when fast track is in progress.
    Afterwards, depending on the provision state, either the instance
    or the agent ramdisk is prepared through the boot interface.

    :param task: a TaskManager instance.
    :raises: NetworkError: if the previous cleaning ports cannot be
        removed or if new cleaning ports cannot be created.
    :raises: InvalidParameterValue when the wrong power state is
        specified or the wrong driver info is specified for power
        management.
    :raises: StorageError If the storage driver is unable to attach the
        configured volumes.
    :raises: other exceptions by the node's power driver if something
        wrong occurred during the power action.
    :raises: exception.ImageRefValidationFailed if image_source is not
        Glance href and is not HTTP(S) URL.
    :raises: exception.InvalidParameterValue if network validation
        fails.
    :raises: any boot interface's prepare_ramdisk exceptions.
    """
    node = task.node

    def _refresh_instance_info():
        # Rebuild instance_info for the deploy and persist it.
        node.instance_info = (
            deploy_utils.build_instance_info_for_deploy(task))
        node.save()

    deploy_utils.populate_storage_driver_internal_info(task)

    if node.provision_state == states.DEPLOYING:
        # Validate network interface to ensure that it supports boot
        # options configured on the node.
        try:
            task.driver.network.validate(task)
        except exception.InvalidParameterValue:
            # For 'neutron' network interface validation will fail
            # if node is using 'netboot' boot option while provisioning
            # a whole disk image. Updating 'boot_option' in node's
            # 'instance_info' to 'local' for backward compatibility.
            # TODO(stendulker): Fail here once the default boot
            # option is local.
            # NOTE(TheJulia): Fixing the default boot mode only
            # masks the failure as the lack of a user definition
            # can be perceived as both an invalid configuration and
            # reliance upon the default configuration. The reality
            # being that in most scenarios, users do not want network
            # booting, so the changed default should be valid.
            with excutils.save_and_reraise_exception(reraise=False) as ctx:
                info = node.instance_info
                caps = utils.parse_instance_info_capabilities(node)
                if 'boot_option' in caps:
                    # An explicit boot option was rejected: re-raise.
                    ctx.reraise = True
                else:
                    caps['boot_option'] = 'local'
                    info['capabilities'] = caps
                    node.instance_info = info
                    node.save()
                    # Re-validate the network interface
                    task.driver.network.validate(task)

        # Determine if this is a fast track sequence
        fast_track = manager_utils.is_fast_track(task)
        if fast_track:
            # The agent has already recently checked in and we are
            # configured to take that as an indicator that we can
            # skip ahead.
            LOG.debug(
                'The agent for node %(node)s has recently checked '
                'in, and the node power will remain unmodified.',
                {'node': task.node.uuid})
        else:
            # Powering off node to setup networking for port and
            # ensure that the state is reset if it is inadvertently
            # on for any unknown reason.
            manager_utils.node_power_action(task, states.POWER_OFF)

        if task.driver.storage.should_write_image(task):
            # NOTE(vdrok): in case of rebuild, we have tenant network
            # already configured, unbind tenant ports if present
            if fast_track:
                # Fast track sequence in progress: power is untouched.
                task.driver.network.unconfigure_tenant_networks(task)
                task.driver.network.add_provisioning_network(task)
                _refresh_instance_info()
            else:
                previous_power = (
                    manager_utils.power_on_node_if_needed(task))
                task.driver.network.unconfigure_tenant_networks(task)
                task.driver.network.add_provisioning_network(task)
                manager_utils.restore_power_state_if_needed(
                    task, previous_power)

        # Signal to storage driver to attach volumes
        task.driver.storage.attach_volumes(task)
        if (not task.driver.storage.should_write_image(task)
                or fast_track):
            # We have nothing else to do as this is handled in the
            # backend storage system, and we can return to the caller
            # as we do not need to boot the agent to deploy.
            # Alternatively, we could be in a fast track deployment
            # and again, we should have nothing to do here.
            return

    if node.provision_state in (states.ACTIVE, states.UNRESCUING):
        # Call is due to conductor takeover
        task.driver.boot.prepare_instance(task)
        return

    if node.provision_state == states.ADOPTING:
        return

    rescue_states = (states.RESCUING, states.RESCUEWAIT,
                     states.RESCUE, states.RESCUEFAIL)
    if node.provision_state not in rescue_states:
        _refresh_instance_info()

    if CONF.agent.manage_agent_boot:
        agent_options = deploy_utils.build_agent_options(node)
        task.driver.boot.prepare_ramdisk(task, agent_options)