Ejemplo n.º 1
0
    def process_next_step(self, task, step_type):
        """Resume the deployment when the core deploy step is in progress.

        Nothing is done for step types other than "deploy", nor when
        ``in_core_deploy_step`` reports false for the task.

        :param task: a TaskManager instance
        :param step_type: "clean" or "deploy"
        """
        if step_type != 'deploy':
            return
        if not self.in_core_deploy_step(task):
            return
        # Run the next step as soon as agent heartbeats in deploy.deploy
        manager_utils.notify_conductor_resume_deploy(task)
Ejemplo n.º 2
0
    def reboot_to_instance(self, task):
        """Boot the node from disk and ask the conductor to resume deploy.

        Sets a persistent 'disk' boot device, runs
        ``reboot_and_finish_deploy``, cleans up the ramdisk through the boot
        interface, then moves the task to 'wait' and notifies the conductor.

        :param task: a TaskManager instance with the node to act on
        """
        deployed_node = task.node
        LOG.info('Ansible complete deploy on node %s', deployed_node.uuid)
        LOG.debug('Rebooting node %s to instance', deployed_node.uuid)

        manager_utils.node_set_boot_device(task, 'disk', persistent=True)
        self.reboot_and_finish_deploy(task)
        task.driver.boot.clean_up_ramdisk(task)

        # TODO(dtantsur): remove these two calls when this function becomes a
        # real deploy step.
        task.process_event('wait')
        manager_utils.notify_conductor_resume_deploy(task)
Ejemplo n.º 3
0
    def _resume_current_operation(self, task):
        """Notify the conductor to continue cleaning or deployment.

        Asynchronous operations need the conductor manager to be told when
        a job has finished so it can carry on with the cleaning/deployment.
        That is done through an RPC call wrapped by the
        notify_conductor_resume_* helpers. Cleaning is resumed when the node
        has a clean step set; otherwise deployment is resumed.

        :param task: a TaskManager instance with node to act on
        """
        if not task.node.clean_step:
            manager_utils.notify_conductor_resume_deploy(task)
            return
        manager_utils.notify_conductor_resume_clean(task)
Ejemplo n.º 4
0
    def reboot_to_instance(self, task):
        """Boot the node from disk and conclude or resume the deployment.

        Sets a persistent 'disk' boot device, runs
        ``reboot_and_finish_deploy`` and cleans up the ramdisk through the
        boot interface. When the node has a deploy step set the conductor is
        notified to resume the deploy; otherwise the task is sent the 'done'
        event directly.

        :param task: a TaskManager instance with the node to act on
        """
        deployed_node = task.node
        LOG.info('Ansible complete deploy on node %s', deployed_node.uuid)
        LOG.debug('Rebooting node %s to instance', deployed_node.uuid)

        manager_utils.node_set_boot_device(task, 'disk', persistent=True)
        self.reboot_and_finish_deploy(task)
        task.driver.boot.clean_up_ramdisk(task)

        if deployed_node.deploy_step:
            manager_utils.notify_conductor_resume_deploy(task)
            return

        # TODO(rloo): delete this legacy path after deprecation period, when
        # we expect all (out-of-tree) drivers to support deploy steps.
        # After which we will always notify_conductor_resume_deploy().
        task.process_event('done')
        LOG.info('Deployment to node %s done', task.node.uuid)
Ejemplo n.º 5
0
    def reboot_to_instance(self, task):
        """Switch the node to disk boot, reboot it and wrap up the deploy.

        After setting a persistent 'disk' boot device, running
        ``reboot_and_finish_deploy`` and cleaning up the ramdisk, the method
        either notifies the conductor to resume the deploy (node has a
        deploy step) or sends the 'done' event to the task itself.

        :param task: a TaskManager instance with the node to act on
        """
        target = task.node
        LOG.info('Ansible complete deploy on node %s', target.uuid)
        LOG.debug('Rebooting node %s to instance', target.uuid)

        manager_utils.node_set_boot_device(task, 'disk', persistent=True)
        self.reboot_and_finish_deploy(task)
        task.driver.boot.clean_up_ramdisk(task)

        uses_deploy_steps = bool(target.deploy_step)
        if uses_deploy_steps:
            manager_utils.notify_conductor_resume_deploy(task)
        else:
            # TODO(rloo): delete this branch after deprecation period, when
            # we expect all (out-of-tree) drivers to support deploy steps.
            # After which we will always notify_conductor_resume_deploy().
            task.process_event('done')
            LOG.info('Deployment to node %s done', task.node.uuid)
Ejemplo n.º 6
0
    def reboot_and_finish_deploy(self, task):
        """Power the node off, switch its networks and boot it again.

        The node is powered off either in-band (agent ``power_off`` command,
        with ``node_power_action`` as the fallback when that fails) or
        out-of-band (agent ``sync`` command followed by
        ``node_power_action``), depending on the ``deploy_forces_oob_reboot``
        driver_info flag. Then the provisioning network is removed, tenant
        networks are configured and the node is powered on. Finally the
        deploy is marked as done, or the conductor is notified to resume it
        when the node has a deploy step set.

        :param task: a TaskManager object containing the node
        :raises: InstanceDeployFailure, if node reboot failed.
        """
        retry_interval_ms = (
            CONF.agent.post_deploy_get_power_state_retry_interval * 1000)
        max_attempts = CONF.agent.post_deploy_get_power_state_retries + 1

        def _not_powered_off(state):
            return state != states.POWER_OFF

        # Poll the power state until it reads POWER_OFF or attempts run out.
        @retrying.retry(
            stop_max_attempt_number=max_attempts,
            retry_on_result=_not_powered_off,
            wait_fixed=retry_interval_ms
        )
        def _wait_until_powered_off(task):
            return task.driver.power.get_power_state(task)

        node = task.node

        if CONF.agent.deploy_logs_collect == 'always':
            driver_utils.collect_ramdisk_logs(node)

        # True when ironic should power off the node out-of-band rather than
        # through the in-band agent command.
        force_oob = strutils.bool_from_string(
            node.driver_info.get('deploy_forces_oob_reboot', False))

        try:
            if force_oob:
                # Flush the file system prior to hard rebooting the node
                sync_result = self._client.sync(node)
                fault = sync_result.get('faultstring')
                if fault:
                    if 'Unknown command' in fault:
                        fault = _('The version of the IPA ramdisk used in '
                                  'the deployment do not support the '
                                  'command "sync"')
                    LOG.warning(
                        'Failed to flush the file system prior to hard '
                        'rebooting the node %(node)s. Error: %(error)s',
                        {'node': node.uuid, 'error': fault})

                manager_utils.node_power_action(task, states.POWER_OFF)
            else:
                try:
                    self._client.power_off(node)
                    _wait_until_powered_off(task)
                except Exception as soft_err:
                    timeout_s = (retry_interval_ms * (max_attempts - 1)) / 1000
                    # Only attach a traceback for non-ironic exceptions.
                    unexpected = not isinstance(
                        soft_err, exception.IronicException)
                    LOG.warning('Failed to soft power off node %(node_uuid)s '
                                'in at least %(timeout)d seconds. '
                                '%(cls)s: %(error)s',
                                {'node_uuid': node.uuid,
                                 'timeout': timeout_s,
                                 'cls': soft_err.__class__.__name__,
                                 'error': soft_err},
                                exc_info=unexpected)
                    manager_utils.node_power_action(task, states.POWER_OFF)
        except Exception as exc:
            details = {'node': node.uuid, 'cls': exc.__class__.__name__,
                       'error': exc}
            msg = (_('Error rebooting node %(node)s after deploy. '
                     '%(cls)s: %(error)s') % details)
            log_and_raise_deployment_error(task, msg, exc=exc)

        try:
            saved_power_state = manager_utils.power_on_node_if_needed(task)
            network = task.driver.network
            network.remove_provisioning_network(task)
            network.configure_tenant_networks(task)
            manager_utils.restore_power_state_if_needed(
                task, saved_power_state)
            manager_utils.node_power_action(task, states.POWER_ON)
        except Exception as exc:
            details = {'node': node.uuid, 'cls': exc.__class__.__name__,
                       'error': exc}
            msg = (_('Error rebooting node %(node)s after deploy. '
                     '%(cls)s: %(error)s') % details)
            # NOTE(mgoddard): Don't collect logs since the node has been
            # powered off.
            log_and_raise_deployment_error(task, msg, collect_logs=False,
                                           exc=exc)

        if node.deploy_step:
            manager_utils.notify_conductor_resume_deploy(task)
            return

        # TODO(rloo): delete this legacy path after deprecation period, when
        # we expect all (out-of-tree) drivers to support deploy steps.
        # After which we will always notify_conductor_resume_deploy().
        task.process_event('done')
        LOG.info('Deployment to node %s done', task.node.uuid)
Ejemplo n.º 7
0
    def heartbeat(self, task, callback_url, agent_version):
        """Process a heartbeat.

        The heartbeat is ignored (with an error log) when the node's
        provision state is not in ``heartbeat_allowed_states`` and the node
        is not fast-track able, and skipped (with a warning) when the task
        lock cannot be upgraded because the node is locked. Otherwise the
        agent URL, agent version and heartbeat time are stored in the node's
        ``driver_internal_info`` and the node is saved. Further handling
        depends on the provision state (DEPLOYWAIT, CLEANWAIT or
        RESCUEWAIT); any exception raised there is logged and passed to the
        error handler matching the node's provision state.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        :param agent_version: The version of the agent that is heartbeating
        """
        # NOTE(pas-ha) immediately skip the rest if nothing to do
        if (task.node.provision_state not in self.heartbeat_allowed_states
                and not manager_utils.fast_track_able(task)):
            LOG.error(
                'Heartbeat from node %(node)s in unsupported '
                'provision state %(state)s, not taking any action.', {
                    'node': task.node.uuid,
                    'state': task.node.provision_state
                })
            return

        try:
            task.upgrade_lock()
        except exception.NodeLocked:
            LOG.warning(
                'Node %s is currently locked, skipping heartbeat '
                'processing (will retry on the next heartbeat)',
                task.node.uuid)
            return

        node = task.node
        LOG.debug('Heartbeat from node %s', node.uuid)
        driver_internal_info = node.driver_internal_info
        driver_internal_info['agent_url'] = callback_url
        driver_internal_info['agent_version'] = agent_version
        # Record the last heartbeat event time in UTC, so we can make
        # decisions about it later. The value is written with isoformat(),
        # i.e. with a "T" between date and time, so it can be decoded to a
        # datetime object with:
        # datetime.datetime.strptime(var, "%Y-%m-%dT%H:%M:%S.%f")
        # NOTE: isoformat() omits the ".%f" part when microseconds are 0.
        driver_internal_info['agent_last_heartbeat'] = str(
            timeutils.utcnow().isoformat())
        node.driver_internal_info = driver_internal_info
        node.save()

        if node.provision_state in _HEARTBEAT_RECORD_ONLY:
            # We shouldn't take any additional action. The agent will
            # silently continue to heartbeat to ironic until user initiated
            # state change occurs causing it to match a state below.
            LOG.debug(
                'Heartbeat from %(node)s recorded to identify the '
                'node as on-line.', {'node': task.node.uuid})
            return

        if node.maintenance:
            return self._heartbeat_in_maintenance(task)

        # Async call backs don't set error state on their own
        # TODO(jimrollenhagen) improve error messages here
        # ``msg`` is updated before each operation below so that the except
        # clause can report which one failed.
        msg = _('Failed checking if deploy is done.')
        try:
            # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
            # are currently in the core deploy.deploy step. Other deploy steps
            # may cause the agent to boot, but we should not trigger deployment
            # at that point if the driver is polling for completion of a step.
            if node.provision_state == states.DEPLOYWAIT:
                if self.in_core_deploy_step(task):
                    if not self.deploy_has_started(task):
                        msg = _('Node failed to deploy.')
                        self.continue_deploy(task)
                    elif self.deploy_is_done(task):
                        msg = _('Node failed to move to active state.')
                        self.reboot_to_instance(task)
                    else:
                        node.touch_provisioning()
                else:
                    # The exceptions from RPC are not possible as we using cast
                    # here
                    # Check if the driver is polling for completion of a step,
                    # via the 'deployment_polling' flag.
                    polling = node.driver_internal_info.get(
                        'deployment_polling', False)
                    if not polling:
                        manager_utils.notify_conductor_resume_deploy(task)
                    node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                if not node.clean_step:
                    LOG.debug('Node %s just booted to start cleaning.',
                              node.uuid)
                    msg = _('Node failed to start the first cleaning step.')
                    # First, cache the clean steps
                    self.refresh_clean_steps(task)
                    # Then set/verify node clean steps and start cleaning
                    conductor_steps.set_node_cleaning_steps(task)
                    # The exceptions from RPC are not possible as we using cast
                    # here
                    manager_utils.notify_conductor_resume_clean(task)
                else:
                    msg = _('Node failed to check cleaning progress.')
                    # Check if the driver is polling for completion of a step,
                    # via the 'cleaning_polling' flag.
                    polling = node.driver_internal_info.get(
                        'cleaning_polling', False)
                    if not polling:
                        self.continue_cleaning(task)
            elif (node.provision_state == states.RESCUEWAIT):
                msg = _('Node failed to perform rescue operation.')
                self._finalize_rescue(task)
        except Exception as e:
            err_info = {'msg': msg, 'e': e}
            last_error = _('Asynchronous exception: %(msg)s '
                           'Exception: %(e)s for node') % err_info
            # last_error already contains the interpolated exception text, so
            # it must be passed as a logging argument rather than reused as
            # the format string: a literal '%' in the exception text would
            # otherwise make the logging call fail to format the record.
            LOG.exception('%(error)s %(node)s',
                          {'error': last_error, 'node': node.uuid})
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(task,
                                              last_error,
                                              collect_logs=bool(self._client))
            elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
                manager_utils.rescuing_error_handler(task, last_error)
Ejemplo n.º 8
0
    def reboot_and_finish_deploy(self, task):
        """Helper that reboots the node out of the ramdisk and ends deploy.

        Steps, in order: optionally collect ramdisk logs; power the node off
        (in-band through the agent ``power_off`` command with
        ``node_power_action`` as the fallback, or, when the
        ``deploy_forces_oob_reboot`` driver_info flag is set, agent ``sync``
        followed by ``node_power_action``); remove the provisioning network
        and configure tenant networks; power the node on; then mark the
        deploy as done or, when the node has a deploy step set, notify the
        conductor to resume it.

        :param task: a TaskManager object containing the node
        :raises: InstanceDeployFailure, if node reboot failed.
        """
        poll_wait_ms = (
            CONF.agent.post_deploy_get_power_state_retry_interval * 1000)
        poll_attempts = CONF.agent.post_deploy_get_power_state_retries + 1

        def _needs_retry(power_state):
            return power_state != states.POWER_OFF

        # Re-read the power state until POWER_OFF or attempts are exhausted.
        @retrying.retry(
            stop_max_attempt_number=poll_attempts,
            retry_on_result=_needs_retry,
            wait_fixed=poll_wait_ms
        )
        def _wait_until_powered_off(task):
            return task.driver.power.get_power_state(task)

        node = task.node

        if CONF.agent.deploy_logs_collect == 'always':
            driver_utils.collect_ramdisk_logs(node)

        # Decide between out-of-band and in-band power off of the node.
        use_oob = strutils.bool_from_string(
            node.driver_info.get('deploy_forces_oob_reboot', False))

        try:
            if use_oob:
                # Flush the file system prior to hard rebooting the node
                response = self._client.sync(node)
                sync_error = response.get('faultstring')
                if sync_error:
                    if 'Unknown command' in sync_error:
                        sync_error = _('The version of the IPA ramdisk used '
                                       'in the deployment do not support the '
                                       'command "sync"')
                    LOG.warning(
                        'Failed to flush the file system prior to hard '
                        'rebooting the node %(node)s. Error: %(error)s',
                        {'node': node.uuid, 'error': sync_error})

                manager_utils.node_power_action(task, states.POWER_OFF)
            else:
                try:
                    self._client.power_off(node)
                    _wait_until_powered_off(task)
                except Exception as err:
                    waited_s = (poll_wait_ms * (poll_attempts - 1)) / 1000
                    warn_args = {'node_uuid': node.uuid,
                                 'timeout': waited_s,
                                 'cls': err.__class__.__name__,
                                 'error': err}
                    LOG.warning('Failed to soft power off node %(node_uuid)s '
                                'in at least %(timeout)d seconds. '
                                '%(cls)s: %(error)s',
                                warn_args,
                                exc_info=not isinstance(
                                    err, exception.IronicException))
                    manager_utils.node_power_action(task, states.POWER_OFF)
        except Exception as err:
            msg = (_('Error rebooting node %(node)s after deploy. '
                     '%(cls)s: %(error)s') %
                   {'node': node.uuid, 'cls': err.__class__.__name__,
                    'error': err})
            log_and_raise_deployment_error(task, msg, exc=err)

        try:
            previous_power = manager_utils.power_on_node_if_needed(task)
            task.driver.network.remove_provisioning_network(task)
            task.driver.network.configure_tenant_networks(task)
            manager_utils.restore_power_state_if_needed(task, previous_power)
            manager_utils.node_power_action(task, states.POWER_ON)
        except Exception as err:
            msg = (_('Error rebooting node %(node)s after deploy. '
                     '%(cls)s: %(error)s') %
                   {'node': node.uuid, 'cls': err.__class__.__name__,
                    'error': err})
            # NOTE(mgoddard): Don't collect logs since the node has been
            # powered off.
            log_and_raise_deployment_error(task, msg, collect_logs=False,
                                           exc=err)

        if node.deploy_step:
            manager_utils.notify_conductor_resume_deploy(task)
        else:
            # TODO(rloo): delete this branch after deprecation period, when
            # we expect all (out-of-tree) drivers to support deploy steps.
            # After which we will always notify_conductor_resume_deploy().
            task.process_event('done')
            LOG.info('Deployment to node %s done', task.node.uuid)
Ejemplo n.º 9
0
 def _set_success(self, task):
     """Notify the conductor to resume the node's current operation.

     Cleaning is resumed when the node has a clean step set; otherwise
     deployment is resumed.

     :param task: a TaskManager instance with the node to act on
     """
     resume = (manager_utils.notify_conductor_resume_clean
               if task.node.clean_step
               else manager_utils.notify_conductor_resume_deploy)
     resume(task)