Example no. 1
    def _initiate_cleaning(self, task):
        """Initiates the steps required to start cleaning for the node.

        This method polls each interface of the driver to get the
        clean steps and notifies the Ironic conductor to resume cleaning.
        On error, it sets the node to CLEANFAIL state and populates
        node.last_error with the error message.

        :param task: a TaskManager instance containing the node to act on.
        """
        LOG.warning(
            _LW(
                "Bash deploy ramdisk doesn't support in-band cleaning. "
                "Please use the ironic-python-agent (IPA) ramdisk "
                "instead for node %s. "
            ),
            task.node.uuid,
        )
        try:
            manager_utils.set_node_cleaning_steps(task)
            self.notify_conductor_resume_clean(task)
        except Exception as e:
            last_error = _(
                "Encountered exception for node %(node)s "
                "while initiating cleaning. Error: %(error)s"
            ) % {"node": task.node.uuid, "error": e}
            return manager_utils.cleaning_error_handler(task, last_error)
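A hedged test sketch for the error path described in the docstring above: if set_node_cleaning_steps raises, the exception is swallowed and the formatted message is handed to cleaning_error_handler. The patch targets, the vendor attribute (self.driver.vendor) and the fixture names are assumptions; only the control flow comes from the example.

    # Hedged sketch: patch targets and fixture names are hypothetical.
    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
    @mock.patch.object(manager_utils, 'set_node_cleaning_steps', autospec=True)
    def test_initiate_cleaning_error(self, mock_set_steps, mock_error_handler):
        mock_set_steps.side_effect = RuntimeError('boom')
        with task_manager.acquire(self.context, self.node.uuid,
                                  shared=False) as task:
            self.driver.vendor._initiate_cleaning(task)
            # The error handler receives the task and the formatted message.
            mock_error_handler.assert_called_once_with(task, mock.ANY)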
Example no. 2
    def test_set_node_cleaning_steps(self, mock_steps):
        mock_steps.return_value = self.clean_steps

        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None,
            clean_step=None)

        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            conductor_utils.set_node_cleaning_steps(task)
            node.refresh()
            self.assertEqual(self.clean_steps,
                             task.node.driver_internal_info['clean_steps'])
            self.assertEqual({}, node.clean_step)
Example no. 3
    def prepare_cleaning(self, task):
        """Boot into the ramdisk to prepare for cleaning.

        :param task: a TaskManager object containing the node
        :raises NodeCleaningFailure: if the previous cleaning ports cannot
                be removed or if new cleaning ports cannot be created
        :returns: None or states.CLEANWAIT for async prepare.
        """
        node = task.node
        manager_utils.set_node_cleaning_steps(task)
        if not node.driver_internal_info['clean_steps']:
            # no clean steps configured, nothing to do.
            return
        task.driver.network.add_cleaning_network(task)
        boot_opt = deploy_utils.build_agent_options(node)
        task.driver.boot.prepare_ramdisk(task, boot_opt)
        manager_utils.node_power_action(task, states.REBOOT)
        return states.CLEANWAIT
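The docstring's return contract (None for a synchronous prepare, states.CLEANWAIT for an asynchronous one) is what the caller branches on. A minimal sketch of such a caller, assuming a hypothetical helper _execute_clean_steps and the standard TaskManager 'wait' event; the real conductor code is not part of these examples.

    # Hedged caller sketch; _execute_clean_steps is hypothetical.
    def _start_cleaning(task):
        result = task.driver.deploy.prepare_cleaning(task)
        if result == states.CLEANWAIT:
            # Async prepare: the node reboots into the ramdisk and cleaning
            # resumes when the agent heartbeats back.
            task.process_event('wait')
            return
        # Sync prepare (or no clean steps at all): continue immediately.
        _execute_clean_steps(task)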
Example no. 4
    def test_set_node_cleaning_steps_automated(self, mock_steps, mock_validate_user_steps):
        mock_steps.return_value = self.clean_steps

        node = obj_utils.create_test_node(
            self.context,
            driver="fake",
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None,
            clean_step=None,
        )

        with task_manager.acquire(self.context, node.uuid, shared=False) as task:
            conductor_utils.set_node_cleaning_steps(task)
            node.refresh()
            self.assertEqual(self.clean_steps, node.driver_internal_info["clean_steps"])
            self.assertEqual({}, node.clean_step)
            mock_steps.assert_called_once_with(task, enabled=True)
            self.assertFalse(mock_validate_user_steps.called)
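The mock_steps and mock_validate_user_steps parameters imply two mock.patch decorators that the snippet does not show. A plausible reconstruction is below; the patched attribute names inside conductor_utils are assumptions, while the ordering (the bottom decorator feeds the first mock argument) and the (task, enabled=True) call signature follow from the assertions in the test.

    # Hedged sketch: the patched attribute names are assumptions.
    @mock.patch.object(conductor_utils, '_validate_user_clean_steps',
                       autospec=True)
    @mock.patch.object(conductor_utils, '_get_cleaning_steps', autospec=True)
    def test_set_node_cleaning_steps_automated(self, mock_steps,
                                               mock_validate_user_steps):
        ...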
Example no. 5
    def test_set_node_cleaning_steps_automated(self, mock_steps,
                                               mock_validate_user_steps):
        mock_steps.return_value = self.clean_steps

        node = obj_utils.create_test_node(
            self.context,
            driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None,
            clean_step=None)

        with task_manager.acquire(self.context, node.uuid,
                                  shared=False) as task:
            conductor_utils.set_node_cleaning_steps(task)
            node.refresh()
            self.assertEqual(self.clean_steps,
                             node.driver_internal_info['clean_steps'])
            self.assertEqual({}, node.clean_step)
            mock_steps.assert_called_once_with(task, enabled=True)
            self.assertFalse(mock_validate_user_steps.called)
Example no. 6
    def test_set_node_cleaning_steps_manual(self, mock_steps,
                                            mock_validate_user_steps):
        clean_steps = [self.deploy_raid]
        mock_steps.return_value = self.clean_steps

        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.MANAGEABLE,
            last_error=None,
            clean_step=None,
            driver_internal_info={'clean_steps': clean_steps})

        with task_manager.acquire(
                self.context, node.uuid, shared=False) as task:
            conductor_utils.set_node_cleaning_steps(task)
            node.refresh()
            self.assertEqual(clean_steps,
                             task.node.driver_internal_info['clean_steps'])
            self.assertEqual({}, node.clean_step)
            self.assertFalse(mock_steps.called)
            mock_validate_user_steps.assert_called_once_with(task, clean_steps)
Example no. 7
    def prepare_cleaning(self, task):
        """Boot into the ramdisk to prepare for cleaning.

        :param task: a TaskManager object containing the node
        :raises NodeCleaningFailure: if the previous cleaning ports cannot
                be removed or if new cleaning ports cannot be created
        :returns: None or states.CLEANWAIT for async prepare.
        """
        node = task.node
        use_callback = CONF.ansible.use_ramdisk_callback
        if use_callback:
            manager_utils.set_node_cleaning_steps(task)
            if not node.driver_internal_info['clean_steps']:
                # no clean steps configured, nothing to do.
                return
        task.driver.network.add_cleaning_network(task)
        boot_opt = deploy_utils.build_agent_options(node)
        task.driver.boot.prepare_ramdisk(task, boot_opt)
        manager_utils.node_power_action(task, states.REBOOT)
        if use_callback:
            return states.CLEANWAIT

        ip_addr = _get_node_ip(task)
        LOG.debug('IP of node %(node)s is %(ip)s', {
            'node': node.uuid,
            'ip': ip_addr
        })
        driver_internal_info = node.driver_internal_info
        driver_internal_info['ansible_cleaning_ip'] = ip_addr
        node.driver_internal_info = driver_internal_info
        node.save()
        playbook, user, key = _parse_ansible_driver_info(task.node,
                                                         action='clean')
        node_list = [(node.uuid, ip_addr, user, node.extra)]
        extra_vars = _prepare_extra_vars(node_list)

        LOG.debug('Waiting ramdisk on node %s for cleaning', node.uuid)
        _run_playbook(playbook, extra_vars, key, tags=['wait'])
        LOG.info(_LI('Node %s is ready for cleaning'), node.uuid)
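CONF.ansible.use_ramdisk_callback selects between the heartbeat-driven flow (return states.CLEANWAIT) and the synchronous, playbook-driven flow after it. A sketch of how such an option could be registered with oslo.config; the default value and help text are assumptions, only the option name and group come from the example.

    # Hedged sketch: default and help text are assumptions.
    from oslo_config import cfg

    CONF = cfg.CONF
    CONF.register_opts(
        [cfg.BoolOpt('use_ramdisk_callback',
                     default=True,
                     help='If True, rely on the ramdisk agent heartbeat and '
                          'return CLEANWAIT; if False, wait for the ramdisk '
                          'directly and run cleaning synchronously.')],
        group='ansible')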
Example no. 8
    def _initiate_cleaning(self, task):
        """Initiates the steps required to start cleaning for the node.

        This method polls each interface of the driver to get the
        clean steps and notifies the Ironic conductor to resume cleaning.
        On error, it sets the node to CLEANFAIL state and populates
        node.last_error with the error message.

        :param task: a TaskManager instance containing the node to act on.
        """
        LOG.warning(
            _LW("Bash deploy ramdisk doesn't support in-band cleaning. "
                "Please use the ironic-python-agent (IPA) ramdisk "
                "instead for node %s. "), task.node.uuid)
        try:
            manager_utils.set_node_cleaning_steps(task)
            self.notify_conductor_resume_clean(task)
        except Exception as e:
            last_error = (
                _('Encountered exception for node %(node)s '
                  'while initiating cleaning. Error: %(error)s') %
                {'node': task.node.uuid, 'error': e})
            return manager_utils.cleaning_error_handler(task, last_error)
Example no. 9
    def test_set_node_cleaning_steps_manual(self, mock_steps,
                                            mock_validate_user_steps):
        clean_steps = [self.deploy_raid]
        mock_steps.return_value = self.clean_steps

        node = obj_utils.create_test_node(
            self.context,
            driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.MANAGEABLE,
            last_error=None,
            clean_step=None,
            driver_internal_info={'clean_steps': clean_steps})

        with task_manager.acquire(self.context, node.uuid,
                                  shared=False) as task:
            conductor_utils.set_node_cleaning_steps(task)
            node.refresh()
            self.assertEqual(clean_steps,
                             node.driver_internal_info['clean_steps'])
            self.assertEqual({}, node.clean_step)
            self.assertFalse(mock_steps.called)
            mock_validate_user_steps.assert_called_once_with(task, clean_steps)
Example no. 10
    def prepare_cleaning(self, task):
        """Boot into the ramdisk to prepare for cleaning.

        :param task: a TaskManager object containing the node
        :raises NodeCleaningFailure: if the previous cleaning ports cannot
                be removed or if new cleaning ports cannot be created
        :returns: None or states.CLEANWAIT for async prepare.
        """
        node = task.node
        use_callback = CONF.ansible.use_ramdisk_callback
        if use_callback:
            manager_utils.set_node_cleaning_steps(task)
            if not node.driver_internal_info['clean_steps']:
                # no clean steps configured, nothing to do.
                return
        task.driver.network.add_cleaning_network(task)
        boot_opt = deploy_utils.build_agent_options(node)
        task.driver.boot.prepare_ramdisk(task, boot_opt)
        manager_utils.node_power_action(task, states.REBOOT)
        if use_callback:
            return states.CLEANWAIT

        ip_addr = _get_node_ip_dhcp(task)
        LOG.debug('IP of node %(node)s is %(ip)s',
                  {'node': node.uuid, 'ip': ip_addr})
        driver_internal_info = node.driver_internal_info
        driver_internal_info['ansible_cleaning_ip'] = ip_addr
        node.driver_internal_info = driver_internal_info
        node.save()
        playbook, user, key = _parse_ansible_driver_info(
            task.node, action='clean')
        node_list = [(node.uuid, ip_addr, user, node.extra)]
        extra_vars = _prepare_extra_vars(node_list)

        LOG.debug('Waiting ramdisk on node %s for cleaning', node.uuid)
        _run_playbook(playbook, extra_vars, key, tags=['wait'])
        LOG.info('Node %s is ready for cleaning', node.uuid)
Example no. 11
    def continue_cleaning(self, task, **kwargs):
        """Start the next cleaning step if the previous one is complete.

        In order to avoid errors and make agent upgrades painless, cleaning
        will check the version of all hardware managers during get_clean_steps
        at the beginning of cleaning and before executing each step in the
        agent. If the version has changed between steps, the agent is unable
        to tell if an ordering change will cause a cleaning issue. Therefore,
        we restart cleaning.
        """
        node = task.node
        command = self._get_completed_cleaning_command(task)
        LOG.debug(
            'Cleaning command status for node %(node)s on step %(step)s:'
            ' %(command)s', {
                'node': node.uuid,
                'step': node.clean_step,
                'command': command
            })

        if not command:
            # Command is not done yet
            return

        if command.get('command_status') == 'FAILED':
            msg = (_('Agent returned error for clean step %(step)s on node '
                     '%(node)s : %(err)s.') % {
                         'node': node.uuid,
                         'err': command.get('command_error'),
                         'step': node.clean_step
                     })
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
        elif command.get('command_status') == 'CLEAN_VERSION_MISMATCH':
            # Restart cleaning, agent must have rebooted to new version
            LOG.info(
                _LI('Node %s detected a clean version mismatch, '
                    'resetting clean steps and rebooting the node.'),
                node.uuid)
            try:
                manager_utils.set_node_cleaning_steps(task)
            except exception.NodeCleaningFailure:
                msg = (_('Could not restart cleaning on node %(node)s: '
                         '%(err)s.') % {
                             'node': node.uuid,
                             'err': command.get('command_error'),
                             'step': node.clean_step
                         })
                LOG.exception(msg)
                return manager_utils.cleaning_error_handler(task, msg)
            self.notify_conductor_resume_clean(task)

        elif command.get('command_status') == 'SUCCEEDED':
            clean_step_hook = _get_post_clean_step_hook(node)
            if clean_step_hook is not None:
                LOG.debug(
                    'For node %(node)s, executing post clean step '
                    'hook %(method)s for clean step %(step)s' % {
                        'method': clean_step_hook.__name__,
                        'node': node.uuid,
                        'step': node.clean_step
                    })
                try:
                    clean_step_hook(task, command)
                except Exception as e:
                    msg = (_('For node %(node)s, post clean step hook '
                             '%(method)s failed for clean step %(step)s. '
                             'Error: %(error)s') % {
                                 'method': clean_step_hook.__name__,
                                 'node': node.uuid,
                                 'error': e,
                                 'step': node.clean_step
                             })
                    LOG.exception(msg)
                    return manager_utils.cleaning_error_handler(task, msg)

            LOG.info(
                _LI('Agent on node %s returned cleaning command success, '
                    'moving to next clean step'), node.uuid)
            self.notify_conductor_resume_clean(task)
        else:
            msg = (_('Agent returned unknown status for clean step %(step)s '
                     'on node %(node)s : %(err)s.') % {
                         'node': node.uuid,
                         'err': command.get('command_status'),
                         'step': node.clean_step
                     })
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
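_get_post_clean_step_hook looks up a hook to run after a specific clean step succeeds. A sketch of what registering such a hook might look like; the decorator name and its arguments are inferred from the lookup call and are not confirmed by these examples.

    # Hedged sketch: the registration decorator is hypothetical.
    @post_clean_step_hook(interface='raid', step='create_configuration')
    def _raid_config_hook(task, command):
        # Runs only when the agent reports SUCCEEDED for that step; any
        # exception raised here is routed to cleaning_error_handler, as the
        # example above shows.
        LOG.info('RAID configuration finished for node %s', task.node.uuid)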
Example no. 12
    def heartbeat(self, task, callback_url):
        """Process a heartbeat.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        """
        # NOTE(pas-ha) immediately skip the rest if nothing to do
        if task.node.provision_state not in self.heartbeat_allowed_states:
            LOG.debug('Heartbeat from node %(node)s in unsupported '
                      'provision state %(state)s, not taking any action.',
                      {'node': task.node.uuid,
                       'state': task.node.provision_state})
            return

        task.upgrade_lock()

        node = task.node
        LOG.debug('Heartbeat from node %s', node.uuid)

        driver_internal_info = node.driver_internal_info
        driver_internal_info['agent_url'] = callback_url
        node.driver_internal_info = driver_internal_info
        node.save()

        # Async call backs don't set error state on their own
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                          'not taking any action.', {'node': node.uuid})
                return
            elif (node.provision_state == states.DEPLOYWAIT and
                  not self.deploy_has_started(task)):
                msg = _('Node failed to deploy.')
                self.continue_deploy(task)
            elif (node.provision_state == states.DEPLOYWAIT and
                  self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task)
            elif (node.provision_state == states.DEPLOYWAIT and
                  self.deploy_has_started(task)):
                node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                if not node.clean_step:
                    LOG.debug('Node %s just booted to start cleaning.',
                              node.uuid)
                    msg = _('Node failed to start the first cleaning step.')
                    # First, cache the clean steps
                    self.refresh_clean_steps(task)
                    # Then set/verify node clean steps and start cleaning
                    manager_utils.set_node_cleaning_steps(task)
                    # The exceptions from RPC are not possible as we are
                    # using cast here
                    _notify_conductor_resume_clean(task)
                else:
                    msg = _('Node failed to check cleaning progress.')
                    self.continue_cleaning(task)

        except Exception as e:
            err_info = {'node': node.uuid, 'msg': msg, 'e': e}
            last_error = _('Asynchronous exception for node %(node)s: '
                           '%(msg)s Exception: %(e)s') % err_info
            LOG.exception(last_error)
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(
                    task, last_error, collect_logs=bool(self._client))
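The early return at the top of this heartbeat is driven by heartbeat_allowed_states, which the snippet references but never defines. A plausible class-level definition follows; the exact tuple is an assumption, and this version of the example only demonstrably acts on DEPLOYWAIT and CLEANWAIT.

    # Hedged sketch: the exact state tuple is an assumption.
    heartbeat_allowed_states = (states.DEPLOYWAIT, states.CLEANWAIT)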
Example no. 13
    def continue_cleaning(self, task, **kwargs):
        """Start the next cleaning step if the previous one is complete.

        In order to avoid errors and make agent upgrades painless, cleaning
        will check the version of all hardware managers during get_clean_steps
        at the beginning of cleaning and before executing each step in the
        agent. If the version has changed between steps, the agent is unable
        to tell if an ordering change will cause a cleaning issue. Therefore,
        we restart cleaning.
        """
        node = task.node
        command = self._get_completed_cleaning_command(task)
        LOG.debug(
            "Cleaning command status for node %(node)s on step %(step)s:" " %(command)s",
            {"node": node.uuid, "step": node.clean_step, "command": command},
        )

        if not command:
            # Command is not done yet
            return

        if command.get("command_status") == "FAILED":
            msg = _("Agent returned error for clean step %(step)s on node " "%(node)s : %(err)s.") % {
                "node": node.uuid,
                "err": command.get("command_error"),
                "step": node.clean_step,
            }
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
        elif command.get("command_status") == "CLEAN_VERSION_MISMATCH":
            # Restart cleaning, agent must have rebooted to new version
            LOG.info(
                _LI("Node %s detected a clean version mismatch, " "resetting clean steps and rebooting the node."),
                node.uuid,
            )
            try:
                manager_utils.set_node_cleaning_steps(task)
            except exception.NodeCleaningFailure:
                msg = _("Could not restart cleaning on node %(node)s: " "%(err)s.") % {
                    "node": node.uuid,
                    "err": command.get("command_error"),
                    "step": node.clean_step,
                }
                LOG.exception(msg)
                return manager_utils.cleaning_error_handler(task, msg)
            self.notify_conductor_resume_clean(task)

        elif command.get("command_status") == "SUCCEEDED":
            clean_step_hook = _get_post_clean_step_hook(node)
            if clean_step_hook is not None:
                LOG.debug(
                    "For node %(node)s, executing post clean step "
                    "hook %(method)s for clean step %(step)s"
                    % {"method": clean_step_hook.__name__, "node": node.uuid, "step": node.clean_step}
                )
                try:
                    clean_step_hook(task, command)
                except Exception as e:
                    msg = _(
                        "For node %(node)s, post clean step hook "
                        "%(method)s failed for clean step %(step)s."
                        "Error: %(error)s"
                    ) % {"method": clean_step_hook.__name__, "node": node.uuid, "error": e, "step": node.clean_step}
                    LOG.exception(msg)
                    return manager_utils.cleaning_error_handler(task, msg)

            LOG.info(_LI("Agent on node %s returned cleaning command success, " "moving to next clean step"), node.uuid)
            self.notify_conductor_resume_clean(task)
        else:
            msg = _("Agent returned unknown status for clean step %(step)s " "on node %(node)s : %(err)s.") % {
                "node": node.uuid,
                "err": command.get("command_status"),
                "step": node.clean_step,
            }
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
Example no. 14
    def heartbeat(self, task, **kwargs):
        """Method for agent to periodically check in.

        The agent should be sending its agent_url (so Ironic can talk back)
        as a kwarg. kwargs should have the following format::

         {
             'agent_url': 'http://AGENT_HOST:AGENT_PORT'
         }

        AGENT_PORT defaults to 9999.
        """
        node = task.node
        driver_internal_info = node.driver_internal_info
        LOG.debug(
            'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.',
            {'node': node.uuid,
             'heartbeat': driver_internal_info.get('agent_last_heartbeat')})
        driver_internal_info['agent_last_heartbeat'] = int(time.time())
        try:
            driver_internal_info['agent_url'] = kwargs['agent_url']
        except KeyError:
            raise exception.MissingParameterValue(_('For heartbeat operation, '
                                                    '"agent_url" must be '
                                                    'specified.'))

        node.driver_internal_info = driver_internal_info
        node.save()

        # Async call backs don't set error state on their own
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                          'not taking any action.', {'node': node.uuid})
                return
            elif (node.provision_state == states.DEPLOYWAIT and
                  not self.deploy_has_started(task)):
                msg = _('Node failed to get image for deploy.')
                self.continue_deploy(task, **kwargs)
            elif (node.provision_state == states.DEPLOYWAIT and
                  self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task, **kwargs)
            elif (node.provision_state == states.DEPLOYWAIT and
                  self.deploy_has_started(task)):
                node.touch_provisioning()
            # TODO(lucasagomes): CLEANING here for backwards compat
            # with previous code, otherwise nodes in CLEANING when this
            # is deployed would fail. Should be removed once the Mitaka
            # release starts.
            elif node.provision_state in (states.CLEANWAIT, states.CLEANING):
                node.touch_provisioning()
                try:
                    if not node.clean_step:
                        LOG.debug('Node %s just booted to start cleaning.',
                                  node.uuid)
                        msg = _('Node failed to start the first cleaning '
                                'step.')
                        # First, cache the clean steps
                        self._refresh_clean_steps(task)
                        # Then set/verify node clean steps and start cleaning
                        manager_utils.set_node_cleaning_steps(task)
                        self.notify_conductor_resume_clean(task)
                    else:
                        msg = _('Node failed to check cleaning progress.')
                        self.continue_cleaning(task, **kwargs)
                except exception.NoFreeConductorWorker:
                    # waiting for the next heartbeat, node.last_error and
                    # logging message is filled already via conductor's hook
                    pass

        except Exception as e:
            err_info = {'node': node.uuid, 'msg': msg, 'e': e}
            last_error = _('Asynchronous exception for node %(node)s: '
                           '%(msg)s Exception: %(e)s') % err_info
            LOG.exception(last_error)
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(task, last_error)
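A one-line sketch of a call matching the kwargs format documented in the docstring above; the vendor attribute and the agent address are placeholders.

    # Hedged sketch: placeholder agent address.
    task.driver.vendor.heartbeat(task, agent_url='http://192.0.2.10:9999')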
Example no. 15
    def heartbeat(self, task, callback_url):
        """Process a heartbeat.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        """
        # TODO(dtantsur): upgrade lock only if we actually take action other
        # than updating the last timestamp.
        task.upgrade_lock()

        node = task.node
        driver_internal_info = node.driver_internal_info
        LOG.debug(
            'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.', {
                'node': node.uuid,
                'heartbeat': driver_internal_info.get('agent_last_heartbeat')
            })
        driver_internal_info['agent_last_heartbeat'] = int(time.time())
        try:
            driver_internal_info['agent_url'] = callback_url
        except KeyError:
            raise exception.MissingParameterValue(
                _('For heartbeat operation, '
                  '"agent_url" must be '
                  'specified.'))

        node.driver_internal_info = driver_internal_info
        node.save()

        # Async call backs don't set error state on their own
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug(
                    'Heartbeat from node %(node)s in maintenance mode; '
                    'not taking any action.', {'node': node.uuid})
                return
            elif (node.provision_state == states.DEPLOYWAIT
                  and not self.deploy_has_started(task)):
                msg = _('Node failed to get image for deploy.')
                self.continue_deploy(task)
            elif (node.provision_state == states.DEPLOYWAIT
                  and self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task)
            elif (node.provision_state == states.DEPLOYWAIT
                  and self.deploy_has_started(task)):
                node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                try:
                    if not node.clean_step:
                        LOG.debug('Node %s just booted to start cleaning.',
                                  node.uuid)
                        msg = _('Node failed to start the first cleaning '
                                'step.')
                        # First, cache the clean steps
                        self._refresh_clean_steps(task)
                        # Then set/verify node clean steps and start cleaning
                        manager_utils.set_node_cleaning_steps(task)
                        _notify_conductor_resume_clean(task)
                    else:
                        msg = _('Node failed to check cleaning progress.')
                        self.continue_cleaning(task)
                except exception.NoFreeConductorWorker:
                    # waiting for the next heartbeat, node.last_error and
                    # logging message is filled already via conductor's hook
                    pass

        except Exception as e:
            err_info = {'node': node.uuid, 'msg': msg, 'e': e}
            last_error = _('Asynchronous exception for node %(node)s: '
                           '%(msg)s Exception: %(e)s') % err_info
            LOG.exception(last_error)
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(task, last_error)
Example no. 16
    def continue_cleaning(self, task, **kwargs):
        """Start the next cleaning step if the previous one is complete.

        In order to avoid errors and make agent upgrades painless, the agent
        compares the version of all hardware managers at the start of the
        cleaning (the agent's get_clean_steps() call) and before executing
        each clean step. If the version has changed between steps, the agent is
        unable to tell if an ordering change will cause a cleaning issue so
        it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we restart
        the entire cleaning cycle. For manual cleaning, we don't.
        """
        node = task.node
        # For manual clean, the target provision state is MANAGEABLE, whereas
        # for automated cleaning, it is (the default) AVAILABLE.
        manual_clean = node.target_provision_state == states.MANAGEABLE
        command = self._get_completed_cleaning_command(task)
        LOG.debug('Cleaning command status for node %(node)s on step %(step)s:'
                  ' %(command)s', {'node': node.uuid,
                                   'step': node.clean_step,
                                   'command': command})

        if not command:
            # Command is not done yet
            return

        if command.get('command_status') == 'FAILED':
            msg = (_('Agent returned error for clean step %(step)s on node '
                     '%(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': command.get('command_error'),
                    'step': node.clean_step})
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
        elif command.get('command_status') == 'CLEAN_VERSION_MISMATCH':
            # Cache the new clean steps (and 'hardware_manager_version')
            try:
                self._refresh_clean_steps(task)
            except exception.NodeCleaningFailure as e:
                msg = (_('Could not continue cleaning on node '
                         '%(node)s: %(err)s.') %
                       {'node': node.uuid, 'err': e})
                LOG.exception(msg)
                return manager_utils.cleaning_error_handler(task, msg)

            if manual_clean:
                # Don't restart manual cleaning if agent reboots to a new
                # version. Both are operator actions, unlike automated
                # cleaning. Manual clean steps are not necessarily idempotent
                # like automated clean steps and can be even longer running.
                LOG.info(_LI('During manual cleaning, node %(node)s detected '
                             'a clean version mismatch. Re-executing and '
                             'continuing from current step %(step)s.'),
                         {'node': node.uuid, 'step': node.clean_step})

                driver_internal_info = node.driver_internal_info
                driver_internal_info['skip_current_clean_step'] = False
                node.driver_internal_info = driver_internal_info
                node.save()
            else:
                # Restart cleaning, agent must have rebooted to new version
                LOG.info(_LI('During automated cleaning, node %s detected a '
                             'clean version mismatch. Resetting clean steps '
                             'and rebooting the node.'),
                         node.uuid)
                try:
                    manager_utils.set_node_cleaning_steps(task)
                except exception.NodeCleaningFailure:
                    msg = (_('Could not restart automated cleaning on node '
                             '%(node)s: %(err)s.') %
                           {'node': node.uuid,
                            'err': command.get('command_error'),
                            'step': node.clean_step})
                    LOG.exception(msg)
                    return manager_utils.cleaning_error_handler(task, msg)

            self.notify_conductor_resume_clean(task)

        elif command.get('command_status') == 'SUCCEEDED':
            clean_step_hook = _get_post_clean_step_hook(node)
            if clean_step_hook is not None:
                LOG.debug('For node %(node)s, executing post clean step '
                          'hook %(method)s for clean step %(step)s' %
                          {'method': clean_step_hook.__name__,
                           'node': node.uuid,
                           'step': node.clean_step})
                try:
                    clean_step_hook(task, command)
                except Exception as e:
                    msg = (_('For node %(node)s, post clean step hook '
                             '%(method)s failed for clean step %(step)s. '
                             'Error: %(error)s') %
                           {'method': clean_step_hook.__name__,
                            'node': node.uuid,
                            'error': e,
                            'step': node.clean_step})
                    LOG.exception(msg)
                    return manager_utils.cleaning_error_handler(task, msg)

            LOG.info(_LI('Agent on node %s returned cleaning command success, '
                         'moving to next clean step'), node.uuid)
            self.notify_conductor_resume_clean(task)
        else:
            msg = (_('Agent returned unknown status for clean step %(step)s '
                     'on node %(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': command.get('command_status'),
                    'step': node.clean_step})
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
Example no. 17
    def heartbeat(self, task, callback_url):
        """Process a heartbeat.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        """
        # TODO(dtantsur): upgrade lock only if we actually take action other
        # than updating the last timestamp.
        task.upgrade_lock()

        node = task.node
        LOG.debug('Heartbeat from node %s', node.uuid)

        driver_internal_info = node.driver_internal_info
        driver_internal_info['agent_url'] = callback_url

        # TODO(rloo): 'agent_last_heartbeat' was deprecated since it wasn't
        # being used so remove that entry if it exists.
        # Hopefully all nodes will have been updated by Pike, so
        # we can delete this code then.
        driver_internal_info.pop('agent_last_heartbeat', None)

        node.driver_internal_info = driver_internal_info
        node.save()

        # Async call backs don't set error state on their own
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                          'not taking any action.', {'node': node.uuid})
                return
            elif (node.provision_state == states.DEPLOYWAIT and
                  not self.deploy_has_started(task)):
                msg = _('Node failed to get image for deploy.')
                self.continue_deploy(task)
            elif (node.provision_state == states.DEPLOYWAIT and
                  self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task)
            elif (node.provision_state == states.DEPLOYWAIT and
                  self.deploy_has_started(task)):
                node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                try:
                    if not node.clean_step:
                        LOG.debug('Node %s just booted to start cleaning.',
                                  node.uuid)
                        msg = _('Node failed to start the first cleaning '
                                'step.')
                        # First, cache the clean steps
                        self._refresh_clean_steps(task)
                        # Then set/verify node clean steps and start cleaning
                        manager_utils.set_node_cleaning_steps(task)
                        _notify_conductor_resume_clean(task)
                    else:
                        msg = _('Node failed to check cleaning progress.')
                        self.continue_cleaning(task)
                except exception.NoFreeConductorWorker:
                    # waiting for the next heartbeat, node.last_error and
                    # logging message is filled already via conductor's hook
                    pass

        except Exception as e:
            err_info = {'node': node.uuid, 'msg': msg, 'e': e}
            last_error = _('Asynchronous exception for node %(node)s: '
                           '%(msg)s Exception: %(e)s') % err_info
            LOG.exception(last_error)
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(task, last_error)
Example no. 18
    def heartbeat(self, task, callback_url):
        """Process a heartbeat.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        """
        # TODO(dtantsur): upgrade lock only if we actually take action other
        # than updating the last timestamp.
        task.upgrade_lock()

        node = task.node
        LOG.debug('Heartbeat from node %s', node.uuid)

        driver_internal_info = node.driver_internal_info
        driver_internal_info['agent_url'] = callback_url

        # TODO(rloo): 'agent_last_heartbeat' was deprecated since it wasn't
        # being used so remove that entry if it exists.
        # Hopefully all nodes will have been updated by Pike, so
        # we can delete this code then.
        driver_internal_info.pop('agent_last_heartbeat', None)

        node.driver_internal_info = driver_internal_info
        node.save()

        # Async call backs don't set error state on their own
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug(
                    'Heartbeat from node %(node)s in maintenance mode; '
                    'not taking any action.', {'node': node.uuid})
                return
            elif (node.provision_state == states.DEPLOYWAIT
                  and not self.deploy_has_started(task)):
                msg = _('Node failed to deploy.')
                self.continue_deploy(task)
            elif (node.provision_state == states.DEPLOYWAIT
                  and self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task)
            elif (node.provision_state == states.DEPLOYWAIT
                  and self.deploy_has_started(task)):
                node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                try:
                    if not node.clean_step:
                        LOG.debug('Node %s just booted to start cleaning.',
                                  node.uuid)
                        msg = _('Node failed to start the first cleaning '
                                'step.')
                        # First, cache the clean steps
                        self.refresh_clean_steps(task)
                        # Then set/verify node clean steps and start cleaning
                        manager_utils.set_node_cleaning_steps(task)
                        _notify_conductor_resume_clean(task)
                    else:
                        msg = _('Node failed to check cleaning progress.')
                        self.continue_cleaning(task)
                except exception.NoFreeConductorWorker:
                    # waiting for the next heartbeat, node.last_error and
                    # logging message is filled already via conductor's hook
                    pass

        except Exception as e:
            err_info = {'node': node.uuid, 'msg': msg, 'e': e}
            last_error = _('Asynchronous exception for node %(node)s: '
                           '%(msg)s Exception: %(e)s') % err_info
            LOG.exception(last_error)
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(task,
                                              last_error,
                                              collect_logs=bool(self._client))
Example no. 19
    def continue_cleaning(self, task, **kwargs):
        """Start the next cleaning step if the previous one is complete.

        In order to avoid errors and make agent upgrades painless, cleaning
        will check the version of all hardware managers during get_clean_steps
        at the beginning of cleaning and before executing each step in the
        agent. If the version has changed between steps, the agent is unable
        to tell if an ordering change will cause a cleaning issue. Therefore,
        we restart cleaning.
        """
        node = task.node
        command = self._get_completed_cleaning_command(task)
        LOG.debug('Cleaning command status for node %(node)s on step %(step)s:'
                  ' %(command)s', {'node': node.uuid,
                                   'step': node.clean_step,
                                   'command': command})

        if not command:
            # Command is not done yet
            return

        if command.get('command_status') == 'FAILED':
            msg = (_('Agent returned error for clean step %(step)s on node '
                     '%(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': command.get('command_error'),
                    'step': node.clean_step})
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
        elif command.get('command_status') == 'CLEAN_VERSION_MISMATCH':
            # Restart cleaning, agent must have rebooted to new version
            LOG.info(_LI('Node %s detected a clean version mismatch, '
                         'resetting clean steps and rebooting the node.'),
                     node.uuid)
            try:
                manager_utils.set_node_cleaning_steps(task)
            except exception.NodeCleaningFailure:
                msg = (_('Could not restart cleaning on node %(node)s: '
                         '%(err)s.') %
                       {'node': node.uuid,
                        'err': command.get('command_error'),
                        'step': node.clean_step})
                LOG.exception(msg)
                return manager_utils.cleaning_error_handler(task, msg)
            self.notify_conductor_resume_clean(task)

        elif command.get('command_status') == 'SUCCEEDED':
            clean_step_hook = _get_post_clean_step_hook(node)
            if clean_step_hook is not None:
                LOG.debug('For node %(node)s, executing post clean step '
                          'hook %(method)s for clean step %(step)s' %
                          {'method': clean_step_hook.__name__,
                           'node': node.uuid,
                           'step': node.clean_step})
                try:
                    clean_step_hook(task, command)
                except Exception as e:
                    msg = (_('For node %(node)s, post clean step hook '
                             '%(method)s failed for clean step %(step)s. '
                             'Error: %(error)s') %
                           {'method': clean_step_hook.__name__,
                            'node': node.uuid,
                            'error': e,
                            'step': node.clean_step})
                    LOG.exception(msg)
                    return manager_utils.cleaning_error_handler(task, msg)

            LOG.info(_LI('Agent on node %s returned cleaning command success, '
                         'moving to next clean step'), node.uuid)
            self.notify_conductor_resume_clean(task)
        else:
            msg = (_('Agent returned unknown status for clean step %(step)s '
                     'on node %(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': command.get('command_status'),
                    'step': node.clean_step})
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
Example no. 20
    def heartbeat(self, task, callback_url, agent_version):
        """Process a heartbeat.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        :param agent_version: The version of the agent that is heartbeating
        """
        # NOTE(pas-ha) immediately skip the rest if nothing to do
        if task.node.provision_state not in self.heartbeat_allowed_states:
            LOG.debug(
                'Heartbeat from node %(node)s in unsupported '
                'provision state %(state)s, not taking any action.', {
                    'node': task.node.uuid,
                    'state': task.node.provision_state
                })
            return

        try:
            task.upgrade_lock()
        except exception.NodeLocked:
            LOG.warning(
                'Node %s is currently locked, skipping heartbeat '
                'processing (will retry on the next heartbeat)',
                task.node.uuid)
            return

        node = task.node
        LOG.debug('Heartbeat from node %s', node.uuid)

        driver_internal_info = node.driver_internal_info
        driver_internal_info['agent_url'] = callback_url
        driver_internal_info['agent_version'] = agent_version
        node.driver_internal_info = driver_internal_info
        node.save()

        # Async call backs don't set error state on their own
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug(
                    'Heartbeat from node %(node)s in maintenance mode; '
                    'not taking any action.', {'node': node.uuid})
                return
            elif (node.provision_state == states.DEPLOYWAIT
                  and not self.deploy_has_started(task)):
                msg = _('Node failed to deploy.')
                self.continue_deploy(task)
            elif (node.provision_state == states.DEPLOYWAIT
                  and self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task)
            elif (node.provision_state == states.DEPLOYWAIT
                  and self.deploy_has_started(task)):
                node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                if not node.clean_step:
                    LOG.debug('Node %s just booted to start cleaning.',
                              node.uuid)
                    msg = _('Node failed to start the first cleaning step.')
                    # First, cache the clean steps
                    self.refresh_clean_steps(task)
                    # Then set/verify node clean steps and start cleaning
                    manager_utils.set_node_cleaning_steps(task)
                    # The exceptions from RPC are not possible as we are
                    # using cast here
                    manager_utils.notify_conductor_resume_clean(task)
                else:
                    msg = _('Node failed to check cleaning progress.')
                    self.continue_cleaning(task)
            elif (node.provision_state == states.RESCUEWAIT):
                msg = _('Node failed to perform rescue operation.')
                self._finalize_rescue(task)
        except Exception as e:
            err_info = {'msg': msg, 'e': e}
            last_error = _('Asynchronous exception: %(msg)s '
                           'Exception: %(e)s for node') % err_info
            errmsg = last_error + ' %(node)s'
            LOG.exception(errmsg, {'node': node.uuid})
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(task,
                                              last_error,
                                              collect_logs=bool(self._client))
            elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
                manager_utils.rescuing_error_handler(task, last_error)