def _cleaning_reboot(self, task):
    """Reboot a node out of band after a clean step that requires it.

    When an agent clean step carries 'reboot_requested': True, the node
    is rebooted once the step completes. On reboot failure the node is
    moved to CLEANFAIL via the cleaning error handler.

    :param task: a TaskManager instance
    """
    node = task.node
    try:
        manager_utils.node_power_action(task, states.REBOOT)
    except Exception as exc:
        msg = (_('Reboot requested by clean step %(step)s failed for '
                 'node %(node)s: %(err)s') %
               {'step': node.clean_step,
                'node': node.uuid,
                'err': exc})
        LOG.error(msg)
        # do not set cleaning_reboot if we didn't reboot
        manager_utils.cleaning_error_handler(task, msg)
        return

    # Record the reboot so cleaning can resume after the node comes back.
    info = node.driver_internal_info
    info['cleaning_reboot'] = True
    node.driver_internal_info = info
    node.save()
Exemple #2
0
def do_node_clean_abort(task, step_name=None):
    """Internal method to abort an ongoing operation.

    :param task: a TaskManager instance with an exclusive lock
    :param step_name: The name of the clean step.
    """
    node = task.node
    try:
        task.driver.deploy.tear_down_cleaning(task)
    except Exception as e:
        LOG.exception('Failed to tear down cleaning for node %(node)s '
                      'after aborting the operation. Error: %(err)s',
                      {'node': node.uuid, 'err': e})
        # Don't recurse into tear-down and don't touch the state machine;
        # the node is left for the caller/operator to deal with.
        utils.cleaning_error_handler(
            task,
            _('Failed to tear down cleaning after aborting '
              'the operation'),
            tear_down_cleaning=False,
            set_fail_state=False)
        return

    info_message = _('Clean operation aborted for node %s') % node.uuid
    last_error = _('By request, the clean operation was aborted')
    if step_name:
        suffix = _(' after the completion of step "%s"') % step_name
        last_error += suffix
        info_message += suffix

    # Record why cleaning ended and drop all bookkeeping about it.
    node.last_error = last_error
    node.clean_step = None
    utils.wipe_cleaning_internal_info(task)
    node.save()
    LOG.info(info_message)
Exemple #3
0
 def _heartbeat_in_maintenance(self, task):
     """Handle a heartbeat from a node that is in maintenance mode.

     Aborts in-progress cleaning, deployment or rescue (unless the
     allow_provisioning_in_maintenance option permits continuing);
     any other provision state is just logged and ignored.

     :param task: a TaskManager instance
     """
     node = task.node
     # Cleaning in maintenance is aborted unless explicitly allowed.
     if (node.provision_state in (states.CLEANING, states.CLEANWAIT)
             and not CONF.conductor.allow_provisioning_in_maintenance):
         LOG.error(
             'Aborting cleaning for node %s, as it is in maintenance '
             'mode', node.uuid)
         last_error = _('Cleaning aborted as node is in maintenance mode')
         manager_utils.cleaning_error_handler(task, last_error)
     elif (node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT)
           and not CONF.conductor.allow_provisioning_in_maintenance):
         LOG.error(
             'Aborting deployment for node %s, as it is in '
             'maintenance mode', node.uuid)
         last_error = _('Deploy aborted as node is in maintenance mode')
         # collect_logs=False: don't try to fetch ramdisk logs from a
         # node that is in maintenance.
         deploy_utils.set_failed_state(task, last_error, collect_logs=False)
     elif (node.provision_state in (states.RESCUING, states.RESCUEWAIT)
           and not CONF.conductor.allow_provisioning_in_maintenance):
         LOG.error(
             'Aborting rescuing for node %s, as it is in '
             'maintenance mode', node.uuid)
         last_error = _('Rescue aborted as node is in maintenance mode')
         manager_utils.rescuing_error_handler(task, last_error)
     else:
         # Nothing to abort in the remaining states; just log it.
         LOG.warning(
             'Heartbeat from node %(node)s in '
             'maintenance mode; not taking any action.',
             {'node': node.uuid})
Exemple #4
0
    def _cleaning_reboot(self, task):
        """Reboot the node out of band after a clean step that requires it.

        When an agent clean step has 'reboot_requested': True, the node is
        rebooted after the step finishes. If the reboot cannot be issued,
        the node is put into CLEANFAIL via the cleaning error handler.

        :param task: a TaskManager instance
        """
        node = task.node
        try:
            manager_utils.node_power_action(task, states.REBOOT)
        except Exception as exc:
            msg = (_('Reboot requested by clean step %(step)s failed for '
                     'node %(node)s: %(err)s') % {'step': node.clean_step,
                                                  'node': node.uuid,
                                                  'err': exc})
            LOG.error(msg)
            # do not set cleaning_reboot if we didn't reboot
            manager_utils.cleaning_error_handler(task, msg)
            return

        # Mark the reboot as done so the next heartbeat resumes cleaning.
        info = node.driver_internal_info
        info['cleaning_reboot'] = True
        node.driver_internal_info = info
        node.save()
def _cleaning_reboot(task):
    """Reboots a node out of band after a clean step that requires it.

    If an agent clean step has 'reboot_requested': True, reboots the
    node when the step is completed. Will put the node in CLEANFAIL
    if the node cannot be rebooted.

    :param task: a TaskManager instance
    """
    try:
        # NOTE(fellypefca): Call prepare_ramdisk to ensure that the
        # baremetal node boots back into the ramdisk after reboot.
        deploy_opts = deploy_utils.build_agent_options(task.node)
        task.driver.boot.prepare_ramdisk(task, deploy_opts)
        manager_utils.node_power_action(task, states.REBOOT)
    except Exception as e:
        msg = (_('Reboot requested by clean step %(step)s failed for '
                 'node %(node)s: %(err)s') %
               {'step': task.node.clean_step,
                'node': task.node.uuid,
                'err': e})
        # Include a traceback only for unexpected (non-Ironic) exceptions.
        LOG.error(msg, exc_info=not isinstance(e, exception.IronicException))
        # do not set cleaning_reboot if we didn't reboot
        manager_utils.cleaning_error_handler(task, msg)
        return

    # Signify that we've rebooted
    driver_internal_info = task.node.driver_internal_info
    driver_internal_info['cleaning_reboot'] = True
    task.node.driver_internal_info = driver_internal_info
    task.node.save()
Exemple #6
0
 def test_cleaning_error_handler_no_fail(self):
     """set_fail_state=False must tear down without firing a 'fail' event."""
     conductor_utils.cleaning_error_handler(self.task,
                                            'foo',
                                            set_fail_state=False)
     driver = self.task.driver.deploy
     # Tear-down still runs even though the state machine is untouched.
     driver.tear_down_cleaning.assert_called_once_with(self.task)
     self.assertFalse(self.task.process_event.called)
def _cleaning_reboot(task):
    """Reboot a node out of band after a clean step that requires it.

    If an agent clean step has 'reboot_requested': True, the node is
    rebooted when the step completes. The node is put in CLEANFAIL when
    the reboot cannot be performed.

    :param task: a TaskManager instance
    """
    node = task.node
    try:
        # NOTE(fellypefca): Call prepare_ramdisk to ensure that the
        # baremetal node boots back into the ramdisk after reboot.
        deploy_opts = deploy_utils.build_agent_options(node)
        task.driver.boot.prepare_ramdisk(task, deploy_opts)
        manager_utils.node_power_action(task, states.REBOOT)
    except Exception as exc:
        msg = (_('Reboot requested by clean step %(step)s failed for '
                 'node %(node)s: %(err)s') %
               {'step': node.clean_step,
                'node': node.uuid,
                'err': exc})
        # Traceback only for unexpected (non-Ironic) exceptions.
        LOG.error(msg,
                  exc_info=not isinstance(exc, exception.IronicException))
        # do not set cleaning_reboot if we didn't reboot
        manager_utils.cleaning_error_handler(task, msg)
        return

    # Record that the reboot happened so cleaning resumes afterwards.
    info = node.driver_internal_info
    info['cleaning_reboot'] = True
    node.driver_internal_info = info
    node.save()
    def execute_clean_step(self, task, step):
        """Execute a clean step.

        :param task: a TaskManager object containing the node
        :param step: a clean step dictionary to execute
        :returns: None
        :raises: NodeCleaningFailure if no cleaning IP address was
            recorded for the node.
        """
        node = task.node
        playbook, user, key = _parse_ansible_driver_info(
            task.node, action='clean')
        stepname = step['step']
        try:
            # The cleaning IP is recorded earlier (at ramdisk callback
            # time); without it there is no host to run the playbook on.
            ip_addr = node.driver_internal_info['ansible_cleaning_ip']
        except KeyError:
            raise exception.NodeCleaningFailure(node=node.uuid,
                                                reason='undefined node IP '
                                                'addresses')
        node_list = [(node.uuid, ip_addr, user, node.extra)]
        extra_vars = _prepare_extra_vars(node_list)

        LOG.debug('Starting cleaning step %(step)s on node %(node)s',
                  {'node': node.uuid, 'step': stepname})
        # Optional ansible tags restrict which plays of the playbook run.
        step_tags = step['args'].get('tags', [])
        try:
            _run_playbook(playbook, extra_vars, key,
                          tags=step_tags)
        except exception.InstanceDeployFailure as e:
            LOG.error(_LE("Ansible failed cleaning step %(step)s "
                          "on node %(node)s."), {
                              'node': node.uuid, 'step': stepname})
            manager_utils.cleaning_error_handler(task, six.text_type(e))
        else:
            LOG.info(_LI('Ansible completed cleaning step %(step)s '
                         'on node %(node)s.'),
                     {'node': node.uuid, 'step': stepname})
Exemple #9
0
 def test_cleaning_error_handler_no_teardown(self):
     """tear_down_cleaning=False must skip tear-down but still fail."""
     target = states.MANAGEABLE
     self.node.target_provision_state = target
     conductor_utils.cleaning_error_handler(self.task, 'foo',
                                            tear_down_cleaning=False)
     # No tear-down, yet the 'fail' event still targets the original state.
     self.assertFalse(self.task.driver.deploy.tear_down_cleaning.called)
     self.task.process_event.assert_called_once_with('fail',
                                                     target_state=target)
Exemple #10
0
 def test_cleaning_error_handler_no_teardown(self):
     """Verify tear-down is skipped while 'fail' is still processed."""
     target = states.MANAGEABLE
     self.node.target_provision_state = target
     conductor_utils.cleaning_error_handler(self.task,
                                            'foo',
                                            tear_down_cleaning=False)
     # Tear-down must not have been invoked.
     self.assertFalse(self.task.driver.deploy.tear_down_cleaning.called)
     self.task.process_event.assert_called_once_with('fail',
                                                     target_state=target)
Exemple #11
0
    def heartbeat(self, task, callback_url):
        """Method for ansible ramdisk callback.

        :param task: a TaskManager instance containing the node
        :param callback_url: URL reported by the ramdisk; only its host
            part is used as the node's address
        """
        node = task.node
        # Extract the bare host/IP from the callback URL (drop the port).
        address = urlparse.urlparse(callback_url).netloc.split(':')[0]

        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug(
                'Heartbeat from node %(node)s in maintenance mode; '
                'not taking any action.', {'node': node.uuid})
        elif node.provision_state == states.DEPLOYWAIT:
            LOG.debug('Heartbeat from %(node)s.', {'node': node.uuid})
            # Deployment requires an exclusive lock on the node.
            self._upgrade_lock(task, purpose='deploy')
            node = task.node
            task.process_event('resume')
            try:
                _deploy(task, address)
            except Exception as e:
                error = _('Deploy failed for node %(node)s: '
                          'Error: %(exc)s') % {
                              'node': node.uuid,
                              'exc': six.text_type(e)
                          }
                LOG.exception(error)
                self._set_failed_state(task, error)

            else:
                LOG.info(_LI('Deployment to node %s done'), node.uuid)
                task.process_event('done')

        elif node.provision_state == states.CLEANWAIT:
            LOG.debug('Node %s just booted to start cleaning.', node.uuid)
            self._upgrade_lock(task, purpose='clean')
            node = task.node
            # Remember the ramdisk address so clean steps can reach it.
            driver_internal_info = node.driver_internal_info
            driver_internal_info['ansible_cleaning_ip'] = address
            node.driver_internal_info = driver_internal_info
            node.save()
            try:
                _notify_conductor_resume_clean(task)
            except Exception as e:
                error = _('cleaning failed for node %(node)s: '
                          'Error: %(exc)s') % {
                              'node': node.uuid,
                              'exc': six.text_type(e)
                          }
                LOG.exception(error)
                manager_utils.cleaning_error_handler(task, error)

        else:
            # Heartbeats are only meaningful in the *WAIT states above.
            LOG.warning(
                _LW('Call back from %(node)s in invalid provision '
                    'state %(state)s'), {
                        'node': node.uuid,
                        'state': node.provision_state
                    })
Exemple #12
0
 def _set_step_failed(self, task, msg, exc):
     """Move the node to a failed state after a RAID configuration job.

     Picks the deploying or cleaning error handler based on the node's
     current provision state.

     :param task: a TaskManager instance with the node to act on
     :param msg: failure message reported by the RAID job
     :param exc: the triggering exception. NOTE(review): not referenced
         in this body -- presumably kept for the caller's signature;
         confirm.
     """
     log_msg = ("RAID configuration job failed for node %(node)s. "
                "Message: '%(message)s'." % {
                    'node': task.node.uuid,
                    'message': msg
                })
     if task.node.provision_state == states.DEPLOYING:
         manager_utils.deploying_error_handler(task, log_msg, errmsg=msg)
     else:
         manager_utils.cleaning_error_handler(task, log_msg, errmsg=msg)
    def heartbeat(self, task, callback_url):
        """Method for ansible ramdisk callback.

        :param task: a TaskManager instance containing the node
        :param callback_url: URL reported by the ramdisk; only its host
            part is kept as the node's address
        """
        node = task.node
        # Strip the port from the callback URL's network location.
        address = urlparse.urlparse(callback_url).netloc.split(':')[0]

        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
        elif node.provision_state == states.DEPLOYWAIT:
            LOG.debug('Heartbeat from %(node)s.', {'node': node.uuid})
            # Deployment requires an exclusive lock on the node.
            self._upgrade_lock(task, purpose='deploy')
            node = task.node
            task.process_event('resume')
            try:
                _deploy(task, address)
            except Exception as e:
                error = _('Deploy failed for node %(node)s: '
                          'Error: %(exc)s') % {'node': node.uuid,
                                               'exc': six.text_type(e)}
                LOG.exception(error)
                self._set_failed_state(task, error)

            else:
                LOG.info(_LI('Deployment to node %s done'), node.uuid)
                task.process_event('done')

        elif node.provision_state == states.CLEANWAIT:
            LOG.debug('Node %s just booted to start cleaning.',
                      node.uuid)
            self._upgrade_lock(task, purpose='clean')
            node = task.node
            # Record the ramdisk address for use by the clean steps.
            driver_internal_info = node.driver_internal_info
            driver_internal_info['ansible_cleaning_ip'] = address
            node.driver_internal_info = driver_internal_info
            node.save()
            try:
                _notify_conductor_resume_clean(task)
            except Exception as e:
                error = _('cleaning failed for node %(node)s: '
                          'Error: %(exc)s') % {'node': node.uuid,
                                               'exc': six.text_type(e)}
                LOG.exception(error)
                manager_utils.cleaning_error_handler(task, error)

        else:
            # Heartbeats are only expected in the *WAIT states above.
            LOG.warning(_LW('Call back from %(node)s in invalid provision '
                            'state %(state)s'),
                        {'node': node.uuid, 'state': node.provision_state})
Exemple #14
0
 def test_cleaning_error_handler(self):
     """Verify the full effect of cleaning_error_handler during CLEANING."""
     self.node.provision_state = states.CLEANING
     target = 'baz'
     self.node.target_provision_state = target
     msg = 'error bar'
     conductor_utils.cleaning_error_handler(self.task, msg)
     self.node.save.assert_called_once_with()
     # clean_step is reset; the message lands in last_error and the
     # maintenance reason, and the node is put into maintenance.
     self.assertEqual({}, self.node.clean_step)
     self.assertEqual(msg, self.node.last_error)
     self.assertTrue(self.node.maintenance)
     self.assertEqual(msg, self.node.maintenance_reason)
     driver = self.task.driver.deploy
     driver.tear_down_cleaning.assert_called_once_with(self.task)
     self.task.process_event.assert_called_once_with('fail',
                                                     target_state=None)
Exemple #15
0
 def test_cleaning_error_handler(self):
     """Check maintenance flag, last_error and 'fail' event on error."""
     self.node.provision_state = states.CLEANING
     target = 'baz'
     self.node.target_provision_state = target
     msg = 'error bar'
     conductor_utils.cleaning_error_handler(self.task, msg)
     self.node.save.assert_called_once_with()
     self.assertEqual({}, self.node.clean_step)
     self.assertEqual(msg, self.node.last_error)
     self.assertTrue(self.node.maintenance)
     self.assertEqual(msg, self.node.maintenance_reason)
     driver = self.task.driver.deploy
     # Tear-down must run and the node transitions via 'fail'.
     driver.tear_down_cleaning.assert_called_once_with(self.task)
     self.task.process_event.assert_called_once_with('fail',
                                                     target_state=None)
Exemple #16
0
    def _set_failed(self, task, error_message):
        """Set the node in failed state by invoking 'fail' event.

        :param task: a TaskManager instance with node to act on
        :param error_message: Error message
        """
        log_msg = ("BIOS configuration failed for node %(node)s. %(error)s " %
                   {'node': task.node.uuid, 'error': error_message})
        # A populated clean_step means the failure happened while cleaning;
        # otherwise it happened during deployment.
        handler = (manager_utils.cleaning_error_handler
                   if task.node.clean_step
                   else manager_utils.deploying_error_handler)
        handler(task, log_msg, error_message)
Exemple #17
0
    def _set_step_failed(self, task, attrs_not_updated):
        """Fail the cleaning or deployment step and log the error.

        :param task: a TaskManager instance containing the node to act on.
        :param attrs_not_updated: the BIOS attributes that were not updated.
        """
        node = task.node
        error_msg = (_('Redfish BIOS apply_configuration step failed for node '
                       '%(node)s. Attributes %(attrs)s are not updated.') %
                     {'node': node.uuid, 'attrs': attrs_not_updated})
        last_error = (_('Redfish BIOS apply_configuration step failed. '
                        'Attributes %(attrs)s are not updated.') %
                      {'attrs': attrs_not_updated})
        state = node.provision_state
        # The two state sets are disjoint, so at most one handler fires.
        if state in (states.CLEANING, states.CLEANWAIT):
            manager_utils.cleaning_error_handler(task, last_error)
        if state in (states.DEPLOYING, states.DEPLOYWAIT):
            manager_utils.deploying_error_handler(task, error_msg, last_error)
Exemple #18
0
 def test_cleaning_error_handler(self):
     """Verify handler effects, including clean_step_index removal."""
     self.node.provision_state = states.CLEANING
     target = "baz"
     self.node.target_provision_state = target
     self.node.driver_internal_info = {}
     msg = "error bar"
     conductor_utils.cleaning_error_handler(self.task, msg)
     self.node.save.assert_called_once_with()
     self.assertEqual({}, self.node.clean_step)
     # The step index must be wiped along with the step itself.
     self.assertFalse("clean_step_index" in self.node.driver_internal_info)
     self.assertEqual(msg, self.node.last_error)
     self.assertTrue(self.node.maintenance)
     self.assertEqual(msg, self.node.maintenance_reason)
     driver = self.task.driver.deploy
     driver.tear_down_cleaning.assert_called_once_with(self.task)
     self.task.process_event.assert_called_once_with("fail", target_state=None)
Exemple #19
0
    def _initiate_cleaning(self, task):
        """Initiates the steps required to start cleaning for the node.

        This method polls each interface of the driver for getting the
        clean steps and notifies Ironic conductor to resume cleaning.
        On error, it sets the node to CLEANFAIL state and populates
        node.last_error with the error message.

        :param task: a TaskManager instance containing the node to act on.
        """
        # This deploy ramdisk cannot run in-band clean steps; warn loudly.
        LOG.warning(
            _LW("Bash deploy ramdisk doesn't support in-band cleaning. "
                "Please use the ironic-python-agent (IPA) ramdisk "
                "instead for node %s. "),
            task.node.uuid)
        try:
            manager_utils.set_node_cleaning_steps(task)
            self.notify_conductor_resume_clean(task)
        except Exception as exc:
            last_error = _(
                "Encountered exception for node %(node)s "
                "while initiating cleaning. Error:  %(error)s"
            ) % {"node": task.node.uuid, "error": exc}
            return manager_utils.cleaning_error_handler(task, last_error)
Exemple #20
0
def do_node_clean_abort(task, step_name=None):
    """Internal method to abort an ongoing operation.

    :param task: a TaskManager instance with an exclusive lock
    :param step_name: The name of the clean step.
    """
    node = task.node
    try:
        task.driver.deploy.tear_down_cleaning(task)
    except Exception as e:
        LOG.exception(
            'Failed to tear down cleaning for node %(node)s '
            'after aborting the operation. Error: %(err)s', {
                'node': node.uuid,
                'err': e
            })
        error_msg = _('Failed to tear down cleaning after aborting '
                      'the operation')
        # Don't tear down again and don't change the state machine here.
        utils.cleaning_error_handler(task,
                                     error_msg,
                                     tear_down_cleaning=False,
                                     set_fail_state=False)
        return

    info_message = _('Clean operation aborted for node %s') % node.uuid
    last_error = _('By request, the clean operation was aborted')
    if step_name:
        msg = _(' after the completion of step "%s"') % step_name
        last_error += msg
        info_message += msg

    node.last_error = last_error
    node.clean_step = None
    info = node.driver_internal_info
    # Clear any leftover metadata about cleaning
    info.pop('clean_step_index', None)
    info.pop('cleaning_reboot', None)
    info.pop('cleaning_polling', None)
    info.pop('skip_current_clean_step', None)
    # The agent connection details and token belong to the ramdisk
    # session that was just aborted, so drop them too.
    info.pop('agent_url', None)
    info.pop('agent_secret_token', None)
    info.pop('agent_secret_token_pregenerated', None)
    node.driver_internal_info = info
    node.save()
    LOG.info(info_message)
Exemple #21
0
def _post_step_reboot(task, step_type):
    """Reboots a node out of band after a clean/deploy step that requires it.

    If an agent step has 'reboot_requested': True, reboots the node when
    the step is completed. Will put the node in CLEANFAIL/DEPLOYFAIL if
    the node cannot be rebooted.

    :param task: a TaskManager instance
    :param step_type: 'clean' or 'deploy'
    """
    current_step = (task.node.clean_step if step_type == 'clean'
                    else task.node.deploy_step)
    try:
        # NOTE(fellypefca): Call prepare_ramdisk to ensure that the
        # baremetal node boots back into the ramdisk after reboot.
        deploy_opts = deploy_utils.build_agent_options(task.node)
        task.driver.boot.prepare_ramdisk(task, deploy_opts)
        manager_utils.node_power_action(task, states.REBOOT)
    except Exception as e:
        msg = (_('Reboot requested by %(type)s step %(step)s failed for '
                 'node %(node)s: %(err)s') %
               {'step': current_step,
                'node': task.node.uuid,
                'err': e,
                'type': step_type})
        # Include a traceback only for unexpected (non-Ironic) exceptions.
        LOG.error(msg, exc_info=not isinstance(e, exception.IronicException))
        # do not set cleaning_reboot if we didn't reboot
        if step_type == 'clean':
            manager_utils.cleaning_error_handler(task, msg)
        else:
            manager_utils.deploying_error_handler(task, msg)
        return

    # Signify that we've rebooted
    driver_internal_info = task.node.driver_internal_info
    field = ('cleaning_reboot' if step_type == 'clean'
             else 'deployment_reboot')
    driver_internal_info[field] = True
    if not driver_internal_info.get('agent_secret_token_pregenerated', False):
        # Wipes out the existing recorded token because the machine will
        # need to re-establish the token.
        driver_internal_info.pop('agent_secret_token', None)
    task.node.driver_internal_info = driver_internal_info
    task.node.save()
Exemple #22
0
    def execute_clean_step(self, task, step):
        """Execute a clean step.

        :param task: a TaskManager object containing the node
        :param step: a clean step dictionary to execute
        :returns: None
        :raises: NodeCleaningFailure if no cleaning IP was recorded for
            the node.
        """
        node = task.node
        playbook, user, key = _parse_ansible_driver_info(task.node,
                                                         action='clean')
        stepname = step['step']
        try:
            # Stored earlier by the ramdisk heartbeat callback.
            ip_addr = node.driver_internal_info['ansible_cleaning_ip']
        except KeyError:
            raise exception.NodeCleaningFailure(node=node.uuid,
                                                reason='undefined node IP '
                                                'addresses')
        node_list = [(node.uuid, ip_addr, user, node.extra)]
        extra_vars = _prepare_extra_vars(node_list)

        LOG.debug('Starting cleaning step %(step)s on node %(node)s', {
            'node': node.uuid,
            'step': stepname
        })
        # Optional ansible tags restrict which plays of the playbook run.
        step_tags = step['args'].get('tags', [])
        try:
            _run_playbook(playbook, extra_vars, key, tags=step_tags)
        except exception.InstanceDeployFailure as e:
            LOG.error(
                _LE("Ansible failed cleaning step %(step)s "
                    "on node %(node)s."), {
                        'node': node.uuid,
                        'step': stepname
                    })
            manager_utils.cleaning_error_handler(task, six.text_type(e))
        else:
            LOG.info(
                _LI('Ansible completed cleaning step %(step)s '
                    'on node %(node)s.'), {
                        'node': node.uuid,
                        'step': stepname
                    })
Exemple #23
0
    def execute_clean_step(self, task, step):
        """Execute a clean step.

        :param task: a TaskManager object containing the node
        :param step: a clean step dictionary to execute
        :returns: None
        """
        node = task.node
        playbook, user, key = _parse_ansible_driver_info(task.node,
                                                         action='clean')
        stepname = step['step']

        address = _get_node_ip(task)

        extra_vars = _prepare_extra_vars(
            [(node.uuid, address, user, node.extra)])

        LOG.debug('Starting cleaning step %(step)s on node %(node)s',
                  {'node': node.uuid, 'step': stepname})
        # Optional ansible tags restrict which plays of the playbook run.
        tags = step['args'].get('tags', [])
        try:
            _run_playbook(node, playbook, extra_vars, key, tags=tags)
        except exception.InstanceDeployFailure as e:
            LOG.error("Ansible failed cleaning step %(step)s "
                      "on node %(node)s.",
                      {'node': node.uuid, 'step': stepname})
            manager_utils.cleaning_error_handler(task, six.text_type(e))
        else:
            LOG.info('Ansible completed cleaning step %(step)s '
                     'on node %(node)s.',
                     {'node': node.uuid, 'step': stepname})
Exemple #24
0
    def one_button_secure_erase(self, task):
        """Erase the whole system securely.

        The One-button secure erase process resets iLO and deletes all licenses
        stored there, resets BIOS settings, and deletes all Active Health
        System (AHS) and warranty data stored on the system. It also erases
        supported non-volatile storage data and deletes any deployment setting
        profiles.

        :param task: a TaskManager instance.
        :returns: states.CLEANWAIT if the erase was initiated successfully;
            implicitly None after an IloError has been routed through the
            cleaning error handler.
        """
        node = task.node
        LOG.info("Calling one button secure erase for node %(node)s",
                 {'node': node.uuid})
        try:
            ilo_object = ilo_common.get_ilo_object(node)
            ilo_object.do_one_button_secure_erase()
            manager_utils.node_power_action(task, states.REBOOT)
            # Keep the node in maintenance while the destructive erase
            # runs so nothing else touches it.
            node.maintenance = True
            node.maintenance_reason = (
                "One Button Secure erase clean step has begun, it will wipe "
                "data from drives and any non-volatile/persistent storage, "
                "reset iLO and delete all licenses stored there, reset BIOS "
                "settings, delete  Active Health System (AHS) and warranty "
                "data stored in the system and delete any deployment settings "
                "profiles.")
            node.save()
            return states.CLEANWAIT
        except ilo_error.IloError as ilo_exception:
            log_msg = ("One button secure erase job failed for node "
                       "%(node)s. Message: '%(message)s'." % {
                           'node': task.node.uuid,
                           'message': ilo_exception
                       })
            manager_utils.cleaning_error_handler(task,
                                                 log_msg,
                                                 errmsg=ilo_exception)
    def execute_clean_step(self, task, step):
        """Execute a clean step.

        :param task: a TaskManager object containing the node
        :param step: a clean step dictionary to execute
        :returns: None
        """
        node = task.node
        playbook, user, key = _parse_ansible_driver_info(
            task.node, action='clean')
        stepname = step['step']

        # Without ramdisk callbacks, prefer the IP recorded earlier if
        # one is available; otherwise resolve the node's IP directly.
        info = node.driver_internal_info
        if (not CONF.ansible.use_ramdisk_callback
                and 'ansible_cleaning_ip' in info):
            node_address = info['ansible_cleaning_ip']
        else:
            node_address = _get_node_ip(task)

        extra_vars = _prepare_extra_vars(
            [(node.uuid, node_address, user, node.extra)])

        LOG.debug('Starting cleaning step %(step)s on node %(node)s',
                  {'node': node.uuid, 'step': stepname})
        step_tags = step['args'].get('tags', [])
        try:
            _run_playbook(playbook, extra_vars, key,
                          tags=step_tags)
        except exception.InstanceDeployFailure as e:
            LOG.error("Ansible failed cleaning step %(step)s "
                      "on node %(node)s.",
                      {'node': node.uuid, 'step': stepname})
            manager_utils.cleaning_error_handler(task, six.text_type(e))
        else:
            LOG.info('Ansible completed cleaning step %(step)s '
                     'on node %(node)s.',
                     {'node': node.uuid, 'step': stepname})
Exemple #26
0
    def _initiate_cleaning(self, task):
        """Initiates the steps required to start cleaning for the node.

        This method polls each interface of the driver for getting the
        clean steps and notifies Ironic conductor to resume cleaning.
        On error, it sets the node to CLEANFAIL state and populates
        node.last_error with the error message.

        :param task: a TaskManager instance containing the node to act on.
        """
        # This deploy ramdisk cannot run in-band clean steps; warn loudly.
        LOG.warning(
            _LW("Bash deploy ramdisk doesn't support in-band cleaning. "
                "Please use the ironic-python-agent (IPA) ramdisk "
                "instead for node %s. "), task.node.uuid)
        try:
            manager_utils.set_node_cleaning_steps(task)
            self.notify_conductor_resume_clean(task)
        except Exception as e:
            last_error = (
                _('Encountered exception for node %(node)s '
                  'while initiating cleaning. Error:  %(error)s') %
                {'node': task.node.uuid, 'error': e})
            # The handler returns None; returning its result simply ends
            # the method after the node was moved to CLEANFAIL.
            return manager_utils.cleaning_error_handler(task, last_error)
Exemple #27
0
 def test_cleaning_error_handler_manual(self):
     """A manual-clean failure must target the MANAGEABLE state."""
     self.node.target_provision_state = states.MANAGEABLE
     conductor_utils.cleaning_error_handler(self.task, 'foo')
     self.task.process_event.assert_called_once_with(
         'fail', target_state=states.MANAGEABLE)
Exemple #28
0
    def heartbeat(self, task, **kwargs):
        """Method for agent to periodically check in.

        The agent should be sending its agent_url (so Ironic can talk back)
        as a kwarg. kwargs should have the following format::

         {
             'agent_url': 'http://AGENT_HOST:AGENT_PORT'
         }

        AGENT_PORT defaults to 9999.

        Depending on the node's provision state, this either continues a
        deployment, reboots to the instance, or starts/continues cleaning.
        Exceptions from those async paths are caught here and translated
        into a cleaning or deploy failure for the node.

        :param task: a TaskManager instance containing the node to act on.
        :param kwargs: must contain 'agent_url'; forwarded unchanged to
            continue_deploy/reboot_to_instance/continue_cleaning.
        :raises: MissingParameterValue if 'agent_url' is not supplied.
        """
        node = task.node
        driver_internal_info = node.driver_internal_info
        LOG.debug(
            'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.',
            {'node': node.uuid,
             'heartbeat': driver_internal_info.get('agent_last_heartbeat')})
        # Record the arrival time (epoch seconds) of this heartbeat.
        driver_internal_info['agent_last_heartbeat'] = int(time.time())
        try:
            driver_internal_info['agent_url'] = kwargs['agent_url']
        except KeyError:
            raise exception.MissingParameterValue(_('For heartbeat operation, '
                                                    '"agent_url" must be '
                                                    'specified.'))

        node.driver_internal_info = driver_internal_info
        node.save()

        # Async call backs don't set error state on their own
        # TODO(jimrollenhagen) improve error messages here
        # `msg` is re-assigned before each risky call below so the generic
        # except clause can report which stage failed.
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                          'not taking any action.', {'node': node.uuid})
                return
            elif (node.provision_state == states.DEPLOYWAIT and
                  not self.deploy_has_started(task)):
                msg = _('Node failed to get image for deploy.')
                self.continue_deploy(task, **kwargs)
            elif (node.provision_state == states.DEPLOYWAIT and
                  self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task, **kwargs)
            elif (node.provision_state == states.DEPLOYWAIT and
                  self.deploy_has_started(task)):
                # Deploy in progress: just refresh the provisioning timeout.
                node.touch_provisioning()
            # TODO(lucasagomes): CLEANING here for backwards compat
            # with previous code, otherwise nodes in CLEANING when this
            # is deployed would fail. Should be removed once the Mitaka
            # release starts.
            elif node.provision_state in (states.CLEANWAIT, states.CLEANING):
                node.touch_provisioning()
                try:
                    if not node.clean_step:
                        # No current step: this is the first heartbeat after
                        # booting the cleaning ramdisk.
                        LOG.debug('Node %s just booted to start cleaning.',
                                  node.uuid)
                        msg = _('Node failed to start the first cleaning '
                                'step.')
                        # First, cache the clean steps
                        self._refresh_clean_steps(task)
                        # Then set/verify node clean steps and start cleaning
                        manager_utils.set_node_cleaning_steps(task)
                        self.notify_conductor_resume_clean(task)
                    else:
                        msg = _('Node failed to check cleaning progress.')
                        self.continue_cleaning(task, **kwargs)
                except exception.NoFreeConductorWorker:
                    # waiting for the next heartbeat, node.last_error and
                    # logging message is filled already via conductor's hook
                    pass

        except Exception as e:
            # Map the failure to the right handler for the current state.
            err_info = {'node': node.uuid, 'msg': msg, 'e': e}
            last_error = _('Asynchronous exception for node %(node)s: '
                           '%(msg)s Exception: %(e)s') % err_info
            LOG.exception(last_error)
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(task, last_error)
Exemple #29
0
    def continue_cleaning(self, task, **kwargs):
        """Start the next cleaning step if the previous one is complete.

        In order to avoid errors and make agent upgrades painless, the agent
        compares the version of all hardware managers at the start of the
        cleaning (the agent's get_clean_steps() call) and before executing
        each clean step. If the version has changed between steps, the agent is
        unable to tell if an ordering change will cause a cleaning issue so
        it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we restart
        the entire cleaning cycle. For manual cleaning, we don't.

        :param task: a TaskManager instance containing the node to act on.
        :param kwargs: additional arguments (unused here, accepted for
            heartbeat compatibility).
        """
        node = task.node
        # For manual clean, the target provision state is MANAGEABLE, whereas
        # for automated cleaning, it is (the default) AVAILABLE.
        manual_clean = node.target_provision_state == states.MANAGEABLE
        command = self._get_completed_cleaning_command(task)
        LOG.debug('Cleaning command status for node %(node)s on step %(step)s:'
                  ' %(command)s', {'node': node.uuid,
                                   'step': node.clean_step,
                                   'command': command})

        if not command:
            # Command is not done yet; wait for the next heartbeat.
            return

        if command.get('command_status') == 'FAILED':
            msg = (_('Agent returned error for clean step %(step)s on node '
                     '%(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': command.get('command_error'),
                    'step': node.clean_step})
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
        elif command.get('command_status') == 'CLEAN_VERSION_MISMATCH':
            # Cache the new clean steps (and 'hardware_manager_version')
            try:
                self._refresh_clean_steps(task)
            except exception.NodeCleaningFailure as e:
                msg = (_('Could not continue cleaning on node '
                         '%(node)s: %(err)s.') %
                       {'node': node.uuid, 'err': e})
                LOG.exception(msg)
                return manager_utils.cleaning_error_handler(task, msg)

            if manual_clean:
                # Don't restart manual cleaning if agent reboots to a new
                # version. Both are operator actions, unlike automated
                # cleaning. Manual clean steps are not necessarily idempotent
                # like automated clean steps and can be even longer running.
                LOG.info(_LI('During manual cleaning, node %(node)s detected '
                             'a clean version mismatch. Re-executing and '
                             'continuing from current step %(step)s.'),
                         {'node': node.uuid, 'step': node.clean_step})

                driver_internal_info = node.driver_internal_info
                driver_internal_info['skip_current_clean_step'] = False
                node.driver_internal_info = driver_internal_info
                node.save()
            else:
                # Restart cleaning, agent must have rebooted to new version
                LOG.info(_LI('During automated cleaning, node %s detected a '
                             'clean version mismatch. Resetting clean steps '
                             'and rebooting the node.'),
                         node.uuid)
                try:
                    manager_utils.set_node_cleaning_steps(task)
                except exception.NodeCleaningFailure as e:
                    # Report the actual restart failure. Previously this
                    # reported command.get('command_error'), which belongs
                    # to the version-mismatch response (and is typically
                    # None), losing the real cause of the failure.
                    msg = (_('Could not restart automated cleaning on node '
                             '%(node)s: %(err)s.') %
                           {'node': node.uuid, 'err': e})
                    LOG.exception(msg)
                    return manager_utils.cleaning_error_handler(task, msg)

            self.notify_conductor_resume_clean(task)

        elif command.get('command_status') == 'SUCCEEDED':
            clean_step_hook = _get_post_clean_step_hook(node)
            if clean_step_hook is not None:
                # Use lazy %-style args so formatting only happens when
                # debug logging is enabled (was eagerly formatted with %).
                LOG.debug('For node %(node)s, executing post clean step '
                          'hook %(method)s for clean step %(step)s',
                          {'method': clean_step_hook.__name__,
                           'node': node.uuid,
                           'step': node.clean_step})
                try:
                    clean_step_hook(task, command)
                except Exception as e:
                    msg = (_('For node %(node)s, post clean step hook '
                             '%(method)s failed for clean step %(step)s.'
                             'Error: %(error)s') %
                           {'method': clean_step_hook.__name__,
                            'node': node.uuid,
                            'error': e,
                            'step': node.clean_step})
                    LOG.exception(msg)
                    return manager_utils.cleaning_error_handler(task, msg)

            LOG.info(_LI('Agent on node %s returned cleaning command success, '
                         'moving to next clean step'), node.uuid)
            self.notify_conductor_resume_clean(task)
        else:
            msg = (_('Agent returned unknown status for clean step %(step)s '
                     'on node %(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': command.get('command_status'),
                    'step': node.clean_step})
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
Exemple #30
0
def do_next_clean_step(task, step_index):
    """Do cleaning, starting from the specified clean step.

    Runs clean steps synchronously until one returns CLEANWAIT (async)
    or the list is exhausted, then tears down cleaning and moves the
    node to its terminal state. Any step failure is routed through
    utils.cleaning_error_handler.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first clean step in the list to execute. This
        is the index (from 0) into the list of clean steps in the node's
        driver_internal_info['clean_steps']. Is None if there are no steps
        to execute.
    """
    node = task.node
    # For manual cleaning, the target provision state is MANAGEABLE,
    # whereas for automated cleaning, it is AVAILABLE.
    manual_clean = node.target_provision_state == states.MANAGEABLE

    if step_index is None:
        steps = []
    else:
        steps = node.driver_internal_info['clean_steps'][step_index:]

    LOG.info('Executing %(state)s on node %(node)s, remaining steps: '
             '%(steps)s', {'node': node.uuid, 'steps': steps,
                           'state': node.provision_state})

    # Execute each step until we hit an async step or run out of steps
    for ind, step in enumerate(steps):
        # Save which step we're about to start so we can restart
        # if necessary
        node.clean_step = step
        driver_internal_info = node.driver_internal_info
        driver_internal_info['clean_step_index'] = step_index + ind
        node.driver_internal_info = driver_internal_info
        node.save()
        # Each step names the driver interface that implements it.
        interface = getattr(task.driver, step.get('interface'))
        LOG.info('Executing %(step)s on node %(node)s',
                 {'step': step, 'node': node.uuid})
        try:
            result = interface.execute_clean_step(task, step)
        except Exception as e:
            if isinstance(e, exception.AgentConnectionFailed):
                # The previous step rebooted the node and the agent is not
                # back yet: wait instead of failing, and make sure the
                # current step is re-run on the next resume.
                if task.node.driver_internal_info.get('cleaning_reboot'):
                    LOG.info('Agent is not yet running on node %(node)s '
                             'after cleaning reboot, waiting for agent to '
                             'come up to run next clean step %(step)s.',
                             {'node': node.uuid, 'step': step})
                    driver_internal_info['skip_current_clean_step'] = False
                    node.driver_internal_info = driver_internal_info
                    target_state = (states.MANAGEABLE if manual_clean
                                    else None)
                    task.process_event('wait', target_state=target_state)
                    return

            msg = (_('Node %(node)s failed step %(step)s: '
                     '%(exc)s') %
                   {'node': node.uuid, 'exc': e,
                    'step': node.clean_step})
            LOG.exception(msg)
            driver_utils.collect_ramdisk_logs(task.node, label='cleaning')
            utils.cleaning_error_handler(task, msg)
            return

        # Check if the step is done or not. The step should return
        # states.CLEANWAIT if the step is still being executed, or
        # None if the step is done.
        if result == states.CLEANWAIT:
            # Kill this worker, the async step will make an RPC call to
            # continue_node_clean to continue cleaning
            LOG.info('Clean step %(step)s on node %(node)s being '
                     'executed asynchronously, waiting for driver.',
                     {'node': node.uuid, 'step': step})
            target_state = states.MANAGEABLE if manual_clean else None
            task.process_event('wait', target_state=target_state)
            return
        elif result is not None:
            # Any other non-None return value is a driver bug.
            msg = (_('While executing step %(step)s on node '
                     '%(node)s, step returned invalid value: %(val)s')
                   % {'step': step, 'node': node.uuid, 'val': result})
            LOG.error(msg)
            return utils.cleaning_error_handler(task, msg)
        LOG.info('Node %(node)s finished clean step %(step)s',
                 {'node': node.uuid, 'step': step})

    if CONF.agent.deploy_logs_collect == 'always':
        driver_utils.collect_ramdisk_logs(task.node, label='cleaning')

    # Clear clean_step
    # All steps done: drop all cleaning bookkeeping from
    # driver_internal_info before leaving the cleaning states.
    node.clean_step = None
    driver_internal_info = node.driver_internal_info
    driver_internal_info['clean_steps'] = None
    driver_internal_info.pop('clean_step_index', None)
    driver_internal_info.pop('cleaning_reboot', None)
    driver_internal_info.pop('cleaning_polling', None)
    driver_internal_info.pop('agent_secret_token', None)
    driver_internal_info.pop('agent_secret_token_pregenerated', None)

    # Remove agent_url
    # (kept for fast-track so the agent can be reached without reboot)
    if not utils.fast_track_able(task):
        driver_internal_info.pop('agent_url', None)
    node.driver_internal_info = driver_internal_info
    node.save()
    try:
        task.driver.deploy.tear_down_cleaning(task)
    except Exception as e:
        msg = (_('Failed to tear down from cleaning for node %(node)s, '
                 'reason: %(err)s')
               % {'node': node.uuid, 'err': e})
        LOG.exception(msg)
        # Don't retry teardown from the error handler: it just failed.
        return utils.cleaning_error_handler(task, msg,
                                            tear_down_cleaning=False)

    LOG.info('Node %s cleaning complete', node.uuid)
    event = 'manage' if manual_clean or node.retired else 'done'
    # NOTE(rloo): No need to specify target prov. state; we're done
    task.process_event(event)
Exemple #31
0
    def heartbeat(self, task, callback_url):
        """Process a heartbeat.

        Records the agent URL, then advances the node through deployment
        or cleaning depending on its provision state. Exceptions from the
        async handlers are caught and converted to cleaning/deploy
        failures for the node.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        """
        # NOTE(pas-ha) immediately skip the rest if nothing to do
        if task.node.provision_state not in self.heartbeat_allowed_states:
            LOG.debug('Heartbeat from node %(node)s in unsupported '
                      'provision state %(state)s, not taking any action.',
                      {'node': task.node.uuid,
                       'state': task.node.provision_state})
            return

        # A shared lock was enough for the check above; everything below
        # mutates the node, so take the exclusive lock now.
        task.upgrade_lock()

        node = task.node
        LOG.debug('Heartbeat from node %s', node.uuid)

        driver_internal_info = node.driver_internal_info
        driver_internal_info['agent_url'] = callback_url
        node.driver_internal_info = driver_internal_info
        node.save()

        # Async call backs don't set error state on their own
        # TODO(jimrollenhagen) improve error messages here
        # `msg` is re-assigned before each risky call below so the generic
        # except clause can report which stage failed.
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                          'not taking any action.', {'node': node.uuid})
                return
            elif (node.provision_state == states.DEPLOYWAIT and
                  not self.deploy_has_started(task)):
                msg = _('Node failed to deploy.')
                self.continue_deploy(task)
            elif (node.provision_state == states.DEPLOYWAIT and
                  self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task)
            elif (node.provision_state == states.DEPLOYWAIT and
                  self.deploy_has_started(task)):
                # Deploy in progress: just refresh the provisioning timeout.
                node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                if not node.clean_step:
                    # No current step: first heartbeat after booting the
                    # cleaning ramdisk.
                    LOG.debug('Node %s just booted to start cleaning.',
                              node.uuid)
                    msg = _('Node failed to start the first cleaning step.')
                    # First, cache the clean steps
                    self.refresh_clean_steps(task)
                    # Then set/verify node clean steps and start cleaning
                    manager_utils.set_node_cleaning_steps(task)
                    # The exceptions from RPC are not possible as we using cast
                    # here
                    _notify_conductor_resume_clean(task)
                else:
                    msg = _('Node failed to check cleaning progress.')
                    self.continue_cleaning(task)

        except Exception as e:
            # Map the failure to the right handler for the current state.
            err_info = {'node': node.uuid, 'msg': msg, 'e': e}
            last_error = _('Asynchronous exception for node %(node)s: '
                           '%(msg)s Exception: %(e)s') % err_info
            LOG.exception(last_error)
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(
                    task, last_error, collect_logs=bool(self._client))
Exemple #32
0
    def _check_node_firmware_update(self, task):
        """Check the progress of running firmware update on a node.

        Polls the Redfish update service for the current firmware update
        task. Handles: the BMC being temporarily unreachable (retry on
        next poll), the post-update wait period, the BMC deleting the
        task record early (assume success), success (continue with the
        remaining updates) and failure (clear pending updates and fail
        cleaning).

        :param task: a TaskManager instance containing the node to act on.
        """
        node = task.node

        firmware_updates = node.driver_internal_info['firmware_updates']
        # Updates run one at a time; the head of the list is in progress.
        current_update = firmware_updates[0]

        try:
            update_service = redfish_utils.get_update_service(node)
        except exception.RedfishConnectionError as e:
            # If the BMC firmware is being updated, the BMC will be
            # unavailable for some amount of time.
            LOG.warning(
                'Unable to communicate with firmware update service '
                'on node %(node)s. Will try again on the next poll. '
                'Error: %(error)s', {
                    'node': node.uuid,
                    'error': e
                })
            return

        wait_start_time = current_update.get('wait_start_time')
        if wait_start_time:
            # The update completed earlier and requested a settle period.
            wait_start = timeutils.parse_isotime(wait_start_time)

            elapsed_time = timeutils.utcnow(True) - wait_start
            if elapsed_time.seconds >= current_update['wait']:
                LOG.debug(
                    'Finished waiting after firmware update '
                    '%(firmware_image)s on node %(node)s. '
                    'Elapsed time: %(seconds)s seconds', {
                        'firmware_image': current_update['url'],
                        'node': node.uuid,
                        'seconds': elapsed_time.seconds
                    })
                current_update.pop('wait', None)
                current_update.pop('wait_start_time', None)

                task.upgrade_lock()
                self._continue_firmware_updates(task, update_service,
                                                firmware_updates)
            else:
                LOG.debug(
                    'Continuing to wait after firmware update '
                    '%(firmware_image)s on node %(node)s. '
                    'Elapsed time: %(seconds)s seconds', {
                        'firmware_image': current_update['url'],
                        'node': node.uuid,
                        'seconds': elapsed_time.seconds
                    })

            return

        try:
            task_monitor = update_service.get_task_monitor(
                current_update['task_monitor'])
        except sushy.exceptions.ResourceNotFoundError:
            # The BMC deleted the Task before we could query it
            LOG.warning(
                'Firmware update completed for node %(node)s, '
                'firmware %(firmware_image)s, but success of the '
                'update is unknown.  Assuming update was successful.', {
                    'node': node.uuid,
                    'firmware_image': current_update['url']
                })
            task.upgrade_lock()
            self._continue_firmware_updates(task, update_service,
                                            firmware_updates)
            return

        if not task_monitor.is_processing:
            # The last response does not necessarily contain a Task,
            # so get it
            sushy_task = task_monitor.get_task()

            # Only parse the messages if the BMC did not return parsed
            # messages. Guard against an empty message list: indexing
            # messages[0] unconditionally raised IndexError for BMCs
            # that return no messages at all.
            if sushy_task.messages and not sushy_task.messages[0].message:
                sushy_task.parse_messages()

            # Skip entries whose message is still None so the join below
            # cannot fail on a non-string.
            messages = [m.message for m in sushy_task.messages
                        if m.message is not None]

            if (sushy_task.task_state == sushy.TASK_STATE_COMPLETED
                    and sushy_task.task_status
                    in [sushy.HEALTH_OK, sushy.HEALTH_WARNING]):
                LOG.info(
                    'Firmware update succeeded for node %(node)s, '
                    'firmware %(firmware_image)s: %(messages)s', {
                        'node': node.uuid,
                        'firmware_image': current_update['url'],
                        'messages': ", ".join(messages)
                    })

                task.upgrade_lock()
                self._continue_firmware_updates(task, update_service,
                                                firmware_updates)
            else:
                error_msg = (_('Firmware update failed for node %(node)s, '
                               'firmware %(firmware_image)s. '
                               'Error: %(errors)s') % {
                                   'node': node.uuid,
                                   'firmware_image': current_update['url'],
                                   'errors': ",  ".join(messages)
                               })
                LOG.error(error_msg)

                task.upgrade_lock()
                self._clear_firmware_updates(node)
                manager_utils.cleaning_error_handler(task, error_msg)
        else:
            LOG.debug(
                'Firmware update in progress for node %(node)s, '
                'firmware %(firmware_image)s.', {
                    'node': node.uuid,
                    'firmware_image': current_update['url']
                })
Exemple #33
0
    def process_next_step(self, task, step_type, **kwargs):
        """Start the next clean/deploy step if the previous one is complete.

        In order to avoid errors and make agent upgrades painless, the agent
        compares the version of all hardware managers at the start of the
        process (the agent's get_clean|deploy_steps() call) and before
        executing each step. If the version has changed between steps,
        the agent is unable to tell if an ordering change will cause an issue
        so it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we
        restart the entire cleaning cycle. For manual cleaning or deploy,
        we don't.

        Additionally, if a step includes the reboot_requested property
        set to True, this method will coordinate the reboot once the step is
        completed.

        :param task: a TaskManager instance containing the node to act on.
        :param step_type: either 'clean' or 'deploy'; selects which step
            list, reboot flag and error handler are used.
        :param kwargs: additional arguments (unused here, accepted for
            heartbeat compatibility).
        """
        assert step_type in ('clean', 'deploy')

        node = task.node
        # For manual clean, the target provision state is MANAGEABLE, whereas
        # for automated cleaning, it is (the default) AVAILABLE.
        manual_clean = node.target_provision_state == states.MANAGEABLE
        agent_commands = self._client.get_commands_status(task.node)

        if not agent_commands:
            field = ('cleaning_reboot' if step_type == 'clean'
                     else 'deployment_reboot')
            if task.node.driver_internal_info.get(field):
                # Node finished a cleaning step that requested a reboot, and
                # this is the first heartbeat after booting. Continue cleaning.
                info = task.node.driver_internal_info
                info.pop(field, None)
                task.node.driver_internal_info = info
                task.node.save()
                manager_utils.notify_conductor_resume_operation(task,
                                                                step_type)
                return
            else:
                # Agent has no commands whatsoever
                return

        current_step = (node.clean_step if step_type == 'clean'
                        else node.deploy_step)
        command = _get_completed_command(task, agent_commands, step_type)
        LOG.debug('%(type)s command status for node %(node)s on step %(step)s:'
                  ' %(command)s', {'node': node.uuid,
                                   'step': current_step,
                                   'command': command,
                                   'type': step_type})

        if not command:
            # Agent command in progress
            return

        if command.get('command_status') == 'FAILED':
            msg = (_('Agent returned error for %(type)s step %(step)s on node '
                     '%(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': command.get('command_error'),
                    'step': current_step,
                    'type': step_type})
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
        elif command.get('command_status') in ('CLEAN_VERSION_MISMATCH',
                                               'DEPLOY_VERSION_MISMATCH'):
            # Cache the new clean steps (and 'hardware_manager_version')
            try:
                self.refresh_steps(task, step_type)
            except exception.NodeCleaningFailure as e:
                msg = (_('Could not continue cleaning on node '
                         '%(node)s: %(err)s.') %
                       {'node': node.uuid, 'err': e})
                LOG.exception(msg)
                return manager_utils.cleaning_error_handler(task, msg)
            except exception.InstanceDeployFailure as e:
                msg = (_('Could not continue deployment on node '
                         '%(node)s: %(err)s.') %
                       {'node': node.uuid, 'err': e})
                LOG.exception(msg)
                return manager_utils.deploying_error_handler(task, msg)

            if manual_clean:
                # Don't restart manual cleaning if agent reboots to a new
                # version. Both are operator actions, unlike automated
                # cleaning. Manual clean steps are not necessarily idempotent
                # like automated clean steps and can be even longer running.
                LOG.info('During manual cleaning, node %(node)s detected '
                         'a clean version mismatch. Re-executing and '
                         'continuing from current step %(step)s.',
                         {'node': node.uuid, 'step': node.clean_step})

                driver_internal_info = node.driver_internal_info
                driver_internal_info['skip_current_clean_step'] = False
                node.driver_internal_info = driver_internal_info
                node.save()
            else:
                # Restart the process, agent must have rebooted to new version
                LOG.info('During %(type)s, node %(node)s detected a '
                         '%(type)s version mismatch. Resetting %(type)s steps '
                         'and rebooting the node.',
                         {'type': step_type, 'node': node.uuid})
                try:
                    conductor_steps.set_node_cleaning_steps(task)
                except exception.NodeCleaningFailure as e:
                    msg = (_('Could not restart automated cleaning on node '
                             '%(node)s after step %(step)s: %(err)s.') %
                           {'node': node.uuid, 'err': e,
                            'step': node.clean_step})
                    LOG.exception(msg)
                    return manager_utils.cleaning_error_handler(task, msg)
                except exception.InstanceDeployFailure as e:
                    msg = (_('Could not restart deployment on node '
                             '%(node)s after step %(step)s: %(err)s.') %
                           {'node': node.uuid, 'err': e,
                            'step': node.deploy_step})
                    LOG.exception(msg)
                    return manager_utils.deploying_error_handler(task, msg)

            manager_utils.notify_conductor_resume_operation(task, step_type)

        elif command.get('command_status') == 'SUCCEEDED':
            # Run any post-step hook registered for this step before
            # moving on.
            step_hook = _get_post_step_hook(node, step_type)
            if step_hook is not None:
                LOG.debug('For node %(node)s, executing post %(type)s step '
                          'hook %(method)s for %(type)s step %(step)s',
                          {'method': step_hook.__name__,
                           'node': node.uuid,
                           'step': current_step,
                           'type': step_type})
                try:
                    step_hook(task, command)
                except Exception as e:
                    msg = (_('For node %(node)s, post %(type)s step hook '
                             '%(method)s failed for %(type)s step %(step)s.'
                             '%(cls)s: %(error)s') %
                           {'method': step_hook.__name__,
                            'node': node.uuid,
                            'error': e,
                            'cls': e.__class__.__name__,
                            'step': current_step,
                            'type': step_type})
                    LOG.exception(msg)
                    if step_type == 'clean':
                        return manager_utils.cleaning_error_handler(task, msg)
                    else:
                        return manager_utils.deploying_error_handler(task, msg)

            if current_step.get('reboot_requested'):
                # The reboot helper resumes the operation on the next
                # heartbeat; do not notify the conductor here.
                _post_step_reboot(task, step_type)
                return

            LOG.info('Agent on node %(node)s returned %(type)s command '
                     'success, moving to next step',
                     {'node': node.uuid, 'type': step_type})
            manager_utils.notify_conductor_resume_operation(task, step_type)
        else:
            msg = (_('Agent returned unknown status for %(type)s step %(step)s'
                     ' on node %(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': command.get('command_status'),
                    'step': current_step,
                    'type': step_type})
            LOG.error(msg)
            if step_type == 'clean':
                return manager_utils.cleaning_error_handler(task, msg)
            else:
                return manager_utils.deploying_error_handler(task, msg)
Exemple #34
0
    def heartbeat(self, task, callback_url):
        """Process a heartbeat.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        """
        # TODO(dtantsur): upgrade lock only if we actually take action other
        # than updating the last timestamp.
        task.upgrade_lock()

        node = task.node
        LOG.debug('Heartbeat from node %s', node.uuid)

        # Record the agent endpoint; also drop the deprecated
        # 'agent_last_heartbeat' entry if an older release left one behind.
        # TODO(rloo): hopefully all nodes will have been updated by Pike,
        # so the pop() can be deleted then.
        info = node.driver_internal_info
        info['agent_url'] = callback_url
        info.pop('agent_last_heartbeat', None)
        node.driver_internal_info = info
        node.save()

        # Async call backs don't set error state on their own: 'msg' names
        # the action currently being attempted so the handler below can
        # report it if that attempt raises.
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug(
                    'Heartbeat from node %(node)s in maintenance mode; '
                    'not taking any action.', {'node': node.uuid})
                return

            if node.provision_state == states.DEPLOYWAIT:
                if not self.deploy_has_started(task):
                    msg = _('Node failed to deploy.')
                    self.continue_deploy(task)
                elif self.deploy_is_done(task):
                    msg = _('Node failed to move to active state.')
                    self.reboot_to_instance(task)
                elif self.deploy_has_started(task):
                    node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                try:
                    if node.clean_step:
                        msg = _('Node failed to check cleaning progress.')
                        self.continue_cleaning(task)
                    else:
                        LOG.debug('Node %s just booted to start cleaning.',
                                  node.uuid)
                        msg = _('Node failed to start the first cleaning '
                                'step.')
                        # Cache the clean steps first, then set/verify them
                        # on the node and kick the conductor to resume.
                        self.refresh_clean_steps(task)
                        manager_utils.set_node_cleaning_steps(task)
                        _notify_conductor_resume_clean(task)
                except exception.NoFreeConductorWorker:
                    # waiting for the next heartbeat; node.last_error and
                    # the log message are filled already via the
                    # conductor's hook
                    pass

        except Exception as e:
            err_info = {'node': node.uuid, 'msg': msg, 'e': e}
            last_error = _('Asynchronous exception for node %(node)s: '
                           '%(msg)s Exception: %(e)s') % err_info
            LOG.exception(last_error)
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(task,
                                              last_error,
                                              collect_logs=bool(self._client))
    def continue_cleaning(self, task, **kwargs):
        """Run the next cleaning step once the current one has completed.

        To keep agent upgrades painless, the agent checks the version of
        all hardware managers during get_clean_steps at the start of
        cleaning and again before executing each step. If the version has
        changed between steps, the agent cannot tell whether a
        step-ordering change would make cleaning unsafe, so cleaning is
        restarted from scratch.

        :param task: a TaskManager instance.
        """
        node = task.node
        command = self._get_completed_cleaning_command(task)
        LOG.debug('Cleaning command status for node %(node)s on step %(step)s:'
                  ' %(command)s',
                  {'node': node.uuid,
                   'step': node.clean_step,
                   'command': command})

        if not command:
            # The agent is still working on the current step.
            return

        status = command.get('command_status')
        if status == 'FAILED':
            msg = (_('Agent returned error for clean step %(step)s on node '
                     '%(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': command.get('command_error'),
                    'step': node.clean_step})
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)

        if status == 'CLEAN_VERSION_MISMATCH':
            # The agent rebooted into a new version: reset the clean steps
            # and start over.
            LOG.info(_LI('Node %s detected a clean version mismatch, '
                         'resetting clean steps and rebooting the node.'),
                     node.uuid)
            try:
                manager_utils.set_node_cleaning_steps(task)
            except exception.NodeCleaningFailure:
                msg = (_('Could not restart cleaning on node %(node)s: '
                         '%(err)s.') %
                       {'node': node.uuid,
                        'err': command.get('command_error'),
                        'step': node.clean_step})
                LOG.exception(msg)
                return manager_utils.cleaning_error_handler(task, msg)
            self.notify_conductor_resume_clean(task)
        elif status == 'SUCCEEDED':
            # Run the optional post-step hook, then move to the next step.
            hook = _get_post_clean_step_hook(node)
            if hook is not None:
                LOG.debug('For node %(node)s, executing post clean step '
                          'hook %(method)s for clean step %(step)s' %
                          {'method': hook.__name__,
                           'node': node.uuid,
                           'step': node.clean_step})
                try:
                    hook(task, command)
                except Exception as e:
                    msg = (_('For node %(node)s, post clean step hook '
                             '%(method)s failed for clean step %(step)s.'
                             'Error: %(error)s') %
                           {'method': hook.__name__,
                            'node': node.uuid,
                            'error': e,
                            'step': node.clean_step})
                    LOG.exception(msg)
                    return manager_utils.cleaning_error_handler(task, msg)

            LOG.info(_LI('Agent on node %s returned cleaning command success, '
                         'moving to next clean step'), node.uuid)
            self.notify_conductor_resume_clean(task)
        else:
            msg = (_('Agent returned unknown status for clean step %(step)s '
                     'on node %(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': status,
                    'step': node.clean_step})
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
Exemple #36
0
 def test_cleaning_error_handler_manual(self):
     # During manual cleaning the error handler must fail the node back
     # toward MANAGEABLE, not AVAILABLE.
     self.node.target_provision_state = states.MANAGEABLE
     conductor_utils.cleaning_error_handler(self.task, 'foo')
     self.task.process_event.assert_called_once_with(
         'fail', target_state=states.MANAGEABLE)
Exemple #37
0
    def erase_devices(self, task, **kwargs):
        """Erase all the drives on the node.

        This method performs out-of-band sanitize disk erase on all the
        supported physical drives in the node. This erase cannot be performed
        on logical drives.

        The erase spans reboots: HDDs are submitted for erase first and the
        node is rebooted back into the agent ramdisk; on the next pass SSDs
        are submitted and the node is rebooted again; on the final pass the
        step polls iLO until the erase completes. Progress is tracked via
        boolean flags ('ilo_disk_erase_hdd_check',
        'ilo_disk_erase_ssd_check') in the node's driver_internal_info.

        :param task: a TaskManager instance.
        :param kwargs: optionally contains 'erase_pattern', a dict mapping
            disk type ('hdd'/'ssd') to the erase pattern to use; defaults
            to {'hdd': 'overwrite', 'ssd': 'block'}.
        :raises: InvalidParameterValue, if any of the arguments are invalid.
        :raises: IloError on an error from iLO.
        :returns: states.CLEANWAIT when a reboot was issued and the step
            must continue after the node comes back, otherwise None.
        """
        erase_pattern = kwargs.get('erase_pattern', {
            'hdd': 'overwrite',
            'ssd': 'block'
        })
        node = task.node
        self._validate_erase_pattern(erase_pattern, node)
        driver_internal_info = node.driver_internal_info
        LOG.debug("Calling out-of-band sanitize disk erase for node %(node)s",
                  {'node': node.uuid})
        try:
            ilo_object = ilo_common.get_ilo_object(node)
            disk_types = ilo_object.get_available_disk_types()
            LOG.info(
                "Disk type detected are: %(disk_types)s. Sanitize disk "
                "erase are now exercised for one after another disk type "
                "for node %(node)s.", {
                    'disk_types': disk_types,
                    'node': node.uuid
                })

            if disk_types:
                # First disk-erase will execute for HDD's and after reboot only
                # try for SSD, since both share same redfish api and would be
                # overwritten.
                if not driver_internal_info.get(
                        'ilo_disk_erase_hdd_check') and ('HDD' in disk_types):
                    ilo_object.do_disk_erase('HDD', erase_pattern.get('hdd'))
                    # Mark the HDD pass done and request a cleaning reboot;
                    # do NOT skip this step on resume so the SSD/wait
                    # passes below still run.
                    self._set_driver_internal_value(
                        task, True, 'cleaning_reboot',
                        'ilo_disk_erase_hdd_check')
                    self._set_driver_internal_value(task, False,
                                                    'skip_current_clean_step')
                    deploy_opts = deploy_utils.build_agent_options(task.node)
                    task.driver.boot.prepare_ramdisk(task, deploy_opts)
                    manager_utils.node_power_action(task, states.REBOOT)
                    return states.CLEANWAIT

                if not driver_internal_info.get(
                        'ilo_disk_erase_ssd_check') and ('SSD' in disk_types):
                    ilo_object.do_disk_erase('SSD', erase_pattern.get('ssd'))
                    # Same reboot dance for the SSD pass; the hdd flag is
                    # (re)set too so only the final wait pass remains.
                    self._set_driver_internal_value(
                        task, True, 'ilo_disk_erase_hdd_check',
                        'ilo_disk_erase_ssd_check', 'cleaning_reboot')
                    self._set_driver_internal_value(task, False,
                                                    'skip_current_clean_step')
                    deploy_opts = deploy_utils.build_agent_options(task.node)
                    task.driver.boot.prepare_ramdisk(task, deploy_opts)
                    manager_utils.node_power_action(task, states.REBOOT)
                    return states.CLEANWAIT

                # It will wait until disk erase will complete
                if self._wait_for_disk_erase_status(task.node):
                    LOG.info(
                        "For node %(uuid)s erase_devices clean "
                        "step is done.", {'uuid': task.node.uuid})
                    # Erase finished: clear the progress flags.
                    self._pop_driver_internal_values(
                        task, 'ilo_disk_erase_hdd_check',
                        'ilo_disk_erase_ssd_check')
            else:
                LOG.info(
                    "No drive found to perform out-of-band sanitize "
                    "disk erase for node %(node)s", {'node': node.uuid})
        except ilo_error.IloError as ilo_exception:
            log_msg = ("Out-of-band sanitize disk erase job failed for node "
                       "%(node)s. Message: '%(message)s'." % {
                           'node': task.node.uuid,
                           'message': ilo_exception
                       })
            # Drop every in-progress flag before failing cleaning so a
            # later retry starts the sequence from the beginning.
            self._pop_driver_internal_values(task, 'ilo_disk_erase_hdd_check',
                                             'ilo_disk_erase_ssd_check',
                                             'cleaning_reboot',
                                             'skip_current_clean_step')
            manager_utils.cleaning_error_handler(task,
                                                 log_msg,
                                                 errmsg=ilo_exception)
    def heartbeat(self, task, callback_url, agent_version):
        """Process a heartbeat.

        Records the agent URL/version on the node, then dispatches on the
        node's provision state: continues deployment, cleaning or rescue as
        appropriate. Any exception from the dispatched action is routed to
        the matching error handler for that state.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        :param agent_version: The version of the agent that is heartbeating
        """
        # NOTE(pas-ha) immediately skip the rest if nothing to do
        if task.node.provision_state not in self.heartbeat_allowed_states:
            LOG.debug(
                'Heartbeat from node %(node)s in unsupported '
                'provision state %(state)s, not taking any action.', {
                    'node': task.node.uuid,
                    'state': task.node.provision_state
                })
            return

        try:
            task.upgrade_lock()
        except exception.NodeLocked:
            # Another operation holds the node; the agent will heartbeat
            # again shortly, so just skip this one.
            LOG.warning(
                'Node %s is currently locked, skipping heartbeat '
                'processing (will retry on the next heartbeat)',
                task.node.uuid)
            return

        node = task.node
        LOG.debug('Heartbeat from node %s', node.uuid)

        # Persist the agent endpoint and version for later calls to the
        # agent API.
        driver_internal_info = node.driver_internal_info
        driver_internal_info['agent_url'] = callback_url
        driver_internal_info['agent_version'] = agent_version
        node.driver_internal_info = driver_internal_info
        node.save()

        # Async call backs don't set error state on their own; 'msg' names
        # the action being attempted so the handler below can report it.
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug(
                    'Heartbeat from node %(node)s in maintenance mode; '
                    'not taking any action.', {'node': node.uuid})
                return
            elif (node.provision_state == states.DEPLOYWAIT
                  and not self.deploy_has_started(task)):
                msg = _('Node failed to deploy.')
                self.continue_deploy(task)
            elif (node.provision_state == states.DEPLOYWAIT
                  and self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task)
            elif (node.provision_state == states.DEPLOYWAIT
                  and self.deploy_has_started(task)):
                # Deployment in progress: just refresh the provisioning
                # timeout.
                node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                if not node.clean_step:
                    LOG.debug('Node %s just booted to start cleaning.',
                              node.uuid)
                    msg = _('Node failed to start the first cleaning step.')
                    # First, cache the clean steps
                    self.refresh_clean_steps(task)
                    # Then set/verify node clean steps and start cleaning
                    manager_utils.set_node_cleaning_steps(task)
                    # The exceptions from RPC are not possible as we using cast
                    # here
                    manager_utils.notify_conductor_resume_clean(task)
                else:
                    msg = _('Node failed to check cleaning progress.')
                    self.continue_cleaning(task)
            elif (node.provision_state == states.RESCUEWAIT):
                msg = _('Node failed to perform rescue operation.')
                self._finalize_rescue(task)
        except Exception as e:
            # Route the failure to the error handler that matches the
            # node's current provision state.
            err_info = {'msg': msg, 'e': e}
            last_error = _('Asynchronous exception: %(msg)s '
                           'Exception: %(e)s for node') % err_info
            errmsg = last_error + ' %(node)s'
            LOG.exception(errmsg, {'node': node.uuid})
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(task,
                                              last_error,
                                              collect_logs=bool(self._client))
            elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
                manager_utils.rescuing_error_handler(task, last_error)
    def heartbeat(self, task, callback_url):
        """Process a heartbeat.

        Records the heartbeat timestamp and agent URL on the node, then
        continues deployment or cleaning according to the node's provision
        state. Failures of the dispatched action are routed to the error
        handler matching that state.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        :raises: MissingParameterValue if callback_url is empty.
        """
        # TODO(dtantsur): upgrade lock only if we actually take action other
        # than updating the last timestamp.
        task.upgrade_lock()

        node = task.node
        driver_internal_info = node.driver_internal_info
        LOG.debug(
            'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.', {
                'node': node.uuid,
                'heartbeat': driver_internal_info.get('agent_last_heartbeat')
            })
        driver_internal_info['agent_last_heartbeat'] = int(time.time())

        # NOTE: the previous try/except KeyError around the assignment
        # below was dead code -- a dict item assignment never raises
        # KeyError. Validate the value explicitly instead so a heartbeat
        # without an agent URL is rejected as originally intended.
        if not callback_url:
            raise exception.MissingParameterValue(
                _('For heartbeat operation, '
                  '"agent_url" must be '
                  'specified.'))
        driver_internal_info['agent_url'] = callback_url

        node.driver_internal_info = driver_internal_info
        node.save()

        # Async call backs don't set error state on their own; 'msg' names
        # the action being attempted so the handler below can report it.
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug(
                    'Heartbeat from node %(node)s in maintenance mode; '
                    'not taking any action.', {'node': node.uuid})
                return
            elif (node.provision_state == states.DEPLOYWAIT
                  and not self.deploy_has_started(task)):
                msg = _('Node failed to get image for deploy.')
                self.continue_deploy(task)
            elif (node.provision_state == states.DEPLOYWAIT
                  and self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task)
            elif (node.provision_state == states.DEPLOYWAIT
                  and self.deploy_has_started(task)):
                # Deployment in progress: just refresh the provisioning
                # timeout.
                node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                try:
                    if not node.clean_step:
                        LOG.debug('Node %s just booted to start cleaning.',
                                  node.uuid)
                        msg = _('Node failed to start the first cleaning '
                                'step.')
                        # First, cache the clean steps
                        self._refresh_clean_steps(task)
                        # Then set/verify node clean steps and start cleaning
                        manager_utils.set_node_cleaning_steps(task)
                        _notify_conductor_resume_clean(task)
                    else:
                        msg = _('Node failed to check cleaning progress.')
                        self.continue_cleaning(task)
                except exception.NoFreeConductorWorker:
                    # waiting for the next heartbeat, node.last_error and
                    # logging message is filled already via conductor's hook
                    pass

        except Exception as e:
            err_info = {'node': node.uuid, 'msg': msg, 'e': e}
            last_error = _('Asynchronous exception for node %(node)s: '
                           '%(msg)s Exception: %(e)s') % err_info
            LOG.exception(last_error)
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(task, last_error)
Exemple #40
0
 def test_cleaning_error_handler_tear_down_error(self, log_mock):
     # A failure inside tear_down_cleaning must be logged by the handler,
     # not propagated to the caller.
     self.task.driver.deploy.tear_down_cleaning.side_effect = (
         Exception('bar'))
     conductor_utils.cleaning_error_handler(self.task, 'foo')
     self.assertTrue(log_mock.exception.called)
    def continue_cleaning(self, task, **kwargs):
        """Kick off the next cleaning step when the current one completes.

        To keep agent upgrades painless, the agent verifies hardware
        manager versions during get_clean_steps and again before each
        step. A version change between steps means the agent cannot rule
        out an unsafe step-ordering change, so cleaning is restarted.

        :param task: a TaskManager instance.
        """
        node = task.node
        command = self._get_completed_cleaning_command(task)
        LOG.debug("Cleaning command status for node %(node)s on step %(step)s:"
                  " %(command)s",
                  {"node": node.uuid,
                   "step": node.clean_step,
                   "command": command})

        if not command:
            # The agent has not finished the current step yet.
            return

        status = command.get("command_status")
        if status == "FAILED":
            msg = (_("Agent returned error for clean step %(step)s on node "
                     "%(node)s : %(err)s.") %
                   {"node": node.uuid,
                    "err": command.get("command_error"),
                    "step": node.clean_step})
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)

        if status == "CLEAN_VERSION_MISMATCH":
            # The agent rebooted into a new version: reset the clean steps
            # and start over.
            LOG.info(_LI("Node %s detected a clean version mismatch, "
                         "resetting clean steps and rebooting the node."),
                     node.uuid)
            try:
                manager_utils.set_node_cleaning_steps(task)
            except exception.NodeCleaningFailure:
                msg = (_("Could not restart cleaning on node %(node)s: "
                         "%(err)s.") %
                       {"node": node.uuid,
                        "err": command.get("command_error"),
                        "step": node.clean_step})
                LOG.exception(msg)
                return manager_utils.cleaning_error_handler(task, msg)
            self.notify_conductor_resume_clean(task)
        elif status == "SUCCEEDED":
            # Run the optional post-step hook, then move to the next step.
            hook = _get_post_clean_step_hook(node)
            if hook is not None:
                LOG.debug("For node %(node)s, executing post clean step "
                          "hook %(method)s for clean step %(step)s" %
                          {"method": hook.__name__,
                           "node": node.uuid,
                           "step": node.clean_step})
                try:
                    hook(task, command)
                except Exception as e:
                    msg = (_("For node %(node)s, post clean step hook "
                             "%(method)s failed for clean step %(step)s."
                             "Error: %(error)s") %
                           {"method": hook.__name__,
                            "node": node.uuid,
                            "error": e,
                            "step": node.clean_step})
                    LOG.exception(msg)
                    return manager_utils.cleaning_error_handler(task, msg)

            LOG.info(_LI("Agent on node %s returned cleaning command success, "
                         "moving to next clean step"), node.uuid)
            self.notify_conductor_resume_clean(task)
        else:
            msg = (_("Agent returned unknown status for clean step %(step)s "
                     "on node %(node)s : %(err)s.") %
                   {"node": node.uuid,
                    "err": status,
                    "step": node.clean_step})
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
    def heartbeat(self, task, callback_url, agent_version):
        """Process a heartbeat.

        Records the agent URL, version and heartbeat time on the node, then
        continues deployment, cleaning or rescue depending on the node's
        provision state. Failures of the dispatched action are routed to
        the error handler matching that state.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        :param agent_version: The version of the agent that is heartbeating
        """
        # NOTE(pas-ha) immediately skip the rest if nothing to do
        # (fast-track-able nodes still get their heartbeat recorded even in
        # otherwise unsupported states).
        if (task.node.provision_state not in self.heartbeat_allowed_states
            and not manager_utils.fast_track_able(task)):
            LOG.debug('Heartbeat from node %(node)s in unsupported '
                      'provision state %(state)s, not taking any action.',
                      {'node': task.node.uuid,
                       'state': task.node.provision_state})
            return

        try:
            task.upgrade_lock()
        except exception.NodeLocked:
            # Another operation holds the node; the agent will heartbeat
            # again shortly, so just skip this one.
            LOG.warning('Node %s is currently locked, skipping heartbeat '
                        'processing (will retry on the next heartbeat)',
                        task.node.uuid)
            return

        node = task.node
        LOG.debug('Heartbeat from node %s', node.uuid)
        driver_internal_info = node.driver_internal_info
        driver_internal_info['agent_url'] = callback_url
        driver_internal_info['agent_version'] = agent_version
        # Record the last heartbeat event time in UTC, so we can make
        # decisions about it later. Can be decoded to datetime object with:
        # datetime.datetime.strptime(var, "%Y-%m-%d %H:%M:%S.%f")
        driver_internal_info['agent_last_heartbeat'] = str(
            timeutils.utcnow().isoformat())
        node.driver_internal_info = driver_internal_info
        node.save()

        if node.provision_state in _HEARTBEAT_RECORD_ONLY:
            # We shouldn't take any additional action. The agent will
            # silently continue to heartbeat to ironic until user initiated
            # state change occurs causing it to match a state below.
            LOG.debug('Heartbeat from %(node)s recorded to identify the '
                      'node as on-line.', {'node': task.node.uuid})
            return

        # Async call backs don't set error state on their own; 'msg' names
        # the action being attempted so the handler below can report it.
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                          'not taking any action.', {'node': node.uuid})
                return
            # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
            # are currently in the core deploy.deploy step. Other deploy steps
            # may cause the agent to boot, but we should not trigger deployment
            # at that point.
            elif node.provision_state == states.DEPLOYWAIT:
                if self.in_core_deploy_step(task):
                    if not self.deploy_has_started(task):
                        msg = _('Node failed to deploy.')
                        self.continue_deploy(task)
                    elif self.deploy_is_done(task):
                        msg = _('Node failed to move to active state.')
                        self.reboot_to_instance(task)
                    else:
                        # Deployment in progress: refresh the timeout only.
                        node.touch_provisioning()
                else:
                    node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                if not node.clean_step:
                    LOG.debug('Node %s just booted to start cleaning.',
                              node.uuid)
                    msg = _('Node failed to start the first cleaning step.')
                    # First, cache the clean steps
                    self.refresh_clean_steps(task)
                    # Then set/verify node clean steps and start cleaning
                    conductor_steps.set_node_cleaning_steps(task)
                    # The exceptions from RPC are not possible as we using cast
                    # here
                    manager_utils.notify_conductor_resume_clean(task)
                else:
                    msg = _('Node failed to check cleaning progress.')
                    self.continue_cleaning(task)
            elif (node.provision_state == states.RESCUEWAIT):
                msg = _('Node failed to perform rescue operation.')
                self._finalize_rescue(task)
        except Exception as e:
            # Route the failure to the error handler that matches the
            # node's current provision state.
            err_info = {'msg': msg, 'e': e}
            last_error = _('Asynchronous exception: %(msg)s '
                           'Exception: %(e)s for node') % err_info
            errmsg = last_error + ' %(node)s'
            LOG.exception(errmsg, {'node': node.uuid})
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(
                    task, last_error, collect_logs=bool(self._client))
            elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
                manager_utils.rescuing_error_handler(task, last_error)
    def heartbeat(self, task, callback_url):
        """Handle a heartbeat from the agent ramdisk.

        Records the agent's callback URL on the node, then advances the
        deploy or clean state machine according to the node's current
        provision state. Failures raised by the async handlers are turned
        into a cleaning or deploy failure on the node.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        """
        # TODO(dtantsur): upgrade lock only if we actually take action other
        # than updating the last timestamp.
        task.upgrade_lock()

        node = task.node
        LOG.debug('Heartbeat from node %s', node.uuid)

        internal_info = node.driver_internal_info
        internal_info['agent_url'] = callback_url

        # TODO(rloo): 'agent_last_heartbeat' was deprecated since it wasn't
        # being used so remove that entry if it exists.
        # Hopefully all nodes will have been updated by Pike, so
        # we can delete this code then.
        internal_info.pop('agent_last_heartbeat', None)

        node.driver_internal_info = internal_info
        node.save()

        # Async callbacks don't set error state on their own; `msg` tracks
        # which step was in flight so the handler below can report it.
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                          'not taking any action.', {'node': node.uuid})
                return
            elif (node.provision_state == states.DEPLOYWAIT
                  and not self.deploy_has_started(task)):
                msg = _('Node failed to get image for deploy.')
                self.continue_deploy(task)
            elif (node.provision_state == states.DEPLOYWAIT
                  and self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task)
            elif (node.provision_state == states.DEPLOYWAIT
                  and self.deploy_has_started(task)):
                # Deploy is in progress: just record that the agent is alive.
                node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                try:
                    if node.clean_step:
                        msg = _('Node failed to check cleaning progress.')
                        self.continue_cleaning(task)
                    else:
                        LOG.debug('Node %s just booted to start cleaning.',
                                  node.uuid)
                        msg = _('Node failed to start the first cleaning '
                                'step.')
                        # First, cache the clean steps
                        self._refresh_clean_steps(task)
                        # Then set/verify node clean steps and start cleaning
                        manager_utils.set_node_cleaning_steps(task)
                        _notify_conductor_resume_clean(task)
                except exception.NoFreeConductorWorker:
                    # waiting for the next heartbeat, node.last_error and
                    # logging message is filled already via conductor's hook
                    pass

        except Exception as exc:
            details = {'node': node.uuid, 'msg': msg, 'e': exc}
            last_error = _('Asynchronous exception for node %(node)s: '
                           '%(msg)s Exception: %(e)s') % details
            LOG.exception(last_error)
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(task, last_error)
Exemple #44
0
    def heartbeat(self, task, callback_url, agent_version):
        """Process a heartbeat.

        Records the agent URL, version and last-heartbeat timestamp on the
        node, then drives deploy, clean or rescue handling forward based on
        the node's provision state. Any exception from the async handlers is
        converted into a cleaning/deploy/rescue failure on the node.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        :param agent_version: The version of the agent that is heartbeating
        """
        # NOTE(pas-ha) immediately skip the rest if nothing to do
        if (task.node.provision_state not in self.heartbeat_allowed_states
            and not manager_utils.fast_track_able(task)):
            LOG.debug('Heartbeat from node %(node)s in unsupported '
                      'provision state %(state)s, not taking any action.',
                      {'node': task.node.uuid,
                       'state': task.node.provision_state})
            return

        try:
            task.upgrade_lock()
        except exception.NodeLocked:
            # Another conductor operation holds the node; the agent will
            # heartbeat again, so this is safe to skip rather than block.
            LOG.warning('Node %s is currently locked, skipping heartbeat '
                        'processing (will retry on the next heartbeat)',
                        task.node.uuid)
            return

        node = task.node
        LOG.debug('Heartbeat from node %s', node.uuid)
        driver_internal_info = node.driver_internal_info
        driver_internal_info['agent_url'] = callback_url
        driver_internal_info['agent_version'] = agent_version
        # Record the last heartbeat event time in UTC, so we can make
        # decisions about it later. Can be decoded to datetime object with:
        # datetime.datetime.strptime(var, "%Y-%m-%d %H:%M:%S.%f")
        driver_internal_info['agent_last_heartbeat'] = str(
            timeutils.utcnow().isoformat())
        node.driver_internal_info = driver_internal_info
        # Persist the bookkeeping fields before taking any further action.
        node.save()

        if node.provision_state in _HEARTBEAT_RECORD_ONLY:
            # We shouldn't take any additional action. The agent will
            # silently continue to heartbeat to ironic until user initiated
            # state change occurs causing it to match a state below.
            LOG.debug('Heartbeat from %(node)s recorded to identify the '
                      'node as on-line.', {'node': task.node.uuid})
            return

        # Async call backs don't set error state on their own.
        # `msg` tracks which step is in flight so the except clause below
        # can report a meaningful error.
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                          'not taking any action.', {'node': node.uuid})
                return
            # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
            # are currently in the core deploy.deploy step. Other deploy steps
            # may cause the agent to boot, but we should not trigger deployment
            # at that point.
            elif node.provision_state == states.DEPLOYWAIT:
                if self.in_core_deploy_step(task):
                    if not self.deploy_has_started(task):
                        msg = _('Node failed to deploy.')
                        self.continue_deploy(task)
                    elif self.deploy_is_done(task):
                        msg = _('Node failed to move to active state.')
                        self.reboot_to_instance(task)
                    else:
                        # Deploy is in progress; just record liveness.
                        node.touch_provisioning()
                else:
                    node.touch_provisioning()
            elif node.provision_state == states.CLEANWAIT:
                node.touch_provisioning()
                if not node.clean_step:
                    LOG.debug('Node %s just booted to start cleaning.',
                              node.uuid)
                    msg = _('Node failed to start the first cleaning step.')
                    # First, cache the clean steps
                    self.refresh_clean_steps(task)
                    # Then set/verify node clean steps and start cleaning
                    conductor_steps.set_node_cleaning_steps(task)
                    # The exceptions from RPC are not possible as we using cast
                    # here
                    manager_utils.notify_conductor_resume_clean(task)
                else:
                    msg = _('Node failed to check cleaning progress.')
                    self.continue_cleaning(task)
            elif (node.provision_state == states.RESCUEWAIT):
                msg = _('Node failed to perform rescue operation.')
                self._finalize_rescue(task)
        except Exception as e:
            err_info = {'msg': msg, 'e': e}
            last_error = _('Asynchronous exception: %(msg)s '
                           'Exception: %(e)s for node') % err_info
            errmsg = last_error + ' %(node)s'
            LOG.exception(errmsg, {'node': node.uuid})
            # Route the failure to the handler matching the current phase.
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
            elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(
                    task, last_error, collect_logs=bool(self._client))
            elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
                manager_utils.rescuing_error_handler(task, last_error)
Exemple #45
0
 def test_cleaning_error_handler_tear_down_error(self, log_mock):
     """A tear-down failure during error handling must be logged."""
     # Arrange: make tear_down_cleaning blow up inside the handler.
     self.task.driver.deploy.tear_down_cleaning.side_effect = Exception('bar')

     conductor_utils.cleaning_error_handler(self.task, 'foo')

     # The handler must swallow the tear-down error but log it.
     self.assertTrue(log_mock.exception.called)
Exemple #46
0
def do_node_clean(task, clean_steps=None):
    """Internal RPC method to perform cleaning of a node.

    Orchestrates the cleaning sequence: skip/guard checks, driver
    validation, BIOS-settings caching, ramdisk preparation and finally
    executing (or waiting for) the clean steps.

    :param task: a TaskManager instance with an exclusive lock on its node
    :param clean_steps: For a manual clean, the list of clean steps to
                        perform. Is None For automated cleaning (default).
                        For more information, see the clean_steps parameter
                        of :func:`ConductorManager.do_node_clean`.
    """
    node = task.node
    # clean_steps is only supplied for manual cleaning.
    manual_clean = clean_steps is not None
    clean_type = 'manual' if manual_clean else 'automated'
    LOG.debug('Starting %(type)s cleaning for node %(node)s',
              {'type': clean_type, 'node': node.uuid})

    if not manual_clean and utils.skip_automated_cleaning(node):
        # Skip cleaning, move to AVAILABLE.
        node.clean_step = None
        node.save()

        task.process_event('done')
        LOG.info('Automated cleaning is disabled, node %s has been '
                 'successfully moved to AVAILABLE state.', node.uuid)
        return

    # NOTE(dtantsur): this is only reachable during automated cleaning,
    # for manual cleaning we verify maintenance mode earlier on.
    if (not CONF.conductor.allow_provisioning_in_maintenance
            and node.maintenance):
        msg = _('Cleaning a node in maintenance mode is not allowed')
        # tear_down_cleaning=False: nothing was set up yet, nothing to undo.
        return utils.cleaning_error_handler(task, msg,
                                            tear_down_cleaning=False)

    try:
        # NOTE(ghe): Valid power and network values are needed to perform
        # a cleaning.
        task.driver.power.validate(task)
        task.driver.network.validate(task)
    except exception.InvalidParameterValue as e:
        msg = (_('Validation failed. Cannot clean node %(node)s. '
                 'Error: %(msg)s') %
               {'node': node.uuid, 'msg': e})
        return utils.cleaning_error_handler(task, msg)

    if manual_clean:
        # Persist the requested steps so the rest of the cleaning flow
        # (and any conductor takeover) can read them from the node.
        info = node.driver_internal_info
        info['clean_steps'] = clean_steps
        node.driver_internal_info = info
        node.save()

    # Do caching of bios settings if supported by driver,
    # this will be called for both manual and automated cleaning.
    try:
        task.driver.bios.cache_bios_settings(task)
    except exception.UnsupportedDriverExtension:
        LOG.warning('BIOS settings are not supported for node %s, '
                    'skipping', task.node.uuid)
    # TODO(zshi) remove this check when classic drivers are removed
    except Exception:
        # Best-effort: a BIOS caching failure must not abort cleaning.
        msg = (_('Caching of bios settings failed on node %(node)s. '
                 'Continuing with node cleaning.')
               % {'node': node.uuid})
        LOG.exception(msg)

    # Allow the deploy driver to set up the ramdisk again (necessary for
    # IPA cleaning)
    try:
        prepare_result = task.driver.deploy.prepare_cleaning(task)
    except Exception as e:
        msg = (_('Failed to prepare node %(node)s for cleaning: %(e)s')
               % {'node': node.uuid, 'e': e})
        LOG.exception(msg)
        return utils.cleaning_error_handler(task, msg)

    if prepare_result == states.CLEANWAIT:
        # Prepare is asynchronous, the deploy driver will need to
        # set node.driver_internal_info['clean_steps'] and
        # node.clean_step and then make an RPC call to
        # continue_node_clean to start cleaning.

        # For manual cleaning, the target provision state is MANAGEABLE,
        # whereas for automated cleaning, it is AVAILABLE (the default).
        target_state = states.MANAGEABLE if manual_clean else None
        task.process_event('wait', target_state=target_state)
        return

    try:
        conductor_steps.set_node_cleaning_steps(task)
    except (exception.InvalidParameterValue,
            exception.NodeCleaningFailure) as e:
        msg = (_('Cannot clean node %(node)s. Error: %(msg)s')
               % {'node': node.uuid, 'msg': e})
        return utils.cleaning_error_handler(task, msg)

    # Kick off execution from the first step; step_index is None when the
    # node has no steps to run.
    steps = node.driver_internal_info.get('clean_steps', [])
    step_index = 0 if steps else None
    do_next_clean_step(task, step_index)
Exemple #47
0
 def test_cleaning_error_handler_no_fail(self):
     """With set_fail_state=False, tear-down runs but no state event fires."""
     conductor_utils.cleaning_error_handler(self.task, 'foo',
                                            set_fail_state=False)
     # Tear-down of cleaning must still happen exactly once.
     driver = self.task.driver.deploy
     driver.tear_down_cleaning.assert_called_once_with(self.task)
     # But the node must not be transitioned to a failed state.
     self.assertFalse(self.task.process_event.called)