Esempio n. 1
0
 def _set_step_failed(self, task, msg, exc):
     """Fail the current deploy or clean step after a RAID job failure.

     :param task: a TaskManager instance with the node to act on.
     :param msg: the error text; embedded in the log message and passed
         to the error handler as ``errmsg``.
     :param exc: not used by this method.
     """
     details = {'node': task.node.uuid, 'message': msg}
     log_msg = ("RAID configuration job failed for node %(node)s. "
                "Message: '%(message)s'." % details)
     # Only a node that is currently DEPLOYING takes the deploy error
     # path; every other provision state is handled as cleaning.
     deploying = task.node.provision_state == states.DEPLOYING
     handler = (manager_utils.deploying_error_handler if deploying
                else manager_utils.cleaning_error_handler)
     handler(task, log_msg, errmsg=msg)
Esempio n. 2
0
    def _set_failed(self, task, error_message):
        """Report a BIOS configuration failure through an error handler.

        The cleaning error handler is used when the node has a clean step
        recorded; otherwise the deploying error handler is used.

        :param task: a TaskManager instance with node to act on
        :param error_message: Error message
        """
        node = task.node
        substitutions = {'node': node.uuid, 'error': error_message}
        log_msg = ("BIOS configuration failed for node %(node)s. %(error)s "
                   % substitutions)
        if not node.clean_step:
            manager_utils.deploying_error_handler(task, log_msg, error_message)
            return
        manager_utils.cleaning_error_handler(task, log_msg, error_message)
Esempio n. 3
0
def continue_node_deploy(task):
    """Resume a deployment once an asynchronous deploy step has finished.

    Works out the index of the next step and hands over to
    do_next_deploy_step. The first time through (i.e. while
    driver_internal_info has no truthy 'steps_validated' entry) the deploy
    templates and steps are validated as well.

    :param task: a TaskManager instance with an exclusive lock
    """
    node = task.node
    already_validated = node.driver_internal_info.get('steps_validated')

    if not already_validated:
        # The agent is running by now, so the remaining steps can be checked
        try:
            conductor_steps.validate_deploy_templates(task)
            conductor_steps.set_node_deployment_steps(
                task, reset_current=False)
        except exception.IronicException as exc:
            template = _('Failed to validate the final deploy steps list '
                         'for node %(node)s: %(exc)s')
            msg = template % {'node': node.uuid, 'exc': exc}
            return utils.deploying_error_handler(task, msg)

        # Record that validation happened so later calls skip it
        internal_info = node.driver_internal_info
        internal_info['steps_validated'] = True
        node.driver_internal_info = internal_info
        node.save()

    upcoming_index = utils.update_next_step_index(task, 'deploy')
    do_next_deploy_step(task, upcoming_index)
Esempio n. 4
0
    def _set_step_failed(self, task, attrs_not_updated):
        """Report a failed Redfish BIOS apply_configuration step.

        Calls the cleaning error handler when the node is in CLEANING or
        CLEANWAIT, and the deploying error handler when it is in DEPLOYING
        or DEPLOYWAIT. No handler is called for any other provision state.

        :param task: a TaskManager instance containing the node to act on.
        :param attrs_not_updated: the BIOS attributes that were not updated.
        """
        # Message that includes the node UUID.
        node_template = _('Redfish BIOS apply_configuration step failed for '
                          'node %(node)s. Attributes %(attrs)s are not '
                          'updated.')
        error_msg = node_template % {'node': task.node.uuid,
                                     'attrs': attrs_not_updated}
        # Shorter message without the node UUID.
        short_template = _('Redfish BIOS apply_configuration step failed. '
                           'Attributes %(attrs)s are not updated.')
        last_error = short_template % {'attrs': attrs_not_updated}

        cleaning_states = (states.CLEANING, states.CLEANWAIT)
        deploying_states = (states.DEPLOYING, states.DEPLOYWAIT)
        # The provision state is deliberately re-read for the second check.
        if task.node.provision_state in cleaning_states:
            manager_utils.cleaning_error_handler(task, last_error)
        if task.node.provision_state in deploying_states:
            manager_utils.deploying_error_handler(task, error_msg, last_error)
Esempio n. 5
0
def _post_step_reboot(task, step_type):
    """Reboot a node out of band after a clean/deploy step that asked for it.

    The ramdisk is prepared again and the node is power-cycled. If any of
    that fails, the error is logged and the clean or deploy error handler
    (chosen by ``step_type``) is invoked. On success a reboot marker is
    stored in the node's driver_internal_info.

    :param task: a TaskManager instance
    :param step_type: 'clean' or 'deploy'
    """
    is_clean = step_type == 'clean'
    if is_clean:
        current_step = task.node.clean_step
    else:
        current_step = task.node.deploy_step

    try:
        # NOTE(fellypefca): prepare_ramdisk is called to ensure that the
        # baremetal node boots back into the ramdisk after the reboot.
        ramdisk_opts = deploy_utils.build_agent_options(task.node)
        task.driver.boot.prepare_ramdisk(task, ramdisk_opts)
        manager_utils.node_power_action(task, states.REBOOT)
    except Exception as e:
        template = _('Reboot requested by %(type)s step %(step)s failed for '
                     'node %(node)s: %(err)s')
        msg = template % {'step': current_step,
                          'node': task.node.uuid,
                          'err': e,
                          'type': step_type}
        # A traceback is only logged for non-Ironic exceptions.
        unexpected = not isinstance(e, exception.IronicException)
        LOG.error(msg, exc_info=unexpected)
        # The reboot marker is not set because no reboot happened.
        fail = (manager_utils.cleaning_error_handler if is_clean
                else manager_utils.deploying_error_handler)
        fail(task, msg)
        return

    # Record that the node has been rebooted
    info = task.node.driver_internal_info
    reboot_flag = 'cleaning_reboot' if is_clean else 'deployment_reboot'
    info[reboot_flag] = True
    token_pregenerated = info.get('agent_secret_token_pregenerated', False)
    if not token_pregenerated:
        # Drop the recorded token: the machine will need to re-establish
        # it after the reboot.
        info.pop('agent_secret_token', None)
    task.node.driver_internal_info = info
    task.node.save()
Esempio n. 6
0
def set_failed_state(task, msg, collect_logs=True):
    """Mark the deployment as failed and record the given message.

    Optionally collects ramdisk logs, invokes the deploying error handler
    (without clean up), optionally powers the node off, and finally stores
    ``msg`` in the node's last_error and saves the node.

    :param task: a TaskManager instance containing the node to act on.
    :param msg: the message to set in logs and last_error of the node.
    :param collect_logs: Boolean indicating whether to attempt to collect
                         logs from IPA-based ramdisk. Defaults to True.
                         Actual log collection is also affected by
                         CONF.agent.deploy_logs_collect config option.
    """
    node = task.node

    if collect_logs:
        # The config option is only consulted when the caller asked for logs.
        if CONF.agent.deploy_logs_collect in ('on_failure', 'always'):
            driver_utils.collect_ramdisk_logs(node)

    try:
        manager_utils.deploying_error_handler(task, msg, msg, clean_up=False)
    except exception.InvalidState:
        state_info = {'node': node.uuid, 'state': node.provision_state}
        LOG.exception('Internal error. Node %(node)s in provision state '
                      '"%(state)s" could not transition to a failed state.'
                      % state_info)

    if CONF.deploy.power_off_after_deploy_failure:
        try:
            manager_utils.node_power_action(task, states.POWER_OFF)
        except Exception:
            # Best effort: log and carry on so last_error is still recorded.
            LOG.exception('Node %s failed to power off while handling deploy '
                          'failure. This may be a serious condition. Node '
                          'should be removed from Ironic or put in '
                          'maintenance mode until the problem is resolved.'
                          % node.uuid)

    # NOTE(tenbrae): node_power_action() erases node.last_error
    #             so it has to be set (again) here.
    node.last_error = msg
    node.save()
Esempio n. 7
0
def do_next_deploy_step(task, step_index, conductor_id):
    """Do deployment, starting from the specified deploy step.

    Runs the remaining deploy steps one after another. Execution stops
    early when a step raises, returns states.DEPLOYWAIT, or returns any
    other non-None value. When every step has returned None, the
    deployment bookkeeping is removed from driver_internal_info and the
    'done' event is processed.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first deploy step in the list to execute. This
        is the index (from 0) into the list of deploy steps in the node's
        driver_internal_info['deploy_steps']. Is None if there are no steps
        to execute.
    :param conductor_id: the value stored in the node's conductor_affinity
        once the first step of this call has executed without raising.
    """
    node = task.node
    if step_index is None:
        steps = []
    else:
        steps = node.driver_internal_info['deploy_steps'][step_index:]

    LOG.info(
        'Executing %(state)s on node %(node)s, remaining steps: '
        '%(steps)s', {
            'node': node.uuid,
            'steps': steps,
            'state': node.provision_state
        })

    # Execute each step until we hit an async step or run out of steps
    for ind, step in enumerate(steps):
        # Save which step we're about to start so we can restart
        # if necessary
        node.deploy_step = step
        driver_internal_info = node.driver_internal_info
        # 'ind' counts from the slice start, so add step_index to get the
        # position in the full deploy_steps list.
        driver_internal_info['deploy_step_index'] = step_index + ind
        node.driver_internal_info = driver_internal_info
        node.save()
        # The step's 'interface' value names the task.driver attribute
        # that executes it.
        interface = getattr(task.driver, step.get('interface'))
        LOG.info('Executing %(step)s on node %(node)s', {
            'step': step,
            'node': node.uuid
        })
        try:
            result = interface.execute_deploy_step(task, step)
        except exception.IronicException as e:
            if isinstance(e, exception.AgentConnectionFailed):
                # A connection failure while 'deployment_reboot' is set is
                # not treated as an error: move to the wait state instead.
                if task.node.driver_internal_info.get('deployment_reboot'):
                    LOG.info(
                        'Agent is not yet running on node %(node)s after '
                        'deployment reboot, waiting for agent to come up '
                        'to run next deploy step %(step)s.', {
                            'node': node.uuid,
                            'step': step
                        })
                    # NOTE(review): presumably makes the current step run
                    # again on resume rather than being skipped -- confirm
                    # against the code that reads this flag.
                    driver_internal_info['skip_current_deploy_step'] = False
                    node.driver_internal_info = driver_internal_info
                    task.process_event('wait')
                    return
            # Any other Ironic error fails the deployment.
            log_msg = ('Node %(node)s failed deploy step %(step)s. Error: '
                       '%(err)s' % {
                           'node': node.uuid,
                           'step': node.deploy_step,
                           'err': e
                       })
            utils.deploying_error_handler(
                task, log_msg,
                _("Failed to deploy: %s") % node.deploy_step)
            return
        except Exception as e:
            # Non-Ironic exceptions are reported with traceback=True.
            log_msg = ('Node %(node)s failed deploy step %(step)s with '
                       'unexpected error: %(err)s' % {
                           'node': node.uuid,
                           'step': node.deploy_step,
                           'err': e
                       })
            utils.deploying_error_handler(
                task,
                log_msg,
                _("Failed to deploy. Exception: %s") % e,
                traceback=True)
            return

        if ind == 0:
            # We've done the very first deploy step.
            # Update conductor_affinity to reference this conductor's ID
            # since there may be local persistent state
            node.conductor_affinity = conductor_id
            node.save()

        # Check if the step is done or not. The step should return
        # states.DEPLOYWAIT if the step is still being executed, or
        # None if the step is done.
        # NOTE(deva): Some drivers may return states.DEPLOYWAIT
        #             eg. if they are waiting for a callback
        if result == states.DEPLOYWAIT:
            # Kill this worker, the async step will make an RPC call to
            # continue_node_deploy() to continue deploying
            LOG.info(
                'Deploy step %(step)s on node %(node)s being '
                'executed asynchronously, waiting for driver.', {
                    'node': node.uuid,
                    'step': step
                })
            task.process_event('wait')
            return
        elif result is not None:
            # NOTE(rloo): This is an internal/dev error; shouldn't happen.
            log_msg = (_('While executing deploy step %(step)s on node '
                         '%(node)s, step returned unexpected state: %(val)s') %
                       {
                           'step': step,
                           'node': node.uuid,
                           'val': result
                       })
            utils.deploying_error_handler(
                task, log_msg,
                _("Failed to deploy: %s") % node.deploy_step)
            return

        LOG.info('Node %(node)s finished deploy step %(step)s', {
            'node': node.uuid,
            'step': step
        })

    # Finished executing the steps. Clear deploy_step.
    node.deploy_step = None
    # Drop the per-deployment entries from driver_internal_info.
    driver_internal_info = node.driver_internal_info
    driver_internal_info.pop('agent_secret_token', None)
    driver_internal_info.pop('agent_secret_token_pregenerated', None)
    driver_internal_info['deploy_steps'] = None
    driver_internal_info.pop('deploy_step_index', None)
    driver_internal_info.pop('deployment_reboot', None)
    driver_internal_info.pop('deployment_polling', None)
    # Remove the agent_url cached from the deployment.
    driver_internal_info.pop('agent_url', None)
    node.driver_internal_info = driver_internal_info
    node.save()

    _start_console_in_deploy(task)

    task.process_event('done')
    LOG.info(
        'Successfully deployed node %(node)s with '
        'instance %(instance)s.', {
            'node': node.uuid,
            'instance': node.instance_uuid
        })
Esempio n. 8
0
def do_node_deploy(task, conductor_id=None, configdrive=None):
    """Prepare the environment and deploy a node.

    Stores the config drive (when one is given), calls the deploy
    interface's prepare(), collects the deploy steps and starts executing
    them from the first one. Failures are reported through
    utils.deploying_error_handler before the exception propagates.

    :param task: a TaskManager instance; task.node is the node to deploy.
    :param conductor_id: passed through to do_next_deploy_step.
    :param configdrive: optional config drive; a dict is first converted
        with utils.build_configdrive before being stored.
    :raises: InstanceDeployFailure if no deploy steps are set on the node.
    """
    node = task.node
    utils.del_secret_token(node)

    # Stage 1: build (if needed) and store the config drive.
    try:
        if configdrive:
            if isinstance(configdrive, dict):
                configdrive = utils.build_configdrive(node, configdrive)
            _store_configdrive(node, configdrive)
    except (exception.SwiftOperationError, exception.ConfigInvalid) as e:
        with excutils.save_and_reraise_exception():
            log_msg = ('Error while uploading the configdrive for %(node)s '
                       'to Swift') % {'node': node.uuid}
            last_error = _('Failed to upload the configdrive to Swift. '
                           'Error: %s') % e
            utils.deploying_error_handler(
                task, log_msg, last_error, clean_up=False)
    except db_exception.DBDataError as e:
        with excutils.save_and_reraise_exception():
            # NOTE(hshiina): This error happens when the configdrive is
            #                too large. Remove the configdrive from the
            #                object to update DB successfully in handling
            #                the failure.
            node.obj_reset_changes()
            log_msg = ('Error while storing the configdrive for %(node)s '
                       'into the database: %(err)s') % {
                           'node': node.uuid,
                           'err': e
                       }
            last_error = _("Failed to store the configdrive in the "
                           "database. %s") % e
            utils.deploying_error_handler(
                task, log_msg, last_error, clean_up=False)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            log_msg = ('Unexpected error while preparing the configdrive '
                       'for node %(node)s') % {'node': node.uuid}
            last_error = _("Failed to prepare the configdrive. "
                           "Exception: %s") % e
            utils.deploying_error_handler(
                task, log_msg, last_error, traceback=True, clean_up=False)

    # Stage 2: let the deploy interface prepare the node.
    try:
        task.driver.deploy.prepare(task)
    except exception.IronicException as e:
        with excutils.save_and_reraise_exception():
            log_msg = ('Error while preparing to deploy to node %(node)s: '
                       '%(err)s') % {'node': node.uuid, 'err': e}
            last_error = _("Failed to prepare to deploy: %s") % e
            utils.deploying_error_handler(
                task, log_msg, last_error, clean_up=False)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            log_msg = ('Unexpected error while preparing to deploy to node '
                       '%(node)s') % {'node': node.uuid}
            last_error = _("Failed to prepare to deploy. Exception: %s") % e
            utils.deploying_error_handler(
                task, log_msg, last_error, traceback=True, clean_up=False)

    # Stage 3: collect the deploy steps into the node's
    # driver_internal_info['deploy_steps'].
    try:
        conductor_steps.set_node_deployment_steps(task)
    except exception.InstanceDeployFailure as e:
        with excutils.save_and_reraise_exception():
            log_msg = ('Error while getting deploy steps; cannot deploy to '
                       'node %(node)s. Error: %(err)s' % {
                           'node': node.uuid,
                           'err': e
                       })
            last_error = _("Cannot get deploy steps; "
                           "failed to deploy: %s") % e
            utils.deploying_error_handler(task, log_msg, last_error)

    found_steps = node.driver_internal_info.get('deploy_steps')
    if not found_steps:
        msg = _('Error while getting deploy steps: no steps returned for '
                'node %s') % node.uuid
        utils.deploying_error_handler(
            task, msg, _("No deploy steps returned by the driver"))
        raise exception.InstanceDeployFailure(msg)

    # Stage 4: start executing from the first step.
    do_next_deploy_step(task, 0, conductor_id)
Esempio n. 9
0
def do_next_deploy_step(task, step_index):
    """Do deployment, starting from the specified deploy step.

    Runs the remaining deploy steps one after another. Execution stops
    early when a step raises, leaves the node in DEPLOYFAIL, returns
    states.DEPLOYWAIT, or returns any other non-None value. When every
    step has returned None, the deployment bookkeeping is wiped and the
    'done' event is processed.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first deploy step in the list to execute. This
        is the index (from 0) into the list of deploy steps in the node's
        driver_internal_info['deploy_steps']. Is None if there are no steps
        to execute.
    """
    node = task.node
    if step_index is None:
        steps = []
    else:
        steps = node.driver_internal_info['deploy_steps'][step_index:]

    LOG.info(
        'Executing %(state)s on node %(node)s, remaining steps: '
        '%(steps)s', {
            'node': node.uuid,
            'steps': steps,
            'state': node.provision_state
        })

    # Execute each step until we hit an async step or run out of steps
    for ind, step in enumerate(steps):
        # Save which step we're about to start so we can restart
        # if necessary
        node.deploy_step = step
        driver_internal_info = node.driver_internal_info
        # 'ind' counts from the slice start, so add step_index to get the
        # position in the full deploy_steps list.
        driver_internal_info['deploy_step_index'] = step_index + ind
        node.driver_internal_info = driver_internal_info
        node.save()
        # The step's 'interface' value names the task.driver attribute
        # that executes it.
        interface = getattr(task.driver, step.get('interface'))
        LOG.info('Executing %(step)s on node %(node)s', {
            'step': step,
            'node': node.uuid
        })
        try:
            result = interface.execute_deploy_step(task, step)
        except exception.IronicException as e:
            if isinstance(e, exception.AgentConnectionFailed):
                # A connection failure while 'deployment_reboot' is set is
                # not treated as an error: move to the wait state instead.
                if task.node.driver_internal_info.get('deployment_reboot'):
                    LOG.info(
                        'Agent is not yet running on node %(node)s after '
                        'deployment reboot, waiting for agent to come up '
                        'to run next deploy step %(step)s.', {
                            'node': node.uuid,
                            'step': step
                        })
                    # NOTE(review): presumably makes the current step run
                    # again on resume rather than being skipped -- confirm
                    # against the code that reads this flag.
                    driver_internal_info['skip_current_deploy_step'] = False
                    node.driver_internal_info = driver_internal_info
                    task.process_event('wait')
                    return
            if isinstance(e, exception.AgentInProgress):
                # The agent is busy with another command: wait instead of
                # failing the deployment.
                LOG.info(
                    'Conductor attempted to process deploy step for '
                    'node %(node)s. Agent indicated it is presently '
                    'executing a command. Error: %(error)s', {
                        'node': task.node.uuid,
                        'error': e
                    })
                driver_internal_info['skip_current_deploy_step'] = False
                node.driver_internal_info = driver_internal_info
                task.process_event('wait')
                return
            # Any other Ironic error fails the deployment.
            log_msg = ('Node %(node)s failed deploy step %(step)s. Error: '
                       '%(err)s' % {
                           'node': node.uuid,
                           'step': node.deploy_step,
                           'err': e
                       })
            utils.deploying_error_handler(
                task, log_msg,
                _("Failed to deploy: Deploy step %(step)s, "
                  "error: %(err)s.") % {
                      'step': node.deploy_step,
                      'err': e
                  })
            return
        except Exception as e:
            # Non-Ironic exceptions are reported with traceback=True.
            log_msg = ('Node %(node)s failed deploy step %(step)s with '
                       'unexpected error: %(err)s' % {
                           'node': node.uuid,
                           'step': node.deploy_step,
                           'err': e
                       })
            utils.deploying_error_handler(
                task,
                log_msg,
                _("Failed to deploy. Exception: %s") % e,
                traceback=True)
            return

        if task.node.provision_state == states.DEPLOYFAIL:
            # NOTE(dtantsur): some deploy steps do not raise but rather update
            # the node and return. Take them into account.
            LOG.debug(
                'Node %s is in error state, not processing '
                'the remaining deploy steps', task.node)
            return

        # Check if the step is done or not. The step should return
        # states.DEPLOYWAIT if the step is still being executed, or
        # None if the step is done.
        # NOTE(tenbrae): Some drivers may return states.DEPLOYWAIT
        #                eg. if they are waiting for a callback
        if result == states.DEPLOYWAIT:
            # Kill this worker, the async step will make an RPC call to
            # continue_node_deploy() to continue deploying
            LOG.info(
                'Deploy step %(step)s on node %(node)s being '
                'executed asynchronously, waiting for driver.', {
                    'node': node.uuid,
                    'step': step
                })
            # Only fire 'wait' if the node is not already in DEPLOYWAIT.
            if task.node.provision_state != states.DEPLOYWAIT:
                task.process_event('wait')
            return
        elif result is not None:
            # NOTE(rloo): This is an internal/dev error; shouldn't happen.
            log_msg = (_('While executing deploy step %(step)s on node '
                         '%(node)s, step returned unexpected state: %(val)s') %
                       {
                           'step': step,
                           'node': node.uuid,
                           'val': result
                       })
            utils.deploying_error_handler(
                task, log_msg,
                _("Failed to deploy: %s") % node.deploy_step)
            return

        LOG.info('Node %(node)s finished deploy step %(step)s', {
            'node': node.uuid,
            'step': step
        })

    # Finished executing the steps. Clear deploy_step.
    node.deploy_step = None
    utils.wipe_deploy_internal_info(task)
    node.save()

    _start_console_in_deploy(task)

    task.process_event('done')
    LOG.info(
        'Successfully deployed node %(node)s with '
        'instance %(instance)s.', {
            'node': node.uuid,
            'instance': node.instance_uuid
        })
Esempio n. 10
0
def do_node_deploy(task,
                   conductor_id=None,
                   configdrive=None,
                   deploy_steps=None):
    """Prepare the environment and deploy a node.

    Stores the config drive (when one is given), calls the deploy
    interface's prepare(), records any user-supplied deploy steps,
    collects the deploy steps and starts executing them from the first
    one. Failures are reported through utils.deploying_error_handler
    before the exception propagates.

    :param task: a TaskManager instance; task.node is the node to deploy.
    :param conductor_id: when not None, stored in the node's
        conductor_affinity before the first step is executed.
    :param configdrive: optional config drive; a dict is first converted
        with utils.build_configdrive before being stored.
    :param deploy_steps: optional user-provided deploy steps, saved in
        driver_internal_info['user_deploy_steps'].
    :raises: InstanceDeployFailure if no deploy steps are set on the node.
    """
    node = task.node
    utils.wipe_deploy_internal_info(task)

    # Stage 1: build (if needed) and store the config drive.
    try:
        if configdrive:
            if isinstance(configdrive, dict):
                configdrive = utils.build_configdrive(node, configdrive)
            _store_configdrive(node, configdrive)
    except (exception.SwiftOperationError, exception.ConfigInvalid) as e:
        with excutils.save_and_reraise_exception():
            log_msg = ('Error while uploading the configdrive for %(node)s '
                       'to Swift') % {'node': node.uuid}
            last_error = _('Failed to upload the configdrive to Swift. '
                           'Error: %s') % e
            utils.deploying_error_handler(
                task, log_msg, last_error, clean_up=False)
    except db_exception.DBDataError as e:
        with excutils.save_and_reraise_exception():
            # NOTE(hshiina): This error happens when the configdrive is
            #                too large. Remove the configdrive from the
            #                object to update DB successfully in handling
            #                the failure.
            node.obj_reset_changes()
            log_msg = ('Error while storing the configdrive for %(node)s '
                       'into the database: %(err)s') % {
                           'node': node.uuid,
                           'err': e
                       }
            last_error = _("Failed to store the configdrive in the "
                           "database. %s") % e
            utils.deploying_error_handler(
                task, log_msg, last_error, clean_up=False)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            log_msg = ('Unexpected error while preparing the configdrive '
                       'for node %(node)s') % {'node': node.uuid}
            last_error = _("Failed to prepare the configdrive. "
                           "Exception: %s") % e
            utils.deploying_error_handler(
                task, log_msg, last_error, traceback=True, clean_up=False)

    # Stage 2: let the deploy interface prepare the node.
    try:
        task.driver.deploy.prepare(task)
    except exception.IronicException as e:
        with excutils.save_and_reraise_exception():
            log_msg = ('Error while preparing to deploy to node %(node)s: '
                       '%(err)s') % {'node': node.uuid, 'err': e}
            last_error = _("Failed to prepare to deploy: %s") % e
            utils.deploying_error_handler(
                task, log_msg, last_error, clean_up=False)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            log_msg = ('Unexpected error while preparing to deploy to node '
                       '%(node)s') % {'node': node.uuid}
            last_error = _("Failed to prepare to deploy. Exception: %s") % e
            utils.deploying_error_handler(
                task, log_msg, last_error, traceback=True, clean_up=False)

    # Stage 3: record user steps and collect the full step list.
    try:
        # User-provided steps are only saved here; they are validated and
        # processed later together with driver and deploy template steps.
        if deploy_steps:
            internal_info = node.driver_internal_info
            internal_info['user_deploy_steps'] = deploy_steps
            node.driver_internal_info = internal_info
            node.save()
        # Gathers the deploy steps (if any) from the driver, the deploy
        # template and the deploy_steps argument into the node's
        # driver_internal_info['deploy_steps']. In-band steps are skipped
        # because no agent is running yet.
        conductor_steps.set_node_deployment_steps(task, skip_missing=True)
    except exception.InstanceDeployFailure as e:
        with excutils.save_and_reraise_exception():
            log_msg = ('Error while getting deploy steps; cannot deploy to '
                       'node %(node)s. Error: %(err)s' % {
                           'node': node.uuid,
                           'err': e
                       })
            last_error = _("Cannot get deploy steps; "
                           "failed to deploy: %s") % e
            utils.deploying_error_handler(task, log_msg, last_error)

    found_steps = node.driver_internal_info.get('deploy_steps')
    if not found_steps:
        msg = _('Error while getting deploy steps: no steps returned for '
                'node %s') % node.uuid
        utils.deploying_error_handler(
            task, msg, _("No deploy steps returned by the driver"))
        raise exception.InstanceDeployFailure(msg)

    if conductor_id is not None:
        # Point conductor_affinity at this conductor's ID, since there
        # may be local persistent state.
        node.conductor_affinity = conductor_id
        node.save()

    # Stage 4: start executing from the first step.
    do_next_deploy_step(task, 0)
Esempio n. 11
0
    def process_next_step(self, task, step_type, **kwargs):
        """Start the next clean/deploy step if the previous one is complete.

        In order to avoid errors and make agent upgrades painless, the agent
        compares the version of all hardware managers at the start of the
        process (the agent's get_clean|deploy_steps() call) and before
        executing each step. If the version has changed between steps,
        the agent is unable to tell if an ordering change will cause an issue
        so it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we
        restart the entire cleaning cycle. For manual cleaning or deploy,
        we don't.

        Additionally, if a step includes the reboot_requested property
        set to True, this method will coordinate the reboot once the step is
        completed.
        """
        assert step_type in ('clean', 'deploy')

        node = task.node
        # For manual clean, the target provision state is MANAGEABLE, whereas
        # for automated cleaning, it is (the default) AVAILABLE.
        manual_clean = node.target_provision_state == states.MANAGEABLE
        agent_commands = self._client.get_commands_status(task.node)

        if not agent_commands:
            field = ('cleaning_reboot' if step_type == 'clean'
                     else 'deployment_reboot')
            if task.node.driver_internal_info.get(field):
                # Node finished a cleaning step that requested a reboot, and
                # this is the first heartbeat after booting. Continue cleaning.
                info = task.node.driver_internal_info
                info.pop(field, None)
                task.node.driver_internal_info = info
                task.node.save()
                manager_utils.notify_conductor_resume_operation(task,
                                                                step_type)
                return
            else:
                # Agent has no commands whatsoever
                return

        current_step = (node.clean_step if step_type == 'clean'
                        else node.deploy_step)
        command = _get_completed_command(task, agent_commands, step_type)
        LOG.debug('%(type)s command status for node %(node)s on step %(step)s:'
                  ' %(command)s', {'node': node.uuid,
                                   'step': current_step,
                                   'command': command,
                                   'type': step_type})

        if not command:
            # Agent command in progress
            return

        if command.get('command_status') == 'FAILED':
            msg = (_('Agent returned error for %(type)s step %(step)s on node '
                     '%(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': command.get('command_error'),
                    'step': current_step,
                    'type': step_type})
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
        elif command.get('command_status') in ('CLEAN_VERSION_MISMATCH',
                                               'DEPLOY_VERSION_MISMATCH'):
            # Cache the new clean steps (and 'hardware_manager_version')
            try:
                self.refresh_steps(task, step_type)
            except exception.NodeCleaningFailure as e:
                msg = (_('Could not continue cleaning on node '
                         '%(node)s: %(err)s.') %
                       {'node': node.uuid, 'err': e})
                LOG.exception(msg)
                return manager_utils.cleaning_error_handler(task, msg)
            except exception.InstanceDeployFailure as e:
                msg = (_('Could not continue deployment on node '
                         '%(node)s: %(err)s.') %
                       {'node': node.uuid, 'err': e})
                LOG.exception(msg)
                return manager_utils.deploying_error_handler(task, msg)

            if manual_clean:
                # Don't restart manual cleaning if agent reboots to a new
                # version. Both are operator actions, unlike automated
                # cleaning. Manual clean steps are not necessarily idempotent
                # like automated clean steps and can be even longer running.
                LOG.info('During manual cleaning, node %(node)s detected '
                         'a clean version mismatch. Re-executing and '
                         'continuing from current step %(step)s.',
                         {'node': node.uuid, 'step': node.clean_step})

                driver_internal_info = node.driver_internal_info
                driver_internal_info['skip_current_clean_step'] = False
                node.driver_internal_info = driver_internal_info
                node.save()
            else:
                # Restart the process, agent must have rebooted to new version
                LOG.info('During %(type)s, node %(node)s detected a '
                         '%(type)s version mismatch. Resetting %(type)s steps '
                         'and rebooting the node.',
                         {'type': step_type, 'node': node.uuid})
                try:
                    conductor_steps.set_node_cleaning_steps(task)
                except exception.NodeCleaningFailure as e:
                    msg = (_('Could not restart automated cleaning on node '
                             '%(node)s after step %(step)s: %(err)s.') %
                           {'node': node.uuid, 'err': e,
                            'step': node.clean_step})
                    LOG.exception(msg)
                    return manager_utils.cleaning_error_handler(task, msg)
                except exception.InstanceDeployFailure as e:
                    msg = (_('Could not restart deployment on node '
                             '%(node)s after step %(step)s: %(err)s.') %
                           {'node': node.uuid, 'err': e,
                            'step': node.deploy_step})
                    LOG.exception(msg)
                    return manager_utils.deploying_error_handler(task, msg)

            manager_utils.notify_conductor_resume_operation(task, step_type)

        elif command.get('command_status') == 'SUCCEEDED':
            step_hook = _get_post_step_hook(node, step_type)
            if step_hook is not None:
                LOG.debug('For node %(node)s, executing post %(type)s step '
                          'hook %(method)s for %(type)s step %(step)s',
                          {'method': step_hook.__name__,
                           'node': node.uuid,
                           'step': current_step,
                           'type': step_type})
                try:
                    step_hook(task, command)
                except Exception as e:
                    msg = (_('For node %(node)s, post %(type)s step hook '
                             '%(method)s failed for %(type)s step %(step)s.'
                             '%(cls)s: %(error)s') %
                           {'method': step_hook.__name__,
                            'node': node.uuid,
                            'error': e,
                            'cls': e.__class__.__name__,
                            'step': current_step,
                            'type': step_type})
                    LOG.exception(msg)
                    if step_type == 'clean':
                        return manager_utils.cleaning_error_handler(task, msg)
                    else:
                        return manager_utils.deploying_error_handler(task, msg)

            if current_step.get('reboot_requested'):
                _post_step_reboot(task, step_type)
                return

            LOG.info('Agent on node %(node)s returned %(type)s command '
                     'success, moving to next step',
                     {'node': node.uuid, 'type': step_type})
            manager_utils.notify_conductor_resume_operation(task, step_type)
        else:
            msg = (_('Agent returned unknown status for %(type)s step %(step)s'
                     ' on node %(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': command.get('command_status'),
                    'step': current_step,
                    'type': step_type})
            LOG.error(msg)
            if step_type == 'clean':
                return manager_utils.cleaning_error_handler(task, msg)
            else:
                return manager_utils.deploying_error_handler(task, msg)
Esempio n. 12
0
 def _set_failed(self, task, log_msg, error_msg):
     """Hand a failure over to the cleaning or deploying error handler.

     The handler is chosen from the node's ``clean_step``: when it is
     truthy the cleaning error handler is used, otherwise the deploying
     error handler is used.

     :param task: object whose ``node.clean_step`` is inspected; passed
         through to the selected handler as its first argument
     :param log_msg: forwarded unchanged as the handler's second argument
     :param error_msg: forwarded unchanged as the handler's third argument
     """
     # Pick the handler first, then make a single call with the same
     # positional arguments for both cases.
     on_failure = (manager_utils.cleaning_error_handler
                   if task.node.clean_step
                   else manager_utils.deploying_error_handler)
     on_failure(task, log_msg, error_msg)