Example #1
0
def do_node_clean_abort(task, step_name=None):
    """Internal method to abort an ongoing operation.

    :param task: a TaskManager instance with an exclusive lock
    :param step_name: The name of the clean step.
    """
    node = task.node
    try:
        task.driver.deploy.tear_down_cleaning(task)
    except Exception as e:
        LOG.exception('Failed to tear down cleaning for node %(node)s '
                      'after aborting the operation. Error: %(err)s',
                      {'node': node.uuid, 'err': e})
        error_msg = _('Failed to tear down cleaning after aborting '
                      'the operation')
        utils.cleaning_error_handler(task, error_msg,
                                     tear_down_cleaning=False,
                                     set_fail_state=False)
        return

    info_message = _('Clean operation aborted for node %s') % node.uuid
    last_error = _('By request, the clean operation was aborted')
    if step_name:
        msg = _(' after the completion of step "%s"') % step_name
        last_error += msg
        info_message += msg

    node.last_error = last_error
    node.clean_step = None
    utils.wipe_cleaning_internal_info(task)
    node.save()
    LOG.info(info_message)
Example #2
0
def do_next_clean_step(task, step_index):
    """Do cleaning, starting from the specified clean step.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first clean step in the list to execute. This
        is the index (from 0) into the list of clean steps in the node's
        driver_internal_info['clean_steps']. Is None if there are no steps
        to execute.
    """
    node = task.node
    # For manual cleaning, the target provision state is MANAGEABLE,
    # whereas for automated cleaning, it is AVAILABLE.
    manual_clean = node.target_provision_state == states.MANAGEABLE

    if step_index is None:
        steps = []
    else:
        steps = node.driver_internal_info['clean_steps'][step_index:]

    LOG.info(
        'Executing %(state)s on node %(node)s, remaining steps: '
        '%(steps)s', {
            'node': node.uuid,
            'steps': steps,
            'state': node.provision_state
        })

    # Execute each step until we hit an async step or run out of steps
    for ind, step in enumerate(steps):
        # Save which step we're about to start so we can restart
        # if necessary
        node.clean_step = step
        driver_internal_info = node.driver_internal_info
        driver_internal_info['clean_step_index'] = step_index + ind
        node.driver_internal_info = driver_internal_info
        node.save()
        interface = getattr(task.driver, step.get('interface'))
        LOG.info('Executing %(step)s on node %(node)s', {
            'step': step,
            'node': node.uuid
        })
        try:
            result = interface.execute_clean_step(task, step)
        except Exception as e:
            if isinstance(e, exception.AgentConnectionFailed):
                if task.node.driver_internal_info.get('cleaning_reboot'):
                    LOG.info(
                        'Agent is not yet running on node %(node)s '
                        'after cleaning reboot, waiting for agent to '
                        'come up to run next clean step %(step)s.', {
                            'node': node.uuid,
                            'step': step
                        })
                    driver_internal_info['skip_current_clean_step'] = False
                    node.driver_internal_info = driver_internal_info
                    target_state = (states.MANAGEABLE
                                    if manual_clean else None)
                    task.process_event('wait', target_state=target_state)
                    return

            msg = (_('Node %(node)s failed step %(step)s: '
                     '%(exc)s') % {
                         'node': node.uuid,
                         'exc': e,
                         'step': node.clean_step
                     })
            LOG.exception(msg)
            driver_utils.collect_ramdisk_logs(task.node, label='cleaning')
            utils.cleaning_error_handler(task, msg)
            return

        # Check if the step is done or not. The step should return
        # states.CLEANWAIT if the step is still being executed, or
        # None if the step is done.
        if result == states.CLEANWAIT:
            # Kill this worker, the async step will make an RPC call to
            # continue_node_clean to continue cleaning
            LOG.info(
                'Clean step %(step)s on node %(node)s being '
                'executed asynchronously, waiting for driver.', {
                    'node': node.uuid,
                    'step': step
                })
            target_state = states.MANAGEABLE if manual_clean else None
            task.process_event('wait', target_state=target_state)
            return
        elif result is not None:
            msg = (_('While executing step %(step)s on node '
                     '%(node)s, step returned invalid value: %(val)s') % {
                         'step': step,
                         'node': node.uuid,
                         'val': result
                     })
            LOG.error(msg)
            return utils.cleaning_error_handler(task, msg)
        LOG.info('Node %(node)s finished clean step %(step)s', {
            'node': node.uuid,
            'step': step
        })

    if CONF.agent.deploy_logs_collect == 'always':
        driver_utils.collect_ramdisk_logs(task.node, label='cleaning')

    # Clear clean_step
    node.clean_step = None
    utils.wipe_cleaning_internal_info(task)
    node.save()
    try:
        task.driver.deploy.tear_down_cleaning(task)
    except Exception as e:
        msg = (_('Failed to tear down from cleaning for node %(node)s, '
                 'reason: %(err)s') % {
                     'node': node.uuid,
                     'err': e
                 })
        LOG.exception(msg)
        return utils.cleaning_error_handler(task,
                                            msg,
                                            tear_down_cleaning=False)

    LOG.info('Node %s cleaning complete', node.uuid)
    event = 'manage' if manual_clean or node.retired else 'done'
    # NOTE(rloo): No need to specify target prov. state; we're done
    task.process_event(event)
Example #3
0
def do_node_clean(task, clean_steps=None, disable_ramdisk=False):
    """Internal RPC method to perform cleaning of a node.

    :param task: a TaskManager instance with an exclusive lock on its node
    :param clean_steps: For a manual clean, the list of clean steps to
                        perform. Is None For automated cleaning (default).
                        For more information, see the clean_steps parameter
                        of :func:`ConductorManager.do_node_clean`.
    :param disable_ramdisk: Whether to skip booting ramdisk for cleaning.
    """
    node = task.node
    manual_clean = clean_steps is not None
    clean_type = 'manual' if manual_clean else 'automated'
    LOG.debug('Starting %(type)s cleaning for node %(node)s', {
        'type': clean_type,
        'node': node.uuid
    })

    if not manual_clean and utils.skip_automated_cleaning(node):
        # Skip cleaning, move to AVAILABLE.
        node.clean_step = None
        node.save()

        task.process_event('done')
        LOG.info(
            'Automated cleaning is disabled, node %s has been '
            'successfully moved to AVAILABLE state.', node.uuid)
        return

    # NOTE(dtantsur): this is only reachable during automated cleaning,
    # for manual cleaning we verify maintenance mode earlier on.
    if (not CONF.conductor.allow_provisioning_in_maintenance
            and node.maintenance):
        msg = _('Cleaning a node in maintenance mode is not allowed')
        return utils.cleaning_error_handler(task,
                                            msg,
                                            tear_down_cleaning=False)

    try:
        # NOTE(ghe): Valid power and network values are needed to perform
        # a cleaning.
        task.driver.power.validate(task)
        if not disable_ramdisk:
            task.driver.network.validate(task)
    except exception.InvalidParameterValue as e:
        msg = (_('Validation failed. Cannot clean node %(node)s. '
                 'Error: %(msg)s') % {
                     'node': node.uuid,
                     'msg': e
                 })
        return utils.cleaning_error_handler(task, msg)

    utils.wipe_cleaning_internal_info(task)
    if manual_clean:
        info = node.driver_internal_info
        info['clean_steps'] = clean_steps
        info['cleaning_disable_ramdisk'] = disable_ramdisk
        node.driver_internal_info = info
    task.node.save()

    # Retrieve BIOS config settings for this node
    utils.node_cache_bios_settings(task, node)

    # Allow the deploy driver to set up the ramdisk again (necessary for
    # IPA cleaning)
    try:
        if not disable_ramdisk:
            prepare_result = task.driver.deploy.prepare_cleaning(task)
        else:
            LOG.info(
                'Skipping preparing for in-band cleaning since '
                'out-of-band only cleaning has been requested for node '
                '%s', node.uuid)
            prepare_result = None
    except Exception as e:
        msg = (_('Failed to prepare node %(node)s for cleaning: %(e)s') % {
            'node': node.uuid,
            'e': e
        })
        return utils.cleaning_error_handler(task, msg, traceback=True)

    if prepare_result == states.CLEANWAIT:
        # Prepare is asynchronous, the deploy driver will need to
        # set node.driver_internal_info['clean_steps'] and
        # node.clean_step and then make an RPC call to
        # continue_node_clean to start cleaning.

        # For manual cleaning, the target provision state is MANAGEABLE,
        # whereas for automated cleaning, it is AVAILABLE (the default).
        target_state = states.MANAGEABLE if manual_clean else None
        task.process_event('wait', target_state=target_state)
        return

    try:
        conductor_steps.set_node_cleaning_steps(
            task, disable_ramdisk=disable_ramdisk)
    except (exception.InvalidParameterValue,
            exception.NodeCleaningFailure) as e:
        msg = (_('Cannot clean node %(node)s. Error: %(msg)s') % {
            'node': node.uuid,
            'msg': e
        })
        return utils.cleaning_error_handler(task, msg)

    steps = node.driver_internal_info.get('clean_steps', [])
    step_index = 0 if steps else None
    do_next_clean_step(task, step_index, disable_ramdisk=disable_ramdisk)