Ejemplo n.º 1
0
def get_cluster_sbd_config(lib_env):
    """
    Collect the SBD configuration from the nodes of the cluster.

    Node names are taken from corosync.conf. The reports gathered while
    reading them are handed to the report processor before the nodes are
    queried with the GetSbdConfig command. Targets are requested with
    skip_non_existing=True.

    Documented structure of the returned data:
    [
        {
            "node": <NodeAddress>
            "config": <sbd_config_dict> or None if there was a failure,
        },
        ...
    ]
    If an error occurs while obtaining the config from a node, its config
    is None.
    NOTE(review): the previous documentation stated that an empty dictionary
    is returned when obtaining the config fails on all nodes, although the
    structure above is a list -- confirm against GetSbdConfig.

    lib_env -- LibraryEnvironment
    """
    corosync_conf = lib_env.get_corosync_conf()
    names, report_items = get_existing_nodes_names(corosync_conf)
    if not names:
        # A cluster without nodes is reported together with other findings.
        report_items.append(reports.corosync_config_no_nodes_defined())
    lib_env.report_processor.process_list(report_items)

    command = GetSbdConfig(lib_env.report_processor)
    targets = lib_env.get_node_target_factory().get_target_list(
        names, skip_non_existing=True
    )
    command.set_targets(targets)
    communicator = lib_env.get_node_communicator()
    return run_com(communicator, command)
Ejemplo n.º 2
0
def get_cluster_sbd_status(lib_env):
    """
    Query the status of the SBD service on the nodes of the cluster.

    Node names are taken from corosync.conf. If the report processor holds
    errors after receiving the gathered reports (including the "no nodes
    defined" error added here for an empty node list), LibraryError is
    raised. Targets are requested with skip_non_existing=True.

    Documented format of the returned dictionary:
    {
        <NodeAddress>: {
            "installed": <boolean>,
            "enabled": <boolean>,
            "running": <boolean>
        },
        ...
    }

    lib_env -- LibraryEnvironment
    """
    corosync_conf = lib_env.get_corosync_conf()
    names, report_items = get_existing_nodes_names(corosync_conf)
    if not names:
        no_nodes_message = reports.messages.CorosyncConfigNoNodesDefined()
        report_items.append(ReportItem.error(no_nodes_message))
    processor_state = lib_env.report_processor.report_list(report_items)
    if processor_state.has_errors:
        raise LibraryError()

    command = GetSbdStatus(lib_env.report_processor)
    targets = lib_env.get_node_target_factory().get_target_list(
        names, skip_non_existing=True
    )
    command.set_targets(targets)
    communicator = lib_env.get_node_communicator()
    return run_com(communicator, command)
Ejemplo n.º 3
0
def get_cluster_sbd_status(lib_env):
    """
    Query the status of the SBD service on the nodes of the cluster.

    Node names are taken from corosync.conf. The reports gathered while
    reading them are handed to the report processor before the nodes are
    queried with the GetSbdStatus command. Targets are requested with
    skip_non_existing=True.

    Documented format of the returned dictionary:
    {
        <NodeAddress>: {
            "installed": <boolean>,
            "enabled": <boolean>,
            "running": <boolean>
        },
        ...
    }

    lib_env -- LibraryEnvironment
    """
    corosync_conf = lib_env.get_corosync_conf()
    names, report_items = get_existing_nodes_names(corosync_conf)
    if not names:
        # A cluster without nodes is reported together with other findings.
        report_items.append(reports.corosync_config_no_nodes_defined())
    lib_env.report_processor.process_list(report_items)

    command = GetSbdStatus(lib_env.report_processor)
    targets = lib_env.get_node_target_factory().get_target_list(
        names, skip_non_existing=True
    )
    command.set_targets(targets)
    communicator = lib_env.get_node_communicator()
    return run_com(communicator, command)
Ejemplo n.º 4
0
def get_cluster_sbd_config(lib_env):
    """
    Collect the SBD configuration from the nodes of the cluster.

    Node names are taken from corosync.conf. The reports gathered while
    reading them are handed to the report processor before the nodes are
    queried with the GetSbdConfig command. Targets are requested with
    skip_non_existing=True.

    Documented structure of the returned data:
    [
        {
            "node": <NodeAddress>
            "config": <sbd_config_dict> or None if there was a failure,
        },
        ...
    ]
    If an error occurs while obtaining the config from a node, its config
    is None.
    NOTE(review): the previous documentation stated that an empty dictionary
    is returned when obtaining the config fails on all nodes, although the
    structure above is a list -- confirm against GetSbdConfig.

    lib_env -- LibraryEnvironment
    """
    existing_names, pending_reports = get_existing_nodes_names(
        lib_env.get_corosync_conf()
    )
    if not existing_names:
        no_nodes_report = reports.corosync_config_no_nodes_defined()
        pending_reports.append(no_nodes_report)
    lib_env.report_processor.process_list(pending_reports)

    sbd_config_cmd = GetSbdConfig(lib_env.report_processor)
    target_factory = lib_env.get_node_target_factory()
    sbd_config_cmd.set_targets(
        target_factory.get_target_list(existing_names, skip_non_existing=True)
    )
    return run_com(lib_env.get_node_communicator(), sbd_config_cmd)
Ejemplo n.º 5
0
def get_cluster_sbd_status(lib_env):
    """
    Query the status of the SBD service on the nodes of the cluster.

    The nodes to ask are provided by _get_cluster_nodes and are queried with
    the GetSbdStatus command.

    Documented format of the returned dictionary:
    {
        <NodeAddress>: {
            "installed": <boolean>,
            "enabled": <boolean>,
            "running": <boolean>
        },
        ...
    }

    lib_env -- LibraryEnvironment
    """
    status_cmd = GetSbdStatus(lib_env.report_processor)
    target_factory = lib_env.get_node_target_factory()
    cluster_nodes = _get_cluster_nodes(lib_env)
    status_cmd.set_targets(target_factory.get_target_list(cluster_nodes))
    communicator = lib_env.get_node_communicator()
    return run_com(communicator, status_cmd)
Ejemplo n.º 6
0
def get_cluster_sbd_config(lib_env):
    """
    Collect the SBD configuration from the nodes of the cluster.

    The nodes to ask are provided by _get_cluster_nodes and are queried with
    the GetSbdConfig command.

    Documented structure of the returned data:
    [
        {
            "node": <NodeAddress>
            "config": <sbd_config_dict> or None if there was a failure,
        },
        ...
    ]
    If an error occurs while obtaining the config from a node, its config
    is None.
    NOTE(review): the previous documentation stated that an empty dictionary
    is returned when obtaining the config fails on all nodes, although the
    structure above is a list -- confirm against GetSbdConfig.

    lib_env -- LibraryEnvironment
    """
    config_cmd = GetSbdConfig(lib_env.report_processor)
    target_factory = lib_env.get_node_target_factory()
    cluster_nodes = _get_cluster_nodes(lib_env)
    config_cmd.set_targets(target_factory.get_target_list(cluster_nodes))
    communicator = lib_env.get_node_communicator()
    return run_com(communicator, config_cmd)
Ejemplo n.º 7
0
def _wait_for_pacemaker_to_start(node_communicator,
                                 report_processor,
                                 target_list,
                                 timeout=None):
    """
    Poll the given targets until none is left to wait for or time runs out.

    Every 2 seconds a CheckPacemakerStarted command is run against the
    remaining targets; the value returned by run_com becomes the list of
    targets for the next round (presumably the targets on which pacemaker
    has not started yet -- confirm against CheckPacemakerStarted).

    node_communicator -- communicator used to run the check command
    report_processor -- receives the "wait started" report and is passed to
        every CheckPacemakerStarted command
    target_list -- targets to wait for
    timeout -- maximum number of seconds to wait, None means 15 minutes

    Returns a list of error reports. It is empty when the loop finished
    without a timeout and without any error collected from the commands;
    otherwise a general "wait for node startup error" report is appended.
    """
    timeout = 60 * 15 if timeout is None else timeout
    interval = 2
    # Use a monotonic clock for the deadline so that adjustments of the
    # system time (NTP, manual changes) can neither shorten nor prolong the
    # waiting.
    stop_at = time.monotonic() + timeout
    report_processor.process(
        reports.wait_for_node_startup_started(
            [target.label for target in target_list]))
    error_report_list = []
    while target_list:
        if time.monotonic() > stop_at:
            error_report_list.append(reports.wait_for_node_startup_timed_out())
            break
        # Sleep before each check, including the first one.
        time.sleep(interval)
        com_cmd = CheckPacemakerStarted(report_processor)
        com_cmd.set_targets(target_list)
        target_list = run_com(node_communicator, com_cmd)
        error_report_list.extend(com_cmd.error_list)

    if error_report_list:
        error_report_list.append(reports.wait_for_node_startup_error())
    return error_report_list
Ejemplo n.º 8
0
def _prepare_pacemaker_remote_environment(
    env,
    report_processor,
    existing_nodes_target_list,
    new_node_target,
    new_node_name,
    skip_offline_nodes,
    allow_incomplete_distribution,
    allow_fails,
):
    """
    Prepare a new node to run pacemaker_remote.

    Steps: find out whether the new node is online, check the host info it
    returns, distribute the pacemaker authkey and finally start and enable
    pacemaker_remote on the node. Raise LibraryError when the new node is
    offline and offline nodes are not skipped, or when the report processor
    holds errors after the host check or after obtaining the authkey.

    env -- LibraryEnvironment, provides the node communicator
    report_processor -- processor receiving all reports
    existing_nodes_target_list -- targets of current cluster nodes; they get
        the authkey only when a new key had to be generated
    new_node_target -- target of the new node, may be falsy if not known
    new_node_name -- name of the new node used in the "skipped" report
    skip_offline_nodes -- passed to GetOnlineTargets and DistributeFiles;
        if set, an offline new node does not abort the command
    allow_incomplete_distribution -- passed to DistributeFiles as allow_fails
    allow_fails -- passed to ServiceAction as allow_fails
    """
    # find out whether the new node is reachable
    reachable_targets = []
    if new_node_target:
        online_cmd = GetOnlineTargets(
            report_processor,
            ignore_offline_targets=skip_offline_nodes,
        )
        online_cmd.set_targets([new_node_target])
        reachable_targets = run_com(env.get_node_communicator(), online_cmd)
        if not (reachable_targets or skip_offline_nodes):
            raise LibraryError()

    # check new nodes
    if not reachable_targets:
        report_processor.report_list(
            _reports_skip_new_node(new_node_name, "unreachable"))
    else:
        host_info_cmd = GetHostInfo(report_processor)
        host_info_cmd.set_targets(reachable_targets)
        host_info = run_com(env.get_node_communicator(), host_info_cmd)
        report_processor.report_list(_host_check_remote_node(host_info))
        if report_processor.has_errors:
            raise LibraryError()

    # share pacemaker authkey
    key_file = FileInstance.for_pacemaker_key()
    try:
        if not key_file.raw_file.exists():
            # No key yet: create one and send it to all the nodes.
            key_data = generate_binary_key(
                random_bytes_count=settings.pacemaker_authkey_bytes)
            key_targets = existing_nodes_target_list + reachable_targets
        else:
            # Reuse the existing key, only the new node needs it.
            key_data = key_file.read_raw()
            key_targets = reachable_targets
    except RawFileError as e:
        report_processor.report(raw_file_error_report(e))
    if report_processor.has_errors:
        raise LibraryError()

    if key_targets:
        distribute_cmd = DistributeFiles(
            report_processor,
            node_communication_format.pcmk_authkey_file(key_data),
            skip_offline_targets=skip_offline_nodes,
            allow_fails=allow_incomplete_distribution,
        )
        distribute_cmd.set_targets(key_targets)
        run_and_raise(env.get_node_communicator(), distribute_cmd)

    # start and enable pacemaker_remote
    if not reachable_targets:
        return
    service_actions = node_communication_format.create_pcmk_remote_actions(
        ["start", "enable"])
    service_cmd = ServiceAction(
        report_processor,
        service_actions,
        allow_fails=allow_fails,
    )
    service_cmd.set_targets(reachable_targets)
    run_and_raise(env.get_node_communicator(), service_cmd)
Ejemplo n.º 9
0
def remove_nodes(env, node_list, force_quorum_loss=False, skip_offline=False):
    """
    Remove nodes from a cluster.
    Raise LibraryError when a validation fails.

    env LibraryEnvironment
    node_list iterable -- names of nodes to remove
    force_quorum_loss bool -- treat quorum loss as a warning if True
    skip_offline bool -- treat unreachable nodes as warnings if True
    """
    _ensure_live_env(env)  # raises if env is not live

    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    corosync_conf = env.get_corosync_conf()
    cluster_nodes_names = corosync_conf.get_nodes_names()

    # validations

    report_processor.report_list(
        config_validators.remove_nodes(
            node_list,
            corosync_conf.get_nodes(),
            corosync_conf.get_quorum_device_settings(),
        ))
    if report_processor.has_errors:
        # If there is an error, there is usually not much sense in doing other
        # validations:
        # - if there would be no node left in the cluster, it's pointless
        #   to check for quorum loss or if at least one remaining node is online
        # - if only one node is being removed and it doesn't exist, it's again
        #   pointless to check for other issues
        raise LibraryError()

    target_report_list, cluster_nodes_target_list = (
        target_factory.get_target_list_with_reports(
            cluster_nodes_names,
            skip_non_existing=skip_offline,
        ))
    # Cluster nodes for which no target could be created are "unknown"; we
    # cannot communicate with them at all.
    known_nodes = {target.label for target in cluster_nodes_target_list}
    unknown_nodes = {
        name for name in cluster_nodes_names if name not in known_nodes
    }
    report_processor.report_list(target_report_list)

    com_cmd = GetOnlineTargets(
        report_processor,
        ignore_offline_targets=skip_offline,
    )
    com_cmd.set_targets(cluster_nodes_target_list)
    online_target_list = run_com(env.get_node_communicator(), com_cmd)
    offline_target_list = [
        target for target in cluster_nodes_target_list
        if target not in online_target_list
    ]
    staying_online_target_list = [
        target for target in online_target_list
        if target.label not in node_list
    ]
    targets_to_remove = [
        target for target in cluster_nodes_target_list
        if target.label in node_list
    ]
    if not staying_online_target_list:
        report_processor.report(
            reports.unable_to_connect_to_any_remaining_node())
        # If no remaining node is online, there is no point in checking quorum
        # loss or anything as we would just get errors.
        raise LibraryError()

    if skip_offline:
        # Remaining nodes we cannot reach (offline or unknown) will not get
        # the new configuration, let the user know about them.
        staying_offline_nodes = ([
            target.label
            for target in offline_target_list if target.label not in node_list
        ] + [name for name in unknown_nodes if name not in node_list])
        if staying_offline_nodes:
            report_processor.report(
                reports.unable_to_connect_to_all_remaining_node(
                    staying_offline_nodes))

    # A negative number tells the check how many nodes are leaving.
    atb_has_to_be_enabled = sbd.atb_has_to_be_enabled(env.cmd_runner(),
                                                      corosync_conf,
                                                      -len(node_list))
    if atb_has_to_be_enabled:
        report_processor.report(
            reports.corosync_quorum_atb_will_be_enabled_due_to_sbd())
        com_cmd = CheckCorosyncOffline(
            report_processor,
            allow_skip_offline=False,
        )
        com_cmd.set_targets(staying_online_target_list)
        run_com(env.get_node_communicator(), com_cmd)
    else:
        # Check if removing the nodes would cause quorum loss. We ask the nodes
        # to be removed for their view of quorum. If they are all stopped or
        # not in a quorate partition, their removal cannot cause quorum loss.
        # That's why we ask them and not the remaining nodes.
        # example: 5-node cluster, 3 online nodes, removing one online node,
        # results in 4-node cluster with 2 online nodes => quorum lost
        # Check quorum loss only if ATB does not need to be enabled. If it is
        # required, cluster has to be turned off and therefore it loses quorum.
        forceable_report_creator = reports.get_problem_creator(
            report_codes.FORCE_QUORUM_LOSS, force_quorum_loss)
        com_cmd = cluster.GetQuorumStatus(report_processor)
        com_cmd.set_targets(targets_to_remove)
        failures, quorum_status = run_com(env.get_node_communicator(), com_cmd)
        if quorum_status:
            if quorum_status.stopping_nodes_cause_quorum_loss(node_list):
                report_processor.report(
                    forceable_report_creator(
                        reports.corosync_quorum_will_be_lost))
        elif failures or not targets_to_remove:
            # No quorum status is available and either asking for it failed
            # or there was no node to ask.
            report_processor.report(
                forceable_report_creator(
                    reports.corosync_quorum_loss_unable_to_check, ))

    if report_processor.has_errors:
        raise LibraryError()

    # validations done

    unknown_to_remove = [name for name in unknown_nodes if name in node_list]
    if unknown_to_remove:
        report_processor.report(
            reports.nodes_to_remove_unreachable(unknown_to_remove))
    if targets_to_remove:
        com_cmd = cluster.DestroyWarnOnFailure(report_processor)
        com_cmd.set_targets(targets_to_remove)
        run_and_raise(env.get_node_communicator(), com_cmd)

    corosync_conf.remove_nodes(node_list)
    if atb_has_to_be_enabled:
        corosync_conf.set_quorum_options(dict(auto_tie_breaker="1"))

    com_cmd = DistributeCorosyncConf(
        env.report_processor,
        corosync_conf.config.export(),
        allow_skip_offline=False,
    )
    com_cmd.set_targets(staying_online_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    com_cmd = ReloadCorosyncConf(env.report_processor)
    com_cmd.set_targets(staying_online_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # try to remove nodes from pcmk using crm_node -R <node> --force and if not
    # successful remove it directly from CIB file on all nodes in parallel
    com_cmd = RemoveNodesFromCib(env.report_processor, node_list)
    com_cmd.set_targets(staying_online_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)
Ejemplo n.º 10
0
def add_nodes(
    env,
    nodes,
    wait=False,
    start=False,
    enable=False,
    force=False,
    force_unresolvable=False,
    skip_offline_nodes=False,
    no_watchdog_validation=False,
):
    # pylint: disable=too-many-locals
    """
    Add specified nodes to the local cluster
    Raise LibraryError on any error.

    env LibraryEnvironment
    nodes list -- list of dicts which represents node.
        Supported keys are: name (required), addrs (list), devices (list),
        watchdog
    wait -- specifies if command should try to wait for cluster to start up.
        Has no effect if start is False. If set to False command will not wait
        for cluster to start. If None command will wait for some default
        timeout. If int wait set timeout to int value of seconds.
    start bool -- if True start cluster when it is set up
    enable bool -- if True enable cluster when it is set up
    force bool -- if True some validations errors are treated as warnings
    force_unresolvable bool -- if True not resolvable addresses of nodes are
        treated as warnings
    skip_offline_nodes bool -- if True non fatal connection failures to other
        hosts are treated as warnings
    no_watchdog_validation bool -- if True do not validate specified watchdogs
        on remote hosts
    """
    _ensure_live_env(env)  # raises if env is not live

    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    is_sbd_enabled = sbd.is_sbd_enabled(env.cmd_runner())
    corosync_conf = env.get_corosync_conf()
    cluster_nodes_names = corosync_conf.get_nodes_names()
    corosync_node_options = {"name", "addrs"}
    sbd_node_options = {"devices", "watchdog"}

    keys_to_normalize = {"addrs"}
    if is_sbd_enabled:
        keys_to_normalize |= sbd_node_options
    new_nodes = [_normalize_dict(node, keys_to_normalize) for node in nodes]

    # get targets for existing nodes
    target_report_list, cluster_nodes_target_list = (
        target_factory.get_target_list_with_reports(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        ))
    report_processor.report_list(target_report_list)
    # get a target for qnetd if needed
    qdevice_model, qdevice_model_options, _, _ = (
        corosync_conf.get_quorum_device_settings())
    if qdevice_model == "net":
        try:
            qnetd_target = target_factory.get_target(
                qdevice_model_options["host"])
        except HostNotFound:
            # qnetd_target stays undefined here; it is only used after the
            # validation phase, which is expected to stop on this report.
            report_processor.report(
                reports.host_not_found([qdevice_model_options["host"]]))

    # Get targets for new nodes and report unknown (== not-authorized) nodes.
    # If a node doesn't contain the 'name' key, validation of inputs reports it.
    # That means we don't report missing names but cannot rely on them being
    # present either.
    target_report_list, new_nodes_target_list = (
        target_factory.get_target_list_with_reports(
            [node["name"] for node in new_nodes if "name" in node],
            allow_skip=False,
        ))
    report_processor.report_list(target_report_list)

    # Set default values for not-specified node options.
    # Use an address defined in known-hosts for each node with no addresses
    # specified. This allows users not to specify node addresses at all which
    # simplifies the whole node add command / form significantly.
    new_nodes_target_dict = {
        target.label: target
        for target in new_nodes_target_list
    }
    addrs_defaulter = _get_addrs_defaulter(report_processor,
                                           new_nodes_target_dict)
    new_nodes_defaulters = {"addrs": addrs_defaulter}
    if is_sbd_enabled:
        watchdog_defaulter = _get_watchdog_defaulter(report_processor,
                                                     new_nodes_target_dict)
        new_nodes_defaulters["devices"] = lambda _: []
        new_nodes_defaulters["watchdog"] = watchdog_defaulter
    new_nodes = [
        _set_defaults_in_dict(node, new_nodes_defaulters) for node in new_nodes
    ]
    new_nodes_dict = {
        node["name"]: node
        for node in new_nodes if "name" in node
    }

    # Validate inputs - node options names
    # We do not want to make corosync validators know about SBD options and
    # vice versa. Therefore corosync and SBD validators get only valid corosync
    # and SBD options respectively, and we need to check for any surplus
    # options here.
    report_processor.report_list(
        validate_names_in(
            corosync_node_options | sbd_node_options,
            {option for node in new_nodes for option in node.keys()},
            option_type="node"))

    # Validate inputs - corosync part
    try:
        cib = env.get_cib()
        cib_nodes = get_remote_nodes(cib) + get_guest_nodes(cib)
    except LibraryError:
        cib_nodes = []
        report_processor.report(
            reports.get_problem_creator(
                report_codes.FORCE_LOAD_NODES_FROM_CIB,
                force)(reports.cib_load_error_get_nodes_for_validation))
    # corosync validator rejects non-corosync keys
    new_nodes_corosync = [{
        key: node[key]
        for key in corosync_node_options if key in node
    } for node in new_nodes]
    report_processor.report_list(
        config_validators.add_nodes(new_nodes_corosync,
                                    corosync_conf.get_nodes(),
                                    cib_nodes,
                                    force_unresolvable=force_unresolvable))

    # Validate inputs - SBD part
    if is_sbd_enabled:
        report_processor.report_list(
            sbd.validate_new_nodes_devices({
                node["name"]: node["devices"]
                for node in new_nodes if "name" in node
            }))
    else:
        for node in new_nodes:
            sbd_options = sbd_node_options.intersection(node.keys())
            if sbd_options and "name" in node:
                report_processor.report(
                    reports.sbd_not_used_cannot_set_sbd_options(
                        sbd_options, node["name"]))

    # Validate inputs - flags part
    wait_timeout = _get_validated_wait_timeout(report_processor, wait, start)

    # Get online cluster nodes
    # This is the only call in which we accept skip_offline_nodes option for the
    # cluster nodes. In all the other actions we communicate only with the
    # online nodes. This allows us to simplify code as any communication issue
    # is considered an error, ends the command processing and is not possible
    # to skip it by skip_offline_nodes. We do not have to care about a situation
    # when a communication command cannot connect to some nodes and then the
    # next command can connect but fails due to the previous one did not
    # succeed.
    online_cluster_target_list = []
    if cluster_nodes_target_list:
        com_cmd = GetOnlineTargets(
            report_processor,
            ignore_offline_targets=skip_offline_nodes,
        )
        com_cmd.set_targets(cluster_nodes_target_list)
        online_cluster_target_list = run_com(env.get_node_communicator(),
                                             com_cmd)
        offline_cluster_target_list = [
            target for target in cluster_nodes_target_list
            if target not in online_cluster_target_list
        ]
        if not online_cluster_target_list:
            report_processor.report(
                reports.unable_to_perform_operation_on_any_node())
        elif offline_cluster_target_list and skip_offline_nodes:
            # TODO: report (warn) how to fix offline nodes when they come online
            # report_processor.report(None)
            pass

    # Validate existing cluster nodes status
    atb_has_to_be_enabled = sbd.atb_has_to_be_enabled(env.cmd_runner(),
                                                      corosync_conf,
                                                      len(new_nodes))
    if atb_has_to_be_enabled:
        report_processor.report(
            reports.corosync_quorum_atb_will_be_enabled_due_to_sbd())
        if online_cluster_target_list:
            com_cmd = CheckCorosyncOffline(
                report_processor,
                allow_skip_offline=False,
            )
            com_cmd.set_targets(online_cluster_target_list)
            run_com(env.get_node_communicator(), com_cmd)

    # Validate new nodes. All new nodes have to be online.
    com_cmd = GetHostInfo(report_processor)
    com_cmd.set_targets(new_nodes_target_list)
    report_processor.report_list(
        _host_check_cluster_setup(
            run_com(env.get_node_communicator(), com_cmd),
            force,
            # version of services may not be the same across the existing
            # cluster nodes, so it's not easy to make this check properly
            check_services_versions=False,
        ))

    # Validate SBD on new nodes
    if is_sbd_enabled:
        if no_watchdog_validation:
            report_processor.report(reports.sbd_watchdog_validation_inactive())
        com_cmd = CheckSbd(report_processor)
        for new_node_target in new_nodes_target_list:
            new_node = new_nodes_dict[new_node_target.label]
            # Do not send watchdog if validation is turned off. Listing of
            # available watchdogs in pcsd may restart the machine in some
            # corner cases.
            com_cmd.add_request(
                new_node_target,
                watchdog=""
                if no_watchdog_validation else new_node["watchdog"],
                device_list=new_node["devices"],
            )
        run_com(env.get_node_communicator(), com_cmd)

    if report_processor.has_errors:
        raise LibraryError()

    # Validation done. If errors occurred, an exception has been raised and we
    # don't get below this line.

    # First set up everything else than corosync. Once the new nodes are present
    # in corosync.conf, they're considered part of a cluster and the node add
    # command cannot be run again. So we need to minimize the amount of actions
    # (and therefore possible failures) after adding the nodes to corosync.

    # distribute auth tokens of all cluster nodes (including the new ones) to
    # all new nodes
    com_cmd = UpdateKnownHosts(
        env.report_processor,
        known_hosts_to_add=env.get_known_hosts(cluster_nodes_names +
                                               list(new_nodes_dict.keys())),
        known_hosts_to_remove=[],
    )
    com_cmd.set_targets(new_nodes_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # qdevice setup
    if qdevice_model == "net":
        qdevice_net.set_up_client_certificates(
            env.cmd_runner(),
            env.report_processor,
            env.communicator_factory,
            qnetd_target,
            corosync_conf.get_cluster_name(),
            new_nodes_target_list,
            # we don't want to allow skipping offline nodes which are being
            # added, otherwise qdevice will not work properly
            skip_offline_nodes=False,
            allow_skip_offline=False)

    # sbd setup
    if is_sbd_enabled:
        sbd_cfg = environment_file_to_dict(sbd.get_local_sbd_config())

        com_cmd = SetSbdConfig(env.report_processor)
        for new_node_target in new_nodes_target_list:
            new_node = new_nodes_dict[new_node_target.label]
            com_cmd.add_request(
                new_node_target,
                sbd.create_sbd_config(
                    sbd_cfg,
                    new_node["name"],
                    watchdog=new_node["watchdog"],
                    device_list=new_node["devices"],
                ))
        run_and_raise(env.get_node_communicator(), com_cmd)

        com_cmd = EnableSbdService(env.report_processor)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)
    else:
        com_cmd = DisableSbdService(env.report_processor)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)

    # booth setup
    booth_sync.send_all_config_to_node(
        env.get_node_communicator(),
        env.report_processor,
        new_nodes_target_list,
        rewrite_existing=force,
        skip_wrong_config=force,
    )

    # distribute corosync and pacemaker authkeys
    # Each file is read inside a "with" block so that its handle is closed
    # deterministically even if reading fails.
    files_action = {}
    forceable_io_error_creator = reports.get_problem_creator(
        report_codes.SKIP_FILE_DISTRIBUTION_ERRORS, force)
    if os.path.isfile(settings.corosync_authkey_file):
        try:
            with open(settings.corosync_authkey_file, "rb") as f:
                corosync_authkey = f.read()
            files_action.update(
                node_communication_format.corosync_authkey_file(
                    corosync_authkey))
        except EnvironmentError as e:
            report_processor.report(
                forceable_io_error_creator(
                    reports.file_io_error,
                    env_file_role_codes.COROSYNC_AUTHKEY,
                    file_path=settings.corosync_authkey_file,
                    operation="read",
                    reason=format_environment_error(e)))

    if os.path.isfile(settings.pacemaker_authkey_file):
        try:
            with open(settings.pacemaker_authkey_file, "rb") as f:
                pacemaker_authkey = f.read()
            files_action.update(
                node_communication_format.pcmk_authkey_file(
                    pacemaker_authkey))
        except EnvironmentError as e:
            report_processor.report(
                forceable_io_error_creator(
                    reports.file_io_error,
                    env_file_role_codes.PACEMAKER_AUTHKEY,
                    file_path=settings.pacemaker_authkey_file,
                    operation="read",
                    reason=format_environment_error(e)))

    # pcs_settings.conf was previously synced using pcsdcli send_local_configs.
    # This has been changed temporarily until new system for distribution and
    # synchronization of configs will be introduced.
    if os.path.isfile(settings.pcsd_settings_conf_location):
        try:
            with open(settings.pcsd_settings_conf_location, "r") as f:
                pcs_settings_conf = f.read()
            files_action.update(
                node_communication_format.pcs_settings_conf_file(
                    pcs_settings_conf))
        except EnvironmentError as e:
            report_processor.report(
                forceable_io_error_creator(
                    reports.file_io_error,
                    env_file_role_codes.PCS_SETTINGS_CONF,
                    file_path=settings.pcsd_settings_conf_location,
                    operation="read",
                    reason=format_environment_error(e)))

    # stop here if one of the files could not be loaded and it was not forced
    if report_processor.has_errors:
        raise LibraryError()

    if files_action:
        com_cmd = DistributeFilesWithoutForces(env.report_processor,
                                               files_action)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)

    # Distribute and reload pcsd SSL certificate
    report_processor.report(
        reports.pcsd_ssl_cert_and_key_distribution_started(
            [target.label for target in new_nodes_target_list]))

    try:
        with open(settings.pcsd_cert_location, "r") as f:
            ssl_cert = f.read()
    except EnvironmentError as e:
        report_processor.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_CERT,
                file_path=settings.pcsd_cert_location,
                reason=format_environment_error(e),
                operation="read",
            ))
    try:
        with open(settings.pcsd_key_location, "r") as f:
            ssl_key = f.read()
    except EnvironmentError as e:
        report_processor.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_KEY,
                file_path=settings.pcsd_key_location,
                reason=format_environment_error(e),
                operation="read",
            ))
    if report_processor.has_errors:
        raise LibraryError()

    com_cmd = SendPcsdSslCertAndKey(env.report_processor, ssl_cert, ssl_key)
    com_cmd.set_targets(new_nodes_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # When corosync >= 2 is in use, the procedure for adding a node is:
    # 1. add the new node to corosync.conf on all existing nodes
    # 2. reload corosync.conf before the new node is started
    # 3. start the new node
    # If done otherwise, membership gets broken and qdevice hangs. Cluster
    # will recover after a minute or so but still it's a wrong way.

    corosync_conf.add_nodes(new_nodes_corosync)
    if atb_has_to_be_enabled:
        corosync_conf.set_quorum_options(dict(auto_tie_breaker="1"))

    com_cmd = DistributeCorosyncConf(
        env.report_processor,
        corosync_conf.config.export(),
        allow_skip_offline=False,
    )
    com_cmd.set_targets(online_cluster_target_list + new_nodes_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    com_cmd = ReloadCorosyncConf(env.report_processor)
    com_cmd.set_targets(online_cluster_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # Optionally enable and start cluster services.
    if enable:
        com_cmd = EnableCluster(env.report_processor)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)
    if start:
        _start_cluster(
            env.communicator_factory,
            env.report_processor,
            new_nodes_target_list,
            wait_timeout=wait_timeout,
        )
Ejemplo n.º 11
0
def setup(env,
          cluster_name,
          nodes,
          transport_type=None,
          transport_options=None,
          link_list=None,
          compression_options=None,
          crypto_options=None,
          totem_options=None,
          quorum_options=None,
          wait=False,
          start=False,
          enable=False,
          force=False,
          force_unresolvable=False):
    """
    Set up cluster on specified nodes.
    Validation of the inputs is done here. Possible existing clusters are
    destroyed (when using force). Authkey files for corosync and pacemaker,
    known hosts and a newly generated corosync.conf are distributed to all
    nodes.
    Raise LibraryError on any error.

    The function works in two phases. First, all inputs and all target nodes
    are validated and every problem found is reported; if any error was
    reported, LibraryError is raised before anything is changed on the nodes.
    Second, the nodes are modified step by step; each step raises on failure.

    env LibraryEnvironment
    cluster_name string -- name of a cluster to set up
    nodes list -- list of dicts which represents node.
        Supported keys are: name (required), addrs
    transport_type string -- transport type of a cluster
    transport_options dict -- transport specific options
    link_list list of dict -- list of links, depends on transport_type
    compression_options dict -- only available for transport_type == 'knet'. In
        corosync.conf they are prefixed 'knet_compression_'
    crypto_options dict -- only available for transport_type == 'knet'. In
        corosync.conf they are prefixed 'crypto_'
    totem_options dict -- options of section 'totem' in corosync.conf
    quorum_options dict -- options of section 'quorum' in corosync.conf
    wait -- specifies if command should try to wait for cluster to start up.
        Has no effect if start is False. If set to False command will not wait
        for cluster to start. If None command will wait for some default
        timeout. If int wait set timeout to int value of seconds.
    start bool -- if True start cluster when it is set up
    enable bool -- if True enable cluster when it is set up
    force bool -- if True some validations errors are treated as warnings
    force_unresolvable bool -- if True not resolvable addresses of nodes are
        treated as warnings
    """
    _ensure_live_env(env)  # raises if env is not live

    # Replace omitted (None / empty) option containers with fresh empty ones
    # so the code below can use them without checking for None.
    transport_options = transport_options or {}
    link_list = link_list or []
    compression_options = compression_options or {}
    crypto_options = crypto_options or {}
    totem_options = totem_options or {}
    quorum_options = quorum_options or {}
    nodes = [_normalize_dict(node, {"addrs"}) for node in nodes]

    # Validation reports are collected through this wrapper so that
    # 'has_errors' can be checked once, after all validations have run.
    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()

    # Get targets for all nodes and report unknown (== not-authorized) nodes.
    # If a node doesn't contain the 'name' key, validation of inputs reports it.
    # That means we don't report missing names but cannot rely on them being
    # present either.
    target_report_list, target_list = (
        target_factory.get_target_list_with_reports(
            [node["name"] for node in nodes if "name" in node],
            allow_skip=False,
        ))
    report_processor.report_list(target_report_list)

    # Use an address defined in known-hosts for each node with no addresses
    # specified. This allows users not to specify node addresses at all which
    # simplifies the whole cluster setup command / form significantly.
    addrs_defaulter = _get_addrs_defaulter(
        report_processor, {target.label: target
                           for target in target_list})
    nodes = [
        _set_defaults_in_dict(node, {"addrs": addrs_defaulter})
        for node in nodes
    ]

    # Validate inputs.
    report_processor.report_list(
        config_validators.create(cluster_name,
                                 nodes,
                                 transport_type,
                                 force_unresolvable=force_unresolvable))
    if transport_type in corosync_constants.TRANSPORTS_KNET:
        # The link list is validated against the highest number of addresses
        # defined for any single node (0 when there are no nodes).
        max_link_number = max([len(node["addrs"]) for node in nodes],
                              default=0)
        report_processor.report_list(
            config_validators.create_transport_knet(
                transport_options, compression_options, crypto_options) +
            config_validators.create_link_list_knet(link_list, max_link_number)
        )
    elif transport_type in corosync_constants.TRANSPORTS_UDP:
        report_processor.report_list(
            config_validators.create_transport_udp(
                transport_options, compression_options, crypto_options) +
            config_validators.create_link_list_udp(link_list))
    report_processor.report_list(
        config_validators.create_totem(totem_options) +
        # We are creating the config and we know there is no qdevice in it.
        config_validators.create_quorum_options(quorum_options, False))

    # Validate flags
    wait_timeout = _get_validated_wait_timeout(report_processor, wait, start)

    # Validate the nodes
    com_cmd = GetHostInfo(report_processor)
    com_cmd.set_targets(target_list)
    report_processor.report_list(
        _host_check_cluster_setup(
            run_com(env.get_node_communicator(), com_cmd), force))

    # Single exit point for all validation errors collected above.
    if report_processor.has_errors:
        raise LibraryError()

    # Validation done. If errors occurred, an exception has been raised and we
    # don't get below this line.

    # Destroy cluster on all nodes.
    com_cmd = cluster.Destroy(env.report_processor)
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # Distribute auth tokens.
    com_cmd = UpdateKnownHosts(
        env.report_processor,
        known_hosts_to_add=env.get_known_hosts(
            [target.label for target in target_list]),
        known_hosts_to_remove=[],
    )
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # Distribute configuration files except corosync.conf. Sending
    # corosync.conf serves as a "commit" as its presence on a node marks the
    # node as a part of a cluster.
    corosync_authkey = generate_binary_key(random_bytes_count=128)
    pcmk_authkey = generate_binary_key(random_bytes_count=128)
    actions = {}
    actions.update(
        node_communication_format.corosync_authkey_file(corosync_authkey))
    actions.update(node_communication_format.pcmk_authkey_file(pcmk_authkey))
    com_cmd = DistributeFilesWithoutForces(env.report_processor, actions)
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)
    # TODO This should be in the previous call but so far we don't have a call
    # which allows to save and delete files at the same time.
    com_cmd = RemoveFilesWithoutForces(
        env.report_processor,
        {"pcsd settings": {
            "type": "pcsd_settings"
        }},
    )
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # Distribute and reload pcsd SSL certificate
    report_processor.report(
        reports.pcsd_ssl_cert_and_key_distribution_started(
            [target.label for target in target_list]))
    # One key / certificate pair is generated and sent to every node. The
    # label of the first target is passed to the certificate generator
    # (presumably used as the certificate's subject name -- TODO confirm).
    ssl_key_raw = ssl.generate_key()
    ssl_key = ssl.dump_key(ssl_key_raw)
    ssl_cert = ssl.dump_cert(
        ssl.generate_cert(ssl_key_raw, target_list[0].label))
    com_cmd = SendPcsdSslCertAndKey(env.report_processor, ssl_cert, ssl_key)
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # Create and distribute corosync.conf. Once a node saves corosync.conf it
    # is considered to be in a cluster.
    corosync_conf = config_facade.ConfigFacade.create(cluster_name, nodes,
                                                      transport_type)
    corosync_conf.set_totem_options(totem_options)
    corosync_conf.set_quorum_options(quorum_options)
    corosync_conf.create_link_list(link_list)
    if transport_type in corosync_constants.TRANSPORTS_KNET:
        corosync_conf.set_transport_knet_options(transport_options,
                                                 compression_options,
                                                 crypto_options)
    elif transport_type in corosync_constants.TRANSPORTS_UDP:
        corosync_conf.set_transport_udp_options(transport_options)

    com_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.corosync_conf_file(
            corosync_conf.config.export()),
    )
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    env.report_processor.process(reports.cluster_setup_success())

    # Optionally enable and start cluster services.
    if enable:
        com_cmd = EnableCluster(env.report_processor)
        com_cmd.set_targets(target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)
    if start:
        _start_cluster(
            env.communicator_factory,
            env.report_processor,
            target_list,
            wait_timeout=wait_timeout,
        )
Ejemplo n.º 12
0
def _prepare_pacemaker_remote_environment(
    env, report_processor, existing_nodes_target_list, new_node_target,
    new_node_name, skip_offline_nodes, allow_incomplete_distribution,
    allow_fails
):
    """
    Prepare a new node for running pacemaker_remote.

    The steps are, in order: find out whether the new node is online, check
    the new node (or report that it is skipped as unreachable), distribute the
    pacemaker authkey, and start and enable pacemaker_remote on the new node.
    Raise LibraryError when the new node is offline and offline nodes are not
    to be skipped, or when checking the new node reports errors.

    env -- provides node communicators and access to the pacemaker authkey
    report_processor -- receives reports; its has_errors is checked here
    existing_nodes_target_list list -- targets of nodes already in the
        cluster; they receive the authkey only when a new one is generated
    new_node_target -- target of the new node, falsy if there is none
    new_node_name -- name used when reporting the new node as skipped
    skip_offline_nodes -- passed to the online check and to the authkey
        distribution; if truthy, an offline new node does not raise
    allow_incomplete_distribution -- passed as allow_fails to the authkey
        distribution
    allow_fails -- passed as allow_fails to the service start / enable action
    """
    # Find out whether the new node is online; no target means nothing is.
    reachable_new_targets = []
    if new_node_target:
        online_probe = GetOnlineTargets(
            report_processor,
            ignore_offline_targets=skip_offline_nodes,
        )
        online_probe.set_targets([new_node_target])
        reachable_new_targets = run_com(
            env.get_node_communicator(), online_probe
        )
        if not (reachable_new_targets or skip_offline_nodes):
            raise LibraryError()

    # Check the new node, or report that it is being skipped.
    if not reachable_new_targets:
        report_processor.report_list(
            _reports_skip_new_node(new_node_name, "unreachable")
        )
    else:
        host_info_cmd = GetHostInfo(report_processor)
        host_info_cmd.set_targets(reachable_new_targets)
        host_info = run_com(env.get_node_communicator(), host_info_cmd)
        report_processor.report_list(_host_check_remote_node(host_info))
        if report_processor.has_errors:
            raise LibraryError()

    # Share the pacemaker authkey. An existing key goes to the new node only;
    # a freshly generated one has to reach the existing nodes as well.
    if not env.pacemaker.has_authkey:
        authkey_content = generate_binary_key(
            random_bytes_count=settings.pacemaker_authkey_bytes
        )
        authkey_targets = existing_nodes_target_list + reachable_new_targets
    else:
        authkey_content = env.pacemaker.get_authkey_content()
        authkey_targets = reachable_new_targets
    if authkey_targets:
        authkey_file = node_communication_format.pcmk_authkey_file(
            authkey_content
        )
        distribute_cmd = DistributeFiles(
            report_processor,
            authkey_file,
            skip_offline_targets=skip_offline_nodes,
            allow_fails=allow_incomplete_distribution,
        )
        distribute_cmd.set_targets(authkey_targets)
        run_and_raise(env.get_node_communicator(), distribute_cmd)

    # Start and enable pacemaker_remote on the new node.
    if reachable_new_targets:
        service_actions = node_communication_format.create_pcmk_remote_actions(
            ["start", "enable"]
        )
        service_cmd = ServiceAction(
            report_processor,
            service_actions,
            allow_fails=allow_fails,
        )
        service_cmd.set_targets(reachable_new_targets)
        run_and_raise(env.get_node_communicator(), service_cmd)