def config_sync(env, skip_offline_nodes=False):
    """
    Send specified local booth configuration to all nodes in the cluster.

    env -- LibraryEnvironment
    skip_offline_nodes -- if True, offline nodes will be skipped
    """
    config = env.booth.get_config_content()
    authfile_path = config_structure.get_authfile(parse(config))
    authfile_content = config_files.read_authfile(
        env.report_processor, authfile_path
    )

    cluster_nodes_names, report_list = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    if not cluster_nodes_names:
        report_list.append(reports.corosync_config_no_nodes_defined())
    env.report_processor.process_list(report_list)

    com_cmd = BoothSendConfig(
        env.report_processor,
        env.booth.name,
        config,
        authfile=authfile_path,
        authfile_data=authfile_content,
        skip_offline_targets=skip_offline_nodes,
    )
    com_cmd.set_targets(
        env.get_node_target_factory().get_target_list(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )
    )
    run_and_raise(env.get_node_communicator(), com_cmd)

def get_cluster_sbd_config(lib_env):
    """
    Return a list of SBD configs from all nodes in the cluster.

    Structure of the data:
    [
        {
            "node": <NodeAddress>,
            "config": <sbd_config_dict> or None if there was a failure,
        },
        ...
    ]

    If an error occurs while obtaining the config from a node, that node's
    config will be None. If obtaining the config fails on all nodes, an empty
    list is returned.

    lib_env -- LibraryEnvironment
    """
    node_list, get_nodes_report_list = get_existing_nodes_names(
        lib_env.get_corosync_conf()
    )
    if not node_list:
        get_nodes_report_list.append(
            reports.corosync_config_no_nodes_defined()
        )
    lib_env.report_processor.process_list(get_nodes_report_list)

    com_cmd = GetSbdConfig(lib_env.report_processor)
    com_cmd.set_targets(
        lib_env.get_node_target_factory().get_target_list(
            node_list, skip_non_existing=True
        )
    )
    return run_com(lib_env.get_node_communicator(), com_cmd)

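# A minimal sketch (not part of the library) of consuming the structure
# documented in get_cluster_sbd_config above; the sample data is illustrative,
# not produced by a real cluster.
def _example_print_sbd_configs():
    sbd_configs = [
        {"node": "node-1", "config": {"SBD_WATCHDOG_TIMEOUT": "5"}},
        {"node": "node-2", "config": None},  # obtaining the config failed
    ]
    for entry in sbd_configs:
        if entry["config"] is None:
            print("%s: no config available" % entry["node"])
        else:
            print("%s: %s" % (entry["node"], entry["config"]))
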
def push_corosync_conf(
    self, corosync_conf_facade, skip_offline_nodes=False
):
    corosync_conf_data = corosync_conf_facade.config.export()
    if self.is_corosync_conf_live:
        node_name_list, report_list = get_existing_nodes_names(
            corosync_conf_facade,
            # Pcs is unable to communicate with nodes missing names. It
            # cannot send new corosync.conf to them. That might break the
            # cluster. Hence we error out.
            error_on_missing_name=True
        )
        self.report_processor.process_list(report_list)
        self._push_corosync_conf_live(
            self.get_node_target_factory().get_target_list(
                node_name_list,
                skip_non_existing=skip_offline_nodes,
            ),
            corosync_conf_data,
            corosync_conf_facade.need_stopped_cluster,
            corosync_conf_facade.need_qdevice_reload,
            skip_offline_nodes,
        )
    else:
        self._corosync_conf_data = corosync_conf_data

def get_cluster_sbd_status(lib_env):
    """
    Return the status of the SBD service in the cluster as a dictionary:
    {
        <NodeAddress>: {
            "installed": <boolean>,
            "enabled": <boolean>,
            "running": <boolean>
        },
        ...
    }

    lib_env -- LibraryEnvironment
    """
    node_list, get_nodes_report_list = get_existing_nodes_names(
        lib_env.get_corosync_conf()
    )
    if not node_list:
        get_nodes_report_list.append(
            reports.corosync_config_no_nodes_defined()
        )
    lib_env.report_processor.process_list(get_nodes_report_list)

    com_cmd = GetSbdStatus(lib_env.report_processor)
    com_cmd.set_targets(
        lib_env.get_node_target_factory().get_target_list(
            node_list, skip_non_existing=True
        )
    )
    return run_com(lib_env.get_node_communicator(), com_cmd)

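# A minimal sketch (not part of the library) of consuming the dictionary
# documented in get_cluster_sbd_status above; the sample data is illustrative.
def _example_print_sbd_status():
    sbd_status = {
        "node-1": {"installed": True, "enabled": True, "running": True},
        "node-2": {"installed": True, "enabled": False, "running": False},
    }
    for node, status in sorted(sbd_status.items()):
        state = "running" if status["running"] else "not running"
        print("%s: SBD %s (enabled: %s)" % (node, state, status["enabled"]))
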
def quorum_unblock_cmd(lib, argv, modifiers):
    """
    Options:
      * --force - no error when removing a non-existing property and no
        warning about this action
    """
    modifiers.ensure_only_supported("--force")
    if argv:
        raise CmdLineInputError()

    output, retval = utils.run(
        ["corosync-cmapctl", "-g", "runtime.votequorum.wait_for_all_status"]
    )
    if retval != 0:
        utils.err("unable to check quorum status")
    if output.split("=")[-1].strip() != "1":
        utils.err("cluster is not waiting for nodes to establish quorum")

    all_nodes, report_list = get_existing_nodes_names(
        utils.get_corosync_conf_facade()
    )
    if report_list:
        utils.process_library_reports(report_list)
    unjoined_nodes = set(all_nodes) - set(utils.getCorosyncActiveNodes())
    if not unjoined_nodes:
        utils.err("no unjoined nodes found")
    if not modifiers.get("--force"):
        answer = utils.get_terminal_input(
            (
                "WARNING: If node(s) {nodes} are not powered off or they do"
                + " have access to shared resources, data corruption and/or"
                + " cluster failure may occur. Are you sure you want to"
                + " continue? [y/N] "
            ).format(nodes=", ".join(unjoined_nodes))
        )
        if answer.lower() not in ["y", "yes"]:
            print("Canceled")
            return
    for node in unjoined_nodes:
        # pass --force so no warning will be displayed
        stonith.stonith_confirm(
            lib, [node], parse_args.InputModifiers({"--force": ""})
        )

    output, retval = utils.run(
        ["corosync-cmapctl", "-s", "quorum.cancel_wait_for_all", "u8", "1"]
    )
    if retval != 0:
        utils.err("unable to cancel waiting for nodes")
    print("Quorum unblocked")

    # set startup-fencing to the opposite value and then back to its
    # original value
    startup_fencing = utils.get_set_properties().get("startup-fencing", "")
    utils.set_cib_property(
        "startup-fencing",
        "false" if startup_fencing.lower() != "false" else "true"
    )
    utils.set_cib_property("startup-fencing", startup_fencing)
    print("Waiting for nodes canceled")

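# A minimal sketch of the value parsing used in quorum_unblock_cmd above:
# everything after the last "=" is taken and stripped. The sample line only
# assumes the usual "key (type) = value" shape of corosync-cmapctl output.
def _example_parse_cmapctl_value():
    sample_output = "runtime.votequorum.wait_for_all_status (u8) = 1\n"
    value = sample_output.split("=")[-1].strip()
    assert value == "1"
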
def disable_sbd(lib_env, ignore_offline_nodes=False):
    """
    Disable SBD on all nodes in the cluster.

    lib_env -- LibraryEnvironment
    ignore_offline_nodes -- if True, omit offline nodes
    """
    node_list, get_nodes_report_list = get_existing_nodes_names(
        lib_env.get_corosync_conf()
    )
    if not node_list:
        get_nodes_report_list.append(
            reports.corosync_config_no_nodes_defined()
        )
    lib_env.report_processor.process_list(get_nodes_report_list)

    com_cmd = GetOnlineTargets(
        lib_env.report_processor,
        ignore_offline_targets=ignore_offline_nodes,
    )
    com_cmd.set_targets(
        lib_env.get_node_target_factory().get_target_list(
            node_list,
            skip_non_existing=ignore_offline_nodes,
        )
    )
    online_nodes = run_and_raise(lib_env.get_node_communicator(), com_cmd)

    com_cmd = SetStonithWatchdogTimeoutToZero(lib_env.report_processor)
    com_cmd.set_targets(online_nodes)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    com_cmd = DisableSbdService(lib_env.report_processor)
    com_cmd.set_targets(online_nodes)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    lib_env.report_processor.process(
        reports.cluster_restart_required_to_apply_changes()
    )

def nodes_status(lib, argv, modifiers):
    """
    Options:
      * -f - CIB file - for config subcommand and not for both or corosync
      * --corosync_conf - only for config subcommand

    NOTE: modifiers check is in subcommand
    """
    del lib
    if len(argv) == 1 and argv[0] == "config":
        modifiers.ensure_only_supported("-f", "--corosync_conf")
        if utils.hasCorosyncConf():
            corosync_nodes, report_list = get_existing_nodes_names(
                utils.get_corosync_conf_facade()
            )
            if report_list:
                process_library_reports(report_list)
        else:
            corosync_nodes = []
        try:
            pacemaker_nodes = sorted([
                node.attrs.name
                for node in ClusterState(
                    get_cluster_status_dom(utils.cmd_runner())
                ).node_section.nodes
                if node.attrs.type != "remote"
            ])
        except LibraryError as e:
            process_library_reports(e.args)
        print("Corosync Nodes:")
        if corosync_nodes:
            print(" " + " ".join(corosync_nodes))
        print("Pacemaker Nodes:")
        if pacemaker_nodes:
            print(" " + " ".join(pacemaker_nodes))
        return

    if len(argv) == 1 and (argv[0] == "corosync" or argv[0] == "both"):
        modifiers.ensure_only_supported()
        all_nodes, report_list = get_existing_nodes_names(
            utils.get_corosync_conf_facade()
        )
        if report_list:
            process_library_reports(report_list)
        online_nodes = utils.getCorosyncActiveNodes()
        offline_nodes = []
        for node in all_nodes:
            if node not in online_nodes:
                offline_nodes.append(node)

        online_nodes.sort()
        offline_nodes.sort()
        print("Corosync Nodes:")
        print(" ".join([" Online:"] + online_nodes))
        print(" ".join([" Offline:"] + offline_nodes))
        if argv[0] != "both":
            sys.exit(0)

    modifiers.ensure_only_supported("-f")
    info_dom = utils.getClusterState()
    nodes = info_dom.getElementsByTagName("nodes")
    if nodes.length == 0:
        utils.err("No nodes section found")

    onlinenodes = []
    offlinenodes = []
    standbynodes = []
    standbynodes_with_resources = []
    maintenancenodes = []
    remote_onlinenodes = []
    remote_offlinenodes = []
    remote_standbynodes = []
    remote_standbynodes_with_resources = []
    remote_maintenancenodes = []
    for node in nodes[0].getElementsByTagName("node"):
        node_name = node.getAttribute("name")
        node_remote = node.getAttribute("type") == "remote"
        if node.getAttribute("online") == "true":
            if node.getAttribute("standby") == "true":
                is_running_resources = (
                    node.getAttribute("resources_running") != "0"
                )
                if node_remote:
                    if is_running_resources:
                        remote_standbynodes_with_resources.append(node_name)
                    else:
                        remote_standbynodes.append(node_name)
                else:
                    if is_running_resources:
                        standbynodes_with_resources.append(node_name)
                    else:
                        standbynodes.append(node_name)
            if node.getAttribute("maintenance") == "true":
                if node_remote:
                    remote_maintenancenodes.append(node_name)
                else:
                    maintenancenodes.append(node_name)
            if (
                node.getAttribute("standby") == "false"
                and node.getAttribute("maintenance") == "false"
            ):
                if node_remote:
                    remote_onlinenodes.append(node_name)
                else:
                    onlinenodes.append(node_name)
        else:
            if node_remote:
                remote_offlinenodes.append(node_name)
            else:
                offlinenodes.append(node_name)

    print("Pacemaker Nodes:")
    print(" ".join([" Online:"] + onlinenodes))
    print(" ".join([" Standby:"] + standbynodes))
    print(" ".join(
        [" Standby with resource(s) running:"] + standbynodes_with_resources
    ))
    print(" ".join([" Maintenance:"] + maintenancenodes))
    print(" ".join([" Offline:"] + offlinenodes))

    print("Pacemaker Remote Nodes:")
    print(" ".join([" Online:"] + remote_onlinenodes))
    print(" ".join([" Standby:"] + remote_standbynodes))
    print(" ".join(
        [" Standby with resource(s) running:"]
        + remote_standbynodes_with_resources
    ))
    print(" ".join([" Maintenance:"] + remote_maintenancenodes))
    print(" ".join([" Offline:"] + remote_offlinenodes))

def status_all_sites_plaintext(
    env: LibraryEnvironment,
    hide_inactive_resources: bool = False,
    verbose: bool = False,
) -> List[Mapping[str, Any]]:
    """
    Return the local site's and all remote sites' status as plaintext

    env -- LibraryEnvironment
    hide_inactive_resources -- if True, do not display non-running resources
    verbose -- if True, display more info
    """
    # The command does not provide an option to skip offline / unreachable /
    # misbehaving nodes.
    # The point of such skipping is to stop a command if it is unable to make
    # changes on all nodes. The user can then decide to proceed anyway and
    # make changes on the skipped nodes later manually.
    # This command only reads from nodes so it automatically asks other nodes
    # if one is offline / misbehaving.
    class SiteData():
        def __init__(
            self,
            local: bool,
            role: DrRole,
            target_list: Iterable[RequestTarget],
        ) -> None:
            self.local = local
            self.role = role
            self.target_list = target_list
            self.status_loaded = False
            self.status_plaintext = ""

    if env.ghost_file_codes:
        raise LibraryError(
            reports.live_environment_required(env.ghost_file_codes)
        )

    report_processor = env.report_processor
    report_list, dr_config = _load_dr_config(env.get_dr_env().config)
    report_processor.report_list(report_list)
    if report_processor.has_errors:
        raise LibraryError()

    site_data_list = []
    target_factory = env.get_node_target_factory()

    # get local nodes
    local_nodes, report_list = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    report_processor.report_list(report_list)
    report_list, local_targets = target_factory.get_target_list_with_reports(
        local_nodes,
        skip_non_existing=True,
    )
    report_processor.report_list(report_list)
    site_data_list.append(SiteData(True, dr_config.local_role, local_targets))

    # get remote sites' nodes
    for conf_remote_site in dr_config.get_remote_site_list():
        report_list, remote_targets = (
            target_factory.get_target_list_with_reports(
                conf_remote_site.node_name_list,
                skip_non_existing=True,
            )
        )
        report_processor.report_list(report_list)
        site_data_list.append(
            SiteData(False, conf_remote_site.role, remote_targets)
        )
    if report_processor.has_errors:
        raise LibraryError()

    # get all statuses
    for site_data in site_data_list:
        com_cmd = GetFullClusterStatusPlaintext(
            report_processor,
            hide_inactive_resources=hide_inactive_resources,
            verbose=verbose,
        )
        com_cmd.set_targets(site_data.target_list)
        site_data.status_loaded, site_data.status_plaintext = run_com_cmd(
            env.get_node_communicator(), com_cmd
        )

    return [
        dto.to_dict(
            DrSiteStatusDto(
                local_site=site_data.local,
                site_role=site_data.role,
                status_plaintext=site_data.status_plaintext,
                status_successfully_obtained=site_data.status_loaded,
            )
        )
        for site_data in site_data_list
    ]

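# A minimal sketch (not part of the library) of consuming one element of the
# list returned by status_all_sites_plaintext above; the keys mirror
# DrSiteStatusDto as used in the function, the values are illustrative.
def _example_print_site_status():
    site_status = {
        "local_site": True,
        "site_role": "PRIMARY",
        "status_plaintext": "Cluster name: cluster-a",
        "status_successfully_obtained": True,
    }
    label = "local" if site_status["local_site"] else "remote"
    if site_status["status_successfully_obtained"]:
        print("--- %s site (%s) ---" % (label, site_status["site_role"]))
        print(site_status["status_plaintext"])
    else:
        print("failed to obtain status of the %s site" % label)
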
def add_device(
    lib_env, model, model_options, generic_options, heuristics_options,
    force_model=False, force_options=False, skip_offline_nodes=False
):
    """
    Add a quorum device to a cluster, distribute and reload configs if live

    string model -- quorum device model
    dict model_options -- model specific options
    dict generic_options -- generic quorum device options
    dict heuristics_options -- heuristics options
    bool force_model -- continue even if the model is not valid
    bool force_options -- continue even if options are not valid
    bool skip_offline_nodes -- continue even if not all nodes are accessible
    """
    cfg = lib_env.get_corosync_conf()
    if cfg.has_quorum_device():
        raise LibraryError(reports.qdevice_already_defined())

    report_processor = SimpleReportProcessor(lib_env.report_processor)
    report_processor.report_list(
        corosync_conf_validators.add_quorum_device(
            model,
            model_options,
            generic_options,
            heuristics_options,
            [node.nodeid for node in cfg.get_nodes()],
            force_model=force_model,
            force_options=force_options
        )
    )

    if lib_env.is_corosync_conf_live:
        cluster_nodes_names, report_list = get_existing_nodes_names(
            cfg,
            # Pcs is unable to communicate with nodes missing names. It cannot
            # send new corosync.conf to them. That might break the cluster.
            # Hence we error out.
            error_on_missing_name=True
        )
        report_processor.report_list(report_list)

    if report_processor.has_errors:
        raise LibraryError()

    cfg.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
    )
    if cfg.is_quorum_device_heuristics_enabled_with_no_exec():
        lib_env.report_processor.process(
            reports.corosync_quorum_heuristics_enabled_with_no_exec()
        )

    # First setup certificates for qdevice, then send corosync.conf to nodes.
    # If anything fails, nodes will not have corosync.conf with qdevice in it,
    # so there is no effect on the cluster.
    if lib_env.is_corosync_conf_live:
        target_factory = lib_env.get_node_target_factory()
        target_list = target_factory.get_target_list(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )
        # Do model specific configuration.
        # If the model is not known to pcs and was forced, do not configure
        # anything else than corosync.conf, as we do not know what to do
        # anyway.
        if model == "net":
            qdevice_net.set_up_client_certificates(
                lib_env.cmd_runner(),
                lib_env.report_processor,
                lib_env.communicator_factory,
                # We are sure the "host" key is there, it has been validated
                # above.
                target_factory.get_target_from_hostname(
                    model_options["host"]
                ),
                cfg.get_cluster_name(),
                target_list,
                skip_offline_nodes
            )

        lib_env.report_processor.process(
            reports.service_enable_started("corosync-qdevice")
        )
        com_cmd = qdevice_com.Enable(
            lib_env.report_processor, skip_offline_nodes
        )
        com_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # everything set up, it's safe to tell the nodes to use qdevice
    lib_env.push_corosync_conf(cfg, skip_offline_nodes)

    # Now, when corosync.conf has been reloaded, we can start qdevice service.
    if lib_env.is_corosync_conf_live:
        lib_env.report_processor.process(
            reports.service_start_started("corosync-qdevice")
        )
        com_cmd = qdevice_com.Start(
            lib_env.report_processor, skip_offline_nodes
        )
        com_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd)

def remove_device(lib_env, skip_offline_nodes=False):
    """
    Stop using quorum device, distribute and reload configs if live

    skip_offline_nodes -- continue even if not all nodes are accessible
    """
    cfg = lib_env.get_corosync_conf()
    if not cfg.has_quorum_device():
        raise LibraryError(reports.qdevice_not_defined())
    model = cfg.get_quorum_device_model()
    cfg.remove_quorum_device()

    if lib_env.is_corosync_conf_live:
        report_processor = SimpleReportProcessor(lib_env.report_processor)
        # get nodes for communication
        cluster_nodes_names, report_list = get_existing_nodes_names(
            cfg,
            # Pcs is unable to communicate with nodes missing names. It cannot
            # send new corosync.conf to them. That might break the cluster.
            # Hence we error out.
            error_on_missing_name=True
        )
        report_processor.report_list(report_list)
        if report_processor.has_errors:
            raise LibraryError()
        target_list = lib_env.get_node_target_factory().get_target_list(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )

        # fix quorum options for SBD to work properly
        if sbd.atb_has_to_be_enabled(lib_env.cmd_runner(), cfg):
            lib_env.report_processor.process(
                reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
            )
            cfg.set_quorum_options({"auto_tie_breaker": "1"})

        # disable qdevice
        lib_env.report_processor.process(
            reports.service_disable_started("corosync-qdevice")
        )
        com_cmd = qdevice_com.Disable(
            lib_env.report_processor, skip_offline_nodes
        )
        com_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd)
        # stop qdevice
        lib_env.report_processor.process(
            reports.service_stop_started("corosync-qdevice")
        )
        com_cmd = qdevice_com.Stop(
            lib_env.report_processor, skip_offline_nodes
        )
        com_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd)
        # handle model specific configuration
        if model == "net":
            lib_env.report_processor.process(
                reports.qdevice_certificate_removal_started()
            )
            com_cmd = qdevice_net_com.ClientDestroy(
                lib_env.report_processor, skip_offline_nodes
            )
            com_cmd.set_targets(target_list)
            run_and_raise(lib_env.get_node_communicator(), com_cmd)

    lib_env.push_corosync_conf(cfg, skip_offline_nodes)

def remove_device(lib_env: LibraryEnvironment, skip_offline_nodes=False):
    """
    Stop using quorum device, distribute and reload configs if live

    skip_offline_nodes -- continue even if not all nodes are accessible
    """
    cfg = lib_env.get_corosync_conf()
    if not cfg.has_quorum_device():
        raise LibraryError(
            ReportItem.error(reports.messages.QdeviceNotDefined())
        )
    model = cfg.get_quorum_device_model()
    cfg.remove_quorum_device()

    if lib_env.is_corosync_conf_live:
        report_processor = lib_env.report_processor
        # get nodes for communication
        cluster_nodes_names, report_list = get_existing_nodes_names(
            cfg,
            # Pcs is unable to communicate with nodes missing names. It cannot
            # send new corosync.conf to them. That might break the cluster.
            # Hence we error out.
            error_on_missing_name=True,
        )
        if report_processor.report_list(report_list).has_errors:
            raise LibraryError()
        target_list = lib_env.get_node_target_factory().get_target_list(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )

        # fix quorum options for SBD to work properly
        if sbd.atb_has_to_be_enabled(lib_env.service_manager, cfg):
            lib_env.report_processor.report(
                ReportItem.warning(
                    reports.messages.CorosyncQuorumAtbWillBeEnabledDueToSbd()
                )
            )
            cfg.set_quorum_options({"auto_tie_breaker": "1"})

        # disable qdevice
        lib_env.report_processor.report(
            ReportItem.info(
                reports.messages.ServiceActionStarted(
                    reports.const.SERVICE_ACTION_DISABLE, "corosync-qdevice"
                )
            )
        )
        com_cmd_disable = qdevice_com.Disable(
            lib_env.report_processor, skip_offline_nodes
        )
        com_cmd_disable.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd_disable)
        # stop qdevice
        lib_env.report_processor.report(
            ReportItem.info(
                reports.messages.ServiceActionStarted(
                    reports.const.SERVICE_ACTION_STOP, "corosync-qdevice"
                )
            )
        )
        com_cmd_stop = qdevice_com.Stop(
            lib_env.report_processor, skip_offline_nodes
        )
        com_cmd_stop.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd_stop)
        # handle model specific configuration
        if model == "net":
            lib_env.report_processor.report(
                ReportItem.info(
                    reports.messages.QdeviceCertificateRemovalStarted()
                )
            )
            com_cmd_client_destroy = qdevice_net_com.ClientDestroy(
                lib_env.report_processor, skip_offline_nodes
            )
            com_cmd_client_destroy.set_targets(target_list)
            run_and_raise(
                lib_env.get_node_communicator(), com_cmd_client_destroy
            )

    lib_env.push_corosync_conf(cfg, skip_offline_nodes)

def full_cluster_status_plaintext(
    env: LibraryEnvironment,
    hide_inactive_resources: bool = False,
    verbose: bool = False,
) -> str:
    """
    Return full cluster status as plaintext

    env -- LibraryEnvironment
    hide_inactive_resources -- if True, do not display non-running resources
    verbose -- if True, display more info
    """
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-locals

    # validation
    if not env.is_cib_live and env.is_corosync_conf_live:
        raise LibraryError(
            reports.live_environment_not_consistent(
                [file_type_codes.CIB],
                [file_type_codes.COROSYNC_CONF],
            )
        )
    if env.is_cib_live and not env.is_corosync_conf_live:
        raise LibraryError(
            reports.live_environment_not_consistent(
                [file_type_codes.COROSYNC_CONF],
                [file_type_codes.CIB],
            )
        )

    # initialization
    runner = env.cmd_runner()
    report_processor = SimpleReportProcessor(env.report_processor)
    live = env.is_cib_live and env.is_corosync_conf_live
    is_sbd_running = False

    # load status, cib, corosync.conf
    status_text, warning_list = get_cluster_status_text(
        runner, hide_inactive_resources, verbose
    )
    corosync_conf = env.get_corosync_conf()
    cib = env.get_cib()
    if verbose:
        ticket_status_text, ticket_status_stderr, ticket_status_retval = (
            get_ticket_status_text(runner)
        )
    # get extra info if live
    if live:
        try:
            is_sbd_running = is_service_running(runner, get_sbd_service_name())
        except LibraryError:
            pass
        local_services_status = _get_local_services_status(runner)
        if verbose:
            node_name_list, node_names_report_list = get_existing_nodes_names(
                corosync_conf
            )
            report_processor.report_list(node_names_report_list)
            node_reachability = _get_node_reachability(
                env.get_node_target_factory(),
                env.get_node_communicator(),
                report_processor,
                node_name_list,
            )

    # check stonith configuration
    warning_list = list(warning_list)
    warning_list.extend(_stonith_warnings(cib, is_sbd_running))

    # put it all together
    if report_processor.has_errors:
        raise LibraryError()

    parts = []
    parts.append(f"Cluster name: {corosync_conf.get_cluster_name()}")
    if warning_list:
        parts.extend(["", "WARNINGS:"] + warning_list + [""])
    parts.append(status_text)
    if verbose:
        parts.extend(["", "Tickets:"])
        if ticket_status_retval != 0:
            ticket_warning_parts = [
                "WARNING: Unable to get information about tickets"
            ]
            if ticket_status_stderr:
                ticket_warning_parts.extend(
                    indent(ticket_status_stderr.splitlines())
                )
            parts.extend(indent(ticket_warning_parts))
        else:
            parts.extend(indent(ticket_status_text.splitlines()))
    if live:
        if verbose:
            parts.extend(["", "PCSD Status:"])
            parts.extend(
                indent(
                    _format_node_reachability(node_name_list, node_reachability)
                )
            )
        parts.extend(["", "Daemon Status:"])
        parts.extend(
            indent(_format_local_services_status(local_services_status))
        )
    return "\n".join(parts)

def set_recovery_site(env: LibraryEnvironment, node_name: str) -> None:
    """
    Set up disaster recovery with the local cluster being the primary site

    env
    node_name -- a known host from the recovery site
    """
    # pylint: disable=too-many-locals
    if env.ghost_file_codes:
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentRequired(
                    env.ghost_file_codes
                )
            )
        )
    report_processor = env.report_processor
    dr_env = env.get_dr_env()
    if dr_env.config.raw_file.exists():
        report_processor.report(
            ReportItem.error(reports.messages.DrConfigAlreadyExist())
        )
    target_factory = env.get_node_target_factory()

    local_nodes, report_list = get_existing_nodes_names(
        env.get_corosync_conf(), error_on_missing_name=True
    )
    report_processor.report_list(report_list)

    if node_name in local_nodes:
        report_processor.report(
            ReportItem.error(reports.messages.NodeInLocalCluster(node_name))
        )

    report_list, local_targets = target_factory.get_target_list_with_reports(
        local_nodes, allow_skip=False, report_none_host_found=False
    )
    report_processor.report_list(report_list)

    report_list, remote_targets = target_factory.get_target_list_with_reports(
        [node_name], allow_skip=False, report_none_host_found=False
    )
    report_processor.report_list(report_list)

    if report_processor.has_errors:
        raise LibraryError()

    # TODO The new file framework doesn't support network communication yet.
    com_cmd = GetCorosyncConf(env.report_processor)
    com_cmd.set_targets(remote_targets)
    corosync_conf_instance = FileInstance.for_corosync_conf()
    try:
        remote_cluster_nodes, report_list = get_existing_nodes_names(
            cast(
                CorosyncConfigFacade,
                corosync_conf_instance.raw_to_facade(
                    run_and_raise(
                        env.get_node_communicator(), com_cmd
                    ).encode("utf-8")
                ),
            ),
            error_on_missing_name=True,
        )
    except ParserErrorException as e:
        report_processor.report_list(
            corosync_conf_instance.toolbox.parser.exception_to_report_list(
                e,
                file_type_codes.COROSYNC_CONF,
                None,
                force_code=None,
                is_forced_or_warning=False,
            )
        )
    if report_processor.report_list(report_list).has_errors:
        raise LibraryError()

    # ensure we have tokens for all nodes of remote cluster
    report_list, remote_targets = target_factory.get_target_list_with_reports(
        remote_cluster_nodes, allow_skip=False, report_none_host_found=False
    )
    if report_processor.report_list(report_list).has_errors:
        raise LibraryError()
    dr_config_exporter = get_file_toolbox(
        file_type_codes.PCS_DR_CONFIG
    ).exporter
    # create dr config for remote cluster
    remote_dr_cfg = dr_env.create_facade(DrRole.RECOVERY)
    remote_dr_cfg.add_site(DrRole.PRIMARY, local_nodes)
    # send config to all nodes of remote cluster
    distribute_file_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.pcs_dr_config_file(
            dr_config_exporter.export(remote_dr_cfg.config)
        ),
    )
    distribute_file_cmd.set_targets(remote_targets)
    run_and_raise(env.get_node_communicator(), distribute_file_cmd)

    # create new dr config, with local cluster as primary site
    local_dr_cfg = dr_env.create_facade(DrRole.PRIMARY)
    local_dr_cfg.add_site(DrRole.RECOVERY, remote_cluster_nodes)
    distribute_file_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.pcs_dr_config_file(
            dr_config_exporter.export(local_dr_cfg.config)
        ),
    )
    distribute_file_cmd.set_targets(local_targets)
    run_and_raise(env.get_node_communicator(), distribute_file_cmd)

def location_prefer(lib, argv, modifiers):
    """
    Options:
      * --force - allow unknown options, allow constraint for any resource
        type
      * -f - CIB file
    """
    modifiers.ensure_only_supported("--force", "-f")
    rsc = argv.pop(0)
    prefer_option = argv.pop(0)

    dummy_rsc_type, rsc_value = parse_args.parse_typed_arg(
        rsc,
        [RESOURCE_TYPE_RESOURCE, RESOURCE_TYPE_REGEXP],
        RESOURCE_TYPE_RESOURCE
    )

    if prefer_option == "prefers":
        prefer = True
    elif prefer_option == "avoids":
        prefer = False
    else:
        raise CmdLineInputError()

    skip_node_check = False
    if modifiers.is_specified("-f") or modifiers.get("--force"):
        skip_node_check = True
        warn(LOCATION_NODE_VALIDATION_SKIP_MSG)
    else:
        lib_env = utils.get_lib_env()
        existing_nodes, report_list = get_existing_nodes_names(
            corosync_conf=lib_env.get_corosync_conf(),
            cib=lib_env.get_cib(),
        )
        if report_list:
            process_library_reports(report_list)

    report_list = []
    parameters_list = []
    for nodeconf in argv:
        nodeconf_a = nodeconf.split("=", 1)
        node = nodeconf_a[0]
        if not skip_node_check:
            report_list += _verify_node_name(node, existing_nodes)
        if len(nodeconf_a) == 1:
            if prefer:
                score = "INFINITY"
            else:
                score = "-INFINITY"
        else:
            score = nodeconf_a[1]
            _verify_score(score)
            if not prefer:
                if score[0] == "-":
                    score = score[1:]
                else:
                    score = "-" + score
        parameters_list.append([
            sanitize_id(f"location-{rsc_value}-{node}-{score}"),
            rsc,
            node,
            score
        ])

    if report_list:
        process_library_reports(report_list)

    modifiers = modifiers.get_subset("--force", "-f")
    for parameters in parameters_list:
        location_add(lib, parameters, modifiers, skip_score_and_node_check=True)

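# A minimal sketch of the score negation performed by location_prefer above
# for the "avoids" variant: an explicitly given score is flipped to the
# opposite sign.
def _example_flip_score(score):
    return score[1:] if score.startswith("-") else "-" + score

assert _example_flip_score("100") == "-100"
assert _example_flip_score("-100") == "100"
assert _example_flip_score("INFINITY") == "-INFINITY"
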
def full_cluster_status_plaintext(
    env: LibraryEnvironment,
    hide_inactive_resources: bool = False,
    verbose: bool = False,
) -> str:
    """
    Return full cluster status as plaintext

    env -- LibraryEnvironment
    hide_inactive_resources -- if True, do not display non-running resources
    verbose -- if True, display more info
    """
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-statements

    # validation
    if not env.is_cib_live and env.is_corosync_conf_live:
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentNotConsistent(
                    [file_type_codes.CIB],
                    [file_type_codes.COROSYNC_CONF],
                )
            )
        )
    if env.is_cib_live and not env.is_corosync_conf_live:
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentNotConsistent(
                    [file_type_codes.COROSYNC_CONF],
                    [file_type_codes.CIB],
                )
            )
        )

    # initialization
    runner = env.cmd_runner()
    report_processor = env.report_processor
    live = env.is_cib_live and env.is_corosync_conf_live
    is_sbd_running = False

    # load status, cib, corosync.conf
    status_text, warning_list = get_cluster_status_text(
        runner, hide_inactive_resources, verbose
    )
    corosync_conf = None
    # If we are live on a remote node, we have no corosync.conf.
    # TODO Use the new file framework so the path is not exposed.
    if not live or os.path.exists(settings.corosync_conf_file):
        corosync_conf = env.get_corosync_conf()
    cib = env.get_cib()
    if verbose:
        (
            ticket_status_text,
            ticket_status_stderr,
            ticket_status_retval,
        ) = get_ticket_status_text(runner)
    # get extra info if live
    if live:
        try:
            is_sbd_running = is_service_running(runner, get_sbd_service_name())
        except LibraryError:
            pass
        local_services_status = _get_local_services_status(runner)
        if verbose and corosync_conf:
            node_name_list, node_names_report_list = get_existing_nodes_names(
                corosync_conf
            )
            report_processor.report_list(node_names_report_list)
            node_reachability = _get_node_reachability(
                env.get_node_target_factory(),
                env.get_node_communicator(),
                report_processor,
                node_name_list,
            )

    # check stonith configuration
    warning_list = list(warning_list)
    warning_list.extend(_stonith_warnings(cib, is_sbd_running))

    # put it all together
    if report_processor.has_errors:
        raise LibraryError()

    cluster_name = (
        corosync_conf.get_cluster_name()
        if corosync_conf
        else nvpair.get_value(
            "cluster_property_set", get_crm_config(cib), "cluster-name", ""
        )
    )
    parts = []
    parts.append(f"Cluster name: {cluster_name}")
    if warning_list:
        parts.extend(["", "WARNINGS:"] + warning_list + [""])
    parts.append(status_text)
    if verbose:
        parts.extend(["", "Tickets:"])
        if ticket_status_retval != 0:
            ticket_warning_parts = [
                "WARNING: Unable to get information about tickets"
            ]
            if ticket_status_stderr:
                ticket_warning_parts.extend(
                    indent(ticket_status_stderr.splitlines())
                )
            parts.extend(indent(ticket_warning_parts))
        else:
            parts.extend(indent(ticket_status_text.splitlines()))
    if live:
        if verbose and corosync_conf:
            parts.extend(["", "PCSD Status:"])
            parts.extend(
                indent(
                    _format_node_reachability(node_name_list, node_reachability)
                )
            )
        parts.extend(["", "Daemon Status:"])
        parts.extend(
            indent(_format_local_services_status(local_services_status))
        )
    return "\n".join(parts)

def update_scsi_devices(
    env: LibraryEnvironment,
    stonith_id: str,
    set_device_list: Iterable[str],
    force_flags: Container[reports.types.ForceCode] = (),
) -> None:
    """
    Update scsi fencing devices without restart and affecting other resources.

    env -- provides all for communication with externals
    stonith_id -- id of the stonith resource
    set_device_list -- paths to the scsi devices that would be set for the
        stonith resource
    force_flags -- list of flags codes
    """
    if not is_getting_resource_digest_supported(env.cmd_runner()):
        raise LibraryError(
            ReportItem.error(
                reports.messages.StonithRestartlessUpdateOfScsiDevicesNotSupported()
            )
        )
    cib = env.get_cib()
    if not set_device_list:
        env.report_processor.report(
            ReportItem.error(
                reports.messages.InvalidOptionValue(
                    "devices", "", None, cannot_be_empty=True
                )
            )
        )
    (
        stonith_el,
        report_list,
    ) = stonith.validate_stonith_restartless_update(cib, stonith_id)
    if env.report_processor.report_list(report_list).has_errors:
        raise LibraryError()
    # for mypy; this should not happen because an exception would be raised
    if stonith_el is None:
        raise AssertionError("stonith element is None")

    stonith.update_scsi_devices_without_restart(
        env.cmd_runner(),
        env.get_cluster_state(),
        stonith_el,
        IdProvider(cib),
        set_device_list,
    )

    # Unfencing
    cluster_nodes_names, nodes_report_list = get_existing_nodes_names(
        env.get_corosync_conf(),
        error_on_missing_name=True,
    )
    env.report_processor.report_list(nodes_report_list)
    (
        target_report_list,
        cluster_nodes_target_list,
    ) = env.get_node_target_factory().get_target_list_with_reports(
        cluster_nodes_names,
        allow_skip=False,
    )
    env.report_processor.report_list(target_report_list)
    if env.report_processor.has_errors:
        raise LibraryError()
    com_cmd: AllSameDataMixin = GetCorosyncOnlineTargets(
        env.report_processor,
        skip_offline_targets=reports.codes.SKIP_OFFLINE_NODES in force_flags,
    )
    com_cmd.set_targets(cluster_nodes_target_list)
    online_corosync_target_list = run_and_raise(
        env.get_node_communicator(), com_cmd
    )
    com_cmd = Unfence(env.report_processor, sorted(set_device_list))
    com_cmd.set_targets(online_corosync_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    env.push_cib()

def config_sync(
    env: LibraryEnvironment,
    instance_name=None,
    skip_offline_nodes=False,
):
    """
    Send the specified local booth configuration to all nodes in the local
    cluster.

    env
    string instance_name -- booth instance name
    skip_offline_nodes -- if True, offline nodes will be skipped
    """
    report_processor = env.report_processor
    booth_env = env.get_booth_env(instance_name)
    if not env.is_cib_live:
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentRequired([file_type_codes.CIB])
            )
        )

    cluster_nodes_names, report_list = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    if not cluster_nodes_names:
        report_list.append(
            ReportItem.error(reports.messages.CorosyncConfigNoNodesDefined())
        )
    report_processor.report_list(report_list)

    try:
        booth_conf_data = booth_env.config.read_raw()
        booth_conf = booth_env.config.raw_to_facade(booth_conf_data)
        if isinstance(booth_env.config.raw_file, GhostFile):
            authfile_data = booth_env.key.read_raw()
            authfile_path = booth_conf.get_authfile()
            authfile_name = (
                os.path.basename(authfile_path) if authfile_path else None
            )
        else:
            (
                authfile_name,
                authfile_data,
                authfile_report_list,
            ) = config_files.get_authfile_name_and_data(booth_conf)
            report_processor.report_list(authfile_report_list)
    except RawFileError as e:
        report_processor.report(raw_file_error_report(e))
    except ParserErrorException as e:
        report_processor.report_list(
            booth_env.config.parser_exception_to_report_list(e)
        )
    if report_processor.has_errors:
        raise LibraryError()

    com_cmd = BoothSendConfig(
        env.report_processor,
        booth_env.instance_name,
        booth_conf_data,
        authfile=authfile_name,
        authfile_data=authfile_data,
        skip_offline_targets=skip_offline_nodes,
    )
    com_cmd.set_targets(
        env.get_node_target_factory().get_target_list(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )
    )
    run_and_raise(env.get_node_communicator(), com_cmd)

def synchronize_ssl_certificate(env: LibraryEnvironment, skip_offline=False):
    """
    Send the local pcsd SSL cert and key to all full nodes in the local
    cluster.

    Consider the case where the pcs Web UI is accessed via an IP running as a
    resource in the cluster. When the IP is moved, the user's browser connects
    to the new node and we want it to get the same certificate to make the
    transition a seamless experience (otherwise the browser displays a warning
    that the certificate has changed).
    Using the pcsd Web UI on remote and guest nodes is not supported (pcs/pcsd
    depends on the corosync.conf file being present on the local node) so we
    send the cert only to corosync (== full stack) nodes.
    """
    report_processor = env.report_processor
    target_factory = env.get_node_target_factory()
    cluster_nodes_names, report_list = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    if not cluster_nodes_names:
        report_list.append(
            ReportItem.error(reports.messages.CorosyncConfigNoNodesDefined())
        )
    report_processor.report_list(report_list)

    try:
        with open(settings.pcsd_cert_location, "r") as file:
            ssl_cert = file.read()
    except EnvironmentError as e:
        report_processor.report(
            ReportItem.error(
                reports.messages.FileIoError(
                    file_type_codes.PCSD_SSL_CERT,
                    RawFileError.ACTION_READ,
                    format_environment_error(e),
                    file_path=settings.pcsd_cert_location,
                )
            )
        )
    try:
        with open(settings.pcsd_key_location, "r") as file:
            ssl_key = file.read()
    except EnvironmentError as e:
        report_processor.report(
            ReportItem.error(
                reports.messages.FileIoError(
                    file_type_codes.PCSD_SSL_KEY,
                    RawFileError.ACTION_READ,
                    format_environment_error(e),
                    file_path=settings.pcsd_key_location,
                )
            )
        )

    (
        target_report_list,
        target_list,
    ) = target_factory.get_target_list_with_reports(
        cluster_nodes_names, skip_non_existing=skip_offline
    )
    report_processor.report_list(target_report_list)

    if report_processor.has_errors:
        raise LibraryError()

    env.report_processor.report(
        ReportItem.info(
            reports.messages.PcsdSslCertAndKeyDistributionStarted(
                sorted([target.label for target in target_list])
            )
        )
    )

    com_cmd = SendPcsdSslCertAndKey(env.report_processor, ssl_cert, ssl_key)
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

def location_add(lib, argv, modifiers, skip_score_and_node_check=False):
    """
    Options:
      * --force - allow unknown options, allow constraint for any resource
        type
      * -f - CIB file
    """
    del lib
    modifiers.ensure_only_supported("--force", "-f")
    if len(argv) < 4:
        raise CmdLineInputError()

    constraint_id = argv.pop(0)
    rsc_type, rsc_value = parse_args.parse_typed_arg(
        argv.pop(0),
        [RESOURCE_TYPE_RESOURCE, RESOURCE_TYPE_REGEXP],
        RESOURCE_TYPE_RESOURCE
    )
    node = argv.pop(0)
    score = argv.pop(0)
    options = []
    # For now we only allow setting resource-discovery
    if argv:
        for arg in argv:
            if '=' in arg:
                options.append(arg.split('=', 1))
            else:
                raise CmdLineInputError(f"bad option '{arg}'")
            if (
                options[-1][0] != "resource-discovery"
                and not modifiers.get("--force")
            ):
                utils.err(
                    "bad option '%s', use --force to override"
                    % options[-1][0]
                )

    # Verify that specified node exists in the cluster and score is valid
    if not skip_score_and_node_check:
        if modifiers.is_specified("-f") or modifiers.get("--force"):
            warn(LOCATION_NODE_VALIDATION_SKIP_MSG)
        else:
            lib_env = utils.get_lib_env()
            existing_nodes, report_list = get_existing_nodes_names(
                corosync_conf=lib_env.get_corosync_conf(),
                cib=lib_env.get_cib(),
            )
            report_list = _verify_node_name(node, existing_nodes)
            if report_list:
                process_library_reports(report_list)
        _verify_score(score)

    id_valid, id_error = utils.validate_xml_id(constraint_id, 'constraint id')
    if not id_valid:
        utils.err(id_error)

    required_version = None
    if [x for x in options if x[0] == "resource-discovery"]:
        required_version = 2, 2, 0
    if rsc_type == RESOURCE_TYPE_REGEXP:
        required_version = 2, 6, 0
    if required_version:
        dom = utils.cluster_upgrade_to_version(required_version)
    else:
        dom = utils.get_cib_dom()

    if rsc_type == RESOURCE_TYPE_RESOURCE:
        rsc_valid, rsc_error, dummy_correct_id = (
            utils.validate_constraint_resource(dom, rsc_value)
        )
        if not rsc_valid:
            utils.err(rsc_error)

    # Verify current constraint doesn't already exist
    # If it does we replace it with the new constraint
    dummy_dom, constraintsElement = getCurrentConstraints(dom)
    elementsToRemove = []
    # If the id matches, or the rsc & node match, then we replace/remove
    for rsc_loc in constraintsElement.getElementsByTagName('rsc_location'):
        # pylint: disable=too-many-boolean-expressions
        if (
            rsc_loc.getAttribute("id") == constraint_id
            or (
                rsc_loc.getAttribute("node") == node
                and (
                    (
                        RESOURCE_TYPE_RESOURCE == rsc_type
                        and rsc_loc.getAttribute("rsc") == rsc_value
                    )
                    or (
                        RESOURCE_TYPE_REGEXP == rsc_type
                        and rsc_loc.getAttribute("rsc-pattern") == rsc_value
                    )
                )
            )
        ):
            elementsToRemove.append(rsc_loc)
    for etr in elementsToRemove:
        constraintsElement.removeChild(etr)

    element = dom.createElement("rsc_location")
    element.setAttribute("id", constraint_id)
    if rsc_type == RESOURCE_TYPE_RESOURCE:
        element.setAttribute("rsc", rsc_value)
    elif rsc_type == RESOURCE_TYPE_REGEXP:
        element.setAttribute("rsc-pattern", rsc_value)
    element.setAttribute("node", node)
    element.setAttribute("score", score)
    for option in options:
        element.setAttribute(option[0], option[1])
    constraintsElement.appendChild(element)

    utils.replace_cib_configuration(dom)

def enable_sbd(
    lib_env,
    default_watchdog,
    watchdog_dict,
    sbd_options,
    default_device_list=None,
    node_device_dict=None,
    allow_unknown_opts=False,
    ignore_offline_nodes=False,
    no_watchdog_validation=False,
    allow_invalid_option_values=False,
):
    # pylint: disable=too-many-arguments, too-many-locals
    """
    Enable SBD on all nodes in the cluster.

    lib_env -- LibraryEnvironment
    default_watchdog -- watchdog for nodes which are not specified in
        watchdog_dict. Uses the default value from settings if None.
    watchdog_dict -- dictionary with node names as keys and watchdog paths
        as values
    sbd_options -- dictionary in format: <SBD config option>: <value>
    default_device_list -- list of devices for all nodes
    node_device_dict -- dictionary with node names as keys and lists of
        devices as values
    allow_unknown_opts -- if True, accept also unknown options
    ignore_offline_nodes -- if True, omit offline nodes
    no_watchdog_validation -- if True, do not validate existence of a
        watchdog on the nodes
    allow_invalid_option_values -- if True, invalid values of some options
        will be treated as warnings instead of errors
    """
    using_devices = not (
        default_device_list is None and node_device_dict is None
    )
    if default_device_list is None:
        default_device_list = []
    if node_device_dict is None:
        node_device_dict = {}
    if not default_watchdog:
        default_watchdog = settings.sbd_watchdog_default
    sbd_options = {opt.upper(): val for opt, val in sbd_options.items()}

    corosync_conf = lib_env.get_corosync_conf()

    node_list, get_nodes_report_list = get_existing_nodes_names(corosync_conf)
    if not node_list:
        get_nodes_report_list.append(
            reports.corosync_config_no_nodes_defined()
        )
    target_list = lib_env.get_node_target_factory().get_target_list(
        node_list,
        skip_non_existing=ignore_offline_nodes,
    )

    full_watchdog_dict = _get_full_target_dict(
        target_list, watchdog_dict, default_watchdog
    )
    full_device_dict = _get_full_target_dict(
        target_list, node_device_dict, default_device_list
    )

    lib_env.report_processor.process_list(
        get_nodes_report_list
        +
        [
            reports.node_not_found(node)
            for node in (
                set(list(watchdog_dict.keys()) + list(node_device_dict.keys()))
                -
                set(node_list)
            )
        ]
        +
        _validate_watchdog_dict(full_watchdog_dict)
        +
        (sbd.validate_nodes_devices(full_device_dict) if using_devices else [])
        +
        _validate_sbd_options(
            sbd_options, allow_unknown_opts, allow_invalid_option_values
        )
    )

    com_cmd = GetOnlineTargets(
        lib_env.report_processor,
        ignore_offline_targets=ignore_offline_nodes,
    )
    com_cmd.set_targets(target_list)
    online_targets = run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # check if SBD can be enabled
    if no_watchdog_validation:
        lib_env.report_processor.report(
            reports.sbd_watchdog_validation_inactive()
        )
    com_cmd = CheckSbd(lib_env.report_processor)
    for target in online_targets:
        com_cmd.add_request(
            target,
            (
                # Do not send watchdog if validation is turned off. Listing of
                # available watchdogs in pcsd may restart the machine in some
                # corner cases.
                "" if no_watchdog_validation
                else full_watchdog_dict[target.label]
            ),
            full_device_dict[target.label] if using_devices else [],
        )
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # enable ATB if needed
    if not using_devices:
        if sbd.atb_has_to_be_enabled_pre_enable_check(corosync_conf):
            lib_env.report_processor.process(
                reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
            )
            corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})
            lib_env.push_corosync_conf(corosync_conf, ignore_offline_nodes)

    # distribute SBD configuration
    config = sbd.get_default_sbd_config()
    config.update(sbd_options)
    com_cmd = SetSbdConfig(lib_env.report_processor)
    for target in online_targets:
        com_cmd.add_request(
            target,
            sbd.create_sbd_config(
                config,
                target.label,
                full_watchdog_dict[target.label],
                full_device_dict[target.label]
            )
        )
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # remove cluster prop 'stonith_watchdog_timeout'
    com_cmd = RemoveStonithWatchdogTimeout(lib_env.report_processor)
    com_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # enable SBD service on all nodes
    com_cmd = EnableSbdService(lib_env.report_processor)
    com_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    lib_env.report_processor.process(
        reports.cluster_restart_required_to_apply_changes()
    )

def add_device(
    lib_env: LibraryEnvironment,
    model,
    model_options,
    generic_options,
    heuristics_options,
    force_model=False,
    force_options=False,
    skip_offline_nodes=False,
):
    # pylint: disable=too-many-locals
    """
    Add a quorum device to a cluster, distribute and reload configs if live

    string model -- quorum device model
    dict model_options -- model specific options
    dict generic_options -- generic quorum device options
    dict heuristics_options -- heuristics options
    bool force_model -- continue even if the model is not valid
    bool force_options -- continue even if options are not valid
    bool skip_offline_nodes -- continue even if not all nodes are accessible
    """
    cfg = lib_env.get_corosync_conf()
    if cfg.has_quorum_device():
        raise LibraryError(
            ReportItem.error(reports.messages.QdeviceAlreadyDefined())
        )

    report_processor = lib_env.report_processor
    report_processor.report_list(
        corosync_conf_validators.add_quorum_device(
            model,
            model_options,
            generic_options,
            heuristics_options,
            [node.nodeid for node in cfg.get_nodes()],
            force_model=force_model,
            force_options=force_options,
        )
    )

    if lib_env.is_corosync_conf_live:
        cluster_nodes_names, report_list = get_existing_nodes_names(
            cfg,
            # Pcs is unable to communicate with nodes missing names. It cannot
            # send new corosync.conf to them. That might break the cluster.
            # Hence we error out.
            error_on_missing_name=True,
        )
        report_processor.report_list(report_list)

    if report_processor.has_errors:
        raise LibraryError()

    cfg.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
    )
    if cfg.is_quorum_device_heuristics_enabled_with_no_exec():
        lib_env.report_processor.report(
            ReportItem.warning(
                reports.messages.CorosyncQuorumHeuristicsEnabledWithNoExec()
            )
        )

    # First setup certificates for qdevice, then send corosync.conf to nodes.
    # If anything fails, nodes will not have corosync.conf with qdevice in it,
    # so there is no effect on the cluster.
    if lib_env.is_corosync_conf_live:
        target_factory = lib_env.get_node_target_factory()
        target_list = target_factory.get_target_list(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )
        # Do model specific configuration.
        # If the model is not known to pcs and was forced, do not configure
        # anything else than corosync.conf, as we do not know what to do
        # anyway.
        if model == "net":
            qdevice_net.set_up_client_certificates(
                lib_env.cmd_runner(),
                lib_env.report_processor,
                lib_env.communicator_factory,
                # We are sure the "host" key is there, it has been validated
                # above.
                target_factory.get_target_from_hostname(
                    model_options["host"]
                ),
                cfg.get_cluster_name(),
                target_list,
                skip_offline_nodes,
            )

        lib_env.report_processor.report(
            ReportItem.info(
                reports.messages.ServiceActionStarted(
                    reports.const.SERVICE_ACTION_ENABLE, "corosync-qdevice"
                )
            )
        )
        com_cmd = qdevice_com.Enable(
            lib_env.report_processor, skip_offline_nodes
        )
        com_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # everything set up, it's safe to tell the nodes to use qdevice
    lib_env.push_corosync_conf(cfg, skip_offline_nodes)

    # Now, when corosync.conf has been reloaded, we can start qdevice service.
    if lib_env.is_corosync_conf_live:
        lib_env.report_processor.report(
            ReportItem.info(
                reports.messages.ServiceActionStarted(
                    reports.const.SERVICE_ACTION_START, "corosync-qdevice"
                )
            )
        )
        com_cmd_start = qdevice_com.Start(
            lib_env.report_processor, skip_offline_nodes
        )
        com_cmd_start.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd_start)

def config_restore_remote(infile_name, infile_obj):
    """
    Commandline options:
      * --request-timeout - timeout for HTTP requests
    """
    extracted = {
        "version.txt": "",
        "corosync.conf": "",
    }
    try:
        tarball = tarfile.open(infile_name, "r|*", infile_obj)
        while True:
            # next(tarball) does not work in python2.6
            tar_member_info = tarball.next()
            if tar_member_info is None:
                break
            if tar_member_info.name in extracted:
                tar_member = tarball.extractfile(tar_member_info)
                extracted[tar_member_info.name] = tar_member.read()
                tar_member.close()
        tarball.close()
    except (tarfile.TarError, EnvironmentError) as e:
        utils.err("unable to read the tarball: %s" % e)

    config_backup_check_version(extracted["version.txt"])

    node_list, report_list = get_existing_nodes_names(
        utils.get_corosync_conf_facade(
            conf_text=extracted["corosync.conf"].decode("utf-8")
        )
    )
    if report_list:
        utils.process_library_reports(report_list)
    if not node_list:
        utils.err("no nodes found in the tarball")

    err_msgs = []
    for node in node_list:
        try:
            retval, output = utils.checkStatus(node)
            if retval != 0:
                err_msgs.append(output)
                continue
            _status = json.loads(output)
            if (
                _status["corosync"]
                or _status["pacemaker"]
                # not supported by older pcsd, do not fail if not present
                or _status.get("pacemaker_remote", False)
            ):
                err_msgs.append(
                    "Cluster is currently running on node %s. You need to stop "
                    "the cluster in order to restore the configuration." % node
                )
                continue
        except (ValueError, NameError, LookupError):
            err_msgs.append("unable to determine status of the node %s" % node)
    if err_msgs:
        for msg in err_msgs:
            utils.err(msg, False)
        sys.exit(1)

    # Temporarily disable config files syncing thread in pcsd so it will not
    # rewrite restored files. 10 minutes should be enough time to restore.
    # If node returns HTTP 404 it does not support config syncing at all.
    for node in node_list:
        retval, output = utils.pauseConfigSyncing(node, 10 * 60)
        if not (retval == 0 or "(HTTP error: 404)" in output):
            utils.err(output)

    if infile_obj:
        infile_obj.seek(0)
        tarball_data = infile_obj.read()
    else:
        with open(infile_name, "rb") as tarball:
            tarball_data = tarball.read()

    error_list = []
    for node in node_list:
        retval, error = utils.restoreConfig(node, tarball_data)
        if retval != 0:
            error_list.append(error)
    if error_list:
        utils.err("unable to restore all nodes\n" + "\n".join(error_list))

def set_recovery_site(env: LibraryEnvironment, node_name: str) -> None:
    """
    Set up disaster recovery with the local cluster being the primary site

    env
    node_name -- a known host from the recovery site
    """
    if env.ghost_file_codes:
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentRequired(
                    env.ghost_file_codes
                )
            )
        )
    report_processor = env.report_processor
    dr_env = env.get_dr_env()
    if dr_env.config.raw_file.exists():
        report_processor.report(
            ReportItem.error(reports.messages.DrConfigAlreadyExist())
        )
    target_factory = env.get_node_target_factory()

    local_nodes, report_list = get_existing_nodes_names(
        env.get_corosync_conf(), error_on_missing_name=True
    )
    report_processor.report_list(report_list)

    if node_name in local_nodes:
        report_processor.report(
            ReportItem.error(reports.messages.NodeInLocalCluster(node_name))
        )

    report_list, local_targets = target_factory.get_target_list_with_reports(
        local_nodes, allow_skip=False, report_none_host_found=False
    )
    report_processor.report_list(report_list)

    report_list, remote_targets = target_factory.get_target_list_with_reports(
        [node_name], allow_skip=False, report_none_host_found=False
    )
    report_processor.report_list(report_list)

    if report_processor.has_errors:
        raise LibraryError()

    com_cmd = GetCorosyncConf(env.report_processor)
    com_cmd.set_targets(remote_targets)
    remote_cluster_nodes, report_list = get_existing_nodes_names(
        CorosyncConfigFacade.from_string(
            run_and_raise(env.get_node_communicator(), com_cmd)
        ),
        error_on_missing_name=True,
    )
    if report_processor.report_list(report_list).has_errors:
        raise LibraryError()

    # ensure we have tokens for all nodes of remote cluster
    report_list, remote_targets = target_factory.get_target_list_with_reports(
        remote_cluster_nodes, allow_skip=False, report_none_host_found=False
    )
    if report_processor.report_list(report_list).has_errors:
        raise LibraryError()
    dr_config_exporter = get_file_toolbox(
        file_type_codes.PCS_DR_CONFIG
    ).exporter
    # create dr config for remote cluster
    remote_dr_cfg = dr_env.create_facade(DrRole.RECOVERY)
    remote_dr_cfg.add_site(DrRole.PRIMARY, local_nodes)
    # send config to all nodes of remote cluster
    distribute_file_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.pcs_dr_config_file(
            dr_config_exporter.export(remote_dr_cfg.config)
        ),
    )
    distribute_file_cmd.set_targets(remote_targets)
    run_and_raise(env.get_node_communicator(), distribute_file_cmd)

    # create new dr config, with local cluster as primary site
    local_dr_cfg = dr_env.create_facade(DrRole.PRIMARY)
    local_dr_cfg.add_site(DrRole.RECOVERY, remote_cluster_nodes)
    distribute_file_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.pcs_dr_config_file(
            dr_config_exporter.export(local_dr_cfg.config)
        ),
    )
    distribute_file_cmd.set_targets(local_targets)
    run_and_raise(env.get_node_communicator(), distribute_file_cmd)

def enable_sbd(
    lib_env,
    default_watchdog,
    watchdog_dict,
    sbd_options,
    default_device_list=None,
    node_device_dict=None,
    allow_unknown_opts=False,
    ignore_offline_nodes=False,
    no_watchdog_validation=False,
    allow_invalid_option_values=False,
):
    # pylint: disable=too-many-arguments, too-many-locals
    """
    Enable SBD on all nodes in the cluster.

    lib_env -- LibraryEnvironment
    default_watchdog -- watchdog for nodes which are not specified in
        watchdog_dict. Uses the default value from settings if None.
    watchdog_dict -- dictionary with node names as keys and watchdog paths
        as values
    sbd_options -- dictionary in format: <SBD config option>: <value>
    default_device_list -- list of devices for all nodes
    node_device_dict -- dictionary with node names as keys and lists of
        devices as values
    allow_unknown_opts -- if True, accept also unknown options
    ignore_offline_nodes -- if True, omit offline nodes
    no_watchdog_validation -- if True, do not validate existence of a
        watchdog on the nodes
    allow_invalid_option_values -- if True, invalid values of some options
        will be treated as warnings instead of errors
    """
    using_devices = not (
        default_device_list is None and node_device_dict is None
    )
    if default_device_list is None:
        default_device_list = []
    if node_device_dict is None:
        node_device_dict = {}
    if not default_watchdog:
        default_watchdog = settings.sbd_watchdog_default
    sbd_options = {opt.upper(): val for opt, val in sbd_options.items()}

    corosync_conf = lib_env.get_corosync_conf()

    node_list, get_nodes_report_list = get_existing_nodes_names(corosync_conf)
    if not node_list:
        get_nodes_report_list.append(
            ReportItem.error(reports.messages.CorosyncConfigNoNodesDefined())
        )
    target_list = lib_env.get_node_target_factory().get_target_list(
        node_list,
        skip_non_existing=ignore_offline_nodes,
    )

    full_watchdog_dict = _get_full_target_dict(
        target_list, watchdog_dict, default_watchdog
    )
    full_device_dict = _get_full_target_dict(
        target_list, node_device_dict, default_device_list
    )

    if lib_env.report_processor.report_list(
        get_nodes_report_list
        + [
            ReportItem.error(reports.messages.NodeNotFound(node))
            for node in (
                set(list(watchdog_dict.keys()) + list(node_device_dict.keys()))
                - set(node_list)
            )
        ]
        + _validate_watchdog_dict(full_watchdog_dict)
        + (
            sbd.validate_nodes_devices(full_device_dict)
            if using_devices
            else []
        )
        + _validate_sbd_options(
            sbd_options, allow_unknown_opts, allow_invalid_option_values
        )
    ).has_errors:
        raise LibraryError()

    com_cmd = GetOnlineTargets(
        lib_env.report_processor,
        ignore_offline_targets=ignore_offline_nodes,
    )
    com_cmd.set_targets(target_list)
    online_targets = run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # check if SBD can be enabled
    if no_watchdog_validation:
        lib_env.report_processor.report(
            ReportItem.warning(
                reports.messages.SbdWatchdogValidationInactive()
            )
        )
    com_cmd = CheckSbd(lib_env.report_processor)
    for target in online_targets:
        com_cmd.add_request(
            target,
            (
                # Do not send watchdog if validation is turned off. Listing of
                # available watchdogs in pcsd may restart the machine in some
                # corner cases.
                ""
                if no_watchdog_validation
                else full_watchdog_dict[target.label]
            ),
            full_device_dict[target.label] if using_devices else [],
        )
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # enable ATB if needed
    if not using_devices:
        if sbd.atb_has_to_be_enabled_pre_enable_check(corosync_conf):
            lib_env.report_processor.report(
                ReportItem.warning(
                    reports.messages.CorosyncQuorumAtbWillBeEnabledDueToSbd()
                )
            )
            corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})
            lib_env.push_corosync_conf(corosync_conf, ignore_offline_nodes)

    # distribute SBD configuration
    config = sbd.get_default_sbd_config()
    config.update(sbd_options)
    com_cmd = SetSbdConfig(lib_env.report_processor)
    for target in online_targets:
        com_cmd.add_request(
            target,
            sbd.create_sbd_config(
                config,
                target.label,
                full_watchdog_dict[target.label],
                full_device_dict[target.label],
            ),
        )
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # remove cluster prop 'stonith_watchdog_timeout'
    com_cmd = RemoveStonithWatchdogTimeout(lib_env.report_processor)
    com_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # enable SBD service on all nodes
    com_cmd = EnableSbdService(lib_env.report_processor)
    com_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    lib_env.report_processor.report(
        ReportItem.warning(
            reports.messages.ClusterRestartRequiredToApplyChanges()
        )
    )