def _destroy_pcmk_remote_env(
    env, node_names_list, skip_offline_nodes, allow_fails
):
    """
    Stop and disable pacemaker_remote on the given nodes, then remove the
    pacemaker_remote authkey file from them.

    env -- source of the report processor, target factory and communicator
    node_names_list -- names of the nodes to be cleaned up
    skip_offline_nodes -- forwarded as skip_non_existing to the target lookup
        and as skip_offline_targets to both commands
    allow_fails -- forwarded as allow_fails to both commands
    """
    service_actions = node_communication_format.create_pcmk_remote_actions(
        ["stop", "disable"]
    )
    authkey_files = {
        "pacemaker_remote authkey": {"type": "pcmk_remote_authkey"},
    }
    targets = env.get_node_target_factory().get_target_list(
        node_names_list, skip_non_existing=skip_offline_nodes
    )

    # The service is handled first, the authkey is removed afterwards.
    service_cmd = ServiceAction(
        env.report_processor,
        service_actions,
        skip_offline_targets=skip_offline_nodes,
        allow_fails=allow_fails,
    )
    service_cmd.set_targets(targets)
    run_and_raise(env.get_node_communicator(), service_cmd)

    remove_cmd = RemoveFiles(
        env.report_processor,
        authkey_files,
        skip_offline_targets=skip_offline_nodes,
        allow_fails=allow_fails,
    )
    remove_cmd.set_targets(targets)
    run_and_raise(env.get_node_communicator(), remove_cmd)
def _start_cluster(
    communicator_factory, report_processor, target_list, wait_timeout=False
):
    """
    Start the cluster on the given targets and optionally wait for pacemaker
    to start.

    communicator_factory -- factory providing communicators
    report_processor -- report processor instance
    target_list -- targets the start request is sent to
    wait_timeout -- False means do not wait; any other value (None or
        a timeout) is handed over to _wait_for_pacemaker_to_start
    """
    # Large clusters take longer time to start up. So we make the timeout
    # longer for each 8 nodes:
    #  1 -  8 nodes: 1 * timeout
    #  9 - 16 nodes: 2 * timeout
    # 17 - 24 nodes: 3 * timeout
    # and so on ...
    # Users can override this and set their own timeout by specifying
    # the --request-timeout option.
    timeout_multiplier = math.ceil(len(target_list) / 8.0)
    request_timeout = int(
        settings.default_request_timeout * timeout_multiplier
    )

    start_cmd = StartCluster(report_processor)
    start_cmd.set_targets(target_list)
    run_and_raise(
        communicator_factory.get_communicator(
            request_timeout=request_timeout
        ),
        start_cmd,
    )

    if wait_timeout is False:
        return
    # wait_timeout is either None or a timeout
    report_processor.process_list(
        _wait_for_pacemaker_to_start(
            communicator_factory.get_communicator(),
            report_processor,
            target_list,
            timeout=wait_timeout,
        )
    )
def _push_corosync_conf_live(
    self,
    target_list,
    corosync_conf_data,
    need_stopped_cluster,
    need_qdevice_reload,
    skip_offline_nodes
):
    """
    Send corosync.conf data to the targets and reload what depends on it.

    target_list -- targets all node commands in here are sent to
    corosync_conf_data -- data handed over to DistributeCorosyncConf
    need_stopped_cluster -- if True, CheckCorosyncOffline is run on the
        targets before the config is distributed
    need_qdevice_reload -- if True, qdevice is stopped and started on the
        targets once the config has been distributed
    skip_offline_nodes -- handed over to the node commands
    """
    if need_stopped_cluster:
        offline_check_cmd = CheckCorosyncOffline(
            self.report_processor, skip_offline_nodes
        )
        offline_check_cmd.set_targets(target_list)
        run_and_raise(self.get_node_communicator(), offline_check_cmd)

    distribute_cmd = DistributeCorosyncConf(
        self.report_processor, corosync_conf_data, skip_offline_nodes
    )
    distribute_cmd.set_targets(target_list)
    run_and_raise(self.get_node_communicator(), distribute_cmd)

    # The config is reloaded only when corosync is running.
    if is_service_running(self.cmd_runner(), "corosync"):
        reload_corosync_config(self.cmd_runner())
        self.report_processor.process(reports.corosync_config_reloaded())

    if not need_qdevice_reload:
        return

    self.report_processor.process(reports.qdevice_client_reload_started())
    # Both commands go through run(), not run_and_raise(), so the start is
    # attempted even when the stop produced errors.
    stop_cmd = qdevice.Stop(self.report_processor, skip_offline_nodes)
    stop_cmd.set_targets(target_list)
    run(self.get_node_communicator(), stop_cmd)
    collected_errors = stop_cmd.error_list

    start_cmd = qdevice.Start(self.report_processor, skip_offline_nodes)
    start_cmd.set_targets(target_list)
    run(self.get_node_communicator(), start_cmd)
    collected_errors += start_cmd.error_list

    if collected_errors:
        raise LibraryError()
def config_sync(env, skip_offline_nodes=False):
    """
    Send specified local booth configuration to all nodes in cluster.

    env -- LibraryEnvironment
    skip_offline_nodes -- if True offline nodes will be skipped
    """
    booth_config = env.booth.get_config_content()
    key_path = config_structure.get_authfile(parse(booth_config))
    key_content = config_files.read_authfile(env.report_processor, key_path)

    node_names, node_reports = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    if not node_names:
        node_reports.append(reports.corosync_config_no_nodes_defined())
    env.report_processor.process_list(node_reports)

    send_cmd = BoothSendConfig(
        env.report_processor,
        env.booth.name,
        booth_config,
        authfile=key_path,
        authfile_data=key_content,
        skip_offline_targets=skip_offline_nodes
    )
    targets = env.get_node_target_factory().get_target_list(
        node_names, skip_non_existing=skip_offline_nodes
    )
    send_cmd.set_targets(targets)
    run_and_raise(env.get_node_communicator(), send_cmd)
def config_sync(env, skip_offline_nodes=False):
    """
    Send specified local booth configuration to all nodes in cluster.

    env -- LibraryEnvironment
    skip_offline_nodes -- if True offline nodes will be skipped
    """
    booth_config = env.booth.get_config_content()
    key_path = config_structure.get_authfile(parse(booth_config))
    key_content = config_files.read_authfile(env.report_processor, key_path)

    send_cmd = BoothSendConfig(
        env.report_processor,
        env.booth.name,
        booth_config,
        authfile=key_path,
        authfile_data=key_content,
        skip_offline_targets=skip_offline_nodes
    )
    # Node names are taken directly from the corosync.conf facade.
    targets = env.get_node_target_factory().get_target_list(
        env.get_corosync_conf().get_nodes_names(),
        skip_non_existing=skip_offline_nodes,
    )
    send_cmd.set_targets(targets)
    run_and_raise(env.get_node_communicator(), send_cmd)
def add_device(
    lib_env,
    model,
    model_options,
    generic_options,
    force_model=False,
    force_options=False,
    skip_offline_nodes=False
):
    """
    Add quorum device to cluster, distribute and reload configs if live

    model -- quorum device model
    model_options -- model specific options dict
    generic_options -- generic quorum device options dict
    force_model -- continue even if the model is not valid
    force_options -- continue even if options are not valid
    skip_offline_nodes -- continue even if not all nodes are accessible
    """
    __ensure_not_cman(lib_env)

    corosync_conf = lib_env.get_corosync_conf()
    # Try adding qdevice to corosync.conf. This validates all the options and
    # makes sure qdevice is not defined in corosync.conf yet.
    corosync_conf.add_quorum_device(
        lib_env.report_processor,
        model,
        model_options,
        generic_options,
        force_model,
        force_options
    )
    node_targets = lib_env.get_node_target_factory().get_target_list(
        corosync_conf.get_nodes()
    )

    # First setup certificates for qdevice, then send corosync.conf to nodes.
    # If anything fails, nodes will not have corosync.conf with qdevice in it,
    # so there is no effect on the cluster.
    if lib_env.is_corosync_conf_live:
        # Do model specific configuration. If the model is not known to pcs
        # and was forced, do not configure anything else but corosync.conf,
        # as we do not know what to do anyway.
        if model == "net":
            _add_device_model_net(
                lib_env,
                # we are sure it's there, it was validated in
                # add_quorum_device
                model_options["host"],
                corosync_conf.get_cluster_name(),
                corosync_conf.get_nodes(),
                skip_offline_nodes
            )

        lib_env.report_processor.process(
            reports.service_enable_started("corosync-qdevice")
        )
        enable_cmd = qdevice_com.Enable(
            lib_env.report_processor, skip_offline_nodes
        )
        enable_cmd.set_targets(node_targets)
        run_and_raise(lib_env.get_node_communicator(), enable_cmd)

    # everything set up, it's safe to tell the nodes to use qdevice
    lib_env.push_corosync_conf(corosync_conf, skip_offline_nodes)

    # Now, when corosync.conf has been reloaded, we can start qdevice service.
    if lib_env.is_corosync_conf_live:
        lib_env.report_processor.process(
            reports.service_start_started("corosync-qdevice")
        )
        start_cmd = qdevice_com.Start(
            lib_env.report_processor, skip_offline_nodes
        )
        start_cmd.set_targets(node_targets)
        run_and_raise(lib_env.get_node_communicator(), start_cmd)
def disable_sbd(lib_env, ignore_offline_nodes=False):
    """
    Disable SBD on all nodes in cluster.

    lib_env -- LibraryEnvironment
    ignore_offline_nodes -- if True, omit offline nodes
    """
    node_names, node_reports = get_existing_nodes_names(
        lib_env.get_corosync_conf()
    )
    if not node_names:
        node_reports.append(reports.corosync_config_no_nodes_defined())
    if lib_env.report_processor.report_list(node_reports).has_errors:
        raise LibraryError()

    # Find out which targets are online; the following commands are sent to
    # those only.
    online_cmd = GetOnlineTargets(
        lib_env.report_processor,
        ignore_offline_targets=ignore_offline_nodes,
    )
    all_targets = lib_env.get_node_target_factory().get_target_list(
        node_names, skip_non_existing=ignore_offline_nodes
    )
    online_cmd.set_targets(all_targets)
    online_targets = run_and_raise(
        lib_env.get_node_communicator(), online_cmd
    )

    watchdog_cmd = SetStonithWatchdogTimeoutToZero(lib_env.report_processor)
    watchdog_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), watchdog_cmd)

    disable_cmd = DisableSbdService(lib_env.report_processor)
    disable_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), disable_cmd)

    lib_env.report_processor.report(
        reports.cluster_restart_required_to_apply_changes()
    )
def _destroy_pcmk_remote_env(
    env, node_names_list, skip_offline_nodes, allow_fails
):
    """
    Run the pacemaker_remote "stop" and "disable" service actions on the given
    nodes and then remove the pacemaker_remote authkey file from them.

    env -- source of the report processor, target factory and communicator
    node_names_list -- names of the nodes to be cleaned up
    skip_offline_nodes -- forwarded as skip_non_existing to the target lookup
        and as skip_offline_targets to both commands
    allow_fails -- forwarded as allow_fails to both commands
    """
    service_actions = node_communication_format.create_pcmk_remote_actions(
        ["stop", "disable"]
    )
    authkey_files = {
        "pacemaker_remote authkey": {"type": "pcmk_remote_authkey"},
    }
    targets = env.get_node_target_factory().get_target_list(
        node_names_list,
        skip_non_existing=skip_offline_nodes,
    )

    # Both steps share the same shape: build the command, aim it at the same
    # targets, run it. The order (services first, files second) matters.
    steps = (
        (ServiceAction, service_actions),
        (RemoveFiles, authkey_files),
    )
    for command_class, payload in steps:
        com_cmd = command_class(
            env.report_processor,
            payload,
            skip_offline_targets=skip_offline_nodes,
            allow_fails=allow_fails,
        )
        com_cmd.set_targets(targets)
        run_and_raise(env.get_node_communicator(), com_cmd)
def _unfencing_scsi_devices( env: LibraryEnvironment, stonith_el: _Element, original_devices: Iterable[str], updated_devices: Iterable[str], force_flags: Container[reports.types.ForceCode] = (), ) -> None: """ Unfence scsi devices provided in device_list if it is possible to connect to pcsd and corosync is running. env -- provides all for communication with externals original_devices -- devices before update updated_devices -- devices after update force_flags -- list of flags codes """ devices_to_unfence = set(updated_devices) - set(original_devices) if not devices_to_unfence: return cluster_nodes_names, nodes_report_list = get_existing_nodes_names( env.get_corosync_conf(), error_on_missing_name=True, ) env.report_processor.report_list(nodes_report_list) ( target_report_list, cluster_nodes_target_list, ) = env.get_node_target_factory().get_target_list_with_reports( cluster_nodes_names, allow_skip=False, ) env.report_processor.report_list(target_report_list) if env.report_processor.has_errors: raise LibraryError() com_cmd: AllSameDataMixin = GetCorosyncOnlineTargets( env.report_processor, skip_offline_targets=reports.codes.SKIP_OFFLINE_NODES in force_flags, ) com_cmd.set_targets(cluster_nodes_target_list) online_corosync_target_list = run_and_raise(env.get_node_communicator(), com_cmd) if stonith_el.get("type") == "fence_mpath": com_cmd = UnfenceMpath( env.report_processor, original_devices=sorted(original_devices), updated_devices=sorted(updated_devices), node_key_map=resource.stonith.get_node_key_map_for_mpath( stonith_el, [target.label for target in online_corosync_target_list], ), ) else: # fence_scsi com_cmd = Unfence( env.report_processor, original_devices=sorted(original_devices), updated_devices=sorted(updated_devices), ) com_cmd.set_targets(online_corosync_target_list) run_and_raise(env.get_node_communicator(), com_cmd)
def synchronize_ssl_certificate(env, skip_offline=False):
    """
    Send the local pcsd SSL cert and key to all full nodes in the local
    cluster.

    Consider the pcs Web UI is accessed via an IP running as a resource in the
    cluster. When the IP is moved, the user's browser connects to the new node
    and we want it to get the same certificate to make the transition
    a seamless experience (otherwise the browser displays a warning that the
    certificate has changed).
    Using pcsd Web UI on remote and guest nodes is not supported (pcs/pcsd
    depends on the corosync.conf file being present on the local node) so we
    send the cert only to corosync (== full stack) nodes.

    env -- source of the report processor, corosync.conf, target factory and
        communicator
    skip_offline -- forwarded as skip_non_existing to the target lookup
    """
    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()

    node_names, node_reports = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    if not node_names:
        node_reports.append(reports.corosync_config_no_nodes_defined())
    report_processor.report_list(node_reports)

    # A failed read is only reported here. The function bails out on the
    # has_errors check below, before the missing content could be used.
    try:
        with open(settings.pcsd_cert_location, "r") as cert_file:
            ssl_cert = cert_file.read()
    except EnvironmentError as e:
        report_processor.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_CERT,
                file_path=settings.pcsd_cert_location,
                reason=format_environment_error(e),
                operation="read",
            )
        )

    try:
        with open(settings.pcsd_key_location, "r") as key_file:
            ssl_key = key_file.read()
    except EnvironmentError as e:
        report_processor.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_KEY,
                file_path=settings.pcsd_key_location,
                reason=format_environment_error(e),
                operation="read",
            )
        )

    target_reports, targets = target_factory.get_target_list_with_reports(
        node_names, skip_non_existing=skip_offline
    )
    report_processor.report_list(target_reports)

    if report_processor.has_errors:
        raise LibraryError()

    target_labels = [target.label for target in targets]
    env.report_processor.process(
        reports.pcsd_ssl_cert_and_key_distribution_started(target_labels)
    )

    send_cmd = SendPcsdSslCertAndKey(env.report_processor, ssl_cert, ssl_key)
    send_cmd.set_targets(targets)
    run_and_raise(env.get_node_communicator(), send_cmd)
def config_sync(
    env: LibraryEnvironment,
    instance_name=None,
    skip_offline_nodes=False,
):
    """
    Send specified local booth configuration to all nodes in the local cluster.

    env
    string instance_name -- booth instance name
    skip_offline_nodes -- if True offline nodes will be skipped
    """
    report_processor = env.report_processor
    booth_env = env.get_booth_env(instance_name)
    if not env.is_cib_live:
        raise LibraryError(
            reports.live_environment_required([file_type_codes.CIB])
        )

    node_names, node_reports = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    if not node_names:
        node_reports.append(reports.corosync_config_no_nodes_defined())
    report_processor.report_list(node_reports)

    # Failures in here are only reported; the has_errors check right after
    # the try block stops the function before the data would be used.
    try:
        booth_conf_data = booth_env.config.read_raw()
        booth_conf = booth_env.config.raw_to_facade(booth_conf_data)
        if not isinstance(booth_env.config.raw_file, GhostFile):
            (
                authfile_name,
                authfile_data,
                authfile_report_list,
            ) = config_files.get_authfile_name_and_data(booth_conf)
            report_processor.report_list(authfile_report_list)
        else:
            # With a ghost config file the key is read through booth_env
            # and only the base name of the configured authfile is sent.
            authfile_data = booth_env.key.read_raw()
            authfile_path = booth_conf.get_authfile()
            if authfile_path:
                authfile_name = os.path.basename(authfile_path)
            else:
                authfile_name = None
    except RawFileError as e:
        report_processor.report(raw_file_error_report(e))
    except ParserErrorException as e:
        report_processor.report_list(
            booth_env.config.parser_exception_to_report_list(e)
        )
    if report_processor.has_errors:
        raise LibraryError()

    send_cmd = BoothSendConfig(
        env.report_processor,
        booth_env.instance_name,
        booth_conf_data,
        authfile=authfile_name,
        authfile_data=authfile_data,
        skip_offline_targets=skip_offline_nodes,
    )
    targets = env.get_node_target_factory().get_target_list(
        node_names, skip_non_existing=skip_offline_nodes
    )
    send_cmd.set_targets(targets)
    run_and_raise(env.get_node_communicator(), send_cmd)
def destroy(
    env: LibraryEnvironment,
    force_flags: Container[reports.types.ForceCode] = (),
) -> None:
    """
    Destroy disaster-recovery configuration on all sites

    env -- source of the report processor, configs, target factory and
        communicator
    force_flags -- list of flags codes; SKIP_OFFLINE_NODES is forwarded as
        skip_non_existing to the target lookup

    Raises LibraryError when a ghost file is in use or when any step reports
    an error.
    """
    if env.ghost_file_codes:
        live_required = reports.messages.LiveEnvironmentRequired(
            env.ghost_file_codes
        )
        raise LibraryError(ReportItem.error(live_required))

    report_processor = env.report_processor
    skip_offline = report_codes.SKIP_OFFLINE_NODES in force_flags

    dr_reports, dr_config = _load_dr_config(env.get_dr_env().config)
    report_processor.report_list(dr_reports)
    if report_processor.has_errors:
        raise LibraryError()

    local_nodes, node_reports = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    report_processor.report_list(node_reports)
    if report_processor.has_errors:
        raise LibraryError()

    # Nodes of all remote sites, in the order the sites are listed.
    remote_nodes: List[str] = [
        node_name
        for remote_site in dr_config.get_remote_site_list()
        for node_name in remote_site.node_name_list
    ]

    target_reports, targets = (
        env.get_node_target_factory().get_target_list_with_reports(
            remote_nodes + local_nodes,
            skip_non_existing=skip_offline,
        )
    )
    report_processor.report_list(target_reports)
    if report_processor.has_errors:
        raise LibraryError()

    files_to_remove = {
        "pcs disaster-recovery config": {
            "type": "pcs_disaster_recovery_conf",
        },
    }
    remove_cmd = RemoveFilesWithoutForces(
        env.report_processor, files_to_remove
    )
    remove_cmd.set_targets(targets)
    run_and_raise(env.get_node_communicator(), remove_cmd)
def _remove_device_model_net(lib_env, cluster_nodes, skip_offline_nodes):
    """
    remove configuration used by qdevice model net

    NodeAddressesList cluster_nodes -- list of cluster nodes addresses
    bool skip_offline_nodes -- continue even if not all nodes are accessible
    """
    # NOTE(review): skip_offline_nodes is accepted but not handed over to
    # ClientDestroy or to the target lookup -- confirm whether that is
    # intended.
    report_processor = lib_env.report_processor
    report_processor.process(reports.qdevice_certificate_removal_started())

    destroy_cmd = qdevice_net_com.ClientDestroy(report_processor)
    node_targets = lib_env.get_node_target_factory().get_target_list(
        cluster_nodes
    )
    destroy_cmd.set_targets(node_targets)
    run_and_raise(lib_env.get_node_communicator(), destroy_cmd)
def remove_device(lib_env, skip_offline_nodes=False):
    """
    Stop using quorum device, distribute and reload configs if live

    skip_offline_nodes -- continue even if not all nodes are accessible
    """
    __ensure_not_cman(lib_env)

    corosync_conf = lib_env.get_corosync_conf()
    # Only the model is needed; the remaining three items are ignored.
    model, _unused_a, _unused_b, _unused_c = (
        corosync_conf.get_quorum_device_settings()
    )
    corosync_conf.remove_quorum_device()

    if lib_env.is_corosync_conf_live:
        node_targets = lib_env.get_node_target_factory().get_target_list(
            corosync_conf.get_nodes()
        )

        # fix quorum options for SBD to work properly
        if sbd.atb_has_to_be_enabled(lib_env.cmd_runner(), corosync_conf):
            lib_env.report_processor.process(reports.sbd_requires_atb())
            corosync_conf.set_quorum_options(
                lib_env.report_processor, {"auto_tie_breaker": "1"}
            )

        # disable qdevice
        lib_env.report_processor.process(
            reports.service_disable_started("corosync-qdevice")
        )
        disable_cmd = qdevice_com.Disable(
            lib_env.report_processor, skip_offline_nodes
        )
        disable_cmd.set_targets(node_targets)
        run_and_raise(lib_env.get_node_communicator(), disable_cmd)

        # stop qdevice
        lib_env.report_processor.process(
            reports.service_stop_started("corosync-qdevice")
        )
        stop_cmd = qdevice_com.Stop(
            lib_env.report_processor, skip_offline_nodes
        )
        stop_cmd.set_targets(node_targets)
        run_and_raise(lib_env.get_node_communicator(), stop_cmd)

        # handle model specific configuration
        if model == "net":
            _remove_device_model_net(
                lib_env, corosync_conf.get_nodes(), skip_offline_nodes
            )

    lib_env.push_corosync_conf(corosync_conf, skip_offline_nodes)
def pull_config(env, node_name, name):
    """
    Get config from specified node and save it on local system. It will
    rewrite existing files.

    env -- LibraryEnvironment
    node_name -- string, name of node from which config should be fetched
    name -- string, name of booth instance of which config should be fetched
    """
    env.report_processor.process(
        booth_reports.booth_fetching_config_from_node_started(node_name, name)
    )

    fetch_cmd = BoothGetConfig(env.report_processor, name)
    source_target = env.get_node_target_factory().get_target_from_hostname(
        node_name
    )
    fetch_cmd.set_targets([source_target])
    output = run_and_raise(env.get_node_communicator(), fetch_cmd)[0][1]

    try:
        env.booth.create_config(output["config"]["data"], True)
        authfile = output["authfile"]
        # The key is stored only when the node sent both its name and data.
        if authfile["name"] is not None and authfile["data"]:
            key_path = os.path.join(
                settings.booth_config_dir, authfile["name"]
            )
            env.booth.set_key_path(key_path)
            key_data = base64.b64decode(authfile["data"].encode("utf-8"))
            env.booth.create_key(key_data, True)
        env.report_processor.process(
            booth_reports.booth_config_accepted_by_node(name_list=[name])
        )
    except KeyError:
        # A missing key is reported as an invalid response from the node.
        raise LibraryError(reports.invalid_response_format(node_name))
def pull_config(env: LibraryEnvironment, node_name, instance_name=None):
    """
    Get config from specified node and save it on local system. It will
    rewrite existing files.

    env
    string node_name -- name of the node from which the config should be fetched
    string instance_name -- booth instance name
    """
    report_processor = env.report_processor
    booth_env = env.get_booth_env(instance_name)
    instance_name = booth_env.instance_name
    _ensure_live_env(env, booth_env)
    conf_dir = os.path.dirname(booth_env.config_path)

    fetching_started = reports.messages.BoothFetchingConfigFromNode(
        node_name,
        config=instance_name,
    )
    env.report_processor.report(ReportItem.info(fetching_started))

    fetch_cmd = BoothGetConfig(env.report_processor, instance_name)
    source_target = env.get_node_target_factory().get_target_from_hostname(
        node_name
    )
    fetch_cmd.set_targets([source_target])
    # pylint: disable=unsubscriptable-object
    # In general, pylint is right. And it cannot know in this case code is OK.
    # It is covered by tests.
    output = run_and_raise(env.get_node_communicator(), fetch_cmd)[0][1]

    try:
        # TODO adapt to new file transfer framework once it is written
        authfile = output["authfile"]
        # The key goes first and only if the node sent both its name and
        # data; the config itself is written afterwards.
        if authfile["name"] is not None and authfile["data"]:
            authfile_name = authfile["name"]
            name_reports = config_validators.check_instance_name(
                authfile_name
            )
            if name_reports:
                raise LibraryError(*name_reports)
            key_file = FileInstance.for_booth_key(authfile_name)
            key_file.write_raw(
                base64.b64decode(authfile["data"].encode("utf-8")),
                can_overwrite=True,
            )
        booth_env.config.write_raw(
            output["config"]["data"].encode("utf-8"),
            can_overwrite=True,
        )
        accepted = reports.messages.BoothConfigAcceptedByNode(
            name_list=[instance_name]
        )
        env.report_processor.report(ReportItem.info(accepted))
    except RawFileError as e:
        # A missing config directory gets its own, more specific report.
        if os.path.exists(conf_dir):
            report_processor.report(raw_file_error_report(e))
        else:
            report_processor.report(
                ReportItem.error(
                    reports.messages.BoothPathNotExists(conf_dir)
                )
            )
    except KeyError as e:
        raise LibraryError(
            ReportItem.error(
                reports.messages.InvalidResponseFormat(node_name)
            )
        ) from e

    if report_processor.has_errors:
        raise LibraryError()
def _start_and_enable_pacemaker_remote(
    env, node_list, skip_offline_nodes=False, allow_fails=False
):
    """
    Run the pacemaker_remote "start" and "enable" service actions on nodes.

    env -- source of the report processor, target factory and communicator
    node_list -- nodes the target list is built from
    skip_offline_nodes -- forwarded as skip_offline_targets to the command
    allow_fails -- forwarded as allow_fails to the command
    """
    service_cmd = ServiceAction(
        env.report_processor,
        node_communication_format.create_pcmk_remote_actions(
            ["start", "enable"]
        ),
        skip_offline_targets=skip_offline_nodes,
        allow_fails=allow_fails,
        description="start of service pacemaker_remote"
    )
    node_targets = env.get_node_target_factory().get_target_list(node_list)
    service_cmd.set_targets(node_targets)
    run_and_raise(env.get_node_communicator(), service_cmd)
def disable_sbd(lib_env, ignore_offline_nodes=False):
    """
    Disable SBD on all nodes in cluster.

    lib_env -- LibraryEnvironment
    ignore_offline_nodes -- if True, omit offline nodes
    """
    # Find out which targets are online; everything else is sent to those.
    online_cmd = GetOnlineTargets(
        lib_env.report_processor,
        ignore_offline_targets=ignore_offline_nodes,
    )
    all_targets = lib_env.get_node_target_factory().get_target_list(
        _get_cluster_nodes(lib_env)
    )
    online_cmd.set_targets(all_targets)
    online_targets = run_and_raise(
        lib_env.get_node_communicator(), online_cmd
    )

    # On a cman cluster, CheckCorosyncOffline is run before SBD is touched.
    if lib_env.is_cman_cluster:
        offline_check_cmd = CheckCorosyncOffline(
            lib_env.report_processor,
            skip_offline_targets=ignore_offline_nodes,
        )
        offline_check_cmd.set_targets(online_targets)
        run_and_raise(lib_env.get_node_communicator(), offline_check_cmd)

    watchdog_cmd = SetStonithWatchdogTimeoutToZero(lib_env.report_processor)
    watchdog_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), watchdog_cmd)

    disable_cmd = DisableSbdService(lib_env.report_processor)
    disable_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), disable_cmd)

    if lib_env.is_cman_cluster:
        return
    lib_env.report_processor.process(
        reports.cluster_restart_required_to_apply_changes()
    )
def _share_authkey(
    env,
    current_nodes,
    candidate_node_addresses,
    skip_offline_nodes=False,
    allow_incomplete_distribution=False
):
    """
    Distribute the pacemaker authkey so that the candidate node has it.

    An existing authkey is sent to the candidate node only. If there is none,
    a new one is generated and sent to the current nodes and the candidate.

    env -- source of the authkey, report processor, target factory and
        communicator
    current_nodes -- nodes which receive a newly generated authkey
    candidate_node_addresses -- the node which always receives the authkey
    skip_offline_nodes -- forwarded as skip_offline_targets to the command
    allow_incomplete_distribution -- forwarded as allow_fails to the command
    """
    if not env.pacemaker.has_authkey:
        authkey_content = generate_key()
        recipients = current_nodes + [candidate_node_addresses]
    else:
        authkey_content = env.pacemaker.get_authkey_content()
        recipients = NodeAddressesList([candidate_node_addresses])

    distribute_cmd = DistributeFiles(
        env.report_processor,
        node_communication_format.pcmk_authkey_file(authkey_content),
        skip_offline_targets=skip_offline_nodes,
        allow_fails=allow_incomplete_distribution,
        description="remote node configuration files",
    )
    recipient_targets = env.get_node_target_factory().get_target_list(
        recipients
    )
    distribute_cmd.set_targets(recipient_targets)
    run_and_raise(env.get_node_communicator(), distribute_cmd)
def remove_device(lib_env: LibraryEnvironment, skip_offline_nodes=False):
    """
    Stop using quorum device, distribute and reload configs if live

    skip_offline_nodes -- continue even if not all nodes are accessible
    """
    corosync_conf = lib_env.get_corosync_conf()
    if not corosync_conf.has_quorum_device():
        raise LibraryError(reports.qdevice_not_defined())
    model = corosync_conf.get_quorum_device_model()
    corosync_conf.remove_quorum_device()

    if lib_env.is_corosync_conf_live:
        report_processor = lib_env.report_processor
        # get nodes for communication
        node_names, node_reports = get_existing_nodes_names(
            corosync_conf,
            # Pcs is unable to communicate with nodes missing names. It cannot
            # send new corosync.conf to them. That might break the cluster.
            # Hence we error out.
            error_on_missing_name=True
        )
        if report_processor.report_list(node_reports).has_errors:
            raise LibraryError()
        node_targets = lib_env.get_node_target_factory().get_target_list(
            node_names, skip_non_existing=skip_offline_nodes
        )

        # fix quorum options for SBD to work properly
        if sbd.atb_has_to_be_enabled(lib_env.cmd_runner(), corosync_conf):
            lib_env.report_processor.report(
                reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
            )
            corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})

        # disable qdevice
        lib_env.report_processor.report(
            reports.service_disable_started("corosync-qdevice")
        )
        disable_cmd = qdevice_com.Disable(
            lib_env.report_processor, skip_offline_nodes
        )
        disable_cmd.set_targets(node_targets)
        run_and_raise(lib_env.get_node_communicator(), disable_cmd)

        # stop qdevice
        lib_env.report_processor.report(
            reports.service_stop_started("corosync-qdevice")
        )
        stop_cmd = qdevice_com.Stop(
            lib_env.report_processor, skip_offline_nodes
        )
        stop_cmd.set_targets(node_targets)
        run_and_raise(lib_env.get_node_communicator(), stop_cmd)

        # handle model specific configuration
        if model == "net":
            lib_env.report_processor.report(
                reports.qdevice_certificate_removal_started()
            )
            client_destroy_cmd = qdevice_net_com.ClientDestroy(
                lib_env.report_processor, skip_offline_nodes
            )
            client_destroy_cmd.set_targets(node_targets)
            run_and_raise(
                lib_env.get_node_communicator(), client_destroy_cmd
            )

    lib_env.push_corosync_conf(corosync_conf, skip_offline_nodes)
def disable_sbd(lib_env, ignore_offline_nodes=False):
    """
    Disable SBD on all nodes in cluster.

    lib_env -- LibraryEnvironment
    ignore_offline_nodes -- if True, omit offline nodes
    """
    node_names, node_reports = get_existing_nodes_names(
        lib_env.get_corosync_conf()
    )
    if not node_names:
        node_reports.append(reports.corosync_config_no_nodes_defined())
    lib_env.report_processor.process_list(node_reports)

    # Find out which targets are online; the following commands are sent to
    # those only.
    online_cmd = GetOnlineTargets(
        lib_env.report_processor,
        ignore_offline_targets=ignore_offline_nodes,
    )
    all_targets = lib_env.get_node_target_factory().get_target_list(
        node_names, skip_non_existing=ignore_offline_nodes
    )
    online_cmd.set_targets(all_targets)
    online_targets = run_and_raise(
        lib_env.get_node_communicator(), online_cmd
    )

    watchdog_cmd = SetStonithWatchdogTimeoutToZero(lib_env.report_processor)
    watchdog_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), watchdog_cmd)

    disable_cmd = DisableSbdService(lib_env.report_processor)
    disable_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), disable_cmd)

    lib_env.report_processor.process(
        reports.cluster_restart_required_to_apply_changes()
    )
def remove_device(lib_env, skip_offline_nodes=False):
    """
    Stop using quorum device, distribute and reload configs if live

    skip_offline_nodes -- continue even if not all nodes are accessible
    """
    __ensure_not_cman(lib_env)

    corosync_conf = lib_env.get_corosync_conf()
    if not corosync_conf.has_quorum_device():
        raise LibraryError(reports.qdevice_not_defined())
    model = corosync_conf.get_quorum_device_model()
    corosync_conf.remove_quorum_device()

    if lib_env.is_corosync_conf_live:
        node_targets = lib_env.get_node_target_factory().get_target_list(
            corosync_conf.get_nodes_names(),
            skip_non_existing=skip_offline_nodes,
        )

        # fix quorum options for SBD to work properly
        if sbd.atb_has_to_be_enabled(lib_env.cmd_runner(), corosync_conf):
            lib_env.report_processor.process(
                reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
            )
            corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})

        # disable qdevice
        lib_env.report_processor.process(
            reports.service_disable_started("corosync-qdevice")
        )
        disable_cmd = qdevice_com.Disable(
            lib_env.report_processor, skip_offline_nodes
        )
        disable_cmd.set_targets(node_targets)
        run_and_raise(lib_env.get_node_communicator(), disable_cmd)

        # stop qdevice
        lib_env.report_processor.process(
            reports.service_stop_started("corosync-qdevice")
        )
        stop_cmd = qdevice_com.Stop(
            lib_env.report_processor, skip_offline_nodes
        )
        stop_cmd.set_targets(node_targets)
        run_and_raise(lib_env.get_node_communicator(), stop_cmd)

        # handle model specific configuration
        if model == "net":
            lib_env.report_processor.process(
                reports.qdevice_certificate_removal_started()
            )
            client_destroy_cmd = qdevice_net_com.ClientDestroy(
                lib_env.report_processor, skip_offline_nodes
            )
            client_destroy_cmd.set_targets(node_targets)
            run_and_raise(
                lib_env.get_node_communicator(), client_destroy_cmd
            )

    lib_env.push_corosync_conf(corosync_conf, skip_offline_nodes)
def set_up_client_certificates(
    runner,
    reporter,
    communicator_factory,
    qnetd_target,
    cluster_name,
    cluster_nodes_target_list,
    skip_offline_nodes,
    allow_skip_offline=True,
):
    """
    setup cluster nodes for using qdevice model net

    CommandRunner runner -- command runner instance
    ReportProcessor reporter -- report processor instance
    NodeCommunicatorFactory communicator_factory -- communicator facto. instance
    Target qnetd_target -- qdevice provider (qnetd host)
    string cluster_name -- name of the cluster to which qdevice is being added
    list cluster_nodes_target_list -- list of cluster nodes targets
    bool skip_offline_nodes -- continue even if not all nodes are accessible
    bool allow_skip_offline -- enables forcing errors by skip_offline_nodes
    """
    distribution_started = (
        reports.messages.QdeviceCertificateDistributionStarted()
    )
    reporter.report(ReportItem.info(distribution_started))

    # get qnetd CA certificate
    ca_cert_cmd = qdevice_net_com.GetCaCert(reporter)
    ca_cert_cmd.set_targets([qnetd_target])
    ca_cert_response = run_and_raise(
        communicator_factory.get_communicator(), ca_cert_cmd
    )
    qnetd_ca_cert = ca_cert_response[0][1]

    # init certificate storage on all nodes
    client_setup_cmd = qdevice_net_com.ClientSetup(
        reporter, qnetd_ca_cert, skip_offline_nodes, allow_skip_offline
    )
    client_setup_cmd.set_targets(cluster_nodes_target_list)
    run_and_raise(communicator_factory.get_communicator(), client_setup_cmd)

    # create client certificate request
    cert_request = client_generate_certificate_request(runner, cluster_name)

    # sign the request on qnetd host
    sign_cmd = qdevice_net_com.SignCertificate(reporter)
    sign_cmd.add_request(qnetd_target, cert_request, cluster_name)
    sign_response = run_and_raise(
        communicator_factory.get_communicator(), sign_cmd
    )
    signed_certificate = sign_response[0][1]

    # transform the signed certificate to pk12 format which can be sent to
    # nodes
    pk12 = client_cert_request_to_pk12(runner, signed_certificate)

    # distribute final certificate to nodes
    import_cmd = qdevice_net_com.ClientImportCertificateAndKey(
        reporter, pk12, skip_offline_nodes, allow_skip_offline
    )
    import_cmd.set_targets(cluster_nodes_target_list)
    run_and_raise(communicator_factory.get_communicator(), import_cmd)
def config_text(env, name, node_name=None):
    """
    get configuration in raw format

    string name -- name of booth instance whose config should be returned
    string node_name -- get the config from specified node or local host if None
    """
    if node_name is None:
        # TODO add name support
        return env.booth.get_config_content()

    fetch_cmd = BoothGetConfig(env.report_processor, name)
    source_target = env.get_node_target_factory().get_target_from_hostname(
        node_name
    )
    fetch_cmd.set_targets([source_target])
    remote_data = run_and_raise(env.get_node_communicator(), fetch_cmd)[0][1]

    try:
        return remote_data["config"]["data"]
    except KeyError:
        # A missing key is reported as an invalid response from the node.
        raise LibraryError(reports.invalid_response_format(node_name))
def _push_corosync_conf_live(
    self,
    target_list,
    corosync_conf_data,
    need_stopped_cluster,
    need_qdevice_reload,
    skip_offline_nodes,
):
    """
    Send corosync.conf data to the targets and reload what depends on it.

    target_list -- targets all node commands in here are sent to
    corosync_conf_data -- data handed over to DistributeCorosyncConf
    need_stopped_cluster -- if True, CheckCorosyncOffline is run on the
        targets first and corosync is not reloaded afterwards
    need_qdevice_reload -- if True, qdevice is stopped and started on the
        targets once the config has been distributed
    skip_offline_nodes -- handed over to the node commands which accept it
    """
    # TODO
    # * check for online nodes and run all commands on them only
    # * if those commands fail, exit with an error
    # * add support for allow_skip_offline=False
    # * use simple report processor
    # Correct reloading is done in pcs.lib.cluster.remove_nodes for example.

    # Check if the cluster is stopped when needed
    if need_stopped_cluster:
        offline_check_cmd = CheckCorosyncOffline(
            self.report_processor, skip_offline_nodes
        )
        offline_check_cmd.set_targets(target_list)
        run_and_raise(self.get_node_communicator(), offline_check_cmd)

    # Distribute corosync.conf
    distribute_cmd = DistributeCorosyncConf(
        self.report_processor, corosync_conf_data, skip_offline_nodes
    )
    distribute_cmd.set_targets(target_list)
    run_and_raise(self.get_node_communicator(), distribute_cmd)

    # Reload corosync
    if not need_stopped_cluster:
        # If cluster must be stopped then we cannot reload corosync because
        # the cluster is stopped. If it is not stopped, we do not even get
        # here.
        reload_cmd = ReloadCorosyncConf(self.report_processor)
        reload_cmd.set_targets(target_list)
        run_and_raise(self.get_node_communicator(), reload_cmd)

    # Reload qdevice if needed
    if not need_qdevice_reload:
        return

    self.report_processor.report(
        ReportItem.info(reports.messages.QdeviceClientReloadStarted())
    )
    # Both commands go through run(), not run_and_raise(), so the start is
    # attempted even when the stop produced errors.
    stop_cmd = qdevice.Stop(self.report_processor, skip_offline_nodes)
    stop_cmd.set_targets(target_list)
    run(self.get_node_communicator(), stop_cmd)
    reload_failed = stop_cmd.has_errors

    start_cmd = qdevice.Start(self.report_processor, skip_offline_nodes)
    start_cmd.set_targets(target_list)
    run(self.get_node_communicator(), start_cmd)
    reload_failed = reload_failed or start_cmd.has_errors

    if reload_failed:
        raise LibraryError()
def _add_device_model_net(
    lib_env, qnetd_host, cluster_name, cluster_nodes, skip_offline_nodes
):
    """
    Set up cluster nodes for using qdevice model net.

    lib_env -- library environment
    string qnetd_host -- address of qdevice provider (qnetd host)
    string cluster_name -- name of the cluster to which qdevice is being
        added
    NodeAddressesList cluster_nodes -- list of cluster nodes addresses
    bool skip_offline_nodes -- continue even if not all nodes are accessible
    """
    runner = lib_env.cmd_runner()
    reporter = lib_env.report_processor
    target_factory = lib_env.get_node_target_factory()
    qnetd_target = target_factory.get_target_from_hostname(qnetd_host)
    node_targets = target_factory.get_target_list(cluster_nodes)

    reporter.process(reports.qdevice_certificate_distribution_started())

    # 1) fetch the CA certificate from the qnetd host
    ca_cert_cmd = qdevice_net_com.GetCaCert(reporter)
    ca_cert_cmd.set_targets([qnetd_target])
    ca_cert_response = run_and_raise(
        lib_env.get_node_communicator(), ca_cert_cmd
    )
    qnetd_ca_cert = ca_cert_response[0][1]

    # 2) initialize the certificate storage on every cluster node
    setup_cmd = qdevice_net_com.ClientSetup(
        reporter, qnetd_ca_cert, skip_offline_nodes
    )
    setup_cmd.set_targets(node_targets)
    run_and_raise(lib_env.get_node_communicator(), setup_cmd)

    # 3) generate the client certificate request locally
    cert_request = qdevice_net.client_generate_certificate_request(
        runner, cluster_name
    )

    # 4) have the request signed on the qnetd host
    sign_cmd = qdevice_net_com.SignCertificate(reporter)
    sign_cmd.add_request(qnetd_target, cert_request, cluster_name)
    sign_response = run_and_raise(lib_env.get_node_communicator(), sign_cmd)
    signed_certificate = sign_response[0][1]

    # 5) convert the signed certificate to pk12, the format sent to nodes
    pk12 = qdevice_net.client_cert_request_to_pk12(runner, signed_certificate)

    # 6) hand the final certificate over to the cluster nodes
    import_cmd = qdevice_net_com.ClientImportCertificateAndKey(
        reporter, pk12, skip_offline_nodes
    )
    import_cmd.set_targets(node_targets)
    run_and_raise(lib_env.get_node_communicator(), import_cmd)
def config_text(env: LibraryEnvironment, instance_name=None, node_name=None):
    """
    Get booth configuration in raw format.

    env -- library environment
    string instance_name -- booth instance name
    string node_name -- get the config from specified node or local host if
        None

    Raises LibraryError if reading the local config reported an error or if
    the response of the node lacks the "config" or "data" key.
    """
    report_processor = env.report_processor
    booth_env = env.get_booth_env(instance_name)
    instance_name = booth_env.instance_name
    # It does not make any sense for the cli to read a ghost file and send it
    # to lib so that the lib could return it unchanged to cli. Just use 'cat'.
    # When node_name is specified, using ghost files doesn't make any sense
    # either.
    _ensure_live_env(env, booth_env)

    if node_name is None:
        try:
            return booth_env.config.read_raw()
        except RawFileError as e:
            report_processor.report(raw_file_error_report(e))
        if report_processor.has_errors:
            raise LibraryError()

    com_cmd = BoothGetConfig(env.report_processor, instance_name)
    com_cmd.set_targets(
        [env.get_node_target_factory().get_target_from_hostname(node_name)]
    )
    # pylint: disable=unsubscriptable-object
    # In general, pylint is right. And it cannot know in this case code is OK.
    # It is covered by tests.
    remote_data = run_and_raise(env.get_node_communicator(), com_cmd)[0][1]
    try:
        # TODO switch to new file transfer commands (not implemented yet)
        # which send and receive configs as bytes instead of strings
        return remote_data["config"]["data"].encode("utf-8")
    except KeyError as e:
        # Chain explicitly so the missing key stays visible as the direct
        # cause of the library error.
        raise LibraryError(
            ReportItem.error(reports.messages.InvalidResponseFormat(node_name))
        ) from e
def pull_config(env, node_name):
    """
    Get config from specified node and save it on local system. It will
    rewrite existing files.

    env -- LibraryEnvironment
    node_name -- string, name of node from which config should be fetched

    Raises LibraryError if the response of the node lacks any of the
    expected "config" / "authfile" keys.
    """
    name = env.booth.name
    env.report_processor.process(
        booth_reports.booth_fetching_config_from_node_started(node_name, name)
    )
    com_cmd = BoothGetConfig(env.report_processor, name)
    com_cmd.set_targets(
        [env.get_node_target_factory().get_target_from_hostname(node_name)]
    )
    # pylint: disable=unsubscriptable-object
    # In general, pylint is right. And it cannot know in this case code is OK.
    # It is covered by tests.
    output = run_and_raise(env.get_node_communicator(), com_cmd)[0][1]
    try:
        env.booth.create_config(output["config"]["data"], True)
        # The key file is stored only when the node sent both its name and
        # its content.
        if (
            output["authfile"]["name"] is not None
            and output["authfile"]["data"]
        ):
            env.booth.set_key_path(
                os.path.join(
                    settings.booth_config_dir, output["authfile"]["name"]
                )
            )
            # The key content arrives base64 encoded.
            env.booth.create_key(
                base64.b64decode(output["authfile"]["data"].encode("utf-8")),
                True,
            )
        env.report_processor.process(
            booth_reports.booth_config_accepted_by_node(name_list=[name])
        )
    except KeyError as e:
        # Chain explicitly so the missing key stays visible as the direct
        # cause of the library error.
        raise LibraryError(reports.invalid_response_format(node_name)) from e
def config_text(env, node_name=None):
    """
    Get booth configuration in raw format.

    env -- library environment
    string node_name -- get the config from specified node or local host if
        None

    Raises LibraryError if the response of the node lacks the "config" or
    "data" key.
    """
    if node_name is None:
        # TODO add name support
        return env.booth.get_config_content()

    name = env.booth.name

    com_cmd = BoothGetConfig(env.report_processor, name)
    com_cmd.set_targets(
        [env.get_node_target_factory().get_target_from_hostname(node_name)]
    )
    # pylint: disable=unsubscriptable-object
    # In general, pylint is right. And it cannot know in this case code is OK.
    # It is covered by tests.
    remote_data = run_and_raise(env.get_node_communicator(), com_cmd)[0][1]
    try:
        return remote_data["config"]["data"]
    except KeyError as e:
        # Chain explicitly so the missing key stays visible as the direct
        # cause of the library error.
        raise LibraryError(reports.invalid_response_format(node_name)) from e
def config_text(env, node_name=None):
    """
    Get booth configuration in raw format.

    env -- library environment
    string node_name -- get the config from specified node or local host if
        None

    Raises LibraryError if the response of the node lacks the "config" or
    "data" key.
    """
    if node_name is None:
        # TODO add name support
        return env.booth.get_config_content()

    name = env.booth.name

    com_cmd = BoothGetConfig(env.report_processor, name)
    com_cmd.set_targets([
        env.get_node_target_factory().get_target_from_hostname(node_name)
    ])
    # pylint: disable=unsubscriptable-object
    # In general, pylint is right. And it cannot know in this case code is OK.
    # It is covered by tests.
    remote_data = run_and_raise(env.get_node_communicator(), com_cmd)[0][1]
    try:
        return remote_data["config"]["data"]
    except KeyError as e:
        # Chain explicitly so the missing key stays visible as the direct
        # cause of the library error.
        raise LibraryError(reports.invalid_response_format(node_name)) from e
def _push_corosync_conf_live(
    self,
    target_list,
    corosync_conf_data,
    need_stopped_cluster,
    need_qdevice_reload,
    skip_offline_nodes,
):
    """
    Send corosync.conf to the given targets and reload affected services.

    target_list -- targets the commands are run on
    corosync_conf_data -- corosync.conf content to distribute
    need_stopped_cluster -- if True, run the corosync-offline check first
        and do not reload corosync afterwards
    need_qdevice_reload -- if True, stop and start qdevice after the
        distribution
    skip_offline_nodes -- handed over to the offline check, the
        distribution and the qdevice commands

    Raises LibraryError (without reports) when the qdevice stop or start
    command ends with errors.
    """
    reporter = self.report_processor

    # Verify the cluster is stopped, if that is a precondition.
    if need_stopped_cluster:
        check_cmd = CheckCorosyncOffline(reporter, skip_offline_nodes)
        check_cmd.set_targets(target_list)
        run_and_raise(self.get_node_communicator(), check_cmd)

    # Send corosync.conf to the nodes.
    push_cmd = DistributeCorosyncConf(
        reporter, corosync_conf_data, skip_offline_nodes
    )
    push_cmd.set_targets(target_list)
    run_and_raise(self.get_node_communicator(), push_cmd)

    # Corosync is reloaded only when the cluster is allowed to run. If the
    # cluster must be stopped, there is nothing to reload: either it is
    # stopped or the check above did not let us get here.
    if not need_stopped_cluster:
        reload_cmd = ReloadCorosyncConf(reporter)
        reload_cmd.set_targets(target_list)
        run_and_raise(self.get_node_communicator(), reload_cmd)

    # Qdevice reload = stop followed by start. Start is attempted even if
    # stop reported errors; the outcome is evaluated at the very end.
    if need_qdevice_reload:
        reporter.report(
            ReportItem.info(reports.messages.QdeviceClientReloadStarted())
        )
        qdevice_stop = qdevice.Stop(reporter, skip_offline_nodes)
        qdevice_stop.set_targets(target_list)
        run(self.get_node_communicator(), qdevice_stop)
        stop_failed = qdevice_stop.has_errors

        qdevice_start = qdevice.Start(reporter, skip_offline_nodes)
        qdevice_start.set_targets(target_list)
        run(self.get_node_communicator(), qdevice_start)

        if stop_failed or qdevice_start.has_errors:
            raise LibraryError()
def _push_corosync_conf_live(
    self, target_list, corosync_conf_data, need_stopped_cluster,
    need_qdevice_reload, skip_offline_nodes
):
    """
    Send corosync.conf to the given targets and reload affected services.

    target_list -- targets the commands are run on
    corosync_conf_data -- corosync.conf content to distribute
    need_stopped_cluster -- if True, run the corosync-offline check first
        and do not reload corosync afterwards
    need_qdevice_reload -- if True, stop and start qdevice after the
        distribution
    skip_offline_nodes -- handed over to the offline check, the
        distribution and the qdevice commands

    Raises LibraryError (without reports) when the qdevice stop or start
    command collected any errors.
    """
    # Check if the cluster is stopped when needed
    if need_stopped_cluster:
        com_cmd = CheckCorosyncOffline(
            self.report_processor, skip_offline_nodes
        )
        com_cmd.set_targets(target_list)
        run_and_raise(self.get_node_communicator(), com_cmd)
    # Distribute corosync.conf
    com_cmd = DistributeCorosyncConf(
        self.report_processor, corosync_conf_data, skip_offline_nodes
    )
    com_cmd.set_targets(target_list)
    run_and_raise(self.get_node_communicator(), com_cmd)
    # Reload corosync
    if not need_stopped_cluster:
        # If cluster must be stopped then we cannot reload corosync because
        # the cluster is stopped. If it is not stopped, we do not even get
        # here.
        com_cmd = ReloadCorosyncConf(self.report_processor)
        com_cmd.set_targets(target_list)
        run_and_raise(self.get_node_communicator(), com_cmd)
    # Reload qdevice if needed
    if need_qdevice_reload:
        self.report_processor.process(
            reports.qdevice_client_reload_started()
        )
        stop_cmd = qdevice.Stop(self.report_processor, skip_offline_nodes)
        stop_cmd.set_targets(target_list)
        run(self.get_node_communicator(), stop_cmd)
        # Collect errors in a list of our own. Extending the command's
        # error_list in place would modify the Stop command's state through
        # an alias.
        error_reports = list(stop_cmd.error_list)
        start_cmd = qdevice.Start(self.report_processor, skip_offline_nodes)
        start_cmd.set_targets(target_list)
        run(self.get_node_communicator(), start_cmd)
        error_reports.extend(start_cmd.error_list)
        if error_reports:
            raise LibraryError()
def enable_sbd(
    lib_env,
    default_watchdog,
    watchdog_dict,
    sbd_options,
    default_device_list=None,
    node_device_dict=None,
    allow_unknown_opts=False,
    ignore_offline_nodes=False,
    no_watchdog_validation=False,
    allow_invalid_option_values=False,
):
    # pylint: disable=too-many-arguments, too-many-locals
    """
    Enable SBD on all nodes in cluster.

    lib_env -- LibraryEnvironment
    default_watchdog -- watchdog for nodes which are not specified in
        watchdog_dict. Uses default value from settings if None.
    watchdog_dict -- dictionary with node names as keys and watchdog path
        as value
    sbd_options -- dictionary in format: <SBD config option>: <value>
    default_device_list -- list of devices for all nodes
    node_device_dict -- dictionary with node names as keys and list of
        devices as value
    allow_unknown_opts -- if True, accept also unknown options.
    ignore_offline_nodes -- if True, omit offline nodes
    no_watchdog_validation -- if True, do not validate existence of a
        watchdog on the nodes
    allow_invalid_option_values -- if True, invalid values of some options
        will be treated as warning instead of errors
    """
    reporter = lib_env.report_processor

    # Devices are in use as soon as the caller passed either device argument.
    using_devices = (
        default_device_list is not None or node_device_dict is not None
    )
    if default_device_list is None:
        default_device_list = []
    if node_device_dict is None:
        node_device_dict = {}
    if not default_watchdog:
        default_watchdog = settings.sbd_watchdog_default
    sbd_options = {key.upper(): value for key, value in sbd_options.items()}

    corosync_conf = lib_env.get_corosync_conf()
    node_list, get_nodes_report_list = get_existing_nodes_names(corosync_conf)
    if not node_list:
        get_nodes_report_list.append(reports.corosync_config_no_nodes_defined())
    target_list = lib_env.get_node_target_factory().get_target_list(
        node_list,
        skip_non_existing=ignore_offline_nodes,
    )

    full_watchdog_dict = _get_full_target_dict(
        target_list, watchdog_dict, default_watchdog
    )
    full_device_dict = _get_full_target_dict(
        target_list, node_device_dict, default_device_list
    )

    # Nodes named in the per-node dictionaries which are not in the cluster.
    unknown_nodes = set(list(watchdog_dict) + list(node_device_dict)) - set(
        node_list
    )
    unknown_node_reports = [
        reports.node_not_found(node) for node in unknown_nodes
    ]
    watchdog_reports = _validate_watchdog_dict(full_watchdog_dict)
    if using_devices:
        device_reports = sbd.validate_nodes_devices(full_device_dict)
    else:
        device_reports = []
    option_reports = _validate_sbd_options(
        sbd_options, allow_unknown_opts, allow_invalid_option_values
    )
    reporter.process_list(
        get_nodes_report_list
        + unknown_node_reports
        + watchdog_reports
        + device_reports
        + option_reports
    )

    online_cmd = GetOnlineTargets(
        reporter,
        ignore_offline_targets=ignore_offline_nodes,
    )
    online_cmd.set_targets(target_list)
    online_targets = run_and_raise(lib_env.get_node_communicator(), online_cmd)

    # check if SBD can be enabled
    if no_watchdog_validation:
        reporter.report(reports.sbd_watchdog_validation_inactive())
    check_cmd = CheckSbd(reporter)
    for target in online_targets:
        if no_watchdog_validation:
            # Do not send watchdog if validation is turned off. Listing of
            # available watchdogs in pcsd may restart the machine in some
            # corner cases.
            watchdog_to_check = ""
        else:
            watchdog_to_check = full_watchdog_dict[target.label]
        check_cmd.add_request(
            target,
            watchdog_to_check,
            full_device_dict[target.label] if using_devices else [],
        )
    run_and_raise(lib_env.get_node_communicator(), check_cmd)

    # enable ATB if needed
    if not using_devices and sbd.atb_has_to_be_enabled_pre_enable_check(
        corosync_conf
    ):
        reporter.process(
            reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
        )
        corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})
        lib_env.push_corosync_conf(corosync_conf, ignore_offline_nodes)

    # distribute SBD configuration
    config = sbd.get_default_sbd_config()
    config.update(sbd_options)
    config_cmd = SetSbdConfig(reporter)
    for target in online_targets:
        node_config = sbd.create_sbd_config(
            config,
            target.label,
            full_watchdog_dict[target.label],
            full_device_dict[target.label],
        )
        config_cmd.add_request(target, node_config)
    run_and_raise(lib_env.get_node_communicator(), config_cmd)

    # remove cluster prop 'stonith_watchdog_timeout'
    timeout_cmd = RemoveStonithWatchdogTimeout(reporter)
    timeout_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), timeout_cmd)

    # enable SBD service on all nodes
    service_cmd = EnableSbdService(reporter)
    service_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), service_cmd)

    reporter.process(reports.cluster_restart_required_to_apply_changes())
def enable_sbd(
    lib_env, default_watchdog, watchdog_dict, sbd_options,
    default_device_list=None, node_device_dict=None, allow_unknown_opts=False,
    ignore_offline_nodes=False,
):
    """
    Enable SBD on all nodes in cluster.

    lib_env -- LibraryEnvironment
    default_watchdog -- watchdog for nodes which are not specified in
        watchdog_dict. Uses default value from settings if None.
    watchdog_dict -- dictionary with node names as keys and watchdog path
        as value
    sbd_options -- dictionary in format: <SBD config option>: <value>
    default_device_list -- list of devices for all nodes
    node_device_dict -- dictionary with node names as keys and list of
        devices as value
    allow_unknown_opts -- if True, accept also unknown options.
    ignore_offline_nodes -- if True, omit offline nodes
    """
    node_list = _get_cluster_nodes(lib_env)
    target_list = lib_env.get_node_target_factory().get_target_list(node_list)
    # Devices are in use as soon as the caller passed either device argument.
    using_devices = not (
        default_device_list is None and node_device_dict is None
    )
    if default_device_list is None:
        default_device_list = []
    if node_device_dict is None:
        node_device_dict = {}
    if not default_watchdog:
        default_watchdog = settings.sbd_watchdog_default
    # Option names are handled in upper case from here on.
    sbd_options = {opt.upper(): val for opt, val in sbd_options.items()}

    full_watchdog_dict = _get_full_target_dict(
        target_list, watchdog_dict, default_watchdog
    )
    full_device_dict = _get_full_target_dict(
        target_list, node_device_dict, default_device_list
    )

    lib_env.report_processor.process_list(
        _check_node_names_in_cluster(
            node_list,
            list(watchdog_dict.keys()) + list(node_device_dict.keys())
        )
        +
        _validate_watchdog_dict(full_watchdog_dict)
        +
        (_validate_device_dict(full_device_dict) if using_devices else [])
        +
        _validate_sbd_options(sbd_options, allow_unknown_opts)
    )

    com_cmd = GetOnlineTargets(
        lib_env.report_processor, ignore_offline_targets=ignore_offline_nodes,
    )
    com_cmd.set_targets(target_list)
    online_targets = run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # check if SBD can be enabled
    com_cmd = CheckSbd(lib_env.report_processor)
    for target in online_targets:
        com_cmd.add_request(
            target,
            full_watchdog_dict[target.label],
            full_device_dict[target.label] if using_devices else [],
        )
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # enable ATB if needed
    if not lib_env.is_cman_cluster and not using_devices:
        corosync_conf = lib_env.get_corosync_conf()
        if sbd.atb_has_to_be_enabled_pre_enable_check(corosync_conf):
            lib_env.report_processor.process(reports.sbd_requires_atb())
            corosync_conf.set_quorum_options(
                lib_env.report_processor, {"auto_tie_breaker": "1"}
            )
            lib_env.push_corosync_conf(corosync_conf, ignore_offline_nodes)

    # distribute SBD configuration
    config = sbd.get_default_sbd_config()
    config.update(sbd_options)
    com_cmd = SetSbdConfig(lib_env.report_processor)
    for target in online_targets:
        com_cmd.add_request(
            target,
            sbd.create_sbd_config(
                config,
                target.label,
                full_watchdog_dict[target.label],
                full_device_dict[target.label]
            )
        )
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # remove cluster prop 'stonith_watchdog_timeout'
    com_cmd = RemoveStonithWatchdogTimeout(lib_env.report_processor)
    com_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # enable SBD service on all nodes
    com_cmd = EnableSbdService(lib_env.report_processor)
    com_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    lib_env.report_processor.process(
        reports.cluster_restart_required_to_apply_changes()
    )
def add_device(
    lib_env,
    model,
    model_options,
    generic_options,
    heuristics_options,
    force_model=False,
    force_options=False,
    skip_offline_nodes=False,
):
    """
    Add a quorum device to a cluster, distribute and reload configs if live

    lib_env -- library environment
    string model -- quorum device model
    dict model_options -- model specific options
    dict generic_options -- generic quorum device options
    dict heuristics_options -- heuristics options
    bool force_model -- continue even if the model is not valid
    bool force_options -- continue even if options are not valid
    bool skip_offline_nodes -- continue even if not all nodes are accessible

    Raises LibraryError if a quorum device is already defined or if any
    error has been reported during validation.
    """
    cfg = lib_env.get_corosync_conf()
    if cfg.has_quorum_device():
        raise LibraryError(reports.qdevice_already_defined())

    is_live = lib_env.is_corosync_conf_live
    plain_reporter = lib_env.report_processor
    report_processor = SimpleReportProcessor(plain_reporter)

    validation_reports = corosync_conf_validators.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
        [node.nodeid for node in cfg.get_nodes()],
        force_model=force_model,
        force_options=force_options,
    )
    report_processor.report_list(validation_reports)

    if is_live:
        # Pcs is unable to communicate with nodes missing names. It cannot
        # send new corosync.conf to them. That might break the cluster.
        # Hence we error out.
        cluster_nodes_names, names_reports = get_existing_nodes_names(
            cfg, error_on_missing_name=True
        )
        report_processor.report_list(names_reports)

    if report_processor.has_errors:
        raise LibraryError()

    cfg.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
    )
    if cfg.is_quorum_device_heuristics_enabled_with_no_exec():
        plain_reporter.process(
            reports.corosync_quorum_heuristics_enabled_with_no_exec()
        )

    # Certificates for qdevice are set up before corosync.conf is sent to
    # nodes. If anything fails, nodes will not have corosync.conf with
    # qdevice in it, so there is no effect on the cluster.
    if is_live:
        target_factory = lib_env.get_node_target_factory()
        target_list = target_factory.get_target_list(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )
        # Model specific configuration. If the model is not known to pcs and
        # was forced, do not configure anything else than corosync.conf, as
        # we do not know what to do anyway.
        if model == "net":
            # The "host" key is surely there, it has been validated above.
            qnetd_target = target_factory.get_target_from_hostname(
                model_options["host"]
            )
            qdevice_net.set_up_client_certificates(
                lib_env.cmd_runner(),
                plain_reporter,
                lib_env.communicator_factory,
                qnetd_target,
                cfg.get_cluster_name(),
                target_list,
                skip_offline_nodes,
            )

        plain_reporter.process(
            reports.service_enable_started("corosync-qdevice")
        )
        enable_cmd = qdevice_com.Enable(plain_reporter, skip_offline_nodes)
        enable_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), enable_cmd)

    # everything set up, it's safe to tell the nodes to use qdevice
    lib_env.push_corosync_conf(cfg, skip_offline_nodes)

    if not is_live:
        return

    # corosync.conf has been reloaded by now, the qdevice service can start
    plain_reporter.process(reports.service_start_started("corosync-qdevice"))
    start_cmd = qdevice_com.Start(plain_reporter, skip_offline_nodes)
    start_cmd.set_targets(target_list)
    run_and_raise(lib_env.get_node_communicator(), start_cmd)
def remove_device(lib_env, skip_offline_nodes=False):
    """
    Stop using quorum device, distribute and reload configs if live

    lib_env -- library environment
    skip_offline_nodes -- continue even if not all nodes are accessible

    Raises LibraryError if no quorum device is defined or if getting names
    of cluster nodes reported an error.
    """
    cfg = lib_env.get_corosync_conf()
    if not cfg.has_quorum_device():
        raise LibraryError(reports.qdevice_not_defined())
    # The model must be read before the device is dropped from the config.
    model = cfg.get_quorum_device_model()
    cfg.remove_quorum_device()

    if not lib_env.is_corosync_conf_live:
        lib_env.push_corosync_conf(cfg, skip_offline_nodes)
        return

    reporter = lib_env.report_processor
    report_processor = SimpleReportProcessor(reporter)

    # Nodes to communicate with. Pcs is unable to communicate with nodes
    # missing names. It cannot send new corosync.conf to them. That might
    # break the cluster. Hence we error out.
    cluster_nodes_names, names_reports = get_existing_nodes_names(
        cfg, error_on_missing_name=True
    )
    report_processor.report_list(names_reports)
    if report_processor.has_errors:
        raise LibraryError()
    target_list = lib_env.get_node_target_factory().get_target_list(
        cluster_nodes_names,
        skip_non_existing=skip_offline_nodes,
    )

    # fix quorum options for SBD to work properly
    if sbd.atb_has_to_be_enabled(lib_env.cmd_runner(), cfg):
        reporter.process(
            reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
        )
        cfg.set_quorum_options({"auto_tie_breaker": "1"})

    # disable qdevice
    reporter.process(reports.service_disable_started("corosync-qdevice"))
    disable_cmd = qdevice_com.Disable(reporter, skip_offline_nodes)
    disable_cmd.set_targets(target_list)
    run_and_raise(lib_env.get_node_communicator(), disable_cmd)

    # stop qdevice
    reporter.process(reports.service_stop_started("corosync-qdevice"))
    stop_cmd = qdevice_com.Stop(reporter, skip_offline_nodes)
    stop_cmd.set_targets(target_list)
    run_and_raise(lib_env.get_node_communicator(), stop_cmd)

    # handle model specific configuration
    if model == "net":
        reporter.process(reports.qdevice_certificate_removal_started())
        destroy_cmd = qdevice_net_com.ClientDestroy(
            reporter, skip_offline_nodes
        )
        destroy_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), destroy_cmd)

    lib_env.push_corosync_conf(cfg, skip_offline_nodes)
def enable_sbd(
    lib_env,
    default_watchdog,
    watchdog_dict,
    sbd_options,
    default_device_list=None,
    node_device_dict=None,
    allow_unknown_opts=False,
    ignore_offline_nodes=False,
    no_watchdog_validation=False,
    allow_invalid_option_values=False,
):
    # pylint: disable=too-many-arguments, too-many-locals
    """
    Enable SBD on all nodes in cluster.

    lib_env -- LibraryEnvironment
    default_watchdog -- watchdog for nodes which are not specified in
        watchdog_dict. Uses default value from settings if None.
    watchdog_dict -- dictionary with node names as keys and watchdog path
        as value
    sbd_options -- dictionary in format: <SBD config option>: <value>
    default_device_list -- list of devices for all nodes
    node_device_dict -- dictionary with node names as keys and list of
        devices as value
    allow_unknown_opts -- if True, accept also unknown options.
    ignore_offline_nodes -- if True, omit offline nodes
    no_watchdog_validation -- if True, do not validate existence of a
        watchdog on the nodes
    allow_invalid_option_values -- if True, invalid values of some options
        will be treated as warning instead of errors

    Raises LibraryError if the validation reported any error.
    """
    reporter = lib_env.report_processor

    # Devices are in use as soon as the caller passed either device argument.
    using_devices = (
        default_device_list is not None or node_device_dict is not None
    )
    if default_device_list is None:
        default_device_list = []
    if node_device_dict is None:
        node_device_dict = {}
    if not default_watchdog:
        default_watchdog = settings.sbd_watchdog_default
    sbd_options = {key.upper(): value for key, value in sbd_options.items()}

    corosync_conf = lib_env.get_corosync_conf()
    node_list, get_nodes_report_list = get_existing_nodes_names(corosync_conf)
    if not node_list:
        get_nodes_report_list.append(reports.corosync_config_no_nodes_defined())
    target_list = lib_env.get_node_target_factory().get_target_list(
        node_list,
        skip_non_existing=ignore_offline_nodes,
    )

    full_watchdog_dict = _get_full_target_dict(
        target_list, watchdog_dict, default_watchdog
    )
    full_device_dict = _get_full_target_dict(
        target_list, node_device_dict, default_device_list
    )

    # Nodes named in the per-node dictionaries which are not in the cluster.
    unknown_nodes = set(list(watchdog_dict) + list(node_device_dict)) - set(
        node_list
    )
    unknown_node_reports = [
        reports.node_not_found(node) for node in unknown_nodes
    ]
    watchdog_reports = _validate_watchdog_dict(full_watchdog_dict)
    if using_devices:
        device_reports = sbd.validate_nodes_devices(full_device_dict)
    else:
        device_reports = []
    option_reports = _validate_sbd_options(
        sbd_options, allow_unknown_opts, allow_invalid_option_values
    )
    validation_outcome = reporter.report_list(
        get_nodes_report_list
        + unknown_node_reports
        + watchdog_reports
        + device_reports
        + option_reports
    )
    if validation_outcome.has_errors:
        raise LibraryError()

    online_cmd = GetOnlineTargets(
        reporter,
        ignore_offline_targets=ignore_offline_nodes,
    )
    online_cmd.set_targets(target_list)
    online_targets = run_and_raise(lib_env.get_node_communicator(), online_cmd)

    # check if SBD can be enabled
    if no_watchdog_validation:
        reporter.report(reports.sbd_watchdog_validation_inactive())
    check_cmd = CheckSbd(reporter)
    for target in online_targets:
        if no_watchdog_validation:
            # Do not send watchdog if validation is turned off. Listing of
            # available watchdogs in pcsd may restart the machine in some
            # corner cases.
            watchdog_to_check = ""
        else:
            watchdog_to_check = full_watchdog_dict[target.label]
        check_cmd.add_request(
            target,
            watchdog_to_check,
            full_device_dict[target.label] if using_devices else [],
        )
    run_and_raise(lib_env.get_node_communicator(), check_cmd)

    # enable ATB if needed
    if not using_devices and sbd.atb_has_to_be_enabled_pre_enable_check(
        corosync_conf
    ):
        reporter.report(
            reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
        )
        corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})
        lib_env.push_corosync_conf(corosync_conf, ignore_offline_nodes)

    # distribute SBD configuration
    config = sbd.get_default_sbd_config()
    config.update(sbd_options)
    config_cmd = SetSbdConfig(reporter)
    for target in online_targets:
        node_config = sbd.create_sbd_config(
            config,
            target.label,
            full_watchdog_dict[target.label],
            full_device_dict[target.label],
        )
        config_cmd.add_request(target, node_config)
    run_and_raise(lib_env.get_node_communicator(), config_cmd)

    # remove cluster prop 'stonith_watchdog_timeout'
    timeout_cmd = RemoveStonithWatchdogTimeout(reporter)
    timeout_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), timeout_cmd)

    # enable SBD service on all nodes
    service_cmd = EnableSbdService(reporter)
    service_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), service_cmd)

    reporter.report(reports.cluster_restart_required_to_apply_changes())
def _prepare_pacemaker_remote_environment(
    env,
    report_processor,
    existing_nodes_target_list,
    new_node_target,
    new_node_name,
    skip_offline_nodes,
    allow_incomplete_distribution,
    allow_fails,
):
    """
    Check a new node, share pacemaker authkey and start pacemaker_remote.

    env -- library environment
    report_processor -- processor all reports are sent to
    existing_nodes_target_list -- targets of nodes already in the cluster
    new_node_target -- target of the new node; if falsy, no node is
        contacted
    new_node_name -- name of the new node, used in the skip reports
    skip_offline_nodes -- if True, go on even if the new node is not online
    allow_incomplete_distribution -- passed as allow_fails to the authkey
        distribution
    allow_fails -- passed as allow_fails to the start / enable action

    Raises LibraryError if the new node is not online and
    skip_offline_nodes is False, or if checking the new node reported an
    error.
    """
    online_new_target_list = []
    if new_node_target:
        online_cmd = GetOnlineTargets(
            report_processor,
            ignore_offline_targets=skip_offline_nodes,
        )
        online_cmd.set_targets([new_node_target])
        online_new_target_list = run_com(
            env.get_node_communicator(), online_cmd
        )
        if not (online_new_target_list or skip_offline_nodes):
            raise LibraryError()

    # check new nodes
    if not online_new_target_list:
        report_processor.report_list(
            _reports_skip_new_node(new_node_name, "unreachable")
        )
    else:
        host_info_cmd = GetHostInfo(report_processor)
        host_info_cmd.set_targets(online_new_target_list)
        host_info = run_com(env.get_node_communicator(), host_info_cmd)
        report_processor.report_list(_host_check_remote_node(host_info))
        if report_processor.has_errors:
            raise LibraryError()

    # share pacemaker authkey
    if env.pacemaker.has_authkey:
        # An existing key only has to be delivered to the new node.
        authkey_content = env.pacemaker.get_authkey_content()
        authkey_targets = online_new_target_list
    else:
        # A new key is generated and sent to all nodes.
        authkey_content = generate_binary_key(
            random_bytes_count=settings.pacemaker_authkey_bytes
        )
        authkey_targets = existing_nodes_target_list + online_new_target_list
    if authkey_targets:
        distribute_cmd = DistributeFiles(
            report_processor,
            node_communication_format.pcmk_authkey_file(authkey_content),
            skip_offline_targets=skip_offline_nodes,
            allow_fails=allow_incomplete_distribution,
        )
        distribute_cmd.set_targets(authkey_targets)
        run_and_raise(env.get_node_communicator(), distribute_cmd)

    # start and enable pacemaker_remote
    if online_new_target_list:
        service_actions = node_communication_format.create_pcmk_remote_actions(
            ["start", "enable"]
        )
        service_cmd = ServiceAction(
            report_processor,
            service_actions,
            allow_fails=allow_fails,
        )
        service_cmd.set_targets(online_new_target_list)
        run_and_raise(env.get_node_communicator(), service_cmd)
def set_recovery_site(env: LibraryEnvironment, node_name: str) -> None:
    """
    Set up disaster recovery with the local cluster being the primary site

    env -- library environment
    node_name -- a known host from the recovery site

    Raises LibraryError if the environment is not live (ghost files are in
    use) or if any of the validation steps reported an error.
    """
    if env.ghost_file_codes:
        raise LibraryError(
            reports.live_environment_required(env.ghost_file_codes)
        )

    report_processor = SimpleReportProcessor(env.report_processor)
    dr_env = env.get_dr_env()
    if dr_env.config.raw_file.exists():
        report_processor.report(reports.dr_config_already_exist())

    target_factory = env.get_node_target_factory()

    local_nodes, report_list = get_existing_nodes_names(
        env.get_corosync_conf(), error_on_missing_name=True
    )
    report_processor.report_list(report_list)

    if node_name in local_nodes:
        report_processor.report(reports.node_in_local_cluster(node_name))

    report_list, local_targets = target_factory.get_target_list_with_reports(
        local_nodes, allow_skip=False, report_none_host_found=False
    )
    report_processor.report_list(report_list)

    report_list, remote_targets = target_factory.get_target_list_with_reports(
        [node_name], allow_skip=False, report_none_host_found=False
    )
    report_processor.report_list(report_list)

    # All checks above are evaluated together before any node is contacted.
    if report_processor.has_errors:
        raise LibraryError()

    # Read corosync.conf of the remote cluster to learn its nodes.
    corosync_conf_cmd = GetCorosyncConf(env.report_processor)
    corosync_conf_cmd.set_targets(remote_targets)
    remote_corosync_conf = run_and_raise(
        env.get_node_communicator(), corosync_conf_cmd
    )
    remote_cluster_nodes, report_list = get_existing_nodes_names(
        CorosyncConfigFacade.from_string(remote_corosync_conf),
        error_on_missing_name=True,
    )
    if report_processor.report_list(report_list):
        raise LibraryError()

    # ensure we have tokens for all nodes of remote cluster
    report_list, remote_targets = target_factory.get_target_list_with_reports(
        remote_cluster_nodes, allow_skip=False, report_none_host_found=False
    )
    if report_processor.report_list(report_list):
        raise LibraryError()

    dr_config_exporter = get_file_toolbox(
        file_type_codes.PCS_DR_CONFIG
    ).exporter

    # create dr config for remote cluster
    remote_dr_cfg = dr_env.create_facade(DrRole.RECOVERY)
    remote_dr_cfg.add_site(DrRole.PRIMARY, local_nodes)

    # send config to all nodes of remote cluster
    remote_dr_file = node_communication_format.pcs_dr_config_file(
        dr_config_exporter.export(remote_dr_cfg.config)
    )
    distribute_remote_cmd = DistributeFilesWithoutForces(
        env.report_processor, remote_dr_file
    )
    distribute_remote_cmd.set_targets(remote_targets)
    run_and_raise(env.get_node_communicator(), distribute_remote_cmd)

    # create new dr config, with local cluster as primary site
    local_dr_cfg = dr_env.create_facade(DrRole.PRIMARY)
    local_dr_cfg.add_site(DrRole.RECOVERY, remote_cluster_nodes)
    local_dr_file = node_communication_format.pcs_dr_config_file(
        dr_config_exporter.export(local_dr_cfg.config)
    )
    distribute_local_cmd = DistributeFilesWithoutForces(
        env.report_processor, local_dr_file
    )
    distribute_local_cmd.set_targets(local_targets)
    run_and_raise(env.get_node_communicator(), distribute_local_cmd)
def add_device(
    lib_env, model, model_options, generic_options, heuristics_options,
    force_model=False, force_options=False, skip_offline_nodes=False
):
    """
    Add a quorum device to a cluster, distribute and reload configs if live

    Raises LibraryError if a quorum device is already defined in corosync.conf.

    string model -- quorum device model
    dict model_options -- model specific options
    dict generic_options -- generic quorum device options
    dict heuristics_options -- heuristics options
    bool force_model -- continue even if the model is not valid
    bool force_options -- continue even if options are not valid
    bool skip_offline_nodes -- continue even if not all nodes are accessible
    """
    processor = lib_env.report_processor
    corosync_conf = lib_env.get_corosync_conf()
    if corosync_conf.has_quorum_device():
        raise LibraryError(reports.qdevice_already_defined())

    node_ids = [node.nodeid for node in corosync_conf.get_nodes()]
    validation_reports = corosync_conf_validators.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
        node_ids,
        force_model=force_model,
        force_options=force_options,
    )
    processor.process_list(validation_reports)

    corosync_conf.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
    )
    if corosync_conf.is_quorum_device_heuristics_enabled_with_no_exec():
        processor.process(
            reports.corosync_quorum_heuristics_enabled_with_no_exec()
        )

    # Certificates for qdevice are set up before corosync.conf is sent to the
    # nodes. Should anything fail, no node has a corosync.conf with qdevice in
    # it, so the cluster is not affected.
    if lib_env.is_corosync_conf_live:
        target_factory = lib_env.get_node_target_factory()
        target_list = target_factory.get_target_list(
            corosync_conf.get_nodes_names(),
            skip_non_existing=skip_offline_nodes,
        )
        # Model specific configuration. For a model unknown to pcs which was
        # forced, nothing but corosync.conf is configured, as we do not know
        # what else to do.
        if model == "net":
            runner = lib_env.cmd_runner()
            communicator_factory = lib_env.communicator_factory
            # The "host" key is sure to be present, it has been validated
            # above.
            qnetd_target = target_factory.get_target_from_hostname(
                model_options["host"]
            )
            cluster_name = corosync_conf.get_cluster_name()
            qdevice_net.set_up_client_certificates(
                runner,
                processor,
                communicator_factory,
                qnetd_target,
                cluster_name,
                target_list,
                skip_offline_nodes,
            )

        processor.process(reports.service_enable_started("corosync-qdevice"))
        enable_cmd = qdevice_com.Enable(processor, skip_offline_nodes)
        enable_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), enable_cmd)

    # everything set up, it's safe to tell the nodes to use qdevice
    lib_env.push_corosync_conf(corosync_conf, skip_offline_nodes)

    # The qdevice service can be started only now, once corosync.conf has been
    # reloaded.
    if lib_env.is_corosync_conf_live:
        processor.process(reports.service_start_started("corosync-qdevice"))
        start_cmd = qdevice_com.Start(processor, skip_offline_nodes)
        start_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), start_cmd)