def pull_config(env: LibraryEnvironment, node_name, instance_name=None):
    """
    Fetch a booth config (and its authfile, if the node sent one) from a node
    and store it on the local system, overwriting existing files.

    env -- provides all for communication with externals
    string node_name -- name of the node from which the config should be
        fetched
    string instance_name -- booth instance name

    Raises LibraryError when the node response lacks expected keys, when the
    received authfile name does not pass validation or when writing failed.
    """
    report_processor = env.report_processor
    booth_env = env.get_booth_env(instance_name)
    instance_name = booth_env.instance_name
    _ensure_live_env(env, booth_env)
    conf_dir = os.path.dirname(booth_env.config_path)

    report_processor.report(
        ReportItem.info(
            reports.messages.BoothFetchingConfigFromNode(
                node_name,
                config=instance_name,
            )
        )
    )
    source_target = env.get_node_target_factory().get_target_from_hostname(
        node_name
    )
    com_cmd = BoothGetConfig(report_processor, instance_name)
    com_cmd.set_targets([source_target])

    # pylint: disable=unsubscriptable-object
    # In general, pylint is right. And it cannot know in this case code is OK.
    # It is covered by tests.
    output = run_and_raise(env.get_node_communicator(), com_cmd)[0][1]
    try:
        # TODO adapt to new file transfer framework once it is written
        authfile = output["authfile"]
        authfile_name = authfile["name"]
        if authfile_name is not None and authfile["data"]:
            name_reports = config_validators.check_instance_name(authfile_name)
            if name_reports:
                raise LibraryError(*name_reports)
            # the key travels base64 encoded inside a text response
            key_bytes = base64.b64decode(authfile["data"].encode("utf-8"))
            FileInstance.for_booth_key(authfile_name).write_raw(
                key_bytes,
                can_overwrite=True,
            )
        config_bytes = output["config"]["data"].encode("utf-8")
        booth_env.config.write_raw(config_bytes, can_overwrite=True)
        report_processor.report(
            ReportItem.info(
                reports.messages.BoothConfigAcceptedByNode(
                    name_list=[instance_name]
                )
            )
        )
    except RawFileError as e:
        # A missing config directory gets its own, more specific report
        if os.path.exists(conf_dir):
            report_processor.report(raw_file_error_report(e))
        else:
            report_processor.report(
                ReportItem.error(reports.messages.BoothPathNotExists(conf_dir))
            )
    except KeyError as e:
        raise LibraryError(
            ReportItem.error(reports.messages.InvalidResponseFormat(node_name))
        ) from e

    if report_processor.has_errors:
        raise LibraryError()
def _unfencing_scsi_devices(
    env: LibraryEnvironment,
    stonith_el: _Element,
    original_devices: Iterable[str],
    updated_devices: Iterable[str],
    force_flags: Container[reports.types.ForceCode] = (),
) -> None:
    """
    Unfence scsi devices added by an update if it is possible to connect to
    pcsd and corosync is running. Nothing is done when the update adds no
    device.

    env -- provides all for communication with externals
    stonith_el -- stonith resource element; its "type" attribute decides
        whether fence_mpath or fence_scsi unfencing is used
    original_devices -- devices before update
    updated_devices -- devices after update
    force_flags -- list of flags codes

    Raises LibraryError when cluster node names or their targets cannot be
    obtained without errors.
    """
    # The arguments are only promised to be Iterable. Materialize them once so
    # that a single-pass iterator is not exhausted by the set difference below
    # and then handed over empty to the unfence commands.
    original_device_list = list(original_devices)
    updated_device_list = list(updated_devices)

    devices_to_unfence = set(updated_device_list) - set(original_device_list)
    if not devices_to_unfence:
        return

    cluster_nodes_names, nodes_report_list = get_existing_nodes_names(
        env.get_corosync_conf(),
        error_on_missing_name=True,
    )
    env.report_processor.report_list(nodes_report_list)
    (
        target_report_list,
        cluster_nodes_target_list,
    ) = env.get_node_target_factory().get_target_list_with_reports(
        cluster_nodes_names,
        allow_skip=False,
    )
    env.report_processor.report_list(target_report_list)
    if env.report_processor.has_errors:
        raise LibraryError()

    # Find out on which nodes corosync is running; only those get unfenced
    com_cmd: AllSameDataMixin = GetCorosyncOnlineTargets(
        env.report_processor,
        skip_offline_targets=reports.codes.SKIP_OFFLINE_NODES in force_flags,
    )
    com_cmd.set_targets(cluster_nodes_target_list)
    online_corosync_target_list = run_and_raise(
        env.get_node_communicator(), com_cmd
    )

    if stonith_el.get("type") == "fence_mpath":
        com_cmd = UnfenceMpath(
            env.report_processor,
            original_devices=sorted(original_device_list),
            updated_devices=sorted(updated_device_list),
            node_key_map=resource.stonith.get_node_key_map_for_mpath(
                stonith_el,
                [target.label for target in online_corosync_target_list],
            ),
        )
    else:  # fence_scsi
        com_cmd = Unfence(
            env.report_processor,
            original_devices=sorted(original_device_list),
            updated_devices=sorted(updated_device_list),
        )
    com_cmd.set_targets(online_corosync_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)
def synchronize_ssl_certificate(env: LibraryEnvironment, skip_offline=False):
    """
    Send the local pcsd SSL cert and key to all full nodes in the local
    cluster.

    Consider the pcs Web UI is accessed via an IP running as a resource in the
    cluster. When the IP is moved, the user's browser connects to the new node
    and we want it to get the same certificate to make the transition a
    seamless experience (otherwise the browser displays a warning that the
    certificate has changed).
    Using pcsd Web UI on remote and guest nodes is not supported (pcs/pcsd
    depends on the corosync.conf file being present on the local node) so we
    send the cert only to corosync (== full stack) nodes.

    env -- provides all for communication with externals
    skip_offline -- passed to the target factory as skip_non_existing
    """
    report_processor = env.report_processor
    target_factory = env.get_node_target_factory()

    cluster_nodes_names, report_list = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    if not cluster_nodes_names:
        report_list.append(reports.corosync_config_no_nodes_defined())
    report_processor.report_list(report_list)

    # Read the certificate first, then the key; a failed read is reported and
    # leaves None in place of the content.
    pem_sources = (
        (file_type_codes.PCSD_SSL_CERT, settings.pcsd_cert_location),
        (file_type_codes.PCSD_SSL_KEY, settings.pcsd_key_location),
    )
    pem_contents = []
    for type_code, pem_path in pem_sources:
        content = None
        try:
            with open(pem_path, "r") as pem_file:
                content = pem_file.read()
        except EnvironmentError as e:
            report_processor.report(
                reports.file_io_error(
                    type_code,
                    RawFileError.ACTION_READ,
                    format_environment_error(e),
                    file_path=pem_path,
                )
            )
        pem_contents.append(content)
    ssl_cert, ssl_key = pem_contents

    (
        target_report_list,
        target_list,
    ) = target_factory.get_target_list_with_reports(
        cluster_nodes_names, skip_non_existing=skip_offline
    )
    report_processor.report_list(target_report_list)

    if report_processor.has_errors:
        raise LibraryError()

    target_labels = [target.label for target in target_list]
    report_processor.report(
        reports.pcsd_ssl_cert_and_key_distribution_started(target_labels)
    )

    com_cmd = SendPcsdSslCertAndKey(report_processor, ssl_cert, ssl_key)
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)
def remove_device(lib_env: LibraryEnvironment, skip_offline_nodes=False):
    """
    Stop using quorum device: drop it from corosync.conf, and when working
    with a live corosync.conf also disable and stop corosync-qdevice on
    cluster nodes before the new config is pushed.

    lib_env -- provides all for communication with externals
    skip_offline_nodes -- continue even if not all nodes are accessible

    Raises LibraryError when no quorum device is defined or when node names
    cannot be obtained without errors.
    """
    corosync_conf = lib_env.get_corosync_conf()
    if not corosync_conf.has_quorum_device():
        raise LibraryError(reports.qdevice_not_defined())
    # remember the model before the device disappears from the config
    qdevice_model = corosync_conf.get_quorum_device_model()
    corosync_conf.remove_quorum_device()

    if lib_env.is_corosync_conf_live:
        reporter = lib_env.report_processor

        def _run_on_targets(com_cmd, targets):
            # send one prepared command to the given targets
            com_cmd.set_targets(targets)
            run_and_raise(lib_env.get_node_communicator(), com_cmd)

        # get nodes for communication
        # Pcs is unable to communicate with nodes missing names. It cannot
        # send new corosync.conf to them. That might break the cluster.
        # Hence we error out.
        cluster_nodes_names, report_list = get_existing_nodes_names(
            corosync_conf,
            error_on_missing_name=True,
        )
        if reporter.report_list(report_list).has_errors:
            raise LibraryError()
        target_list = lib_env.get_node_target_factory().get_target_list(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )

        # fix quorum options for SBD to work properly
        if sbd.atb_has_to_be_enabled(lib_env.cmd_runner(), corosync_conf):
            reporter.report(
                reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
            )
            corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})

        # disable qdevice
        reporter.report(reports.service_disable_started("corosync-qdevice"))
        _run_on_targets(
            qdevice_com.Disable(reporter, skip_offline_nodes), target_list
        )

        # stop qdevice
        reporter.report(reports.service_stop_started("corosync-qdevice"))
        _run_on_targets(
            qdevice_com.Stop(reporter, skip_offline_nodes), target_list
        )

        # handle model specific configuration
        if qdevice_model == "net":
            reporter.report(reports.qdevice_certificate_removal_started())
            _run_on_targets(
                qdevice_net_com.ClientDestroy(reporter, skip_offline_nodes),
                target_list,
            )

    lib_env.push_corosync_conf(corosync_conf, skip_offline_nodes)
def config_sync(
    env: LibraryEnvironment,
    instance_name=None,
    skip_offline_nodes=False,
):
    """
    Send specified local booth configuration, together with its authfile, to
    all nodes in the local cluster.

    env -- provides all for communication with externals
    string instance_name -- booth instance name
    skip_offline_nodes -- if True offline nodes will be skipped

    Raises LibraryError when the CIB is not live or when any error has been
    reported while loading node names or booth files.
    """
    report_processor = env.report_processor
    booth_env = env.get_booth_env(instance_name)
    if not env.is_cib_live:
        raise LibraryError(
            reports.live_environment_required([file_type_codes.CIB])
        )

    cluster_nodes_names, report_list = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    if not cluster_nodes_names:
        report_list.append(reports.corosync_config_no_nodes_defined())
    report_processor.report_list(report_list)

    try:
        booth_conf_data = booth_env.config.read_raw()
        booth_conf = booth_env.config.raw_to_facade(booth_conf_data)
        if not isinstance(booth_env.config.raw_file, GhostFile):
            (
                authfile_name,
                authfile_data,
                authfile_report_list,
            ) = config_files.get_authfile_name_and_data(booth_conf)
            report_processor.report_list(authfile_report_list)
        else:
            # With a ghost config the key comes from the booth environment
            # and only its file name is derived from the config.
            authfile_data = booth_env.key.read_raw()
            authfile_path = booth_conf.get_authfile()
            if authfile_path:
                authfile_name = os.path.basename(authfile_path)
            else:
                authfile_name = None
    except RawFileError as e:
        report_processor.report(raw_file_error_report(e))
    except ParserErrorException as e:
        report_processor.report_list(
            booth_env.config.parser_exception_to_report_list(e)
        )
    if report_processor.has_errors:
        raise LibraryError()

    com_cmd = BoothSendConfig(
        report_processor,
        booth_env.instance_name,
        booth_conf_data,
        authfile=authfile_name,
        authfile_data=authfile_data,
        skip_offline_targets=skip_offline_nodes,
    )
    target_list = env.get_node_target_factory().get_target_list(
        cluster_nodes_names,
        skip_non_existing=skip_offline_nodes,
    )
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)
def destroy(
    env: LibraryEnvironment,
    force_flags: Container[reports.types.ForceCode] = (),
) -> None:
    """
    Destroy disaster-recovery configuration on all sites

    env -- provides all for communication with externals
    force_flags -- list of flags codes; SKIP_OFFLINE_NODES is passed to the
        target factory as skip_non_existing

    Raises LibraryError when not running in a live environment or when the
    dr config, node names or targets cannot be loaded without errors.
    """
    if env.ghost_file_codes:
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentRequired(env.ghost_file_codes)
            )
        )

    report_processor = env.report_processor
    skip_offline = report_codes.SKIP_OFFLINE_NODES in force_flags

    def _report_or_raise(item_list):
        # report the items and stop as soon as any error has been reported
        report_processor.report_list(item_list)
        if report_processor.has_errors:
            raise LibraryError()

    report_list, dr_config = _load_dr_config(env.get_dr_env().config)
    _report_or_raise(report_list)

    local_nodes, report_list = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    _report_or_raise(report_list)

    remote_nodes: List[str] = [
        remote_node_name
        for remote_site in dr_config.get_remote_site_list()
        for remote_node_name in remote_site.node_name_list
    ]

    report_list, targets = (
        env.get_node_target_factory().get_target_list_with_reports(
            remote_nodes + local_nodes,
            skip_non_existing=skip_offline,
        )
    )
    _report_or_raise(report_list)

    files_to_remove = {
        "pcs disaster-recovery config": {
            "type": "pcs_disaster_recovery_conf",
        },
    }
    com_cmd = RemoveFilesWithoutForces(report_processor, files_to_remove)
    com_cmd.set_targets(targets)
    run_and_raise(env.get_node_communicator(), com_cmd)
def config_text(env: LibraryEnvironment, instance_name=None, node_name=None):
    """
    Get booth configuration in raw format (bytes for a remote config, the
    result of the config file's read_raw() for the local one).

    env -- provides all for communication with externals
    string instance_name -- booth instance name
    string node_name -- get the config from specified node or local host if
        None

    Raises LibraryError when the local config cannot be read or when the
    remote node response does not contain the config data.
    """
    report_processor = env.report_processor
    booth_env = env.get_booth_env(instance_name)
    instance_name = booth_env.instance_name
    # It does not make any sense for the cli to read a ghost file and send it
    # to lib so that the lib could return it unchanged to cli. Just use 'cat'.
    # When node_name is specified, using ghost files doesn't make any sense
    # either.
    _ensure_live_env(env, booth_env)

    if node_name is None:
        try:
            return booth_env.config.read_raw()
        except RawFileError as e:
            report_processor.report(raw_file_error_report(e))
        if report_processor.has_errors:
            raise LibraryError()

    com_cmd = BoothGetConfig(env.report_processor, instance_name)
    com_cmd.set_targets(
        [env.get_node_target_factory().get_target_from_hostname(node_name)]
    )
    # pylint: disable=unsubscriptable-object
    # In general, pylint is right. And it cannot know in this case code is OK.
    # It is covered by tests.
    remote_data = run_and_raise(env.get_node_communicator(), com_cmd)[0][1]
    try:
        # TODO switch to new file transfer commands (not implemented yet)
        # which send and receive configs as bytes instead of strings
        return remote_data["config"]["data"].encode("utf-8")
    except KeyError as e:
        # Chain the original KeyError so the missing key stays visible in
        # tracebacks instead of being replaced by the LibraryError.
        raise LibraryError(
            ReportItem.error(reports.messages.InvalidResponseFormat(node_name))
        ) from e
def full_cluster_status_plaintext(
    env: LibraryEnvironment,
    hide_inactive_resources: bool = False,
    verbose: bool = False,
) -> str:
    """
    Build the full cluster status and return it as one plaintext string

    env -- LibraryEnvironment
    hide_inactive_resources -- if True, do not display non-running resources
    verbose -- if True, display more info (tickets and, when live with a
        corosync.conf available, pcsd status of nodes)

    Raises LibraryError when only one of CIB and corosync.conf is live or
    when an error has been reported while gathering the data.
    """
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-statements

    # validation: CIB and corosync.conf must be both live or both not live
    cib_live = env.is_cib_live
    corosync_live = env.is_corosync_conf_live
    if bool(cib_live) != bool(corosync_live):
        if corosync_live:
            inconsistent_args = (
                [file_type_codes.CIB],
                [file_type_codes.COROSYNC_CONF],
            )
        else:
            inconsistent_args = (
                [file_type_codes.COROSYNC_CONF],
                [file_type_codes.CIB],
            )
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentNotConsistent(
                    *inconsistent_args
                )
            )
        )

    # initialization
    runner = env.cmd_runner()
    report_processor = env.report_processor
    live = cib_live and corosync_live
    is_sbd_running = False

    # load status, cib, corosync.conf
    status_text, warning_list = get_cluster_status_text(
        runner, hide_inactive_resources, verbose
    )
    # If we are live on a remote node, we have no corosync.conf.
    # TODO Use the new file framework so the path is not exposed.
    if not live or os.path.exists(settings.corosync_conf_file):
        corosync_conf = env.get_corosync_conf()
    else:
        corosync_conf = None
    cib = env.get_cib()
    if verbose:
        (
            ticket_status_text,
            ticket_status_stderr,
            ticket_status_retval,
        ) = get_ticket_status_text(runner)

    # get extra info if live
    show_pcsd_status = bool(live and verbose and corosync_conf)
    if live:
        try:
            is_sbd_running = is_service_running(runner, get_sbd_service_name())
        except LibraryError:
            # keep the default: sbd is considered not running
            pass
        local_services_status = _get_local_services_status(runner)
        if show_pcsd_status:
            node_name_list, node_names_report_list = get_existing_nodes_names(
                corosync_conf
            )
            report_processor.report_list(node_names_report_list)
            node_reachability = _get_node_reachability(
                env.get_node_target_factory(),
                env.get_node_communicator(),
                report_processor,
                node_name_list,
            )

    # check stonith configuration
    warning_list = list(warning_list)
    warning_list.extend(_stonith_warnings(cib, is_sbd_running))

    # put it all together
    if report_processor.has_errors:
        raise LibraryError()

    if corosync_conf:
        cluster_name = corosync_conf.get_cluster_name()
    else:
        # no corosync.conf at hand, read the name from the CIB
        cluster_name = nvpair.get_value(
            "cluster_property_set", get_crm_config(cib), "cluster-name", ""
        )

    parts = [f"Cluster name: {cluster_name}"]
    if warning_list:
        parts.extend(["", "WARNINGS:", *warning_list, ""])
    parts.append(status_text)

    if verbose:
        parts.extend(["", "Tickets:"])
        if ticket_status_retval == 0:
            parts.extend(indent(ticket_status_text.splitlines()))
        else:
            ticket_warning_parts = [
                "WARNING: Unable to get information about tickets"
            ]
            if ticket_status_stderr:
                ticket_warning_parts.extend(
                    indent(ticket_status_stderr.splitlines())
                )
            parts.extend(indent(ticket_warning_parts))

    if live:
        if show_pcsd_status:
            parts.extend(["", "PCSD Status:"])
            parts.extend(
                indent(
                    _format_node_reachability(node_name_list, node_reachability)
                )
            )
        parts.extend(["", "Daemon Status:"])
        parts.extend(
            indent(_format_local_services_status(local_services_status))
        )

    return "\n".join(parts)
def set_recovery_site(env: LibraryEnvironment, node_name: str) -> None:
    """
    Set up disaster recovery with the local cluster being the primary site

    env -- provides all for communication with externals
    node_name -- a known host from the recovery site

    Raises LibraryError when not running in a live environment or when any
    validation or target lookup reports an error.
    """
    if env.ghost_file_codes:
        raise LibraryError(
            reports.live_environment_required(env.ghost_file_codes)
        )
    report_processor = SimpleReportProcessor(env.report_processor)
    dr_env = env.get_dr_env()
    if dr_env.config.raw_file.exists():
        report_processor.report(reports.dr_config_already_exist())
    target_factory = env.get_node_target_factory()

    def _strict_targets(node_names):
        # targets for all given nodes, no skipping allowed
        return target_factory.get_target_list_with_reports(
            node_names,
            allow_skip=False,
            report_none_host_found=False,
        )

    def _send_dr_config(dr_cfg, targets):
        # export the given dr config and distribute it to the targets
        distribute_file_cmd = DistributeFilesWithoutForces(
            env.report_processor,
            node_communication_format.pcs_dr_config_file(
                dr_config_exporter.export(dr_cfg.config)
            ),
        )
        distribute_file_cmd.set_targets(targets)
        run_and_raise(env.get_node_communicator(), distribute_file_cmd)

    local_nodes, report_list = get_existing_nodes_names(
        env.get_corosync_conf(),
        error_on_missing_name=True,
    )
    report_processor.report_list(report_list)

    if node_name in local_nodes:
        report_processor.report(reports.node_in_local_cluster(node_name))

    report_list, local_targets = _strict_targets(local_nodes)
    report_processor.report_list(report_list)

    report_list, remote_targets = _strict_targets([node_name])
    report_processor.report_list(report_list)

    if report_processor.has_errors:
        raise LibraryError()

    # ask the given recovery site node for its corosync.conf to learn all
    # nodes of the remote cluster
    com_cmd = GetCorosyncConf(env.report_processor)
    com_cmd.set_targets(remote_targets)
    remote_corosync_conf = CorosyncConfigFacade.from_string(
        run_and_raise(env.get_node_communicator(), com_cmd)
    )
    remote_cluster_nodes, report_list = get_existing_nodes_names(
        remote_corosync_conf,
        error_on_missing_name=True,
    )
    if report_processor.report_list(report_list):
        raise LibraryError()

    # ensure we have tokens for all nodes of remote cluster
    report_list, remote_targets = _strict_targets(remote_cluster_nodes)
    if report_processor.report_list(report_list):
        raise LibraryError()

    dr_config_exporter = get_file_toolbox(
        file_type_codes.PCS_DR_CONFIG
    ).exporter

    # create dr config for remote cluster
    remote_dr_cfg = dr_env.create_facade(DrRole.RECOVERY)
    remote_dr_cfg.add_site(DrRole.PRIMARY, local_nodes)
    # send config to all node of remote cluster
    _send_dr_config(remote_dr_cfg, remote_targets)

    # create new dr config, with local cluster as primary site
    local_dr_cfg = dr_env.create_facade(DrRole.PRIMARY)
    local_dr_cfg.add_site(DrRole.RECOVERY, remote_cluster_nodes)
    _send_dr_config(local_dr_cfg, local_targets)
def status_all_sites_plaintext(
    env: LibraryEnvironment,
    hide_inactive_resources: bool = False,
    verbose: bool = False,
) -> List[Mapping[str, Any]]:
    """
    Return local site's and all remote sites' status as plaintext, one
    DrSiteStatusDto converted to a dict per site, the local site first

    env -- LibraryEnvironment
    hide_inactive_resources -- if True, do not display non-running resources
    verbose -- if True, display more info

    Raises LibraryError when not running in a live environment or when
    loading the dr config, node names or targets reports an error.
    """
    # The command does not provide an option to skip offline / unreachable /
    # misbehaving nodes.
    # The point of such skipping is to stop a command if it is unable to make
    # changes on all nodes. The user can then decide to proceed anyway and
    # make changes on the skipped nodes later manually.
    # This command only reads from nodes so it automatically asks other nodes
    # if one is offline / misbehaving.
    class SiteData:
        local: bool
        role: DrRole
        target_list: Iterable[RequestTarget]
        status_loaded: bool
        status_plaintext: str

        def __init__(self, local, role, target_list):
            self.local = local
            self.role = role
            self.target_list = target_list
            # filled in once the site has been asked for its status
            self.status_loaded = False
            self.status_plaintext = ""

    if env.ghost_file_codes:
        raise LibraryError(
            reports.live_environment_required(env.ghost_file_codes)
        )

    report_processor = SimpleReportProcessor(env.report_processor)
    report_list, dr_config = _load_dr_config(env.get_dr_env().config)
    report_processor.report_list(report_list)
    if report_processor.has_errors:
        raise LibraryError()

    site_data_list = []
    target_factory = env.get_node_target_factory()

    def _targets_of(node_names):
        # look up targets of the nodes and report whatever the lookup found
        lookup_reports, targets = target_factory.get_target_list_with_reports(
            node_names,
            skip_non_existing=True,
        )
        report_processor.report_list(lookup_reports)
        return targets

    # get local nodes
    local_nodes, report_list = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    report_processor.report_list(report_list)
    local_targets = _targets_of(local_nodes)
    site_data_list.append(SiteData(True, dr_config.local_role, local_targets))

    # get remote sites' nodes
    for conf_remote_site in dr_config.get_remote_site_list():
        remote_targets = _targets_of(conf_remote_site.node_name_list)
        site_data_list.append(
            SiteData(False, conf_remote_site.role, remote_targets)
        )
    if report_processor.has_errors:
        raise LibraryError()

    # get all statuses
    for site_data in site_data_list:
        com_cmd = GetFullClusterStatusPlaintext(
            report_processor,
            hide_inactive_resources=hide_inactive_resources,
            verbose=verbose,
        )
        com_cmd.set_targets(site_data.target_list)
        loaded, plaintext = run_com_cmd(env.get_node_communicator(), com_cmd)
        site_data.status_loaded = loaded
        site_data.status_plaintext = plaintext

    site_status_list = []
    for site_data in site_data_list:
        site_dto = DrSiteStatusDto(
            site_data.local,
            site_data.role,
            site_data.status_plaintext,
            site_data.status_loaded,
        )
        site_status_list.append(site_dto.to_dict())
    return site_status_list
def node_add_guest(
    env: LibraryEnvironment,
    node_name,
    resource_id,
    options,
    skip_offline_nodes=False,
    allow_incomplete_distribution=False,
    allow_pacemaker_remote_service_fail=False,
    wait: WaitType = False,
):
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-statements
    """
    Turn the specified resource into a guest node

    LibraryEnvironment env -- provides all for communication with externals
    string node_name -- name of the guest node
    string resource_id -- specifies resource that should become a guest node
    dict options -- guest node options (remote-port, remote-addr,
        remote-connect-timeout); a missing or None remote-addr is filled in
        here, in the passed dict
    bool skip_offline_nodes -- if True, ignore when some nodes are offline
    bool allow_incomplete_distribution -- if True, allow this command to
        finish successfully even if file distribution did not succeed
    bool allow_pacemaker_remote_service_fail -- if True, allow this command to
        finish successfully even if starting/enabling pacemaker_remote did not
        succeed
    mixed wait -- a flag for controlling waiting for pacemaker idle mechanism
    """
    wait_timeout = env.ensure_wait_satisfiable(wait)
    report_processor = env.report_processor
    cib = env.get_cib()
    id_provider = IdProvider(cib)
    live_cib = env.is_cib_live

    corosync_conf: Optional[CorosyncConfigFacade] = None
    if live_cib:
        corosync_conf = env.get_corosync_conf()
    else:
        report_processor.report(
            ReportItem.info(
                reports.messages.CorosyncNodeConflictCheckSkipped(
                    reports.const.REASON_NOT_LIVE_CIB,
                )
            )
        )
    (
        existing_nodes_names,
        existing_nodes_addrs,
        report_list,
    ) = get_existing_nodes_names_addrs(corosync_conf, cib)
    if live_cib:
        # With a CIB which is not live we have just reported that corosync
        # checks are going to be skipped, so errors related to corosync nodes
        # are reported only here.
        report_processor.report_list(report_list)

    existing_target_list = []
    new_target = None
    if live_cib:
        existing_target_list, new_target_list = _get_targets_for_add(
            env.get_node_target_factory(),
            report_processor,
            existing_nodes_names,
            [node_name],
            skip_offline_nodes,
        )
        if new_target_list:
            new_target = new_target_list[0]

    # default remote-addr to an address from known-hosts
    if options.get("remote-addr") is None:
        if live_cib:
            address_known = bool(new_target)
            known_address = new_target.first_addr if new_target else None
        else:
            known_hosts = env.get_known_hosts([node_name])
            address_known = bool(known_hosts)
            known_address = known_hosts[0].dest.addr if known_hosts else None
        if address_known:
            new_addr = known_address
            new_addr_source = reports.const.DEFAULT_ADDRESS_SOURCE_KNOWN_HOSTS
        else:
            # nothing known about the host, fall back to its name
            new_addr = node_name
            new_addr_source = reports.const.DEFAULT_ADDRESS_SOURCE_HOST_NAME
        options["remote-addr"] = new_addr
        report_processor.report(
            ReportItem.info(
                reports.messages.UsingDefaultAddressForHost(
                    node_name, new_addr, new_addr_source
                )
            )
        )

    # validate inputs
    report_list = guest_node.validate_set_as_guest(
        cib,
        existing_nodes_names,
        existing_nodes_addrs,
        node_name,
        options,
    )
    searcher = ElementSearcher(primitive.TAG, resource_id, get_resources(cib))
    if not searcher.element_found():
        report_list.extend(searcher.get_errors())
    else:
        resource_element = searcher.get_element()
        report_list.extend(guest_node.validate_is_not_guest(resource_element))

    report_processor.report_list(report_list)
    if report_processor.has_errors:
        raise LibraryError()

    # everything validated, let's set it up
    guest_node.set_as_guest(
        resource_element,
        id_provider,
        node_name,
        options.get("remote-addr", None),
        options.get("remote-port", None),
        options.get("remote-connect-timeout", None),
    )

    if live_cib:
        _prepare_pacemaker_remote_environment(
            env,
            report_processor,
            existing_target_list,
            new_target,
            node_name,
            skip_offline_nodes,
            allow_incomplete_distribution,
            allow_pacemaker_remote_service_fail,
        )
    else:
        report_processor.report_list(
            _reports_skip_new_node(node_name, "not_live_cib")
        )

    env.push_cib(wait_timeout=wait_timeout)
    if wait_timeout >= 0:
        _ensure_resource_running(env, resource_id)
def node_add_remote(
    env: LibraryEnvironment,
    node_name: str,
    node_addr: Optional[str],
    operations: Iterable[Mapping[str, str]],
    meta_attributes: Mapping[str, str],
    instance_attributes: Mapping[str, str],
    skip_offline_nodes: bool = False,
    allow_incomplete_distribution: bool = False,
    allow_pacemaker_remote_service_fail: bool = False,
    allow_invalid_operation: bool = False,
    allow_invalid_instance_attributes: bool = False,
    use_default_operations: bool = True,
    wait: WaitType = False,
):
    # pylint: disable=too-many-arguments
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-statements
    """
    Create an ocf:pacemaker:remote resource and use it as a remote node

    env -- provides all for communication with externals
    node_name -- the name of the new node
    node_addr -- the address of the new node or None for default
    operations -- attributes for each entered operation
    meta_attributes -- attributes for primitive/meta_attributes
    instance_attributes -- attributes for primitive/instance_attributes
    skip_offline_nodes -- if True, ignore when some nodes are offline
    allow_incomplete_distribution -- if True, allow this command to
        finish successfully even if file distribution did not succeed
    allow_pacemaker_remote_service_fail -- if True, allow this command to
        finish successfully even if starting/enabling pacemaker_remote did not
        succeed
    allow_invalid_operation -- if True, allow to use operations that
        are not listed in a resource agent metadata
    allow_invalid_instance_attributes -- if True, allow to use instance
        attributes that are not listed in a resource agent metadata and allow
        to omit required instance_attributes
    use_default_operations -- if True, add operations specified in
        a resource agent metadata to the resource
    wait -- a flag for controlling waiting for pacemaker idle mechanism
    """
    wait_timeout = env.ensure_wait_satisfiable(wait)
    report_processor = env.report_processor
    cib = env.get_cib(
        minimal_version=get_required_cib_version_for_primitive(operations)
    )
    id_provider = IdProvider(cib)
    live_cib = env.is_cib_live

    corosync_conf: Optional[CorosyncConfigFacade] = None
    if live_cib:
        corosync_conf = env.get_corosync_conf()
    else:
        report_processor.report(
            ReportItem.info(
                reports.messages.CorosyncNodeConflictCheckSkipped(
                    reports.const.REASON_NOT_LIVE_CIB,
                )
            )
        )
    (
        existing_nodes_names,
        existing_nodes_addrs,
        report_list,
    ) = get_existing_nodes_names_addrs(corosync_conf, cib)
    if live_cib:
        # With a CIB which is not live we have just reported that corosync
        # checks are going to be skipped, so errors related to corosync nodes
        # are reported only here.
        report_processor.report_list(report_list)

    try:
        agent_factory = ResourceAgentFacadeFactory(
            env.cmd_runner(), report_processor
        )
        resource_agent_facade = agent_factory.facade_from_parsed_name(
            remote_node.AGENT_NAME
        )
    except ResourceAgentError as e:
        report_processor.report(resource_agent_error_to_report_item(e))
        raise LibraryError() from e

    existing_target_list = []
    new_target = None
    if live_cib:
        existing_target_list, new_target_list = _get_targets_for_add(
            env.get_node_target_factory(),
            report_processor,
            existing_nodes_names,
            [node_name],
            skip_offline_nodes,
        )
        if new_target_list:
            new_target = new_target_list[0]

    # default node_addr to an address from known-hosts
    if node_addr is None:
        if live_cib:
            address_known = bool(new_target)
            known_address = new_target.first_addr if new_target else None
        else:
            known_hosts = env.get_known_hosts([node_name])
            address_known = bool(known_hosts)
            known_address = known_hosts[0].dest.addr if known_hosts else None
        if address_known:
            node_addr = known_address
            node_addr_source = reports.const.DEFAULT_ADDRESS_SOURCE_KNOWN_HOSTS
        else:
            # nothing known about the host, fall back to its name
            node_addr = node_name
            node_addr_source = reports.const.DEFAULT_ADDRESS_SOURCE_HOST_NAME
        report_processor.report(
            ReportItem.info(
                reports.messages.UsingDefaultAddressForHost(
                    node_name, node_addr, node_addr_source
                )
            )
        )

    # validate inputs
    report_list = remote_node.validate_create(
        existing_nodes_names,
        existing_nodes_addrs,
        resource_agent_facade.metadata,
        node_name,
        node_addr,
        instance_attributes,
    )
    if report_processor.report_list(report_list).has_errors:
        raise LibraryError()

    # validation + cib setup
    # TODO extract the validation to a separate function
    try:
        remote_resource_element = remote_node.create(
            env.report_processor,
            resource_agent_facade,
            get_resources(cib),
            id_provider,
            node_addr,
            node_name,
            operations,
            meta_attributes,
            instance_attributes,
            allow_invalid_operation,
            allow_invalid_instance_attributes,
            use_default_operations,
        )
    except LibraryError as e:
        # Check unique id conflict with check against nodes. Until validation
        # resource create is not separated, we need to make unique post
        # validation.
        duplicate_prone_codes = (
            reports.codes.ID_ALREADY_EXISTS,
            reports.codes.RESOURCE_INSTANCE_ATTR_VALUE_NOT_UNIQUE,
        )
        already_exists = []
        unified_report_list = []
        for report_item in report_list + list(e.args):
            # pylint: disable=no-member
            dto_obj = report_item.message.to_dto()
            if dto_obj.code not in duplicate_prone_codes:
                unified_report_list.append(report_item)
                continue
            # keep only the first report for each conflicting id; a report
            # without an id in its payload is dropped
            if (
                "id" in dto_obj.payload
                and dto_obj.payload["id"] not in already_exists
            ):
                unified_report_list.append(report_item)
                already_exists.append(dto_obj.payload["id"])
        report_list = unified_report_list

    # NOTE(review): when create() succeeds, report_list still holds the items
    # from validate_create which were already reported above -- confirm that
    # reporting them again here is intended.
    report_processor.report_list(report_list)
    if report_processor.has_errors:
        raise LibraryError()

    # everything validated, let's set it up
    if live_cib:
        _prepare_pacemaker_remote_environment(
            env,
            report_processor,
            existing_target_list,
            new_target,
            node_name,
            skip_offline_nodes,
            allow_incomplete_distribution,
            allow_pacemaker_remote_service_fail,
        )
    else:
        report_processor.report_list(
            _reports_skip_new_node(node_name, "not_live_cib")
        )

    env.push_cib(wait_timeout=wait_timeout)
    if wait_timeout >= 0:
        _ensure_resource_running(env, remote_resource_element.attrib["id"])
def full_cluster_status_plaintext(
    env: LibraryEnvironment,
    hide_inactive_resources: bool = False,
    verbose: bool = False,
) -> str:
    """
    Build the full cluster status report and return it as one string

    The lines of the report are joined with a newline. The report always
    starts with the cluster name, followed by an optional WARNINGS section
    and the cluster status text. Ticket, PCSD and daemon sections are added
    depending on verbose and on whether the environment is live.

    env -- LibraryEnvironment
    hide_inactive_resources -- if True, do not display non-running resources
    verbose -- if True, display more info

    Raises LibraryError when exactly one of CIB / corosync.conf is live or
    when the report processor has collected errors.
    """
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-locals

    # validation: CIB and corosync.conf have to be either both live or both
    # not live, a mixed environment is rejected
    cib_live = env.is_cib_live
    corosync_live = env.is_corosync_conf_live
    if corosync_live and not cib_live:
        raise LibraryError(
            reports.live_environment_not_consistent(
                [file_type_codes.CIB],
                [file_type_codes.COROSYNC_CONF],
            )
        )
    if cib_live and not corosync_live:
        raise LibraryError(
            reports.live_environment_not_consistent(
                [file_type_codes.COROSYNC_CONF],
                [file_type_codes.CIB],
            )
        )

    # initialization
    cmd_runner = env.cmd_runner()
    reporter = SimpleReportProcessor(env.report_processor)
    is_live = cib_live and corosync_live
    sbd_running = False

    # load status, cib, corosync.conf
    status_text, status_warnings = get_cluster_status_text(
        cmd_runner, hide_inactive_resources, verbose
    )
    corosync_conf = env.get_corosync_conf()
    cib = env.get_cib()
    if verbose:
        tickets_stdout, tickets_stderr, tickets_retval = (
            get_ticket_status_text(cmd_runner)
        )

    # get extra info if live
    if is_live:
        try:
            sbd_running = is_service_running(
                cmd_runner, get_sbd_service_name()
            )
        except LibraryError:
            # a failed check leaves the flag at its default (False)
            pass
        services_status = _get_local_services_status(cmd_runner)
        if verbose:
            node_names, node_names_reports = get_existing_nodes_names(
                corosync_conf
            )
            reporter.report_list(node_names_reports)
            reachability = _get_node_reachability(
                env.get_node_target_factory(),
                env.get_node_communicator(),
                reporter,
                node_names,
            )

    # check stonith configuration
    all_warnings = [
        *status_warnings,
        *_stonith_warnings(cib, sbd_running),
    ]

    # put it all together
    if reporter.has_errors:
        raise LibraryError()

    lines = [f"Cluster name: {corosync_conf.get_cluster_name()}"]
    if all_warnings:
        lines += ["", "WARNINGS:", *all_warnings, ""]
    lines.append(status_text)
    if verbose:
        lines += ["", "Tickets:"]
        if tickets_retval == 0:
            lines.extend(indent(tickets_stdout.splitlines()))
        else:
            ticket_problem = [
                "WARNING: Unable to get information about tickets"
            ]
            if tickets_stderr:
                ticket_problem.extend(indent(tickets_stderr.splitlines()))
            lines.extend(indent(ticket_problem))
    if is_live and verbose:
        lines += ["", "PCSD Status:"]
        lines.extend(
            indent(_format_node_reachability(node_names, reachability))
        )
    if is_live:
        lines += ["", "Daemon Status:"]
        lines.extend(indent(_format_local_services_status(services_status)))
    return "\n".join(lines)
def add_device(
    lib_env: LibraryEnvironment,
    model,
    model_options,
    generic_options,
    heuristics_options,
    force_model=False,
    force_options=False,
    skip_offline_nodes=False,
):
    # pylint: disable=too-many-locals
    """
    Define a quorum device in corosync.conf; when corosync.conf is live also
    set the device up on the cluster nodes and push the new config to them

    string model -- quorum device model
    dict model_options -- model specific options
    dict generic_options -- generic quorum device options
    dict heuristics_options -- heuristics options
    bool force_model -- continue even if the model is not valid
    bool force_options -- continue even if options are not valid
    bool skip_offline_nodes -- continue even if not all nodes are accessible

    Raises LibraryError if a quorum device is already defined or if any
    error has been reported during validation.
    """
    corosync_conf = lib_env.get_corosync_conf()
    if corosync_conf.has_quorum_device():
        raise LibraryError(
            ReportItem.error(reports.messages.QdeviceAlreadyDefined())
        )

    reporter = lib_env.report_processor
    known_node_ids = [node.nodeid for node in corosync_conf.get_nodes()]
    validation_reports = corosync_conf_validators.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
        known_node_ids,
        force_model=force_model,
        force_options=force_options,
    )
    reporter.report_list(validation_reports)

    is_live = lib_env.is_corosync_conf_live
    if is_live:
        # Pcs is unable to communicate with nodes missing names. It cannot
        # send new corosync.conf to them. That might break the cluster.
        # Hence we error out.
        node_names, node_names_reports = get_existing_nodes_names(
            corosync_conf,
            error_on_missing_name=True,
        )
        reporter.report_list(node_names_reports)

    if reporter.has_errors:
        raise LibraryError()

    corosync_conf.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
    )
    if corosync_conf.is_quorum_device_heuristics_enabled_with_no_exec():
        reporter.report(
            ReportItem.warning(
                reports.messages.CorosyncQuorumHeuristicsEnabledWithNoExec()
            )
        )

    # First setup certificates for qdevice, then send corosync.conf to nodes.
    # If anything fails, nodes will not have corosync.conf with qdevice in it,
    # so there is no effect on the cluster.
    if is_live:
        target_factory = lib_env.get_node_target_factory()
        node_targets = target_factory.get_target_list(
            node_names,
            skip_non_existing=skip_offline_nodes,
        )
        # Do model specific configuration.
        # If the model is not known to pcs and was forced, do not configure
        # anything else than corosync.conf, as we do not know what to do
        # anyway.
        if model == "net":
            qdevice_net.set_up_client_certificates(
                lib_env.cmd_runner(),
                reporter,
                lib_env.communicator_factory,
                # We are sure the "host" key is there, it has been validated
                # above.
                target_factory.get_target_from_hostname(
                    model_options["host"]
                ),
                corosync_conf.get_cluster_name(),
                node_targets,
                skip_offline_nodes,
            )

        reporter.report(
            ReportItem.info(
                reports.messages.ServiceActionStarted(
                    reports.const.SERVICE_ACTION_ENABLE, "corosync-qdevice"
                )
            )
        )
        enable_cmd = qdevice_com.Enable(reporter, skip_offline_nodes)
        enable_cmd.set_targets(node_targets)
        run_and_raise(lib_env.get_node_communicator(), enable_cmd)

    # everything set up, it's safe to tell the nodes to use qdevice
    lib_env.push_corosync_conf(corosync_conf, skip_offline_nodes)

    if not is_live:
        return

    # Now, when corosync.conf has been reloaded, we can start qdevice service.
    reporter.report(
        ReportItem.info(
            reports.messages.ServiceActionStarted(
                reports.const.SERVICE_ACTION_START, "corosync-qdevice"
            )
        )
    )
    start_cmd = qdevice_com.Start(reporter, skip_offline_nodes)
    start_cmd.set_targets(node_targets)
    run_and_raise(lib_env.get_node_communicator(), start_cmd)
def update_scsi_devices(
    env: LibraryEnvironment,
    stonith_id: str,
    set_device_list: Iterable[str],
    force_flags: Container[reports.types.ForceCode] = (),
) -> None:
    """
    Update scsi fencing devices without restart and affecting other resources.

    env -- provides all for communication with externals
    stonith_id -- id of stonith resource
    set_device_list -- paths to the scsi devices that would be set for stonith
        resource; any iterable is accepted, it is read exactly once
    force_flags -- list of flags codes

    Raises LibraryError when getting resource digests is not supported, when
    the device list is empty, when validation of the stonith resource
    reports errors or when node names / targets cannot be resolved.
    """
    if not is_getting_resource_digest_supported(env.cmd_runner()):
        raise LibraryError(
            ReportItem.error(
                reports.messages.StonithRestartlessUpdateOfScsiDevicesNotSupported()
            )
        )
    cib = env.get_cib()
    # The devices are needed three times below (emptiness check, CIB update,
    # unfencing). Materialize them once so that a one-shot iterator is not
    # seen as non-empty by the truth test and then found exhausted later.
    device_list = list(set_device_list)
    if not device_list:
        env.report_processor.report(
            ReportItem.error(
                reports.messages.InvalidOptionValue(
                    "devices", "", None, cannot_be_empty=True
                )
            )
        )
    (
        stonith_el,
        report_list,
    ) = stonith.validate_stonith_restartless_update(cib, stonith_id)
    # has_errors also covers the empty device list reported above
    if env.report_processor.report_list(report_list).has_errors:
        raise LibraryError()
    # for mypy, this should not happen because exception would be raised
    if stonith_el is None:
        raise AssertionError("stonith element is None")

    stonith.update_scsi_devices_without_restart(
        env.cmd_runner(),
        env.get_cluster_state(),
        stonith_el,
        IdProvider(cib),
        device_list,
    )

    # Unfencing
    cluster_nodes_names, nodes_report_list = get_existing_nodes_names(
        env.get_corosync_conf(),
        error_on_missing_name=True,
    )
    env.report_processor.report_list(nodes_report_list)
    (
        target_report_list,
        cluster_nodes_target_list,
    ) = env.get_node_target_factory().get_target_list_with_reports(
        cluster_nodes_names,
        allow_skip=False,
    )
    env.report_processor.report_list(target_report_list)
    if env.report_processor.has_errors:
        raise LibraryError()
    # Ask the nodes which of them are online in corosync; only those are
    # unfenced below.
    com_cmd: AllSameDataMixin = GetCorosyncOnlineTargets(
        env.report_processor,
        skip_offline_targets=reports.codes.SKIP_OFFLINE_NODES in force_flags,
    )
    com_cmd.set_targets(cluster_nodes_target_list)
    online_corosync_target_list = run_and_raise(
        env.get_node_communicator(), com_cmd
    )
    com_cmd = Unfence(env.report_processor, sorted(device_list))
    com_cmd.set_targets(online_corosync_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)
    # The CIB is pushed only after unfencing has finished.
    env.push_cib()
def set_recovery_site(env: LibraryEnvironment, node_name: str) -> None:
    """
    Set up disaster recovery with the local cluster being the primary site

    env
    node_name -- a known host from the recovery site

    Raises LibraryError when the environment is not live, when any
    validation error has been reported or when the corosync.conf obtained
    from the recovery site cannot be parsed.
    """
    # pylint: disable=too-many-locals
    if env.ghost_file_codes:
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentRequired(env.ghost_file_codes)
            )
        )
    report_processor = env.report_processor
    dr_env = env.get_dr_env()
    if dr_env.config.raw_file.exists():
        report_processor.report(
            ReportItem.error(reports.messages.DrConfigAlreadyExist())
        )
    target_factory = env.get_node_target_factory()

    local_nodes, report_list = get_existing_nodes_names(
        env.get_corosync_conf(), error_on_missing_name=True
    )
    report_processor.report_list(report_list)

    if node_name in local_nodes:
        report_processor.report(
            ReportItem.error(reports.messages.NodeInLocalCluster(node_name))
        )

    report_list, local_targets = target_factory.get_target_list_with_reports(
        local_nodes, allow_skip=False, report_none_host_found=False
    )
    report_processor.report_list(report_list)

    report_list, remote_targets = target_factory.get_target_list_with_reports(
        [node_name], allow_skip=False, report_none_host_found=False
    )
    report_processor.report_list(report_list)

    # All checks above only collect reports, so that every problem is
    # reported before giving up.
    if report_processor.has_errors:
        raise LibraryError()

    # TODO The new file framework doesn't support network communication yet.
    com_cmd = GetCorosyncConf(env.report_processor)
    com_cmd.set_targets(remote_targets)
    corosync_conf_instance = FileInstance.for_corosync_conf()
    try:
        remote_cluster_nodes, report_list = get_existing_nodes_names(
            cast(
                CorosyncConfigFacade,
                corosync_conf_instance.raw_to_facade(
                    run_and_raise(
                        env.get_node_communicator(), com_cmd
                    ).encode("utf-8")
                ),
            ),
            error_on_missing_name=True,
        )
    except ParserErrorException as e:
        report_processor.report_list(
            corosync_conf_instance.toolbox.parser.exception_to_report_list(
                e,
                file_type_codes.COROSYNC_CONF,
                None,
                force_code=None,
                is_forced_or_warning=False,
            )
        )
        # The assignment in the try block did not happen: report_list still
        # holds the already reported target lookup reports and
        # remote_cluster_nodes is not bound. Stop here instead of falling
        # through and reporting the stale list a second time.
        raise LibraryError() from e

    if report_processor.report_list(report_list).has_errors:
        raise LibraryError()

    # ensure we have tokens for all nodes of remote cluster
    report_list, remote_targets = target_factory.get_target_list_with_reports(
        remote_cluster_nodes, allow_skip=False, report_none_host_found=False
    )
    if report_processor.report_list(report_list).has_errors:
        raise LibraryError()
    dr_config_exporter = get_file_toolbox(
        file_type_codes.PCS_DR_CONFIG
    ).exporter
    # create dr config for remote cluster
    remote_dr_cfg = dr_env.create_facade(DrRole.RECOVERY)
    remote_dr_cfg.add_site(DrRole.PRIMARY, local_nodes)
    # send config to all node of remote cluster
    distribute_file_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.pcs_dr_config_file(
            dr_config_exporter.export(remote_dr_cfg.config)
        ),
    )
    distribute_file_cmd.set_targets(remote_targets)
    run_and_raise(env.get_node_communicator(), distribute_file_cmd)
    # create new dr config, with local cluster as primary site
    local_dr_cfg = dr_env.create_facade(DrRole.PRIMARY)
    local_dr_cfg.add_site(DrRole.RECOVERY, remote_cluster_nodes)
    distribute_file_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.pcs_dr_config_file(
            dr_config_exporter.export(local_dr_cfg.config)
        ),
    )
    distribute_file_cmd.set_targets(local_targets)
    run_and_raise(env.get_node_communicator(), distribute_file_cmd)