def _unfence_node_devices(
    env: LibraryEnvironment,
    plug: str,
    original_devices: Iterable[str],
    updated_devices: Iterable[str],
    fence_agent: str,
):
    """
    Unfence shared devices by calling fence agent script. Only newly added
    devices will be unfenced (set(updated_devices) - set(original_devices)).
    Before unfencing, devices present both before and after the update are
    checked. If any of them is reported as fenced, unfencing is skipped and
    an info report is emitted instead.

    env -- provides communication with externals
    plug -- an information used for unfencing (a node name for fence_scsi,
        registration key for fence_mpath)
    original_devices -- list of devices defined before update
    updated_devices -- list of devices defined after update
    fence_agent -- fence agent name

    Raise LibraryError if the status check or the unfencing command fails.
    """
    # Materialize both iterables exactly once. They are typed as Iterable, so
    # a one-shot iterator (e.g. a generator) would be exhausted by the first
    # set() call and the status check below would silently see no devices.
    updated_set = set(updated_devices)
    original_set = set(original_devices)

    devices_to_unfence = updated_set - original_set
    if not devices_to_unfence:
        return
    fence_agent_bin = os.path.join(settings.fence_agent_binaries, fence_agent)
    fenced_devices = []
    # do not check devices being removed
    for device in sorted(original_set & updated_set):
        stdout, stderr, return_code = env.cmd_runner().run(
            [
                fence_agent_bin,
                "--action=status",
                f"--devices={device}",
                f"--plug={plug}",
            ]
        )
        # return code 2 is handled as "the device is fenced", any other
        # non-zero code is a failure of the status check itself
        if return_code == 2:
            fenced_devices.append(device)
        elif return_code != 0:
            raise LibraryError(
                reports.ReportItem.error(
                    reports.messages.StonithUnfencingDeviceStatusFailed(
                        device, join_multilines([stderr, stdout])
                    )
                )
            )
    if fenced_devices:
        # At least one of existing devices is off, which means the node has
        # been fenced and new devices should not be unfenced.
        env.report_processor.report(
            reports.ReportItem.info(
                reports.messages.StonithUnfencingSkippedDevicesFenced(
                    fenced_devices
                )
            )
        )
        return
    stdout, stderr, return_code = env.cmd_runner().run(
        [
            fence_agent_bin,
            "--action=on",
            "--devices",
            ",".join(sorted(devices_to_unfence)),
            f"--plug={plug}",
        ],
    )
    if return_code != 0:
        raise LibraryError(
            reports.ReportItem.error(
                reports.messages.StonithUnfencingFailed(
                    join_multilines([stderr, stdout])
                )
            )
        )
def resource_cleanup(
    runner: CommandRunner,
    resource: Optional[str] = None,
    node: Optional[str] = None,
    operation: Optional[str] = None,
    interval: Optional[str] = None,
    strict: bool = False,
):
    """
    Run "crm_resource --cleanup" and return its joined stdout and stderr.

    runner -- object used for running the external command
    resource -- if set, passed to the command as --resource
    node -- if set, passed to the command as --node
    operation -- if set, passed to the command as --operation
    interval -- if set, passed to the command as --interval
    strict -- if True, --force is added to the command

    Raise LibraryError if the command exits with a non-zero code.
    """
    command = [__exec("crm_resource"), "--cleanup"]
    optional_args = (
        ("--resource", resource),
        ("--node", node),
        ("--operation", operation),
        ("--interval", interval),
    )
    for option, value in optional_args:
        if value:
            command += [option, value]
    if strict:
        command.append("--force")

    out, err, exit_code = runner.run(command)
    if exit_code == 0:
        # useful output (what has been done) goes to stderr
        return join_multilines([out, err])
    raise LibraryError(
        ReportItem.error(
            reports.messages.ResourceCleanupError(
                join_multilines([err, out]), resource, node
            )
        )
    )
def resource_refresh(
    runner: CommandRunner,
    resource: Optional[str] = None,
    node: Optional[str] = None,
    strict: bool = False,
    force: bool = False,
):
    """
    Run "crm_resource --refresh" and return its joined stdout and stderr.

    When neither a resource nor a node is specified and force is not set, the
    number of nodes times the number of resources (taken from the cluster
    status summary) is compared to a threshold; exceeding it is reported as
    a forceable error instead of running the command.

    runner -- object used for running the external command
    resource -- if set, passed to the command as --resource
    node -- if set, passed to the command as --node
    strict -- if True, --force is added to the command
    force -- if True, skip the operation count check

    Raise LibraryError if the check fails or the command exits with a
    non-zero code.
    """
    if not (force or node or resource):
        cluster_summary = ClusterState(get_cluster_status_dom(runner)).summary
        operation_count = (
            cluster_summary.nodes.attrs.count
            * cluster_summary.resources.attrs.count
        )
        if operation_count > __RESOURCE_REFRESH_OPERATION_COUNT_THRESHOLD:
            raise LibraryError(
                ReportItem(
                    reports.item.ReportItemSeverity.error(reports.codes.FORCE),
                    reports.messages.ResourceRefreshTooTimeConsuming(
                        __RESOURCE_REFRESH_OPERATION_COUNT_THRESHOLD
                    ),
                )
            )

    command = [__exec("crm_resource"), "--refresh"]
    for option, value in (("--resource", resource), ("--node", node)):
        if value:
            command += [option, value]
    if strict:
        command.append("--force")

    out, err, exit_code = runner.run(command)
    if exit_code == 0:
        # useful output (what has been done) goes to stderr
        return join_multilines([out, err])
    raise LibraryError(
        ReportItem.error(
            reports.messages.ResourceRefreshError(
                join_multilines([err, out]), resource, node
            )
        )
    )
def wait_for_idle(runner: CommandRunner, timeout: int) -> None:
    """
    Run "crm_resource --wait". Raise LibraryError if the command failed.

    runner -- preconfigured object for running external programs
    timeout -- waiting timeout in seconds; the --timeout option is only
        passed for a positive value
    """
    command = [__exec("crm_resource"), "--wait"]
    if timeout > 0:
        command.append(f"--timeout={timeout}")

    out, err, exit_code = runner.run(command)
    if exit_code == 0:
        return

    # Useful info goes to stderr - not only error messages, a list of
    # pending actions in case of timeout goes there as well.
    # stdout is included as well in case that ever changes.
    if exit_code == __EXITCODE_WAIT_TIMEOUT:
        message_class = reports.messages.WaitForIdleTimedOut
    else:
        message_class = reports.messages.WaitForIdleError
    raise LibraryError(
        ReportItem.error(message_class(join_multilines([err, out])))
    )
def get_cib_xml(runner, scope=None):
    """
    Return the CIB XML text as printed by the CIB loading command.

    runner -- object used for running the external command
    scope -- passed on to get_cib_xml_cmd_results

    Raise LibraryError if the command exits with a non-zero code. A dedicated
    report is used when a scope was given and the exit code says the scope is
    valid but not present.
    """
    out, err, exit_code = get_cib_xml_cmd_results(runner, scope)
    if exit_code == 0:
        return out

    reason = join_multilines([err, out])
    if exit_code == __EXITCODE_CIB_SCOPE_VALID_BUT_NOT_PRESENT and scope:
        report_message = reports.messages.CibLoadErrorScopeMissing(
            scope, reason
        )
    else:
        report_message = reports.messages.CibLoadError(reason)
    raise LibraryError(ReportItem.error(report_message))
def _ticket_operation(
    operation, env: LibraryEnvironment, ticket_name, site_ip, instance_name
):
    """
    Run a booth ticket operation for the given ticket on the given site.

    operation -- booth subcommand to run
    env -- provides communication with externals
    ticket_name -- name of the ticket the operation applies to
    site_ip -- site address passed to booth via -s; when empty, it is looked
        up among CIB resources bound to the booth config and exactly one
        match is required
    instance_name -- booth instance name used to get the booth environment

    Raise LibraryError if the site ip cannot be determined or booth fails.
    """
    booth_env = env.get_booth_env(instance_name)
    _ensure_live_env(env, booth_env)

    if not site_ip:
        candidates = resource.find_bound_ip(
            get_resources(env.get_cib()), booth_env.config_path
        )
        if len(candidates) != 1:
            raise LibraryError(
                ReportItem.error(
                    reports.messages.BoothCannotDetermineLocalSiteIp()
                )
            )
        site_ip = candidates[0]

    command = [settings.booth_binary, operation, "-s", site_ip, ticket_name]
    out, err, exit_code = env.cmd_runner().run(command)
    if exit_code == 0:
        return
    raise LibraryError(
        ReportItem.error(
            reports.messages.BoothTicketOperationFailed(
                operation,
                join_multilines([err, out]),
                site_ip,
                ticket_name,
            )
        )
    )
def get_cluster_status_text(
    runner: CommandRunner,
    hide_inactive_resources: bool,
    verbose: bool,
) -> Tuple[str, List[str]]:
    """
    Run "crm_mon --one-shot" and return its stripped stdout together with
    a list of warning lines taken from stderr.

    runner -- object used for running the external command
    hide_inactive_resources -- if False, --inactive is added to the command
    verbose -- if True, detail options are added and "DEBUG: " lines from
        stderr are kept among the warnings

    Raise LibraryError if the command exits with a non-zero code.
    """
    command = [__exec("crm_mon"), "--one-shot"]
    if not hide_inactive_resources:
        command.append("--inactive")
    if verbose:
        command += ["--show-detail", "--show-node-attributes", "--failcounts"]
        # by default, pending and failed actions are displayed
        # with verbose==True, we display the whole history
        if is_fence_history_supported_status(runner):
            command.append("--fence-history=3")

    out, err, exit_code = runner.run(command)
    if exit_code != 0:
        raise LibraryError(
            ReportItem.error(
                reports.messages.CrmMonError(join_multilines([err, out]))
            )
        )

    warnings: List[str] = []
    stripped_err = err.strip()
    if stripped_err:
        for line in stripped_err.splitlines():
            # debug lines are only interesting in verbose mode
            if verbose or not line.startswith("DEBUG: "):
                warnings.append(line)
    return out.strip(), warnings
def _run_fence_history_command(runner, command, node=None):
    """
    Run "stonith_admin --history" with the given extra option and return its
    stripped stdout.

    runner -- object used for running the external command
    command -- option appended after the node specification
    node -- node to run the command for, "*" is used when not specified

    Raise FenceHistoryCommandErrorException on a non-zero exit code.
    """
    target = node or "*"
    out, err, exit_code = runner.run(
        [__exec("stonith_admin"), "--history", target, command]
    )
    if exit_code == 0:
        return out.strip()
    raise FenceHistoryCommandErrorException(join_multilines([err, out]))
def client_setup(runner, ca_certificate):
    """
    Initialize qdevice client on local host.

    runner -- object used for running the external command
    ca_certificate -- qnetd CA certificate, written to a file in binary mode

    Raise LibraryError if the CA certificate cannot be stored or if the
    certificate tool exits with a non-zero code.
    """
    client_destroy()
    # save CA certificate, corosync tool only works with files
    ca_file_path = os.path.join(
        settings.corosync_qdevice_net_client_certs_dir,
        settings.corosync_qdevice_net_client_ca_file_name,
    )
    try:
        # The directory may already exist even when the CA file does not;
        # exist_ok prevents a spurious FileExistsError in that case.
        os.makedirs(
            settings.corosync_qdevice_net_client_certs_dir,
            mode=0o700,
            exist_ok=True,
        )
        with open(ca_file_path, "wb") as ca_file:
            ca_file.write(ca_certificate)
    except EnvironmentError as e:
        raise LibraryError(
            ReportItem.error(
                reports.messages.QdeviceInitializationError(
                    __model,
                    e.strerror,
                )
            )
        ) from e
    # initialize client's certificate storage
    stdout, stderr, retval = runner.run(
        [__qdevice_certutil, "-i", "-c", ca_file_path]
    )
    if retval != 0:
        raise LibraryError(
            ReportItem.error(
                reports.messages.QdeviceInitializationError(
                    __model,
                    join_multilines([stderr, stdout]),
                )
            )
        )
def qdevice_sign_certificate_request(runner, cert_request, cluster_name):
    """
    Sign client certificate request and return the signed certificate.

    runner -- object used for running the external command
    cert_request -- certificate request data
    cluster_name -- name of the cluster to which qdevice is being added

    Raise LibraryError if qdevice is not initialized or signing fails.
    """
    if not qdevice_initialized():
        raise LibraryError(
            ReportItem.error(reports.messages.QdeviceNotInitialized(__model))
        )
    # save the certificate request, corosync tool only works with files
    tmpfile = _store_to_tmpfile(
        cert_request, reports.messages.QdeviceCertificateSignError
    )
    # sign the request
    try:
        stdout, stderr, retval = runner.run(
            [__qnetd_certutil, "-s", "-c", tmpfile.name, "-n", cluster_name]
        )
    finally:
        # Close the temp file even if running the tool raises, so it is not
        # left behind; temp file is deleted on close.
        tmpfile.close()
    if retval != 0:
        raise LibraryError(
            ReportItem.error(
                reports.messages.QdeviceCertificateSignError(
                    join_multilines([stderr, stdout]),
                )
            )
        )
    # get signed certificate, corosync tool only works with files
    return _get_output_certificate(
        stdout,
        # pylint: disable=unnecessary-lambda
        lambda reason: reports.messages.QdeviceCertificateSignError(reason),
    )
def client_cert_request_to_pk12(runner, cert_request):
    """
    Transform signed certificate request to pk12 certificate which can be
    imported to nodes and return it.

    runner -- object used for running the external command
    cert_request -- signed certificate request

    Raise LibraryError if the client is not initialized or the
    transformation fails.
    """
    if not client_initialized():
        raise LibraryError(
            ReportItem.error(reports.messages.QdeviceNotInitialized(__model))
        )
    # save the signed certificate request, corosync tool only works with files
    tmpfile = _store_to_tmpfile(
        cert_request,
        reports.messages.QdeviceCertificateImportError,
    )
    # transform it
    try:
        stdout, stderr, retval = runner.run(
            [__qdevice_certutil, "-M", "-c", tmpfile.name]
        )
    finally:
        # Close the temp file even if running the tool raises, so it is not
        # left behind; temp file is deleted on close.
        tmpfile.close()
    if retval != 0:
        raise LibraryError(
            ReportItem.error(
                reports.messages.QdeviceCertificateImportError(
                    join_multilines([stderr, stdout]),
                )
            )
        )
    # get resulting pk12, corosync tool only works with files
    return _get_output_certificate(
        stdout,
        # pylint: disable=unnecessary-lambda
        lambda reason: reports.messages.QdeviceCertificateImportError(reason),
    )
def client_generate_certificate_request(runner, cluster_name):
    """
    Create a certificate request which can be signed by qnetd server and
    return it.

    runner -- object used for running the external command
    cluster_name -- name of the cluster to which qdevice is being added

    Raise LibraryError if the client is not initialized or the certificate
    tool exits with a non-zero code.
    """

    def _report_from_reason(reason):
        # builds the report used when reading the resulting file fails
        return reports.messages.QdeviceInitializationError(
            __model,
            reason,
        )

    if not client_initialized():
        raise LibraryError(
            ReportItem.error(reports.messages.QdeviceNotInitialized(__model))
        )

    out, err, exit_code = runner.run(
        [__qdevice_certutil, "-r", "-n", cluster_name]
    )
    if exit_code == 0:
        return _get_output_certificate(out, _report_from_reason)
    raise LibraryError(
        ReportItem.error(
            reports.messages.QdeviceInitializationError(
                __model,
                join_multilines([err, out]),
            )
        )
    )
def get_local_node_name(runner):
    """
    Return stripped stdout of "crm_node --name".

    runner -- object used for running the external command

    Raise LibraryError if the command exits with a non-zero code.
    """
    out, err, exit_code = runner.run([__exec("crm_node"), "--name"])
    if exit_code == 0:
        return out.strip()
    reason = join_multilines([err, out])
    raise LibraryError(
        ReportItem.error(
            reports.messages.PacemakerLocalNodeNameNotFound(reason)
        )
    )
def get_cluster_status_xml(runner):
    """
    Return stdout of "crm_mon --one-shot --as-xml --inactive".

    runner -- object used for running the external command

    Raise CrmMonErrorException if the command exits with a non-zero code.
    """
    command = [__exec("crm_mon"), "--one-shot", "--as-xml", "--inactive"]
    out, err, exit_code = runner.run(command)
    if exit_code == 0:
        return out
    reason = join_multilines([err, out])
    raise CrmMonErrorException(
        ReportItem.error(reports.messages.CrmMonError(reason))
    )
def get_local_node_name(runner):
    """
    Return stripped stdout of "crm_node --name".

    runner -- object used for running the external command

    Raise PacemakerNotConnectedException if the command exits with the
    "not connected" exit code, LibraryError on any other non-zero exit code.
    """
    out, err, exit_code = runner.run([__exec("crm_node"), "--name"])
    if exit_code == 0:
        return out.strip()

    if exit_code == __EXITCODE_NOT_CONNECTED:
        error_class = PacemakerNotConnectedException
    else:
        error_class = LibraryError
    raise error_class(
        ReportItem.error(
            reports.messages.PacemakerLocalNodeNameNotFound(
                join_multilines([err, out])
            )
        )
    )
def get_peers_status(runner, name=None):
    """
    Return stdout of the "booth peers" command.

    runner -- object used for running the external command
    name -- if set, passed to booth via the -c option

    Raise LibraryError if the command exits with a non-zero code.
    """
    command = [settings.booth_binary, "peers"]
    if name:
        command.extend(["-c", name])
    out, err, exit_code = runner.run(command)
    if exit_code == 0:
        return out
    raise LibraryError(
        ReportItem.error(
            reports.messages.BoothPeersStatusError(
                join_multilines([err, out]),
            )
        )
    )
def get_cluster_status_xml(runner):
    """
    Return stdout of "crm_mon --one-shot --as-xml --inactive".

    runner -- object used for running the external command

    Raise PacemakerNotConnectedException if the command exits with the
    "not connected" exit code, LibraryError on any other non-zero exit code.
    """
    command = [__exec("crm_mon"), "--one-shot", "--as-xml", "--inactive"]
    out, err, exit_code = runner.run(command)
    if exit_code == 0:
        return out

    if exit_code == __EXITCODE_NOT_CONNECTED:
        error_class = PacemakerNotConnectedException
    else:
        error_class = LibraryError
    raise error_class(
        ReportItem.error(
            reports.messages.CrmMonError(join_multilines([err, out]))
        )
    )
def get_daemon_status(runner, name=None):
    """
    Return stdout of the "booth status" command.

    runner -- object used for running the external command
    name -- if set, passed to booth via the -c option

    Raise LibraryError if the command exits with a code other than 0 or 7.
    """
    command = [settings.booth_binary, "status"]
    if name:
        command.extend(["-c", name])
    out, err, exit_code = runner.run(command)
    # 7 means that there is no booth instance running
    if exit_code in (0, 7):
        return out
    raise LibraryError(
        ReportItem.error(
            reports.messages.BoothDaemonStatusError(
                join_multilines([err, out])
            )
        )
    )
def remove_node(runner, node_name):
    """
    Run "crm_node --force --remove" for the specified node.

    runner -- object used for running the external command
    node_name -- name of the node passed to the command

    Raise LibraryError if the command exits with a non-zero code.
    """
    command = [__exec("crm_node"), "--force", "--remove", node_name]
    out, err, exit_code = runner.run(command)
    if exit_code == 0:
        return
    raise LibraryError(
        ReportItem.error(
            reports.messages.NodeRemoveInPacemakerFailed(
                node_list_to_remove=[node_name],
                reason=join_multilines([err, out]),
            )
        )
    )
def _upgrade_cib(runner):
    """
    Upgrade CIB to the latest schema available locally or clusterwise by
    running "cibadmin --upgrade --force".

    runner -- CommandRunner used for running the external command

    Raise LibraryError if the command exits with a non-zero code.
    """
    command = [__exec("cibadmin"), "--upgrade", "--force"]
    out, err, exit_code = runner.run(command)
    # If we are already on the latest schema available, cibadmin exits with 0.
    # That is fine. We do not know here what version is required anyway. The
    # caller knows that and is responsible for dealing with it.
    if exit_code == 0:
        return
    raise LibraryError(
        ReportItem.error(
            reports.messages.CibUpgradeFailed(join_multilines([err, out]))
        )
    )
def _get_cluster_status_xml(runner: CommandRunner) -> str:
    """
    Get pacemaker XML status. Using get_cluster_status_dom is preferred
    instead.

    runner -- a class for running external processes

    Raise PacemakerNotConnectedException if the command exits with the
    "not connected" exit code, LibraryError on any other failure.
    """
    out, err, exit_code = get_cluster_status_xml_raw(runner)
    if exit_code == 0:
        return out

    # We parse error messages from XML. If we didn't get an XML, we pass the
    # raw output to the exception as a plaintext. If we got an XML but it
    # doesn't conform to the schema, we raise an error.
    try:
        api_status = _get_status_from_api_result(_get_api_result_dom(out))
        error_text = join_multilines([api_status.message, *api_status.errors])
    except etree.XMLSyntaxError:
        error_text = join_multilines([err, out])
    except etree.DocumentInvalid as e:
        raise LibraryError(
            ReportItem.error(reports.messages.BadClusterStateFormat())
        ) from e

    if exit_code == __EXITCODE_NOT_CONNECTED:
        error_class = PacemakerNotConnectedException
    else:
        error_class = LibraryError
    raise error_class(ReportItem.error(reports.messages.CrmMonError(error_text)))
def qdevice_setup(runner):
    """
    Initialize qdevice on local host.

    runner -- object used for running the external command

    Raise LibraryError if qdevice is already initialized or if the
    certificate tool exits with a non-zero code.
    """
    if qdevice_initialized():
        raise LibraryError(
            ReportItem.error(
                reports.messages.QdeviceAlreadyInitialized(__model)
            )
        )

    out, err, exit_code = runner.run([__qnetd_certutil, "-i"])
    if exit_code == 0:
        return
    raise LibraryError(
        ReportItem.error(
            reports.messages.QdeviceInitializationError(
                __model,
                join_multilines([err, out]),
            )
        )
    )
def stop_service(runner, service, instance=None):
    """
    Stop specified service in local system.

    runner -- CommandRunner used for running the external command
    service -- service name
    instance -- instance name, it has no effect on non-systemd systems.
        If None, no instance name will be used.

    Raise StopServiceError if the command exits with a non-zero code.
    """
    if is_systemctl():
        command = [_systemctl, "stop", _get_service_name(service, instance)]
    else:
        command = [_service, service, "stop"]

    out, err, exit_code = runner.run(command)
    if exit_code == 0:
        return
    raise StopServiceError(service, join_multilines([err, out]), instance)
def kill_services(runner, services):
    """
    Kill specified services in local system.

    runner -- CommandRunner used for running the external command
    services -- iterable of service names

    Raise KillServicesError if killall exits with a non-zero code and prints
    any output.
    """
    # Materialize the iterable once: it is needed both for the command and
    # for the error report, and a one-shot iterator would be empty the second
    # time it is read.
    service_list = list(services)
    # make killall not report that a process is not running
    stdout, stderr, retval = runner.run(
        [settings.killall_executable, "--quiet", "--signal", "9", "--"]
        + service_list
    )
    # If a process isn't running, killall will still return 1 even with
    # --quiet. We don't consider that an error, so we check for output string
    # as well. If it's empty, no actual error happened.
    if retval != 0:
        message = join_multilines([stderr, stdout])
        if message:
            raise KillServicesError(service_list, message)
def qdevice_status_generic_text(runner, verbose=False):
    """
    Get qdevice runtime status in plain text.

    runner -- object used for running the external command
    verbose -- if True, -v is passed to the tool to get more detailed output

    Raise LibraryError if the tool exits with a non-zero code.
    """
    tool_args = ["-s", "-v"] if verbose else ["-s"]
    out, err, exit_code = _qdevice_run_tool(runner, tool_args)
    if exit_code == 0:
        return out
    raise LibraryError(
        ReportItem.error(
            reports.messages.QdeviceGetStatusError(
                __model,
                join_multilines([err, out]),
            )
        )
    )
def get_status_text(runner, verbose=False):
    """
    Get quorum device client runtime status in plain text.

    runner -- object used for running the external command
    verbose -- if True, -v is passed to the tool to get more detailed output

    Raise LibraryError if the tool exits with a non-zero code.
    """
    tool_path = os.path.join(
        settings.corosync_binaries, "corosync-qdevice-tool"
    )
    command = [tool_path, "-s"]
    if verbose:
        command += ["-v"]

    out, err, exit_code = runner.run(command)
    if exit_code == 0:
        return out
    raise LibraryError(
        ReportItem.error(
            reports.messages.CorosyncQuorumGetStatusError(
                join_multilines([err, out])
            )
        )
    )
def enable_service(runner, service, instance=None):
    """
    Enable specified service in local system.

    runner -- CommandRunner used for running the external command
    service -- name of service
    instance -- instance name, it has no effect on non-systemd systems.
        If None, no instance name will be used.

    Raise EnableServiceError if the command exits with a non-zero code.
    """
    if is_systemctl():
        command = [_systemctl, "enable", _get_service_name(service, instance)]
    else:
        command = [_chkconfig, service, "on"]

    out, err, exit_code = runner.run(command)
    if exit_code == 0:
        return
    raise EnableServiceError(service, join_multilines([err, out]), instance)
def unfence_node(env: LibraryEnvironment, node: str, devices: Iterable[str]):
    """
    Unfence scsi devices on a node by calling fence_scsi agent script.

    env -- provides communication with externals
    node -- name of the node on which unfencing is performed
    devices -- scsi devices to be unfenced

    Raise LibraryError if the agent exits with a non-zero code.
    """
    agent_path = os.path.join(settings.fence_agent_binaries, "fence_scsi")
    device_list = ",".join(sorted(devices))
    command = [
        agent_path,
        "--action=on",
        "--devices",
        device_list,
        f"--plug={node}",
    ]
    out, err, exit_code = env.cmd_runner().run(command)
    if exit_code == 0:
        return
    raise LibraryError(
        reports.ReportItem.error(
            reports.messages.StonithUnfencingFailed(
                join_multilines([err, out])
            )
        )
    )
def client_import_certificate_and_key(runner, pk12_certificate):
    """
    Import qdevice client certificate to the local node certificate storage.

    runner -- object used for running the external command
    pk12_certificate -- certificate data stored to a temporary file and
        handed over to the certificate tool

    Raise LibraryError if the client is not initialized or the import fails.
    """
    if not client_initialized():
        raise LibraryError(
            ReportItem.error(reports.messages.QdeviceNotInitialized(__model))
        )
    # save the certificate, corosync tool only works with files
    tmpfile = _store_to_tmpfile(
        pk12_certificate,
        reports.messages.QdeviceCertificateImportError,
    )
    try:
        stdout, stderr, retval = runner.run(
            [__qdevice_certutil, "-m", "-c", tmpfile.name]
        )
    finally:
        # Close the temp file even if running the tool raises, so it is not
        # left behind; temp file is deleted on close.
        tmpfile.close()
    if retval != 0:
        raise LibraryError(
            ReportItem.error(
                reports.messages.QdeviceCertificateImportError(
                    join_multilines([stderr, stdout]),
                )
            )
        )
def qdevice_status_cluster_text(runner, cluster=None, verbose=False):
    """
    Get qdevice runtime status in plain text.

    runner -- object used for running the external command
    cluster -- if set, show information only about the specified cluster
    verbose -- if True, -v is passed to the tool to get more detailed output

    Raise LibraryError if the tool exits with a non-zero code.
    """
    tool_args = ["-l"]
    if verbose:
        tool_args += ["-v"]
    if cluster:
        tool_args += ["-c", cluster]

    out, err, exit_code = _qdevice_run_tool(runner, tool_args)
    if exit_code == 0:
        return out
    raise LibraryError(
        ReportItem.error(
            reports.messages.QdeviceGetStatusError(
                __model,
                join_multilines([err, out]),
            )
        )
    )