def _shutdown(halt, at_time):
    console_log.info("Initiating server shutdown per manager request")
    # This will initiate a "nice" shutdown with a wall from root, etc.
    AgentShell.try_run(["shutdown", "-H" if halt else "-h", at_time])

    console_log.info("Terminating")
    os._exit(0)
def scanner_cmd(cmd):
    # Because we are pulling from device-scanner,
    # it is very important that we wait for
    # the udev queue to settle before requesting new data
    AgentShell.run(["udevadm", "settle"])

    client = socket.socket(socket.AF_UNIX)
    client.settimeout(10)
    # Use connect() rather than connect_ex() so a failure to reach the
    # socket raises immediately instead of being silently ignored
    client.connect("/var/run/device-scanner.sock")
    client.sendall(json.dumps(cmd) + "\n")

    out = ""
    begin = 0

    while True:
        out += client.recv(1024)

        # Messages are expected to be newline-separated, but the newline is
        # not always the last byte received, so take out only the first
        # complete message
        idx = out.find("\n", begin)

        if idx >= 0:
            try:
                return json.loads(out[:idx])
            except ValueError:
                return None

        begin = len(out)
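# A minimal usage sketch for scanner_cmd() above. The command vocabulary is
# defined by the device-scanner daemon itself; "GetMounts" is an assumption
# used purely for illustration, not a confirmed part of its protocol.
def _example_fetch_mounts():
    # Ask the scanner for its current state; None means the reply
    # could not be decoded as JSON
    mounts = scanner_cmd("GetMounts")
    if mounts is None:
        console_log.error("device-scanner returned an unparseable reply")
    return mounts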
def stop_target(ha_label):
    '''
    Stop the high availability target

    Return: Value using simple return protocol
    '''
    # HYD-7230: brute force, try up to 3 times to stop the target
    i = 0

    while True:
        i += 1

        # Issue the command to Pacemaker to stop the target
        if _resource_exists(_zfs_name(ha_label)):
            # Group disable will disable all members of group regardless of
            # current status
            error = AgentShell.run_canned_error_message(
                ['pcs', 'resource', 'disable', _group_name(ha_label)])
        else:
            error = AgentShell.run_canned_error_message(
                ['pcs', 'resource', 'disable', ha_label])

        if error:
            return agent_error(error)

        if _wait_target(ha_label, False):
            return agent_result_ok

        if i < 4:
            console_log.info("failed to stop target %s", ha_label)
        else:
            return agent_error("Failed to stop target {}".format(ha_label))
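# _wait_target() is used throughout this module but not shown here. This is
# a plausible sketch only, assuming get_resource_location() returns a node
# name when the resource is running and a falsy value otherwise; the
# 100-second budget mirrors the polling loops elsewhere in this module and
# is an assumption, not the actual implementation.
def _wait_target_sketch(ha_label, started):
    for _ in range(100):
        # Compare the observed run state against the desired one
        if bool(get_resource_location(ha_label)) == started:
            return True
        time.sleep(1)
    return False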
def private_key_file(self):
    """Return a path to a PEM file"""
    if not os.path.exists(self.PRIVATE_KEY_FILE):
        console_log.info("Generating private key")
        AgentShell.try_run(['openssl', 'genrsa', '-out',
                            self.PRIVATE_KEY_FILE, '2048', '-sha256'])

    return self.PRIVATE_KEY_FILE
def kernel_status():
    """
    :return: {'running': 'kernel-X.Y.Z',
              'required': <'kernel-A.B.C' or None>,
              'available': [list of installed kernel packages]}
    """
    running_kernel = "kernel-%s" % AgentShell.try_run(["uname",
                                                       "-r"]).strip()

    try:
        required_kernel_stdout = AgentShell.try_run(
            ["rpm", "-qR", "lustre-modules"])
    except AgentShell.CommandExecutionError:
        try:
            required_kernel_stdout = AgentShell.try_run(
                ["rpm", "-qR", "lustre-client-modules"])
        except AgentShell.CommandExecutionError:
            required_kernel_stdout = None

    required_kernel = None
    if required_kernel_stdout:
        for line in required_kernel_stdout.split("\n"):
            if line.startswith('kernel'):
                required_kernel = "kernel-%s.%s" % (line.split(" = ")[1],
                                                    platform.machine())

    available_kernels = []
    for installed_kernel in AgentShell.try_run(["rpm", "-q",
                                                "kernel"]).split("\n"):
        if installed_kernel:
            available_kernels.append(installed_kernel)

    return {
        'running': running_kernel,
        'required': required_kernel,
        'available': available_kernels
    }
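# A minimal sketch showing how kernel_status() output might be consumed to
# decide whether a reboot into a different kernel is needed. The decision
# logic here is illustrative only; it is not part of this module.
def _example_kernel_check():
    status = kernel_status()
    if status['required'] and status['required'] != status['running']:
        console_log.info("Reboot required: running %s, required %s",
                         status['running'], status['required'])
        return True
    return False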
def cibadmin(command_args, timeout=120):
    assert timeout > 0, 'timeout must be greater than zero'

    # I think these are "errno" values, but I'm not positive;
    # going forward, any additions to this should try to be informative
    # about the type of exit code and why it's OK to retry
    RETRY_CODES = {
        10: "something unknown",
        41: "something unknown",
        62: "Timer expired",
        107: "Transport endpoint is not connected"
    }

    command_args.insert(0, 'cibadmin')

    # NB: This isn't a "true" timeout, in that it won't forcibly stop the
    # subprocess after a timeout. We'd need more invasive changes to
    # shell._run() for that.
    for _ in util.wait(timeout):
        result = AgentShell.run(command_args)

        if result.rc == 0:
            return result
        elif result.rc not in RETRY_CODES:
            break

    if result.rc in RETRY_CODES:
        raise PacemakerError(
            "%s timed out after %d seconds: rc: %s, stderr: %s" %
            (" ".join(command_args), timeout, result.rc, result.stderr))
    else:
        raise AgentShell.CommandExecutionError(result, command_args)
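# A usage sketch for the cibadmin() retry wrapper above. The CIB property
# XML is a hypothetical example for illustration; any valid CIB modification
# would be passed the same way.
def _example_set_maintenance_mode():
    # Transient pacemaker errors are retried for up to 60 seconds; a
    # retryable rc that never clears raises PacemakerError, any other
    # nonzero rc raises CommandExecutionError immediately
    return cibadmin(
        ["--modify", "--allow-create", "-o", "crm_config", "-X",
         '<cluster_property_set id="cib-bootstrap-options">'
         '<nvpair id="maintenance-mode" name="maintenance-mode" value="true"/>'
         '</cluster_property_set>'],
        timeout=60)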
def _reboot(at_time):
    console_log.info("Initiating server reboot per manager request")
    # reboot(8) just calls shutdown anyhow.
    AgentShell.try_run(["shutdown", "-r", at_time])

    console_log.info("Terminating")
    os._exit(0)
def set_address(self, ipv4_address, prefix):
    ifaddr = "%s/%s" % (ipv4_address, prefix)

    console_log.info("Set %s (%s) up" % (self.name, ifaddr))

    if self.ipv4_address != ipv4_address:
        node_admin.unmanage_network(self.device, self.mac_address)

        AgentShell.try_run(
            ['/sbin/ip', 'link', 'set', 'dev', self.name, 'up'])
        AgentShell.try_run(
            ['/sbin/ip', 'addr', 'add', ifaddr, 'dev', self.name])

        # The link address change is asynchronous, so we need to wait for
        # the address to stick or we have a race condition.
        timeout = 30

        while self.ipv4_address != ipv4_address and timeout != 0:
            self.refresh()
            time.sleep(1)
            timeout -= 1

        if self.ipv4_address != ipv4_address:
            raise RuntimeError(
                'Unable to set the address %s for interface %s' %
                (self.ipv4_address, self.name))

        node_admin.write_ifcfg(self.device, self.mac_address,
                               self.ipv4_address, self.ipv4_netmask)
    else:
        console_log.info("Nothing to do as %s already has address %s" %
                         (self.name, ifaddr))
def action_two(arg1):
    """An action which invokes subprocess_one and subprocess_two"""
    assert arg1 == "arg2_test"
    stdout = AgentShell.try_run(['subprocess_one', 'subprocess_one_arg'])
    assert stdout == 'subprocess_one_stdout'
    AgentShell.try_run(['subprocess_two', 'subprocess_two_arg'])
    return ACTION_TWO_RETVAL
def mount_lustre_filesystem(mountspec, mountpoint):
    try:
        os.makedirs(mountpoint, 0o755)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    create_fstab_entry(mountspec, mountpoint)
    AgentShell.try_run(["/bin/mount", mountpoint])
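# A sketch of the symmetric unmount path, assuming a delete_fstab_entry()
# helper exists alongside create_fstab_entry(); that helper name is an
# assumption here, not a confirmed API.
def unmount_lustre_filesystem_sketch(mountspec, mountpoint):
    # Remove the persistent entry first so a crash between the two steps
    # doesn't leave an fstab entry pointing at an unmounted filesystem
    delete_fstab_entry(mountspec, mountpoint)  # hypothetical helper
    AgentShell.try_run(["/bin/umount", mountpoint])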
def _configure_target_ha(ha_label, info, enabled=False):
    if enabled:
        extra = []
    else:
        extra = ['--disabled']

    bdev = info['bdev']

    if info['device_type'] == 'zfs':
        extra += ['--group', _group_name(ha_label)]
        zpool = info['bdev'].split("/")[0]
        result = AgentShell.run([
            'pcs', 'resource', 'create', _zfs_name(ha_label),
            'ocf:chroma:ZFS', 'pool={}'.format(zpool), 'op', 'start',
            'timeout=120', 'op', 'stop', 'timeout=90'
        ] + extra)
        if result.rc != 0:
            console_log.error("Resource (%s) create failed:%d: %s", zpool,
                              result.rc, result.stderr)
            return result
        if enabled and not _wait_target(_zfs_name(ha_label), True):
            return {
                "rc": -1,
                "stdout": "",
                "stderr": "ZFS Resource ({}) failed to start".format(
                    _zfs_name(ha_label))
            }
    else:
        # This is a hack for ocf:lustre:Lustre up to Lustre 2.10.5/2.11,
        # see LU-11461
        result = AgentShell.run(['realpath', info['bdev']])
        if result.rc == 0 and result.stdout.startswith('/dev/sd'):
            bdev = result.stdout.strip()

    # Create Lustre resource and add target=uuid as an attribute
    result = AgentShell.run([
        'pcs', 'resource', 'create', ha_label, 'ocf:lustre:Lustre',
        'target={}'.format(bdev), 'mountpoint={}'.format(info['mntpt']),
        'op', 'start', 'timeout=600'
    ] + extra)

    if result.rc != 0 or (enabled and not _wait_target(ha_label, True)):
        if result.rc == 0:
            result.rc = -1
            result.stderr = "Resource ({}) failed to start".format(ha_label)
        console_log.error("Failed to create resource %s:%d: %s", ha_label,
                          result.rc, result.stderr)
        if info['device_type'] == 'zfs':
            AgentShell.run(
                ['pcs', 'resource', 'delete', _zfs_name(ha_label)])

    return result
def start_lnet():
    '''
    Place lnet into the 'up' state.
    '''
    console_log.info("Starting LNet")

    # "modprobe lustre" is a hack for HYD-1263 (fix or work around LU-1279,
    # a failure trying to mount); it should be removed when LU-1279 is fixed
    return agent_ok_or_error(
        AgentShell.run_canned_error_message(["lctl", "net", "up"]) or
        AgentShell.run_canned_error_message(["modprobe", "lustre"]))
def delete_node(nodename):
    rc, stdout, stderr = AgentShell.run_old(["crm_node", "-l"])

    node_id = None
    for line in stdout.split("\n"):
        node_id, name, status = line.split(" ")
        if name == nodename:
            break

    AgentShell.try_run(["crm_node", "--force", "-R", node_id])

    cibxpath("delete", '//nodes/node[@uname="{}"]'.format(nodename))
    cibxpath("delete", '//status/node_state[@uname="{}"]'.format(nodename))
def _move_target(target_label, dest_node):
    """
    Common plumbing for failover/failback. Move the target with the given
    label to the destination node.

    :param target_label: The label of the target to move
    :param dest_node: The node to move it to.
    :return: None if successful or an error message if an error occurred.
    """
    # Issue the command to Pacemaker to move the target
    arg_list = [
        "crm_resource",
        "--resource",
        target_label,
        "--move",
        "--node",
        dest_node,
    ]

    # For ongoing debug purposes, let's get the resource locations at the
    # beginning. This provides useful log output in the case where things
    # don't work.
    AgentShell.run(["crm_mon", "-1"])

    # Before we start, clean up anything that has gone on before. HA is a
    # fickle old thing and this will make sure that everything is clean
    # before we start.
    AgentShell.try_run(
        ["crm_resource", "--resource", target_label, "--cleanup"])

    if _resource_exists(_zfs_name(target_label)):
        AgentShell.try_run([
            "crm_resource", "--resource", _zfs_name(target_label),
            "--cleanup"
        ])

    result = AgentShell.run(arg_list)

    if result.rc != 0:
        return "Error ({}) running '{}': '{}' '{}'".format(
            result.rc, " ".join(arg_list), result.stdout, result.stderr)

    timeout = 100

    # Now wait for the move to complete; this will succeed quickly if the
    # target was already there
    while timeout > 0:
        if get_resource_location(target_label) == dest_node:
            break

        time.sleep(1)
        timeout -= 1

    # Now delete the constraint that crm_resource --move created
    AgentShell.try_run([
        "crm_resource", "--resource", target_label, "--un-move", "--node",
        dest_node
    ])

    if timeout <= 0:
        return "Failed to move target {} to node {}".format(
            target_label, dest_node)

    return None
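# A hedged sketch of how a failover/failback entry point might call the
# common _move_target() plumbing above; the wrapper name is an assumption,
# while the simple-return-protocol helpers mirror the rest of this module.
def failover_target_sketch(ha_label, node):
    error = _move_target(ha_label, node)
    if error:
        return agent_error(error)
    # Report where the target actually landed
    return agent_result(get_resource_location(ha_label))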
def disable_standby(self):
    AgentShell.try_run([
        "crm_attribute",
        "-N",
        self.name,
        "-n",
        "standby",
        "-v",
        "off",
        "--lifetime=forever",
    ])
def _unconfigure_target_ha(ha_label, info, force=False):
    if force:
        extra = ["--force"]
    else:
        extra = []

    result = AgentShell.run(['pcs', 'resource', 'delete', ha_label] + extra)

    if info['backfstype'] == "zfs":
        AgentShell.run(['pcs', 'resource', 'delete', _zfs_name(ha_label)] +
                       extra)

    return result
def latest_kernel(kernel_list, modlist):
    required_kernel = None
    arch = AgentShell.try_run(["uname", "-m"]).strip()

    for kernel in kernel_list:
        if not kver_gt(kernel, required_kernel, arch):
            continue

        kver = kernel.split("-", 1)[1]

        if AgentShell.run(["modinfo", "-n", "-k", kver] + modlist).rc == 0:
            required_kernel = kernel

    return required_kernel
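# A sketch tying latest_kernel() to kernel_status() above: pick the newest
# installed kernel for which the given modules resolve. The module list is
# illustrative, not a value taken from this codebase.
def _example_pick_boot_kernel():
    kernels = kernel_status()['available']
    return latest_kernel(kernels, ['lustre'])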
def stop_lnet():
    """
    Place lnet into the 'down' state; any modules that are dependent on lnet
    being in the 'up' state will be unloaded before lnet is stopped.
    """
    console_log.info("Stopping LNet")

    return agent_ok_or_error(
        AgentShell.run_canned_error_message(["lustre_rmmod", "ptlrpc"]) or
        AgentShell.run_canned_error_message(
            ["lnetctl", "lnet", "unconfigure"]))
def set_attribute(self, key, value):
    AgentShell.try_run([
        "crm_attribute",
        "-t",
        "nodes",
        "-U",
        self.name,
        "-n",
        key,
        "-v",
        str(value),
    ])
def configure_corosync2_stage_2(ring0_name, ring1_name, new_node_fqdn,
                                mcast_port, pcs_password, create_cluster):
    """
    Process configuration including peers and negotiated multicast port; no
    IP address information required.

    Note: "The pcs cluster setup command will automatically configure
    two_node: 1 in corosync.conf, so a two-node cluster will 'just work'.
    If you are using a different cluster shell, you will have to configure
    corosync.conf appropriately yourself." Therefore no-quorum-policy does
    not have to be set when setting up a cluster with pcs.

    :param ring0_name: name of the network interface used for ring0
    :param ring1_name: name of the network interface used for ring1
    :param new_node_fqdn: FQDN of the node being authenticated and added
    :param mcast_port: negotiated multicast port used by both rings
    :param pcs_password: password for the pcs cluster user
    :param create_cluster: True to set up a new cluster, False to add
                           new_node_fqdn to an existing one
    :return: Value using simple return protocol
    """
    interfaces = [
        InterfaceInfo(
            CorosyncRingInterface(name=ring0_name, ringnumber=0,
                                  mcastport=mcast_port), None, None),
        InterfaceInfo(
            CorosyncRingInterface(name=ring1_name, ringnumber=1,
                                  mcastport=mcast_port), None, None)
    ]

    config_params = {
        'token': '17000',
        'fail_recv_const': '10',
        'transport': 'udp',
        'rrpmode': 'passive',
        'addr0': interfaces[0].corosync_iface.bindnetaddr,
        'addr1': interfaces[1].corosync_iface.bindnetaddr,
        'mcast0': interfaces[0].corosync_iface.mcastaddr,
        'mcast1': interfaces[1].corosync_iface.mcastaddr,
        'mcastport0': interfaces[0].corosync_iface.mcastport,
        'mcastport1': interfaces[1].corosync_iface.mcastport
    }

    # authenticate nodes in cluster
    authenticate_nodes_in_cluster_command = [
        'pcs', 'cluster', 'auth', new_node_fqdn, '-u', PCS_USER, '-p',
        pcs_password
    ]

    # Build the command string for cluster setup, which will result in
    # corosync.conf rather than writing it from a template. Note we don't
    # start the cluster here, as services are managed independently.
    if create_cluster:
        cluster_setup_command = [
            'pcs', 'cluster', 'setup', '--name', PCS_CLUSTER_NAME, '--force'
        ] + [new_node_fqdn]
        for param in [
                'transport', 'rrpmode', 'addr0', 'mcast0', 'mcastport0',
                'addr1', 'mcast1', 'mcastport1', 'token', 'fail_recv_const'
        ]:
            # pull this value from the dictionary using parameter keyword
            cluster_setup_command.extend(
                ["--" + param, str(config_params[param])])
    else:
        cluster_setup_command = ['pcs', 'cluster', 'node', 'add',
                                 new_node_fqdn]

    return agent_ok_or_error(
        AgentShell.run_canned_error_message(
            authenticate_nodes_in_cluster_command) or
        AgentShell.run_canned_error_message(cluster_setup_command))
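# A hedged example invocation for bootstrapping a brand-new cluster on its
# first node; the interface names, port, and password below are placeholder
# assumptions, not values from this codebase.
def _example_bootstrap_cluster():
    return configure_corosync2_stage_2(
        ring0_name="eth0",
        ring1_name="eth1",
        new_node_fqdn="node1.example.com",
        mcast_port=4242,
        pcs_password="changeme",
        create_cluster=True)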
def start_target(ha_label):
    '''
    Start the high availability target

    Return: Value using simple return protocol
    '''
    # HYD-1989: brute force, try up to 3 times to start the target
    i = 0

    while True:
        i += 1

        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Started'
        ])

        if error:
            return agent_error(error)

        # now wait for it to start
        _wait_target(ha_label, True)

        # and make sure it didn't start but (the RA) fail(ed)
        rc, stdout, stderr = AgentShell.run_old(['crm_mon', '-1'])

        failed = True
        for line in stdout.split("\n"):
            if line.lstrip().startswith(ha_label):
                if line.find("FAILED") < 0:
                    failed = False

        if failed:
            # try to leave things in a sane state for a failed mount
            error = AgentShell.run_canned_error_message([
                'crm_resource', '-r', ha_label, '-p', 'target-role', '-m',
                '-v', 'Stopped'
            ])

            if error:
                return agent_error(error)

            if i < 4:
                console_log.info("failed to start target %s" % ha_label)
            else:
                return agent_error("Failed to start target %s" % ha_label)
        else:
            location = get_resource_location(ha_label)
            if not location:
                return agent_error("Started %s but now can't locate it!" %
                                   ha_label)
            return agent_result(location)
def delete_node(nodename):
    rc, stdout, stderr = AgentShell.run_old(['crm_node', '-l'])

    node_id = None
    for line in stdout.split('\n'):
        node_id, name, status = line.split(" ")
        if name == nodename:
            break

    AgentShell.try_run(['crm_node', '--force', '-R', node_id])

    cibadmin(
        ["--delete", "-o", "nodes", "-X", "<node uname=\"%s\"/>" % nodename])
    # node_state entries live in the status section, not nodes
    cibadmin([
        "--delete", "-o", "status", "-X",
        "<node_state uname=\"%s\"/>" % nodename
    ])
def unmanage_network(device, mac_address):
    """Rewrite the network configuration file to set NM_CONTROLLED="no"

    TODO: This is destructive and overwrites the file, losing all settings.
    This needs to be fixed up.
    """
    ifcfg_path = write_ifcfg(device, mac_address, None, None)

    if platform_info.distro_version >= 7.0:
        try:
            AgentShell.try_run(['nmcli', 'con', 'load', ifcfg_path])
        except AgentShell.CommandExecutionError as cee:
            # network manager may be uninstalled (127) or stopped (8)
            if cee.result.rc not in [127, 2, 8]:
                raise
def _lnet_state(self):
    lnet_up = False
    lnet_loaded = not bool(
        AgentShell.run(['udevadm', 'info', '--path',
                        '/sys/module/lnet']).rc)

    if lnet_loaded:
        lnet_up = not bool(AgentShell.run(['lnetctl', 'net', 'show']).rc)

    return {
        (False, False): "lnet_unloaded",
        (False, True): "lnet_unloaded",
        (True, False): "lnet_down",
        (True, True): "lnet_up"
    }[(lnet_loaded, lnet_up)]
def _get_zpool_datasets(pool_name, drives):
    """ Retrieve datasets belonging to a zpool """
    out = AgentShell.try_run(['zfs', 'list', '-H', '-o', 'name,avail,guid'])

    zpool_datasets = {}

    if out.strip() != "no datasets available":
        for line in filter(None, out.split('\n')):
            name, size_str, uuid = line.split()
            size = util.human_to_bytes(size_str)

            if name.startswith("%s/" % pool_name):
                # This will need discussion, but for now fabricate a
                # major:minor. Do we ever use them as numbers?
                major_minor = "zfsset:%s" % uuid

                zpool_datasets[uuid] = {
                    "name": name,
                    "path": name,
                    "block_device": major_minor,
                    "uuid": uuid,
                    "size": size,
                    "drives": drives
                }

                daemon_log.debug("zfs mount '%s'" % name)

    return zpool_datasets
def _check_HYD4050():
    """
    HYD-4050 means that kernels are not installed with a default kernel or
    the initramfs isn't present. This function checks for these cases and
    returns an error message if a problem exists.

    return: None if everything is OK, error message if not.
    """

    # Make sure that there is an initramfs for the booting kernel
    try:
        default_kernel = AgentShell.try_run(["grubby",
                                             "--default-kernel"]).strip()
    except AgentShell.CommandExecutionError:
        return ("Unable to determine your default kernel. "
                "This node may not boot successfully until grub "
                "is fixed to have a default kernel to boot.")

    default_kernel_version = default_kernel[default_kernel.find("-") + 1:]
    initramfs = "/boot/initramfs-%s.img" % default_kernel_version

    if not os.path.isfile(initramfs):
        return ("There is no initramfs (%s) for the default kernel (%s). "
                "This node may not boot successfully until an initramfs "
                "is created." % (initramfs, default_kernel_version))

    return None
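# A sketch of how a boot-sanity check like _check_HYD4050() might gate a
# reboot request; the gating itself is an assumption for illustration, and
# note that _reboot() never returns because it calls os._exit().
def _example_safe_reboot(at_time):
    message = _check_HYD4050()
    if message:
        return agent_error(message)
    _reboot(at_time)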
def __init__(self, block_devices):
    self.block_devices = block_devices
    self.mpaths = {}
    self.vgs = {}
    self.lvs = {}

    for vg_name, vg_uuid, vg_size in self._get_vgs():
        self.vgs[vg_name] = {
            'name': vg_name,
            'uuid': vg_uuid,
            'size': vg_size,
            'pvs_major_minor': []
        }
        self.lvs[vg_name] = {}

        for lv_name, lv_uuid, lv_size, lv_path in self._get_lvs(vg_name):
            # Do this to cache the device type; see blockdevice and
            # filesystem for info.
            BlockDevice('lvm_volume',
                        '/dev/mapper/%s-%s' % (vg_name, lv_name))

            self.lvs[vg_name][lv_name] = {
                'name': lv_name,
                'uuid': lv_uuid,
                'size': lv_size
            }

    stdout = AgentShell.try_run(['dmsetup', 'table'])
    self._parse_dm_table(stdout)
def action_one_no_context(arg1):
    """An action which invokes subprocess_one"""
    assert arg1 == "arg1_test"
    stdout = AgentShell.try_run(['subprocess_one', 'subprocess_one_arg'])
    assert stdout == 'subprocess_one_stdout'
    return ACTION_ONE_NO_CONTEXT_RETVAL