def register(self, address=None):
    """
    Register this agent with the manager, posting a CSR for our FQDN.
    """
    if self._fqdn == "localhost.localdomain":
        # Fail fast: registering under an unresolvable name is useless, and
        # there is no point generating a CSR for it.
        console_log.error(
            "Registration failed, FQDN is localhost.localdomain")
        raise RuntimeError(
            "Name resolution error, FQDN resolves to localhost.localdomain"
        )

    # FIXME: At this time the 'capabilities' attribute is unused on the manager
    data = {
        "address": address,
        "fqdn": self._fqdn,
        "nodename": self._nodename,
        "capabilities": self.action_plugins.capabilities,
        "version": version(),
        "csr": self._crypto.generate_csr(self._fqdn),
    }

    # TODO: during registration, we should already have the authority certificate
    # so we should establish an HTTPS connection (no client cert) with the
    # manager, and verify that the manager's certificate is signed and for
    # an address matching self.url

    try:
        result = self.post(data)
    except HttpError:
        console_log.error("Registration failed to %s with request %s" %
                          (self.url, data))
        raise
    else:
        return result
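# A minimal sketch of the TODO above (an assumption, not the agent's actual
# flow): with the authority certificate already saved locally, `requests`
# can verify both the signature chain and the hostname in one call.
# `authority_path` is a hypothetical name for wherever Crypto stores the CA.
import requests

def _verify_manager_certificate(url, authority_path):
    # verify=<CA bundle path> makes requests check that the server cert
    # chains to our authority and matches the hostname in `url`; no client
    # certificate is presented.
    try:
        requests.get(url, verify=authority_path, timeout=30)
        return True
    except requests.exceptions.SSLError:
        return False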
def reregister_server(url, address):
    """
    Update the manager URL and re-register the agent's address with the manager
    """
    if _service_is_running():
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    config.set('settings', 'server', {'url': url})
    crypto = Crypto(config.path)
    agent_client = AgentClient(url + 'reregister/', ActionPluginManager(),
                               DevicePluginManager(), ServerProperties(),
                               crypto)
    data = {'address': address, 'fqdn': agent_client._fqdn}

    try:
        result = agent_client.post(data)
    except HttpError:
        console_log.error("Reregistration failed to %s with request %s" %
                          (agent_client.url, data))
        raise

    console_log.info("Starting chroma-agent service")
    agent_service.start()

    return result
def reregister_server(url, address):
    """
    Update the manager URL and re-register the agent's address with the manager
    """
    if _service_is_running():
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    conf.set_server_url(url)
    crypto = Crypto(conf.ENV_PATH)
    agent_client = AgentClient(
        url + "reregister/",
        ActionPluginManager(),
        DevicePluginManager(),
        ServerProperties(),
        crypto,
    )
    data = {"address": address, "fqdn": agent_client._fqdn}

    try:
        result = agent_client.post(data)
    except HttpError:
        console_log.error("Reregistration failed to %s with request %s" %
                          (agent_client.url, data))
        raise

    console_log.info("Starting chroma-agent service")
    agent_service.start()

    return result
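# Usage sketch with hypothetical values. Note that "reregister/" is appended
# directly to the URL, so the manager URL should end with a slash:
#
#   reregister_server("https://manager.example.com/agent/", "node01.example.com")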
def import_target(device_type, path, pacemaker_ha_operation, validate_importable=False):
    """
    Passed a device type and a path, import the device if such an operation
    makes sense. For example a JBOD SCSI disk has no concept of import,
    whilst zfs does.

    :param device_type: the type of device to import
    :param path: path of device to import
    :param pacemaker_ha_operation: This import is at the request of pacemaker.
           In HA operations the device may often not have been cleanly exported
           because the previously mounted node failed in operation.
    :param validate_importable: Make sure the device can be imported without
           leaving it imported: the device is imported and then exported again,
           checking for errors at each step.
    :return: None or an Error message
    """
    blockdevice = BlockDevice(device_type, path)

    error = blockdevice.import_(False)

    if error:
        if '-f' in error and pacemaker_ha_operation:
            # The error text hints that a forced import is required (the pool
            # was last used on another node and never exported); retry with
            # force, since pacemaker initiated this as part of failover.
            error = blockdevice.import_(True)

    if error:
        console_log.error("Error importing pool: '%s'" % error)

    if error is None and validate_importable:
        error = blockdevice.export()

        if error:
            console_log.error("Error exporting pool: '%s'" % error)

    return agent_ok_or_error(error)
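# Usage sketch with hypothetical values, assuming agent_ok_or_error() wraps
# failures as a dict with an "error" key (an assumption inferred from the
# simple return protocol, not confirmed here):
#
#   result = import_target("zfs", "pool1/MGS", pacemaker_ha_operation=True)
#   if "error" in result:
#       console_log.error("Failover import failed: %s" % result["error"])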
def _configure_target_ha(ha_label, info, enabled=False):
    if enabled:
        extra = []
    else:
        extra = ['--disabled']

    bdev = info['bdev']

    if info['device_type'] == 'zfs':
        extra += ['--group', _group_name(ha_label)]
        zpool = info['bdev'].split("/")[0]
        result = AgentShell.run([
            'pcs', 'resource', 'create',
            _zfs_name(ha_label), 'ocf:chroma:ZFS', 'pool={}'.format(zpool),
            'op', 'start', 'timeout=120', 'op', 'stop', 'timeout=90'
        ] + extra)
        if result.rc != 0:
            console_log.error("Resource (%s) create failed: %d: %s", zpool,
                              result.rc, result.stderr)
            return result
        if enabled and not _wait_target(_zfs_name(ha_label), True):
            return {
                "rc": -1,
                "stdout": "",
                "stderr": "ZFS Resource ({}) failed to start".format(
                    _zfs_name(ha_label))
            }
    else:
        # This is a hack for ocf:lustre:Lustre up to Lustre 2.10.5/2.11,
        # see LU-11461
        result = AgentShell.run(['realpath', info['bdev']])
        if result.rc == 0 and result.stdout.startswith('/dev/sd'):
            bdev = result.stdout.strip()

    # Create the Lustre resource, passing the backing device and mount point
    # as attributes
    result = AgentShell.run([
        'pcs', 'resource', 'create', ha_label, 'ocf:lustre:Lustre',
        'target={}'.format(bdev), 'mountpoint={}'.format(info['mntpt']),
        'op', 'start', 'timeout=600'
    ] + extra)

    if result.rc != 0 or (enabled and not _wait_target(ha_label, True)):
        if result.rc == 0:
            # pcs succeeded but the resource never started; synthesize a
            # failing result rather than mutating the one we were given
            result = AgentShell.RunResult(
                -1, result.stdout,
                "Resource ({}) failed to start".format(ha_label), False)

        console_log.error("Failed to create resource %s:%d: %s", ha_label,
                          result.rc, result.stderr)

        if info['device_type'] == 'zfs':
            AgentShell.run(['pcs', 'resource', 'delete', _zfs_name(ha_label)])

    return result
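# A hedged sketch of a polling helper in the spirit of _wait_target() above
# (hypothetical reconstruction; the real helper is defined elsewhere in the
# agent). `crm_resource --locate` reports where, and whether, a resource is
# running, so poll it until the resource reaches the desired state.
import time

def _wait_target_sketch(ha_label, started, timeout=100):
    for _ in range(timeout):
        result = AgentShell.run(
            ["crm_resource", "--resource", ha_label, "--locate"])
        running = result.rc == 0 and "is running on" in result.stdout
        if running == started:
            return True
        time.sleep(1)
    return False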
def scan_packages():
    """
    Interrogate the packages available from configured repositories, and
    the installation status of those packages.
    """

    # Look up what repos are configured
    # =================================
    if not os.path.exists(REPO_PATH):
        return None

    cp = ConfigParser.SafeConfigParser()
    cp.read(REPO_PATH)
    repo_names = sorted(cp.sections())
    repo_packages = dict([(name,
                           defaultdict(lambda: {
                               'available': [],
                               'installed': []
                           })) for name in repo_names])

    # Clear the metadata cache so the queries below see fresh repo data
    # =================================================================
    yum_util('clean', fromrepo=repo_names)

    # For all repos, query packages in alphabetical order
    # ===================================================
    for repo_name in repo_names:
        packages = repo_packages[repo_name]
        try:
            stdout = yum_util('repoquery', fromrepo=[repo_name])

            # An empty result means the repo contains no packages, so there
            # is no data to report for it.
            if stdout:
                for line in [l.strip() for l in stdout.strip().split("\n")]:
                    if line.startswith("Last metadata expiration check") or \
                       line.startswith("Waiting for process with pid"):
                        continue

                    epoch, name, version, release, arch = line.split()

                    if arch == "src":
                        continue

                    packages[name]['available'].append(
                        VersionInfo(epoch=epoch,
                                    version=version,
                                    release=release,
                                    arch=arch))
        except ValueError as e:
            console_log.error("bug HYD-2948. repoquery output: %s" % stdout)
            raise
        except RuntimeError as e:
            # This is a network operation, so cope with it failing
            daemon_log.error(e)
            return None
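# For illustration: with the queryformat the agent presumably passes to
# repoquery (an assumption, not confirmed here), each output line carries
# five whitespace-separated fields, e.g. for a hypothetical package:
#
#   0 kernel 3.10.0 957.el7 x86_64
#
# which the loop above unpacks as:
#
#   epoch, name, version, release, arch = \
#       "0 kernel 3.10.0 957.el7 x86_64".split()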
def export_target(device_type, path):
    """
    Passed a device type and a path, export the device if such an operation
    makes sense. For example a JBOD SCSI disk has no concept of export,
    whilst zfs does.

    :param device_type: the type of device to export
    :param path: path of device to export
    :return: None or an Error message
    """

    blockdevice = BlockDevice(device_type, path)

    error = blockdevice.export()

    if error:
        console_log.error("Error exporting pool: '%s'" % error)

    return agent_ok_or_error(error)
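# A hedged sketch of the simple return protocol that agent_ok_or_error()
# appears to implement (inferred from its callers here, not a confirmed
# definition): errors are wrapped under an "error" key, success under
# "result".
def agent_ok_or_error_sketch(error):
    return {"error": error} if error else {"result": None}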
def _configure_target_ha(ha_label, info, enabled=False):
    xmlid = ha_label
    res = _resource_xml(
        ha_label,
        "ocf:lustre:Lustre",
        {
            "target": info["bdev"],
            "mountpoint": info["mntpt"]
        },
    )

    if info["device_type"] == "zfs":
        xmlid = _group_name(ha_label)
        grp = ET.Element("group", {"id": xmlid})
        zpool = info["bdev"].split("/")[0]
        grp.append(
            _resource_xml(_zfs_name(ha_label), "ocf:chroma:ZFS",
                          {"pool": zpool}))
        grp.append(res)
        res = grp

    if not enabled:
        # Disabled resources are expressed as target_role=Stopped rather
        # than a pcs "--disabled" flag, since we write the CIB XML directly
        meta = ET.SubElement(res, "meta_attributes",
                             {"id": "{}-{}".format(xmlid, "meta_attributes")})
        _nvpair_xml(meta, "target_role", "Stopped")

    # Create the resource: a bare Lustre primitive, or a group wrapping
    # ZFS + Lustre, with the backing device and mount point as attributes
    result = cibcreate("resources", ET.tostring(res))

    if result.rc != 0 or (enabled and not _wait_target(ha_label, True)):
        if result.rc == 0:
            result = AgentShell.RunResult(
                -1, "", "Resource ({}) failed to start".format(ha_label),
                False)

        console_log.error("Failed to create resource %s:%d: %s", ha_label,
                          result.rc, result.stderr)

    return result
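# Hedged sketch of the XML helpers used above (hypothetical reconstructions;
# the real helpers live elsewhere in the agent). A pacemaker primitive takes
# its parameters as nvpairs inside an instance_attributes element:
import xml.etree.ElementTree as ET

def _nvpair_xml_sketch(parent, name, value):
    ET.SubElement(parent, "nvpair", {
        "id": "{}-{}".format(parent.get("id"), name),
        "name": name,
        "value": value,
    })

def _resource_xml_sketch(ha_label, agent, params):
    # agent is e.g. "ocf:lustre:Lustre", i.e. class:provider:type
    cls, provider, rtype = agent.split(":")
    res = ET.Element("primitive", {
        "id": ha_label,
        "class": cls,
        "provider": provider,
        "type": rtype,
    })
    attrs = ET.SubElement(
        res, "instance_attributes",
        {"id": "{}-instance_attributes".format(ha_label)})
    for name, value in params.items():
        _nvpair_xml_sketch(attrs, name, value)
    return res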
def import_target(device_type, path, pacemaker_ha_operation):
    """
    Passed a device type and a path, import the device if such an operation
    makes sense. For example a JBOD SCSI disk has no concept of import,
    whilst zfs does.

    :param device_type: the type of device to import
    :param path: path of device to import
    :param pacemaker_ha_operation: This import is at the request of pacemaker.
           In HA operations the device may often not have been cleanly exported
           because the previously mounted node failed in operation.
    :return: None or an Error message
    """
    blockdevice = BlockDevice(device_type, path)

    error = blockdevice.import_(False)

    if error:
        if '-f' in error and pacemaker_ha_operation:
            # The error text hints that a forced import is required; retry
            # with force, since pacemaker initiated this as part of failover.
            error = blockdevice.import_(True)

    if error:
        console_log.error("Error importing pool: '%s'" % error)

    return agent_ok_or_error(error)
def configure_target_ha(primary, device, ha_label, uuid, mount_point):
    """
    Configure the target for high availability

    :return: Value using simple return protocol
    """
    _mkdir_p_concurrent(mount_point)

    if primary:
        info = _get_target_config(uuid)

        # If the target already exists with the same params, skip.
        # If it already exists with different params, that is an error.
        if _resource_exists(ha_label):
            if info["bdev"] == device and info["mntpt"] == mount_point:
                return agent_result_ok
            return agent_error(
                "A resource with the name {} already exists".format(ha_label))

        if info["bdev"] != device or info["mntpt"] != mount_point:
            console_log.error(
                "Mismatch for %s: requested (%s on %s) does not match "
                "configured (%s on %s)",
                ha_label,
                device,
                mount_point,
                info["bdev"],
                info["mntpt"],
            )

        result = _configure_target_ha(ha_label, info, False)
        if result.rc != 0:
            return agent_error("Failed to create {}: {}".format(
                ha_label, result.rc))

    result = _configure_target_priority(primary, ha_label, _this_node())
    if result.rc != 0:
        return agent_error(
            "Failed to create location constraint on {}: {}".format(
                ha_label, result.rc))

    return agent_result_ok
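# A hedged sketch of what a concurrency-safe "mkdir -p" helper such as
# _mkdir_p_concurrent() above plausibly does (hypothetical reconstruction;
# the real helper is defined elsewhere in the agent):
import errno
import os

def _mkdir_p_concurrent_sketch(path):
    try:
        os.makedirs(path)
    except OSError as err:
        # Both failover peers may race to create the mount point; treat
        # "already exists" as success and re-raise anything else.
        if err.errno != errno.EEXIST:
            raise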
def _parse_dm_table(self, stdout):
    if stdout.strip() == "No devices found":
        dm_lines = []
    else:
        dm_lines = [i for i in stdout.split("\n") if len(i) > 0]

    # Compose a lookup of names of multipath devices, for use
    # in parsing other lines
    multipath_names = set()
    for line in dm_lines:
        tokens = line.split()
        name = tokens[0].strip(":")
        dm_type = tokens[3]
        if dm_type == 'multipath':
            multipath_names.add(name)

    def _read_lv(block_device, lv_name, vg_name, devices):
        self.lvs[vg_name][lv_name]['block_device'] = block_device

        devices = [
            self.block_devices.block_device_nodes[i]['major_minor']
            for i in devices
        ]
        self.vgs[vg_name]['pvs_major_minor'] = list(
            set(self.vgs[vg_name]['pvs_major_minor']) | set(devices))

    def _read_lv_partition(block_device, parent_lv_name, vg_name):
        # HYD-744: FIXME: compose path in a way that copes with hyphens
        parent_block_device = self.block_devices.node_block_devices[
            "%s/%s-%s" % (BlockDevices.MAPPERPATH, vg_name, parent_lv_name)]
        self.block_devices.block_device_nodes[block_device][
            'parent'] = parent_block_device

    def _read_mpath_partition(block_device, parent_mpath_name):
        # A non-LV partition
        parent_block_device = self.block_devices.node_block_devices[
            "%s/%s" % (BlockDevices.MAPPERPATH, parent_mpath_name)]
        self.block_devices.block_device_nodes[block_device][
            'parent'] = parent_block_device

    # Make a note of which VGs/LVs are in the table so that we can
    # filter out nonlocal LVM components.
    local_lvs = set()
    local_vgs = set()

    for line in dm_lines:
        tokens = line.split()
        name = tokens[0].strip(":")
        dm_type = tokens[3]

        node_path = os.path.join(BlockDevices.MAPPERPATH, name)
        block_device = self.block_devices.node_block_devices[node_path]

        if dm_type in ['linear', 'striped']:
            # This is either an LV or a partition.
            # Try to resolve its name to a known LV; if it is not found,
            # treat it as a partition.
            if dm_type == 'striped':
                # List of striped devices
                dev_indices = range(6, len(tokens), 2)
                devices = [tokens[i] for i in dev_indices]
            elif dm_type == 'linear':
                # Single device linear range
                devices = [tokens[4]]
            else:
                console_log.error("Failed to parse dmsetup line '%s'" % line)
                continue

            # To be an LV:
            #   Got to have a hyphen
            #   Got to appear in lvs dict
            # To be a partition:
            #   Got to match (.*)p\d+$
            #   Part preceding that pattern must be an LV or an mpath
            # Potentially confusing scenarios:
            #   A multipath device named foo-bar where there exists a VG called 'foo'
            #   An LV whose name ends "p1" like foo-lvp1
            # NB some scenarios may be as confusing for devicemapper as they
            # are for us, e.g. if someone creates an LV "bar" in a VG "foo",
            # and also an mpath called "foo-bar"

            # First, let's see if it's an LV or an LV partition
            match = re.search("(.*[^-])-([^-].*)", name)
            if match:
                vg_name, lv_name = match.groups()
                # When a name has a "-" in it, DM prints a double hyphen in
                # the output. So for an LV called "my-lv" you get
                # VolGroup00-my--lv
                vg_name = vg_name.replace("--", "-")
                lv_name = lv_name.replace("--", "-")
                try:
                    vg_lv_info = self.lvs[vg_name]
                    local_vgs.add(vg_name)
                except KeyError:
                    # Part before the hyphen is not a VG, so this can't be an LV
                    pass
                else:
                    if lv_name in vg_lv_info:
                        _read_lv(block_device, lv_name, vg_name, devices)
                        local_lvs.add(lv_name)
                        continue
                    else:
                        # It's not an LV, but it matched a VG: could it be
                        # an LV partition?
                        result = re.search("(.*)p\d+", lv_name)
                        if result:
                            lv_name = result.groups()[0]
                            if lv_name in vg_lv_info:
                                # This is an LV partition.
                                _read_lv_partition(block_device, lv_name,
                                                   vg_name)
                                local_lvs.add(lv_name)
                                continue
            else:
                # If it isn't an LV or an LV partition, see if it looks
                # like an mpath partition
                result = re.search("(.*)p\d+", name)
                if result:
                    mpath_name = result.groups()[0]
                    if mpath_name in multipath_names:
                        _read_mpath_partition(block_device, mpath_name)
                    else:
                        # Part before p\d+ is not an mpath, therefore not a
                        # multipath partition
                        pass
                else:
                    # No trailing p\d+, therefore not a partition
                    console_log.error(
                        "Cannot handle devicemapper device %s: it doesn't "
                        "look like an LV or a partition" % name)
        elif dm_type == 'multipath':
            if name in self.mpaths:
                raise RuntimeError("Duplicated mpath device %s" % name)

            major_minors = self._parse_multipath_params(tokens[4:])

            # Multipath devices might reference devices that no longer exist
            # (they may have been removed), so be careful about missing keys.
            devices = [
                self.block_devices.block_device_nodes[major_minor]
                for major_minor in major_minors
                if major_minor in self.block_devices.block_device_nodes
            ]

            # Add these devices to the canonical path list.
            for device in devices:
                ndp.add_normalized_device(
                    device['path'],
                    "%s/%s" % (BlockDevices.MAPPERPATH, name))

            self.mpaths[name] = {
                "name": name,
                "block_device": block_device,
                "nodes": devices
            }
        else:
            continue

    # Filter out nonlocal LVM components (HYD-2431)
    self.vgs = dict([(vg, value) for vg, value in self.vgs.items()
                     if vg in local_vgs])
    # self.lvs is keyed by VG name, so it is filtered on local_vgs as well
    self.lvs = dict([(vg, value) for vg, value in self.lvs.items()
                     if vg in local_vgs])
    for vg_name, vg_lvs in self.lvs.items():
        self.lvs[vg_name] = dict([(k, v)
                                  for k, v in self.lvs[vg_name].items()
                                  if k in local_lvs])
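# Worked example of the name parsing above: devicemapper escapes hyphens in
# VG/LV names by doubling them, so an LV "my-lv" in VG "VolGroup00" shows up
# as "VolGroup00-my--lv". The [^-] guards force the regex to split on the
# single (unescaped) hyphen:
#
#   >>> import re
#   >>> m = re.search("(.*[^-])-([^-].*)", "VolGroup00-my--lv")
#   >>> (m.group(1).replace("--", "-"), m.group(2).replace("--", "-"))
#   ('VolGroup00', 'my-lv')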
for res in dom.getElementsByTagName('primitive'):
    if not (res.getAttribute("provider") == "chroma"
            and res.getAttribute("type") == "Target"):
        continue

    ha_label = res.getAttribute("id")

    # _get_target_config() will raise KeyError if uuid doesn't exist locally
    # next() will raise StopIteration if it doesn't find attribute target
    try:
        info = next(
            _get_target_config(ops.getAttribute("value"))
            for ops in res.getElementsByTagName('nvpair')
            if ops.getAttribute("name") == "target")
    except Exception as err:
        console_log.error("No local info for resource %s: %s", ha_label, err)
        continue

    _unconfigure_target_priority(False, ha_label)
    _unconfigure_target_priority(True, ha_label)
    _unconfigure_target_ha(ha_label, info, True)

    _configure_target_ha(ha_label, info, (active.get(ha_label) is not None))
    _configure_target_priority(True, ha_label, locations[ha_label][0])
    _configure_target_priority(False, ha_label, locations[ha_label][1])
    wait_list.append([ha_label, (active.get(ha_label) is not None)])

# wait for each converted target to reach its expected state
for wait in wait_list:
    console_log.info("Waiting on %s", wait[0])
    _wait_target(*wait)
        # (target-role:Stopped) is new.
        if "target-role" in columns[2]:
            del columns[2]

        # and even newer pacemakers add a "(disabled)" to the end of the line:
        # MGS_e1321a (ocf::chroma:Target): Stopped (disabled)
        if columns[3] == "(disabled)":
            columns[3] = None

        # Similar to above, the third column can report one of various
        # states such as Starting, Started, Stopping, Stopped so only
        # consider targets which are Started.
        # If we still have 4 columns at this point, the third column
        # must be the state.
        if columns[2] not in ['Starting', 'Started', 'Stopping', 'Stopped']:
            console_log.error("Unable to determine state of %s in\n%s" %
                              (columns[0], lines_text))

        # A target that is "Stopping" has not completed the transition
        # from "Started" (i.e. running) to "Stopped", so count it as
        # running until it completes the transition.
        if columns[2] == "Started" or columns[2] == "Stopping":
            locations[columns[0]] = columns[3]
        else:
            locations[columns[0]] = None

    return locations


def check_block_device(path, device_type):
    """
    Precursor to formatting a device: check if there is already a filesystem on it.
def convert_targets(force=False):
    """
    Convert existing ocf:chroma:Target resources to ZFS + Lustre resources
    """
    try:
        result = AgentShell.run(["cibadmin", "--query"])
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise
        return {
            "crm_mon_error": {
                "rc": err.errno,
                "stdout": err.message,
                "stderr": err.strerror,
            }
        }

    if result.rc != 0:
        # Pacemaker not running, or no resources configured yet
        return {
            "crm_mon_error": {
                "rc": result.rc,
                "stdout": result.stdout,
                "stderr": result.stderr,
            }
        }

    dom = ET.fromstring(result.stdout)

    this_node = _this_node()

    # node elements are numbered from 1
    # dc-uuid is the node id of the Designated Controller (DC); only the DC
    # performs the conversion unless force is given
    dcuuid = next(
        (node.get("uname") for node in dom.findall(".//node")
         if node.get("id") == dom.get("dc-uuid")),
        "",
    )

    if dcuuid != this_node and not force:
        console_log.info("This is not the Pacemaker DC (%s); this is %s",
                         dcuuid, this_node)
        return

    # Build map of resource -> [ primary node, secondary node ]
    locations = {}
    for con in dom.findall(".//rsc_location"):
        ha_label = con.get("rsc")
        if not locations.get(ha_label):
            locations[ha_label] = {}
        if con.get("id") == _constraint(ha_label, True):
            ind = 0
        elif con.get("id") == _constraint(ha_label, False):
            ind = 1
        else:
            console_log.info("Unknown constraint: %s", con.get("id"))
            continue
        locations[ha_label][ind] = con.get("node")

    active = get_resource_locations()

    AgentShell.try_run([
        "crm_attribute",
        "--type",
        "crm_config",
        "--name",
        "maintenance-mode",
        "--update",
        "true",
    ])

    wait_list = []
    for res in dom.findall(".//primitive"):
        if not (res.get("provider") == "chroma"
                and res.get("type") == "Target"):
            continue

        ha_label = res.get("id")

        # _get_target_config() will raise KeyError if uuid doesn't exist locally
        # next() will raise StopIteration if it doesn't find attribute target
        try:
            info = next(
                _get_target_config(ops.get("value"))
                for ops in res.findall('.//nvpair[@name="target"]'))
        except Exception as err:
            console_log.error("No local info for resource %s: %s", ha_label,
                              err)
            continue

        _unconfigure_target_priority(False, ha_label)
        _unconfigure_target_priority(True, ha_label)
        _unconfigure_target_ha(ha_label, True)

        _configure_target_ha(ha_label, info,
                             (active.get(ha_label) is not None))
        _configure_target_priority(True, ha_label, locations[ha_label][0])
        _configure_target_priority(False, ha_label, locations[ha_label][1])
        wait_list.append([ha_label, (active.get(ha_label) is not None)])

    # wait for each converted target to reach its expected state
    for wait in wait_list:
        console_log.info("Waiting on %s", wait[0])
        _wait_target(*wait)

    AgentShell.try_run([
        "crm_attribute",
        "--type",
        "crm_config",
        "--name",
        "maintenance-mode",
        "--delete",
    ])
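# One possible refactoring (a sketch, not the agent's actual code): wrapping
# the maintenance-mode toggle above in a context manager would guarantee the
# cluster leaves maintenance mode even if a conversion step raises.
from contextlib import contextmanager

@contextmanager
def pacemaker_maintenance():
    AgentShell.try_run(["crm_attribute", "--type", "crm_config",
                        "--name", "maintenance-mode", "--update", "true"])
    try:
        yield
    finally:
        AgentShell.try_run(["crm_attribute", "--type", "crm_config",
                            "--name", "maintenance-mode", "--delete"])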