def _get_zpool_datasets(pool_name, drives):
    """Retrieve datasets belonging to a zpool.

    Shells out to ``zfs list`` and keeps only datasets nested under
    *pool_name*, keyed by their zfs guid.
    """
    listing = AgentShell.try_run(['zfs', 'list', '-H', '-o', 'name,avail,guid'])

    datasets = {}

    if listing.strip() == "no datasets available":
        return datasets

    prefix = "%s/" % pool_name

    for row in listing.split('\n'):
        if not row:
            continue

        name, size_str, uuid = row.split()
        size = util.human_to_bytes(size_str)

        if not name.startswith(prefix):
            continue

        # This will need discussion, but for now fabricate a major:minor.
        # Do we ever use them as numbers?
        major_minor = "zfsset:%s" % uuid

        datasets[uuid] = {
            "name": name,
            "path": name,
            "block_device": major_minor,
            "uuid": uuid,
            "size": size,
            "drives": drives
        }

        daemon_log.debug("zfs mount '%s'" % name)

    return datasets
def process_zfs_mount(device, data, zfs_mounts):
    """Resolve a zfs-backed lustre target to (label, uuid).

    Looks up the underlying pool in the zed structures and reads the
    nested dataset's ``lustre:svname`` property (the label) plus its
    guid.  Returns (None, None) if the device is not zfs.
    """
    dev_root = device.split("/")[0]

    if not dev_root:
        return None, None

    mounted_pools = [pool_name for pool_name, _, _ in zfs_mounts]
    if dev_root not in mounted_pools:
        daemon_log.debug("lustre device has no mounted zfs pool")
        # Deliberately fall through to the zed lookup below: the pool may
        # have canmount=off, and we do not have the pool properties here
        # to check for that.

    try:
        pool = next(p for p in data["zed"].values() if p["name"] == dev_root)
        dataset = next(d for d in pool["datasets"] if d["name"] == device)
        fs_label = next(
            prop["value"]
            for prop in dataset["props"]
            if prop["name"] == "lustre:svname"  # used to be fsname
        )
        return fs_label, dataset["guid"]
    except StopIteration:
        daemon_log.debug("lustre device is not zfs")
        return None, None
def process_zfs_mount(device, data, zfs_mounts):
    """Resolve a zfs-backed lustre target to (label, uuid, device path).

    Looks up the underlying pool in the zed structures to find the
    nested dataset's ``lustre:svname`` property (the label), its guid,
    and the path of one backing disk of the pool.
    """
    dev_root = device.split('/')[0]

    mounted_pools = [pool_name for pool_name, _, _ in zfs_mounts]
    if dev_root not in mounted_pools:
        daemon_log.debug('lustre device is not zfs')
        return None, None, None

    pool = next(p for p in data['zed'].values() if p['name'] == dev_root)
    dataset = next(d for d in pool['datasets'] if d['name'] == device)

    fs_label = next(
        prop['value']
        for prop in dataset['props']
        if prop['name'] == 'lustre:svname'  # used to be fsname
    )

    # note: this will be one of the many partitions that belong to the pool
    new_device = next(
        child['Disk']['path']
        for child in pool['vdev']['Root']['children']
        if child.get('Disk')
    )

    return fs_label, dataset['guid'], new_device
def poll(self, plugin_name):
    """
    For any plugins that don't have a session, try asking for one.
    For any ongoing sessions, invoke the poll callback
    """
    now = datetime.datetime.now()
    try:
        session = self._client.sessions.get(plugin_name)
    except KeyError:
        # No session yet: decide whether to (re-)request one, honouring
        # the per-plugin exponential backoff.
        # Request to open a session
        # if plugin_name in self._client.sessions._requested_at:
        # NOTE(review): assumes _requested_at and _backoffs always hold an
        # entry for plugin_name — confirm they are pre-seeded elsewhere,
        # otherwise this raises KeyError inside this except block.
        next_request_at = (
            self._client.sessions._requested_at[plugin_name] +
            self._client.sessions._backoffs[plugin_name])
        if now < next_request_at:
            # We're still in our backoff period, skip requesting a session
            daemon_log.debug("Delaying session request until %s" %
                             next_request_at)
            return
        else:
            # Double the backoff for the next attempt, up to the cap,
            # then record the request time and ask for a session.
            if (self._client.sessions._backoffs[plugin_name] <
                    MAX_SESSION_BACKOFF):
                self._client.sessions._backoffs[plugin_name] *= 2
            daemon_log.debug("Requesting session for plugin %s" %
                             plugin_name)
            self._client.sessions._requested_at[plugin_name] = now
            self.put(Message("SESSION_CREATE_REQUEST", plugin_name))
    else:
        try:
            data = session.poll()
        except Exception:
            # Plugin raised: log the full traceback, tear the session
            # down and ask for a fresh one.
            backtrace = "\n".join(
                traceback.format_exception(*(sys.exc_info())))
            daemon_log.error("Error in plugin %s: %s" %
                             (plugin_name, backtrace))
            self._client.sessions.terminate(plugin_name)
            self.put(Message("SESSION_CREATE_REQUEST", plugin_name))
        else:
            if data is not None:
                # Collections carry their own priority; plain payloads are
                # wrapped in a default-priority DevicePluginMessage.
                if isinstance(data, DevicePluginMessageCollection):
                    for message in data:
                        session.send_message(
                            DevicePluginMessage(message,
                                                priority=data.priority))
                elif isinstance(data, DevicePluginMessage):
                    session.send_message(data)
                else:
                    session.send_message(DevicePluginMessage(data))
def process_lvm_mount(device, data):
    """Resolve an lvm-backed lustre device to (label, lv uuid).

    :param device: device path as reported for the mount
    :param data: device-scan payload; ``blockDevices`` maps ids to dicts
        carrying ``paths`` and, for lvm volumes, ``lvUuid``
    :return: (filesystem label, lv uuid), or (None, None) when the
        device is not an lvm volume
    """
    try:
        # Find the block device whose path list contains this device and
        # that is lvm-backed (has an lvUuid).
        paths, lv_uuid = next(
            (v["paths"], v["lvUuid"])
            # .values() rather than the py2-only .itervalues(), so this
            # also runs on python 3 (identical behaviour on python 2).
            for v in data["blockDevices"].values()
            if device in v["paths"] and v.get("lvUuid"))
    except StopIteration:
        daemon_log.debug("lustre device is not lvm")
        return None, None

    label_prefix = "/dev/disk/by-label/"

    # The filesystem label is read off the by-label symlink among the
    # device's known paths.
    fs_label = next(
        p.split(label_prefix, 1)[1]
        for p in paths
        if p.startswith(label_prefix))

    return fs_label, lv_uuid
def find_device_and_children(device_path):
    """Return the normalized device paths nested under *device_path*.

    Then find all the partitions for that disk and add them, they are
    all a child of this zfs pool, so
    scsi-0QEMU_QEMU_HARDDISK_WD-WMAP3333333 includes
    scsi-0QEMU_QEMU_HARDDISK_WD-WMAP3333333-part1
    """
    found = []

    try:
        normalized = ndp.normalized_device_path(device_path)
        for dev in ndp.find_normalized_start(normalized):
            daemon_log.debug("zfs device '%s'" % dev)
            found.append(dev)
    except KeyError:
        # Unknown device: return whatever was collected (possibly
        # nothing) rather than failing.
        pass

    return found
def process_lvm_mount(device, data):
    """Resolve an lvm-backed lustre device to (label, lv uuid, device).

    :param device: device path as reported for the mount
    :param data: device-scan payload; ``blockDevices`` maps ids to dicts
        carrying ``paths`` and, for lvm volumes, ``lvUuid``
    :return: (filesystem label, lv uuid, None) — the third element is a
        replacement device path, which lvm never supplies — or
        (None, None, None) when the device is not an lvm volume
    """
    try:
        paths, lv_uuid = next(
            (v['paths'], v['lvUuid'])
            # .values() rather than the py2-only .itervalues(), so this
            # also runs on python 3 (identical behaviour on python 2).
            for v in data['blockDevices'].values()
            if device in v['paths'] and v.get('lvUuid')
        )
    except StopIteration:
        daemon_log.debug('lustre device is not lvm')
        return None, None, None

    label_prefix = '/dev/disk/by-label/'

    # The filesystem label is read off the by-label symlink among the
    # device's known paths.
    fs_label = next(
        p.split(label_prefix, 1)[1]
        for p in paths
        if p.startswith(label_prefix)
    )

    return fs_label, lv_uuid, None
def write_to_store(key, value):
    """
    Update a single key in the persistent zfs object store.

    Reads the current store contents, replaces only the given key, and
    writes the whole store back to ZFS_OBJECT_STORE_PATH as json.

    :param key: key to update value for store
    :param value: value to assign to given key
    :return: None
    """
    # (fixed: the old docstring documented a ``filepath`` parameter that
    # does not exist — the store path is the module-level constant.)
    daemon_log.debug('write_to_store(): writing zfs data to %s. key: %s' %
                     (ZFS_OBJECT_STORE_PATH, key))

    # preserve other keys, only update the key specified
    data_dict = read_store()
    data_dict[key] = value

    with open(ZFS_OBJECT_STORE_PATH, 'w') as f:
        f.write(json.dumps(data_dict))
def _get_zpool_datasets(self, pool_name, zpool_uuid, drives, block_devices):
    """Collect datasets nested under *pool_name* from ``zfs list``.

    For each dataset found, registers a fabricated block-device node in
    *block_devices* and primes the BlockDevice/FileSystem caches, then
    returns the datasets keyed by their zfs guid.
    """
    listing = AgentShell.try_run(
        ['zfs', 'list', '-H', '-o', 'name,avail,guid'])

    datasets = {}

    if listing.strip() == "no datasets available":
        return datasets

    prefix = "%s/" % pool_name

    for row in listing.split('\n'):
        if not row:
            continue

        name, size_str, uuid = row.split()
        size = util.human_to_bytes(size_str)

        if not name.startswith(prefix):
            continue

        # This will need discussion, but for now fabricate a major:minor.
        # Do we ever use them as numbers?
        major_minor = "zfsset:%s" % (len(self.datasets) + 1)

        block_devices.block_device_nodes[major_minor] = {
            'major_minor': major_minor,
            'path': name,
            'serial_80': None,
            'serial_83': None,
            'size': size,
            'filesystem_type': 'zfs',
            'parent': None
        }

        # Do this to cache the device, type see blockdevice and
        # filesystem for info.
        BlockDevice('zfs', name)
        FileSystem('zfs', name)

        datasets[uuid] = {
            "name": name,
            "path": name,
            "block_device": major_minor,
            "uuid": uuid,
            "size": size,
            "drives": drives
        }

        daemon_log.debug("zfs mount '%s'" % name)

    return datasets
def install_packages(repos, packages):
    """
    Explicitly evaluate and install or update any specific-version dependencies and satisfy even if
    that involves installing an older package than is already installed.
    Primary use case is installing lustre-modules, which depends on a specific kernel package.

    :param repos: List of strings, yum repo names
    :param packages: List of strings, yum package names
    :return: package report of the format given by the lustre device plugin
    """
    if packages:
        yum_util('clean')

        # Ask yum for the exact versioned requirements of the requested
        # packages and pin each one explicitly (e.g. the kernel that
        # lustre-modules was built against).
        out = yum_util('requires', enablerepo=repos, packages=packages)
        # raw string: '\)' is an invalid escape sequence in a plain literal
        requirement_re = re.compile(r"([^\)/]*) = (.*)")
        for requirement in [l.strip() for l in out.strip().split("\n")]:
            match = requirement_re.match(requirement)
            if match:
                require_package, require_version = match.groups()
                packages.append("%s-%s" % (require_package, require_version))

        yum_util('install', enablerepo=repos, packages=packages)

        # So now we have installed the packages requested, we will also make sure that any installed
        # packages we have that are already installed are updated to our presumably better versions.
        update_packages = yum_check_update(repos)

        if update_packages:
            daemon_log.debug(
                "The following packages need update after we installed IML packages %s"
                % update_packages)
            yum_util('update', packages=update_packages, enablerepo=repos)

    error = _check_HYD4050()

    if error:
        return agent_error(error)

    return agent_result(lustre.scan_packages())
def _dev_major_minor(self, path):
    """Return "major:minor" if *path* is a block device or a link to
    one, else None.

    stat is retried for a short window to ride out devices that vanish
    momentarily (e.g. during a partprobe reloading the partition
    table); for a path that was responding until now, the last stored
    stat result is used as a fallback.
    """
    file_status = None
    attempts_left = self.MAXRETRIES

    while attempts_left > 0:
        try:
            file_status = os.stat(path)
        except OSError as os_error:
            if os_error.errno not in [errno.ENOENT, errno.ENOTDIR]:
                raise
            # An OSError could be raised because a path genuinely doesn't
            # exist, but it also can be the result of conflicting with
            # actions that cause devices to disappear momentarily, such as
            # during a partprobe while it reloads the partition table.
            # So we retry for a short window to catch those devices that
            # just disappear momentarily.
            time.sleep(0.1)
            if path in self.non_existent_paths:
                # Already known to be absent: don't burn the whole window.
                attempts_left = 0
            else:
                attempts_left -= 1
        else:
            if path in self.non_existent_paths:
                self.non_existent_paths.discard(path)
                daemon_log.debug('New device started to respond %s' % path)
            self.previous_path_status[path] = file_status
            break

    if file_status is None:
        if path not in self.non_existent_paths:
            self.non_existent_paths.add(path)
            daemon_log.debug('New device failed to respond %s' % path)

        if path not in self.previous_path_status:
            return None

        file_status = self.previous_path_status.pop(path)
        daemon_log.debug(
            'Device failed to respond but stored file_status used')

    if not stat.S_ISBLK(file_status.st_mode):
        return None

    return "%d:%d" % (os.major(file_status.st_rdev),
                      os.minor(file_status.st_rdev))
def send(self):
    """Return True if the POST succeeds, else False"""
    messages = []
    completion_callbacks = []

    post_envelope = {
        "messages": [],
        "server_boot_time": self._client.boot_time.isoformat() + "Z",
        "client_start_time": self._client.start_time.isoformat() + "Z",
    }

    # Any message we drop will need its session killed
    kill_sessions = set()

    # Running estimate of the serialized POST body size.
    messages_bytes = len(json.dumps(post_envelope))

    # Drain the retry queue first, then the primary queue, packing
    # messages into this POST until it would exceed MAX_BYTES_PER_POST.
    while True:
        try:
            message = self._retry_messages.get_nowait()
            daemon_log.debug("HttpWriter got message from retry queue")
        except Queue.Empty:
            try:
                message = self._messages.get_nowait()
                daemon_log.debug(
                    "HttpWriter got message from primary queue")
            except Queue.Empty:
                break

        if message.callback:
            completion_callbacks.append(message.callback)

        message_length = len(json.dumps(message.dump(self._client._fqdn)))

        if message_length > MAX_BYTES_PER_POST:
            # NOTE(review): an oversized message is only logged here — when
            # it is the first message of the POST it is still appended and
            # sent below; confirm the server tolerates oversized
            # single-message POSTs.
            daemon_log.warning("Oversized message %s/%s: %s" % (
                message_length,
                MAX_BYTES_PER_POST,
                message.dump(self._client._fqdn),
            ))

        if messages and message_length > MAX_BYTES_PER_POST - messages_bytes:
            # This message will not fit into this POST: pop it back into the queue
            daemon_log.info(
                "HttpWriter message %s overflowed POST %s/%s (%d "
                "messages), enqueuing" % (
                    message.dump(self._client._fqdn),
                    message_length,
                    MAX_BYTES_PER_POST,
                    len(messages),
                ))
            self._retry_messages.put(message)
            break

        messages.append(message)
        messages_bytes += message_length

    daemon_log.debug("HttpWriter sending %s messages" % len(messages))

    try:
        post_envelope["messages"] = [
            m.dump(self._client._fqdn) for m in messages
        ]
        self._client.post(post_envelope)
    except HttpError:
        daemon_log.warning("HttpWriter: request failed")
        # Terminate any sessions which we've just dropped messages for
        for message in messages:
            if message.type == "DATA":
                kill_sessions.add(message.plugin_name)
        for plugin_name in kill_sessions:
            self._client.sessions.terminate(plugin_name)

        return False
    else:
        return True
    finally:
        # Completion callbacks fire whether or not the POST succeeded.
        for callback in completion_callbacks:
            callback()
def join(self): daemon_log.debug("Client joining...") # self.reader.join() self.writer.join() self.sessions.terminate_all() daemon_log.debug("Client joined")
def stop(self):
    """Signal the reader and writer to stop and set the stopped event."""
    daemon_log.debug("Client stopping...")
    self.reader.stop()
    self.writer.stop()
    self.stopped.set()