class LoadAverage(MonitorPlugin):
    """Plugin captures information about load average."""

    persist_name = "load-average"
    scope = "load"
    # Prevent the Plugin base-class from scheduling looping calls.
    run_interval = None

    def __init__(self, interval=15, monitor_interval=60 * 60,
                 create_time=time.time, get_load_average=os.getloadavg):
        self._interval = interval
        self._monitor_interval = monitor_interval
        self._create_time = create_time
        self._load_averages = []
        self._get_load_average = get_load_average

    def register(self, registry):
        super(LoadAverage, self).register(registry)
        self._accumulate = Accumulator(self._persist, registry.step_size)

        self.registry.reactor.call_every(self._interval, self.run)

        self._monitor = CoverageMonitor(self._interval, 0.8,
                                        "load average snapshot",
                                        create_time=self._create_time)
        self.registry.reactor.call_every(self._monitor_interval,
                                         self._monitor.log)
        self.registry.reactor.call_on("stop", self._monitor.log,
                                      priority=2000)
        self.call_on_accepted("load-average", self.send_message, True)

    def create_message(self):
        load_averages = self._load_averages
        self._load_averages = []
        return {"type": "load-average", "load-averages": load_averages}

    def exchange(self, urgent=False):
        self.registry.broker.call_if_accepted("load-average",
                                              self.send_message, urgent)

    def send_message(self, urgent=False):
        message = self.create_message()
        if len(message["load-averages"]):
            self.registry.broker.send_message(message, self._session_id,
                                              urgent=urgent)

    def run(self):
        self._monitor.ping()
        new_timestamp = int(self._create_time())
        new_load_average = self._get_load_average()[0]
        step_data = self._accumulate(new_timestamp, new_load_average,
                                     "accumulate")
        if step_data:
            self._load_averages.append(step_data)
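
# -- Illustrative sketch (not part of the original module) ------------------
# The Accumulator used by the plugins here is treated as a black box. Its
# contract, roughly: each call feeds it a (timestamp, value) sample plus a
# persist key, and it returns step data only when the sample crosses a step
# boundary, at which point the returned pair is (step_boundary_timestamp,
# time-weighted average over the step). SketchAccumulator is a hypothetical
# stand-in honouring that contract against a plain dict; the real class
# persists its partial sums through the plugin's persist object, so partial
# averages survive client restarts.

class SketchAccumulator(object):

    def __init__(self, persist, step_size):
        self._persist = persist  # A plain dict in this sketch.
        self._step_size = step_size

    def __call__(self, new_timestamp, new_value, key):
        previous_timestamp, accumulated = self._persist.get(key, (0, 0))
        previous_step = previous_timestamp // self._step_size
        new_step = new_timestamp // self._step_size
        step_boundary = new_step * self._step_size
        step_data = None
        if new_step == previous_step:
            # Still inside the current step: keep integrating.
            accumulated += new_value * (new_timestamp - previous_timestamp)
        elif new_step == previous_step + 1:
            # Crossed into the next step: emit the average for the step that
            # just closed, then start integrating the new one.
            accumulated += new_value * (step_boundary - previous_timestamp)
            step_data = (step_boundary,
                         accumulated / float(self._step_size))
            accumulated = new_value * (new_timestamp - step_boundary)
        else:
            # A gap longer than one step: drop the stale partial sum.
            accumulated = new_value * (new_timestamp - step_boundary)
        self._persist[key] = (new_timestamp, accumulated)
        return step_data


# With a 10-second step: a mid-step sample emits nothing, and the sample on
# the boundary emits the time-weighted average (1.0 for 5s, 3.0 for 5s).
acc = SketchAccumulator({}, step_size=10)
assert acc(5, 1.0, "accumulate") is None
assert acc(10, 3.0, "accumulate") == (10, 2.0)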
class MemoryInfo(MonitorPlugin):
    """Plugin captures information about free memory and free swap."""

    persist_name = "memory-info"
    scope = "memory"
    # Prevent the Plugin base-class from scheduling looping calls.
    run_interval = None

    def __init__(self, interval=15, monitor_interval=60 * 60,
                 source_filename="/proc/meminfo", create_time=time.time):
        self._interval = interval
        self._monitor_interval = monitor_interval
        self._source_filename = source_filename
        self._memory_info = []
        self._create_time = create_time

    def register(self, registry):
        super(MemoryInfo, self).register(registry)
        self._accumulate = Accumulator(self._persist, self.registry.step_size)

        self.registry.reactor.call_every(self._interval, self.run)

        self._monitor = CoverageMonitor(self._interval, 0.8,
                                        "memory/swap snapshot",
                                        create_time=self._create_time)
        self.registry.reactor.call_every(self._monitor_interval,
                                         self._monitor.log)
        self.registry.reactor.call_on("stop", self._monitor.log,
                                      priority=2000)
        self.call_on_accepted("memory-info", self.send_message, True)

    def create_message(self):
        memory_info = self._memory_info
        self._memory_info = []
        return {"type": "memory-info", "memory-info": memory_info}

    def send_message(self, urgent=False):
        message = self.create_message()
        if len(message["memory-info"]):
            self.registry.broker.send_message(message, self._session_id,
                                              urgent=urgent)

    def exchange(self, urgent=False):
        self.registry.broker.call_if_accepted("memory-info",
                                              self.send_message, urgent)

    def run(self):
        self._monitor.ping()
        new_timestamp = int(self._create_time())
        memstats = MemoryStats(self._source_filename)

        memory_step_data = self._accumulate(new_timestamp,
                                            memstats.free_memory,
                                            "accumulate-memory")
        swap_step_data = self._accumulate(new_timestamp, memstats.free_swap,
                                          "accumulate-swap")

        if memory_step_data and swap_step_data:
            timestamp = memory_step_data[0]
            free_memory = int(memory_step_data[1])
            free_swap = int(swap_step_data[1])
            self._memory_info.append((timestamp, free_memory, free_swap))
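
# -- Illustrative sketch (not part of the original module) ------------------
# MemoryStats (imported from landscape.lib.sysstats) is used above as a
# black box. A rough sketch of the /proc/meminfo parsing behind its
# free_memory/free_swap properties: values in the file are kilobyte counts,
# converted here to megabytes. Illustrative only; the real class may also
# fold buffers and page cache into "free" memory.

def sketch_free_memory_and_swap(meminfo_path="/proc/meminfo"):
    values = {}
    with open(meminfo_path) as meminfo:
        for line in meminfo:
            # Lines look like "MemFree:  123456 kB".
            name, separator, rest = line.partition(":")
            if separator:
                values[name.strip()] = int(rest.split()[0])  # kB
    return values["MemFree"] // 1024, values["SwapFree"] // 1024  # MB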
class SwiftUsage(MonitorPlugin):
    """Plugin reporting Swift cluster usage.

    This only works if the client runs on a Swift node. It requires the
    'python-swift' package to be installed (which is installed on Swift
    nodes).
    """

    persist_name = "swift-usage"
    scope = "storage"

    def __init__(self, interval=30, monitor_interval=60 * 60,
                 create_time=time.time,
                 swift_ring="/etc/swift/object.ring.gz"):
        self._interval = interval
        self._monitor_interval = monitor_interval
        self._create_time = create_time
        self._swift_ring = swift_ring  # To discover Recon host/port.
        self._has_swift = has_swift
        self._swift_usage_points = []
        self.active = True

    def register(self, registry):
        super(SwiftUsage, self).register(registry)
        self._accumulate = Accumulator(self._persist, self._interval)

        self._monitor = CoverageMonitor(self.run_interval, 0.8,
                                        "Swift device usage snapshot",
                                        create_time=self._create_time)
        self.registry.reactor.call_every(self._monitor_interval,
                                         self._monitor.log)
        self.registry.reactor.call_on("stop", self._monitor.log,
                                      priority=2000)
        self.call_on_accepted("swift-usage", self.send_message, True)

    def create_message(self):
        usage_points = self._swift_usage_points
        self._swift_usage_points = []
        if usage_points:
            return {"type": "swift-usage", "data-points": usage_points}

    def send_message(self, urgent=False):
        message = self.create_message()
        if message:
            self.registry.broker.send_message(message, self._session_id,
                                              urgent=urgent)

    def exchange(self, urgent=False):
        self.registry.broker.call_if_accepted("swift-usage",
                                              self.send_message, urgent)

    def run(self):
        if not self._should_run():
            return

        self._monitor.ping()
        host = self._get_recon_host()
        deferred = threads.deferToThread(self._perform_recon_call, host)
        deferred.addCallback(self._handle_usage)
        return deferred

    def _should_run(self):
        """Return whether the plugin should run."""
        if not self.active:
            return False

        if not self._has_swift:
            logging.info("This machine does not appear to be a Swift "
                         "machine. Deactivating plugin.")
            self.active = False
            return False

        # Check for the object ring config file. If it is not present, this
        # is not a Swift machine, or Swift is not yet set up.
        if not os.path.exists(self._swift_ring):
            return False

        return True

    def _get_recon_host(self):
        """Return a tuple with Recon (host, port)."""
        local_ips = self._get_local_ips()
        ring = Ring(self._swift_ring)
        for dev in ring.devs:
            if dev and dev["ip"] in local_ips:
                return dev["ip"], dev["port"]

    def _get_local_ips(self):
        """Return a list of IP addresses for local devices."""
        return [device["ip_address"] for device in get_active_device_info()]

    def _perform_recon_call(self, host):
        """Get usage information from the Swift Recon service."""
        if not host:
            return

        scout = Scout("diskusage")
        # Perform the actual call.
        scout_result = scout.scout(host)
        disk_usage = scout_result[1]
        status_code = scout_result[2]
        if status_code == 200:
            return disk_usage

    def _handle_usage(self, disk_usage):
        if disk_usage is None:
            # The recon call failed, most likely because Swift is not
            # responding.
            return

        timestamp = int(self._create_time())

        devices = set()
        for usage in disk_usage:
            if not usage["mounted"]:
                continue

            device = usage["device"]
            devices.add(device)
            step_values = []
            for key in ("size", "avail", "used"):
                # Store values in a tree so it's easy to delete all values
                # for a device.
                persist_key = "usage.%s.%s" % (device, key)
                step_value = self._accumulate(timestamp, usage[key],
                                              persist_key)
                step_values.append(step_value)

            if all(step_values):
                point = [step_value[0], device]  # Accumulated timestamp.
                point.extend(int(step_value[1]) for step_value in step_values)
                self._swift_usage_points.append(tuple(point))

        # Update the device list and remove usage for devices that no longer
        # exist.
        current_devices = set(self._persist.get("devices", ()))
        for device in current_devices - devices:
            self._persist.remove("usage.%s" % device)
        self._persist.set("devices", list(devices))
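
# -- Illustrative sketch (not part of the original module) ------------------
# The device pruning at the end of _handle_usage keeps persisted accumulator
# state from growing without bound when disks disappear. A minimal sketch of
# the same bookkeeping against a plain dict standing in for the persist
# object; the real persist is hierarchical, so removing "usage.<device>"
# drops the whole subtree in one call, whereas this flat stand-in has to
# delete the per-key entries itself.

def sketch_prune_devices(persist, seen_devices):
    known = set(persist.get("devices", ()))
    for device in known - set(seen_devices):
        stale = [k for k in persist if k.startswith("usage.%s." % device)]
        for key in stale:
            del persist[key]
    persist["devices"] = list(seen_devices)


persist = {"devices": ["sda", "sdb"],
           "usage.sda.size": (0, 0), "usage.sdb.size": (0, 0)}
sketch_prune_devices(persist, {"sda"})
assert "usage.sdb.size" not in persist and persist["devices"] == ["sda"]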
class MountInfo(MonitorPlugin):

    persist_name = "mount-info"
    scope = "disk"

    max_free_space_items_to_exchange = 200

    def __init__(self, interval=300, monitor_interval=60 * 60,
                 mounts_file="/proc/mounts", create_time=time.time,
                 statvfs=None, mtab_file="/etc/mtab"):
        self.run_interval = interval
        self._monitor_interval = monitor_interval
        self._create_time = create_time
        self._mounts_file = mounts_file
        self._mtab_file = mtab_file
        if statvfs is None:
            statvfs = os.statvfs
        self._statvfs = statvfs
        self._free_space = []
        self._mount_info = []
        self._mount_info_to_persist = None
        self.is_device_removable = is_device_removable

    def register(self, registry):
        super(MountInfo, self).register(registry)
        self._accumulate = Accumulator(self._persist, self.registry.step_size)
        self._monitor = CoverageMonitor(self.run_interval, 0.8,
                                        "mount info snapshot",
                                        create_time=self._create_time)
        self.registry.reactor.call_every(self._monitor_interval,
                                         self._monitor.log)
        self.registry.reactor.call_on("stop", self._monitor.log,
                                      priority=2000)
        self.call_on_accepted("mount-info", self.send_messages, True)

    def create_messages(self):
        return [message for message in
                [self.create_mount_info_message(),
                 self.create_free_space_message()]
                if message is not None]

    def create_mount_info_message(self):
        if self._mount_info:
            message = {"type": "mount-info", "mount-info": self._mount_info}
            self._mount_info_to_persist = self._mount_info[:]
            self._mount_info = []
            return message
        return None

    def create_free_space_message(self):
        if self._free_space:
            items_to_exchange = self._free_space[
                :self.max_free_space_items_to_exchange]
            message = {"type": "free-space", "free-space": items_to_exchange}
            self._free_space = self._free_space[
                self.max_free_space_items_to_exchange:]
            return message
        return None

    def send_messages(self, urgent=False):
        for message in self.create_messages():
            d = self.registry.broker.send_message(message, self._session_id,
                                                  urgent=urgent)
            if message["type"] == "mount-info":
                d.addCallback(lambda x: self.persist_mount_info())

    def exchange(self):
        self.registry.broker.call_if_accepted("mount-info",
                                              self.send_messages)

    def persist_mount_info(self):
        for timestamp, mount_info in self._mount_info_to_persist:
            mount_point = mount_info["mount-point"]
            self._persist.set(("mount-info", mount_point), mount_info)
        self._mount_info_to_persist = None
        # This forces the registry to write the persistent store to disk,
        # which means that the persistent data reflects the state of the
        # messages sent.
        self.registry.flush()

    def run(self):
        self._monitor.ping()
        now = int(self._create_time())
        current_mount_points = set()
        for mount_info in self._get_mount_info():
            mount_point = mount_info["mount-point"]
            free_space = mount_info.pop("free-space")

            key = ("accumulate-free-space", mount_point)
            step_data = self._accumulate(now, free_space, key)
            if step_data:
                timestamp = step_data[0]
                free_space = int(step_data[1])
                self._free_space.append((timestamp, mount_point, free_space))

            prev_mount_info = self._persist.get(("mount-info", mount_point))
            if not prev_mount_info or prev_mount_info != mount_info:
                if mount_info not in [m for t, m in self._mount_info]:
                    self._mount_info.append((now, mount_info))

            current_mount_points.add(mount_point)

    def _get_mount_info(self):
        """Generator yields local mount points worth recording data for."""
        bound_mount_points = self._get_bound_mount_points()

        for info in get_mount_info(self._mounts_file, self._statvfs):
            device = info["device"]
            mount_point = info["mount-point"]
            if (device.startswith("/dev/") and
                    not mount_point.startswith("/dev/") and
                    not self.is_device_removable(device) and
                    mount_point not in bound_mount_points):
                yield info

    def _get_bound_mount_points(self):
        """
        Return a set of mount points that have the "bind" option, by
        parsing /etc/mtab.
        """
        bound_points = set()
        if not self._mtab_file or not os.path.isfile(self._mtab_file):
            return bound_points

        with open(self._mtab_file, "r") as mtab:
            for line in mtab:
                try:
                    device, mount_point, filesystem, options = (
                        line.split()[:4])
                    mount_point = codecs.decode(mount_point, "unicode_escape")
                except ValueError:
                    continue
                if "bind" in options.split(","):
                    bound_points.add(mount_point)
        return bound_points
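
# -- Illustrative sketch (not part of the original module) ------------------
# /etc/mtab octal-escapes whitespace in mount points (a mount point with a
# space in it appears as "/mnt/my\040disk"), which is why
# _get_bound_mount_points decodes each field with "unicode_escape". The
# sample line below is made up for illustration:

import codecs

sample_line = "/dev/sdb1 /mnt/my\\040disk ext4 rw,bind 0 0"
device, mount_point, filesystem, options = sample_line.split()[:4]
mount_point = codecs.decode(mount_point, "unicode_escape")
assert mount_point == "/mnt/my disk"
assert "bind" in options.split(",")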
class Temperature(MonitorPlugin):
    """Capture thermal zone temperatures and trip point settings."""

    persist_name = "temperature"
    scope = "temperature"
    # Prevent the Plugin base-class from scheduling looping calls.
    run_interval = None

    def __init__(self, interval=30, monitor_interval=60 * 60,
                 thermal_zone_path=None, create_time=time.time):
        self.thermal_zone_path = thermal_zone_path
        self._interval = interval
        self._monitor_interval = monitor_interval
        self._create_time = create_time
        self._thermal_zones = []
        self._temperatures = {}

        for thermal_zone in get_thermal_zones(self.thermal_zone_path):
            self._thermal_zones.append(thermal_zone.name)
            self._temperatures[thermal_zone.name] = []

    def register(self, registry):
        super(Temperature, self).register(registry)
        if self._thermal_zones:
            self._accumulate = Accumulator(self._persist,
                                           self.registry.step_size)
            registry.reactor.call_every(self._interval, self.run)

            self._monitor = CoverageMonitor(self._interval, 0.8,
                                            "temperature snapshot",
                                            create_time=self._create_time)
            registry.reactor.call_every(self._monitor_interval,
                                        self._monitor.log)
            registry.reactor.call_on("stop", self._monitor.log,
                                     priority=2000)
            self.call_on_accepted("temperature", self.exchange, True)

    def create_messages(self):
        messages = []
        for zone in self._thermal_zones:
            temperatures = self._temperatures[zone]
            self._temperatures[zone] = []
            if not temperatures:
                continue
            messages.append({"type": "temperature",
                             "thermal-zone": zone,
                             "temperatures": temperatures})
        return messages

    def send_messages(self, urgent):
        for message in self.create_messages():
            self.registry.broker.send_message(message, self._session_id,
                                              urgent=urgent)

    def exchange(self, urgent=False):
        self.registry.broker.call_if_accepted("temperature",
                                              self.send_messages, urgent)

    def run(self):
        self._monitor.ping()
        now = int(self._create_time())
        for zone in get_thermal_zones(self.thermal_zone_path):
            if zone.temperature_value is not None:
                key = ("accumulate", zone.name)
                step_data = self._accumulate(now, zone.temperature_value, key)
                if step_data:
                    self._temperatures[zone.name].append(step_data)
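
# -- Illustrative sketch (not part of the original module) ------------------
# get_thermal_zones (from landscape.lib.sysstats) is used above as a black
# box. On a typical Linux system the same readings come from sysfs; a rough,
# illustrative equivalent is below (the real helper also exposes zone names
# and trip point settings as attributes):

import os

def sketch_zone_temperatures(base="/sys/class/thermal"):
    temperatures = {}
    if not os.path.isdir(base):
        return temperatures
    for name in sorted(os.listdir(base)):
        if not name.startswith("thermal_zone"):
            continue
        temp_file = os.path.join(base, name, "temp")
        try:
            with open(temp_file) as f:
                # The sysfs value is in millidegrees Celsius.
                temperatures[name] = int(f.read().strip()) / 1000.0
        except (IOError, OSError, ValueError):
            continue
    return temperatures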
class CoverageMonitorTest(ReactorHavingTest):

    def setUp(self):
        super(CoverageMonitorTest, self).setUp()
        self.monitor = CoverageMonitor(1, 1.0, "test",
                                       create_time=self.reactor.time)

    def test_warn(self):
        self.monitor.ping()
        self.reactor.advance(1)
        self.assertFalse(self.monitor.warn())
        self.reactor.advance(1)
        self.assertTrue(self.monitor.warn())
        self.monitor.reset()
        self.assertFalse(self.monitor.warn())

    def test_percent_no_data(self):
        """
        If no time has passed and the monitor hasn't received any pings
        it should return 100%.
        """
        self.assertEqual(self.monitor.percent, 1.0)

    def test_percent_no_expected_data(self):
        """
        If time < interval has passed and the monitor has received some
        pings, it should still return 100%.
        """
        monitor = CoverageMonitor(10, 1.0, "test",
                                  create_time=self.reactor.time)
        monitor.reset()
        self.reactor.advance(1)
        monitor.ping()
        self.assertEqual(monitor.percent, 1.0)

    def test_percent(self):
        self.reactor.advance(1)
        self.assertEqual(self.monitor.percent, 0.0)
        self.monitor.ping()
        self.reactor.advance(1)
        self.assertEqual(self.monitor.percent, 0.5)

    def test_percent_reset(self):
        self.reactor.advance(1)
        self.assertEqual(self.monitor.percent, 0.0)
        self.monitor.reset()
        self.monitor.ping()
        self.reactor.advance(1)
        self.assertEqual(self.monitor.percent, 1.0)

    def test_expected_count(self):
        self.reactor.advance(1)
        self.assertEqual(self.monitor.expected_count, 1.0)
        self.reactor.advance(1)
        self.assertEqual(self.monitor.expected_count, 2.0)

    def test_expected_count_reset(self):
        self.reactor.advance(1)
        self.assertEqual(self.monitor.expected_count, 1.0)
        self.monitor.reset()
        self.reactor.advance(1)
        self.assertEqual(self.monitor.expected_count, 1.0)

    def test_log(self):
        for i in range(100):
            self.monitor.ping()
            self.reactor.advance(1)
        self.monitor.log()
        self.assertTrue(
            "INFO: 100 of 100 expected test events (100.00%) "
            "occurred in the last 100.00s." in self.logfile.getvalue())

    def test_log_warning(self):
        for i in range(100):
            self.reactor.advance(1)
        self.monitor.log()
        self.assertTrue(
            "WARNING: 0 of 100 expected test events (0.00%) "
            "occurred in the last 100.00s." in self.logfile.getvalue())
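
# -- Illustrative sketch (not part of the original module) ------------------
# The tests above pin down CoverageMonitor's contract: expected_count is the
# elapsed time divided by the interval, percent is pings / expected_count
# (reported as 100% while less than one ping is expected), and warn() fires
# when percent drops below the configured threshold. SketchCoverageMonitor
# is a hypothetical minimal implementation satisfying those tests; the real
# class also formats the log lines checked by test_log/test_log_warning.

import time

class SketchCoverageMonitor(object):

    def __init__(self, interval, threshold, event_name,
                 create_time=time.time):
        self._interval = interval
        self._threshold = threshold
        self._event_name = event_name
        self._create_time = create_time
        self.reset()

    def reset(self):
        self._count = 0
        self._start_time = self._create_time()

    def ping(self):
        self._count += 1

    @property
    def expected_count(self):
        return (self._create_time() - self._start_time) / self._interval

    @property
    def percent(self):
        expected = self.expected_count
        if expected <= 0:
            return 1.0
        return min(self._count / float(expected), 1.0)

    def warn(self):
        return self.percent < self._threshold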
class CephUsage(MonitorPlugin):
    """
    Plugin that captures Ceph usage information.

    This only works if the client runs on one of the Ceph monitor nodes,
    and it is a no-op otherwise.

    The plugin requires the 'python-ceph' package to be installed, which is
    the case on a standard "ceph" charm deployment.

    The landscape-client charm should join a ceph-client relation with the
    ceph charm, which will create a keyring and config file for
    landscape-client to consume in
    <data_path>/ceph-client/ceph.landscape-client.conf. It contains the
    following:

        [global]
        auth supported = cephx
        keyring = <keyring-file>
        mon host = <ip>:6789

    The configured keyring can be generated with:

        ceph-authtool <keyring-file> --create-keyring
            --name=client.landscape-client --add-key=<key>
    """

    persist_name = "ceph-usage"
    scope = "storage"
    # Prevent the Plugin base-class from scheduling looping calls.
    run_interval = None

    def __init__(self, interval=30, monitor_interval=60 * 60,
                 create_time=time.time):
        self.active = True
        self._has_rados = has_rados
        self._interval = interval
        self._monitor_interval = monitor_interval
        self._ceph_usage_points = []
        self._ceph_ring_id = None
        self._create_time = create_time
        self._ceph_config = None

    def register(self, registry):
        super(CephUsage, self).register(registry)
        self._ceph_config = os.path.join(
            self.registry.config.data_path, "ceph-client",
            "ceph.landscape-client.conf")

        self._accumulate = Accumulator(self._persist, self._interval)
        self._monitor = CoverageMonitor(self._interval, 0.8,
                                        "Ceph usage snapshot",
                                        create_time=self._create_time)

        self.registry.reactor.call_every(self._interval, self.run)
        self.registry.reactor.call_every(self._monitor_interval,
                                         self._monitor.log)
        self.registry.reactor.call_on("stop", self._monitor.log,
                                      priority=2000)
        self.call_on_accepted("ceph-usage", self.send_message, True)

    def create_message(self):
        ceph_points = self._ceph_usage_points
        ring_id = self._ceph_ring_id
        self._ceph_usage_points = []
        return {"type": "ceph-usage",
                "ring-id": ring_id,
                "ceph-usages": [],  # For backwards-compatibility.
                "data-points": ceph_points}

    def send_message(self, urgent=False):
        message = self.create_message()
        if message["ring-id"] and message["data-points"]:
            self.registry.broker.send_message(message, self._session_id,
                                              urgent=urgent)

    def exchange(self, urgent=False):
        self.registry.broker.call_if_accepted("ceph-usage",
                                              self.send_message, urgent)

    def run(self):
        if not self._should_run():
            return

        self._monitor.ping()
        deferred = threads.deferToThread(self._perform_rados_call)
        deferred.addCallback(self._handle_usage)
        return deferred

    def _should_run(self):
        """Return whether or not this plugin should run."""
        if not self.active:
            return False

        if not self._has_rados:
            logging.info("This machine does not appear to be a Ceph machine. "
                         "Deactivating plugin.")
            self.active = False
            return False

        # Check if a Ceph config file is available. If it is not, this is
        # not a Ceph machine, or Ceph is not set up yet.
        if self._ceph_config is None or not os.path.exists(self._ceph_config):
            return False

        return True

    def _perform_rados_call(self):
        """The actual Rados interaction."""
        with Rados(conffile=self._ceph_config,
                   rados_id="landscape-client") as cluster:
            cluster_stats = cluster.get_cluster_stats()
            if self._ceph_ring_id is None:
                fsid = unicode(cluster.get_fsid(), "utf-8")
                self._ceph_ring_id = fsid
            return cluster_stats

    def _handle_usage(self, cluster_stats):
        """Parse Rados output and store the usage data in an accumulator.

        Used as a callback for the Rados interaction.
        """
        names_map = [("total", "kb"), ("avail", "kb_avail"),
                     ("used", "kb_used")]
        timestamp = int(self._create_time())

        step_values = []
        for name, key in names_map:
            value = cluster_stats[key] * 1024  # Report usage in bytes.
            step_value = self._accumulate(timestamp, value, "usage.%s" % name)
            step_values.append(step_value)

        if not all(step_values):
            return

        point = [step_value[0]]  # Accumulated timestamp.
        point.extend(int(step_value[1]) for step_value in step_values)
        self._ceph_usage_points.append(tuple(point))
class CPUUsage(MonitorPlugin):
    """
    Plugin that captures CPU usage information.
    """

    persist_name = "cpu-usage"
    scope = "cpu"
    # Prevent the Plugin base-class from scheduling looping calls.
    run_interval = None

    def __init__(self, interval=30, monitor_interval=60 * 60,
                 create_time=time.time):
        self._interval = interval
        self._monitor_interval = monitor_interval
        self._cpu_usage_points = []
        self._create_time = create_time
        self._stat_file = "/proc/stat"

    def register(self, registry):
        super(CPUUsage, self).register(registry)
        self._accumulate = Accumulator(self._persist, registry.step_size)

        self.registry.reactor.call_every(self._interval, self.run)

        self._monitor = CoverageMonitor(self._interval, 0.8,
                                        "CPU usage snapshot",
                                        create_time=self._create_time)
        self.registry.reactor.call_every(self._monitor_interval,
                                         self._monitor.log)
        self.registry.reactor.call_on("stop", self._monitor.log,
                                      priority=2000)
        self.call_on_accepted("cpu-usage", self.send_message, True)

    def create_message(self):
        cpu_points = self._cpu_usage_points
        self._cpu_usage_points = []
        return {"type": "cpu-usage", "cpu-usages": cpu_points}

    def send_message(self, urgent=False):
        message = self.create_message()
        if len(message["cpu-usages"]):
            self.registry.broker.send_message(message, self._session_id,
                                              urgent=urgent)

    def exchange(self, urgent=False):
        self.registry.broker.call_if_accepted("cpu-usage",
                                              self.send_message, urgent)

    def run(self):
        self._monitor.ping()
        new_timestamp = int(self._create_time())
        new_cpu_usage = self._get_cpu_usage(self._stat_file)
        step_data = None
        if new_cpu_usage is not None:
            step_data = self._accumulate(new_timestamp, new_cpu_usage,
                                         ACCUMULATOR_KEY)
        if step_data is not None:
            self._cpu_usage_points.append(step_data)

    def _get_cpu_usage(self, stat_file):
        """
        Compute the CPU usage from C{stat_file}.
        """
        result = None
        try:
            with open(stat_file, "r") as f:
                # The first line of the file is the CPU information
                # aggregated across cores.
                stat = f.readline()
        except IOError:
            logging.error("Could not open %s for reading, "
                          "CPU usage cannot be computed.", stat_file)
            return None

        # The cpu line is composed of:
        # ["cpu", user, nice, system, idle, iowait, irq, softirq, steal,
        #  guest, guest_nice]
        # The fields are a sum of USER_HZ quantums since boot spent in each
        # "category". We need to keep track of what the previous measure
        # was, since the current CPU usage is calculated on the delta
        # between the previous measure and the current measure.
        # split() also discards the trailing "\n".
        fields = stat.split()[1:]
        idle = int(fields[3])
        value = sum(int(i) for i in fields)

        previous = self._persist.get(LAST_MESURE_KEY)
        if previous is not None and value != previous[0]:
            delta = value - previous[0]
            if delta >= 0:
                result = (delta - idle + previous[1]) / float(delta)

        self._persist.set(LAST_MESURE_KEY, (value, idle))
        return result
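
# -- Illustrative sketch (not part of the original module) ------------------
# Worked example of the delta computation in _get_cpu_usage, using two
# made-up /proc/stat "cpu" samples. The helper below restates the same
# formula over the (total_jiffies, idle_jiffies) pairs that the plugin
# persists under LAST_MESURE_KEY:

def sketch_cpu_usage(previous, current):
    prev_total, prev_idle = previous
    total, idle = current
    delta = total - prev_total
    if delta <= 0:
        return None
    # Usage is the non-idle share of the jiffies elapsed between samples.
    return (delta - idle + prev_idle) / float(delta)


# First sample:  cpu 100 0 50 850  -> total=1000, idle=850
# Second sample: cpu 160 0 80 960  -> total=1200, idle=960
# delta=200, idle jiffies spent = 960 - 850 = 110, so usage is 90/200.
assert sketch_cpu_usage((1000, 850), (1200, 960)) == 0.45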