def check(self, instance):
    """Gather VM metrics for each libvirt domain on this hypervisor.

    For every active domain, publishes CPU utilization plus disk and
    network counter deltas (customer- and operations-facing variants),
    using a metric cache carried between collector runs to compute
    deltas, and finally publishes aggregate allocation gauges.

    :param instance: agent instance config, used to build base dimensions
    """
    # Load metric cache (prior counter samples for delta/rate calculation)
    metric_cache = self._load_metric_cache()
    # Load the nova-obtained instance data cache
    instance_cache = self._load_instance_cache()
    # Build dimensions for both the customer and for operations
    dims_base = self._set_dimensions({'service': 'compute',
                                      'component': 'vm'}, instance)
    # Define aggregate gauges, gauge name to metric name
    agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated',
                  'ram': 'nova.vm.mem.total_allocated_mb',
                  'disk': 'nova.vm.disk.total_allocated_gb'}
    agg_values = {gauge: 0 for gauge in agg_gauges}

    insp = inspector.get_hypervisor_inspector()
    for inst in insp._get_connection().listAllDomains():
        inst_name = inst.name()
        if inst.isActive() == 0:
            self.log.info("{0} is not active -- skipping.".format(inst_name))
            continue
        # Verify that this instance exists in the cache.  Add if necessary.
        if inst_name not in instance_cache:
            instance_cache = self._update_instance_cache()

        # FIX: nova can fail to know about a domain that libvirt can see
        # (a "ghost" VM).  instance_cache.get() then returns None and every
        # subscript below raised an unhandled TypeError, aborting the whole
        # check.  Log and skip the ghost instead.
        instance_data = instance_cache.get(inst_name)
        if instance_data is None:
            self.log.error("{0} is not known to nova after instance cache update -- skipping this ghost VM.".format(inst_name))
            continue

        if inst_name not in metric_cache:
            metric_cache[inst_name] = {}

        # Skip instances created within the probation period
        vm_probation_remaining = self._test_vm_probation(instance_data['created'])
        if vm_probation_remaining >= 0:
            self.log.info("Libvirt: {0} in probation for another {1} seconds".format(
                instance_data['hostname'], vm_probation_remaining))
            continue

        # Build customer dimensions
        dims_customer = dims_base.copy()
        dims_customer['resource_id'] = instance_data['instance_uuid']
        dims_customer['zone'] = instance_data['zone']
        # Add dimensions that would be helpful for operations
        dims_operations = dims_customer.copy()
        dims_operations['tenant_id'] = instance_data['tenant_id']
        dims_operations['cloud_tier'] = 'overcloud'

        # Accumulate aggregate data
        for gauge in agg_gauges:
            if gauge in instance_data:
                agg_values[gauge] += instance_data[gauge]

        # CPU utilization percentage
        sample_time = float("{:9f}".format(time.time()))
        # FIX: sample the CPU counter exactly once.  The original called
        # insp.inspect_cpus(inst) twice, so the value used for the rate and
        # the value saved to the cache were taken at different moments,
        # skewing the next run's delta.
        cpu_time = insp.inspect_cpus(inst).time
        if 'cpu.time' in metric_cache[inst_name]:
            # I have a prior value, so calculate the rate & push the metric
            cpu_diff = cpu_time - metric_cache[inst_name]['cpu.time']['value']
            time_diff = sample_time - float(metric_cache[inst_name]['cpu.time']['timestamp'])
            # Convert time_diff to nanoseconds, and calculate percentage
            rate = (cpu_diff / (time_diff * 1000000000)) * 100
            self.gauge('cpu.utilization_perc', int(round(rate, 0)),
                       dimensions=dims_customer,
                       delegated_tenant=instance_data['tenant_id'],
                       hostname=instance_data['hostname'])
            self.gauge('vm.cpu.utilization_perc', int(round(rate, 0)),
                       dimensions=dims_operations)
        metric_cache[inst_name]['cpu.time'] = {'timestamp': sample_time,
                                               'value': cpu_time}

        # Disk utilization
        for disk in insp.inspect_disks(inst):
            sample_time = time.time()
            disk_dimensions = {'device': disk[0].device}
            for metric in disk[1]._fields:
                metric_name = "io.{0}".format(metric)
                if metric_name not in metric_cache[inst_name]:
                    metric_cache[inst_name][metric_name] = {}
                value = int(getattr(disk[1], metric))
                if disk[0].device in metric_cache[inst_name][metric_name]:
                    cached = metric_cache[inst_name][metric_name][disk[0].device]
                    time_diff = sample_time - cached['timestamp']
                    val_diff = value - cached['value']
                    # NOTE(review): val_diff is published under a "_sec"
                    # name without dividing by time_diff -- correct only if
                    # the collection interval matches the rate unit; confirm
                    # intent before changing.
                    # Change the metric name to a rate, ie. "io.read_requests"
                    # gets converted to "io.read_ops_sec"
                    rate_name = "{0}_sec".format(metric_name.replace('requests', 'ops'))
                    # Customer
                    this_dimensions = disk_dimensions.copy()
                    this_dimensions.update(dims_customer)
                    self.gauge(rate_name, val_diff,
                               dimensions=this_dimensions,
                               delegated_tenant=instance_data['tenant_id'],
                               hostname=instance_data['hostname'])
                    # Operations (metric name prefixed with "vm.")
                    this_dimensions = disk_dimensions.copy()
                    this_dimensions.update(dims_operations)
                    self.gauge("vm.{0}".format(rate_name), val_diff,
                               dimensions=this_dimensions)
                # Save this metric to the cache
                metric_cache[inst_name][metric_name][disk[0].device] = {
                    'timestamp': sample_time,
                    'value': value}

        # Network utilization
        for vnic in insp.inspect_vnics(inst):
            sample_time = time.time()
            vnic_dimensions = {'device': vnic[0].name}
            for metric in vnic[1]._fields:
                metric_name = "net.{0}".format(metric)
                if metric_name not in metric_cache[inst_name]:
                    metric_cache[inst_name][metric_name] = {}
                value = int(getattr(vnic[1], metric))
                if vnic[0].name in metric_cache[inst_name][metric_name]:
                    cached = metric_cache[inst_name][metric_name][vnic[0].name]
                    time_diff = sample_time - cached['timestamp']
                    val_diff = value - cached['value']
                    # Change the metric name to a rate, ie. "net.rx_bytes"
                    # gets converted to "net.rx_bytes_sec"
                    rate_name = "{0}_sec".format(metric_name)
                    # Rename "tx" to "out" and "rx" to "in"
                    rate_name = rate_name.replace("tx", "out")
                    rate_name = rate_name.replace("rx", "in")
                    # Customer
                    this_dimensions = vnic_dimensions.copy()
                    this_dimensions.update(dims_customer)
                    self.gauge(rate_name, val_diff,
                               dimensions=this_dimensions,
                               delegated_tenant=instance_data['tenant_id'],
                               hostname=instance_data['hostname'])
                    # Operations (metric name prefixed with "vm.")
                    this_dimensions = vnic_dimensions.copy()
                    this_dimensions.update(dims_operations)
                    self.gauge("vm.{0}".format(rate_name), val_diff,
                               dimensions=this_dimensions)
                # Save this metric to the cache
                metric_cache[inst_name][metric_name][vnic[0].name] = {
                    'timestamp': sample_time,
                    'value': value}

    # Save these metrics for the next collector invocation
    self._update_metric_cache(metric_cache)

    # Publish aggregate metrics
    for gauge in agg_gauges:
        self.gauge(agg_gauges[gauge], agg_values[gauge], dimensions=dims_base)
def check(self, instance):
    """Gather VM metrics for each libvirt domain on this hypervisor.

    Resolves each domain against the nova instance cache, publishes
    state, CPU, disk, network, memory and ping-check metrics in both
    customer- and operations-facing variants, then publishes aggregate
    allocation gauges once per run.

    :param instance: agent instance config, used to build base dimensions
    """
    time_start = time.time()
    # Load metric cache (prior samples used by the _inspect_* helpers)
    metric_cache = self._load_metric_cache()
    # Load the nova-obtained instance data cache
    instance_cache = self._load_instance_cache()
    # Build dimensions for both the customer and for operations
    dims_base = self._set_dimensions({'service': 'compute',
                                      'component': 'vm'}, instance)
    # Define aggregate gauges, gauge name to metric name
    agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated',
                  'ram': 'nova.vm.mem.total_allocated_mb',
                  'disk': 'nova.vm.disk.total_allocated_gb'}
    agg_values = {gauge: 0 for gauge in agg_gauges}

    insp = inspector.get_hypervisor_inspector()
    for inst in insp._get_connection().listAllDomains():
        # Verify that this instance exists in the cache.  Add if necessary.
        inst_name = inst.name()
        if inst_name not in instance_cache:
            instance_cache = self._update_instance_cache()

        # Build customer dimensions
        try:
            dims_customer = dims_base.copy()
            dims_customer['resource_id'] = instance_cache.get(inst_name)['instance_uuid']
            dims_customer['zone'] = instance_cache.get(inst_name)['zone']
            # Add dimensions that would be helpful for operations
            dims_operations = dims_customer.copy()
            dims_operations['tenant_id'] = instance_cache.get(inst_name)['tenant_id']
            if self.init_config.get('metadata'):
                for metadata in self.init_config.get('metadata'):
                    metadata_value = instance_cache.get(inst_name).get(metadata)
                    if metadata_value:
                        dims_operations[metadata] = metadata_value
            # Remove customer 'hostname' dimension, this will be replaced
            # by the VM name
            del dims_customer['hostname']
        except TypeError:
            # Nova can potentially get into a state where it can't see an
            # instance, but libvirt can.  This would cause TypeErrors as
            # incomplete data is cached for this instance.  Log and skip.
            self.log.error("{0} is not known to nova after instance cache update -- skipping this ghost VM.".format(inst_name))
            continue

        # Accumulate aggregate data
        for gauge in agg_gauges:
            if gauge in instance_cache.get(inst_name):
                agg_values[gauge] += instance_cache.get(inst_name)[gauge]

        # Skip further processing on VMs that are not in an active state
        if self._inspect_state(insp, inst, instance_cache,
                               dims_customer, dims_operations) != 0:
            continue

        # Skip the remainder of the checks if alive_only is True in the config
        if self.init_config.get('alive_only'):
            continue

        # Skip instances created within the probation period
        vm_probation_remaining = self._test_vm_probation(
            instance_cache.get(inst_name)['created'])
        if vm_probation_remaining >= 0:
            self.log.info("Libvirt: {0} in probation for another {1} seconds".format(
                instance_cache.get(inst_name)['hostname'].encode('utf8'),
                vm_probation_remaining))
            continue

        if inst_name not in metric_cache:
            metric_cache[inst_name] = {}

        self._inspect_cpu(insp, inst, instance_cache, metric_cache,
                          dims_customer, dims_operations)
        self._inspect_disks(insp, inst, instance_cache, metric_cache,
                            dims_customer, dims_operations)
        self._inspect_network(insp, inst, instance_cache, metric_cache,
                              dims_customer, dims_operations)

        # Memory utilization
        # (req. balloon driver; Linux kernel param CONFIG_VIRTIO_BALLOON)
        try:
            # FIX: call memoryStats() once per VM (each call is a libvirt
            # RPC; the original issued eight), and report mem.total_mb as
            # 'available' -- the original published available-unused
            # (i.e. *used* memory) for both mem.total_mb and mem.used_mb.
            mem_stats = inst.memoryStats()
            mem_metrics = {
                'mem.free_mb': float(mem_stats['unused']) / 1024,
                'mem.swap_used_mb': float(mem_stats['swap_out']) / 1024,
                'mem.total_mb': float(mem_stats['available']) / 1024,
                'mem.used_mb': float(mem_stats['available'] -
                                     mem_stats['unused']) / 1024,
                'mem.free_perc': float(mem_stats['unused']) /
                float(mem_stats['available']) * 100}
            for name in mem_metrics:
                self.gauge(name, mem_metrics[name],
                           dimensions=dims_customer,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'])
                self.gauge("vm.{0}".format(name), mem_metrics[name],
                           dimensions=dims_operations)
        except KeyError:
            # memoryStats() lacks the balloon-driver keys on this guest
            self.log.debug("Balloon driver not active/available on guest {0} ({1})".format(inst_name, instance_cache.get(inst_name)['hostname']))

        # Test instance's remote responsiveness (ping check) if possible
        if self.init_config.get('ping_check') and 'network' in instance_cache.get(inst_name):
            for net in instance_cache.get(inst_name)['network']:
                # Substitute the network namespace into the configured
                # ping command template and append the target IP
                ping_cmd = self.init_config.get('ping_check').replace(
                    'NAMESPACE', net['namespace']).split()
                ping_cmd.append(net['ip'])
                dims_customer_ip = dims_customer.copy()
                dims_operations_ip = dims_operations.copy()
                dims_customer_ip['ip'] = net['ip']
                dims_operations_ip['ip'] = net['ip']
                with open(os.devnull, "w") as fnull:
                    try:
                        self.log.debug("Running ping test: {0}".format(' '.join(ping_cmd)))
                        res = subprocess.call(ping_cmd,
                                              stdout=fnull,
                                              stderr=fnull)
                        self.gauge('ping_status', res,
                                   dimensions=dims_customer_ip,
                                   delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                                   hostname=instance_cache.get(inst_name)['hostname'])
                        self.gauge('vm.ping_status', res,
                                   dimensions=dims_operations_ip)
                    except OSError as e:
                        self.log.warn("OS error running '{0}' returned {1}".format(ping_cmd, e))

    # Save these metrics for the next collector invocation, along with the
    # check duration (whole seconds, rounded up)
    self._update_metric_cache(metric_cache,
                              math.ceil(time.time() - time_start))

    # Publish aggregate metrics
    for gauge in agg_gauges:
        self.gauge(agg_gauges[gauge], agg_values[gauge], dimensions=dims_base)
def check(self, instance):
    """Gather VM metrics for each instance

    For every libvirt domain: resolve it against the nova instance
    cache, build customer/operations dimension sets, accumulate
    allocation aggregates, then run the individually-gated inspection
    helpers (state, CPU, disk, network), memory metrics and async ping
    checks.  Finishes by persisting the metric cache, publishing
    aggregates, and collecting ping results.
    """
    # Wall-clock start, reported to _update_metric_cache as check duration
    time_start = time.time()

    # Load metric cache (prior counter samples for the _inspect_* helpers)
    metric_cache = self._load_metric_cache()
    # Load the nova-obtained instance data cache
    instance_cache = self._load_instance_cache()
    # Build dimensions for both the customer and for operations
    dims_base = self._set_dimensions({'service': 'compute',
                                      'component': 'vm'}, instance)

    # Define aggregate gauges, gauge name to metric name
    agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated',
                  'ram': 'nova.vm.mem.total_allocated_mb',
                  'disk': 'nova.vm.disk.total_allocated_gb'}
    agg_values = {}
    for gauge in agg_gauges.keys():
        agg_values[gauge] = 0

    insp = inspector.get_hypervisor_inspector()
    updated_cache_this_time = False
    # Async ping results, gathered via self.pool and checked at the end
    ping_results = []
    for inst in insp._get_connection().listAllDomains():
        # Verify that this instance exists in the cache.  Add if necessary.
        inst_name = inst.name()
        if inst_name not in instance_cache and not updated_cache_this_time:
            #
            # If we have multiple ghost VMs, we'll needlessly
            # update the instance cache.  Let's limit the cache
            # update to once per agent wakeup.
            #
            updated_cache_this_time = True
            instance_cache = self._update_instance_cache()

        # Build customer dimensions
        try:
            dims_customer = dims_base.copy()
            dims_customer['resource_id'] = instance_cache.get(inst_name)['instance_uuid']
            dims_customer['zone'] = instance_cache.get(inst_name)['zone']
            # Add dimensions that would be helpful for operations
            dims_operations = dims_customer.copy()
            dims_operations['tenant_id'] = instance_cache.get(inst_name)['tenant_id']
            dims_operations = self._update_dims_with_metadata(instance_cache, inst_name, dims_operations)
            # Configured instance metadata keys are surfaced as customer
            # dimensions too, when present in the cache entry
            if self.init_config.get('customer_metadata'):
                for metadata in self.init_config.get('customer_metadata'):
                    metadata_value = (instance_cache.get(inst_name).get(metadata))
                    if metadata_value:
                        dims_customer[metadata] = metadata_value
            # Remove customer 'hostname' dimension, this will be replaced by the VM name
            del(dims_customer['hostname'])
        except TypeError:
            # Nova can potentially get into a state where it can't see an
            # instance, but libvirt can.  This would cause TypeErrors as
            # incomplete data is cached for this instance.  Log and skip.
            self.log.error("{0} is not known to nova after instance cache update -- skipping this ghost VM.".format(inst_name))
            continue

        # Accumulate aggregate data
        for gauge in agg_gauges:
            if gauge in instance_cache.get(inst_name):
                agg_values[gauge] += instance_cache.get(inst_name)[gauge]

        # Skip instances created within the probation period
        vm_probation_remaining = self._test_vm_probation(instance_cache.get(inst_name)['created'])
        if (vm_probation_remaining >= 0):
            self.log.info("Libvirt: {0} in probation for another {1} seconds".format(instance_cache.get(inst_name)['hostname'].encode('utf8'), vm_probation_remaining))
            continue

        # Skip further processing on VMs that are not in an active state
        if self._inspect_state(insp, inst, inst_name, instance_cache, dims_customer, dims_operations) != 0:
            continue

        # Skip the remainder of the checks if alive_only is True in the config
        if self.init_config.get('alive_only'):
            continue

        if inst_name not in metric_cache:
            metric_cache[inst_name] = {}

        # Each inspection family is gated by its own config flag; disk and
        # vnic additionally honor their per-family collection intervals
        if self.init_config.get('vm_cpu_check_enable'):
            self._inspect_cpu(insp, inst, inst_name, instance_cache, metric_cache, dims_customer, dims_operations)
        if not self._collect_intervals['disk']['skip']:
            if self.init_config.get('vm_disks_check_enable'):
                self._inspect_disks(insp, inst, inst_name, instance_cache, metric_cache, dims_customer, dims_operations)
            if self.init_config.get('vm_extended_disks_check_enable'):
                self._inspect_disk_info(insp, inst, inst_name, instance_cache, metric_cache, dims_customer, dims_operations)
        if not self._collect_intervals['vnic']['skip']:
            if self.init_config.get('vm_network_check_enable'):
                self._inspect_network(insp, inst, inst_name, instance_cache, metric_cache, dims_customer, dims_operations)

        # Memory utilization
        # (req. balloon driver; Linux kernel param CONFIG_VIRTIO_BALLOON)
        try:
            # Single memoryStats() call; raises KeyError below when the
            # balloon-driver keys ('unused', 'available', ...) are absent
            mem_stats = inst.memoryStats()
            mem_metrics = {'mem.free_mb': float(mem_stats['unused']) / 1024,
                           'mem.swap_used_mb': float(mem_stats['swap_out']) / 1024,
                           'mem.total_mb': float(mem_stats['available']) / 1024,
                           'mem.used_mb': float(mem_stats['available'] - mem_stats['unused']) / 1024,
                           'mem.free_perc': float(mem_stats['unused']) / float(mem_stats['available']) * 100}
            for name in mem_metrics:
                self.gauge(name, mem_metrics[name], dimensions=dims_customer,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'])
                self.gauge("vm.{0}".format(name), mem_metrics[name], dimensions=dims_operations)
            # Resident set size comes from the inspector, not the balloon
            memory_info = insp.inspect_memory_resident(inst)
            self.gauge('vm.mem.resident_mb', float(memory_info.resident), dimensions=dims_operations)
        except KeyError:
            self.log.debug("Balloon driver not active/available on guest {0} ({1})".format(inst_name, instance_cache.get(inst_name)['hostname']))

        # Test instance's remote responsiveness (ping check) if possible;
        # pings run asynchronously on the worker pool, results checked below
        if (self.init_config.get('vm_ping_check_enable')) and self.init_config.get('ping_check') and 'network' in instance_cache.get(inst_name):
            for net in instance_cache.get(inst_name)['network']:
                ping_args = [dims_customer, dims_operations, inst_name, instance_cache, net]
                ping_results.append(self.pool.apply_async(self._run_ping, ping_args))

    # Save these metrics for the next collector invocation
    self._update_metric_cache(metric_cache, math.ceil(time.time() - time_start))

    # Publish aggregate metrics
    for gauge in agg_gauges:
        self.gauge(agg_gauges[gauge], agg_values[gauge], dimensions=dims_base)

    # Check results of ping tests
    self._check_ping_results(ping_results)
def check(self, instance):
    """Gather VM metrics for each libvirt domain on this hypervisor.

    Resolves each domain against the nova instance cache, publishes
    state, CPU, disk, network, memory and ping-check metrics in both
    customer- and operations-facing variants, then publishes aggregate
    allocation gauges once per run.

    :param instance: agent instance config, used to build base dimensions
    """
    time_start = time.time()
    # Load metric cache (prior samples used by the _inspect_* helpers)
    metric_cache = self._load_metric_cache()
    # Load the nova-obtained instance data cache
    instance_cache = self._load_instance_cache()
    # Build dimensions for both the customer and for operations
    dims_base = self._set_dimensions({'service': 'compute',
                                      'component': 'vm'}, instance)
    # Define aggregate gauges, gauge name to metric name
    agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated',
                  'ram': 'nova.vm.mem.total_allocated_mb',
                  'disk': 'nova.vm.disk.total_allocated_gb'}
    agg_values = {gauge: 0 for gauge in agg_gauges}

    insp = inspector.get_hypervisor_inspector()
    for inst in insp._get_connection().listAllDomains():
        # Verify that this instance exists in the cache.  Add if necessary.
        inst_name = inst.name()
        if inst_name not in instance_cache:
            instance_cache = self._update_instance_cache()

        # Build customer dimensions
        try:
            dims_customer = dims_base.copy()
            dims_customer['resource_id'] = instance_cache.get(inst_name)['instance_uuid']
            dims_customer['zone'] = instance_cache.get(inst_name)['zone']
            # Add dimensions that would be helpful for operations
            dims_operations = dims_customer.copy()
            dims_operations['tenant_id'] = instance_cache.get(inst_name)['tenant_id']
            if self.init_config.get('metadata'):
                for metadata in self.init_config.get('metadata'):
                    metadata_value = instance_cache.get(inst_name).get(metadata)
                    if metadata_value:
                        dims_operations[metadata] = metadata_value
            # Remove customer 'hostname' dimension, this will be replaced
            # by the VM name
            del dims_customer['hostname']
        except TypeError:
            # Nova can potentially get into a state where it can't see an
            # instance, but libvirt can.  This would cause TypeErrors as
            # incomplete data is cached for this instance.  Log and skip.
            self.log.error("{0} is not known to nova after instance cache update -- skipping this ghost VM.".format(inst_name))
            continue

        # Accumulate aggregate data
        for gauge in agg_gauges:
            if gauge in instance_cache.get(inst_name):
                agg_values[gauge] += instance_cache.get(inst_name)[gauge]

        # Skip further processing on VMs that are not in an active state
        if self._inspect_state(insp, inst, instance_cache,
                               dims_customer, dims_operations) != 0:
            continue

        # Skip the remainder of the checks if alive_only is True in the config
        if self.init_config.get('alive_only'):
            continue

        # Skip instances created within the probation period
        vm_probation_remaining = self._test_vm_probation(
            instance_cache.get(inst_name)['created'])
        if vm_probation_remaining >= 0:
            self.log.info("Libvirt: {0} in probation for another {1} seconds".format(
                instance_cache.get(inst_name)['hostname'].encode('utf8'),
                vm_probation_remaining))
            continue

        if inst_name not in metric_cache:
            metric_cache[inst_name] = {}

        self._inspect_cpu(insp, inst, instance_cache, metric_cache,
                          dims_customer, dims_operations)
        self._inspect_disks(insp, inst, instance_cache, metric_cache,
                            dims_customer, dims_operations)
        self._inspect_network(insp, inst, instance_cache, metric_cache,
                              dims_customer, dims_operations)

        # Memory utilization
        # (req. balloon driver; Linux kernel param CONFIG_VIRTIO_BALLOON)
        try:
            # FIX: call memoryStats() once per VM (each call is a libvirt
            # RPC; the original issued eight), and report mem.total_mb as
            # 'available' -- the original published available-unused
            # (i.e. *used* memory) for both mem.total_mb and mem.used_mb.
            mem_stats = inst.memoryStats()
            mem_metrics = {
                'mem.free_mb': float(mem_stats['unused']) / 1024,
                'mem.swap_used_mb': float(mem_stats['swap_out']) / 1024,
                'mem.total_mb': float(mem_stats['available']) / 1024,
                'mem.used_mb': float(mem_stats['available'] -
                                     mem_stats['unused']) / 1024,
                'mem.free_perc': float(mem_stats['unused']) /
                float(mem_stats['available']) * 100}
            for name in mem_metrics:
                self.gauge(name, mem_metrics[name],
                           dimensions=dims_customer,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'])
                self.gauge("vm.{0}".format(name), mem_metrics[name],
                           dimensions=dims_operations)
        except KeyError:
            # memoryStats() lacks the balloon-driver keys on this guest
            self.log.debug("Balloon driver not active/available on guest {0} ({1})".format(inst_name, instance_cache.get(inst_name)['hostname']))

        # Test instance's remote responsiveness (ping check) if possible
        if self.init_config.get('ping_check') and 'network' in instance_cache.get(inst_name):
            for net in instance_cache.get(inst_name)['network']:
                # Substitute the network namespace into the configured
                # ping command template and append the target IP
                ping_cmd = self.init_config.get('ping_check').replace(
                    'NAMESPACE', net['namespace']).split()
                ping_cmd.append(net['ip'])
                dims_customer_ip = dims_customer.copy()
                dims_operations_ip = dims_operations.copy()
                dims_customer_ip['ip'] = net['ip']
                dims_operations_ip['ip'] = net['ip']
                with open(os.devnull, "w") as fnull:
                    try:
                        self.log.debug("Running ping test: {0}".format(' '.join(ping_cmd)))
                        res = subprocess.call(ping_cmd,
                                              stdout=fnull,
                                              stderr=fnull)
                        self.gauge('ping_status', res,
                                   dimensions=dims_customer_ip,
                                   delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                                   hostname=instance_cache.get(inst_name)['hostname'])
                        self.gauge('vm.ping_status', res,
                                   dimensions=dims_operations_ip)
                    except OSError as e:
                        self.log.warn("OS error running '{0}' returned {1}".format(ping_cmd, e))

    # Save these metrics for the next collector invocation, along with the
    # check duration (whole seconds, rounded up)
    self._update_metric_cache(metric_cache,
                              math.ceil(time.time() - time_start))

    # Publish aggregate metrics
    for gauge in agg_gauges:
        self.gauge(agg_gauges[gauge], agg_values[gauge], dimensions=dims_base)
def check(self, instance):
    """Gather VM metrics for each libvirt domain on this hypervisor.

    Publishes host_alive_status (state + optional ping), CPU utilization,
    disk/network counter deltas and balloon-driver memory metrics in both
    customer- and operations-facing variants, then publishes aggregate
    allocation gauges once per run.

    :param instance: agent instance config, used to build base dimensions
    """
    # Load metric cache (prior counter samples for delta calculation)
    metric_cache = self._load_metric_cache()
    # Load the nova-obtained instance data cache
    instance_cache = self._load_instance_cache()
    # Build dimensions for both the customer and for operations
    dims_base = self._set_dimensions({'service': 'compute',
                                      'component': 'vm'}, instance)
    # Define aggregate gauges, gauge name to metric name
    agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated',
                  'ram': 'nova.vm.mem.total_allocated_mb',
                  'disk': 'nova.vm.disk.total_allocated_gb'}
    agg_values = {gauge: 0 for gauge in agg_gauges}

    insp = inspector.get_hypervisor_inspector()
    for inst in insp._get_connection().listAllDomains():
        # Verify that this instance exists in the cache.  Add if necessary.
        inst_name = inst.name()
        if inst_name not in instance_cache:
            instance_cache = self._update_instance_cache()

        # Build customer dimensions
        try:
            dims_customer = dims_base.copy()
            dims_customer['resource_id'] = instance_cache.get(inst_name)['instance_uuid']
            dims_customer['zone'] = instance_cache.get(inst_name)['zone']
            # Add dimensions that would be helpful for operations
            dims_operations = dims_customer.copy()
            dims_operations['tenant_id'] = instance_cache.get(inst_name)['tenant_id']
            # Remove customer 'hostname' dimension, this will be replaced
            # by the VM name
            del dims_customer['hostname']
        except TypeError:
            # Nova can potentially get into a state where it can't see an
            # instance, but libvirt can.  This would cause TypeErrors as
            # incomplete data is cached for this instance.  Log and skip.
            self.log.error("{0} is not known to nova after instance cache update -- skipping this ghost VM.".format(inst_name))
            continue

        # Skip instances that are inactive (status 2 = not active)
        if inst.isActive() == 0:
            detail = 'Instance is not active'
            self.gauge('host_alive_status', 2, dimensions=dims_customer,
                       delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                       hostname=instance_cache.get(inst_name)['hostname'],
                       value_meta={'detail': detail})
            self.gauge('vm.host_alive_status', 2, dimensions=dims_operations,
                       value_meta={'detail': detail})
            continue

        if inst_name not in metric_cache:
            metric_cache[inst_name] = {}

        # Skip instances created within the probation period
        vm_probation_remaining = self._test_vm_probation(
            instance_cache.get(inst_name)['created'])
        if vm_probation_remaining >= 0:
            self.log.info("Libvirt: {0} in probation for another {1} seconds".format(
                instance_cache.get(inst_name)['hostname'].encode('utf8'),
                vm_probation_remaining))
            continue

        # Test instance's general responsiveness (ping check) if so configured
        if self.init_config.get('ping_check') and 'private_ip' in instance_cache.get(inst_name):
            detail = 'Ping check OK'
            ping_cmd = self.init_config.get('ping_check').split()
            ping_cmd.append(instance_cache.get(inst_name)['private_ip'])
            with open(os.devnull, "w") as fnull:
                try:
                    res = subprocess.call(ping_cmd,
                                          stdout=fnull,
                                          stderr=fnull)
                    if res > 0:
                        detail = 'Host failed ping check'
                    self.gauge('host_alive_status', res, dimensions=dims_customer,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'],
                               value_meta={'detail': detail})
                    self.gauge('vm.host_alive_status', res,
                               dimensions=dims_operations,
                               value_meta={'detail': detail})
                    # Do not attempt to process any more metrics for offline hosts
                    if res > 0:
                        continue
                except OSError as e:
                    self.log.warn("OS error running '{0}' returned {1}".format(ping_cmd, e))

        # Accumulate aggregate data
        for gauge in agg_gauges:
            if gauge in instance_cache.get(inst_name):
                agg_values[gauge] += instance_cache.get(inst_name)[gauge]

        # CPU utilization percentage
        sample_time = float("{:9f}".format(time.time()))
        # FIX: sample the CPU counter exactly once.  The original called
        # insp.inspect_cpus(inst) twice, so the value used for the rate and
        # the value saved to the cache were taken at different moments,
        # skewing the next run's delta.
        cpu_time = insp.inspect_cpus(inst).time
        if 'cpu.time' in metric_cache[inst_name]:
            # I have a prior value, so calculate the rate & push the metric
            cpu_diff = cpu_time - metric_cache[inst_name]['cpu.time']['value']
            time_diff = sample_time - float(metric_cache[inst_name]['cpu.time']['timestamp'])
            # Convert time_diff to nanoseconds, and calculate percentage
            rate = (cpu_diff / (time_diff * 1000000000)) * 100
            self.gauge('cpu.utilization_perc', int(round(rate, 0)),
                       dimensions=dims_customer,
                       delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                       hostname=instance_cache.get(inst_name)['hostname'])
            self.gauge('vm.cpu.utilization_perc', int(round(rate, 0)),
                       dimensions=dims_operations)
        metric_cache[inst_name]['cpu.time'] = {'timestamp': sample_time,
                                               'value': cpu_time}

        # Disk activity
        for disk in insp.inspect_disks(inst):
            sample_time = time.time()
            disk_dimensions = {'device': disk[0].device}
            for metric in disk[1]._fields:
                metric_name = "io.{0}".format(metric)
                if metric_name not in metric_cache[inst_name]:
                    metric_cache[inst_name][metric_name] = {}
                value = int(getattr(disk[1], metric))
                if disk[0].device in metric_cache[inst_name][metric_name]:
                    cached = metric_cache[inst_name][metric_name][disk[0].device]
                    time_diff = sample_time - cached['timestamp']
                    val_diff = value - cached['value']
                    # NOTE(review): val_diff is published under a "_sec"
                    # name without dividing by time_diff -- confirm the
                    # collection interval matches the rate unit.
                    # Change the metric name to a rate, ie. "io.read_requests"
                    # gets converted to "io.read_ops_sec"
                    rate_name = "{0}_sec".format(metric_name.replace('requests', 'ops'))
                    # Customer
                    this_dimensions = disk_dimensions.copy()
                    this_dimensions.update(dims_customer)
                    self.gauge(rate_name, val_diff,
                               dimensions=this_dimensions,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'])
                    # Operations (metric name prefixed with "vm.")
                    this_dimensions = disk_dimensions.copy()
                    this_dimensions.update(dims_operations)
                    self.gauge("vm.{0}".format(rate_name), val_diff,
                               dimensions=this_dimensions)
                # Save this metric to the cache
                metric_cache[inst_name][metric_name][disk[0].device] = {
                    'timestamp': sample_time,
                    'value': value}

        # Memory utilization
        # (req. balloon driver; Linux kernel param CONFIG_VIRTIO_BALLOON)
        try:
            # FIX: call memoryStats() once per VM (each call is a libvirt
            # RPC; the original issued eight), and report mem.total_mb as
            # 'available' -- the original published available-unused
            # (i.e. *used* memory) for both mem.total_mb and mem.used_mb.
            mem_stats = inst.memoryStats()
            mem_metrics = {
                'mem.free_mb': float(mem_stats['unused']) / 1024,
                'mem.swap_used_mb': float(mem_stats['swap_out']) / 1024,
                'mem.total_mb': float(mem_stats['available']) / 1024,
                'mem.used_mb': float(mem_stats['available'] -
                                     mem_stats['unused']) / 1024,
                'mem.free_perc': float(mem_stats['unused']) /
                float(mem_stats['available']) * 100}
            for name in mem_metrics:
                self.gauge(name, mem_metrics[name],
                           dimensions=dims_customer,
                           delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                           hostname=instance_cache.get(inst_name)['hostname'])
                self.gauge("vm.{0}".format(name), mem_metrics[name],
                           dimensions=dims_operations)
        except KeyError:
            # memoryStats() lacks the balloon-driver keys on this guest
            self.log.debug("Balloon driver not active/available on guest {0} ({1})".format(inst_name, instance_cache.get(inst_name)['hostname']))

        # Network activity
        for vnic in insp.inspect_vnics(inst):
            sample_time = time.time()
            vnic_dimensions = {'device': vnic[0].name}
            for metric in vnic[1]._fields:
                metric_name = "net.{0}".format(metric)
                if metric_name not in metric_cache[inst_name]:
                    metric_cache[inst_name][metric_name] = {}
                value = int(getattr(vnic[1], metric))
                if vnic[0].name in metric_cache[inst_name][metric_name]:
                    cached = metric_cache[inst_name][metric_name][vnic[0].name]
                    time_diff = sample_time - cached['timestamp']
                    val_diff = value - cached['value']
                    # Change the metric name to a rate, ie. "net.rx_bytes"
                    # gets converted to "net.rx_bytes_sec"
                    rate_name = "{0}_sec".format(metric_name)
                    # Rename "tx" to "out" and "rx" to "in"
                    rate_name = rate_name.replace("tx", "out")
                    rate_name = rate_name.replace("rx", "in")
                    # Customer
                    this_dimensions = vnic_dimensions.copy()
                    this_dimensions.update(dims_customer)
                    self.gauge(rate_name, val_diff,
                               dimensions=this_dimensions,
                               delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
                               hostname=instance_cache.get(inst_name)['hostname'])
                    # Operations (metric name prefixed with "vm.")
                    this_dimensions = vnic_dimensions.copy()
                    this_dimensions.update(dims_operations)
                    self.gauge("vm.{0}".format(rate_name), val_diff,
                               dimensions=this_dimensions)
                # Save this metric to the cache
                metric_cache[inst_name][metric_name][vnic[0].name] = {
                    'timestamp': sample_time,
                    'value': value}

    # Save these metrics for the next collector invocation
    self._update_metric_cache(metric_cache)

    # Publish aggregate metrics
    for gauge in agg_gauges:
        self.gauge(agg_gauges[gauge], agg_values[gauge], dimensions=dims_base)
def check(self, instance):
    """Gather VM metrics for each instance on this hypervisor.

    Publishes CPU utilization, disk I/O rates, and network I/O rates for
    every instance the hypervisor inspector reports.  Each metric is
    emitted twice: once under customer dimensions (delegated to the
    owning tenant) and once under operations dimensions (metric name
    prefixed with "vm.").  Rate metrics need a prior sample, so the
    first run for an instance only seeds the metric cache.
    """
    # Load metric cache (prior samples used to compute rates)
    metric_cache = self._load_metric_cache()
    # Load the nova-obtained instance data cache
    instance_cache = self._load_instance_cache()
    # Build dimensions for both the customer and for operations
    dims_base = self._set_dimensions({'service': 'compute',
                                      'component': 'vm'}, instance)

    insp = inspector.get_hypervisor_inspector()
    for inst in insp.inspect_instances():
        # Verify that this instance exists in the cache.  Add if necessary.
        if inst.name not in instance_cache:
            instance_cache = self._update_instance_cache()
        if inst.name not in metric_cache:
            metric_cache[inst.name] = {}

        # Bind the nova metadata for this instance once instead of
        # re-fetching it from the cache for every dimension/gauge below.
        instance_info = instance_cache.get(inst.name)

        # Skip instances created within the probation period
        vm_probation_remaining = self._test_vm_probation(
            instance_info['created'])
        if vm_probation_remaining >= 0:
            self.log.info("Libvirt: {0} in probation for another "
                          "{1} seconds".format(instance_info['hostname'],
                                               vm_probation_remaining))
            continue

        # Build customer dimensions
        dims_customer = dims_base.copy()
        dims_customer['resource_id'] = instance_info['instance_uuid']
        dims_customer['zone'] = instance_info['zone']
        # Add dimensions that would be helpful for operations
        dims_operations = dims_customer.copy()
        dims_operations['tenant_id'] = instance_info['tenant_id']
        dims_operations['cloud_tier'] = 'overcloud'

        # CPU utilization percentage
        sample_time = float("{:9f}".format(time.time()))
        # Read the cumulative CPU time once so the published rate and the
        # cached value come from the same sample (previously a second,
        # slightly later reading was cached, skewing the next rate).
        cpu_time = insp.inspect_cpus(inst.name).time
        if 'cpu.time' in metric_cache[inst.name]:
            # I have a prior value, so calculate the rate & push the metric
            cpu_diff = cpu_time - metric_cache[inst.name]['cpu.time']['value']
            time_diff = sample_time - float(
                metric_cache[inst.name]['cpu.time']['timestamp'])
            # Convert time_diff to nanoseconds, and calculate percentage
            rate = (cpu_diff / (time_diff * 1000000000)) * 100
            self.gauge('cpu.utilization_perc', int(round(rate, 0)),
                       dimensions=dims_customer,
                       delegated_tenant=instance_info['tenant_id'],
                       hostname=instance_info['hostname'])
            self.gauge('vm.cpu.utilization_perc', int(round(rate, 0)),
                       dimensions=dims_operations)
        metric_cache[inst.name]['cpu.time'] = {'timestamp': sample_time,
                                               'value': cpu_time}

        # Disk utilization
        for disk in insp.inspect_disks(inst.name):
            sample_time = time.time()
            disk_dimensions = {'device': disk[0].device}
            for metric in disk[1]._fields:
                metric_name = "io.{0}".format(metric)
                if metric_name not in metric_cache[inst.name]:
                    metric_cache[inst.name][metric_name] = {}
                # getattr is the idiomatic form of __getattribute__
                value = int(getattr(disk[1], metric))
                if disk[0].device in metric_cache[inst.name][metric_name]:
                    cached = metric_cache[inst.name][metric_name][disk[0].device]
                    val_diff = value - cached['value']
                    # Change the metric name to a rate, ie. "io.read_requests"
                    # gets converted to "io.read_ops_sec"
                    rate_name = "{0}_sec".format(
                        metric_name.replace('requests', 'ops'))
                    # NOTE(review): val_diff is published un-normalized (the
                    # elapsed time is never divided in), so the "_sec" name
                    # is only accurate if the collection interval is one
                    # second -- confirm intended behavior.
                    # Customer
                    this_dimensions = disk_dimensions.copy()
                    this_dimensions.update(dims_customer)
                    self.gauge(rate_name, val_diff,
                               dimensions=this_dimensions,
                               delegated_tenant=instance_info['tenant_id'],
                               hostname=instance_info['hostname'])
                    # Operations (metric name prefixed with "vm.")
                    this_dimensions = disk_dimensions.copy()
                    this_dimensions.update(dims_operations)
                    self.gauge("vm.{0}".format(rate_name), val_diff,
                               dimensions=this_dimensions)
                # Save this metric to the cache
                metric_cache[inst.name][metric_name][disk[0].device] = {
                    'timestamp': sample_time,
                    'value': value}

        # Network utilization
        for vnic in insp.inspect_vnics(inst.name):
            sample_time = time.time()
            vnic_dimensions = {'device': vnic[0].name}
            for metric in vnic[1]._fields:
                metric_name = "net.{0}".format(metric)
                if metric_name not in metric_cache[inst.name]:
                    metric_cache[inst.name][metric_name] = {}
                # getattr is the idiomatic form of __getattribute__
                value = int(getattr(vnic[1], metric))
                if vnic[0].name in metric_cache[inst.name][metric_name]:
                    cached = metric_cache[inst.name][metric_name][vnic[0].name]
                    val_diff = value - cached['value']
                    # Change the metric name to a rate, ie. "net.rx_bytes"
                    # gets converted to "net.rx_bytes_sec"
                    rate_name = "{0}_sec".format(metric_name)
                    # Rename "tx" to "out" and "rx" to "in"
                    rate_name = rate_name.replace("tx", "out")
                    rate_name = rate_name.replace("rx", "in")
                    # NOTE(review): as with disk I/O, val_diff is published
                    # without dividing by elapsed time -- confirm intended.
                    # Customer
                    this_dimensions = vnic_dimensions.copy()
                    this_dimensions.update(dims_customer)
                    self.gauge(rate_name, val_diff,
                               dimensions=this_dimensions,
                               delegated_tenant=instance_info['tenant_id'],
                               hostname=instance_info['hostname'])
                    # Operations (metric name prefixed with "vm.")
                    this_dimensions = vnic_dimensions.copy()
                    this_dimensions.update(dims_operations)
                    self.gauge("vm.{0}".format(rate_name), val_diff,
                               dimensions=this_dimensions)
                # Save this metric to the cache
                metric_cache[inst.name][metric_name][vnic[0].name] = {
                    'timestamp': sample_time,
                    'value': value}

    # Save these metrics for the next collector invocation
    self._update_metric_cache(metric_cache)
def check(self, instance):
    """Gather VM metrics for each libvirt domain on this hypervisor.

    Publishes host-alive status (active-state and optional ping checks),
    CPU utilization, disk and network I/O rates, and per-hypervisor
    aggregate allocation gauges.  Each per-VM metric is emitted twice:
    once under customer dimensions (delegated to the owning tenant) and
    once under operations dimensions (metric name prefixed with "vm.").
    Rate metrics need a prior sample, so the first run for an instance
    only seeds the metric cache.
    """
    # Load metric cache (prior samples used to compute rates)
    metric_cache = self._load_metric_cache()
    # Load the nova-obtained instance data cache
    instance_cache = self._load_instance_cache()
    # Build dimensions for both the customer and for operations
    dims_base = self._set_dimensions({'service': 'compute',
                                      'component': 'vm'}, instance)

    # Define aggregate gauges, gauge name to metric name
    agg_gauges = {'vcpus': 'nova.vm.cpu.total_allocated',
                  'ram': 'nova.vm.mem.total_allocated_mb',
                  'disk': 'nova.vm.disk.total_allocated_gb'}
    # dict.fromkeys replaces the manual zeroing loop
    agg_values = dict.fromkeys(agg_gauges, 0)

    insp = inspector.get_hypervisor_inspector()
    for inst in insp._get_connection().listAllDomains():
        # Verify that this instance exists in the cache.  Add if necessary.
        inst_name = inst.name()
        if inst_name not in instance_cache:
            instance_cache = self._update_instance_cache()

        # Bind the nova metadata for this instance once; it is None when
        # nova still doesn't know the domain, which the try below turns
        # into a logged skip instead of a crash.
        instance_info = instance_cache.get(inst_name)

        # Build customer dimensions
        try:
            dims_customer = dims_base.copy()
            dims_customer['resource_id'] = instance_info['instance_uuid']
            dims_customer['zone'] = instance_info['zone']
            # Add dimensions that would be helpful for operations
            dims_operations = dims_customer.copy()
            dims_operations['tenant_id'] = instance_info['tenant_id']
            # Remove customer 'hostname' dimension, this will be replaced
            # by the VM name
            del dims_customer['hostname']
        except TypeError:
            # Nova can potentially get into a state where it can't see an
            # instance, but libvirt can.  This would cause TypeErrors as
            # incomplete data is cached for this instance.  Log and skip.
            self.log.error("{0} is not known to nova after instance cache update -- skipping this ghost VM.".format(inst_name))
            continue

        # Skip instances that are inactive
        if inst.isActive() == 0:
            detail = 'Instance is not active'
            self.gauge('host_alive_status', 2, dimensions=dims_customer,
                       delegated_tenant=instance_info['tenant_id'],
                       hostname=instance_info['hostname'],
                       value_meta={'detail': detail})
            self.gauge('vm.host_alive_status', 2,
                       dimensions=dims_operations,
                       value_meta={'detail': detail})
            continue

        if inst_name not in metric_cache:
            metric_cache[inst_name] = {}

        # Skip instances created within the probation period
        vm_probation_remaining = self._test_vm_probation(
            instance_info['created'])
        if vm_probation_remaining >= 0:
            self.log.info("Libvirt: {0} in probation for another "
                          "{1} seconds".format(instance_info['hostname'],
                                               vm_probation_remaining))
            continue

        # Test instance's general responsiveness (ping check) if so
        # configured
        if self.init_config.get('ping_check') and 'private_ip' in instance_info:
            detail = 'Ping check OK'
            ping_cmd = self.init_config.get('ping_check').split()
            ping_cmd.append(instance_info['private_ip'])
            with open(os.devnull, "w") as fnull:
                try:
                    res = subprocess.call(ping_cmd,
                                          stdout=fnull,
                                          stderr=fnull)
                    if res > 0:
                        detail = 'Host failed ping check'
                    self.gauge('host_alive_status', res,
                               dimensions=dims_customer,
                               delegated_tenant=instance_info['tenant_id'],
                               hostname=instance_info['hostname'],
                               value_meta={'detail': detail})
                    self.gauge('vm.host_alive_status', res,
                               dimensions=dims_operations,
                               value_meta={'detail': detail})
                    # Do not attempt to process any more metrics for
                    # offline hosts
                    if res > 0:
                        continue
                except OSError as e:
                    self.log.warn("OS error running '{0}' returned {1}".format(ping_cmd, e))

        # Accumulate aggregate data
        for gauge in agg_gauges:
            if gauge in instance_info:
                agg_values[gauge] += instance_info[gauge]

        # CPU utilization percentage
        sample_time = float("{:9f}".format(time.time()))
        # Read the cumulative CPU time once so the published rate and the
        # cached value come from the same sample (previously a second,
        # slightly later reading was cached, skewing the next rate).
        cpu_time = insp.inspect_cpus(inst).time
        if 'cpu.time' in metric_cache[inst_name]:
            # I have a prior value, so calculate the rate & push the metric
            cpu_diff = cpu_time - metric_cache[inst_name]['cpu.time']['value']
            time_diff = sample_time - float(
                metric_cache[inst_name]['cpu.time']['timestamp'])
            # Convert time_diff to nanoseconds, and calculate percentage
            rate = (cpu_diff / (time_diff * 1000000000)) * 100
            self.gauge('cpu.utilization_perc', int(round(rate, 0)),
                       dimensions=dims_customer,
                       delegated_tenant=instance_info['tenant_id'],
                       hostname=instance_info['hostname'])
            self.gauge('vm.cpu.utilization_perc', int(round(rate, 0)),
                       dimensions=dims_operations)
        metric_cache[inst_name]['cpu.time'] = {'timestamp': sample_time,
                                               'value': cpu_time}

        # Disk activity
        for disk in insp.inspect_disks(inst):
            sample_time = time.time()
            disk_dimensions = {'device': disk[0].device}
            for metric in disk[1]._fields:
                metric_name = "io.{0}".format(metric)
                if metric_name not in metric_cache[inst_name]:
                    metric_cache[inst_name][metric_name] = {}
                # getattr is the idiomatic form of __getattribute__
                value = int(getattr(disk[1], metric))
                if disk[0].device in metric_cache[inst_name][metric_name]:
                    cached = metric_cache[inst_name][metric_name][disk[0].device]
                    val_diff = value - cached['value']
                    # Change the metric name to a rate, ie. "io.read_requests"
                    # gets converted to "io.read_ops_sec"
                    rate_name = "{0}_sec".format(
                        metric_name.replace('requests', 'ops'))
                    # NOTE(review): val_diff is published un-normalized (the
                    # elapsed time is never divided in), so the "_sec" name
                    # is only accurate if the collection interval is one
                    # second -- confirm intended behavior.
                    # Customer
                    this_dimensions = disk_dimensions.copy()
                    this_dimensions.update(dims_customer)
                    self.gauge(rate_name, val_diff,
                               dimensions=this_dimensions,
                               delegated_tenant=instance_info['tenant_id'],
                               hostname=instance_info['hostname'])
                    # Operations (metric name prefixed with "vm.")
                    this_dimensions = disk_dimensions.copy()
                    this_dimensions.update(dims_operations)
                    self.gauge("vm.{0}".format(rate_name), val_diff,
                               dimensions=this_dimensions)
                # Save this metric to the cache
                metric_cache[inst_name][metric_name][disk[0].device] = {
                    'timestamp': sample_time,
                    'value': value}

        # Disk utilization
        # TODO(dschroeder)

        # Memory utilization
        # TODO(dschroeder)

        # Network activity
        for vnic in insp.inspect_vnics(inst):
            sample_time = time.time()
            vnic_dimensions = {'device': vnic[0].name}
            for metric in vnic[1]._fields:
                metric_name = "net.{0}".format(metric)
                if metric_name not in metric_cache[inst_name]:
                    metric_cache[inst_name][metric_name] = {}
                # getattr is the idiomatic form of __getattribute__
                value = int(getattr(vnic[1], metric))
                if vnic[0].name in metric_cache[inst_name][metric_name]:
                    cached = metric_cache[inst_name][metric_name][vnic[0].name]
                    val_diff = value - cached['value']
                    # Change the metric name to a rate, ie. "net.rx_bytes"
                    # gets converted to "net.rx_bytes_sec"
                    rate_name = "{0}_sec".format(metric_name)
                    # Rename "tx" to "out" and "rx" to "in"
                    rate_name = rate_name.replace("tx", "out")
                    rate_name = rate_name.replace("rx", "in")
                    # NOTE(review): as with disk I/O, val_diff is published
                    # without dividing by elapsed time -- confirm intended.
                    # Customer
                    this_dimensions = vnic_dimensions.copy()
                    this_dimensions.update(dims_customer)
                    self.gauge(rate_name, val_diff,
                               dimensions=this_dimensions,
                               delegated_tenant=instance_info['tenant_id'],
                               hostname=instance_info['hostname'])
                    # Operations (metric name prefixed with "vm.")
                    this_dimensions = vnic_dimensions.copy()
                    this_dimensions.update(dims_operations)
                    self.gauge("vm.{0}".format(rate_name), val_diff,
                               dimensions=this_dimensions)
                # Save this metric to the cache
                metric_cache[inst_name][metric_name][vnic[0].name] = {
                    'timestamp': sample_time,
                    'value': value}

    # Save these metrics for the next collector invocation
    self._update_metric_cache(metric_cache)

    # Publish aggregate metrics
    for gauge in agg_gauges:
        self.gauge(agg_gauges[gauge], agg_values[gauge],
                   dimensions=dims_base)