# Example 1
class Nova:
    """Expose OpenStack Nova statistics as Prometheus metrics.

    Reads a pre-collected snapshot of the cloud ("prodstack") from the
    pickled cache file named by ``config['cache_file']`` and registers
    gauges on a private ``CollectorRegistry``.  Call :meth:`get_stats`
    to render the registry in the Prometheus text exposition format.

    Depends on the module-level ``config`` dict, ``log`` logger and the
    prometheus_client names imported by this module.
    """

    def __init__(self):
        self.registry = CollectorRegistry()
        self.prodstack = {}
        # NOTE(review): pickle is only safe while the cache file is written
        # by a trusted collector process — confirm before changing that.
        with open(config['cache_file'], 'rb') as f:
            self.prodstack = pickle.load(f)[0]
        self.hypervisors = self.prodstack['hypervisors']
        # tenant id -> human-readable tenant name
        self.tenant_map = {
            t['id']: t['name']
            for t in self.prodstack['tenants']
        }
        # flavor id -> resource footprint of one instance of that flavor
        self.flavor_map = {
            f['id']: {
                'ram': f['ram'],
                'disk': f['disk'],
                'vcpus': f['vcpus']
            }
            for f in self.prodstack['flavors']
        }
        # hypervisor host -> host aggregate name
        self.aggregate_map = {}
        # hypervisor host -> nova-compute service status
        self.services_map = {}
        for s in self.prodstack['services']:
            if s['binary'] == 'nova-compute':
                self.services_map[s['host']] = s['status']
        for agg in self.prodstack['aggregates']:
            self.aggregate_map.update({i: agg['name'] for i in agg['hosts']})

    def _tenant_name(self, tenant_id):
        """Map a tenant id to its name, or 'orphaned' if the tenant is gone."""
        return self.tenant_map.get(tenant_id, 'orphaned')

    def _get_schedulable_instances(self, host):
        """Return how many more "standard-size" instances fit on *host*.

        The standard size comes from ``config['schedulable_instance_size']``;
        free capacity honours the configured allocation (overcommit) ratios.
        """
        free_vcpus = host['vcpus'] * config[
            'openstack_allocation_ratio_vcpu'] - host['vcpus_used']
        free_ram_mbs = host['memory_mb'] * config[
            'openstack_allocation_ratio_ram'] - host['memory_mb_used']
        free_disk_gbs = host['local_gb'] * config[
            'openstack_allocation_ratio_disk'] - host['local_gb_used']
        s = config['schedulable_instance_size']
        # The scarcest resource determines how many instances still fit.
        return min(int(free_vcpus / s['vcpu']),
                   int(free_ram_mbs / s['ram_mbs']),
                   int(free_disk_gbs / s['disk_gbs']))

    def _get_schedulable_instances_capacity(self, host):
        """Return the total "standard-size" instance capacity of *host*,
        ignoring current usage but honouring the allocation ratios."""
        capacity_vcpus = host['vcpus'] * config[
            'openstack_allocation_ratio_vcpu']
        capacity_ram_mbs = host['memory_mb'] * config[
            'openstack_allocation_ratio_ram']
        capacity_disk_gbs = host['local_gb'] * config[
            'openstack_allocation_ratio_disk']
        s = config['schedulable_instance_size']
        return min(int(capacity_vcpus / s['vcpu']),
                   int(capacity_ram_mbs / s['ram_mbs']),
                   int(capacity_disk_gbs / s['disk_gbs']))

    def gen_hypervisor_stats(self):
        """Register per-hypervisor gauges (running VMs, vCPU/RAM/disk)."""
        labels = [
            'cloud', 'hypervisor_hostname', 'aggregate', 'nova_service_status',
            'arch'
        ]
        vms = Gauge('hypervisor_running_vms',
                    'Number of running VMs',
                    labels,
                    registry=self.registry)
        vcpus_total = Gauge('hypervisor_vcpus_total',
                            'Total number of vCPUs',
                            labels,
                            registry=self.registry)
        vcpus_used = Gauge('hypervisor_vcpus_used',
                           'Number of used vCPUs',
                           labels,
                           registry=self.registry)
        mem_total = Gauge('hypervisor_memory_mbs_total',
                          'Total amount of memory in MBs',
                          labels,
                          registry=self.registry)
        mem_used = Gauge('hypervisor_memory_mbs_used',
                         'Used memory in MBs',
                         labels,
                         registry=self.registry)
        disk_total = Gauge('hypervisor_disk_gbs_total',
                           'Total amount of disk space in GBs',
                           labels,
                           registry=self.registry)
        disk_used = Gauge('hypervisor_disk_gbs_used',
                          'Used disk space in GBs',
                          labels,
                          registry=self.registry)
        schedulable_instances = Gauge(
            'hypervisor_schedulable_instances',
            'Number of schedulable instances, see "schedulable_instance_size" option',
            labels,
            registry=self.registry)
        schedulable_instances_capacity = Gauge(
            'hypervisor_schedulable_instances_capacity',
            'Number of schedulable instances we have capacity for',
            labels,
            registry=self.registry)

        def squashnone(val, default=0):
            # Disabled hypervisors report None for their usage figures.
            if val is None:
                return default
            return val

        for h in self.hypervisors:
            log.debug("Hypervisor: %s", h)
            host = h['service']['host']
            log.debug("host: %s", host)
            cpu_info = h['cpu_info']
            log.debug("cpu_info: %s", cpu_info)
            arch = 'Unknown'
            if not cpu_info:
                log.info("Could not get cpu info")
            else:
                # Depending on the Nova version cpu_info arrives either as
                # a JSON string or an already-decoded dict; handle both.
                if not isinstance(cpu_info, dict):
                    cpu_info = json.loads(cpu_info)
                # BUG FIX: 'arch' used to be read only on the JSON-string
                # path, so dict-shaped cpu_info always reported 'Unknown'.
                arch = cpu_info.get('arch', 'Unknown')
            label_values = [
                config['cloud'], host,
                self.aggregate_map.get(host, 'unknown'),
                # BUG FIX: use .get() (as the aggregate lookup above does)
                # so a hypervisor with no matching nova-compute service
                # entry no longer raises KeyError.
                self.services_map.get(host, 'unknown'), arch
            ]
            # Disabled hypervisors return None below, convert to 0
            vms.labels(*label_values).set(squashnone(h['running_vms']))
            vcpus_total.labels(*label_values).set(squashnone(h['vcpus']))
            vcpus_used.labels(*label_values).set(squashnone(h['vcpus_used']))
            mem_total.labels(*label_values).set(squashnone(h['memory_mb']))
            mem_used.labels(*label_values).set(squashnone(h['memory_mb_used']))
            disk_total.labels(*label_values).set(squashnone(h['local_gb']))
            disk_used.labels(*label_values).set(squashnone(h['local_gb_used']))

            if config.get("schedulable_instance_size", False):
                schedulable_instances.labels(*label_values).set(
                    self._get_schedulable_instances(h))
                schedulable_instances_capacity.labels(*label_values).set(
                    self._get_schedulable_instances_capacity(h))

    def gen_instance_stats(self):
        """Register per-tenant instance counts and resource usage gauges.

        If any instance references a flavor that no longer exists, the
        resource gauges are replaced with label-less placeholders because
        the totals would be misleadingly low.
        """
        missing_flavors = False
        instances = Gauge('nova_instances',
                          'Nova instances metrics',
                          ['cloud', 'tenant', 'instance_state'],
                          registry=self.registry)
        res_ram = Gauge('nova_resources_ram_mbs',
                        'Nova RAM usage metric', ['cloud', 'tenant'],
                        registry=self.registry)
        res_vcpus = Gauge('nova_resources_vcpus',
                          'Nova vCPU usage metric', ['cloud', 'tenant'],
                          registry=self.registry)
        res_disk = Gauge('nova_resources_disk_gbs',
                         'Nova disk usage metric', ['cloud', 'tenant'],
                         registry=self.registry)
        for i in self.prodstack['instances']:
            tenant = self._tenant_name(i['tenant_id'])
            instances.labels(config['cloud'], tenant, i['status']).inc()

            if i['flavor']['id'] in self.flavor_map:
                flavor = self.flavor_map[i['flavor']['id']]
                res_ram.labels(config['cloud'], tenant).inc(flavor['ram'])
                res_vcpus.labels(config['cloud'], tenant).inc(flavor['vcpus'])
                res_disk.labels(config['cloud'], tenant).inc(flavor['disk'])
            else:
                missing_flavors = True

        # If flavors were deleted we can't reliably find out resource use
        if missing_flavors:
            self.registry.unregister(res_ram)
            self.registry.unregister(res_vcpus)
            self.registry.unregister(res_disk)
            res_ram = Gauge(
                'nova_resources_ram_mbs',
                'Nova RAM usage metric unavailable, missing flavors', [],
                registry=self.registry)
            res_vcpus = Gauge(
                'nova_resources_vcpus',
                'Nova vCPU usage metric unavailable, missing flavors', [],
                registry=self.registry)
            res_disk = Gauge(
                'nova_resources_disk_gbs',
                'Nova disk usage metric unavailable, missing flavors', [],
                registry=self.registry)

    def gen_overcommit_stats(self):
        """Register the configured vCPU/RAM/disk overcommit ratios."""
        labels = ['cloud', 'resource']
        openstack_overcommit = Gauge('openstack_allocation_ratio',
                                     'Openstack overcommit ratios',
                                     labels,
                                     registry=self.registry)
        for resource in ('vcpu', 'ram', 'disk'):
            openstack_overcommit.labels(config['cloud'], resource).set(
                config['openstack_allocation_ratio_' + resource])

    def gen_quota_stats(self):
        """Register per-tenant Nova quota gauges (cores, FIPs, instances, RAM)."""
        cores = Gauge('nova_quota_cores',
                      'Nova cores metric', ['cloud', 'tenant', 'type'],
                      registry=self.registry)
        fips = Gauge('nova_quota_floating_ips',
                     'Nova floating IP addresses (number)',
                     ['cloud', 'tenant', 'type'],
                     registry=self.registry)
        inst = Gauge('nova_quota_instances',
                     'Nova instances (number)', ['cloud', 'tenant', 'type'],
                     registry=self.registry)
        ram = Gauge('nova_quota_ram_mbs',
                    'Nova RAM (MB)', ['cloud', 'tenant', 'type'],
                    registry=self.registry)
        for t, q in self.prodstack['nova_quotas'].items():
            tenant = self._tenant_name(t)

            # we get detailed quota information only on recent OS versions
            if isinstance(q['cores'], int):
                cores.labels(config['cloud'], tenant, 'limit').set(q['cores'])
                fips.labels(config['cloud'], tenant,
                            'limit').set(q['floating_ips'])
                inst.labels(config['cloud'], tenant,
                            'limit').set(q['instances'])
                ram.labels(config['cloud'], tenant, 'limit').set(q['ram'])
            else:
                for tt in ['limit', 'in_use', 'reserved']:
                    cores.labels(config['cloud'], tenant,
                                 tt).inc(q['cores'][tt])
                    fips.labels(config['cloud'], tenant,
                                tt).inc(q['floating_ips'][tt])
                    inst.labels(config['cloud'], tenant,
                                tt).inc(q['instances'][tt])
                    ram.labels(config['cloud'], tenant, tt).inc(q['ram'][tt])

    def get_stats(self):
        """Collect all metric families and render them in text format."""
        log.debug("get_stats")
        self.gen_hypervisor_stats()
        self.gen_instance_stats()
        self.gen_overcommit_stats()
        self.gen_quota_stats()
        return generate_latest(self.registry)
# Example 2
class BroadcastWebsocketStats:
    """Prometheus stats for one broadcast-websocket peer connection.

    Tracks messages received from *remote_hostname* and connection state
    on a private ``CollectorRegistry``; :meth:`serialize` renders it in
    the Prometheus text exposition format.
    """

    def __init__(self, local_hostname, remote_hostname):
        self._local_hostname = local_hostname
        self._remote_hostname = remote_hostname
        self._registry = CollectorRegistry()
        # BUG FIX: initialize the timestamp that get_connection_duration()
        # reads; it was never assigned anywhere (AttributeError).
        self._connection_established_ts = None

        # TODO: More robust replacement
        self.name = self.safe_name(self._local_hostname)
        self.remote_name = self.safe_name(self._remote_hostname)

        self._messages_received_total = Counter(f'awx_{self.remote_name}_messages_received_total',
                                                'Number of messages received, to be forwarded, by the broadcast websocket system',
                                                registry=self._registry)
        self._messages_received = Gauge(f'awx_{self.remote_name}_messages_received',
                                        'Number forwarded messages received by the broadcast websocket system, for the duration of the current connection',
                                        registry=self._registry)
        self._connection = Enum(f'awx_{self.remote_name}_connection',
                                'Websocket broadcast connection',
                                states=['disconnected', 'connected'],
                                registry=self._registry)
        self._connection_start = Gauge(f'awx_{self.remote_name}_connection_start',
                                       'Time the connection was established',
                                       registry=self._registry)

        self._messages_received_per_minute = Gauge(f'awx_{self.remote_name}_messages_received_per_minute',
                                                   'Messages received per minute',
                                                   registry=self._registry)
        self._internal_messages_received_per_minute = FixedSlidingWindow()

    def safe_name(self, s):
        # Replace all non alpha-numeric characters with _
        return re.sub('[^0-9a-zA-Z]+', '_', s)

    def unregister(self):
        """Remove the per-connection collectors from the registry."""
        # BUG FIX: CollectorRegistry.unregister() takes the collector
        # object itself, not the metric name string.
        self._registry.unregister(self._messages_received)
        self._registry.unregister(self._connection)

    def record_message_received(self):
        """Count one forwarded message (total, per-connection and rate)."""
        self._internal_messages_received_per_minute.record()
        self._messages_received.inc()
        self._messages_received_total.inc()

    def record_connection_established(self):
        """Mark the connection up and reset the per-connection counter."""
        self._connection.state('connected')
        # BUG FIX: remember when the connection came up so that
        # get_connection_duration() has something to subtract from.
        self._connection_established_ts = datetime.datetime.now()
        self._connection_start.set_to_current_time()
        self._messages_received.set(0)

    def record_connection_lost(self):
        """Mark the connection down."""
        self._connection.state('disconnected')

    def get_connection_duration(self):
        """Return seconds since the connection was last established.

        Raises TypeError if called before record_connection_established().
        """
        return (datetime.datetime.now() - self._connection_established_ts).total_seconds()

    def render(self):
        """Refresh the derived messages-per-minute gauge."""
        msgs_per_min = self._internal_messages_received_per_minute.render()
        self._messages_received_per_minute.set(msgs_per_min)

    def serialize(self):
        """Render all metrics in Prometheus text exposition format."""
        self.render()

        registry_data = generate_latest(self._registry).decode('UTF-8')
        return registry_data
# Example 3
class QMetrics:
    """Thin index-based wrapper over Prometheus gauges/counters with
    Pushgateway publishing.

    Metrics are created via :meth:`create_gauge` / :meth:`create_counter`,
    which return integer handles used by the set/inc helpers.  The
    Pushgateway address comes from the ``PUSHGATEWAY_HOST`` and
    ``PUSHGATEWAY_PORT`` environment variables.
    """

    # Default job key, overridable per instance via args['job_key'].
    job_key = 'unknown_metrics_key'

    def __init__(self, args=None):
        # BUG FIX: args used to default to a mutable {} shared between
        # calls; use None and normalize here instead.
        args = args if args is not None else {}
        if 'job_key' in args:
            self.job_key = args['job_key']
        # BUG FIX: counters/gauges used to be class-level dicts shared by
        # every instance; make them per-instance state.
        self.counters = {}
        self.gauges = {}
        self.init_pushgateway_url()
        self.registry = CollectorRegistry()

    def create_gauge(self, metrics_key, metrics_name, data_keys):
        """Create a labelled Gauge; return its integer handle."""
        index = len(self.gauges) + 1
        self.gauges[index] = Gauge(metrics_key,
                                   metrics_name,
                                   data_keys,
                                   registry=self.registry)
        return index

    def reinit_gauge(self, index, metrics_key, metrics_name, data_keys):
        """Replace the gauge behind *index* with a freshly registered one."""
        self.registry.unregister(self.gauges[index])
        self.gauges[index] = Gauge(metrics_key,
                                   metrics_name,
                                   data_keys,
                                   registry=self.registry)
        return index

    def set_gauge_to_now(self, index, *labels):
        """Set the gauge at *index* (for *labels*) to the current UTC timestamp."""
        now = datetime.datetime.utcnow().timestamp()
        self.gauges[index].labels(*labels).set(now)

    def gauge_set(self, index, *labels, value):
        """Set the gauge at *index* (for *labels*) to *value*."""
        self.gauges[index].labels(*labels).set(value)

    def create_counter(self, metrics_key, metrics_name, data_keys):
        """Create a labelled Counter; return its integer handle."""
        index = len(self.counters) + 1
        self.counters[index] = Counter(metrics_key,
                                       metrics_name,
                                       data_keys,
                                       registry=self.registry)
        return index

    def init_pushgateway_url(self):
        """Build the Pushgateway host:port string from the environment."""
        self.pushgateway_url = '{host}:{port}'.format(
            host=os.environ['PUSHGATEWAY_HOST'],
            port=os.environ['PUSHGATEWAY_PORT'])

    def counter_inc(self, index, *labels, increment=1):
        """Increment the counter at *index* (for *labels*) by *increment*."""
        self.counters[index].labels(*labels).inc(increment)

    def gauge_inc(self, index, *labels, increment=1):
        """Increment the gauge at *index* (for *labels*) by *increment*."""
        self.gauges[index].labels(*labels).inc(increment)

    def push_metrics(self):
        """Push the registry under a randomized job name; return that name.

        The random suffix keeps concurrent pushes from overwriting each
        other on the Pushgateway.
        """
        job = '{job_key}_{random}'.format(job_key=self.job_key,
                                          random=random.randint(
                                              1000000000000,
                                              100000000000000000))
        push_to_gateway(self.pushgateway_url, job=job, registry=self.registry)
        return job

    def delete(self, job_key):
        """Delete *job_key*'s metrics from the Pushgateway; return the response."""
        r = requests.delete(
            'http://{pushgateway_url}/metrics/job/{job_key}'.format(
                pushgateway_url=self.pushgateway_url, job_key=job_key))
        return r