def run(self): """ Main function to run the check """ self.parse_args() self.ora = OpenshiftRestApi() self.metric_sender = MetricSender(verbose=self.args.verbose, debug=self.args.debug) status = None try: self.get_service() if not self.args.service_count: status = self.check_service() except Exception as ex: print "Problem retreiving data: %s " % ex.message if status: self.metric_sender.add_metric({ "openshift.webservice.{}.status".format(self.args.pod): status }) self.metric_sender.add_metric( {'openshift.cluster.service.count': self.servicecount}, synthetic=True) self.metric_sender.send_metrics()
def run(self): """ Main function to run the check """ self.parse_args() self.ora = OpenshiftRestApi() self.metric_sender = MetricSender(verbose=self.args.verbose, debug=self.args.debug) try: self.get_pods() except Exception as ex: print "Problem retreiving pod data: %s " % ex.message self.metric_sender.send_metrics()
def __init__(self):
    """Set up default state; args and sender are populated later by run()."""
    # CLI arguments and the metric sender are filled in by parse_args()/run().
    self.args = None
    self.metric_sender = None
    # DNS target defaults: empty host, standard DNS port.
    self.dns_host = ''
    self.dns_port = 53
    # Services discovered from the OpenShift API are accumulated here.
    self.openshift_services = []
    self.ora = OpenshiftRestApi()
def run(self):
    '''Entry point: compute cluster capacity and (unless --dry-run) send metrics.'''
    self.parse_args()
    self.zagg_sender = ZaggSender(verbose=self.args.verbose,
                                  debug=self.args.debug)

    # Read the master config to discover the API endpoint to talk to.
    config = []
    with open(self.args.master_config, 'r') as cfg_file:
        config = yaml.load(cfg_file)

    self.ora = OpenshiftRestApi(host=config['oauthConfig']['masterURL'],
                                verify_ssl=True)

    self.cluster_capacity()

    if not self.args.dry_run:
        self.zagg_sender.send_metrics()
def run(self): """ Main function to run the check """ self.parse_args() self.ora = OpenshiftRestApi() self.metric_sender = MetricSender(verbose=self.args.verbose, debug=self.args.debug) try: self.get_service() status = self.check_service() except Exception as ex: print "Problem retreiving data: %s " % ex.message self.metric_sender.add_metric({ "openshift.webservice.{}.status".format(self.args.pod) : status}) self.metric_sender.send_metrics()
class OpenshiftPodChecker(object): """ Checks for Openshift Pods """ def __init__(self): self.args = None self.ora = None self.zagg_sender = None def run(self): """ Main function to run the check """ self.parse_args() self.ora = OpenshiftRestApi() self.zagg_sender = ZaggSender(verbose=self.args.verbose, debug=self.args.debug) try: self.get_pods() except Exception as ex: print "Problem retreiving pod data: %s " % ex.message self.zagg_sender.send_metrics() def get_pods(self): """ Gets pod data """ print "\nPerforming pod check ...\n" api_url = "/api/v1/pods" if (str(self.args.namespace) != "None") & (str(self.args.namespace) != "all"): api_url = "/api/v1/namespaces/{}/pods".format(self.args.namespace) api_yaml = self.ora.get(api_url, rtype="text") pods = yaml.safe_load(api_yaml) pod_count = 0 for pod in pods["items"]: if self.args.pod and self.args.pod in pod["metadata"]["name"]: print "status of {} is {}".format(pod["metadata"]["name"], pod["status"]["phase"]) if pod["status"]["phase"] == "Running": pod_count += 1 else: pass self.zagg_sender.add_zabbix_keys({"service.pod.{}.count".format(self.args.pod): pod_count}) def parse_args(self): """ parse the args from the cli """ parser = argparse.ArgumentParser(description="Openshift pod sender") parser.add_argument("-p", "--pod", default=None, help="Check for pod with this specific name") parser.add_argument("-n", "--namespace", default=None, help='Check for pods in this namespace - "all" for all') parser.add_argument("-v", "--verbose", action="store_true", default=None, help="Verbose?") parser.add_argument("--debug", action="store_true", default=None, help="Debug?") self.args = parser.parse_args()
def run(self): """ Main function to run the check """ self.parse_args() self.ora = OpenshiftRestApi() self.zagg_sender = ZaggSender(verbose=self.args.verbose, debug=self.args.debug) try: self.get_pods() except Exception as ex: print "Problem retreiving pod data: %s " % ex.message self.zagg_sender.send_metrics()
def run(self): """ Main function to run the check """ self.parse_args() self.ora = OpenshiftRestApi() self.zagg_sender = ZaggSender(verbose=self.args.verbose, debug=self.args.debug) try: self.get_service() status = self.check_service() except Exception as ex: print "Problem retreiving data: %s " % ex.message self.zagg_sender.add_zabbix_keys({ "openshift.webservice.{}.status".format(self.args.pod) : status}) self.zagg_sender.send_metrics()
class OpenshiftMasterZaggClient(object):
    """ Checks for the Openshift Master.

    Runs a configurable set of health/inventory checks against the master API
    (selected via CLI flags) and reports results through a MetricSender.
    """

    def __init__(self):
        # Populated by parse_args()/run().
        self.args = None
        self.metric_sender = None
        self.ora = None
        # Zabbix key names differ for local vs. cluster-level checks; set in run().
        self.zabbix_api_key = None
        self.zabbix_healthz_key = None

    def run(self):
        """ Main function to run the check.

        Each group of checks is wrapped in its own try block so one failing
        group does not prevent the others from running; on failure the
        corresponding "down" metric is queued instead.
        """
        self.parse_args()
        self.metric_sender = MetricSender(verbose=self.args.verbose, debug=self.args.debug)

        if self.args.local:
            # Local mode: talk to the API on this host and force the two
            # basic liveness checks; use the .local. key namespace.
            self.ora = OpenshiftRestApi()
            self.args.api_ping = True
            self.args.healthz = True
            self.zabbix_api_key = 'openshift.master.local.api.ping'
            self.zabbix_healthz_key = 'openshift.master.local.api.healthz'
        else:
            # Cluster mode: discover the master URL from the master config.
            master_cfg_from_yaml = []
            with open('/etc/origin/master/master-config.yaml', 'r') as yml:
                master_cfg_from_yaml = yaml.load(yml)
            self.ora = OpenshiftRestApi(host=master_cfg_from_yaml['oauthConfig']['masterURL'],
                                        verify_ssl=True)
            self.zabbix_api_key = 'openshift.master.api.ping'
            self.zabbix_healthz_key = 'openshift.master.api.healthz'

        try:
            if self.args.healthz or self.args.all_checks:
                self.healthz_check()
        except Exception as ex:
            print "Problem performing healthz check: %s " % ex.message
            self.metric_sender.add_metric({self.zabbix_healthz_key: 'false'})

        try:
            if self.args.api_ping or self.args.all_checks:
                self.api_ping()
            if self.args.project_count or self.args.all_checks:
                self.project_count()
            if self.args.pod_count or self.args.all_checks:
                self.pod_count()
            if self.args.user_count or self.args.all_checks:
                self.user_count()
            if self.args.pv_info or self.args.all_checks:
                self.pv_info()
            if self.args.node_checks or self.args.all_checks:
                self.nodes_not_schedulable()
                self.nodes_not_ready()
                self.nodes_not_labeled()
        except Exception as ex:
            print "Problem Openshift API checks: %s " % ex.message
            self.metric_sender.add_metric({self.zabbix_api_key: 0})  # Openshift API is down

        try:
            if self.args.metrics or self.args.all_checks:
                self.metric_check()
        except Exception as ex:
            print "Problem getting Openshift metrics at /metrics: %s " % ex.message
            self.metric_sender.add_metric({'openshift.master.metric.ping' : 0})  # Openshift Metrics are down

        self.metric_sender.send_metrics()

    def parse_args(self):
        """ parse the args from the cli """
        parser = argparse.ArgumentParser(description='Network metric sender')
        parser.add_argument('-v', '--verbose', action='store_true', default=None, help='Verbose?')
        parser.add_argument('--debug', action='store_true', default=None, help='Debug?')
        parser.add_argument('-l', '--local', action='store_true', default=False,
                            help='Run local checks against the local API (https://127.0.0.1)')
        master_check_group = parser.add_argument_group('Different Checks to Perform')
        master_check_group.add_argument('--all-checks', action='store_true', default=None,
                                        help='Do all of the checks')
        master_check_group.add_argument('--api-ping', action='store_true', default=None,
                                        help='Verify the Openshift API is alive')
        master_check_group.add_argument('--healthz', action='store_true', default=None,
                                        help='Query the Openshift Master API /healthz')
        master_check_group.add_argument('--metrics', action='store_true', default=None,
                                        help='Query the Openshift Master Metrics at /metrics')
        master_check_group.add_argument('--project-count', action='store_true', default=None,
                                        help='Query the Openshift Master for Number of Pods')
        master_check_group.add_argument('--pod-count', action='store_true', default=None,
                                        help='Query the Openshift Master for Number of Running Pods')
        master_check_group.add_argument('--user-count', action='store_true', default=None,
                                        help='Query the Openshift Master for Number of Users')
        master_check_group.add_argument('--pv-info', action='store_true', default=None,
                                        help='Query the Openshift Master for Persistent Volumes Info')
        master_check_group.add_argument('--node-checks', action='store_true', default=None,
                                        help='Query the Openshift Master for node checks')
        self.args = parser.parse_args()

    def api_ping(self):
        """ Verify the Openshift API health is responding correctly.

        Queues 1 for the api key plus the node count; a failure is handled by
        the caller (run()), which queues 0 instead.
        """
        print "\nPerforming Openshift API ping check..."
        response = self.ora.get('/api/v1/nodes')
        print "\nOpenshift API ping is alive"
        print "Number of nodes in the Openshift cluster: %s" % len(response['items'])
        self.metric_sender.add_metric({self.zabbix_api_key: 1,
                                       'openshift.master.node.count': len(response['items'])})

    def healthz_check(self):
        """ check the /healthz API call; reports 'true'/'false' as a string """
        print "\nPerforming /healthz check..."
        response = self.ora.get('/healthz', rtype='text')
        print "healthz check returns: %s " %response
        self.metric_sender.add_metric({self.zabbix_healthz_key: str('ok' in response).lower()})

    def metric_check(self):
        """ collect certain metrics from the /metrics API call.

        Parses the Prometheus text exposition and extracts apiserver request
        latency quantiles (pods LIST/WATCHLIST) and scheduler e2e latency
        quantiles; values are converted from microseconds to milliseconds.
        """
        print "\nPerforming /metrics check..."
        response = self.ora.get('/metrics', rtype='text')

        for metric_type in text_string_to_metric_families(response):

            # Collect the apiserver_request_latencies_summary{resource="pods",verb="LIST",quantiles in /metrics
            # Collect the apiserver_request_latencies_summary{resource="pods",verb="WATCHLIST",quantiles in /metrics
            if metric_type.name == 'apiserver_request_latencies_summary':
                key_str = 'openshift.master.apiserver.latency.summary'
                for sample in metric_type.samples:
                    # sample layout: (name, labels-dict, value)
                    if (sample[1]['resource'] == 'pods'
                            and sample[1].has_key('quantile')
                            and 'LIST' in sample[1]['verb']):
                        # e.g. quantile "0.99" -> key suffix "99"
                        curr_key_str = key_str + ".pods.quantile.%s.%s" % (sample[1]['verb'],
                                                                           sample[1]['quantile'].split('.')[1])
                        # NaN appears when no observations exist yet; report 0.
                        if math.isnan(sample[2]):
                            value = 0
                        else:
                            value = sample[2]
                        self.metric_sender.add_metric({curr_key_str.lower(): int(value/1000)})

            # Collect the scheduler_e2e_scheduling_latency_microseconds{quantiles in /metrics
            if metric_type.name == 'scheduler_e2e_scheduling_latency_microseconds':
                for sample in metric_type.samples:
                    if sample[1].has_key('quantile'):
                        key_str = 'openshift.master.scheduler.e2e.scheduling.latency'
                        curr_key_str = key_str + ".quantile.%s" % (sample[1]['quantile'].split('.')[1])
                        if math.isnan(sample[2]):
                            value = 0
                        else:
                            value = sample[2]
                        self.metric_sender.add_metric({curr_key_str.lower(): int(value/1000)})

        self.metric_sender.add_metric({'openshift.master.metric.ping' : 1})  # /metrics endpoint reachable

    def project_count(self):
        """ check the number of projects in Openshift, excluding system projects """
        print "\nPerforming project count check..."
        # System/infrastructure projects are not counted as user projects.
        excluded_names = ['openshift', 'openshift-infra', 'default', 'ops-monitor']
        response = self.ora.get('/oapi/v1/projects')
        project_names = [project['metadata']['name'] for project in response['items']]
        valid_names = set(project_names) - set(excluded_names)
        print "Project count: %s" % len(valid_names)
        self.metric_sender.add_metric({'openshift.project.count' : len(valid_names)})

    def pod_count(self):
        """ check the number of pods in Openshift (total, running, user-running) """
        print "\nPerforming pod count check..."
        response = self.ora.get('/api/v1/pods')

        # Get running pod count
        # NOTE(review): only the first container's state is inspected -- assumes
        # it is representative of the pod; confirm for multi-container pods.
        running_pod_count = 0
        for i in response['items']:
            if 'containerStatuses' in i['status']:
                if 'running' in i['status']['containerStatuses'][0]['state']:
                    running_pod_count += 1

        # Get running pod count on compute only nodes (non-infra)
        running_user_pod_count = 0
        for i in response['items']:
            if 'containerStatuses' in i['status']:
                if 'running' in i['status']['containerStatuses'][0]['state']:
                    if 'nodeSelector' in i['spec']:
                        # logging pods don't have selector on 'type'
                        if 'type' in i['spec']['nodeSelector'] \
                                and i['spec']['nodeSelector']['type'] == 'compute':
                            running_user_pod_count += 1

        print "Total pod count: %s" % len(response['items'])
        print "Running pod count: %s" % running_pod_count
        print "User Running pod count: %s" % running_user_pod_count
        self.metric_sender.add_metric({'openshift.master.pod.running.count' : running_pod_count,
                                       'openshift.master.pod.user.running.count' : running_user_pod_count,
                                       'openshift.master.pod.total.count' : len(response['items'])})

    def user_count(self):
        """ check the number of users in Openshift """
        print "\nPerforming user count check..."
        response = self.ora.get('/oapi/v1/users')
        print "Total user count: %s" % len(response['items'])
        self.metric_sender.add_metric({'openshift.master.user.count' : len(response['items'])})

    @staticmethod
    def convert_to_GiB(value):
        """ take units as 'Gi', 'Ti', etc and return as int GiB.

        NOTE(review): units other than G*/Ti (e.g. 'Mi') fall through and
        return None, which would break the arithmetic in pv_info() -- confirm
        that PV capacities are always expressed in Gi or Ti.
        """
        if 'G' in value:
            return int(value.strip('GIgi'))
        elif 'Ti' in value:
            return 1000 * int(value.replace('Ti', ''))

    def pv_info(self):
        """ Gather info about the persistent volumes in Openshift.

        Reports totals (count, capacity, available capacity), per-phase
        counts, and dynamic per-size Zabbix discovery items.
        """
        print "\nPerforming user persistent volume count...\n"
        response = self.ora.get('/api/v1/persistentvolumes')
        pv_capacity_total = 0
        pv_capacity_available = 0
        pv_types = {'Available': 0, 'Bound': 0, 'Released': 0, 'Failed': 0}

        # Dynamic items variables (Zabbix low-level discovery)
        discovery_key_pv = 'disc.pv'
        item_prototype_macro_pv = '#OSO_PV'
        item_prototype_key_count = 'disc.pv.count'
        item_prototype_key_available = 'disc.pv.available'
        dynamic_pv_count = defaultdict(int)
        dynamic_pv_available = defaultdict(int)

        for item in response['items']:
            # gather dynamic pv counts, keyed by declared storage size string
            dynamic_pv_count[item['spec']['capacity']['storage']] += 1

            #get count of each pv type available
            pv_types[item['status']['phase']] += 1

            #get info for the capacity and capacity available
            capacity = item['spec']['capacity']['storage']
            if item['status']['phase'] == 'Available':
                # get total available capacity
                pv_capacity_available = pv_capacity_available + self.convert_to_GiB(capacity)
                # gather dynamic pv available counts
                dynamic_pv_available[item['spec']['capacity']['storage']] += 1
            pv_capacity_total = pv_capacity_total + self.convert_to_GiB(capacity)

        print "Total Persistent Volume Total count: %s" % len(response['items'])
        print 'Total Persistent Volume Capacity: %s' % pv_capacity_total
        print 'Total Persisten Volume Available Capacity: %s' % pv_capacity_available
        self.metric_sender.add_metric(
            {'openshift.master.pv.total.count' : len(response['items']),
             'openshift.master.pv.space.total': pv_capacity_total,
             'openshift.master.pv.space.available': pv_capacity_available})

        for key, value in pv_types.iteritems():
            print "Total Persistent Volume %s count: %s" % (key, value)
            self.metric_sender.add_metric(
                {'openshift.master.pv.%s.count' %key.lower() : value})

        # Add dynamic items
        self.metric_sender.add_dynamic_metric(discovery_key_pv, item_prototype_macro_pv,
                                              dynamic_pv_count.keys())
        for size, count in dynamic_pv_count.iteritems():
            print
            print "Total Persistent Volume %s count: %s" % (size, count)
            print "Total Persistent Volume available %s count: %s" % (size, dynamic_pv_available[size])
            self.metric_sender.add_metric({"%s[%s]" %(item_prototype_key_count, size) : count,
                                           "%s[%s]" %(item_prototype_key_available, size) : dynamic_pv_available[size]})

    def nodes_not_schedulable(self):
        """check the number of nodes in the cluster that are not schedulable.

        Master nodes are excluded from the count.
        """
        print "\nPerforming nodes not schedulable check..."
        response = self.ora.get('/api/v1/nodes')
        nodes_not_schedulable = []
        for n in response['items']:
            if n['metadata']['labels']['type'] == 'master':
                if self.args.verbose:
                    print "Node: %s is a master\n" % n['metadata']['name']
            else:
                # Presence of the 'unschedulable' key in spec marks the node cordoned.
                if "unschedulable" in n['spec']:
                    nodes_not_schedulable.append(n['metadata']['name'])
        print "Count of nodes not schedulable: %s" % len(nodes_not_schedulable)
        print "Nodes not schedulable: %s\n" % nodes_not_schedulable
        self.metric_sender.add_metric(
            {'openshift.master.nodesnotschedulable.count' : len(nodes_not_schedulable)})

    def nodes_not_ready(self):
        """ check the number of nodes in the cluster that are not ready.

        A node counts as not ready when its 'Ready' condition is not "true",
        or when no 'Ready' condition is present at all.
        """
        print "\nPerforming nodes not ready check..."
        response = self.ora.get('/api/v1/nodes')
        nodes_not_ready = []
        for n in response['items']:
            has_ready_status = False
            for cond in n['status']['conditions']:
                if self.args.verbose:
                    print "Get ready status of %s" % n['metadata']['name']
                if cond['type'] == "Ready":
                    has_ready_status = True
                    if cond['status'].lower() != "true":
                        if self.args.verbose:
                            print "Non-true ready status of %s : %s" % (n['metadata']['name'], cond['status'])
                        nodes_not_ready.append(n['metadata']['name'])
            if has_ready_status == False:
                if self.args.verbose:
                    print "Did not find ready status for %s" % n['metadata']['name']
                nodes_not_ready.append(n['metadata']['name'])
        print "Count of nodes not ready: %s" % len(nodes_not_ready)
        self.metric_sender.add_metric(
            {'openshift.master.nodesnotready.count' : len(nodes_not_ready)})

    def nodes_not_labeled(self):
        """ check the nodes in the cluster that are not labeled
            Note: This check only searches for nodes with no label keys set"""
        print "\nPerforming nodes not labeled check..."
        response = self.ora.get('/api/v1/nodes')
        nodes_not_labeled = []
        nodes_labeled = []
        for n in response['items']:
            if 'labels' in n['metadata']:
                nodes_labeled.append(n['metadata']['name'])
            else:
                nodes_not_labeled.append(n['metadata']['name'])
        print "Nodes not labeled: %s\nNodes labeled: %s \n" % (nodes_not_labeled, nodes_labeled)
        self.metric_sender.add_metric(
            {'openshift.master.nodesnotlabeled.count' : len(nodes_not_labeled)})
class OpenshiftMasterZaggClient(object):
    """ Checks for the Openshift Master.

    Zagg-based variant: runs health/inventory checks selected via CLI flags
    and reports results as Zabbix keys through a ZaggSender.
    """

    def __init__(self):
        # args and zagg_sender are populated by parse_args()/run().
        self.args = None
        self.zagg_sender = None
        self.ora = OpenshiftRestApi()

    def run(self):
        """ Main function to run the check.

        Each group of checks is wrapped in its own try block so one failing
        group does not prevent the others; on failure the corresponding
        "down" key is queued instead.
        """
        self.parse_args()
        self.zagg_sender = ZaggSender(verbose=self.args.verbose, debug=self.args.debug)

        try:
            if self.args.healthz or self.args.all_checks:
                self.healthz_check()
        except Exception as ex:
            print "Problem performing healthz check: %s " % ex.message
            self.zagg_sender.add_zabbix_keys(
                {'openshift.master.api.healthz': 'false'})

        try:
            if self.args.api_ping or self.args.all_checks:
                self.api_ping()
            if self.args.project_count or self.args.all_checks:
                self.project_count()
            if self.args.pod_count or self.args.all_checks:
                self.pod_count()
            if self.args.user_count or self.args.all_checks:
                self.user_count()
        except Exception as ex:
            print "Problem Openshift API checks: %s " % ex.message
            self.zagg_sender.add_zabbix_keys({'openshift.master.api.ping': 0})  # Openshift API is down

        try:
            if self.args.metrics or self.args.all_checks:
                self.metric_check()
        except Exception as ex:
            print "Problem getting Openshift metrics at /metrics: %s " % ex.message
            self.zagg_sender.add_zabbix_keys(
                {'openshift.master.metric.ping': 0})  # Openshift Metrics are down

        self.zagg_sender.send_metrics()

    def parse_args(self):
        """ parse the args from the cli """
        parser = argparse.ArgumentParser(description='Network metric sender')
        parser.add_argument('-v', '--verbose', action='store_true', default=None, help='Verbose?')
        parser.add_argument('--debug', action='store_true', default=None, help='Debug?')
        master_check_group = parser.add_argument_group(
            'Different Checks to Perform')
        master_check_group.add_argument('--all-checks', action='store_true',
                                        default=None, help='Do all of the checks')
        master_check_group.add_argument(
            '--api-ping', action='store_true', default=None,
            help='Verify the Openshift API is alive')
        master_check_group.add_argument(
            '--healthz', action='store_true', default=None,
            help='Query the Openshift Master API /healthz')
        master_check_group.add_argument(
            '--metrics', action='store_true', default=None,
            help='Query the Openshift Master Metrics at /metrics')
        master_check_group.add_argument(
            '--project-count', action='store_true', default=None,
            help='Query the Openshift Master for Number of Pods')
        master_check_group.add_argument(
            '--pod-count', action='store_true', default=None,
            help='Query the Openshift Master for Number of Running Pods')
        master_check_group.add_argument(
            '--user-count', action='store_true', default=None,
            help='Query the Openshift Master for Number of Users')
        self.args = parser.parse_args()

    def api_ping(self):
        """ Verify the Openshift API health is responding correctly.

        Queues 1 for the ping key plus the node count; failure is handled in
        run(), which queues 0 instead.
        """
        print "\nPerforming Openshift API ping check..."
        response = self.ora.get('/api/v1/nodes')
        print "\nOpenshift API ping is alive"
        print "Number of nodes in the Openshift cluster: %s" % len(
            response['items'])
        self.zagg_sender.add_zabbix_keys({
            'openshift.master.api.ping': 1,
            'openshift.master.node.count': len(response['items'])
        })

    def healthz_check(self):
        """ check the /healthz API call; reports 'true'/'false' as a string """
        print "\nPerforming /healthz check..."
        response = self.ora.get('/healthz', rtype='text')
        print "healthz check returns: %s " % response
        self.zagg_sender.add_zabbix_keys(
            {'openshift.master.api.healthz': str('ok' in response).lower()})

    def metric_check(self):
        """ collect certain metrics from the /metrics API call.

        Parses the Prometheus text exposition, extracting apiserver request
        latency quantiles (pods LIST/WATCHLIST) and scheduler e2e latency
        quantiles; values are converted from microseconds to milliseconds.
        """
        print "\nPerforming /metrics check..."
        response = self.ora.get('/metrics', rtype='text')

        for metric_type in text_string_to_metric_families(response):

            # Collect the apiserver_request_latencies_summary{resource="pods",verb="LIST",quantiles in /metrics
            # Collect the apiserver_request_latencies_summary{resource="pods",verb="WATCHLIST",quantiles in /metrics
            if metric_type.name == 'apiserver_request_latencies_summary':
                key_str = 'openshift.master.apiserver.latency.summary'
                for sample in metric_type.samples:
                    # sample layout: (name, labels-dict, value)
                    if (sample[1]['resource'] == 'pods'
                            and sample[1].has_key('quantile')
                            and 'LIST' in sample[1]['verb']):
                        curr_key_str = key_str + ".pods.quantile.%s.%s" % (
                            sample[1]['verb'],
                            sample[1]['quantile'].split('.')[1])
                        # NaN appears when no observations exist yet; report 0.
                        if math.isnan(sample[2]):
                            value = 0
                        else:
                            value = sample[2]
                        self.zagg_sender.add_zabbix_keys(
                            {curr_key_str.lower(): int(value / 1000)})

            # Collect the scheduler_e2e_scheduling_latency_microseconds{quantiles in /metrics
            if metric_type.name == 'scheduler_e2e_scheduling_latency_microseconds':
                for sample in metric_type.samples:
                    if sample[1].has_key('quantile'):
                        key_str = 'openshift.master.scheduler.e2e.scheduling.latency'
                        curr_key_str = key_str + ".quantile.%s" % (
                            sample[1]['quantile'].split('.')[1])
                        if math.isnan(sample[2]):
                            value = 0
                        else:
                            value = sample[2]
                        self.zagg_sender.add_zabbix_keys(
                            {curr_key_str.lower(): int(value / 1000)})

        self.zagg_sender.add_zabbix_keys({'openshift.master.metric.ping': 1})  # /metrics endpoint reachable

    def project_count(self):
        """ check the number of projects in Openshift, excluding system projects """
        print "\nPerforming project count check..."
        # System/infrastructure projects are not counted as user projects.
        excluded_names = [
            'openshift', 'openshift-infra', 'default', 'ops-monitor'
        ]
        response = self.ora.get('/oapi/v1/projects')
        project_names = [
            project['metadata']['name'] for project in response['items']
        ]
        valid_names = set(project_names) - set(excluded_names)
        print "Project count: %s" % len(valid_names)
        self.zagg_sender.add_zabbix_keys(
            {'openshift.project.count': len(valid_names)})

    def pod_count(self):
        """ check the number of pods in Openshift (total, running, user-running) """
        print "\nPerforming pod count check..."
        response = self.ora.get('/api/v1/pods')

        # Get running pod count
        # NOTE(review): only the first container's state is inspected -- assumes
        # it is representative of the pod; confirm for multi-container pods.
        running_pod_count = 0
        for i in response['items']:
            if 'containerStatuses' in i['status']:
                if 'running' in i['status']['containerStatuses'][0]['state']:
                    running_pod_count += 1

        # Get running pod count on compute only nodes (non-infra)
        running_user_pod_count = 0
        for i in response['items']:
            if 'containerStatuses' in i['status']:
                if 'running' in i['status']['containerStatuses'][0]['state']:
                    if 'nodeSelector' in i['spec']:
                        # NOTE(review): unlike the MetricSender variant, this
                        # indexes 'type' without a presence check -- a pod with
                        # a nodeSelector lacking 'type' raises KeyError here.
                        if i['spec']['nodeSelector']['type'] == 'compute':
                            running_user_pod_count += 1

        print "Total pod count: %s" % len(response['items'])
        print "Running pod count: %s" % running_pod_count
        print "User Running pod count: %s" % running_user_pod_count
        self.zagg_sender.add_zabbix_keys({
            'openshift.master.pod.running.count': running_pod_count,
            'openshift.master.pod.user.running.count': running_user_pod_count,
            'openshift.master.pod.total.count': len(response['items'])
        })

    def user_count(self):
        """ check the number of users in Openshift """
        print "\nPerforming user count check..."
        response = self.ora.get('/oapi/v1/users')
        print "Total user count: %s" % len(response['items'])
        self.zagg_sender.add_zabbix_keys(
            {'openshift.master.user.count': len(response['items'])})
class OpenshiftWebServiceChecker(object): """ Checks for Openshift Pods """ def __init__(self): self.args = None self.ora = None self.zagg_sender = None self.service_ip = None self.service_port = '443' def run(self): """ Main function to run the check """ self.parse_args() self.ora = OpenshiftRestApi() self.zagg_sender = ZaggSender(verbose=self.args.verbose, debug=self.args.debug) try: self.get_service() status = self.check_service() except Exception as ex: print "Problem retreiving data: %s " % ex.message self.zagg_sender.add_zabbix_keys({ "openshift.webservice.{}.status".format(self.args.pod) : status}) self.zagg_sender.send_metrics() def get_service(self): """ Gets the service for a pod """ print "\nLooking up services for pod\n" api_url = "/api/v1/services" if (str(self.args.namespace) != "None") & \ (str(self.args.namespace) != "all"): api_url = '/api/v1/namespaces/{}/services'.format(self.args.namespace) print "using api url {}".format(api_url) api_yaml = self.ora.get(api_url, rtype='text') services = yaml.safe_load(api_yaml) for service in services["items"]: if self.args.pod and \ self.args.pod in service["metadata"]["name"]: print "service IP is {}".format(service["spec"]["clusterIP"]) self.service_ip = service["spec"]["clusterIP"] if self.args.portname != None: for port in service["spec"]["ports"]: if port["name"] == self.args.portname: self.service_port = port["port"] else: self.service_port = service["spec"]["ports"][0]["port"] else: pass def check_service(self): """ Checks the web service """ print "\nChecking web service\n" if self.args.insecure: proto = 'http' else: proto = 'https' url = '{}://{}:{}/{}'.format( proto, self.service_ip, self.service_port, self.args.url, ) try: print "Performing check on URL: {}".format(url) response = urllib2.urlopen(url, timeout=30) if str(response.getcode()) == self.args.status: if self.args.content == None \ or self.args.content in response.read(): return True except urllib2.URLError: print "Received error accessing 
URL: {}".format(url) except socket.timeout: print "Timed out accessing URL: {}".format(url) return False def parse_args(self): """ parse the args from the cli """ parser = argparse.ArgumentParser(description='Openshift pod sender') parser.add_argument('-p', '--pod', default=None, help='Check for pod with this specific name') parser.add_argument('-n', '--namespace', default=None, help='Check for pods in this namespace - "all" for all') parser.add_argument('-P', '--portname', default=None, help='name of the port to check') parser.add_argument('-u', '--url', default="/", help='URL to check. Defaults to "/".') parser.add_argument('-s', '--status', default="200", help='HTTP status code to expect. Defaults to 200') parser.add_argument('-c', '--content', default=None, help='Looks for a string in the content of the response.') parser.add_argument('-i', '--insecure', help='Use insecure http connection') parser.add_argument('-S', '--secure', help='Use secure https connection (default)') parser.add_argument('-v', '--verbose', action='store_true', default=None, help='Verbose?') parser.add_argument('--debug', action='store_true', default=None, help='Debug?') self.args = parser.parse_args()
def __init__(self):
    """Initialize default state; args and sender are set later by run()."""
    self.ora = OpenshiftRestApi()
    # Filled in by parse_args()/run() respectively.
    self.args = None
    self.zagg_sender = None
class OpenshiftClusterCapacity(object):
    ''' Checks for cluster capacity.

    Loads node and pod data from the OpenShift API into an in-memory
    SQLite database, then derives scheduled/unscheduled CPU and memory
    totals and oversubscription percentages for compute nodes, sending
    the results to Zabbix via ZaggSender.
    '''

    def __init__(self):
        self.args = None        # parsed CLI namespace
        self.zagg_sender = None # ZaggSender, created in run()
        self.ora = None         # OpenshiftRestApi, created in run()
        self.sql_conn = None    # sqlite3 in-memory connection
        self.zbx_key_prefix = "openshift.master.cluster.compute_nodes."

    def run(self):
        ''' Main function to run the check '''
        self.parse_args()
        self.zagg_sender = ZaggSender(verbose=self.args.verbose,
                                      debug=self.args.debug)

        # Read the master config to learn the API URL to talk to.
        # NOTE(review): yaml.load without a Loader is unsafe on untrusted
        # input; acceptable only because this file is operator-controlled.
        master_cfg = []
        with open(self.args.master_config, 'r') as yml:
            master_cfg = yaml.load(yml)

        self.ora = OpenshiftRestApi(
            host=master_cfg['oauthConfig']['masterURL'],
            verify_ssl=True)

        self.cluster_capacity()

        if not self.args.dry_run:
            self.zagg_sender.send_metrics()

    def parse_args(self):
        ''' parse the args from the cli '''
        parser = argparse.ArgumentParser(description='Cluster capacity sender')
        parser.add_argument(
            '--master-config',
            default='/etc/origin/master/master-config.yaml',
            help='Location of OpenShift master-config.yml file')
        parser.add_argument('-v', '--verbose', action='store_true',
                            default=None, help='Verbose?')
        parser.add_argument('--debug', action='store_true',
                            default=None, help='Debug?')
        parser.add_argument('--dry-run', action='store_true',
                            default=False, help='Do not sent results to Zabbix')
        self.args = parser.parse_args()

    def load_nodes(self):
        ''' load nodes into SQL.

        One row per schedulable, Ready node with its max CPU (millicores),
        memory (bytes) and pod capacity.
        '''
        self.sql_conn.execute('''CREATE TABLE nodes
                                 (name text, type text, api text,
                                  max_cpu integer, max_memory integer,
                                  max_pods integer)''')

        response = self.ora.get('/api/v1/nodes')

        for new_node in response['items']:
            # Skip nodes not in 'Ready' state
            node_ready = False
            for condition in new_node['status']['conditions']:
                if condition['type'] == 'Ready' and \
                   condition['status'] == 'True':
                    node_ready = True

            if not node_ready:
                continue

            # Skip unschedulable nodes
            if new_node['spec'].get('unschedulable'):
                continue

            node = {}
            node['name'] = new_node['metadata']['name']
            # 'type' label distinguishes master/infra/compute nodes
            node['type'] = new_node['metadata']['labels'].get(
                'type', 'unknown')
            node['api'] = new_node['metadata']['selfLink']
            # Prefer 'allocatable' (resources available to pods) over raw
            # 'capacity' when the API provides it.
            if 'allocatable' in new_node['status']:
                cpu = new_node['status']['allocatable']['cpu']
                mem = new_node['status']['allocatable']['memory']
                node['max_pods'] = int(
                    new_node['status']['allocatable']['pods'])
            else:
                cpu = new_node['status']['capacity']['cpu']
                mem = new_node['status']['capacity']['memory']
                node['max_pods'] = int(new_node['status']['capacity']['pods'])

            # Normalize Kubernetes quantity strings to integers
            node['max_cpu'] = to_milicores(cpu)
            node['max_memory'] = to_bytes(mem)

            if self.args.debug:
                print "Adding node: {}".format(str(node))

            self.sql_conn.execute(
                'INSERT INTO nodes VALUES (?,?,?,?,?,?)',
                (node['name'], node['type'], node['api'], node['max_cpu'],
                 node['max_memory'], node['max_pods']))

    @staticmethod
    def load_container_limits(pod, containers):
        ''' process/store container limits data.

        Accumulates each container's CPU/memory limits and requests into
        the pod dict (keys created lazily, missing resources default to 0).
        '''
        for container in containers:
            if 'limits' in container['resources']:
                pod['cpu_limits'] = int(pod.get('cpu_limits', 0)) \
                    + int(to_milicores(container['resources']['limits'].get('cpu', '0')))
                pod['memory_limits'] = int(pod.get('memory_limits', 0)) \
                    + int(to_bytes(container['resources']['limits'].get('memory', '0')))
            if 'requests' in container['resources']:
                pod['cpu_requests'] = int(pod.get('cpu_requests', 0)) \
                    + int(to_milicores(container['resources']['requests'].get('cpu', '0')))
                pod['memory_requests'] = int(pod.get('memory_requests', 0)) \
                    + int(to_bytes(container['resources']['requests'].get('memory', '0')))

    def load_pods(self):
        ''' put pod details into db (Running pods only) '''
        self.sql_conn.execute('''CREATE TABLE pods
                                 (name text, namespace text, api text,
                                  cpu_limits integer, cpu_requests integer,
                                  memory_limits integer,
                                  memory_requests integer, node text)''')

        response = self.ora.get('/api/v1/pods')

        for new_pod in response['items']:
            if new_pod['status']['phase'] != 'Running':
                continue
            pod = {}
            pod['name'] = new_pod['metadata']['name']
            pod['namespace'] = new_pod['metadata']['namespace']
            pod['api'] = new_pod['metadata']['selfLink']
            pod['node'] = new_pod['spec']['nodeName']
            self.load_container_limits(pod, new_pod['spec']['containers'])
            # limits/requests may be absent (NULL in SQL) when no container
            # declared them -- hence pod.get() without defaults here.
            self.sql_conn.execute(
                'INSERT INTO pods VALUES (?,?,?,?,?,?,?,?)',
                (pod['name'], pod['namespace'], pod['api'],
                 pod.get('cpu_limits'), pod.get('cpu_requests'),
                 pod.get('memory_limits'), pod.get('memory_requests'),
                 pod['node']))

    def get_largest_pod(self):
        ''' return single largest memory request number for all running pods '''
        max_pod = 0
        for row in self.sql_conn.execute('''SELECT MAX(memory_requests)
                                            FROM pods, nodes
                                            WHERE pods.node=nodes.name
                                            AND nodes.type="compute"'''):
            max_pod = row[0]
        return max_pod

    def how_many_schedulable(self, node_size):
        ''' return how many pods with memory request 'node_size' can be
            scheduled across all compute nodes '''
        nodes = {}

        # get max mem for each compute node
        for row in self.sql_conn.execute('''SELECT nodes.name,
                                                   nodes.max_memory
                                            FROM nodes
                                            WHERE nodes.type="compute"'''):
            nodes[row[0]] = {
                'max_memory': row[1],
                # set memory_allocated to '0' because node may have
                # no pods running, and next SQL query below will
                # leave this field unpopulated
                'memory_scheduled': 0
            }

        # get memory requests for all pods on all compute nodes
        for row in self.sql_conn.execute('''SELECT nodes.name,
                                                   SUM(pods.memory_requests)
                                            FROM pods, nodes
                                            WHERE pods.node=nodes.name
                                            AND nodes.type="compute"
                                            GROUP BY nodes.name'''):
            nodes[row[0]]['memory_scheduled'] = row[1]

        schedulable = 0
        for node in nodes.keys():
            # TODO: Some containers from `oc get pods --all-namespaces -o json`
            # don't have resources scheduled, causing memory_scheduled == 0
            available = nodes[node]['max_memory'] - \
                nodes[node]['memory_scheduled']
            num = available / node_size
            # ignore negative number (overcommitted nodes)
            if num > 0:
                schedulable += num
        return schedulable

    def get_compute_nodes_max_schedulable_cpu(self):
        ''' calculate total schedulable CPU (in milicores)
            for all compute nodes '''
        max_cpu = 0
        for row in self.sql_conn.execute('''SELECT SUM(nodes.max_cpu)
                                            FROM nodes
                                            WHERE nodes.type="compute" '''):
            max_cpu = row[0]
        return max_cpu

    def get_compute_nodes_max_schedulable_mem(self):
        ''' calculate total schedulable memory for all compute nodes '''
        max_mem = 0
        for row in self.sql_conn.execute('''SELECT SUM(nodes.max_memory)
                                            FROM nodes
                                            WHERE nodes.type="compute" '''):
            max_mem = row[0]
        return max_mem

    def get_compute_nodes_scheduled_cpu(self):
        ''' calculate cpu scheduled to pods (total requested and
            percentage of cluster-wide total).

        Returns (scheduled, scheduled_pct, unscheduled, unscheduled_pct).
        '''
        max_cpu = self.get_compute_nodes_max_schedulable_cpu()

        cpu_requests_for_all_pods = 0
        for row in self.sql_conn.execute('''SELECT SUM(pods.cpu_requests)
                                            FROM pods, nodes
                                            WHERE pods.node = nodes.name
                                            AND nodes.type = "compute" '''):
            cpu_requests_for_all_pods = row[0]

        cpu_scheduled_as_pct = 100.0 * cpu_requests_for_all_pods / max_cpu
        cpu_unscheduled = max_cpu - cpu_requests_for_all_pods
        cpu_unscheduled_as_pct = 100.0 * cpu_unscheduled / max_cpu

        return (cpu_requests_for_all_pods, cpu_scheduled_as_pct,
                cpu_unscheduled, cpu_unscheduled_as_pct)

    def get_compute_nodes_scheduled_mem(self):
        ''' calculate mem allocated to pods (total requested and
            percentage of cluster-wide total).

        Returns (scheduled, scheduled_pct, unscheduled, unscheduled_pct).
        '''
        max_mem = self.get_compute_nodes_max_schedulable_mem()

        mem_requests_for_all_pods = 0
        for row in self.sql_conn.execute('''SELECT SUM(pods.memory_requests)
                                            FROM pods, nodes
                                            WHERE pods.node = nodes.name
                                            AND nodes.type = "compute" '''):
            mem_requests_for_all_pods = row[0]

        mem_scheduled_as_pct = 100.0 * mem_requests_for_all_pods / max_mem
        mem_unscheduled = max_mem - mem_requests_for_all_pods
        mem_unscheduled_as_pct = 100.0 * mem_unscheduled / max_mem

        return (mem_requests_for_all_pods, mem_scheduled_as_pct,
                mem_unscheduled, mem_unscheduled_as_pct)

    def get_oversub_cpu(self):
        ''' return percentage oversubscribed based on CPU limits
            on runing pods (can be negative when undersubscribed) '''
        max_cpu = self.get_compute_nodes_max_schedulable_cpu()

        pod_cpu_limits = 0
        # get cpu limits for all running pods
        for row in self.sql_conn.execute('''SELECT SUM(pods.cpu_limits)
                                            FROM pods, nodes
                                            WHERE pods.node = nodes.name
                                            AND nodes.type = "compute" '''):
            pod_cpu_limits = row[0]

        return ((float(pod_cpu_limits) / max_cpu) * 100.0) - 100

    def get_oversub_mem(self):
        ''' return percentage oversubscribed based on memory limits
            on running pods (can be negative when undersubscribed) '''
        max_mem = self.get_compute_nodes_max_schedulable_mem()

        pod_mem_limits = 0
        # get mem limits for all running pods
        for row in self.sql_conn.execute('''SELECT SUM(pods.memory_limits)
                                            FROM pods, nodes
                                            WHERE pods.node = nodes.name
                                            AND nodes.type = "compute" '''):
            pod_mem_limits = row[0]

        return ((float(pod_mem_limits) / max_mem) * 100.0) - 100

    def do_cpu_stats(self):
        ''' gather and report CPU statistics '''
        # CPU items
        zbx_key_max_schedulable_cpu = self.zbx_key_prefix + "max_schedulable.cpu"
        zbx_key_scheduled_cpu = self.zbx_key_prefix + "scheduled.cpu"
        zbx_key_scheduled_cpu_pct = self.zbx_key_prefix + "scheduled.cpu_pct"
        zbx_key_unscheduled_cpu = self.zbx_key_prefix + "unscheduled.cpu"
        zbx_key_unscheduled_cpu_pct = self.zbx_key_prefix + "unscheduled.cpu_pct"
        zbx_key_oversub_cpu_pct = self.zbx_key_prefix + "oversubscribed.cpu_pct"

        print "CPU Stats:"

        max_schedulable_cpu = self.get_compute_nodes_max_schedulable_cpu()
        self.zagg_sender.add_zabbix_keys(
            {zbx_key_max_schedulable_cpu: max_schedulable_cpu})

        scheduled_cpu, scheduled_cpu_pct, unscheduled_cpu, unscheduled_cpu_pct = self.get_compute_nodes_scheduled_cpu()
        oversub_cpu_pct = self.get_oversub_cpu()

        print " Scheduled CPU for compute nodes:\t\t\t" + \
            "{:>15} milicores".format(scheduled_cpu)
        print " Unscheduled CPU for compute nodes:\t\t\t" + \
            "{:>15} milicores".format(unscheduled_cpu)
        print " Maximum (total) schedulable CPU for compute " + \
            "nodes:\t{:>15} milicores".format(max_schedulable_cpu)
        print " Percent scheduled CPU for compute nodes:\t\t\t" + \
            "{:.2f}%".format(scheduled_cpu_pct)
        print " Percent unscheduled CPU for compute nodes:\t\t\t" + \
            "{:.2f}%".format(unscheduled_cpu_pct)
        print " Percent oversubscribed CPU for compute nodes: \t\t" + \
            "{:.2f}%".format(oversub_cpu_pct)

        # Percentages are truncated to ints for Zabbix item types
        self.zagg_sender.add_zabbix_keys(
            {zbx_key_scheduled_cpu: scheduled_cpu})
        self.zagg_sender.add_zabbix_keys(
            {zbx_key_scheduled_cpu_pct: int(scheduled_cpu_pct)})
        self.zagg_sender.add_zabbix_keys(
            {zbx_key_unscheduled_cpu: unscheduled_cpu})
        self.zagg_sender.add_zabbix_keys(
            {zbx_key_unscheduled_cpu_pct: int(unscheduled_cpu_pct)})
        self.zagg_sender.add_zabbix_keys(
            {zbx_key_oversub_cpu_pct: int(oversub_cpu_pct)})

    def do_mem_stats(self):
        ''' gather and report memory statistics '''
        # Memory items
        zbx_key_max_schedulable_mem = self.zbx_key_prefix + "max_schedulable.mem"
        zbx_key_scheduled_mem = self.zbx_key_prefix + "scheduled.mem"
        zbx_key_scheduled_mem_pct = self.zbx_key_prefix + "scheduled.mem_pct"
        zbx_key_unscheduled_mem = self.zbx_key_prefix + "unscheduled.mem"
        zbx_key_unscheduled_mem_pct = self.zbx_key_prefix + "unscheduled.mem_pct"
        zbx_key_oversub_mem_pct = self.zbx_key_prefix + "oversubscribed.mem_pct"

        print "\nMemory Stats:"

        max_schedulable_mem = self.get_compute_nodes_max_schedulable_mem()
        self.zagg_sender.add_zabbix_keys(
            {zbx_key_max_schedulable_mem: max_schedulable_mem})

        scheduled_mem, scheduled_mem_pct, unscheduled_mem, unscheduled_mem_pct = self.get_compute_nodes_scheduled_mem()
        oversub_mem_pct = self.get_oversub_mem()

        print " Scheduled mem for compute nodes:\t\t\t" + \
            "{:>20} bytes".format(scheduled_mem)
        print " Unscheduled mem for compute nodes:\t\t\t" + \
            "{:>20} bytes".format(unscheduled_mem)
        print " Maximum (total) schedulable memory for compute nodes:\t" + \
            "{:>20} bytes".format(max_schedulable_mem)
        print " Percent scheduled mem for compute nodes:\t\t\t" + \
            "{:.2f}%".format(scheduled_mem_pct)
        print " Percent unscheduled mem for compute nodes:\t\t\t" + \
            "{:.2f}%".format(unscheduled_mem_pct)
        print " Percent oversubscribed mem for compute nodes: \t\t" + \
            "{:.2f}%".format(oversub_mem_pct)

        # Percentages are truncated to ints for Zabbix item types
        self.zagg_sender.add_zabbix_keys(
            {zbx_key_scheduled_mem: scheduled_mem})
        self.zagg_sender.add_zabbix_keys(
            {zbx_key_scheduled_mem_pct: int(scheduled_mem_pct)})
        self.zagg_sender.add_zabbix_keys(
            {zbx_key_unscheduled_mem: unscheduled_mem})
        self.zagg_sender.add_zabbix_keys(
            {zbx_key_unscheduled_mem_pct: int(unscheduled_mem_pct)})
        self.zagg_sender.add_zabbix_keys(
            {zbx_key_oversub_mem_pct: int(oversub_mem_pct)})

    def cluster_capacity(self):
        ''' check capacity of compute nodes on cluster '''
        # Other zabbix items
        zbx_key_max_pods = "openshift.master.cluster.max_mem_pods_schedulable"

        # In-memory DB: rebuilt from the API on every run
        self.sql_conn = sqlite3.connect(':memory:')
        self.load_nodes()
        self.load_pods()
        self.do_cpu_stats()
        self.do_mem_stats()

        print "\nOther stats:"
        largest = self.get_largest_pod()
        if self.args.debug:
            print " Largest memory pod: {}".format(largest)

        schedulable = self.how_many_schedulable(largest)
        print " Number of max-size nodes schedulable:\t\t\t\t{}".format(
            schedulable)
        self.zagg_sender.add_zabbix_keys({zbx_key_max_pods: schedulable})
# Our jenkins server does not include these rpms. # In the future we might move this to a container where these # libs might exist #pylint: disable=import-error from openshift_tools.web.openshift_rest_api import OpenshiftRestApi from openshift_tools.monitoring.metric_sender import MetricSender import logging logging.basicConfig( format='%(asctime)s - %(relativeCreated)6d - %(levelname)-8s - %(message)s', ) logger = logging.getLogger() logger.setLevel(logging.INFO) ora = OpenshiftRestApi() valid_node_types = ["master", "infra", "compute"] def parse_args(): """ parse the args from the cli """ logger.debug("parse_args()") parser = argparse.ArgumentParser(description='OpenShift node counts') parser.add_argument('-v', '--verbose', action='store_true', default=None, help='Verbose?') return parser.parse_args() def send_metrics(expected, actual): """ send data to MetricSender """ logger.debug("send_metrics()")
class OpenshiftWebServiceChecker(object): """ Checks for Openshift Pods """ def __init__(self): self.args = None self.ora = None self.metric_sender = None self.service_ip = None self.service_port = '443' def run(self): """ Main function to run the check """ self.parse_args() self.ora = OpenshiftRestApi() self.metric_sender = MetricSender(verbose=self.args.verbose, debug=self.args.debug) try: self.get_service() status = self.check_service() except Exception as ex: print "Problem retreiving data: %s " % ex.message self.metric_sender.add_metric({ "openshift.webservice.{}.status".format(self.args.pod) : status}) self.metric_sender.send_metrics() def get_service(self): """ Gets the service for a pod """ print "\nLooking up services for pod\n" api_url = "/api/v1/services" if (str(self.args.namespace) != "None") & \ (str(self.args.namespace) != "all"): api_url = '/api/v1/namespaces/{}/services'.format(self.args.namespace) print "using api url {}".format(api_url) api_yaml = self.ora.get(api_url, rtype='text') services = yaml.safe_load(api_yaml) for service in services["items"]: if self.args.pod and \ self.args.pod in service["metadata"]["name"]: print "service IP is {}".format(service["spec"]["clusterIP"]) self.service_ip = service["spec"]["clusterIP"] if self.args.portname != None: for port in service["spec"]["ports"]: if port["name"] == self.args.portname: self.service_port = port["port"] else: self.service_port = service["spec"]["ports"][0]["port"] else: pass def check_service(self): """ Checks the web service """ print "\nChecking web service\n" if self.args.insecure: proto = 'http' else: proto = 'https' url = '{}://{}:{}/{}'.format( proto, self.service_ip, self.service_port, self.args.url, ) try: print "Performing check on URL: {}".format(url) response = urllib2.urlopen(url, timeout=30) if str(response.getcode()) == self.args.status: if self.args.content == None \ or self.args.content in response.read(): return True except urllib2.URLError: print "Received error 
accessing URL: {}".format(url) except socket.timeout: print "Timed out accessing URL: {}".format(url) return False def parse_args(self): """ parse the args from the cli """ parser = argparse.ArgumentParser(description='Openshift pod sender') parser.add_argument('-p', '--pod', default=None, help='Check for pod with this specific name') parser.add_argument('-n', '--namespace', default=None, help='Check for pods in this namespace - "all" for all') parser.add_argument('-P', '--portname', default=None, help='name of the port to check') parser.add_argument('-u', '--url', default="/", help='URL to check. Defaults to "/".') parser.add_argument('-s', '--status', default="200", help='HTTP status code to expect. Defaults to 200') parser.add_argument('-c', '--content', default=None, help='Looks for a string in the content of the response.') parser.add_argument('-i', '--insecure', help='Use insecure http connection') parser.add_argument('-S', '--secure', help='Use secure https connection (default)') parser.add_argument('-v', '--verbose', action='store_true', default=None, help='Verbose?') parser.add_argument('--debug', action='store_true', default=None, help='Debug?') self.args = parser.parse_args()
class OpenshiftMasterZaggClient(object):
    """ Checks for the Openshift Master.

    Runs a configurable set of health/inventory checks against the master
    API (healthz, ping, project/pod/user counts, PV info, node readiness,
    /metrics latencies) and ships results to Zabbix via ZaggSender.
    """

    def __init__(self):
        self.args = None                 # parsed CLI namespace
        self.zagg_sender = None          # ZaggSender, created in run()
        self.ora = None                  # OpenshiftRestApi, created in run()
        self.zabbix_api_key = None       # api-ping key; varies with --local
        self.zabbix_healthz_key = None   # healthz key; varies with --local

    def run(self):
        """ Main function to run the check """
        self.parse_args()
        self.zagg_sender = ZaggSender(verbose=self.args.verbose,
                                      debug=self.args.debug)

        if self.args.local:
            # Local mode: hit 127.0.0.1, force the two basic checks on,
            # and report under the *.local.* Zabbix keys.
            self.ora = OpenshiftRestApi()
            self.args.api_ping = True
            self.args.healthz = True
            self.zabbix_api_key = 'openshift.master.local.api.ping'
            self.zabbix_healthz_key = 'openshift.master.local.api.healthz'
        else:
            # Cluster mode: read the master URL from the master config.
            # NOTE(review): yaml.load without a Loader is unsafe on
            # untrusted input; acceptable for an operator-owned file.
            master_cfg_from_yaml = []
            with open('/etc/origin/master/master-config.yaml', 'r') as yml:
                master_cfg_from_yaml = yaml.load(yml)
            self.ora = OpenshiftRestApi(
                host=master_cfg_from_yaml['oauthConfig']['masterURL'],
                verify_ssl=True)
            self.zabbix_api_key = 'openshift.master.api.ping'
            self.zabbix_healthz_key = 'openshift.master.api.healthz'

        # healthz is checked in its own try block so its failure is
        # reported independently of the other API checks.
        try:
            if self.args.healthz or self.args.all_checks:
                self.healthz_check()
        except Exception as ex:
            print "Problem performing healthz check: %s " % ex.message
            self.zagg_sender.add_zabbix_keys({self.zabbix_healthz_key: 'false'})

        try:
            if self.args.api_ping or self.args.all_checks:
                self.api_ping()

            if self.args.project_count or self.args.all_checks:
                self.project_count()

            if self.args.pod_count or self.args.all_checks:
                self.pod_count()

            if self.args.user_count or self.args.all_checks:
                self.user_count()

            if self.args.pv_info or self.args.all_checks:
                self.pv_info()

            if self.args.nodes_not_ready or self.args.all_checks:
                self.nodes_not_ready()

        except Exception as ex:
            print "Problem Openshift API checks: %s " % ex.message
            self.zagg_sender.add_zabbix_keys(
                {self.zabbix_api_key: 0})  # Openshift API is down

        try:
            if self.args.metrics or self.args.all_checks:
                self.metric_check()

        except Exception as ex:
            print "Problem getting Openshift metrics at /metrics: %s " % ex.message
            self.zagg_sender.add_zabbix_keys(
                {'openshift.master.metric.ping': 0})  # Openshift Metrics are down

        self.zagg_sender.send_metrics()

    def parse_args(self):
        """ parse the args from the cli """
        parser = argparse.ArgumentParser(description='Network metric sender')
        parser.add_argument('-v', '--verbose', action='store_true',
                            default=None, help='Verbose?')
        parser.add_argument('--debug', action='store_true',
                            default=None, help='Debug?')
        parser.add_argument('-l', '--local', action='store_true', default=False,
                            help='Run local checks against the local API (https://127.0.0.1)')

        master_check_group = parser.add_argument_group('Different Checks to Perform')
        master_check_group.add_argument('--all-checks', action='store_true',
                                        default=None,
                                        help='Do all of the checks')
        master_check_group.add_argument('--api-ping', action='store_true',
                                        default=None,
                                        help='Verify the Openshift API is alive')
        master_check_group.add_argument('--healthz', action='store_true',
                                        default=None,
                                        help='Query the Openshift Master API /healthz')
        master_check_group.add_argument('--metrics', action='store_true',
                                        default=None,
                                        help='Query the Openshift Master Metrics at /metrics')
        master_check_group.add_argument('--project-count', action='store_true',
                                        default=None,
                                        help='Query the Openshift Master for Number of Pods')
        master_check_group.add_argument('--pod-count', action='store_true',
                                        default=None,
                                        help='Query the Openshift Master for Number of Running Pods')
        master_check_group.add_argument('--user-count', action='store_true',
                                        default=None,
                                        help='Query the Openshift Master for Number of Users')
        master_check_group.add_argument('--pv-info', action='store_true',
                                        default=None,
                                        help='Query the Openshift Master for Persistent Volumes Info')
        master_check_group.add_argument('--nodes-not-ready', action='store_true',
                                        default=None,
                                        help='Query the Openshift Master for number of nodes not in Ready state')

        self.args = parser.parse_args()

    def api_ping(self):
        """ Verify the Openshift API health is responding correctly """
        print "\nPerforming Openshift API ping check..."
        response = self.ora.get('/api/v1/nodes')
        print "\nOpenshift API ping is alive"
        print "Number of nodes in the Openshift cluster: %s" % len(response['items'])
        self.zagg_sender.add_zabbix_keys(
            {self.zabbix_api_key: 1,
             'openshift.master.node.count': len(response['items'])})

    def healthz_check(self):
        """ check the /healthz API call """
        print "\nPerforming /healthz check..."
        response = self.ora.get('/healthz', rtype='text')
        print "healthz check returns: %s " %response
        # Reports the literal strings 'true'/'false' depending on whether
        # the response contains 'ok'.
        self.zagg_sender.add_zabbix_keys(
            {self.zabbix_healthz_key: str('ok' in response).lower()})

    def metric_check(self):
        """ collect certain metrics from the /metrics API call """
        print "\nPerforming /metrics check..."
        response = self.ora.get('/metrics', rtype='text')

        for metric_type in text_string_to_metric_families(response):

            # Collect the apiserver_request_latencies_summary{resource="pods",verb="LIST",quantiles in /metrics
            # Collect the apiserver_request_latencies_summary{resource="pods",verb="WATCHLIST",quantiles in /metrics
            if metric_type.name == 'apiserver_request_latencies_summary':
                key_str = 'openshift.master.apiserver.latency.summary'
                for sample in metric_type.samples:
                    if (sample[1]['resource'] == 'pods'
                            and sample[1].has_key('quantile')
                            and 'LIST' in sample[1]['verb']):
                        curr_key_str = key_str + ".pods.quantile.%s.%s" % (
                            sample[1]['verb'],
                            sample[1]['quantile'].split('.')[1])
                        # NaN samples (no observations yet) reported as 0
                        if math.isnan(sample[2]):
                            value = 0
                        else:
                            value = sample[2]
                        # microseconds -> milliseconds
                        self.zagg_sender.add_zabbix_keys(
                            {curr_key_str.lower(): int(value/1000)})

            # Collect the scheduler_e2e_scheduling_latency_microseconds{quantiles in /metrics
            if metric_type.name == 'scheduler_e2e_scheduling_latency_microseconds':
                for sample in metric_type.samples:
                    if sample[1].has_key('quantile'):
                        key_str = 'openshift.master.scheduler.e2e.scheduling.latency'
                        curr_key_str = key_str + ".quantile.%s" % (
                            sample[1]['quantile'].split('.')[1])
                        if math.isnan(sample[2]):
                            value = 0
                        else:
                            value = sample[2]
                        self.zagg_sender.add_zabbix_keys(
                            {curr_key_str.lower(): int(value/1000)})

        # Mark the metrics endpoint itself as reachable
        self.zagg_sender.add_zabbix_keys({'openshift.master.metric.ping': 1})

    def project_count(self):
        """ check the number of projects in Openshift """
        print "\nPerforming project count check..."
        # System/infra projects excluded from the count
        excluded_names = ['openshift', 'openshift-infra', 'default', 'ops-monitor']
        response = self.ora.get('/oapi/v1/projects')
        project_names = [project['metadata']['name'] for project in response['items']]
        valid_names = set(project_names) - set(excluded_names)
        print "Project count: %s" % len(valid_names)
        self.zagg_sender.add_zabbix_keys({'openshift.project.count': len(valid_names)})

    def pod_count(self):
        """ check the number of pods in Openshift """
        print "\nPerforming pod count check..."
        response = self.ora.get('/api/v1/pods')

        # Get running pod count
        # NOTE(review): only the first containerStatuses entry is
        # inspected; multi-container pods may be miscounted -- confirm.
        running_pod_count = 0
        for i in response['items']:
            if 'containerStatuses' in i['status']:
                if 'running' in i['status']['containerStatuses'][0]['state']:
                    running_pod_count += 1

        # Get running pod count on compute only nodes (non-infra)
        running_user_pod_count = 0
        for i in response['items']:
            if 'containerStatuses' in i['status']:
                if 'running' in i['status']['containerStatuses'][0]['state']:
                    if 'nodeSelector' in i['spec']:
                        if i['spec']['nodeSelector']['type'] == 'compute':
                            running_user_pod_count += 1

        print "Total pod count: %s" % len(response['items'])
        print "Running pod count: %s" % running_pod_count
        print "User Running pod count: %s" % running_user_pod_count

        self.zagg_sender.add_zabbix_keys(
            {'openshift.master.pod.running.count': running_pod_count,
             'openshift.master.pod.user.running.count': running_user_pod_count,
             'openshift.master.pod.total.count': len(response['items'])})

    def user_count(self):
        """ check the number of users in Openshift """
        print "\nPerforming user count check..."
        response = self.ora.get('/oapi/v1/users')
        print "Total user count: %s" % len(response['items'])
        self.zagg_sender.add_zabbix_keys(
            {'openshift.master.user.count': len(response['items'])})

    def pv_info(self):
        """ Gather info about the persistent volumes in Openshift """
        print "\nPerforming user persistent volume count...\n"
        response = self.ora.get('/api/v1/persistentvolumes')
        pv_capacity_total = 0
        pv_capacity_available = 0
        pv_types = {'Available': 0, 'Bound': 0, 'Released': 0, 'Failed': 0}

        # Dynamic items variables (Zabbix low-level discovery)
        discovery_key_pv = 'disc.pv'
        item_prototype_macro_pv = '#OSO_PV'
        item_prototype_key_count = 'disc.pv.count'
        item_prototype_key_available = 'disc.pv.available'
        dynamic_pv_count = defaultdict(int)
        dynamic_pv_available = defaultdict(int)

        for item in response['items']:
            # gather dynamic pv counts
            dynamic_pv_count[item['spec']['capacity']['storage']] += 1

            #get count of each pv type available
            pv_types[item['status']['phase']] += 1

            #get info for the capacity and capacity available
            # NOTE(review): capacity parsing assumes 'Gi' suffixed sizes
            capacity = item['spec']['capacity']['storage']
            if item['status']['phase'] == 'Available':
                # get total available capacity
                pv_capacity_available = pv_capacity_available + int(capacity.replace('Gi', ''))

                # gather dynamic pv available counts
                dynamic_pv_available[item['spec']['capacity']['storage']] += 1

            pv_capacity_total = pv_capacity_total + int(capacity.replace('Gi', ''))

        print "Total Persistent Volume Total count: %s" % len(response['items'])
        print 'Total Persistent Volume Capacity: %s' % pv_capacity_total
        print 'Total Persisten Volume Available Capacity: %s' % pv_capacity_available

        self.zagg_sender.add_zabbix_keys(
            {'openshift.master.pv.total.count': len(response['items']),
             'openshift.master.pv.space.total': pv_capacity_total,
             'openshift.master.pv.space.available': pv_capacity_available})

        for key, value in pv_types.iteritems():
            print "Total Persistent Volume %s count: %s" % (key, value)
            self.zagg_sender.add_zabbix_keys(
                {'openshift.master.pv.%s.count' %key.lower(): value})

        # Add dynamic items
        self.zagg_sender.add_zabbix_dynamic_item(discovery_key_pv,
                                                 item_prototype_macro_pv,
                                                 dynamic_pv_count.keys())

        for size, count in dynamic_pv_count.iteritems():
            print
            print "Total Persistent Volume %s count: %s" % (size, count)
            print "Total Persistent Volume available %s count: %s" % (size, dynamic_pv_available[size])
            self.zagg_sender.add_zabbix_keys(
                {"%s[%s]" %(item_prototype_key_count, size): count,
                 "%s[%s]" %(item_prototype_key_available, size): dynamic_pv_available[size]})

    def nodes_not_ready(self):
        """ check the number of nodes in the cluster that are not ready"""
        print "\nPerforming nodes not ready check..."
        response = self.ora.get('/api/v1/nodes')

        nodes_not_schedulable = []
        for n in response['items']:
            if "unschedulable" in n['spec']:
                nodes_not_schedulable.append(n)

        nodes_not_ready = []
        for n in response['items']:
            has_ready_status = False
            for cond in n['status']['conditions']:
                if cond['reason'] == "KubeletReady":
                    has_ready_status = True
                    # KubeletReady condition present but not 'True'
                    if cond['status'].lower() != "true":
                        nodes_not_ready.append(n)
            # no KubeletReady condition at all counts as not ready
            if has_ready_status == False:
                nodes_not_ready.append(n)

        print "Count of nodes not schedulable: %s" % len(nodes_not_schedulable)
        print "Count of nodes not ready: %s" % len(nodes_not_ready)

        self.zagg_sender.add_zabbix_keys(
            {'openshift.master.nodesnotready.count': len(nodes_not_ready)})

        self.zagg_sender.add_zabbix_keys(
            {'openshift.master.nodesnotschedulable.count': len(nodes_not_schedulable)})
def run(self): """ Main function to run the check """ self.parse_args() self.metric_sender = MetricSender(verbose=self.args.verbose, debug=self.args.debug) if self.args.local: self.ora = OpenshiftRestApi() self.args.api_ping = True self.args.healthz = True self.zabbix_api_key = 'openshift.master.local.api.ping' self.zabbix_healthz_key = 'openshift.master.local.api.healthz' else: master_cfg_from_yaml = [] with open('/etc/origin/master/master-config.yaml', 'r') as yml: master_cfg_from_yaml = yaml.load(yml) self.ora = OpenshiftRestApi(host=master_cfg_from_yaml['oauthConfig']['masterURL'], verify_ssl=True) self.zabbix_api_key = 'openshift.master.api.ping' self.zabbix_healthz_key = 'openshift.master.api.healthz' try: if self.args.healthz or self.args.all_checks: self.healthz_check() except Exception as ex: print "Problem performing healthz check: %s " % ex.message self.metric_sender.add_metric({self.zabbix_healthz_key: 'false'}) try: if self.args.api_ping or self.args.all_checks: self.api_ping() if self.args.project_count or self.args.all_checks: self.project_count() if self.args.pod_count or self.args.all_checks: self.pod_count() if self.args.user_count or self.args.all_checks: self.user_count() if self.args.pv_info or self.args.all_checks: self.pv_info() if self.args.node_checks or self.args.all_checks: self.nodes_not_schedulable() self.nodes_not_ready() self.nodes_not_labeled() except Exception as ex: print "Problem Openshift API checks: %s " % ex.message self.metric_sender.add_metric({self.zabbix_api_key: 0}) # Openshift API is down try: if self.args.metrics or self.args.all_checks: self.metric_check() except Exception as ex: print "Problem getting Openshift metrics at /metrics: %s " % ex.message self.metric_sender.add_metric({'openshift.master.metric.ping' : 0}) # Openshift Metrics are down self.metric_sender.send_metrics()
def run(self): """ Main function to run the check """ self.parse_args() self.zagg_sender = ZaggSender(verbose=self.args.verbose, debug=self.args.debug) if self.args.local: self.ora = OpenshiftRestApi() self.args.api_ping = True self.args.healthz = True self.zabbix_api_key = 'openshift.master.local.api.ping' self.zabbix_healthz_key = 'openshift.master.local.api.healthz' else: master_cfg_from_yaml = [] with open('/etc/origin/master/master-config.yaml', 'r') as yml: master_cfg_from_yaml = yaml.load(yml) self.ora = OpenshiftRestApi(host=master_cfg_from_yaml['oauthConfig']['masterURL'], verify_ssl=True) self.zabbix_api_key = 'openshift.master.api.ping' self.zabbix_healthz_key = 'openshift.master.api.healthz' try: if self.args.healthz or self.args.all_checks: self.healthz_check() except Exception as ex: print "Problem performing healthz check: %s " % ex.message self.zagg_sender.add_zabbix_keys({self.zabbix_healthz_key: 'false'}) try: if self.args.api_ping or self.args.all_checks: self.api_ping() if self.args.project_count or self.args.all_checks: self.project_count() if self.args.pod_count or self.args.all_checks: self.pod_count() if self.args.user_count or self.args.all_checks: self.user_count() if self.args.pv_info or self.args.all_checks: self.pv_info() if self.args.nodes_not_ready or self.args.all_checks: self.nodes_not_ready() except Exception as ex: print "Problem Openshift API checks: %s " % ex.message self.zagg_sender.add_zabbix_keys({self.zabbix_api_key: 0}) # Openshift API is down try: if self.args.metrics or self.args.all_checks: self.metric_check() except Exception as ex: print "Problem getting Openshift metrics at /metrics: %s " % ex.message self.zagg_sender.add_zabbix_keys({'openshift.master.metric.ping' : 0}) # Openshift Metrics are down self.zagg_sender.send_metrics()
def __init__(self): self.args = None self.zagg_sender = None self.ora = OpenshiftRestApi()
class OpenshiftMasterZaggClient(object):
    """ Checks for the Openshift Master

    Runs a configurable set of health and capacity checks against the
    master API and reports the results to zabbix via ZaggSender.
    """

    def __init__(self):
        # args/zagg_sender are populated in run(); ora is the REST client
        # for the local master API.
        self.args = None
        self.zagg_sender = None
        self.ora = OpenshiftRestApi()

    def run(self):
        """ Main function to run the check

        Each group of checks gets its own try block so one failing group
        still lets the later groups run; failures are pushed to zabbix as
        explicit down/false values instead of silence.
        """
        self.parse_args()
        self.zagg_sender = ZaggSender(verbose=self.args.verbose, debug=self.args.debug)

        try:
            if self.args.healthz or self.args.all_checks:
                self.healthz_check()
        except Exception as ex:
            print "Problem performing healthz check: %s " % ex.message
            self.zagg_sender.add_zabbix_keys({'openshift.master.api.healthz' : 'false'})

        try:
            if self.args.api_ping or self.args.all_checks:
                self.api_ping()

            if self.args.project_count or self.args.all_checks:
                self.project_count()

            if self.args.pod_count or self.args.all_checks:
                self.pod_count()

            if self.args.user_count or self.args.all_checks:
                self.user_count()

        except Exception as ex:
            # Any API check failing marks the whole API as down.
            print "Problem Openshift API checks: %s " % ex.message
            self.zagg_sender.add_zabbix_keys({'openshift.master.api.ping' : 0})  # Openshift API is down

        try:
            if self.args.metrics or self.args.all_checks:
                self.metric_check()

        except Exception as ex:
            print "Problem getting Openshift metrics at /metrics: %s " % ex.message
            self.zagg_sender.add_zabbix_keys({'openshift.master.metric.ping' : 0})  # Openshift Metrics are down

        self.zagg_sender.send_metrics()

    def parse_args(self):
        """ parse the args from the cli """
        parser = argparse.ArgumentParser(description='Network metric sender')
        parser.add_argument('-v', '--verbose', action='store_true', default=None, help='Verbose?')
        parser.add_argument('--debug', action='store_true', default=None, help='Debug?')

        master_check_group = parser.add_argument_group('Different Checks to Perform')
        master_check_group.add_argument('--all-checks', action='store_true', default=None,
                                        help='Do all of the checks')
        master_check_group.add_argument('--api-ping', action='store_true', default=None,
                                        help='Verify the Openshift API is alive')
        master_check_group.add_argument('--healthz', action='store_true', default=None,
                                        help='Query the Openshift Master API /healthz')
        master_check_group.add_argument('--metrics', action='store_true', default=None,
                                        help='Query the Openshift Master Metrics at /metrics')
        master_check_group.add_argument('--project-count', action='store_true', default=None,
                                        help='Query the Openshift Master for Number of Pods')
        master_check_group.add_argument('--pod-count', action='store_true', default=None,
                                        help='Query the Openshift Master for Number of Running Pods')
        master_check_group.add_argument('--user-count', action='store_true', default=None,
                                        help='Query the Openshift Master for Number of Users')
        self.args = parser.parse_args()

    def api_ping(self):
        """ Verify the Openshift API health is responding correctly

        Also reports the cluster node count as a side effect of the probe.
        """
        print "\nPerforming Openshift API ping check..."
        response = self.ora.get('/api/v1/nodes')
        print "\nOpenshift API ping is alive"
        print "Number of nodes in the Openshift cluster: %s" % len(response['items'])
        self.zagg_sender.add_zabbix_keys({'openshift.master.api.ping' : 1,
                                          'openshift.master.node.count': len(response['items'])})

    def healthz_check(self):
        """ check the /healthz API call """
        print "\nPerforming /healthz check..."
        response = self.ora.get('/healthz', rtype='text')
        print "healthz check returns: %s " %response
        # /healthz returns plain text; report "true"/"false" based on
        # whether the substring "ok" appears anywhere in the response.
        self.zagg_sender.add_zabbix_keys({'openshift.master.api.healthz' : str('ok' in response).lower()})

    def metric_check(self):
        """ collect certain metrics from the /metrics API call

        Parses the prometheus text exposition format; each sample is a
        (name, labels-dict, value) tuple.
        """
        print "\nPerforming /metrics check..."
        response = self.ora.get('/metrics', rtype='text')

        for metric_type in text_string_to_metric_families(response):

            # Collect the apiserver_request_latencies_summary{resource="pods",verb="LIST",quantiles in /metrics
            # Collect the apiserver_request_latencies_summary{resource="pods",verb="WATCHLIST",quantiles in /metrics
            if metric_type.name == 'apiserver_request_latencies_summary':
                key_str = 'openshift.master.apiserver.latency.summary'
                for sample in metric_type.samples:
                    # 'LIST' in verb matches both LIST and WATCHLIST
                    if (sample[1]['resource'] == 'pods' and
                            sample[1].has_key('quantile') and
                            'LIST' in sample[1]['verb']):
                        # quantile "0.9" -> key suffix "9" (fractional digits only)
                        curr_key_str = key_str + ".pods.quantile.%s.%s" % (sample[1]['verb'],
                                                                           sample[1]['quantile'].split('.')[1])
                        # NaN appears when no observations exist yet; report 0
                        if math.isnan(sample[2]):
                            value = 0
                        else:
                            value = sample[2]
                        # /1000: presumably microseconds -> milliseconds; confirm units
                        self.zagg_sender.add_zabbix_keys({curr_key_str.lower(): int(value/1000)})

            # Collect the scheduler_e2e_scheduling_latency_microseconds{quantiles in /metrics
            if metric_type.name == 'scheduler_e2e_scheduling_latency_microseconds':
                for sample in metric_type.samples:
                    if sample[1].has_key('quantile'):
                        key_str = 'openshift.master.scheduler.e2e.scheduling.latency'
                        curr_key_str = key_str + ".quantile.%s" % (sample[1]['quantile'].split('.')[1])
                        if math.isnan(sample[2]):
                            value = 0
                        else:
                            value = sample[2]
                        # microseconds -> milliseconds (per the metric name)
                        self.zagg_sender.add_zabbix_keys({curr_key_str.lower(): int(value/1000)})

        # Endpoint responded and parsed: report the metrics ping as up.
        self.zagg_sender.add_zabbix_keys({'openshift.master.metric.ping' : 1})

    def project_count(self):
        """ check the number of projects in Openshift

        System/infrastructure projects are excluded from the count.
        """
        print "\nPerforming project count check..."

        excluded_names = ['openshift', 'openshift-infra', 'default', 'ops-monitor']
        response = self.ora.get('/oapi/v1/projects')

        project_names = [project['metadata']['name'] for project in response['items']]
        valid_names = set(project_names) - set(excluded_names)

        print "Project count: %s" % len(valid_names)

        self.zagg_sender.add_zabbix_keys({'openshift.project.count' : len(valid_names)})

    def pod_count(self):
        """ check the number of pods in Openshift

        Reports total pods, running pods, and running pods scheduled onto
        compute (non-infra) nodes.
        """
        print "\nPerforming pod count check..."

        response = self.ora.get('/api/v1/pods')

        # Get running pod count
        # NOTE(review): only the first container's state is inspected --
        # multi-container pods are judged by containerStatuses[0] alone.
        running_pod_count = 0
        for i in response['items']:
            if 'containerStatuses' in i['status']:
                if 'running' in i['status']['containerStatuses'][0]['state']:
                    running_pod_count += 1

        # Get running pod count on compute only nodes (non-infra)
        running_user_pod_count = 0
        for i in response['items']:
            if 'containerStatuses' in i['status']:
                if 'running' in i['status']['containerStatuses'][0]['state']:
                    if 'nodeSelector' in i['spec']:
                        if i['spec']['nodeSelector']['type'] == 'compute':
                            running_user_pod_count += 1

        print "Total pod count: %s" % len(response['items'])
        print "Running pod count: %s" % running_pod_count
        print "User Running pod count: %s" % running_user_pod_count

        self.zagg_sender.add_zabbix_keys({'openshift.master.pod.running.count' : running_pod_count,
                                          'openshift.master.pod.user.running.count' : running_user_pod_count,
                                          'openshift.master.pod.total.count' : len(response['items'])})

    def user_count(self):
        """ check the number of users in Openshift """
        print "\nPerforming user count check..."

        response = self.ora.get('/oapi/v1/users')

        print "Total user count: %s" % len(response['items'])
        self.zagg_sender.add_zabbix_keys({'openshift.master.user.count' : len(response['items'])})
class OpenshiftClusterCapacity(object): ''' Checks for cluster capacity ''' def __init__(self): self.args = None self.zagg_sender = None self.ora = None self.sql_conn = None def run(self): ''' Main function to run the check ''' self.parse_args() self.zagg_sender = ZaggSender(verbose=self.args.verbose, debug=self.args.debug) master_cfg = [] with open(self.args.master_config, 'r') as yml: master_cfg = yaml.load(yml) self.ora = OpenshiftRestApi( host=master_cfg['oauthConfig']['masterURL'], verify_ssl=True) self.node_capacity() if not self.args.dry_run: self.zagg_sender.send_metrics() def parse_args(self): ''' parse the args from the cli ''' parser = argparse.ArgumentParser(description='Cluster capacity sender') parser.add_argument( '--master-config', default='/etc/origin/master/master-config.yaml', help='Location of OpenShift master-config.yml file') parser.add_argument('-v', '--verbose', action='store_true', default=None, help='Verbose?') parser.add_argument('--debug', action='store_true', default=None, help='Debug?') parser.add_argument('--dry-run', action='store_true', default=False, help='Do not sent results to Zabbix') self.args = parser.parse_args() def load_nodes(self): ''' load nodes into SQL ''' self.sql_conn.execute('''CREATE TABLE nodes (name text, type text, api text, max_cpu integer, max_memory integer, max_pods integer)''') response = self.ora.get('/api/v1/nodes') for new_node in response['items']: # Skip nodes not in 'Ready' state node_ready = False for condition in new_node['status']['conditions']: if condition['type'] == 'Ready' and \ condition['status'] == 'True': node_ready = True if not node_ready: continue node = {} node['name'] = new_node['metadata']['name'] node['type'] = new_node['metadata']['labels']['type'] node['api'] = new_node['metadata']['selfLink'] if 'allocatable' in new_node['status']: cpu = new_node['status']['allocatable']['cpu'] mem = new_node['status']['allocatable']['memory'] node['max_pods'] = int( 
new_node['status']['allocatable']['pods']) else: cpu = new_node['status']['capacity']['cpu'] mem = new_node['status']['capacity']['memory'] node['max_pods'] = int(new_node['status']['capacity']['pods']) node['max_cpu'] = to_milicores(cpu) node['max_memory'] = to_bytes(mem) if self.args.debug: print "Adding node: {}".format(str(node)) self.sql_conn.execute( 'INSERT INTO nodes VALUES (?,?,?,?,?,?)', (node['name'], node['type'], node['api'], node['max_cpu'], node['max_memory'], node['max_pods'])) @staticmethod def load_container_limits(pod, containers): ''' process/store container limits data ''' for container in containers: if 'limits' in container['resources']: cpu = container['resources']['limits'].get('cpu') if cpu: pod['cpu_limits'] = pod.get('cpu_limits', 0) + \ to_milicores(cpu) mem = container['resources']['limits'].get('memory') if mem: pod['memory_limits'] = pod.get('memory_limits', 0) + \ to_bytes(mem) if 'requests' in container['resources']: cpu = container['resources']['requests'].get('cpu') if cpu: pod['cpu_requests'] = pod.get('cpu_requests', 0) + \ to_milicores(cpu) mem = container['resources']['requests'].get('memory') if mem: pod['memory_requests'] = pod.get('memory_requests', 0) + \ to_bytes(mem) def load_pods(self): ''' put pod details into db ''' self.sql_conn.execute('''CREATE TABLE pods (name text, namespace text, api text, cpu_limits integer, cpu_requets integer, memory_limits integer, memory_requests integer, node text)''') response = self.ora.get('/api/v1/pods') for new_pod in response['items']: if new_pod['status']['phase'] != 'Running': continue pod = {} pod['name'] = new_pod['metadata']['name'] pod['namespace'] = new_pod['metadata']['namespace'] pod['api'] = new_pod['metadata']['selfLink'] pod['node'] = new_pod['spec']['nodeName'] self.load_container_limits(pod, new_pod['spec']['containers']) self.sql_conn.execute( 'INSERT INTO pods VALUES (?,?,?,?,?,?,?,?)', (pod['name'], pod['namespace'], pod['api'], pod.get('cpu_limits'), 
pod.get('cpu_requests'), pod.get('memory_limits'), pod.get('memory_requests'), pod['node'])) def get_memory_percentage(self): ''' calculate pod memory limits as a percentage of cluster (compute-node) memory capacity ''' node_mem = 0 pod_mem = 0 for row in self.sql_conn.execute('''SELECT SUM(nodes.max_memory) FROM nodes WHERE nodes.type="compute"'''): node_mem = row[0] for row in self.sql_conn.execute('''SELECT SUM(pods.memory_limits) FROM pods, nodes WHERE pods.node=nodes.name AND nodes.type="compute"'''): pod_mem = row[0] return float(100) * pod_mem / node_mem def get_largest_pod(self): ''' return memory limit for largest pod ''' max_pod = 0 for row in self.sql_conn.execute('''SELECT MAX(memory_limits) FROM pods, nodes WHERE pods.node=nodes.name AND nodes.type="compute"'''): max_pod = row[0] return max_pod def how_many_schedulable(self, size): ''' return how many pods with memory 'size' can be scheduled ''' nodes = {} # get max mem for each compute node for row in self.sql_conn.execute('''SELECT nodes.name, nodes.max_memory FROM nodes WHERE nodes.type="compute"'''): nodes[row[0]] = {'max_memory': row[1]} # get memory allocated/granted for each compute node for row in self.sql_conn.execute('''SELECT nodes.name, SUM(pods.memory_limits) FROM pods, nodes WHERE pods.node=nodes.name AND nodes.type="compute" GROUP BY nodes.name'''): nodes[row[0]]['memory_allocated'] = row[1] schedulable = 0 for node in nodes.keys(): available = nodes[node]['max_memory'] - \ nodes[node]['memory_allocated'] num = available / size # ignore negative number (overcommitted nodes) if num > 0: schedulable += num return schedulable def node_capacity(self): ''' check capacity of compute nodes ''' zbx_key_mem_alloc = "openshift.master.cluster.memory_allocation" zbx_key_max_pods = "openshift.master.cluster.max_mem_pods_schedulable" self.sql_conn = sqlite3.connect(':memory:') self.load_nodes() self.load_pods() memory_percentage = self.get_memory_percentage() largest = self.get_largest_pod() if 
self.args.debug: print "Largest memory pod: {}".format(largest) schedulable = self.how_many_schedulable(largest) print "Percentage of memory allocated: {}".format(memory_percentage) print "Number of max-size nodes schedulable: {}".format(schedulable) self.zagg_sender.add_zabbix_keys( {zbx_key_mem_alloc: int(memory_percentage)}) self.zagg_sender.add_zabbix_keys({zbx_key_max_pods: schedulable})
class OpenshiftPodChecker(object): """ Checks for Openshift Pods """ def __init__(self): self.args = None self.ora = None self.metric_sender = None def run(self): """ Main function to run the check """ self.parse_args() self.ora = OpenshiftRestApi() self.metric_sender = MetricSender(verbose=self.args.verbose, debug=self.args.debug) try: self.get_pods() except Exception as ex: print "Problem retreiving pod data: %s " % ex.message self.metric_sender.send_metrics() def get_pods(self): """ Gets pod data """ print "\nPerforming pod check ...\n" api_url = '/api/v1/pods' if (str(self.args.namespace) != "None") & \ (str(self.args.namespace) != "all"): api_url = '/api/v1/namespaces/{}/pods'.format(self.args.namespace) api_yaml = self.ora.get(api_url, rtype='text') pods = yaml.safe_load(api_yaml) pod_count = 0 for pod in pods["items"]: if self.args.pod and \ self.args.pod in pod["metadata"]["name"]: print "status of {} is {}".format( pod["metadata"]["name"], pod["status"]["phase"], ) if pod["status"]["phase"] == "Running": pod_count += 1 else: pass self.metric_sender.add_metric( {"service.pod.{}.count".format(self.args.pod): pod_count}) def parse_args(self): """ parse the args from the cli """ parser = argparse.ArgumentParser(description='Openshift pod sender') parser.add_argument('-p', '--pod', default=None, help='Check for pod with this specific name') parser.add_argument( '-n', '--namespace', default=None, help='Check for pods in this namespace - "all" for all') parser.add_argument('-v', '--verbose', action='store_true', default=None, help='Verbose?') parser.add_argument('--debug', action='store_true', default=None, help='Debug?') self.args = parser.parse_args()