def __init__(self, args=None):
    """Store the CLI args and build an oc client for the requested namespace."""
    self.kubeconfig = '/tmp/admin.kubeconfig'
    self.args = args
    self.oc = OCUtil(namespace=self.args.namespace,
                     config_file=self.kubeconfig)
def __init__(self):
    """Parse CLI args, honor any requested AWS profile, and build helpers."""
    self.args = None
    self.vol_state_data = None
    self.parse_args()

    # Honor an explicitly requested AWS credentials profile.
    if self.args.aws_creds_profile:
        os.environ['AWS_PROFILE'] = self.args.aws_creds_profile

    verbose = self.args.verbose
    self.eu = EbsUtil(self.args.region, verbose=verbose)
    self.ocutil = OCUtil(verbose=verbose)
    self.mts = MetricSender(verbose=verbose)
def run(self):
    """Main function to run the check.

    Builds the oc client and zagg sender, queues router health and
    replica-count metrics, then either prints them (dry run) or sends them.
    """
    # Built here rather than in __init__ so self.kubeconfig/self.args are
    # already populated by earlier setup calls.
    self.ocutil = OCUtil(config_file=self.kubeconfig, verbose=self.args.verbose)
    self.zgs = ZaggSender(verbose=self.args.verbose, debug=self.args.debug)
    # Queue up metrics for every router pod.
    self.check_all_router_health()
    self.check_router_replica_count()
    if self.args.dry_run:
        # Dry run: show what would have been sent instead of reporting.
        self.zgs.print_unique_metrics_key_value()
    else:
        self.zgs.send_metrics()
def main(): ''' Gather and send details on all visible S3 buckets ''' logger.info("start") discovery_key = "disc.aws" discovery_macro = "#S3_BUCKET" prototype_s3_size = "disc.aws.size" prototype_s3_count = "disc.aws.objects" args = parse_args() if args.verbose: logger.setLevel(logging.DEBUG) logger.debug("verbose flag set") ocutil = OCUtil() dc_yaml = ocutil.get_dc('docker-registry') registry_config_secret = get_registry_config_secret(dc_yaml) oc_yaml = ocutil.get_secrets(registry_config_secret) aws_access, aws_secret = get_aws_creds(oc_yaml) awsutil = AWSUtil(aws_access, aws_secret, args.debug) bucket_list = awsutil.get_bucket_list(args.debug) bucket_stats = {} for bucket in bucket_list: s3_size, s3_objects = awsutil.get_bucket_info(bucket, args.debug) bucket_stats[bucket] = {"size": s3_size, "objects": s3_objects} if args.debug: print "Bucket stats: " + str(bucket_stats) if args.test: print "Test-only. Received results: " + str(bucket_stats) else: zgs = ZaggSender(verbose=args.debug) zgs.add_zabbix_dynamic_item(discovery_key, discovery_macro, bucket_list) for bucket in bucket_stats.keys(): zab_key = "{}[{}]".format(prototype_s3_size, bucket) zgs.add_zabbix_keys( {zab_key: int(round(bucket_stats[bucket]["size"]))}) zab_key = "{}[{}]".format(prototype_s3_count, bucket) zgs.add_zabbix_keys({zab_key: bucket_stats[bucket]["objects"]}) zgs.send_metrics()
def get_registry_config_secret(yaml_results): ''' Find the docker registry config secret ''' ocutil = OCUtil() volumes = yaml_results['spec']['template']['spec']['volumes'] for volume in volumes: if 'emptyDir' in volume: continue secret_dict = ocutil.get_secrets(volume['secret']['secretName']) if 'config.yml' in secret_dict['data']: return volume['secret']['secretName'] print "Unable to find the %s the docker registry config" print "Please run \"oc get dc docker-registry\" to investigate" sys.exit(1)
def run(self):
    ''' Main function that runs the check.

    Checks the metrics pods and node metrics, retries the node-metrics
    check once after a delay if it fails, and reports results to Zabbix.
    '''
    self.parse_args()
    self.metric_sender = MetricSender(verbose=self.args.verbose, debug=self.args.debug)
    # Metrics components live in the openshift-infra namespace.
    self.oc = OCUtil(namespace='openshift-infra', config_file='/tmp/admin.kubeconfig', verbose=self.args.verbose)
    pod_report = self.check_pods()
    self.get_hawkular_creds()
    metrics_report = self.check_node_metrics()
    # If metrics_report['success'] == 0 the first check failed; retry once
    # after a delay to filter out transient failures.
    if metrics_report['success'] == 0:
        logger.info(
            "The first time metrics check failed, 5 seconds later will start a second time check"
        )
        # NOTE(review): the log text says 5 seconds but the actual delay is
        # commandDelay (defined elsewhere in this file) -- confirm they agree.
        time.sleep(commandDelay)
        logger.info("starting the second time metrics check")
        metrics_report = self.check_node_metrics()
        # Persist details only when the second attempt also fails.
        if metrics_report['success'] == 0:
            self.persist_details(metrics_report)
    self.report_to_zabbix(pod_report, metrics_report['success'])
def get_logging_namespace(self):
    """Determine which logging namespace is in use."""
    # Probe the newer 'openshift-logging' namespace first and fall back to
    # the legacy 'logging' namespace if the kibana DC lookup fails there.
    client = OCUtil(namespace='openshift-logging',
                    config_file='/tmp/admin.kubeconfig',
                    verbose=self.args.verbose)
    logger.info("Determining which namespace is in use...")
    try:
        client.get_dc('logging-kibana')
    except subprocess.CalledProcessError:
        logger.info("Using namespace: logging")
        return 'logging'
    # Lookup succeeded, so logging is deployed in this namespace.
    logger.info("Using namespace: openshift-logging")
    return 'openshift-logging'
def main(): ''' Gather and send details on all visible S3 buckets ''' discovery_key = "disc.gcp" discovery_macro = "#GCS_BUCKET" prototype_bucket_size = "disc.gcp.size" prototype_bucket_count = "disc.gcp.objects" args = parse_args() ocutil = OCUtil() dc_yaml = ocutil.get_dc('docker-registry') registry_config_secret = get_registry_config_secret(dc_yaml) oc_yaml = ocutil.get_secrets(registry_config_secret) bucket = get_gcp_info(oc_yaml) gsutil = GcloudUtil(verbose=args.debug) bucket_list = gsutil.get_bucket_list() bucket_stats = {} for bucket in bucket_list: size, objects = gsutil.get_bucket_info(bucket) bucket_stats[bucket] = {"size": size, "objects": objects} if args.debug: print "Bucket stats: " + str(bucket_stats) if args.test: print "Test-only. Received results: " + str(bucket_stats) else: zgs = ZaggSender(verbose=args.debug) zgs.add_zabbix_dynamic_item(discovery_key, discovery_macro, bucket_list) for bucket in bucket_stats.keys(): zab_key = "{}[{}]".format(prototype_bucket_size, bucket) zgs.add_zabbix_keys({zab_key: int(round(bucket_stats[bucket]["size"]))}) zab_key = "{}[{}]".format(prototype_bucket_count, bucket) zgs.add_zabbix_keys({zab_key: bucket_stats[bucket]["objects"]}) zgs.send_metrics()
def run(self):
    ''' Main function that runs the check.

    Gathers the metrics pod report and node-metrics report and sends both
    to Zabbix.
    '''
    self.parse_args()
    self.zagg_sender = ZaggSender(verbose=self.args.verbose, debug=self.args.debug)
    # Metrics components live in the openshift-infra namespace.
    self.oc = OCUtil(namespace='openshift-infra', config_file=self.kubeconfig, verbose=self.args.verbose)
    pod_report = self.check_pods()
    metrics_report = self.check_node_metrics()
    self.report_to_zabbix(pod_report, metrics_report)
def run(self): """ Main function to run the check """ self.parse_args() self.get_kubeconfig() ocutil = OCUtil(config_file=self.kubeconfig, verbose=self.args.verbose) self.zagg_sender = ZaggSender(verbose=self.args.verbose, debug=self.args.debug) try: oc_yaml = ocutil.get_service('docker-registry') self.get_registry_service(oc_yaml) oc_yaml = ocutil.get_endpoint('docker-registry') self.get_registry_endpoints(oc_yaml) except Exception as ex: print "Problem retreiving registry IPs: %s " % ex.message self.registry_service_check() self.registry_health_check() self.zagg_sender.send_metrics()
def run(self): """ Main function to run the check """ self.parse_args() self.get_kubeconfig() ocutil = OCUtil(config_file=self.kubeconfig, verbose=self.args.verbose) self.metric_sender = MetricSender(verbose=self.args.verbose, debug=self.args.debug) try: oc_yaml = ocutil.get_service('docker-registry') self.get_registry_service(oc_yaml) oc_yaml = ocutil.get_endpoint('docker-registry') self.get_registry_endpoints(oc_yaml) except Exception as ex: print "Problem retreiving registry IPs: %s " % ex.message self.registry_service_check() self.registry_health_check() self.metric_sender.send_metrics()
def run(self):
    """ Main function that runs the check.

    Resolves the logging namespace, collects the fluentd pods, finds the
    oldest fluentd buffer, and reports its age as a metric.
    """
    self.parse_args()
    self.metric_sender = MetricSender(verbose=self.args.verbose, debug=self.args.debug)
    # The namespace differs between cluster versions, so resolve it dynamically.
    self.oc = OCUtil(namespace=self.get_logging_namespace(), config_file='/tmp/admin.kubeconfig', verbose=self.args.verbose)
    self.get_pods()
    oldest_buffer = self.check_fluentd_queues()
    self.send_metrics(oldest_buffer)
def main(): """ Gather and send details on all visible S3 buckets """ discovery_key = "disc.aws" discovery_macro = "#S3_BUCKET" prototype_s3_size = "disc.aws.size" prototype_s3_count = "disc.aws.objects" args = parse_args() ocutil = OCUtil() oc_yaml = ocutil.get_secrets("dockerregistry") aws_access, aws_secret = get_aws_creds(oc_yaml) awsutil = AWSUtil(aws_access, aws_secret, args.debug) bucket_list = awsutil.get_bucket_list(args.debug) bucket_stats = {} for bucket in bucket_list: s3_size, s3_objects = awsutil.get_bucket_info(bucket, args.debug) bucket_stats[bucket] = {"size": s3_size, "objects": s3_objects} if args.debug: print "Bucket stats: " + str(bucket_stats) if args.test: print "Test-only. Received results: " + str(bucket_stats) else: zgs = ZaggSender(verbose=args.debug) zgs.add_zabbix_dynamic_item(discovery_key, discovery_macro, bucket_list) for bucket in bucket_stats.keys(): zab_key = "{}[{}]".format(prototype_s3_size, bucket) zgs.add_zabbix_keys({zab_key: int(round(bucket_stats[bucket]["size"]))}) zab_key = "{}[{}]".format(prototype_s3_count, bucket) zgs.add_zabbix_keys({zab_key: bucket_stats[bucket]["objects"]}) zgs.send_metrics()
class ZabbixInfo(object):
    ''' Compare the zabbix host inventory against the nodes actually in the cluster. '''

    def __init__(self, args=None,):
        ''' Store the CLI args and build an oc client for the requested namespace. '''
        self.args = args
        self.kubeconfig = '/tmp/admin.kubeconfig'
        self.oc = OCUtil(namespace=self.args.namespace, config_file=self.kubeconfig)

    def check_all_hosts(self, zabbix_data_sync_inventory_hosts, clusterid):
        ''' Check that every cluster node is present in the zabbix inventory.

        :param zabbix_data_sync_inventory_hosts: list of zabbix host dicts
                                                 (each carrying a 'name' key)
        :param clusterid: cluster id prefix used to build expected host names
        :return: 1 if all nodes are in zabbix and the node count matches the
                 desired size, 0 if any node is missing from zabbix, 2 if all
                 nodes are monitored but the count differs from the desired size
        '''
        result = 1
        zabbix_data_sync_inventory_hosts_names = []
        for host in zabbix_data_sync_inventory_hosts:
            zabbix_data_sync_inventory_hosts_names.append(host['name'])
        # NOTE(review): cluster_desired_*_size are module-level globals defined
        # elsewhere in this file -- confirm they are populated before this runs.
        desire_number_cluster = cluster_desired_compute_size + cluster_desired_infra_size + cluster_desired_master_size
        logging.getLogger().info("the requested number of instance is :" + str(desire_number_cluster))
        hosts = self.oc.get_nodes()
        for host in hosts['items']:
            # Build the zabbix-side name for this node: masters are registered
            # by plain hostname, infra/compute nodes get a
            # "<clusterid>-<type>-" prefix.
            hostnameincluster = ""
            if host['metadata']['labels']['type'] == 'master':
                hostnameincluster = host['metadata']['labels']['hostname']
            elif host['metadata']['labels']['type'] == 'infra':
                hostnameincluster = clusterid + "-infra-" + host['metadata']['labels']['kubernetes.io/hostname']
            else:
                hostnameincluster = clusterid + "-compute-" + host['metadata']['labels']['kubernetes.io/hostname']
            if hostnameincluster in zabbix_data_sync_inventory_hosts_names:
                logging.getLogger().info("found host in zabbix :" + str(hostnameincluster))
            else:
                result = 0
                logging.getLogger().info("host not in zabbix:" + str(hostnameincluster))
        if result == 1:
            if len(hosts['items']) == desire_number_cluster:
                # Fixed typo in the original log message: "currrently".
                logging.getLogger().info("currently cluster have :" + str(len(hosts['items'])))
                logging.getLogger().info("all the node under monitoring and the number is the same as requested:" + str(desire_number_cluster))
            else:
                result = 2
                logging.getLogger().info("cluster node number is different with requested")
        return result

    def send_metrics(self, status):
        """ Report the inventory status code to zabbix. """
        ms = MetricSender(verbose=self.args.verbose)
        ms.add_metric({'openshift.master.zabbix.inventory.status': status})
        ms.send_metrics()
def main(): ''' Gather and send details on all visible S3 buckets ''' #get the region with open('/container_setup/monitoring-config.yml', 'r') as f: doc = yaml.load(f) bucket_region = doc['oso_region'] args = parse_args() ocutil = OCUtil() dc_yaml = ocutil.get_dc('docker-registry') registry_config_secret = get_registry_config_secret(dc_yaml) oc_yaml = ocutil.get_secrets(registry_config_secret) aws_access, aws_secret = get_aws_creds(oc_yaml) awsutil = AWSUtil(aws_access, aws_secret, args.debug) bucket_list = awsutil.get_bucket_list(verbose=args.debug, BucketRegion=bucket_region) bucket_stats = {} for bucket in bucket_list: #print bucket s3_size, s3_objects = awsutil.get_bucket_info( bucket, verbose=args.debug, BucketRegion=bucket_region) bucket_stats[bucket] = {"size": s3_size, "objects": s3_objects} if args.debug: print "Bucket stats: " + str(bucket_stats) if args.test: print "Test-only. Received results: " + str(bucket_stats) else: send_zagg_data(bucket_list, bucket_stats, args)
def run(self):
    ''' Main function that runs the check.

    Collects the metrics pod report and node-metrics report and sends
    both to Zabbix.
    '''
    self.parse_args()
    self.metric_sender = MetricSender(verbose=self.args.verbose, debug=self.args.debug)
    # Metrics components live in the openshift-infra namespace.
    self.oc = OCUtil(namespace='openshift-infra', config_file='/tmp/admin.kubeconfig', verbose=self.args.verbose)
    pod_report = self.check_pods()
    # Hawkular credentials are needed before querying node metrics.
    self.get_hawkular_creds()
    metrics_report = self.check_node_metrics()
    self.report_to_zabbix(pod_report, metrics_report)
def main():
    """Entry point: report the age of the oldest Terminating project."""
    args = parse_args()
    if args.verbose:
        logger.setLevel(logging.DEBUG)

    logger.info("Starting")

    # TODO: include this in library
    oldest_seconds = testProjects(
        OCUtil().get_projects()['items'],
        current_time=datetime.datetime.now(),
    )
    send_zagg_data(oldest_seconds)
    logger.info('Oldest Terminating project: %s seconds', oldest_seconds)
def main():
    """Entry point: report the age of the oldest Terminating project."""
    args = parse_args()
    if args.verbose:
        logger.setLevel(logging.DEBUG)

    logger.info("Starting")

    # TODO: include this in library
    # NOTE(review): this shells out through the private _run_cmd helper;
    # a public get_projects() wrapper may exist on OCUtil -- worth confirming.
    projects_info = OCUtil()._run_cmd("oc get projects -o yaml")
    oldest_seconds = testProjects(
        projects_info['items'],
        current_time=datetime.datetime.now(),
    )
    send_zagg_data(oldest_seconds)
    logger.info('Oldest Terminating project: %s seconds', oldest_seconds)
class InfraNodePodStatus(object): ''' This is a check for making sure the internal pods like router and registry running and located on different infra nodes ''' def __init__(self): '''initial for the InfraNodePodStatus''' self.kubeconfig = '/tmp/admin.kubeconfig' self.oc = OCUtil(namespace='default', config_file=self.kubeconfig) def check_pods(self): ''' get all the pod information ''' pods = self.oc.get_pods() pod_report = {} for pod in pods['items']: pod_name = pod['metadata']['name'] pod_report[pod_name] = {} pod_report[pod_name]['hostIP'] = pod['status']['hostIP'] pod_report[pod_name]['status'] = pod['status']['phase'] return pod_report @staticmethod def compare_ip(keyword, pod_info_dict): ''' to compare the pod host ip and check the pod status ''' pod_hostip_status = [ pod_info_dict[i] for i in pod_info_dict.keys() if keyword in i ] # pod_status = [pod_info_dict[i] for i in pod_info_dict.keys() if keyword in i] pod_run_num = 0 for i in pod_hostip_status: if i['status'] == "Running": pod_run_num += 1 if len(pod_hostip_status) == 2: if pod_hostip_status[0]['hostIP'] != pod_hostip_status[1]['hostIP']: # print "ok, you do not need do anything for {} pod".format(keyword) result_code = 1 else: # print "there are something wrong, please check the pod" result_code = 0 else: print "plese check the pod" result_code = 0 # result_code 1 means the two pods are on different nodes # pod_run_num means the running pod number return result_code, pod_run_num def run(self): ''' run the command and send the code to zabbix ''' ms = MetricSender() pod_report = self.check_pods() # the check_value is the value to send to zabbix router_check_value = self.compare_ip('router', pod_report) registry_check_value = self.compare_ip('registry', pod_report) print router_check_value, registry_check_value ms.add_metric({'openshift.router.pod.location': router_check_value[0]}) ms.add_metric({'openshift.router.pod.status': router_check_value[1]}) ms.add_metric( {'openshift.registry.pod.location': 
registry_check_value[0]}) ms.add_metric( {'openshift.registry.pod.status': registry_check_value[1]}) ms.send_metrics()
class InfraNodePodStatus(object):
    ''' This is a check for making sure the internal pods like router and
        registry are running and located on different infra nodes '''

    def __init__(self, args=None,):
        ''' Store the CLI args, build the oc client, and pre-fetch all pods. '''
        self.args = args
        self.kubeconfig = '/tmp/admin.kubeconfig'
        self.oc = OCUtil(namespace=self.args.namespace, config_file=self.kubeconfig)
        # Snapshot of all pods is taken once at construction time.
        self.all_pods = self.get_all_pods()

    def get_all_pods(self):
        ''' Return {pod_name: {'hostIP': ..., 'status': ...}} for every pod
            in the namespace. '''
        pods = self.oc.get_pods()
        pod_report = {}
        for pod in pods['items']:
            pod_name = pod['metadata']['name']
            pod_report[pod_name] = {}
            pod_report[pod_name]['hostIP'] = pod['status']['hostIP']
            pod_report[pod_name]['status'] = pod['status']['phase']
        return pod_report

    def get_expected_replicas(self, deploymentconfig):
        ''' Return the replica count defined in the named deploymentconfig. '''
        defined_replicas = self.oc.get_dc(deploymentconfig)['spec']['replicas']
        return defined_replicas

    def get_pods_by_name(self, podname):
        ''' Return the pod dicts whose name starts with "<podname>-". '''
        return [self.all_pods[i] for i in self.all_pods.keys() if i.startswith(podname + '-')]

    def check_pods(self, podname, keybase="", pod_optional=False,):
        ''' Check count, running state and host distribution for a pod group,
            then send the results to zabbix.

        :param podname: deploymentconfig / pod base name to check
        :param keybase: zabbix key prefix for the reported metrics
        :param pod_optional: if True, a missing dc is tolerated and the check
                             returns early without reporting
        '''
        logging.getLogger().info("Finding pods for: %s", podname)
        result_code = 1
        pods = self.get_pods_by_name(podname)
        logging.getLogger().info("Pods Found: %s", len(pods))
        expected_replicas = 0
        try:
            expected_replicas = self.get_expected_replicas(podname)
        except Exception:
            logging.getLogger().warn("dc not found for pod %s", podname)
            if pod_optional:
                logging.getLogger().warn("Some clusters don't have pod %s, please confirm before trying to fix this", podname)
                return  # nothing we should do, so quit early, don't do more checks
        logging.getLogger().info("Expected Replicas: %s", expected_replicas)
        if len(pods) != expected_replicas:
            result_code = 0
            logging.getLogger().critical("Count Pods and Replicas don't match")
        count_pods_running = len([i for i in pods if i['status'] == "Running"])
        logging.getLogger().info("Pods Running: %s", count_pods_running)
        if len(pods) != count_pods_running:
            result_code = 0
            logging.getLogger().critical("Some pods are not in running state")
        # Distribution check: pods must be spread over at least two hosts.
        host_ips = set([x['hostIP'] for x in pods])
        logging.getLogger().info("Hosts found: %d", len(host_ips))
        if len(host_ips) < 2 or len(pods) < 2:
            result_code = 0
            logging.getLogger().critical("%s has %d pods on %d hosts, not distributed", podname, len(pods), len(host_ips))
        if result_code == 0:
            logging.getLogger().critical("Please check pods are in running " "state, and on unique hosts")
            logging.getLogger().critical("oc get pods -n %s -o wide", self.args.namespace)
        # result_code 1 means the pods are on different nodes
        # count_pods_running means the running pod number
        self.send_metrics(keybase=keybase, location=result_code, status=count_pods_running)

    def send_metrics(self, keybase="", location="", status=""):
        ''' Queue the location/status metrics under keybase and send them. '''
        ms = MetricSender(verbose=self.args.verbose)
        ms.add_metric({keybase + '.location': location})
        ms.add_metric({keybase + '.status': status})
        ms.send_metrics()
class OpenshiftRouterChecks(object):
    """Checks for the Openshift Router"""

    def __init__(self):
        ''' Parse CLI args and locate a kubeconfig; clients are built in run(). '''
        self.args = None
        self.metrics = None  # metric sender, created in run()
        self.kubeconfig = None
        self.parse_args()
        self.get_kubeconfig()
        self.ocutil = None

    def get_kubeconfig(self):
        """Find kubeconfig to use for OCUtil"""
        # Default master kubeconfig
        kubeconfig = '/tmp/admin.kubeconfig'
        non_master_kube_dir = '/etc/origin/node'
        if os.path.exists(kubeconfig):
            # If /tmp/admin.kubeconfig exists, use it!
            pass
        elif os.path.isdir(non_master_kube_dir):
            # On non-master hosts, use the last *.kubeconfig found in the
            # node config directory.
            for my_file in os.listdir(non_master_kube_dir):
                if my_file.endswith(".kubeconfig"):
                    kubeconfig = os.path.join(non_master_kube_dir, my_file)
        if self.args.debug:
            print "Using kubeconfig: {}".format(kubeconfig)
        self.kubeconfig = kubeconfig

    def check_all_router_health(self):
        """ Perform defined router health check on all routers """
        discovery_key = "disc.openshift.cluster.router"
        discovery_macro = "#OS_ROUTER"
        router_health_item = "disc.openshift.cluster.router.health"
        router_pods = self.find_router_pods()
        health_report = {}
        for router_name, pod_details in router_pods.iteritems():
            health = self.router_pod_healthy(pod_details)
            if self.args.verbose:
                print "{} healthy: {}\n".format(router_name, health)
            health_report[router_name] = health
        # make dynamic items, and queue up the associated data
        router_names = health_report.keys()
        self.metrics.add_dynamic_metric(discovery_key, discovery_macro, router_names, synthetic=True)
        for router_name, health_status in health_report.iteritems():
            zbx_key = "{}[{}]".format(router_health_item, router_name)
            self.metrics.add_metric({zbx_key: int(health_status)}, synthetic=True)

    def running_pod_count_check(self):
        """ Return {dc_name: bool} -- whether the number of running router
            pods matches the replica count defined in each deployment config. """
        router_pods = self.find_router_pods()
        # get actual running pod count (per DC)
        dc_pod_count = {}
        for _, details in router_pods.iteritems():
            dc_name = details['metadata']['labels']['deploymentconfig']
            dc_pod_count[dc_name] = dc_pod_count.get(dc_name, 0) + 1
        if self.args.debug:
            print "Running pod count: {}".format(dc_pod_count)
        # get expected pod count as defined in each router DC
        expected_pod_count = {}
        for dc_name in dc_pod_count.keys():
            expected_pod_count[dc_name] = self.ocutil.get_dc(dc_name)['spec']['replicas']
        if self.args.debug:
            print "Expected pod count: {}".format(expected_pod_count)
        results = {}
        for dc_name in dc_pod_count.keys():
            results[dc_name] = bool(dc_pod_count[dc_name] == expected_pod_count[dc_name])
        if self.args.verbose or self.args.debug:
            print "DC replica count matching actual counts: {}".format(results)
        return results

    def check_router_replica_count(self):
        """ Check whether the running router replica count is the same as
            what is defined in the deployment config """
        discovery_key = "disc.openshift.cluster.router"
        discovery_macro = "#ROUTER_DC"
        dc_status_item = "disc.openshift.cluster.router.expected_pod_count"
        replica_results = self.running_pod_count_check()
        # make dynamic items, and queue up the associated data
        dc_names = replica_results.keys()
        self.metrics.add_dynamic_metric(discovery_key, discovery_macro, dc_names, synthetic=True)
        for dc_name, replica_status in replica_results.iteritems():
            zbx_key = "{}[{}]".format(dc_status_item, dc_name)
            self.metrics.add_metric({zbx_key: int(replica_status)}, synthetic=True)

    def run(self):
        """Main function to run the check"""
        # Clients are built at run time so parse_args()/get_kubeconfig()
        # have already populated self.args and self.kubeconfig.
        self.ocutil = OCUtil(config_file=self.kubeconfig, verbose=self.args.verbose)
        self.metrics = MetricSender(verbose=self.args.verbose, debug=self.args.debug)
        self.check_all_router_health()
        self.check_router_replica_count()
        if self.args.dry_run:
            # Dry run: show what would have been sent instead of reporting.
            self.metrics.print_unique_metrics_key_value()
        else:
            self.metrics.send_metrics()

    def parse_args(self):
        """ parse the args from the cli """
        parser = argparse.ArgumentParser(description='Openshift Router sender')
        parser.add_argument('-v', '--verbose', action='store_true', default=None, help='Verbose?')
        parser.add_argument('--debug', action='store_true', default=None, help='Debug?')
        parser.add_argument('--dry-run', action='store_true', default=False, help='Collect stats, but no report to zabbix')
        self.args = parser.parse_args()

    @staticmethod
    def get_router_health_url(router):
        """ Build the router healthcheck URL from the pod's liveness probe. """
        podip = router['status']['podIP']
        port = router['spec']['containers'][0]['livenessProbe']['httpGet']['port']
        path = router['spec']['containers'][0]['livenessProbe']['httpGet']['path']
        url = 'http://{}:{}{}'.format(podip, port, path)
        return url

    @staticmethod
    def router_pod_healthy(router):
        """ Ping the health port; return True only on an HTTP 200 response. """
        url = OpenshiftRouterChecks.get_router_health_url(router)
        try:
            result = urllib2.urlopen(url).getcode()
            if result == 200:
                return True
            else:
                return False
        except (urllib2.HTTPError, urllib2.URLError):
            # Unreachable or erroring router counts as unhealthy.
            return False

    def find_router_pods(self):
        """ return dict of PODs running haproxy (the router pods) """
        router_pods = {}
        for pod in self.ocutil.get_pods()['items']:
            try:
                img = pod['status']['containerStatuses'][0]['image']
                if 'ose-haproxy-router' in img:
                    router_pods[pod['metadata']['name']] = pod
            except KeyError:
                # Pods without container statuses yet (e.g. pending) are skipped.
                pass
        return router_pods
# Our jenkins server does not include these rpms. # In the future we might move this to a container where these # libs might exist #pylint: disable=import-error from openshift_tools.monitoring.ocutil import OCUtil from openshift_tools.monitoring.metric_sender import MetricSender import logging logging.basicConfig( format='%(asctime)s - %(relativeCreated)6d - %(levelname)-8s - %(message)s', ) logger = logging.getLogger() logger.setLevel(logging.INFO) ocutil = OCUtil() valid_build_states = ["cancelled", "complete", "new", "error", "failed"] def runOCcmd(cmd, base_cmd='oc'): """ log commands through ocutil """ logger.info(base_cmd + " " + cmd) return ocutil.run_user_cmd( cmd, base_cmd=base_cmd, ) def parse_args(): """ parse the args from the cli """
class EBSStuckVolumesCheck(object):
    """ This class houses a check that looks for EBS volumes that are stuck in a
        transition state (attaching, detaching, busy, etc).

        State is persisted to disk (STATE_DATA_FILE) between runs so a volume's
        time-in-transition can be tracked across invocations.
        The *_KEY / *_STATE / MONITORING_* names used below are module-level
        constants defined elsewhere in this file.
    """

    def __init__(self):
        """ initialize EBSStuckVolumesCheck class """
        self.args = None
        self.vol_state_data = None
        self.parse_args()

        # Make sure we're using the profile they've requested.
        if self.args.aws_creds_profile:
            os.environ['AWS_PROFILE'] = self.args.aws_creds_profile

        self.eu = EbsUtil(self.args.region, verbose=self.args.verbose)
        self.ocutil = OCUtil(verbose=self.args.verbose)
        self.mts = MetricSender(verbose=self.args.verbose)

    def parse_args(self):
        ''' Parse arguments passed to the script '''
        parser = argparse.ArgumentParser(
            description='OpenShift Cluster Metrics Checker')
        parser.add_argument('-v', '--verbose', action='store_true', default=None,
                            help='Verbose output')
        parser.add_argument('--region', required=True,
                            help='AWS EC2 Region to check')
        parser.add_argument('--stuck-after', default=120, type=int,
                            help='Amount of time in seconds after which the volume is ' + \
                                 'determined to be "stuck".')
        parser.add_argument('--aws-creds-profile', required=False,
                            help='The AWS credentials profile to use.')
        self.args = parser.parse_args()

    @staticmethod
    def read_raw_volume_state_data():
        """ Reads in the raw string the volume state data from disk """
        if not os.path.isfile(STATE_DATA_FILE):
            return ""  # Act like the file is blank
        with open(STATE_DATA_FILE, 'r') as stream:
            return stream.read()

    def load_volume_state_data(self):
        """ Loads the volume state data from disk """
        if os.path.isfile(STATE_DATA_FILE):
            with open(STATE_DATA_FILE, 'r') as stream:
                # NOTE(review): yaml.load without an explicit Loader; the file
                # is written by this script, but consider yaml.safe_load.
                self.vol_state_data = yaml.load(stream)
        else:
            # First run (or state file removed): start with empty state.
            self.vol_state_data = {}

    def save_volume_state_data(self):
        """ Saves the volume state data to disk """
        with open(STATE_DATA_FILE, 'w') as outfile:
            yaml.dump(self.vol_state_data, outfile, default_flow_style=False,
                      allow_unicode=True)

    def add_new_transitioning_volumes(self, trans_vols):
        """ Adds volumes that we haven't seen before that are in a
            transitioning state. """
        for vol in trans_vols:
            vol_uri = self.eu.generate_volume_uri(vol)
            if vol_uri not in self.vol_state_data.keys():
                # This is the first time we've seen this volume, add it.
                # NOTE(review): generate_volume_uri is called a second time
                # here; the value is identical to the one above.
                vol_uri = self.eu.generate_volume_uri(vol)
                self.vol_state_data[vol_uri] = {}
                # Deadline after which this volume is considered "stuck".
                self.vol_state_data[vol_uri][STUCK_AFTER_KEY] = datetime.now() + \
                    timedelta(seconds=self.args.stuck_after)
                self.vol_state_data[vol_uri][VOLUME_ID_KEY] = str(vol.id)
                self.vol_state_data[vol_uri][STATE_KEY] = TRANSITION_STATE
                self.vol_state_data[vol_uri][ATTACH_STATUS_KEY] = str(
                    vol.attach_data.status)

    def set_stuck_volumes(self):
        """ Sets volumes to state 'stuck' if they've passed their transition
            state deadline. """
        for item in self.vol_state_data.itervalues():
            # We don't want to set unstuck volumes back to stuck.
            if item[STATE_KEY] != UNSTUCK_STATE:
                if datetime.now() > item[STUCK_AFTER_KEY]:
                    item[STATE_KEY] = STUCK_STATE

    def set_unstuck_volumes(self, trans_vols):
        """ Change volumes that were in state 'stuck' that are no longer in
            transition, to state 'unstuck'. """
        trans_vol_ids = [str(vol.id) for vol in trans_vols]
        for vol_uri, cache_data in self.vol_state_data.iteritems():
            if cache_data[STATE_KEY] == STUCK_STATE and \
               cache_data[VOLUME_ID_KEY] not in trans_vol_ids:
                # This volume was stuck, but isn't any longer
                self.vol_state_data[vol_uri][STATE_KEY] = UNSTUCK_STATE

    def report_stuck_volumes(self):
        """ sends data to monitoring that these volumes are stuck. """
        for vol_uri, cache_data in self.vol_state_data.iteritems():
            if cache_data[STATE_KEY] == STUCK_STATE:
                self.mts.add_dynamic_metric(EBS_VOLUME_URI_DISC_KEY,
                                            EBS_VOLUME_URI_DISC_MACRO,
                                            [vol_uri])
                item_name = '%s[%s]' % (EBS_VOLUME_ATTACH_STATE_KEY, vol_uri)
                self.mts.add_metric({item_name: MONITORING_STUCK_VALUE})
        # Actually send them
        self.mts.send_metrics()

    def report_unstuck_volumes(self):
        """ sends data to monitoring that these volumes have become unstuck. """
        for vol_uri, cache_data in self.vol_state_data.iteritems():
            if cache_data[STATE_KEY] == UNSTUCK_STATE:
                self.mts.add_dynamic_metric(EBS_VOLUME_URI_DISC_KEY,
                                            EBS_VOLUME_URI_DISC_MACRO,
                                            [vol_uri])
                item_name = '%s[%s]' % (EBS_VOLUME_ATTACH_STATE_KEY, vol_uri)
                self.mts.add_metric({item_name: MONITORING_UNSTUCK_VALUE})
        # Actually send them
        self.mts.send_metrics()

    def remove_unstuck_volumes_from_state_data(self):
        """ Removes state 'unstuck' volumes from the state data (no longer
            need to track them) """
        # .keys() gives a list copy in Python 2, so deleting while
        # iterating is safe here.
        for vol_uri in self.vol_state_data.keys():
            cache_data = self.vol_state_data[vol_uri]
            if cache_data[STATE_KEY] == UNSTUCK_STATE:
                # This volume was stuck, but isn't any longer
                del self.vol_state_data[vol_uri]

    def remove_no_longer_transitioning_volumes(self, trans_vols):
        """ Remove volumes that were transitioning, but are no longer in the
            trans_vols list """
        trans_vol_ids = [str(vol.id) for vol in trans_vols]
        # .keys() gives a list copy in Python 2, so deleting while
        # iterating is safe here.
        for vol_uri in self.vol_state_data.keys():
            cache_data = self.vol_state_data[vol_uri]
            if cache_data[STATE_KEY] == TRANSITION_STATE and \
               cache_data[VOLUME_ID_KEY] not in trans_vol_ids:
                # This volume was transitioning, but isn't any longer
                del self.vol_state_data[vol_uri]

    def get_cluster_volumes(self):
        """ Return the cluster's volume list """
        volume_list = self.ocutil.get_pvs()['items']
        just_the_aws_path = [
            x['spec']['awsElasticBlockStore']['volumeID'] for x in volume_list
        ]
        # Strip the "aws://<zone>/" prefix, leaving bare volume ids.
        just_the_volume_ids = [
            re.sub("^aws://.*/", "", x) for x in just_the_aws_path
        ]
        return just_the_volume_ids

    @staticmethod
    def filter_out_non_cluster_vols(account_vols, cluster_vols):
        """ We have a list of all volumes in the account, return only those
            that are part of this cluster """
        cluster_list = [x for x in account_vols if x.id in cluster_vols]
        return cluster_list

    def run(self):
        """ Run the main logic of this check """
        # Load the state machine data
        self.load_volume_state_data()

        # Get the volumes that are currently in a transitioning state
        full_trans_vols = self.eu.get_trans_attach_status_vols()

        # Get the cluster's list of volumes
        cluster_vols = self.get_cluster_volumes()

        # Remove volumes that aren't part of this cluster
        trans_vols = self.filter_out_non_cluster_vols(full_trans_vols,
                                                      cluster_vols)

        # Based on that list, weed out the volumes that used to be transitioning,
        # that are no longer in the transitioning volumes list. This means that
        # it was a normal volume transition, probably from attaching to attached
        # or detaching to detached (aka None).
        self.remove_no_longer_transitioning_volumes(trans_vols)

        # Check on the volumes that were in the stuck state that are no longer
        # in the transitioning volumes list. This means that they went from stuck
        # to unstuck. We need to track these so that we can report that they've become
        # unstuck to monitoring.
        self.set_unstuck_volumes(trans_vols)

        # Add any volumes that are transitioning that we haven't seen before to our data
        self.add_new_transitioning_volumes(trans_vols)

        # Change volumes that are still transitioning and have hit their deadline to
        # finish that transition to a state of "stuck"
        self.set_stuck_volumes()

        # Report to monitoring the stuck volumes
        self.report_stuck_volumes()

        # Report to monitoring the volumes that were stuck, but are now unstuck (no
        # longer transitioning)
        self.report_unstuck_volumes()

        # Since the unstuck volumes have been reported, they can safely be removed from
        # our tracking now.
        self.remove_unstuck_volumes_from_state_data()

        # Make sure we save state for the next run.
        self.save_volume_state_data()

        self.eu.verbose_print("\nTracking Volumes")
        self.eu.verbose_print("----------------\n")

        # Cat out the state file
        raw_state_file = self.read_raw_volume_state_data()
        self.eu.verbose_print(raw_state_file)
def __init__(self):
    """Create the oc client pointed at the default namespace."""
    kubeconfig = '/tmp/admin.kubeconfig'
    self.kubeconfig = kubeconfig
    self.oc = OCUtil(namespace='default', config_file=kubeconfig)