def __init__(
     self,
     args=None,
 ):
     '''Initialize the check'''
     self.args = args
     self.kubeconfig = '/tmp/admin.kubeconfig'
     self.oc = OCUtil(namespace=self.args.namespace,
                      config_file=self.kubeconfig)
Example 2
    def __init__(self):
        """ initialize EBSStuckVolumesCheck class """
        self.args = None
        self.vol_state_data = None

        self.parse_args()

        # Make sure we're using the profile they've requested.
        if self.args.aws_creds_profile:
            os.environ['AWS_PROFILE'] = self.args.aws_creds_profile

        self.eu = EbsUtil(self.args.region, verbose=self.args.verbose)
        self.ocutil = OCUtil(verbose=self.args.verbose)
        self.mts = MetricSender(verbose=self.args.verbose)
    def run(self):
        """Main function to run the check"""

        self.ocutil = OCUtil(config_file=self.kubeconfig,
                             verbose=self.args.verbose)
        self.zgs = ZaggSender(verbose=self.args.verbose, debug=self.args.debug)

        self.check_all_router_health()
        self.check_router_replica_count()

        if self.args.dry_run:
            self.zgs.print_unique_metrics_key_value()
        else:
            self.zgs.send_metrics()
def main():
    ''' Gather and send details on all visible S3 buckets '''
    logger.info("start")

    discovery_key = "disc.aws"
    discovery_macro = "#S3_BUCKET"
    prototype_s3_size = "disc.aws.size"
    prototype_s3_count = "disc.aws.objects"

    args = parse_args()

    if args.verbose:
        logger.setLevel(logging.DEBUG)
        logger.debug("verbose flag set")

    ocutil = OCUtil()
    dc_yaml = ocutil.get_dc('docker-registry')
    registry_config_secret = get_registry_config_secret(dc_yaml)

    oc_yaml = ocutil.get_secrets(registry_config_secret)

    aws_access, aws_secret = get_aws_creds(oc_yaml)
    awsutil = AWSUtil(aws_access, aws_secret, args.debug)

    bucket_list = awsutil.get_bucket_list(args.debug)

    bucket_stats = {}

    for bucket in bucket_list:
        s3_size, s3_objects = awsutil.get_bucket_info(bucket, args.debug)
        bucket_stats[bucket] = {"size": s3_size, "objects": s3_objects}

    if args.debug:
        print "Bucket stats: " + str(bucket_stats)

    if args.test:
        print "Test-only. Received results: " + str(bucket_stats)
    else:
        zgs = ZaggSender(verbose=args.debug)
        zgs.add_zabbix_dynamic_item(discovery_key, discovery_macro,
                                    bucket_list)
        for bucket in bucket_stats.keys():
            zab_key = "{}[{}]".format(prototype_s3_size, bucket)
            zgs.add_zabbix_keys(
                {zab_key: int(round(bucket_stats[bucket]["size"]))})

            zab_key = "{}[{}]".format(prototype_s3_count, bucket)
            zgs.add_zabbix_keys({zab_key: bucket_stats[bucket]["objects"]})
        zgs.send_metrics()
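
For reference, a minimal sketch of the Zabbix item keys the loop above builds; the bucket name is hypothetical:

# hedged illustration, not part of the original script
prototype_s3_size = "disc.aws.size"
bucket = "my-registry-bucket"
zab_key = "{}[{}]".format(prototype_s3_size, bucket)
assert zab_key == "disc.aws.size[my-registry-bucket]"
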
def get_registry_config_secret(yaml_results):
    ''' Find the docker registry config secret '''

    ocutil = OCUtil()
    volumes = yaml_results['spec']['template']['spec']['volumes']
    for volume in volumes:
        if 'emptyDir' in volume:
            continue
        secret_dict = ocutil.get_secrets(volume['secret']['secretName'])
        if 'config.yml' in secret_dict['data']:
            return volume['secret']['secretName']

    print "Unable to find the %s the docker registry config"
    print "Please run \"oc get dc docker-registry\" to investigate"
    sys.exit(1)
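
For orientation, a hedged sketch of the deploymentconfig structure the loop above traverses; the volume and secret names are hypothetical:

dc_yaml = {  # hypothetical, trimmed to the fields the function reads
    'spec': {'template': {'spec': {'volumes': [
        # emptyDir volumes are skipped outright
        {'name': 'registry-storage', 'emptyDir': {}},
        # secret-backed volumes are fetched and checked for a 'config.yml' key
        {'name': 'docker-config', 'secret': {'secretName': 'registry-config'}},
    ]}}}
}
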
Example 6
    def run(self):
        ''' Main function that runs the check '''
        self.parse_args()
        self.metric_sender = MetricSender(verbose=self.args.verbose,
                                          debug=self.args.debug)

        self.oc = OCUtil(namespace='openshift-infra',
                         config_file='/tmp/admin.kubeconfig',
                         verbose=self.args.verbose)

        pod_report = self.check_pods()
        self.get_hawkular_creds()
        metrics_report = self.check_node_metrics()
        # if metrics_report['success'] == 0, run the check again
        if metrics_report['success'] == 0:
            # wait, then run the node metrics check a second time
            logger.info(
                "First metrics check failed; will retry in 5 seconds"
            )
            time.sleep(commandDelay)
            logger.info("starting second metrics check")
            metrics_report = self.check_node_metrics()
            # persist the details if the second attempt also fails
            if metrics_report['success'] == 0:
                self.persist_details(metrics_report)
        self.report_to_zabbix(pod_report, metrics_report['success'])
Example 7
 def get_logging_namespace(self):
     """ Determine which logging namespace is in use """
     # Assume the correct namespace is 'openshift-logging' and fall back to 'logging'
     # if that assumption ends up being wrong.
     oc_client = OCUtil(namespace='openshift-logging',
                        config_file='/tmp/admin.kubeconfig',
                        verbose=self.args.verbose)
     logger.info("Determining which namespace is in use...")
     try:
         oc_client.get_dc('logging-kibana')
         # If the previous call didn't throw an exception, logging is deployed in this namespace.
         logger.info("Using namespace: openshift-logging")
         return 'openshift-logging'
     except subprocess.CalledProcessError:
         logger.info("Using namespace: logging")
         return 'logging'
def main():
    ''' Gather and send details on all visible S3 buckets '''

    discovery_key = "disc.gcp"
    discovery_macro = "#GCS_BUCKET"
    prototype_bucket_size = "disc.gcp.size"
    prototype_bucket_count = "disc.gcp.objects"

    args = parse_args()

    ocutil = OCUtil()
    dc_yaml = ocutil.get_dc('docker-registry')
    registry_config_secret = get_registry_config_secret(dc_yaml)

    oc_yaml = ocutil.get_secrets(registry_config_secret)

    bucket = get_gcp_info(oc_yaml)
    gsutil = GcloudUtil(verbose=args.debug)

    bucket_list = gsutil.get_bucket_list()

    bucket_stats = {}

    for bucket in bucket_list:
        size, objects = gsutil.get_bucket_info(bucket)
        bucket_stats[bucket] = {"size": size, "objects": objects}

    if args.debug:
        print "Bucket stats: " + str(bucket_stats)

    if args.test:
        print "Test-only. Received results: " + str(bucket_stats)
    else:
        zgs = ZaggSender(verbose=args.debug)
        zgs.add_zabbix_dynamic_item(discovery_key, discovery_macro, bucket_list)
        for bucket in bucket_stats.keys():
            zab_key = "{}[{}]".format(prototype_bucket_size, bucket)
            zgs.add_zabbix_keys({zab_key: int(round(bucket_stats[bucket]["size"]))})

            zab_key = "{}[{}]".format(prototype_bucket_count, bucket)
            zgs.add_zabbix_keys({zab_key: bucket_stats[bucket]["objects"]})
        zgs.send_metrics()
    def run(self):
        ''' Main function that runs the check '''
        self.parse_args()
        self.zagg_sender = ZaggSender(verbose=self.args.verbose, debug=self.args.debug)

        self.oc = OCUtil(namespace='openshift-infra', config_file=self.kubeconfig, verbose=self.args.verbose)

        pod_report = self.check_pods()
        metrics_report = self.check_node_metrics()

        self.report_to_zabbix(pod_report, metrics_report)
    def run(self):
        """  Main function to run the check """

        self.parse_args()
        self.get_kubeconfig()
        ocutil = OCUtil(config_file=self.kubeconfig, verbose=self.args.verbose)
        self.zagg_sender = ZaggSender(verbose=self.args.verbose, debug=self.args.debug)

        try:
            oc_yaml = ocutil.get_service('docker-registry')
            self.get_registry_service(oc_yaml)
            oc_yaml = ocutil.get_endpoint('docker-registry')
            self.get_registry_endpoints(oc_yaml)
        except Exception as ex:
            print "Problem retreiving registry IPs: %s " % ex.message

        self.registry_service_check()
        self.registry_health_check()

        self.zagg_sender.send_metrics()
    def run(self):
        """  Main function to run the check """

        self.parse_args()
        self.get_kubeconfig()
        ocutil = OCUtil(config_file=self.kubeconfig, verbose=self.args.verbose)
        self.metric_sender = MetricSender(verbose=self.args.verbose, debug=self.args.debug)

        try:
            oc_yaml = ocutil.get_service('docker-registry')
            self.get_registry_service(oc_yaml)
            oc_yaml = ocutil.get_endpoint('docker-registry')
            self.get_registry_endpoints(oc_yaml)
        except Exception as ex:
            print "Problem retreiving registry IPs: %s " % ex.message

        self.registry_service_check()
        self.registry_health_check()

        self.metric_sender.send_metrics()
Example 12
    def run(self):
        """ Main function that runs the check """
        self.parse_args()
        self.metric_sender = MetricSender(verbose=self.args.verbose,
                                          debug=self.args.debug)
        self.oc = OCUtil(namespace=self.get_logging_namespace(),
                         config_file='/tmp/admin.kubeconfig',
                         verbose=self.args.verbose)
        self.get_pods()

        oldest_buffer = self.check_fluentd_queues()

        self.send_metrics(oldest_buffer)
def main():
    """ Gather and send details on all visible S3 buckets """

    discovery_key = "disc.aws"
    discovery_macro = "#S3_BUCKET"
    prototype_s3_size = "disc.aws.size"
    prototype_s3_count = "disc.aws.objects"

    args = parse_args()

    ocutil = OCUtil()
    oc_yaml = ocutil.get_secrets("dockerregistry")

    aws_access, aws_secret = get_aws_creds(oc_yaml)
    awsutil = AWSUtil(aws_access, aws_secret, args.debug)

    bucket_list = awsutil.get_bucket_list(args.debug)

    bucket_stats = {}

    for bucket in bucket_list:
        s3_size, s3_objects = awsutil.get_bucket_info(bucket, args.debug)
        bucket_stats[bucket] = {"size": s3_size, "objects": s3_objects}

    if args.debug:
        print "Bucket stats: " + str(bucket_stats)

    if args.test:
        print "Test-only. Received results: " + str(bucket_stats)
    else:
        zgs = ZaggSender(verbose=args.debug)
        zgs.add_zabbix_dynamic_item(discovery_key, discovery_macro, bucket_list)
        for bucket in bucket_stats.keys():
            zab_key = "{}[{}]".format(prototype_s3_size, bucket)
            zgs.add_zabbix_keys({zab_key: int(round(bucket_stats[bucket]["size"]))})

            zab_key = "{}[{}]".format(prototype_s3_count, bucket)
            zgs.add_zabbix_keys({zab_key: bucket_stats[bucket]["objects"]})
        zgs.send_metrics()
class ZabbixInfo(object):
    '''
      this will check the zabbix data and compare it with the real world
    '''
    def __init__(self, args=None):
        '''Initialize ZabbixInfo'''
        self.args = args
        self.kubeconfig = '/tmp/admin.kubeconfig'
        self.oc = OCUtil(namespace=self.args.namespace, config_file=self.kubeconfig)

    def check_all_hosts(self, zabbix_data_sync_inventory_hosts, clusterid):
        ''' check that every cluster node appears in the zabbix inventory '''
        result = 1
        zabbix_data_sync_inventory_hosts_names = []
        for host in zabbix_data_sync_inventory_hosts:
            zabbix_data_sync_inventory_hosts_names.append(host['name'])

        desire_number_cluster = cluster_desired_compute_size + cluster_desired_infra_size + cluster_desired_master_size
        logging.getLogger().info("the requested number of instance is :" + str(desire_number_cluster))
        hosts = self.oc.get_nodes()
        for host in hosts['items']:
            hostnameincluster = ""
            if host['metadata']['labels']['type'] == 'master':
                hostnameincluster = host['metadata']['labels']['hostname']
            elif host['metadata']['labels']['type'] == 'infra':
                hostnameincluster = clusterid + "-infra-" + host['metadata']['labels']['kubernetes.io/hostname']
            else:
                hostnameincluster = clusterid + "-compute-" + host['metadata']['labels']['kubernetes.io/hostname']

            if hostnameincluster in zabbix_data_sync_inventory_hosts_names:
                logging.getLogger().info("found host in zabbix: " + str(hostnameincluster))
            else:
                result = 0
                logging.getLogger().info("host not in zabbix: " + str(hostnameincluster))

        if result == 1:
            if len(hosts['items']) == desire_number_cluster:
                logging.getLogger().info("the cluster currently has: " + str(len(hosts['items'])))
                logging.getLogger().info("all nodes are under monitoring and the count matches the requested size: " + str(desire_number_cluster))
            else:
                result = 2
                logging.getLogger().info("cluster node count differs from the requested size")

        return result
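
A hedged illustration of the inventory names the comparison above expects; the clusterid and node labels are hypothetical:

clusterid = "prod"  # hypothetical
labels = {'type': 'infra', 'kubernetes.io/hostname': 'ip-172-31-0-7'}  # hypothetical
expected_name = clusterid + "-infra-" + labels['kubernetes.io/hostname']
assert expected_name == "prod-infra-ip-172-31-0-7"
# master nodes are matched on labels['hostname'] directly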

    def send_metrics(self, status):
        """send_metrics"""
        ms = MetricSender(verbose=self.args.verbose)
        ms.add_metric({'openshift.master.zabbix.inventory.status': status})
        ms.send_metrics()
Example 15
def main():
    ''' Gather and send details on all visible S3 buckets '''

    # get the region from the monitoring config
    with open('/container_setup/monitoring-config.yml', 'r') as f:
        doc = yaml.safe_load(f)
    bucket_region = doc['oso_region']

    args = parse_args()

    ocutil = OCUtil()
    dc_yaml = ocutil.get_dc('docker-registry')
    registry_config_secret = get_registry_config_secret(dc_yaml)

    oc_yaml = ocutil.get_secrets(registry_config_secret)

    aws_access, aws_secret = get_aws_creds(oc_yaml)
    awsutil = AWSUtil(aws_access, aws_secret, args.debug)

    bucket_list = awsutil.get_bucket_list(verbose=args.debug,
                                          BucketRegion=bucket_region)

    bucket_stats = {}

    for bucket in bucket_list:
        s3_size, s3_objects = awsutil.get_bucket_info(
            bucket, verbose=args.debug, BucketRegion=bucket_region)
        bucket_stats[bucket] = {"size": s3_size, "objects": s3_objects}

    if args.debug:
        print "Bucket stats: " + str(bucket_stats)

    if args.test:
        print "Test-only. Received results: " + str(bucket_stats)
    else:
        send_zagg_data(bucket_list, bucket_stats, args)
Example 16
    def run(self):
        ''' Main function that runs the check '''
        self.parse_args()
        self.metric_sender = MetricSender(verbose=self.args.verbose,
                                          debug=self.args.debug)

        self.oc = OCUtil(namespace='openshift-infra',
                         config_file='/tmp/admin.kubeconfig',
                         verbose=self.args.verbose)

        pod_report = self.check_pods()
        self.get_hawkular_creds()
        metrics_report = self.check_node_metrics()

        self.report_to_zabbix(pod_report, metrics_report)
Example 17
def main():
    ''' main() '''
    args = parse_args()

    if args.verbose:
        logger.setLevel(logging.DEBUG)

    logger.info("Starting")

    # TODO: include this in library
    projects_info = OCUtil().get_projects()

    maxDelta = testProjects(
        projects_info['items'],
        current_time=datetime.datetime.now(),
    )

    send_zagg_data(maxDelta)
    logger.info('Oldest Terminating project: %s seconds', maxDelta)
Example 18
def main():
    ''' main() '''
    args = parse_args()

    if args.verbose:
        logger.setLevel(logging.DEBUG)

    logger.info("Starting")

    # TODO: include this in library
    projects_info = OCUtil()._run_cmd("oc get projects -o yaml")

    time_keeps_max = testProjects(
        projects_info['items'],
        current_time=datetime.datetime.now(),
    )

    send_zagg_data(time_keeps_max)
    logger.info('Oldest Terminating project: %s seconds', time_keeps_max)
class InfraNodePodStatus(object):
    '''
      This is a check for making sure the internal pods like
      router and registry running and located on different infra nodes
    '''
    def __init__(self):
        '''Initialize InfraNodePodStatus'''
        self.kubeconfig = '/tmp/admin.kubeconfig'
        self.oc = OCUtil(namespace='default', config_file=self.kubeconfig)

    def check_pods(self):
        ''' get all the pod information '''
        pods = self.oc.get_pods()
        pod_report = {}
        for pod in pods['items']:
            pod_name = pod['metadata']['name']
            pod_report[pod_name] = {}
            pod_report[pod_name]['hostIP'] = pod['status']['hostIP']
            pod_report[pod_name]['status'] = pod['status']['phase']
        return pod_report

    @staticmethod
    def compare_ip(keyword, pod_info_dict):
        ''' to compare the pod host ip and check the pod status '''
        pod_hostip_status = [
            pod_info_dict[i] for i in pod_info_dict.keys() if keyword in i
        ]
        pod_run_num = 0
        for i in pod_hostip_status:
            if i['status'] == "Running":
                pod_run_num += 1
        if len(pod_hostip_status) == 2:
            if pod_hostip_status[0]['hostIP'] != pod_hostip_status[1]['hostIP']:
                # print "ok, you do not need do anything for {} pod".format(keyword)
                result_code = 1
            else:
                # print "there are something wrong, please check the pod"
                result_code = 0
        else:
            print "please check the pod"
            result_code = 0
        # result_code 1 means the two pods are on different nodes
        # pod_run_num is the number of pods in Running state
        return result_code, pod_run_num
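
A minimal sketch of what compare_ip() returns for a healthy pair, using a hypothetical pod report:

pod_info = {  # hypothetical output of check_pods()
    'router-1-abcde': {'hostIP': '10.0.0.1', 'status': 'Running'},
    'router-1-fghij': {'hostIP': '10.0.0.2', 'status': 'Running'},
}
# compare_ip('router', pod_info) -> (1, 2):
# location code 1 (two pods on distinct hosts), 2 pods in Running state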

    def run(self):
        ''' run the command and send the code to zabbix '''
        ms = MetricSender()

        pod_report = self.check_pods()

        # the check_value is the value to send to zabbix
        router_check_value = self.compare_ip('router', pod_report)
        registry_check_value = self.compare_ip('registry', pod_report)
        print router_check_value, registry_check_value

        ms.add_metric({'openshift.router.pod.location': router_check_value[0]})
        ms.add_metric({'openshift.router.pod.status': router_check_value[1]})
        ms.add_metric(
            {'openshift.registry.pod.location': registry_check_value[0]})
        ms.add_metric(
            {'openshift.registry.pod.status': registry_check_value[1]})
        ms.send_metrics()
class InfraNodePodStatus(object):
    '''
      This is a check for making sure the internal pods like
      router and registry running and located on different infra nodes
    '''
    def __init__(
        self,
        args=None,
    ):
        '''Initialize InfraNodePodStatus'''
        self.args = args
        self.kubeconfig = '/tmp/admin.kubeconfig'
        self.oc = OCUtil(namespace=self.args.namespace,
                         config_file=self.kubeconfig)
        self.all_pods = self.get_all_pods()

    def get_all_pods(self):
        ''' get all the pod information '''
        pods = self.oc.get_pods()
        pod_report = {}
        for pod in pods['items']:
            pod_name = pod['metadata']['name']
            pod_report[pod_name] = {}
            pod_report[pod_name]['hostIP'] = pod['status']['hostIP']
            pod_report[pod_name]['status'] = pod['status']['phase']
        return pod_report

    def get_expected_replicas(self, deploymentconfig):
        ''' get expected replica count from deploymentconfig '''
        defined_replicas = self.oc.get_dc(deploymentconfig)['spec']['replicas']
        return defined_replicas

    def get_pods_by_name(self, podname):
        """get_pods_by_name"""
        return [
            self.all_pods[i] for i in self.all_pods.keys()
            if i.startswith(podname + '-')
        ]
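
get_pods_by_name() keys off a "name-" prefix, so deployment suffixes still match while unrelated pods do not; a hedged example with hypothetical names:

all_pods = {'router-2-abcde': 'a', 'registry-console-1-fghij': 'b'}  # hypothetical
matches = [all_pods[i] for i in all_pods.keys() if i.startswith('router' + '-')]
assert matches == ['a']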

    def check_pods(
        self,
        podname,
        keybase="",
        pod_optional=False,
    ):
        ''' to compare the pod host ip and check the pod status '''
        logging.getLogger().info("Finding pods for: %s", podname)

        result_code = 1

        pods = self.get_pods_by_name(podname)
        logging.getLogger().info("Pods Found: %s", len(pods))

        expected_replicas = 0
        try:
            expected_replicas = self.get_expected_replicas(podname)
        except Exception:
            logging.getLogger().warn("dc not found for pod %s", podname)
            if pod_optional:
                logging.getLogger().warn(
                    "Some clusters don't have pod %s, please confirm before trying to fix this",
                    podname)
            return  # nothing we should do, so quit early, don't do more checks

        logging.getLogger().info("Expected Replicas: %s", expected_replicas)
        if len(pods) != expected_replicas:
            result_code = 0
            logging.getLogger().critical("Count Pods and Replicas don't match")

        count_pods_running = len([i for i in pods if i['status'] == "Running"])
        logging.getLogger().info("Pods Running: %s", count_pods_running)
        if len(pods) != count_pods_running:
            result_code = 0
            logging.getLogger().critical("Some pods are not in running state")

        host_ips = set([x['hostIP'] for x in pods])
        logging.getLogger().info("Hosts found: %d", len(host_ips))
        if len(host_ips) < 2 or len(pods) < 2:
            result_code = 0
            logging.getLogger().critical(
                "%s has %d pods on %d hosts, not distributed", podname,
                len(pods), len(host_ips))

        if result_code == 0:
            logging.getLogger().critical("Please check pods are in running "
                                         "state, and on unique hosts")
            logging.getLogger().critical("oc get pods -n %s -o wide",
                                         self.args.namespace)

        # result_code 1 means the pods are spread across different nodes
        # count_pods_running is the number of pods in Running state
        self.send_metrics(keybase=keybase,
                          location=result_code,
                          status=count_pods_running)

    def send_metrics(self, keybase="", location="", status=""):
        """send_metrics"""
        ms = MetricSender(verbose=self.args.verbose)
        ms.add_metric({keybase + '.location': location})
        ms.add_metric({keybase + '.status': status})
        ms.send_metrics()
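
So a call like check_pods('router', keybase='openshift.router.pod') would queue two items; a hedged sketch of the resulting keys, matching the key names used elsewhere in these checks:

# openshift.router.pod.location -> 1 when the pods sit on distinct hosts
# openshift.router.pod.status   -> count of pods in the Running state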
class OpenshiftRouterChecks(object):
    """Checks for the Openshift Router"""
    def __init__(self):
        self.args = None
        self.metrics = None  # metric sender
        self.kubeconfig = None
        self.parse_args()
        self.get_kubeconfig()
        self.ocutil = None

    def get_kubeconfig(self):
        """Find kubeconfig to use for OCUtil"""
        # Default master kubeconfig
        kubeconfig = '/tmp/admin.kubeconfig'
        non_master_kube_dir = '/etc/origin/node'

        if os.path.exists(kubeconfig):
            # If /tmp/admin.kubeconfig exists, use it!
            pass
        elif os.path.isdir(non_master_kube_dir):
            for my_file in os.listdir(non_master_kube_dir):
                if my_file.endswith(".kubeconfig"):
                    kubeconfig = os.path.join(non_master_kube_dir, my_file)

        if self.args.debug:
            print "Using kubeconfig: {}".format(kubeconfig)

        self.kubeconfig = kubeconfig

    def check_all_router_health(self):
        """ Perform defined router health check on all routers """

        discovery_key = "disc.openshift.cluster.router"
        discovery_macro = "#OS_ROUTER"
        router_health_item = "disc.openshift.cluster.router.health"

        router_pods = self.find_router_pods()
        health_report = {}
        for router_name, pod_details in router_pods.iteritems():
            health = self.router_pod_healthy(pod_details)
            if self.args.verbose:
                print "{} healthy: {}\n".format(router_name, health)
            health_report[router_name] = health

        # make dynamic items, and queue up the associated data
        router_names = health_report.keys()
        self.metrics.add_dynamic_metric(discovery_key,
                                        discovery_macro,
                                        router_names,
                                        synthetic=True)

        for router_name, health_status in health_report.iteritems():
            zbx_key = "{}[{}]".format(router_health_item, router_name)
            self.metrics.add_metric({zbx_key: int(health_status)},
                                    synthetic=True)

    def running_pod_count_check(self):
        """ return hash of deployment configs containing whether the number
            of running pods matches the definition in the deployment config """

        router_pods = self.find_router_pods()

        # get actual running pod count (per DC)
        dc_pod_count = {}
        for _, details in router_pods.iteritems():
            dc_name = details['metadata']['labels']['deploymentconfig']
            dc_pod_count[dc_name] = dc_pod_count.get(dc_name, 0) + 1

        if self.args.debug:
            print "Running pod count: {}".format(dc_pod_count)

        # get expected pod count as defined in each router DC
        expected_pod_count = {}
        for dc_name in dc_pod_count.keys():
            expected_pod_count[dc_name] = self.ocutil.get_dc(
                dc_name)['spec']['replicas']

        if self.args.debug:
            print "Expected pod count: {}".format(expected_pod_count)

        results = {}
        for dc_name in dc_pod_count.keys():
            results[dc_name] = bool(
                dc_pod_count[dc_name] == expected_pod_count[dc_name])

        if self.args.verbose or self.args.debug:
            print "DC replica count matching actual counts: {}".format(results)

        return results
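
A hedged sketch of that tally, with hypothetical pod and deploymentconfig names:

router_pods = {  # hypothetical find_router_pods() output, trimmed to the label used
    'router-1-aaaaa': {'metadata': {'labels': {'deploymentconfig': 'router'}}},
    'router-1-bbbbb': {'metadata': {'labels': {'deploymentconfig': 'router'}}},
}
# the counting loop collapses this to {'router': 2}, which is then
# compared against the DC's spec.replicas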

    def check_router_replica_count(self):
        """ Check whether the running router replica count is the same
            as what is defined in the deployment config """

        discovery_key = "disc.openshift.cluster.router"
        discovery_macro = "#ROUTER_DC"
        dc_status_item = "disc.openshift.cluster.router.expected_pod_count"

        replica_results = self.running_pod_count_check()

        # make dynamic items, and queue up the associated data
        dc_names = replica_results.keys()
        self.metrics.add_dynamic_metric(discovery_key,
                                        discovery_macro,
                                        dc_names,
                                        synthetic=True)

        for dc_name, replica_status in replica_results.iteritems():
            zbx_key = "{}[{}]".format(dc_status_item, dc_name)
            self.metrics.add_metric({zbx_key: int(replica_status)},
                                    synthetic=True)

    def run(self):
        """Main function to run the check"""

        self.ocutil = OCUtil(config_file=self.kubeconfig,
                             verbose=self.args.verbose)
        self.metrics = MetricSender(verbose=self.args.verbose,
                                    debug=self.args.debug)

        self.check_all_router_health()
        self.check_router_replica_count()

        if self.args.dry_run:
            self.metrics.print_unique_metrics_key_value()
        else:
            self.metrics.send_metrics()

    def parse_args(self):
        """ parse the args from the cli """

        parser = argparse.ArgumentParser(description='Openshift Router sender')
        parser.add_argument('-v',
                            '--verbose',
                            action='store_true',
                            default=None,
                            help='Verbose?')
        parser.add_argument('--debug',
                            action='store_true',
                            default=None,
                            help='Debug?')
        parser.add_argument('--dry-run',
                            action='store_true',
                            default=False,
                            help='Collect stats, but no report to zabbix')

        self.args = parser.parse_args()

    @staticmethod
    def get_router_health_url(router):
        """ build router healthcheck URL """

        podip = router['status']['podIP']
        port = router['spec']['containers'][0]['livenessProbe']['httpGet'][
            'port']
        path = router['spec']['containers'][0]['livenessProbe']['httpGet'][
            'path']
        url = 'http://{}:{}{}'.format(podip, port, path)

        return url
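
A minimal sketch of the URL this builds from a hypothetical pod definition; the port and path come from the container's livenessProbe, so the values below are assumptions:

router = {  # hypothetical, trimmed to the fields read above
    'status': {'podIP': '10.1.2.3'},
    'spec': {'containers': [
        {'livenessProbe': {'httpGet': {'port': 1936, 'path': '/healthz'}}},
    ]},
}
# get_router_health_url(router) -> 'http://10.1.2.3:1936/healthz'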

    @staticmethod
    def router_pod_healthy(router):
        """ ping the health port for router pod health """

        url = OpenshiftRouterChecks.get_router_health_url(router)

        try:
            result = urllib2.urlopen(url).getcode()
            if result == 200:
                return True
            else:
                return False
        except (urllib2.HTTPError, urllib2.URLError):
            return False

    def find_router_pods(self):
        """ return dict of PODs running haproxy (the router pods) """

        router_pods = {}
        for pod in self.ocutil.get_pods()['items']:
            try:
                img = pod['status']['containerStatuses'][0]['image']
                if 'ose-haproxy-router' in img:
                    router_pods[pod['metadata']['name']] = pod
            except KeyError:
                pass

        return router_pods
# Our jenkins server does not include these rpms.
# In the future we might move this to a container where these
# libs might exist
#pylint: disable=import-error
from openshift_tools.monitoring.ocutil import OCUtil
from openshift_tools.monitoring.metric_sender import MetricSender

import logging
logging.basicConfig(
    format='%(asctime)s - %(relativeCreated)6d - %(levelname)-8s - %(message)s',
)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

ocutil = OCUtil()

valid_build_states = ["cancelled", "complete", "new", "error", "failed"]


def runOCcmd(cmd, base_cmd='oc'):
    """ log commands through ocutil """
    logger.info(base_cmd + " " + cmd)
    return ocutil.run_user_cmd(
        cmd,
        base_cmd=base_cmd,
    )
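
A hedged usage sketch; each call logs the full command line before handing it to ocutil:

# runOCcmd("get nodes")                  # logs "oc get nodes"
# runOCcmd("get builds", base_cmd="oc")  # explicit base command, same effect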


def parse_args():
    """ parse the args from the cli """
Example 23
class EBSStuckVolumesCheck(object):
    """
       This class houses a check that looks for EBS volumes that are stuck in a
       transition state (attaching, detaching, busy, etc).
    """
    def __init__(self):
        """ initialize EBSStuckVolumesCheck class """
        self.args = None
        self.vol_state_data = None

        self.parse_args()

        # Make sure we're using the profile they've requested.
        if self.args.aws_creds_profile:
            os.environ['AWS_PROFILE'] = self.args.aws_creds_profile

        self.eu = EbsUtil(self.args.region, verbose=self.args.verbose)
        self.ocutil = OCUtil(verbose=self.args.verbose)
        self.mts = MetricSender(verbose=self.args.verbose)

    def parse_args(self):
        ''' Parse arguments passed to the script '''
        parser = argparse.ArgumentParser(
            description='OpenShift Cluster Metrics Checker')
        parser.add_argument('-v',
                            '--verbose',
                            action='store_true',
                            default=None,
                            help='Verbose output')
        parser.add_argument('--region',
                            required=True,
                            help='AWS EC2 Region to check')
        parser.add_argument('--stuck-after', default=120, type=int,
                            help='Amount of time in seconds after which the volume is ' + \
                                 'determined to be "stuck".')
        parser.add_argument('--aws-creds-profile',
                            required=False,
                            help='The AWS credentials profile to use.')

        self.args = parser.parse_args()

    @staticmethod
    def read_raw_volume_state_data():
        """ Reads in the raw string the volume state data from disk """
        if not os.path.isfile(STATE_DATA_FILE):
            return ""  # Act like the file is blank

        with open(STATE_DATA_FILE, 'r') as stream:
            return stream.read()

    def load_volume_state_data(self):
        """ Loads the volume state data from disk """
        if os.path.isfile(STATE_DATA_FILE):
            with open(STATE_DATA_FILE, 'r') as stream:
                self.vol_state_data = yaml.safe_load(stream)
        else:
            self.vol_state_data = {}

    def save_volume_state_data(self):
        """ Saves the volume state data to disk """
        with open(STATE_DATA_FILE, 'w') as outfile:
            yaml.dump(self.vol_state_data,
                      outfile,
                      default_flow_style=False,
                      allow_unicode=True)
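
Roughly, one entry of the saved state file then looks like the sketch below; the literal key strings depend on the *_KEY constants, so every name and value here is an assumption:

# <volume-uri>:                       # hypothetical shape
#   attach_status: attaching          # ATTACH_STATUS_KEY
#   state: transition                 # STATE_KEY, one of TRANSITION/STUCK/UNSTUCK
#   stuck_after: 2017-01-01 00:02:00  # STUCK_AFTER_KEY, now + --stuck-after
#   volume_id: vol-0abc123def456      # VOLUME_ID_KEY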

    def add_new_transitioning_volumes(self, trans_vols):
        """ Adds volumes that we haven't seen before that are in a transitioning state. """
        for vol in trans_vols:
            vol_uri = self.eu.generate_volume_uri(vol)

            if vol_uri not in self.vol_state_data.keys():
                # This is the first time we've seen this volume, add it.
                self.vol_state_data[vol_uri] = {}
                self.vol_state_data[vol_uri][STUCK_AFTER_KEY] = datetime.now() + \
                    timedelta(seconds=self.args.stuck_after)
                self.vol_state_data[vol_uri][VOLUME_ID_KEY] = str(vol.id)
                self.vol_state_data[vol_uri][STATE_KEY] = TRANSITION_STATE

            self.vol_state_data[vol_uri][ATTACH_STATUS_KEY] = str(
                vol.attach_data.status)

    def set_stuck_volumes(self):
        """ Sets volumes to state 'stuck' if they've passed their transition state deadline. """
        for item in self.vol_state_data.itervalues():
            # We don't want to set unstuck volumes back to stuck.
            if item[STATE_KEY] != UNSTUCK_STATE:
                if datetime.now() > item[STUCK_AFTER_KEY]:
                    item[STATE_KEY] = STUCK_STATE

    def set_unstuck_volumes(self, trans_vols):
        """
            Change volumes that were in state 'stuck' that are no longer in transition,
            to state 'unstuck'.
        """

        trans_vol_ids = [str(vol.id) for vol in trans_vols]

        for vol_uri, cache_data in self.vol_state_data.iteritems():
            if cache_data[STATE_KEY] == STUCK_STATE and \
               cache_data[VOLUME_ID_KEY] not in trans_vol_ids:
                # This volume was stuck, but isn't any longer
                self.vol_state_data[vol_uri][STATE_KEY] = UNSTUCK_STATE

    def report_stuck_volumes(self):
        """ sends data to monitoring that these volumes are stuck. """
        for vol_uri, cache_data in self.vol_state_data.iteritems():
            if cache_data[STATE_KEY] == STUCK_STATE:
                self.mts.add_dynamic_metric(EBS_VOLUME_URI_DISC_KEY,
                                            EBS_VOLUME_URI_DISC_MACRO,
                                            [vol_uri])

                item_name = '%s[%s]' % (EBS_VOLUME_ATTACH_STATE_KEY, vol_uri)
                self.mts.add_metric({item_name: MONITORING_STUCK_VALUE})

        # Actually send them
        self.mts.send_metrics()

    def report_unstuck_volumes(self):
        """ sends data to monitoring that these volumes have become unstuck. """
        for vol_uri, cache_data in self.vol_state_data.iteritems():
            if cache_data[STATE_KEY] == UNSTUCK_STATE:
                self.mts.add_dynamic_metric(EBS_VOLUME_URI_DISC_KEY,
                                            EBS_VOLUME_URI_DISC_MACRO,
                                            [vol_uri])

                item_name = '%s[%s]' % (EBS_VOLUME_ATTACH_STATE_KEY, vol_uri)
                self.mts.add_metric({item_name: MONITORING_UNSTUCK_VALUE})

        # Actually send them
        self.mts.send_metrics()

    def remove_unstuck_volumes_from_state_data(self):
        """ Removes state 'unstuck' volumes from the state data (no longer need to track them) """
        for vol_uri in self.vol_state_data.keys():
            cache_data = self.vol_state_data[vol_uri]
            if cache_data[STATE_KEY] == UNSTUCK_STATE:
                # This volume was stuck, but isn't any longer
                del self.vol_state_data[vol_uri]

    def remove_no_longer_transitioning_volumes(self, trans_vols):
        """ Remove volumes that were transitioning, but are no longer in the trans_vols list """

        trans_vol_ids = [str(vol.id) for vol in trans_vols]

        for vol_uri in self.vol_state_data.keys():
            cache_data = self.vol_state_data[vol_uri]
            if cache_data[STATE_KEY] == TRANSITION_STATE and \
               cache_data[VOLUME_ID_KEY] not in trans_vol_ids:
                # This volume was transitioning, but isn't any longer
                del self.vol_state_data[vol_uri]

    def get_cluster_volumes(self):
        """ Return the cluster's volume list """
        volume_list = self.ocutil.get_pvs()['items']
        just_the_aws_path = [
            x['spec']['awsElasticBlockStore']['volumeID'] for x in volume_list
        ]

        just_the_volume_ids = [
            re.sub("^aws://.*/", "", x) for x in just_the_aws_path
        ]

        return just_the_volume_ids
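
A hedged example of the aws:// path reduction above; the volume ID is hypothetical:

import re
aws_path = "aws://us-east-1d/vol-0abc123def456"  # hypothetical PV volumeID
assert re.sub("^aws://.*/", "", aws_path) == "vol-0abc123def456"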

    @staticmethod
    def filter_out_non_cluster_vols(account_vols, cluster_vols):
        """ We have a list of all volumes in the account, return only
            those that are part of this cluster """

        cluster_list = [x for x in account_vols if x.id in cluster_vols]

        return cluster_list

    def run(self):
        """ Run the main logic of this check """

        # Load the state machine data
        self.load_volume_state_data()

        # Get the volumes that are currently in a transitioning state
        full_trans_vols = self.eu.get_trans_attach_status_vols()

        # Get the cluster's list of volumes
        cluster_vols = self.get_cluster_volumes()

        # Remove volumes that aren't part of this cluster
        trans_vols = self.filter_out_non_cluster_vols(full_trans_vols,
                                                      cluster_vols)

        # Based on that list, weed out the volumes that used to be transitioning,
        # that are no longer in the transitioning volumes list. This means that
        # it was a normal volume transition, probably from attaching to attached
        # or detaching to detached (aka None).
        self.remove_no_longer_transitioning_volumes(trans_vols)

        # Check on the volumes that were in the stuck state that are no longer
        # in the transitioning volumes list. This means that they went from stuck
        # to unstuck. We need to track these so that we can report that they've become
        # unstuck to monitoring.
        self.set_unstuck_volumes(trans_vols)

        # Add any volumes that are transitioning that we haven't seen before to our data
        self.add_new_transitioning_volumes(trans_vols)

        # Change volumes that are still transitioning and have hit their deadline to
        # finish that transition to a state of "stuck"
        self.set_stuck_volumes()

        # Report to monitoring the stuck volumes
        self.report_stuck_volumes()

        # Report to monitoring the volumes that were stuck, but are now unstuck (no
        # longer transitioning)
        self.report_unstuck_volumes()

        # Since the unstuck volumes have been reported, they can safely be removed
        # from our tracking now.
        self.remove_unstuck_volumes_from_state_data()

        # Make sure we save state for the next run.
        self.save_volume_state_data()

        self.eu.verbose_print("\nTracking Volumes")
        self.eu.verbose_print("----------------\n")

        # Cat out the state file
        raw_state_file = self.read_raw_volume_state_data()
        self.eu.verbose_print(raw_state_file)
 def __init__(self):
     '''Initialize InfraNodePodStatus'''
     self.kubeconfig = '/tmp/admin.kubeconfig'
     self.oc = OCUtil(namespace='default', config_file=self.kubeconfig)