def get_clusters_status_wise_counts(self, cluster_summaries):
    """Aggregate status-wise cluster counts and alert totals for this sds.

    Only clusters whose TendrlContext ``sds_name`` matches ``self.name``
    are counted. For each, the cluster status from GlobalDetails is
    tallied (plus a grand 'total'), cluster-level critical/warning alerts
    are accumulated, and 'near_full' is derived from critical
    'cluster_utilization' alerts.

    :param cluster_summaries: iterable of objects exposing ``cluster_id``.
    :returns: dict with keys 'status' (per-status counts and 'total'),
        'near_full', ``pm_consts.CRITICAL_ALERTS`` and
        ``pm_consts.WARNING_ALERTS``.
    """
    clusters_status_wise_counts = {
        'status': {
            'total': 0
        },
        'near_full': 0,
        pm_consts.CRITICAL_ALERTS: 0,
        pm_consts.WARNING_ALERTS: 0
    }
    cluster_alerts = []
    for cluster_summary in cluster_summaries:
        try:
            cluster_tendrl_context = central_store_util.read(
                '/clusters/%s/TendrlContext' % cluster_summary.cluster_id)
            cluster_status = central_store_util.read(
                '/clusters/%s/GlobalDetails' % cluster_summary.cluster_id
            ).get('status')
        except EtcdKeyNotFound:
            # FIX: a missing key for one cluster previously returned the
            # partial result, silently dropping all remaining clusters.
            # Skip only the faulty cluster instead.
            continue
        # Guard: 'sds_name' may be absent; `x in None` would raise
        # TypeError in the original.
        if self.name in (cluster_tendrl_context.get('sds_name') or ''):
            if cluster_status:
                status_counts = clusters_status_wise_counts['status']
                status_counts[cluster_status] = \
                    status_counts.get(cluster_status, 0) + 1
                status_counts['total'] = status_counts['total'] + 1
            cluster_critical_alerts, cluster_warning_alerts = \
                parse_resource_alerts(
                    None,
                    pm_consts.CLUSTER,
                    cluster_id=cluster_summary.cluster_id
                )
            cluster_alerts.extend(cluster_critical_alerts)
            cluster_alerts.extend(cluster_warning_alerts)
            clusters_status_wise_counts[pm_consts.CRITICAL_ALERTS] = \
                clusters_status_wise_counts[pm_consts.CRITICAL_ALERTS] + \
                len(cluster_critical_alerts)
            clusters_status_wise_counts[pm_consts.WARNING_ALERTS] = \
                clusters_status_wise_counts[pm_consts.WARNING_ALERTS] + \
                len(cluster_warning_alerts)
    for cluster_alert in cluster_alerts:
        if (
            cluster_alert['severity'] == pm_consts.CRITICAL and
            cluster_alert['resource'] == 'cluster_utilization'
        ):
            clusters_status_wise_counts['near_full'] = \
                clusters_status_wise_counts.get('near_full', 0) + 1
    return clusters_status_wise_counts
def configure_monitoring(self, integration_id):
    """Delegate monitoring configuration to the plugin owning the cluster.

    Reads the cluster's TendrlContext and hands it to the SDS plugin whose
    ``name`` matches the context's ``sds_name``. Returns ``None`` when the
    context is missing or unreadable, or when no matching plugin exists
    (the latter two cases are logged at debug level).
    """
    try:
        sds_tendrl_context = central_store_util.read(
            'clusters/%s/TendrlContext' % integration_id)
    except EtcdKeyNotFound:
        return None
    except EtcdException as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Failed to configure monitoring for '
                    'cluster %s as tendrl context could '
                    'not be fetched.' % integration_id,
                    "exception": ex
                }))
        return
    sds_name = sds_tendrl_context['sds_name']
    # First registered plugin with a matching name wins, as before.
    plugin = next(
        (p for p in SDSPlugin.plugins if p.name == sds_name),
        None
    )
    if plugin is not None:
        return plugin.configure_monitoring(sds_tendrl_context)
    Event(
        Message(
            priority="debug",
            publisher=NS.publisher_id,
            payload={
                "message": 'No plugin defined for %s. Hence cannot '
                'configure it' % sds_name
            }))
    return None
def parse_cluster(self, cluster_id):
    """Build a ClusterSummary for *cluster_id* from central-store data.

    FIX: the utilization read previously had no error handling, so a
    cluster with no Utilization key aborted summary generation with
    EtcdKeyNotFound. Missing utilization is now logged at debug level and
    treated as zeros, matching the error handling used by the other
    summary parsers in this module.

    :param cluster_id: id of the cluster to summarise.
    :returns: ClusterSummary with utilization, iops, host counts,
        per-node summaries and sds-specific details.
    """
    utilization = {}
    try:
        utilization = central_store_util.read(
            '/clusters/%s/Utilization' % cluster_id
        )
    except (EtcdKeyNotFound, AttributeError, EtcdException) as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Utilization not available for cluster'
                    ' %s.' % cluster_id,
                    "exception": ex
                }))
    # Field names differ between sds types; prefer the *_capacity
    # variants and fall back to the generic ones.
    used = utilization.get('used_capacity') or utilization.get('used') or 0
    total = utilization.get('raw_capacity') or utilization.get('total') or 0
    percent_used = utilization.get('pcnt_used') or 0
    return ClusterSummary(
        utilization={
            'total': int(total),
            'used': int(used),
            'percent_used': float(percent_used)
        },
        iops=str(self.get_cluster_iops(cluster_id)),
        hosts_count=self.parse_host_count(cluster_id),
        sds_type=central_store_util.get_cluster_sds_name(cluster_id),
        node_summaries=self.cluster_nodes_summary(
            cluster_id
        ),
        sds_det=NS.sds_monitoring_manager.get_cluster_summary(
            cluster_id,
            central_store_util.get_cluster_name(cluster_id)
        ),
        cluster_id=cluster_id,
    )
def parse_cluster(self, cluster_id):
    """Build a ClusterSummary for *cluster_id*, tolerating missing data.

    Utilization and sds-name lookups are best-effort: failures are logged
    at debug level and defaults are used instead.

    FIX: if ``get_cluster_sds_name`` raised, the handler logged the error
    but fell through to the ``return ClusterSummary(...)`` with
    ``sds_name`` never assigned, turning the handled etcd error into a
    NameError. ``sds_name`` is now initialized before the try block.

    :param cluster_id: id of the cluster to summarise.
    :returns: ClusterSummary with utilization, iops, host counts,
        per-node summaries and sds-specific details.
    """
    utilization = {}
    try:
        utilization = central_store_util.read(
            '/clusters/%s/Utilization' % cluster_id)
    except (EtcdKeyNotFound, AttributeError, EtcdException) as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Utilization not available for cluster'
                    ' %s.' % cluster_id,
                    "exception": ex
                }))
    used = 0
    total = 0
    percent_used = 0
    # Field names differ between sds types; prefer the *_capacity
    # variants and fall back to the generic ones.
    if utilization.get('used_capacity'):
        used = utilization.get('used_capacity')
    elif utilization.get('used'):
        used = utilization.get('used')
    if utilization.get('raw_capacity'):
        total = utilization.get('raw_capacity')
    elif utilization.get('total'):
        total = utilization.get('total')
    if utilization.get('pcnt_used'):
        percent_used = utilization.get('pcnt_used')
    # Initialize so a failed lookup below cannot leave the name unbound.
    sds_name = ''
    try:
        sds_name = central_store_util.get_cluster_sds_name(cluster_id)
    except (EtcdKeyNotFound, EtcdException, AttributeError) as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Error caught fetching sds name of'
                    ' cluster %s.' % cluster_id,
                    "exception": ex
                }))
    return ClusterSummary(
        utilization={
            'total': int(total),
            'used': int(used),
            'percent_used': float(percent_used)
        },
        iops=str(self.get_cluster_iops(cluster_id)),
        hosts_count=self.parse_host_count(cluster_id),
        sds_type=sds_name,
        node_summaries=self.cluster_nodes_summary(cluster_id),
        sds_det=NS.sds_monitoring_manager.get_cluster_summary(
            cluster_id,
            central_store_util.get_cluster_name(cluster_id)),
        cluster_id=cluster_id,
    )
def parse_host_count(self, cluster_id):
    """Summarise node states and node-alert counts for a cluster.

    Returns a dict with 'total' and 'down' node counts (only nodes whose
    context exposes a status are counted; any status other than 'UP' is
    'down') plus 'crit_alert_count' and 'warn_alert_count' accumulated
    from each node's alerts. Nodes whose context cannot be read are
    logged at debug level and skipped.
    """
    counts = {
        'total': 0,
        'down': 0,
        'crit_alert_count': 0,
        'warn_alert_count': 0
    }
    for node_id in central_store_util.get_cluster_node_ids(cluster_id):
        try:
            node_context = central_store_util.read(
                '/clusters/%s/nodes/%s/NodeContext' % (cluster_id, node_id))
        except (EtcdKeyNotFound, AttributeError, EtcdException) as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": 'Failed to fetch node-context from'
                        ' /clusters/%s/nodes/%s/NodeContext' % (
                            cluster_id, node_id),
                        "exception": ex
                    }))
            continue
        status = node_context.get('status')
        if status:
            if status != 'UP':
                counts['down'] += 1
            counts['total'] += 1
        alerts = []
        try:
            alerts = central_store_util.get_node_alerts(node_id)
        except EtcdKeyNotFound:
            # No alerts recorded for this node; nothing to count.
            pass
        except (AttributeError, EtcdException) as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": 'Error fetching alerts for node %s' % (
                            node_id),
                        "exception": ex
                    }))
        for alert in alerts:
            severity = alert.get('severity')
            if severity == 'CRITICAL':
                counts['crit_alert_count'] += 1
            elif severity == 'WARNING':
                counts['warn_alert_count'] += 1
    return counts
def get_node_services_count(self, node_id):
    """Fetch the Services mapping of a node from the central store.

    Returns an empty dict (after logging a debug event) when the node has
    no Services entry.
    """
    try:
        return central_store_util.read('nodes/%s/Services' % node_id)
    except EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Failed to fetch services of '
                    'node %s' % node_id,
                    "exception": ex
                }))
        return {}
def get_services_count(self, cluster_node_ids):
    """Count running / not-running instances of supported services.

    Walks every node's Services mapping and returns
    ``{service_name: {'running': int, 'not_running': int}}`` restricted
    to ``self.supported_services``. Nodes without a Services entry and
    malformed service records are logged at debug level and skipped.
    """
    totals = {}
    for node_id in cluster_node_ids:
        try:
            services = central_store_util.read(
                'nodes/%s/Services' % node_id)
        except EtcdKeyNotFound as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": 'Failed to fetch services of '
                        'node %s' % node_id,
                        "exception": ex
                    }))
            continue
        for name, details in services.iteritems():
            try:
                if name not in self.supported_services:
                    continue
                # Reuse the existing counter for this service, or start
                # a fresh one; only commit it to `totals` once the
                # record parsed cleanly (matches original semantics).
                counter = totals.get(
                    name, {'running': 0, 'not_running': 0})
                if details['exists'] == 'True':
                    if details['running'] == 'True':
                        counter['running'] += 1
                    else:
                        counter['not_running'] += 1
                totals[name] = counter
            except (ValueError, AttributeError, KeyError) as ex:
                Event(
                    ExceptionMessage(
                        priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message": 'Failed to parse services of '
                            'node %s' % node_id,
                            "exception": ex
                        }))
                continue
    return totals
def get_services_count(self, cluster_node_ids):
    """Count running / not-running instances of supported services.

    Walks every node's Services mapping and returns
    ``{service_name: {'running': int, 'not_running': int}}`` restricted
    to ``self.supported_services``.

    FIX: a single node with no Services entry previously raised
    EtcdKeyNotFound out of the loop and aborted the whole aggregation;
    such nodes are now skipped, consistent with the error-handling
    variant of this method elsewhere in this module.
    """
    node_service_counts = {}
    for node_id in cluster_node_ids:
        try:
            services = central_store_util.read(
                'nodes/%s/Services' % node_id)
        except EtcdKeyNotFound:
            # Node has no services recorded; nothing to count.
            continue
        for service_name, service_det in services.iteritems():
            if service_name not in self.supported_services:
                continue
            service_counter = node_service_counts.get(
                service_name, {'running': 0, 'not_running': 0})
            if service_det['exists'] == 'True':
                if service_det['running'] == 'True':
                    service_counter['running'] += 1
                else:
                    service_counter['not_running'] += 1
            node_service_counts[service_name] = service_counter
    return node_service_counts
def cluster_nodes_summary(self, cluster_id):
    """Collect the monitoring summaries of every node in a cluster.

    Nodes whose summary cannot be read are logged at debug level and
    skipped, so the result may be shorter than the cluster's node list.
    """
    summaries = []
    for node_id in central_store_util.get_cluster_node_ids(cluster_id):
        try:
            summaries.append(
                central_store_util.read(
                    '/monitoring/summary/nodes/%s' % node_id))
        except (EtcdKeyNotFound, AttributeError, EtcdException) as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": 'Error caught fetching node summary of'
                        ' node %s.' % node_id,
                        "exception": ex
                    }))
    return summaries
def get_node_services_count(self, node_id):
    """Fetch the Services mapping of a node from the central store.

    FIX: a node without a Services entry previously propagated
    EtcdKeyNotFound to the caller, while the error-handling variant of
    this method elsewhere in this module returns an empty dict. Return
    ``{}`` on a missing key for consistency.
    """
    try:
        return central_store_util.read('nodes/%s/Services' % node_id)
    except EtcdKeyNotFound:
        return {}