def get_brick_status_wise_counts(self, cluster_id, bricks):
    """Summarise brick states and alert counts for a cluster.

    :param cluster_id: id of the cluster the bricks belong to
    :param bricks: dict mapping brick path -> brick detail dict
    :returns: dict with 'stopped' and 'total' counters plus critical
        and warning alert counts keyed by the pm_consts constants
    """
    counts = {
        'stopped': 0,
        'total': 0,
        pm_consts.WARNING_ALERTS: 0,
        pm_consts.CRITICAL_ALERTS: 0
    }
    for _path, details in bricks.iteritems():
        counts['total'] += 1
        # Guarded lookup: only bricks that explicitly report a
        # 'Stopped' status are counted as stopped.
        if details.get('status') == 'Stopped':
            counts['stopped'] += 1
    critical, warning = parse_resource_alerts(
        'brick',
        pm_consts.CLUSTER,
        cluster_id=cluster_id
    )
    counts[pm_consts.CRITICAL_ALERTS] = len(critical)
    counts[pm_consts.WARNING_ALERTS] = len(warning)
    return counts
def get_osd_status_wise_counts(self, cluster_id, osds):
    """Summarise osd states and alert counts for a cluster.

    :param cluster_id: id of the cluster the osds belong to
    :param osds: iterable of osd detail dicts (each with a 'state')
    :returns: dict with 'total', 'down' and 'near_full' counters plus
        critical and warning alert counts keyed by pm_consts constants
    """
    counts = {
        'total': 0,
        'down': 0,
        pm_consts.CRITICAL_ALERTS: 0,
        pm_consts.WARNING_ALERTS: 0,
        'near_full': 0
    }
    for osd in osds:
        counts['total'] += 1
        # An osd whose state string does not mention 'up' is down.
        if 'up' not in osd.get('state', ''):
            counts['down'] += 1
    critical, warning = parse_resource_alerts(
        'osd',
        pm_consts.CLUSTER,
        cluster_id=cluster_id
    )
    # near_full is derived from critical osd_utilization alerts.
    counts['near_full'] = sum(
        1 for alert in critical
        if alert['severity'] == pm_consts.CRITICAL and
        alert['resource'] == 'osd_utilization'
    )
    counts[pm_consts.CRITICAL_ALERTS] = len(critical)
    counts[pm_consts.WARNING_ALERTS] = len(warning)
    return counts
def get_brick_status_wise_counts(self, cluster_id, volumes_det):
    """Summarise brick states across all volumes of a cluster.

    :param cluster_id: id of the cluster
    :param volumes_det: dict mapping volume id -> volume detail dict;
        each volume detail may carry a 'Bricks' dict of brick details
    :returns: dict with 'stopped' and 'total' counters plus critical
        and warning alert counts keyed by the pm_consts constants
    """
    brick_status_wise_counts = {
        'stopped': 0,
        'total': 0,
        pm_consts.WARNING_ALERTS: 0,
        pm_consts.CRITICAL_ALERTS: 0
    }
    try:
        for volume_id, volume_det in volumes_det.iteritems():
            for brick_path, brick_det in volume_det.get(
                    'Bricks', {}).iteritems():
                # Guarded lookup: a brick entry without a 'status' key
                # must not raise KeyError and abort counting for every
                # remaining brick/volume via the except below.
                if brick_det.get('status') == 'Stopped':
                    brick_status_wise_counts['stopped'] += 1
                brick_status_wise_counts['total'] += 1
    except Exception as ex:
        # Best-effort: log and fall through with whatever was counted.
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Exception caught computing brick "
                               "status wise counts",
                    "exception": ex
                }
            )
        )
    crit_alerts, warn_alerts = parse_resource_alerts(
        'brick',
        pm_consts.CLUSTER,
        cluster_id=cluster_id
    )
    brick_status_wise_counts[pm_consts.CRITICAL_ALERTS] = len(crit_alerts)
    brick_status_wise_counts[pm_consts.WARNING_ALERTS] = len(warn_alerts)
    return brick_status_wise_counts
def get_volume_status_wise_counts(self, cluster_id, volumes):
    """Summarise volume states and alert counts for a cluster.

    :param cluster_id: id of the cluster
    :param volumes: dict mapping volume id -> volume detail dict
    :returns: dict with 'down', 'total' and 'degraded' counters plus
        critical and warning alert counts keyed by pm_consts constants
    """
    volume_status_wise_counts = {
        'down': 0,
        'total': 0,
        'degraded': 0,
        pm_consts.CRITICAL_ALERTS: 0,
        pm_consts.WARNING_ALERTS: 0
    }
    for vol_id, vol_det in volumes.iteritems():
        # Default to '' so a volume without a status is counted as
        # down instead of raising TypeError ('Started' not in None).
        if 'Started' not in vol_det.get('status', ''):
            volume_status_wise_counts['down'] += 1
        volume_status_wise_counts['total'] += 1
    volumes_up_degraded = 0
    try:
        volumes_up_degraded = NS._int.client.read(
            '/clusters/%s/GlobalDetails/volume_up_degraded' % cluster_id
        ).value
    except EtcdKeyNotFound:
        # Missing key simply means no degraded volumes were recorded.
        pass
    volume_status_wise_counts['degraded'] = int(volumes_up_degraded or 0)
    crit_alerts, warn_alerts = parse_resource_alerts(
        'volume',
        pm_consts.CLUSTER,
        cluster_id=cluster_id
    )
    volume_status_wise_counts[pm_consts.CRITICAL_ALERTS] = len(crit_alerts)
    volume_status_wise_counts[pm_consts.WARNING_ALERTS] = len(warn_alerts)
    return volume_status_wise_counts
def get_clusters_status_wise_counts(self, cluster_summaries):
    """Aggregate status-wise and alert counts for clusters of this sds.

    Only clusters whose TendrlContext sds_name matches ``self.name``
    contribute to the counters.

    :param cluster_summaries: iterable of objects with a cluster_id
    :returns: dict with a per-status 'status' map (incl. 'total'),
        a 'near_full' counter and critical/warning alert counts
    """
    clusters_status_wise_counts = {
        'status': {'total': 0},
        'near_full': 0,
        pm_consts.CRITICAL_ALERTS: 0,
        pm_consts.WARNING_ALERTS: 0
    }
    cluster_alerts = []
    for cluster_summary in cluster_summaries:
        cluster_tendrl_context = {}
        cluster_status = {}
        # Dropped an unused `sds_name` lookup that the original made
        # here: its result was never read and, being outside the try,
        # an EtcdKeyNotFound from it would have propagated uncaught.
        try:
            cluster_tendrl_context = central_store_util.read(
                '/clusters/%s/TendrlContext' % cluster_summary.cluster_id
            )
            cluster_status = central_store_util.read(
                '/clusters/%s/GlobalDetails' % cluster_summary.cluster_id
            )
            cluster_status = cluster_status.get('status')
        except EtcdKeyNotFound:
            # NOTE(review): aborts on the first missing cluster and
            # returns the partial counts gathered so far -- presumably
            # intentional; confirm a `continue` is not wanted here.
            return clusters_status_wise_counts
        # Default to '' so a missing sds_name is treated as a
        # non-matching cluster instead of raising TypeError.
        if self.name in cluster_tendrl_context.get('sds_name', ''):
            if cluster_status:
                if (
                    cluster_status not in
                    clusters_status_wise_counts['status']
                ):
                    clusters_status_wise_counts['status'][
                        cluster_status] = 1
                else:
                    clusters_status_wise_counts['status'][
                        cluster_status] += 1
                clusters_status_wise_counts['status']['total'] += 1
            cluster_critical_alerts, cluster_warning_alerts = \
                parse_resource_alerts(
                    None,
                    pm_consts.CLUSTER,
                    cluster_id=cluster_summary.cluster_id
                )
            cluster_alerts.extend(cluster_critical_alerts)
            cluster_alerts.extend(cluster_warning_alerts)
            clusters_status_wise_counts[pm_consts.CRITICAL_ALERTS] += \
                len(cluster_critical_alerts)
            clusters_status_wise_counts[pm_consts.WARNING_ALERTS] += \
                len(cluster_warning_alerts)
    # near_full is derived from critical cluster_utilization alerts.
    for cluster_alert in cluster_alerts:
        if (
            cluster_alert['severity'] == pm_consts.CRITICAL and
            cluster_alert['resource'] == 'cluster_utilization'
        ):
            clusters_status_wise_counts['near_full'] += 1
    return clusters_status_wise_counts
def get_node_osd_status_wise_counts(self, node_id):
    """Summarise osd states and alert counts for the osds on one node.

    Osds are attributed to the node by matching the node's indexed ip
    against each osd's cluster_addr/public_addr.

    :param node_id: id of the node
    :returns: dict with 'total' and 'down' counters plus critical and
        warning alert counts keyed by the pm_consts constants
    """
    osds_in_node = []
    osd_status_wise_counts = {
        'total': 0,
        'down': 0,
        pm_consts.CRITICAL_ALERTS: 0,
        pm_consts.WARNING_ALERTS: 0
    }
    cluster_id = central_store_util.get_node_cluster_id(node_id)
    node_ip = ''
    ip_indexes = etcd_read_key('/indexes/ip')
    for ip, indexed_node_id in ip_indexes.iteritems():
        if node_id == indexed_node_id:
            node_ip = ip
    try:
        osds = etcd_read_key(
            '/clusters/%s/maps/osd_map/data/osds' % cluster_id
        )
        # The osd list is stored as a python-literal string in etcd.
        osds = ast.literal_eval(osds.get('osds', '[]'))
        for osd in osds:
            if (
                node_ip in osd.get('cluster_addr', '') or
                node_ip in osd.get('public_addr', '')
            ):
                osds_in_node.append(osd.get('osd'))
                # Default '' so a missing state counts the osd as down
                # instead of raising TypeError and aborting the whole
                # computation via the except below.
                if 'up' not in osd.get('state', ''):
                    osd_status_wise_counts['down'] += 1
                osd_status_wise_counts['total'] += 1
        crit_alerts, warn_alerts = parse_resource_alerts(
            'osd',
            pm_consts.CLUSTER,
            cluster_id=cluster_id
        )
        # Alerts carry plugin_instance 'osd_<id>'; only those whose id
        # belongs to this node are counted.
        count = 0
        for alert in crit_alerts:
            plugin_instance = alert['tags'].get('plugin_instance', '')
            if int(plugin_instance[len('osd_'):]) in osds_in_node:
                count += 1
        osd_status_wise_counts[pm_consts.CRITICAL_ALERTS] = count
        count = 0
        for alert in warn_alerts:
            plugin_instance = alert['tags'].get('plugin_instance', '')
            if int(plugin_instance[len('osd_'):]) in osds_in_node:
                count += 1
        osd_status_wise_counts[pm_consts.WARNING_ALERTS] = count
    except Exception as ex:
        # Best-effort: log and return whatever was counted so far.
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Exception caught computing node osd "
                               "counts",
                    "exception": ex
                }
            )
        )
    return osd_status_wise_counts
def get_rbd_status_wise_counts(self, cluster_id, rbds):
    """Return total and alert counters for rbds of a cluster.

    Rbds expose no status, so only the total and the alert counters
    are reported.

    :param cluster_id: id of the cluster
    :param rbds: collection of rbds (only its length is used)
    """
    counts = {
        pm_consts.CRITICAL_ALERTS: 0,
        pm_consts.WARNING_ALERTS: 0,
        pm_consts.TOTAL: len(rbds)
    }
    critical, warning = parse_resource_alerts(
        'rbd',
        pm_consts.CLUSTER,
        cluster_id=cluster_id
    )
    counts[pm_consts.CRITICAL_ALERTS] = len(critical)
    counts[pm_consts.WARNING_ALERTS] = len(warning)
    return counts
def get_node_brick_status_counts(self, node_id):
    """Summarise brick states and alert counts for one node.

    Bricks are attributed to the node when their 'hostname' matches
    either the node's name or its indexed ip.

    :param node_id: id of the node
    :returns: dict with 'stopped' and 'total' counters plus critical
        and warning alert counts keyed by the pm_consts constants
    """
    node_name = central_store_util.get_node_name_from_id(node_id)
    ip_indexes = etcd_read_key('/indexes/ip')
    node_ip = ''
    for ip, indexed_node_id in ip_indexes.iteritems():
        if node_id == indexed_node_id:
            node_ip = ip
    brick_status_wise_counts = {
        'stopped': 0,
        'total': 0,
        pm_consts.WARNING_ALERTS: 0,
        pm_consts.CRITICAL_ALERTS: 0
    }
    try:
        cluster_id = central_store_util.get_node_cluster_id(node_id)
        if cluster_id:
            volumes_det = self.get_cluster_volumes(cluster_id)
            for volume_id, volume_det in volumes_det.iteritems():
                for brick_path, brick_det in volume_det.get(
                        'Bricks', {}).iteritems():
                    if (
                        brick_det['hostname'] == node_name or
                        brick_det['hostname'] == node_ip
                    ):
                        # Guarded lookup: a brick entry without a
                        # 'status' key must not raise KeyError and
                        # abort counting for all remaining bricks.
                        if brick_det.get('status') == 'Stopped':
                            brick_status_wise_counts['stopped'] += 1
                        brick_status_wise_counts['total'] += 1
            crit_alerts, warn_alerts = parse_resource_alerts(
                'brick',
                pm_consts.CLUSTER,
                cluster_id=cluster_id
            )
            # Only alerts raised for this node are counted.
            count = 0
            for alert in crit_alerts:
                if alert['node_id'] == node_id:
                    count += 1
            brick_status_wise_counts[pm_consts.CRITICAL_ALERTS] = count
            count = 0
            for alert in warn_alerts:
                if alert['node_id'] == node_id:
                    count += 1
            brick_status_wise_counts[pm_consts.WARNING_ALERTS] = count
    except Exception as ex:
        # Best-effort: log and return whatever was counted so far.
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Exception caught fetching node brick"
                               " status wise counts",
                    "exception": ex
                }
            )
        )
    return brick_status_wise_counts
def get_pool_status_wise_counts(self, cluster_id, pools):
    """Return total and alert counters for pools of a cluster.

    Pools expose no status, so only the total and the alert counters
    are reported.

    :param cluster_id: id of the cluster
    :param pools: dict of pools (only its size is used)
    """
    counts = {
        pm_consts.CRITICAL_ALERTS: 0,
        pm_consts.WARNING_ALERTS: 0,
        pm_consts.TOTAL: 0
    }
    counts[pm_consts.TOTAL] = len(pools)
    critical, warning = parse_resource_alerts(
        'pool',
        pm_consts.CLUSTER,
        cluster_id=cluster_id
    )
    counts[pm_consts.CRITICAL_ALERTS] = len(critical)
    counts[pm_consts.WARNING_ALERTS] = len(warning)
    return counts
def get_node_brick_status_counts(self, node_id):
    """Summarise brick states and alert counts for one node.

    Bricks are attributed to the node when their 'hostname' matches
    either the node's name or its first indexed ip.

    :param node_id: id of the node
    :returns: dict with 'stopped' and 'total' counters plus critical
        and warning alert counts keyed by the pm_consts constants;
        the zeroed dict is returned early if the node name or the ip
        index cannot be read
    """
    brick_status_wise_counts = {
        'stopped': 0,
        'total': 0,
        pm_consts.WARNING_ALERTS: 0,
        pm_consts.CRITICAL_ALERTS: 0
    }
    # Resolve the node's name; without it bricks cannot be matched,
    # so log and bail out with the zeroed counters.
    try:
        node_name = central_store_util.get_node_name_from_id(node_id)
    except EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Error fetching node name for node "
                               "%s" % node_id,
                    "exception": ex
                }
            )
        )
        return brick_status_wise_counts
    # The ip index is likewise required for hostname-by-ip matching.
    try:
        ip_indexes = etcd_read_key('/indexes/ip')
    except EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Error fetching ip indexes",
                    "exception": ex
                }
            )
        )
        return brick_status_wise_counts
    node_ip = ''
    # First ip indexed for this node wins.
    for ip, indexed_node_id in ip_indexes.iteritems():
        if node_id == indexed_node_id:
            node_ip = ip
            break
    try:
        cluster_id = central_store_util.get_node_cluster_id(
            node_id
        )
        if cluster_id:
            bricks = self.get_cluster_bricks(cluster_id)
            for brick_path, brick_det in bricks.iteritems():
                # A brick belongs to this node if its hostname matches
                # the node name or the node ip.
                if (
                    brick_det['hostname'] == node_name or
                    brick_det['hostname'] == node_ip
                ):
                    if (
                        'status' in brick_det and
                        brick_det['status'] == 'Stopped'
                    ):
                        brick_status_wise_counts['stopped'] = \
                            brick_status_wise_counts['stopped'] + 1
                    brick_status_wise_counts['total'] = \
                        brick_status_wise_counts['total'] + 1
            crit_alerts, warn_alerts = parse_resource_alerts(
                'brick',
                pm_consts.CLUSTER,
                cluster_id=cluster_id
            )
            # Only alerts raised for this node are counted.
            count = 0
            for alert in crit_alerts:
                if alert['node_id'] == node_id:
                    count = count + 1
            brick_status_wise_counts[
                pm_consts.CRITICAL_ALERTS
            ] = count
            count = 0
            for alert in warn_alerts:
                if alert['node_id'] == node_id:
                    count = count + 1
            brick_status_wise_counts[
                pm_consts.WARNING_ALERTS
            ] = count
    except (
        TendrlPerformanceMonitoringException,
        AttributeError,
        ValueError,
        KeyError
    ) as ex:
        # Best-effort: log and return whatever was counted so far.
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Exception caught fetching node brick"
                               " status wise counts",
                    "exception": ex
                }
            )
        )
    return brick_status_wise_counts