def get_mon_status_wise_counts(self, cluster_id):
    """Return monitor counts for a ceph cluster.

    :param cluster_id: integration id of the cluster in etcd.
    :return: dict with keys 'total' (all mons in mon_map) and
        'outside_quorum' (mons listed outside quorum in mon_status).
        Best effort: on missing/malformed etcd data the zeroed
        defaults are returned.
    """
    mon_status_wise_counts = {
        'outside_quorum': 0,
        'total': 0
    }
    try:
        mons = etcd_read_key(
            '/clusters/%s/maps/mon_map/data' % cluster_id
        )
        mons = json.loads(mons.get('data', '{}'))
        mons = mons['mons']
        mon_status_wise_counts['total'] = len(mons)
        outside_quorum = etcd_read_key(
            '/clusters/%s/maps/mon_status/data' % cluster_id
        )
        outside_quorum = json.loads(outside_quorum.get('data', '{}'))
        outside_quorum = outside_quorum.get('outside_quorum', [])
        mon_status_wise_counts['outside_quorum'] = len(outside_quorum)
    except EtcdKeyNotFound:
        # Maps not yet synced for this cluster -- defaults are fine.
        pass
    except (
        KeyError,
        ValueError,
        TendrlPerformanceMonitoringException
    ) as ex:
        # KeyError added: a mon_map payload without a 'mons' entry
        # previously escaped this handler and crashed the caller.
        # ValueError covers malformed JSON in either payload.
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Exception caught computing mon status "
                    "wise counts",
                    "exception": ex
                }
            )
        )
    return mon_status_wise_counts
def get_node_osd_status_wise_counts(self, node_id):
    """Return OSD rollup counts for a single node.

    :param node_id: node whose OSDs are counted.
    :return: dict with 'total', 'down' and critical/warning alert
        counters (keys from pm_consts) for OSDs hosted on the node.
        Best effort: any failure is logged and the counts gathered so
        far are returned.
    """
    osds_in_node = []
    osd_status_wise_counts = {
        'total': 0,
        'down': 0,
        pm_consts.CRITICAL_ALERTS: 0,
        pm_consts.WARNING_ALERTS: 0
    }
    cluster_id = central_store_util.get_node_cluster_id(node_id)
    node_ip = ''
    ip_indexes = etcd_read_key('/indexes/ip')
    for ip, indexed_node_id in ip_indexes.iteritems():
        if node_id == indexed_node_id:
            node_ip = ip
            break  # one matching index entry is enough
    try:
        osds = etcd_read_key(
            '/clusters/%s/maps/osd_map/data/osds' % cluster_id
        )
        osds = ast.literal_eval(osds.get('osds', '[]'))
        for osd in osds:
            # Guard on node_ip: the empty string is a substring of
            # every address, so an unresolved node ip previously
            # attributed ALL cluster OSDs to this node.
            if node_ip and (
                node_ip in osd.get('cluster_addr', '') or
                node_ip in osd.get('public_addr', '')
            ):
                osds_in_node.append(osd.get('osd'))
                if 'up' not in osd.get('state'):
                    osd_status_wise_counts['down'] = \
                        osd_status_wise_counts['down'] + 1
                osd_status_wise_counts['total'] = \
                    osd_status_wise_counts['total'] + 1
        crit_alerts, warn_alerts = parse_resource_alerts(
            'osd',
            pm_consts.CLUSTER,
            cluster_id=cluster_id
        )
        # Alerts carry a plugin_instance tag of the form 'osd_<id>';
        # count only alerts for OSDs that live on this node.
        count = 0
        for alert in crit_alerts:
            plugin_instance = alert['tags'].get('plugin_instance', '')
            if int(plugin_instance[len('osd_'):]) in osds_in_node:
                count = count + 1
        osd_status_wise_counts[pm_consts.CRITICAL_ALERTS] = count
        count = 0
        for alert in warn_alerts:
            plugin_instance = alert['tags'].get('plugin_instance', '')
            if int(plugin_instance[len('osd_'):]) in osds_in_node:
                count = count + 1
        osd_status_wise_counts[pm_consts.WARNING_ALERTS] = count
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Exception caught computing node osd "
                    "counts",
                    "exception": ex
                }
            )
        )
    return osd_status_wise_counts
def get_mon_status_wise_counts(self, cluster_id):
    """Return {'outside_quorum': N, 'total': M} for the cluster's mons.

    Silent best-effort variant: missing or malformed etcd payloads
    leave the zeroed defaults in place instead of raising.
    """
    mon_status_wise_counts = {'outside_quorum': 0, 'total': 0}
    try:
        mons = etcd_read_key(
            '/clusters/%s/maps/mon_map/data' % cluster_id
        )
        mons = json.loads(mons.get('data', '{}'))
        mons = mons['mons']
        mon_status_wise_counts['total'] = len(mons)
        outside_quorum = etcd_read_key(
            '/clusters/%s/maps/mon_status/data' % cluster_id)
        outside_quorum = json.loads(outside_quorum.get('data', '{}'))
        outside_quorum = outside_quorum.get('outside_quorum', [])
        mon_status_wise_counts['outside_quorum'] = len(outside_quorum)
    except (EtcdKeyNotFound, KeyError, ValueError):
        # KeyError: mon_map payload without 'mons'; ValueError: bad
        # JSON. Both previously escaped and crashed the caller even
        # though this helper is meant to be best effort.
        pass
    return mon_status_wise_counts
def get_nw_node_interfaces(self, node_id, nw_type, cluster_id):
    """Best-effort list of interface names on *node_id* belonging to
    the cluster subnet named by *nw_type* (e.g. 'cluster_network').

    Returns an empty list when the subnet is unset or any lookup
    fails.
    """
    interfaces = []
    try:
        cluster_conf = etcd_read_key(
            '/clusters/%s/maps/config/data' % (cluster_id)
        )
        subnet = json.loads(cluster_conf.get('data', '{}')).get(nw_type, '')
        if subnet:
            # Subnets are stored with '/' flattened to '_' in etcd keys.
            subnet_key = subnet.replace('/', '_')
            node_ifaces = etcd_read_key(
                '/networks/%s/%s' % (subnet_key, node_id)
            )
            for _iface_id, iface_det in node_ifaces.iteritems():
                interfaces.append(iface_det.get('interface'))
    except Exception:
        # Deliberate best-effort swallow: callers expect [] on failure.
        pass
    return interfaces
def get_cluster_osds(self, cluster_id):
    """Return the list of OSD dicts from the cluster's osd_map.

    :param cluster_id: integration id of the cluster in etcd.
    :return: list of osd dicts; [] when the map is missing or has no
        'osds' entry. Previously the raw ``.get('osds')`` could leak
        ``None`` to callers (the sibling variant already guards this).
    """
    osds = []
    try:
        osd_data = etcd_read_key(
            '/clusters/%s/maps/osd_map' % cluster_id
        )
        osd_data = json.loads(osd_data['data'])
        # 'or []' normalises a missing/empty 'osds' entry to a list.
        osds = osd_data.get('osds') or []
    except EtcdKeyNotFound:
        pass
    return osds
def get_cluster_pools(self, cluster_id):
    """Map pool-id -> pool detail dict for *cluster_id*.

    Best effort: returns whatever pools were read before the first
    failure (possibly {}).
    """
    pools = {}
    try:
        pool_ids = self.get_cluster_pool_ids(cluster_id)
        for pool_id in pool_ids:
            pool = etcd_read_key(
                '/clusters/%s/Pools/%s' % (cluster_id, pool_id)
            )
            pools[pool_id] = pool
    except (EtcdKeyNotFound, TendrlPerformanceMonitoringException):
        # TendrlPerformanceMonitoringException added so that a failure
        # inside get_cluster_pool_ids degrades gracefully too, matching
        # the guarded variant of this helper.
        pass
    return pools
def get_cluster_osds(self, cluster_id):
    """Return the cluster's OSD list from its osd_map in etcd.

    Gives [] when the map is absent, a lookup helper fails, or the
    parsed map carries no 'osds' entry.
    """
    try:
        raw_map = etcd_read_key(
            '/clusters/%s/maps/osd_map' % cluster_id
        )
        parsed = json.loads(raw_map['data'])
    except (EtcdKeyNotFound, TendrlPerformanceMonitoringException):
        return []
    osd_list = parsed.get('osds')
    # Normalise a missing/empty entry to an empty list.
    return osd_list if osd_list else []
def get_nw_node_interfaces(self, node_id, nw_type, cluster_id):
    """Return interface names of *node_id* on the cluster's *nw_type*
    subnet (e.g. 'cluster_network' / 'public_network').

    Missing keys or malformed payloads yield []; a monitoring-layer
    failure is additionally logged as a debug event.
    """
    result = []
    try:
        conf = etcd_read_key(
            '/clusters/%s/maps/config/data' % (
                cluster_id
            )
        )
        subnet = json.loads(conf.get('data', '{}')).get(nw_type, '')
        if subnet:
            # etcd keys flatten the CIDR slash to an underscore.
            flat_subnet = subnet.replace('/', '_')
            subnet_nodes = etcd_read_key(
                '/networks/%s/%s' % (flat_subnet, node_id)
            )
            for _if_id, if_det in subnet_nodes.iteritems():
                result.append(if_det.get('interface'))
    except (
        EtcdKeyNotFound,
        ValueError,
        TypeError,
        AttributeError
    ):
        # Expected data gaps -- best effort, return what we have.
        pass
    except TendrlPerformanceMonitoringException as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Error fetching %s n/w info for node "
                    "%s" % (nw_type, node_id),
                    "exception": ex
                }
            )
        )
    return result
def get_pg_counts(self, cluster_id):
    """Return the per-state PG counters for *cluster_id*.

    Reads the pg_summary map, unwraps its 'all' section (which may be
    stored as a Python-literal string) and delegates the counting to
    _calculate_pg_counters. Returns {} when the map is missing or has
    no 'all' section.
    """
    try:
        raw = etcd_read_key(
            '/clusters/%s/maps/pg_summary/data' % cluster_id
        )
        summary = json.loads(raw.get('data', '{}'))
        if 'all' not in summary:
            return {}
        all_pgs = summary['all']
        if isinstance(all_pgs, basestring):
            # Stored as a repr() string -- decode it back to a dict.
            all_pgs = ast.literal_eval(all_pgs)
        return _calculate_pg_counters(all_pgs)
    except EtcdKeyNotFound:
        return {}
def get_cluster_volumes(self, cluster_id):
    """Map volume-id -> volume detail dict for *cluster_id*.

    Returns {} when the volume-id index is unavailable; individual
    volumes that have vanished between the id listing and the detail
    read are silently skipped.
    """
    vol_map = {}
    try:
        vol_ids = self.get_cluster_volume_ids(cluster_id)
    except EtcdKeyNotFound:
        return vol_map
    for vol_id in vol_ids:
        try:
            vol_map[vol_id] = etcd_read_key(
                '/clusters/%s/Volumes/%s' % (cluster_id, vol_id)
            )
        except EtcdKeyNotFound:
            continue
    return vol_map
def get_cluster_bricks(self, cluster_id):
    """Map brick-name -> brick detail dict for *cluster_id*.

    Bricks without a 'vol_id' are skipped. When a brick carries both
    'utilization' and 'brick_path', its utilization sub-dict is
    enriched with volume name, cluster name, brick path and hostname
    so downstream consumers get a self-contained record. Failures on
    individual bricks are logged and skipped.
    """
    bricks_by_name = {}
    try:
        brick_leaves = central_store_util.read_key(
            '/clusters/%s/Bricks/all' % cluster_id
        )
    except EtcdKeyNotFound:
        return bricks_by_name
    for leaf in brick_leaves.leaves:
        try:
            # Key layout: /clusters/<cid>/Bricks/all/<brick-name>/...
            key_parts = leaf.key.split('/')
            brick_name = key_parts[5]
            brick = etcd_read_key(
                '/clusters/%s/Bricks/all/%s' % (
                    cluster_id,
                    brick_name
                )
            )
            if 'vol_id' not in brick:
                continue
            if 'utilization' in brick and 'brick_path' in brick:
                util = brick['utilization']
                util['vol_name'] = \
                    central_store_util.get_volume_name(
                        cluster_id,
                        brick['vol_id']
                    )
                util['cluster_name'] = \
                    central_store_util.get_cluster_name(cluster_id)
                util['brick_path'] = brick['brick_path']
                util['hostname'] = brick['hostname']
            bricks_by_name[brick_name] = brick
        except EtcdKeyNotFound as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Error fetching details for %s"
                        " brick" % leaf.key,
                        "exception": ex
                    }
                )
            )
            continue
    return bricks_by_name
def get_node_brick_status_counts(self, node_id):
    """Return brick rollup counts for a single node.

    :param node_id: node whose bricks are counted.
    :return: dict with 'stopped', 'total' and critical/warning alert
        counters (keys from pm_consts) for bricks hosted on the node.
        Best effort: failures are logged and the defaults/partial
        counts are returned.
    """
    brick_status_wise_counts = {
        'stopped': 0,
        'total': 0,
        pm_consts.WARNING_ALERTS: 0,
        pm_consts.CRITICAL_ALERTS: 0
    }
    try:
        # These lookups previously ran unguarded; an EtcdKeyNotFound
        # here crashed the caller instead of degrading gracefully
        # (the guarded variant of this helper wraps both).
        node_name = central_store_util.get_node_name_from_id(node_id)
        ip_indexes = etcd_read_key('/indexes/ip')
    except EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Error fetching node details for node "
                    "%s" % node_id,
                    "exception": ex
                }
            )
        )
        return brick_status_wise_counts
    node_ip = ''
    for ip, indexed_node_id in ip_indexes.iteritems():
        if node_id == indexed_node_id:
            node_ip = ip
            break  # one matching index entry is enough
    try:
        cluster_id = central_store_util.get_node_cluster_id(node_id)
        if cluster_id:
            volumes_det = self.get_cluster_volumes(cluster_id)
            # A brick belongs to this node when its hostname matches
            # either the node's name or its indexed ip.
            for volume_id, volume_det in volumes_det.iteritems():
                for brick_path, brick_det in volume_det.get(
                        'Bricks', {}).iteritems():
                    if (brick_det['hostname'] == node_name or
                            brick_det['hostname'] == node_ip):
                        if brick_det['status'] == 'Stopped':
                            brick_status_wise_counts['stopped'] = \
                                brick_status_wise_counts['stopped'] + 1
                        brick_status_wise_counts['total'] = \
                            brick_status_wise_counts['total'] + 1
            crit_alerts, warn_alerts = parse_resource_alerts(
                'brick',
                pm_consts.CLUSTER,
                cluster_id=cluster_id
            )
            count = 0
            for alert in crit_alerts:
                if alert['node_id'] == node_id:
                    count = count + 1
            brick_status_wise_counts[pm_consts.CRITICAL_ALERTS] = count
            count = 0
            for alert in warn_alerts:
                if alert['node_id'] == node_id:
                    count = count + 1
            brick_status_wise_counts[pm_consts.WARNING_ALERTS] = count
    except Exception as ex:
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Exception caught fetching node brick"
                    " status wise counts",
                    "exception": ex
                }
            )
        )
    return brick_status_wise_counts
def get_cluster_pools(self, cluster_id):
    """Map pool-id -> pool detail dict for *cluster_id*.

    Best effort: a failed id listing or pool read leaves the map with
    whatever was collected so far (possibly empty).
    """
    pool_map = {}
    try:
        for p_id in self.get_cluster_pool_ids(cluster_id):
            pool_map[p_id] = etcd_read_key(
                '/clusters/%s/Pools/%s' % (
                    cluster_id,
                    p_id
                )
            )
    except (EtcdKeyNotFound, TendrlPerformanceMonitoringException):
        pass
    return pool_map
def get_rbds(self, cluster_id, pools):
    """Return the RBD detail dicts for every rbd in *pools*.

    :param cluster_id: integration id of the cluster in etcd.
    :param pools: mapping of pool-id -> pool detail dict.
    :return: list of rbd dicts; rbds that vanish between the name
        listing and the detail read are skipped.
    """
    rbds = []
    # Initialise up front: previously rbd_names was only bound inside
    # the try, so an EtcdKeyNotFound from get_rbd_names led to a
    # NameError at the iteration below instead of an empty result.
    rbd_names = {}
    try:
        rbd_names = self.get_rbd_names(cluster_id, list(pools.keys()))
    except EtcdKeyNotFound:
        pass
    for pool_id, pool_rbds in rbd_names.iteritems():
        for rbd in pool_rbds:
            try:
                rbd_dict = etcd_read_key(
                    '/clusters/%s/Pools/%s/Rbds/%s' % (
                        cluster_id,
                        pool_id,
                        rbd
                    )
                )
                rbds.append(rbd_dict)
            except EtcdKeyNotFound:
                continue
    return rbds
def configure_monitoring(self, sds_tendrl_context):
    """Build collectd plugin configuration entries for every node of
    the cluster in *sds_tendrl_context*.

    For each node: one entry per threshold plugin (config copied and
    stamped with cluster id/name) plus one peer-network-throughput
    entry. Returns the list of config dicts.
    """
    configs = []
    integration_id = sds_tendrl_context['integration_id']
    node_ids = central_store_util.get_cluster_node_ids(integration_id)
    for node_id in node_ids:
        node_context = etcd_read_key(
            '/clusters/%s/nodes/%s/NodeContext' % (
                integration_id,
                node_id
            )
        )
        node_fqdn = node_context['fqdn']
        thresholds = NS.performance_monitoring.config.data['thresholds']
        if isinstance(thresholds, basestring):
            # Config may be persisted as a literal string.
            thresholds = ast.literal_eval(
                thresholds.encode('ascii', 'ignore')
            )
        for plugin, plugin_config in thresholds[self.name].iteritems():
            if isinstance(plugin_config, basestring):
                plugin_config = ast.literal_eval(
                    plugin_config.encode('ascii', 'ignore')
                )
            # Deep copy so the shared threshold template is never
            # polluted with per-cluster fields.
            conf_copy = copy.deepcopy(plugin_config)
            conf_copy['cluster_id'] = integration_id
            conf_copy['cluster_name'] = sds_tendrl_context['cluster_name']
            configs.append({
                'plugin': "tendrl_%sfs_%s" % (self.name, plugin),
                'plugin_conf': conf_copy,
                'node_id': node_id,
                'fqdn': node_fqdn
            })
        configs.append({
            'plugin': "tendrl_%sfs_peer_network_throughput" % (
                self.name
            ),
            'plugin_conf': {
                'peer_name': node_fqdn
            },
            'node_id': node_id,
            'fqdn': node_fqdn
        })
    return configs
def configure_monitoring(self, sds_tendrl_context):
    # Build collectd plugin configs for every node of the ceph cluster,
    # using self.configured_nodes as a per-node dedup registry so a
    # plugin is only (re)configured once per node.
    configs = []
    cluster_node_ids = \
        central_store_util.get_cluster_node_ids(
            sds_tendrl_context['integration_id']
        )
    for node_id in cluster_node_ids:
        sds_node_context = etcd_read_key(
            '/clusters/%s/nodes/%s/NodeContext' %
            (sds_tendrl_context['integration_id'], node_id))
        # Threshold plugins and cluster iops are only pushed to mons.
        if 'mon' in sds_node_context['tags']:
            config = NS.performance_monitoring.config.data['thresholds']
            if isinstance(config, basestring):
                # Config may be persisted as a literal string.
                config = ast.literal_eval(config.encode('ascii', 'ignore'))
            for plugin, plugin_config in config[self.name].iteritems():
                if isinstance(plugin_config, basestring):
                    plugin_config = ast.literal_eval(
                        plugin_config.encode('ascii', 'ignore'))
                # is_configured tracks whether this plugin was already
                # registered for the node in a previous run.
                is_configured = True
                if node_id not in self.configured_nodes:
                    self.configured_nodes[node_id] = [
                        "tendrl_%s_%s" % (self.name, plugin)
                    ]
                    is_configured = False
                if ("tendrl_%s_%s" % (self.name, plugin)
                        not in self.configured_nodes.get(node_id, [])):
                    node_plugins = self.configured_nodes.get(node_id, [])
                    node_plugins.append("tendrl_%s_%s" % (self.name, plugin))
                    self.configured_nodes[node_id] = node_plugins
                    is_configured = False
                if not is_configured:
                    # NOTE(review): mutates the shared threshold config
                    # in place (no deepcopy, unlike the gluster variant)
                    # -- confirm this aliasing is intended.
                    plugin_config['cluster_id'] = \
                        sds_tendrl_context['integration_id']
                    plugin_config['cluster_name'] = \
                        sds_tendrl_context['cluster_name']
                    configs.append({
                        'plugin': "tendrl_%s_%s" % (self.name, plugin),
                        'plugin_conf': plugin_config,
                        'node_id': node_id,
                        'fqdn': sds_node_context['fqdn']
                    })
            # Cluster-wide iops plugin: same dedup dance, once per node.
            is_configured = True
            if ("tendrl_ceph_cluster_iops"
                    not in self.configured_nodes.get(node_id, [])):
                node_plugins = self.configured_nodes.get(node_id, [])
                node_plugins.append("tendrl_ceph_cluster_iops")
                self.configured_nodes[node_id] = node_plugins
                is_configured = False
            if not is_configured:
                plugin_config = {
                    'cluster_id': sds_tendrl_context['integration_id'],
                    'cluster_name': sds_tendrl_context['cluster_name']
                }
                configs.append({
                    'plugin': "tendrl_ceph_cluster_iops",
                    'plugin_conf': plugin_config,
                    'node_id': node_id,
                    'fqdn': sds_node_context['fqdn']
                })
        # Network throughput applies to every node (cluster/public
        # interfaces also exist on non-mon nodes) -- TODO confirm this
        # sits outside the mon branch; source formatting was ambiguous.
        # NOTE(review): this assignment's result is never read here.
        is_configured = True
        if ("tendrl_ceph_node_network_throughput"
                not in self.configured_nodes.get(node_id, [])):
            plugin_config = {}
            plugin_config['cluster_network'] = ' '.join(
                self.get_nw_node_interfaces(
                    node_id,
                    'cluster_network',
                    sds_tendrl_context['integration_id']))
            plugin_config['public_network'] = ' '.join(
                self.get_nw_node_interfaces(
                    node_id,
                    'public_network',
                    sds_tendrl_context['integration_id']))
            # Only configure when BOTH networks resolved to interfaces.
            if (plugin_config['cluster_network'] and
                    plugin_config['public_network']):
                node_plugins = self.configured_nodes.get(node_id, [])
                node_plugins.append("tendrl_ceph_node_network_throughput")
                self.configured_nodes[node_id] = node_plugins
                configs.append({
                    'plugin': "tendrl_%s_node_network_throughput" % (
                        self.name),
                    'plugin_conf': plugin_config,
                    'node_id': node_id,
                    'fqdn': sds_node_context['fqdn']
                })
    return configs
def get_node_brick_status_counts(self, node_id):
    # Per-node brick rollup: stopped/total counts plus critical and
    # warning alert counts for bricks hosted on this node. Every
    # failure path returns the (possibly partial) counts dict rather
    # than raising.
    brick_status_wise_counts = {
        'stopped': 0,
        'total': 0,
        pm_consts.WARNING_ALERTS: 0,
        pm_consts.CRITICAL_ALERTS: 0
    }
    try:
        node_name = central_store_util.get_node_name_from_id(node_id)
    except EtcdKeyNotFound as ex:
        # Without a node name we cannot match bricks; log and bail
        # with the zeroed defaults.
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Error fetching node name for node "
                    "%s" % node_id,
                    "exception": ex
                }
            )
        )
        return brick_status_wise_counts
    try:
        ip_indexes = etcd_read_key('/indexes/ip')
    except EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Error fetching ip indexes",
                    "exception": ex
                }
            )
        )
        return brick_status_wise_counts
    # Resolve the node's ip from the reverse index (first match wins).
    node_ip = ''
    for ip, indexed_node_id in ip_indexes.iteritems():
        if node_id == indexed_node_id:
            node_ip = ip
            break
    try:
        cluster_id = central_store_util.get_node_cluster_id(
            node_id
        )
        if cluster_id:
            bricks = self.get_cluster_bricks(cluster_id)
            # A brick belongs to this node when its hostname matches
            # either the node name or the indexed ip.
            for brick_path, brick_det in bricks.iteritems():
                if (
                    brick_det['hostname'] == node_name or
                    brick_det['hostname'] == node_ip
                ):
                    if (
                        'status' in brick_det and
                        brick_det['status'] == 'Stopped'
                    ):
                        brick_status_wise_counts['stopped'] = \
                            brick_status_wise_counts['stopped'] + 1
                    brick_status_wise_counts['total'] = \
                        brick_status_wise_counts['total'] + 1
            crit_alerts, warn_alerts = parse_resource_alerts(
                'brick',
                pm_consts.CLUSTER,
                cluster_id=cluster_id
            )
            # Cluster-wide brick alerts are filtered down to this node.
            count = 0
            for alert in crit_alerts:
                if alert['node_id'] == node_id:
                    count = count + 1
            brick_status_wise_counts[
                pm_consts.CRITICAL_ALERTS
            ] = count
            count = 0
            for alert in warn_alerts:
                if alert['node_id'] == node_id:
                    count = count + 1
            brick_status_wise_counts[
                pm_consts.WARNING_ALERTS
            ] = count
    except (
        TendrlPerformanceMonitoringException,
        AttributeError,
        ValueError,
        KeyError
    ) as ex:
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Exception caught fetching node brick"
                    " status wise counts",
                    "exception": ex
                }
            )
        )
    return brick_status_wise_counts
def configure_monitoring(self, sds_tendrl_context):
    """Build collectd plugin configs for every node of the gluster
    cluster, using ``self.configured_nodes`` as a per-node dedup
    registry so each plugin is only configured once per node.

    :param sds_tendrl_context: dict with at least 'integration_id'
        and 'cluster_name'.
    :return: list of plugin config dicts to be pushed to nodes.
    """
    configs = []
    cluster_node_ids = \
        central_store_util.get_cluster_node_ids(
            sds_tendrl_context['integration_id']
        )
    for node_id in cluster_node_ids:
        sds_node_context = etcd_read_key(
            '/clusters/%s/nodes/%s/NodeContext' %
            (sds_tendrl_context['integration_id'], node_id))
        config = NS.performance_monitoring.config.data['thresholds']
        if isinstance(config, basestring):
            # Config may be persisted as a literal string.
            config = ast.literal_eval(config.encode('ascii', 'ignore'))
        for plugin, plugin_config in config[self.name].iteritems():
            if isinstance(plugin_config, basestring):
                plugin_config = ast.literal_eval(
                    plugin_config.encode('ascii', 'ignore'))
            is_configured = True
            if node_id not in self.configured_nodes:
                # Register under the fully-qualified plugin name: the
                # original stored the bare `plugin` here while every
                # membership check below uses "tendrl_<name>fs_<plugin>",
                # leaving inconsistent (duplicate) registry entries.
                self.configured_nodes[node_id] = [
                    "tendrl_%sfs_%s" % (self.name, plugin)
                ]
                is_configured = False
            if ("tendrl_%sfs_%s" % (self.name, plugin)
                    not in self.configured_nodes.get(node_id, [])):
                node_plugins = self.configured_nodes.get(node_id, [])
                node_plugins.append("tendrl_%sfs_%s" % (self.name,
                                                        plugin))
                self.configured_nodes[node_id] = node_plugins
                is_configured = False
            if not is_configured:
                plugin_config['cluster_id'] = \
                    sds_tendrl_context['integration_id']
                plugin_config['cluster_name'] = \
                    sds_tendrl_context['cluster_name']
                configs.append({
                    'plugin': "tendrl_%sfs_%s" % (self.name, plugin),
                    'plugin_conf': plugin_config,
                    'node_id': node_id,
                    'fqdn': sds_node_context['fqdn']
                })
        # Peer network throughput: once per node. Use .get() so an
        # empty threshold map (node never registered above) cannot
        # raise KeyError, matching the ceph variant's access pattern.
        is_configured = True
        if ("%sfs_peer_network_throughput" % (self.name)
                not in self.configured_nodes.get(node_id, [])):
            node_plugins = self.configured_nodes.get(node_id, [])
            node_plugins.append("%sfs_peer_network_throughput" %
                                (self.name))
            self.configured_nodes[node_id] = node_plugins
            is_configured = False
        if not is_configured:
            configs.append({
                'plugin': "tendrl_%sfs_peer_network_throughput" % (
                    self.name),
                'plugin_conf': {
                    'peer_name': sds_node_context['fqdn']
                },
                'node_id': node_id,
                'fqdn': sds_node_context['fqdn']
            })
    return configs