def quorum_status():
    """
    Reports the status of monitor quorum

    The returned metric is 'ok' when every monitor listed in the monmap is
    part of the quorum, 'warn' (naming the missing monitors) when some are
    not, 'fail' when the quorum query times out, and 'unknown' when the
    underlying command fails.
    """
    messages = {
        'ok': 'Monitors are in quorum.',
        'warn': 'Monitors ({msg}) is/are not in quorum.',
        'fail': 'Monitors ({msg}) have not formed quorum.',
        'unknown': 'Probe error: {msg}.'
    }
    base_result = MetricData(name='cephlm.monitor.quorum', messages=messages)

    severity = Severity.ok
    detail = ''
    try:
        status = Monitor.get_quorum_status()
        in_quorum = status['quorum']
        all_mons = status['monmap']['mons']
        if len(in_quorum) < len(all_mons):
            severity = Severity.warn
            # Name every monitor whose rank is absent from the quorum list
            detail = ', '.join(
                mon['name'] for mon in all_mons
                if mon['rank'] not in in_quorum)
    except CephCommandTimeoutException:
        severity = Severity.fail
        # On timeout, report the configured monitor hosts instead
        _, config, _ = Ceph._get_ceph_config()
        detail = config.get('global', 'mon_host')
    except CephCommandException as err:
        severity = Severity.unknown
        detail = str(err)

    result = base_result.child(msgkeys={'msg': detail})
    result.value = severity
    return result
def check_status():
    """
    Display the status of the ceph cluster as returned by 'ceph -s' command

    Returns a single metric. Its value is derived from the cluster's
    overall health status, or is 'unknown' when the status cannot be
    fetched.
    """
    messages = {
        'ok': 'Cluster is in healthy state.',
        'warn': 'Cluster is in warning state: {msg}.',
        'fail': 'Cluster is in error state: {msg}.',
        'unknown': 'Probe error: {msg}.'
    }
    base_result = MetricData(name='cephlm.cluster.status', messages=messages)

    try:
        output = Cluster.get_status()
    except (CephLMException, CephCommandException,
            CephCommandTimeoutException) as err:
        failed = base_result.child(msgkeys={'msg': str(err)})
        failed.value = Severity.unknown
        return failed

    health = output['health']
    overall = health['overall_status']
    text = Cluster._process_status_message(overall, health['summary'])

    result = base_result.child(msgkeys={'msg': text})
    result.value = Cluster._process_status(overall)
    return result
def check_hpssacli():
    """
    Checks controller and drive information with hpssacli [Run as root]

    Runs the external hpssacli module and converts each metric it returns
    into a cephlm metric: the name is reduced to the part following
    'hpssacli.', the 'service' dimension is dropped and the value is mapped
    through HPssaCli._get_severity_level().

    Returns a list of converted metrics.
    Raises CephLMException when the hpssacli module raises anything.
    """
    base_result = MetricData(name='cephlm.hpssacli',
                             messages=hpssacli.BASE_RESULT.messages)

    HPssaCli._override_plugin_settings()
    try:
        results = hpssacli.main()
    except Exception as e:
        # Unlike other parameters, we do not know the list of metrics here.
        # Hence there is no way to set each of them to error. Instead we
        # raise an exception which will be handled by the generic
        # cephlm-probe exception handler. The original error is appended
        # (as repr, so it cannot itself fail on encoding) so that the root
        # cause is not lost.
        msg = "Unknown exception occured when " \
              "executing swiftlm hpssacli module: %r" % (e,)
        raise CephLMException(msg)

    ceph_results = list()
    for entry in results:
        # Extract the main metric name, and strip off the parent hierarchy
        # E.g., swiftlm.hp_hardware.hpssacli.smart_array to smart_array
        name = entry.name.split('hpssacli.', 1)[1]

        # Clone the dimensions excluding entries pointing to external
        # service references. items() is used instead of iteritems() so
        # the code runs on both Python 2 and Python 3.
        dimensions = {
            key: value for key, value in entry.dimensions.items()
            if key != 'service'
        }

        # Convert external metric class to cephlm metric class
        result = base_result.child(name=name, dimensions=dimensions)
        result.value = HPssaCli._get_severity_level(entry.value)
        ceph_results.append(result)

    return ceph_results
def check_monitor_connectivity():
    """
    Display the connectivity of the Ceph cluster to each Monitor host

    'ok' lists the reachable monitors when none is unreachable; otherwise
    the unreachable ones are listed with 'fail' when no monitor at all is
    reachable and 'warn' when at least one still is.
    """
    messages = {
        'ok': 'Monitors {mons} are reachable.',
        'warn': 'Monitor(s) {mons} is/are unreachable.',
        'fail': 'Monitor(s) {mons} is/are unreachable.',
        'unknown': 'Probe error: {msg}.'
    }
    base_result = MetricData(name='cephlm.connectivity.status',
                             messages=messages)

    try:
        monitors = Cluster.get_monitors()
        reachable, unreachable = \
            Cluster._verify_monitor_connectivity(monitors)
    except (CephLMException, CephCommandException) as err:
        failed = base_result.child(msgkeys={'msg': str(err)})
        failed.value = Severity.unknown
        return failed

    # Decide which monitors to list and the severity in one place
    if len(unreachable) == 0:
        severity, listed = Severity.ok, reachable
    elif len(reachable) == 0:
        severity, listed = Severity.fail, unreachable
    else:
        severity, listed = Severity.warn, unreachable

    result = base_result.child(msgkeys={'mons': ', '.join(listed)})
    result.value = severity
    return result
def test_response_child(self):
    """child() keeps the parent's entries and messages, plus extras."""
    parent = MetricData(name='name', messages={'a': 'b'})
    parent['test'] = 'test'

    # Child created with an extra dimension carries both keys
    with_extra = parent.child(dimensions={'test2': 'test2'})
    for key in ('test', 'test2'):
        self.assertIn(key, with_extra)
    self.assertDictEqual({'a': 'b'}, with_extra.messages)
    self.assertEqual('cephlm.name', with_extra.name)

    # A plain child only has what the parent had
    plain = parent.child()
    self.assertIn('test', plain)
    self.assertNotIn('test2', plain)
def test_child_msgkeys(self):
    """Dimensions and msgkeys given to child() both feed the message."""
    templates = {
        'ok': 'test message',
        'test': 'test with meta {test_value} and {test_value2}',
    }
    parent = MetricData(name='name', messages=templates)

    child = parent.child(dimensions={'test_value': '123'},
                         msgkeys={'test_value2': '456'})
    child.message = 'test'

    self.assertEqual('test with meta 123 and 456', str(child))
def main():
    """
    Run every selected check and emit the collected metrics.

    Each function in args.selected is called in turn. A MetricData result,
    or a non-empty list whose first element is a MetricData, is converted
    with metric() and collected; any other return value is ignored. If a
    check raises, a 'cephlm.probe.failure' metric with Severity.fail is
    recorded instead, carrying the check's module and the flattened
    traceback. Exactly one 'cephlm.probe.failure' metric is reported per
    run: the last failure if any check failed, otherwise a Severity.ok one.
    The metrics are finally handed to the formatter selected by
    args.format.
    """
    args = parse_args()
    metrics = []

    for func in args.selected:
        try:
            r = func()
            if isinstance(r, list) and r and isinstance(r[0], MetricData):
                metrics.extend([result.metric() for result in r])
            elif isinstance(r, MetricData):
                metrics.append(r.metric())
        except:   # noqa
            # Deliberately broad: one misbehaving check must not stop the
            # remaining checks from being reported.
            t, v, tb = sys.exc_info()
            backtrace = ' '.join(traceback.format_exception(t, v, tb))
            # The template previously read '{error} failed with: {check}',
            # which rendered as "<traceback> failed with: <module>"; the
            # placeholders are now in the intended order.
            r = MetricData.single('cephlm.probe.failure', Severity.fail,
                                  '{check} failed with: {error}',
                                  dimensions={
                                      'component': 'cephlm-probe',
                                      'service': 'ceph-storage'
                                  },
                                  msgkeys={
                                      'check': func.__module__,
                                      'error': backtrace.replace('\n', ' ')
                                  })
            metrics.append(r.metric())

    # There is no point in reporting multiple measurements of
    # cephlm.probe.failure metric in same cycle.
    check_failures_found = [
        metric for metric in metrics
        if metric.get('metric') == 'cephlm.probe.failure'
    ]
    if check_failures_found:
        # Remove all except one instance
        for metric in check_failures_found[:-1]:
            metrics.remove(metric)
    else:
        r = MetricData.single('cephlm.probe.failure', Severity.ok, 'ok',
                              dimensions={
                                  'component': 'cephlm-probe',
                                  'service': 'ceph-storage'
                              })
        metrics.append(r.metric())

    FORMATS[args.format](metrics, args.pretty)
def test_create_metricdata(self):
    """A fresh MetricData has a prefixed name and empty defaults."""
    metric = MetricData(name='name', messages={})

    # The name is exposed with the 'cephlm.' prefix
    self.assertEqual('cephlm.name', metric.name)
    # No message or value until one is assigned
    self.assertEqual('', metric.message)
    self.assertEqual(None, metric.value)
    # A hostname dimension is present without being passed in
    self.assertIn('hostname', metric.dimensions)
def pool_stats():
    """
    Publishes the pool statistics

    Returns a list with one metric per entry of the metric table below,
    each named 'cephlm.pool.<metric>'. When the pool statistics cannot be
    fetched, every metric gets the value -1 and the error text as message.
    """
    INVALID_VALUE = -1
    # Published metric suffix -> key passed to the Pool helper functions
    metric_dict = {
        'count': 'count',
        'total_objects': 'objects',
        'usage_bytes': 'size_bytes',
        'top_three_by_usage_bytes': 'top_pools_by_size',
        'top_three_by_objects': 'top_pools_by_objects',
    }

    result = list()
    probe_failed = False
    try:
        pool_dict = Pool._stats()
    except (exc.CephLMException, exc.CephCommandException,
            exc.CephCommandTimeoutException) as e:
        probe_failed = True
        msg = str(e)

    # items() rather than iteritems(): works on Python 2 and Python 3
    for metric_name, state in metric_dict.items():
        name = "cephlm.pool.%s" % metric_name
        if probe_failed:
            # msg keeps the error text captured above
            value = INVALID_VALUE
        elif 'top_three' in metric_name:
            msg, value = Pool._pools_by_metric(pool_dict, state)
        else:
            msg, value = Pool._return_total_metrics(pool_dict, state)
        result.append(MetricData.single(name, value, message=msg))
    return result
def osd_stats():
    """
    Publishes the osd statistics

    Returns a list with one metric per OSD state in the table below, named
    'cephlm.osd.<state>_count'. When the OSD statistics cannot be fetched,
    every metric gets the value -1 and a fixed probe-error message.
    """
    INVALID_VALUE = -1
    # State name -> helper returning (value, message fragment) for it
    metric_dict = {
        'up': OSD._up_count,
        'up_out': OSD._up_out_count,
        'down': OSD._down_count,
        'down_in': OSD._down_in_count,
        'total': OSD._total_count
    }

    result = list()
    probe_failed = False
    try:
        # Named 'stats' so the local does not shadow this function's name
        stats = OSD._stats()
    except (exc.CephLMException, exc.CephCommandException,
            exc.CephCommandTimeoutException):
        probe_failed = True

    # items() rather than iteritems(): works on Python 2 and Python 3
    for metric_state, func in metric_dict.items():
        name = "cephlm.osd.%s_count" % metric_state
        if probe_failed:
            value = INVALID_VALUE
            msg = "Probe error: Command 'ceph osd tree' failed"
        else:
            value, msg = func(stats)
            msg = "OSD(s) %s" % msg if msg else "No OSD(s)"
            if metric_state == 'total':
                msg += " is/are in cluster"
            else:
                msg += " is/are %s" % metric_state
        result.append(MetricData.single(name, value, message=msg))
    return result
def check_nic_speed():
    """
    Checks for optimal nic speed requirement in a ceph node [Run as root]

    Returns one metric per entry of the ceph bind-IP mapping, named
    'cephlm.perfscale.nic_speed_<entry without "_ip">'. If the NIC or
    binding information cannot be gathered, a single 'unknown' metric is
    returned instead of a list.
    """
    messages = {
        'ok': '{msg}',
        'warn': '{msg}',
        'unknown': 'Probe error: {msg}'
    }
    base_result = MetricData(name='cephlm.perfscale.nic_speed',
                             messages=messages)

    try:
        nic_info = get_nic_info()
        ceph_bindings = get_ceph_bind_ips()
    except CephCommandException as err:
        failed = base_result.child(msgkeys={'msg': str(err)})
        failed.value = Severity.unknown
        return failed

    # Public IP will always exist for a ceph node irrespective of the
    # network model. It is the network on which ceph client calls are made
    public_ip = ceph_bindings.get('public_ip')

    # Private IP or Cluster IP will exist only for OSD nodes provided the
    # deployment follows multi-network model
    private_ip = ceph_bindings.get('private_ip')

    nic_speeds = PerfScale._process_nic_speed(public_ip, private_ip,
                                              nic_info)
    shared_external_net = PerfScale._has_shared_external_networks(
        public_ip, private_ip, nic_info)

    metrics = []
    for binding, ip in ceph_bindings.items():
        severity, text = PerfScale._format_nic_speed_status(
            ip, nic_speeds[ip], shared_external_net)
        metric = base_result.child(msgkeys={'msg': text})
        suffix = binding.replace('_ip', '')
        metric.name = 'cephlm.perfscale.nic_speed_%s' % suffix
        metric.value = severity
        metrics.append(metric)
    return metrics
def test_dict_behaviour(self):
    """Item assignment stores string values and supports deletion."""
    metric = MetricData(name='name', messages={})

    # dimension values must be strings so we check they are converted
    # properly
    metric['test'] = 1000
    self.assertEqual('1000', metric['test'])

    # Deleting the key removes it from the metric
    del metric['test']
    self.assertNotIn('test', metric)
def check_osd_node_ram():
    """
    Checks for optimal memory requirement in a Ceph OSD node [Run as root]

    Returns a single metric comparing host RAM against the total size of
    the data disks, an 'unknown' metric when the system information cannot
    be gathered, or an empty list when the node has no data disks.
    """
    guideline = PerfScale.GiB_PER_TiB_DATA
    ok_text = ('Host RAM({ram}GiB) meets %s GiB per TiB of data disk'
               '({total_osd_size}TiB) guideline.' % guideline)
    warn_text = ('Host RAM({ram}GiB) violates %s GiB per TiB of data disk'
                 '({total_osd_size}TiB) guideline.' % guideline)
    base_result = MetricData(
        name='cephlm.perfscale.osd_node_ram',
        messages={
            'ok': ok_text,
            'warn': warn_text,
            'unknown': 'Probe error: {msg}'
        })

    try:
        journal_disks, data_disks = Ceph.get_ceph_disk_list()
        mem_info = get_system_memory_info()
        disks_info = get_system_disks_size()
    except (CephLMException, CephCommandException) as err:
        failed = base_result.child(msgkeys={'msg': str(err)})
        failed.value = Severity.unknown
        return failed

    total_osd_size, ram = PerfScale._process_osd_ram_data(
        data_disks, disks_info, mem_info)

    if not data_disks:
        # Ideally this check will not be run on non OSD nodes, but in case
        # it does, we return an empty list
        return []

    msgkeys = {
        'ram': '%s' % ram,
        'total_osd_size': '%s' % total_osd_size
    }
    result = base_result.child(msgkeys=msgkeys)
    result.value = PerfScale._process_osd_ram_status(total_osd_size, ram)
    return result
def check_status():
    """
    Reports the status of the rados gateway service

    Returns a single metric: 'ok' or 'fail' depending on the outcome of
    the status check, or 'unknown' when the endpoint lookup or the check
    itself raises.
    """
    messages = {
        'ok': 'Radosgw ({ip_port}) is in healthy state.',
        'fail': 'Radosgw ({ip_port}) is in error state.',
        'unknown': 'Probe error: {msg}.'
    }
    base_result = MetricData(name='cephlm.radosgw.status', messages=messages)

    try:
        ip_port = Radosgw._fetch_radosgw_ip_port()
        status_success = Radosgw.get_status(ip_port)
    except (CephLMException, CephCommandException) as err:
        failed = base_result.child(msgkeys={'msg': str(err)})
        failed.value = Severity.unknown
        return failed

    result = base_result.child(msgkeys={'ip_port': ip_port})
    if status_success:
        result.value = Severity.ok
    else:
        result.value = Severity.fail
    return result
class HPssaCliData:
    # Canned MetricData objects shaped like swiftlm hpssacli output.
    # NOTE(review): presumably used as a mocked return value of the
    # swiftlm hpssacli module in unit tests -- confirm against callers.

    # Parent metric holding the message templates shared by all children
    MOCK_BASE_RESULT = MetricData(
        name='swiftlm.hpssacli',
        messages={
            'no_battery': 'No cache battery',
            'unknown': 'hpssacli command failed',
            'controller_status': '{sub_component} status is {status}',
            'in_hba_mode': 'Controller is in HBA mode;'
                           ' performance will be poor',
            'physical_drive': 'Drive {serial_number}: '
                              '{box}:{bay} has status: {status}',
            'l_drive': 'Logical Drive {logical_drive} has status: {status}',
            'l_cache': 'Logical Drive {logical_drive}'
                       ' has cache status: {caching}',
            'ok': 'OK',
            'fail': 'FAIL',
        }
    )

    # Child carrying a plain float value (a firmware metric)
    MOCK_CHILD_FLOAT = MOCK_BASE_RESULT.child()
    MOCK_CHILD_FLOAT.name = 'swiftlm.hp_hardware.hpssacli.smart_array.firmware'
    MOCK_CHILD_FLOAT.value = 3.0
    MOCK_CHILD_FLOAT.dimensions = {'component': 'controller',
                                   'controller_slot': '1',
                                   'hostname': 'ardana-ccp-ceph0001-clm',
                                   'model': 'Smart HBA H240',
                                   'service': 'object-storage'}

    # Child carrying Severity.ok
    MOCK_CHILD_OK = MOCK_BASE_RESULT.child()
    MOCK_CHILD_OK.name = 'swiftlm.hp_hardware.hpssacli.smart_array'
    MOCK_CHILD_OK.value = Severity.ok
    MOCK_CHILD_OK.dimensions = {'component': 'controller',
                                'sub_component': 'controller_not_hba_mode',
                                'controller_slot': '1',
                                'hostname': 'ardana-ccp-ceph0001-clm',
                                'model': 'Smart HBA H240',
                                'service': 'object-storage'}

    # Child carrying Severity.fail
    MOCK_CHILD_FAIL = MOCK_BASE_RESULT.child()
    MOCK_CHILD_FAIL.name = 'swiftlm.hp_hardware.hpssacli.smart_array'
    MOCK_CHILD_FAIL.value = Severity.fail
    MOCK_CHILD_FAIL.dimensions = {'component': 'controller',
                                  'sub_component': 'battery_capacitor_status',
                                  'controller_slot': '1',
                                  'hostname': 'ardana-ccp-ceph0001-clm',
                                  'model': 'Smart HBA H240',
                                  'service': 'object-storage'}

    # The three children above, in float / ok / fail order
    MOCK_RESPONSE = [MOCK_CHILD_FLOAT, MOCK_CHILD_OK, MOCK_CHILD_FAIL]
def check_osd_journal_ratio():
    """
    Checks the ratio of osd disks mapped to journal disks

    Collects the disks that hold both a journal and a data partition, and
    the journal disks whose mapped entries exceed
    OSD.OPTIMAL_OSD_PER_JOURNAL. Both are handed to
    OSD._process_journal_status(), whose result is returned. When the disk
    list cannot be fetched, a single 'unknown' metric is returned.
    """
    base_result = MetricData(name='cephlm.osd.osd_journal_ratio',
                             messages={
                                 'ok': 'OSDs abide %s:1 OSD to Journal ratio'
                                       % OSD.OPTIMAL_OSD_PER_JOURNAL,
                                 'warn': '{msg}',
                                 'unknown': 'Probe error: {msg}'
                             })

    try:
        journal_disks, data_disks = OSD.get_ceph_disk_list()
    except (exc.CephLMException, exc.CephCommandException) as e:
        result = base_result.child(msgkeys={'msg': str(e)})
        result.value = Severity.unknown
        return result

    # Set metric to warning state when there is both journal and data
    # partition on a given disk
    shared_osd_journals = set(journal_disks).intersection(data_disks)

    # Set metric to warning state when the number of OSDs mapped to a given
    # journal disk exceeds the recommended limit.
    # items() rather than iteritems(): works on Python 2 and Python 3
    non_optimal_disks = {
        key: val for key, val in journal_disks.items()
        if len(val) > OSD.OPTIMAL_OSD_PER_JOURNAL
    }

    return OSD._process_journal_status(base_result, shared_osd_journals,
                                       non_optimal_disks)
def pg_stats():
    """
    Function to aggregate all metrics

    Returns a single 'cephlm.pg.count' metric. Its value is the 'count'
    entry of the PG statistics and its message lists the remaining entries
    as 'state=count' pairs. When the statistics cannot be fetched the value
    is -1 and the message carries the probe error.
    """
    INVALID_VALUE = -1

    try:
        # Named 'stats' so the local does not shadow this function's name
        stats = PG._stats()
    except (exc.CephLMException, exc.CephCommandException,
            exc.CephCommandTimeoutException) as e:
        value = INVALID_VALUE
        msg = 'Probe error: ' + str(e)
    else:
        # 'count' becomes the value; everything left goes in the message.
        value = stats.pop('count')
        # items() rather than iteritems(): works on Python 2 and Python 3
        msg = ', '.join('%s=%s' % (pg_state, count)
                        for pg_state, count in stats.items())

    name = "cephlm.pg.count"
    return MetricData.single(name, value, message=msg)
def capacity_stats():
    """
    Publishes the capacity statistics

    Returns a list with one 'cephlm.capacity.<metric>' entry per capacity
    metric. When the statistics cannot be fetched, every value is -1 and
    the message is the error text; otherwise the message is empty.
    """
    INVALID_VALUE = -1
    metric_names = ('total_bytes', 'used_bytes', 'available_bytes',
                    'perc_utilization')

    text = ''
    capacity = {}
    probe_failed = False
    try:
        capacity = Capacity._stats()
    except (exc.CephLMException, exc.CephCommandException,
            exc.CephCommandTimeoutException) as err:
        probe_failed = True
        text = str(err)

    collected = []
    for metric_name in metric_names:
        if probe_failed:
            value = INVALID_VALUE
        else:
            value = capacity[metric_name]
        full_name = "cephlm.capacity.%s" % metric_name
        collected.append(MetricData.single(full_name, value, message=text))
    return collected
def test_message(self):
    """Messages follow the value and need every placeholder filled in."""
    templates = {
        'ok': 'test message',
        'test': 'test with meta {test_value} and {test_value2}',
    }
    metric = MetricData(name='name', messages=templates)

    # Test automatic message assignment when the Status Enum is used
    # as the value
    self.assertEqual('', metric.message)
    metric.value = Severity.ok
    self.assertEqual('test message', metric.message)

    # Test that an error is raised when trying to use a message without
    # providing all of the dimension values first.
    with self.assertRaisesRegexp(ValueError, 'requires a dimension or'):
        metric.message = 'test'

    # One placeholder comes from a dimension, the other from a msgkey
    metric['test_value'] = '123'
    metric.msgkey('test_value2', '456')
    metric.message = 'test'

    self.assertEqual('test with meta 123 and 456', str(metric))
def test_equality_behaviour(self):
    """Equality depends on name, dimensions, message and value only."""
    def default_metric():
        # Fresh instance built from the fixture messages and dimensions
        return MetricData('name', self.messages, self.dimensions)

    # Identical construction compares equal
    left = default_metric()
    right = default_metric()
    self.assertEqual(left, right)

    # Different names
    left = default_metric()
    right = MetricData('not-name', self.messages, self.dimensions)
    self.assertNotEqual(left, right)

    # Different message dictionaries
    left = MetricData('name', {'a': 1}, self.dimensions)
    right = MetricData('name', {'b': 2}, self.dimensions)
    self.assertEqual(
        left, right,
        'Message dictionaries should not '
        'affect equality of MetricData instances')

    # Different dimensions
    left = default_metric()
    right = MetricData('name', self.messages, {})
    self.assertNotEqual(left, right)

    # Different selected messages
    left = default_metric()
    right = default_metric()
    left.message = 'ok'
    right.message = 'fail'
    self.assertNotEqual(left, right)

    # Different values
    left = default_metric()
    right = default_metric()
    left.value = 1
    right.value = 2
    self.assertNotEqual(left, right)