def check_status():
    """ Display the status of the ceph cluster as returned by 'ceph -s' command """
    base_result = MetricData(name='cephlm.cluster.status', messages={
        'ok': 'Cluster is in healthy state.',
        'warn': 'Cluster is in warning state: {msg}.',
        'fail': 'Cluster is in error state: {msg}.',
        'unknown': 'Probe error: {msg}.'
    })
    try:
        output = Cluster.get_status()
    except (CephLMException, CephCommandException,
            CephCommandTimeoutException) as e:
        # Any probe failure maps to the 'unknown' severity with the error text.
        probe_error = base_result.child(msgkeys={'msg': str(e)})
        probe_error.value = Severity.unknown
        return probe_error
    health = output['health']
    result = base_result.child(msgkeys={
        'msg': Cluster._process_status_message(health['overall_status'],
                                               health['summary'])
    })
    result.value = Cluster._process_status(health['overall_status'])
    return result
def check_monitor_connectivity():
    """ Display the connectivity of the Ceph cluster to each Monitor host """
    base_result = MetricData(name='cephlm.connectivity.status', messages={
        'ok': 'Monitors {mons} are reachable.',
        'warn': 'Monitor(s) {mons} is/are unreachable.',
        'fail': 'Monitor(s) {mons} is/are unreachable.',
        'unknown': 'Probe error: {msg}.'
    })
    try:
        monitors = Cluster.get_monitors()
        reachable, unreachable = \
            Cluster._verify_monitor_connectivity(monitors)
    except (CephLMException, CephCommandException) as e:
        result = base_result.child(msgkeys={'msg': str(e)})
        result.value = Severity.unknown
        return result
    # All monitors reachable: report the healthy list and return early.
    if not unreachable:
        result = base_result.child(msgkeys={'mons': ', '.join(reachable)})
        result.value = Severity.ok
        return result
    # Some (warn) or all (fail) monitors are unreachable.
    result = base_result.child(msgkeys={'mons': ', '.join(unreachable)})
    result.value = Severity.fail if not reachable else Severity.warn
    return result
def test_response_child(self):
    """Children inherit dimensions, messages and the parent's name."""
    parent = MetricData(name='name', messages={'a': 'b'})
    parent['test'] = 'test'
    child = parent.child(dimensions={'test2': 'test2'})
    self.assertIn('test', child)
    self.assertIn('test2', child)
    self.assertDictEqual({'a': 'b'}, child.messages)
    self.assertEqual('cephlm.name', child.name)
    # A child created without extra dimensions carries only the parent's.
    child = parent.child()
    self.assertIn('test', child)
    self.assertNotIn('test2', child)
def check_hpssacli():
    """ Checks controller and drive information with hpssacli [Run as root]

    Delegates to the external swiftlm hpssacli module and converts each of
    its metrics into a cephlm metric. Raises CephLMException if the external
    module fails for any reason.
    """
    base_result = MetricData(name='cephlm.hpssacli',
                             messages=hpssacli.BASE_RESULT.messages)
    HPssaCli._override_plugin_settings()
    try:
        results = hpssacli.main()
    except Exception:
        # Unlike other parameters, we do not know the list of metrics here.
        # Hence there is no way to set each of them to error. Instead we
        # raise an exception which will be handled by the generic
        # cephlm-probe exception handler.
        msg = "Unknown exception occurred when " \
              "executing swiftlm hpssacli module"
        raise CephLMException(msg)
    ceph_results = list()
    for entry in results:
        # Extract the main metric name, and strip off the parent hierarchy
        # E.g., swiftlm.hp_hardware.hpssacli.smart_array to smart_array
        name = entry.name.split('hpssacli.', 1)[1]
        # Clone the dimensions excluding entries pointing to external
        # service references. items() replaces the Python 2-only
        # iteritems() so this also runs under Python 3.
        dimensions = {
            key: value for key, value in entry.dimensions.items()
            if key not in ['service']
        }
        # Convert external metric class to cephlm metric class
        result = base_result.child(name=name, dimensions=dimensions)
        result.value = HPssaCli._get_severity_level(entry.value)
        ceph_results.append(result)
    return ceph_results
def quorum_status():
    """ Reports the status of monitor quorum """
    base_result = MetricData(
        name='cephlm.monitor.quorum',
        messages={
            'ok': 'Monitors are in quorum.',
            'warn': 'Monitors ({msg}) is/are not in quorum.',
            'fail': 'Monitors ({msg}) have not formed quorum.',
            'unknown': 'Probe error: {msg}.'
        })
    msg = ''
    value = Severity.ok
    try:
        output = Monitor.get_quorum_status()
        quorum = output['quorum']
        monitors = output['monmap']['mons']
        if len(quorum) < len(monitors):
            # Name every monitor whose rank is missing from the quorum set.
            value = Severity.warn
            absent = [mon['name'] for mon in monitors
                      if mon['rank'] not in quorum]
            msg = ', '.join(absent)
    except CephCommandTimeoutException:
        # A command timeout indicates no quorum was formed at all; report
        # the monitor hosts from the cluster configuration instead.
        value = Severity.fail
        cluster_name, config, config_file = Ceph._get_ceph_config()
        msg = config.get('global', 'mon_host')
    except CephCommandException as e:
        value = Severity.unknown
        msg = str(e)
    result = base_result.child(msgkeys={'msg': msg})
    result.value = value
    return result
def check_nic_speed():
    """ Checks for optimal nic speed requirement in a ceph node [Run as root] """
    base_result = MetricData(name='cephlm.perfscale.nic_speed', messages={
        'ok': '{msg}',
        'warn': '{msg}',
        'unknown': 'Probe error: {msg}'
    })
    try:
        nic_info = get_nic_info()
        ceph_bindings = get_ceph_bind_ips()
    except CephCommandException as e:
        probe_error = base_result.child(msgkeys={'msg': str(e)})
        probe_error.value = Severity.unknown
        return probe_error
    # Public IP will always exist for a ceph node irrespective of the
    # network model. It is the network on which ceph client calls are made
    public_ip = ceph_bindings.get('public_ip', None)
    # Private IP or Cluster IP will exist only for OSD nodes provided the
    # deployment follows multi-network model
    private_ip = ceph_bindings.get('private_ip', None)
    nic_speeds = PerfScale._process_nic_speed(public_ip, private_ip,
                                              nic_info)
    shared_external_net = PerfScale._has_shared_external_networks(
        public_ip, private_ip, nic_info)
    metrics = list()
    # One metric per binding, e.g. cephlm.perfscale.nic_speed_public.
    for binding, ip in ceph_bindings.items():
        severity, msg = PerfScale._format_nic_speed_status(
            ip, nic_speeds[ip], shared_external_net)
        metric = base_result.child(msgkeys={'msg': msg})
        metric.name = 'cephlm.perfscale.nic_speed_%s' \
            % binding.replace('_ip', '')
        metric.value = severity
        metrics.append(metric)
    return metrics
def check_osd_node_ram():
    """ Checks for optimal memory requirement in a Ceph OSD node [Run as root]

    Compares host RAM against the GiB-per-TiB-of-data-disk guideline.
    Returns an empty list on nodes with no data disks (non-OSD nodes),
    a single metric otherwise.
    """
    base_result = MetricData(
        name='cephlm.perfscale.osd_node_ram',
        messages={
            'ok': 'Host RAM({ram}GiB) meets %s GiB per TiB of data disk'
                  '({total_osd_size}TiB) guideline.'
                  % PerfScale.GiB_PER_TiB_DATA,
            'warn': 'Host RAM({ram}GiB) violates %s GiB per TiB of data disk'  # noqa
                    '({total_osd_size}TiB) guideline.'
                    % PerfScale.GiB_PER_TiB_DATA,
            'unknown': 'Probe error: {msg}'
        })
    try:
        journal_disks, data_disks = Ceph.get_ceph_disk_list()
        mem_info = get_system_memory_info()
        disks_info = get_system_disks_size()
    except (CephLMException, CephCommandException) as e:
        result = base_result.child(msgkeys={'msg': str(e)})
        result.value = Severity.unknown
        return result
    if not data_disks:
        # Ideally this check will not be run on non OSD nodes, but in case
        # it does, we return an empty list. Checked before processing the
        # RAM data so no work is done on an empty disk list.
        return list()
    total_osd_size, ram = PerfScale._process_osd_ram_data(
        data_disks, disks_info, mem_info)
    result = base_result.child(msgkeys={
        'ram': '%s' % ram,
        'total_osd_size': '%s' % total_osd_size
    })
    result.value = PerfScale._process_osd_ram_status(total_osd_size, ram)
    return result
def test_child_msgkeys(self):
    """Message placeholders resolve from both dimensions and msgkeys."""
    parent = MetricData(name='name', messages={
        'ok': 'test message',
        'test': 'test with meta {test_value} and {test_value2}',
    })
    child = parent.child(dimensions={'test_value': '123'},
                         msgkeys={'test_value2': '456'})
    child.message = 'test'
    self.assertEqual('test with meta 123 and 456', str(child))
def check_status():
    """ Reports the status of the rados gateway service """
    base_result = MetricData(
        name='cephlm.radosgw.status',
        messages={
            'ok': 'Radosgw ({ip_port}) is in healthy state.',
            'fail': 'Radosgw ({ip_port}) is in error state.',
            'unknown': 'Probe error: {msg}.'
        })
    try:
        ip_port = Radosgw._fetch_radosgw_ip_port()
        status_success = Radosgw.get_status(ip_port)
    except (CephLMException, CephCommandException) as e:
        probe_error = base_result.child(msgkeys={'msg': str(e)})
        probe_error.value = Severity.unknown
        return probe_error
    result = base_result.child(msgkeys={'ip_port': ip_port})
    if status_success:
        result.value = Severity.ok
    else:
        result.value = Severity.fail
    return result
def check_osd_journal_ratio():
    """ Checks the ratio of osd disks mapped to journal disks

    Warns when a disk hosts both journal and data partitions, or when a
    journal disk serves more OSDs than the recommended limit.
    """
    base_result = MetricData(name='cephlm.osd.osd_journal_ratio', messages={
        'ok': 'OSDs abide %s:1 OSD to Journal ratio'
              % OSD.OPTIMAL_OSD_PER_JOURNAL,
        'warn': '{msg}',
        'unknown': 'Probe error: {msg}'
    })
    try:
        journal_disks, data_disks = OSD.get_ceph_disk_list()
    except (exc.CephLMException, exc.CephCommandException) as e:
        result = base_result.child(msgkeys={'msg': str(e)})
        result.value = Severity.unknown
        return result
    # Set metric to warning state when there is both journal and data
    # partition on a given disk. Iterating the dicts directly yields their
    # keys; intersection() accepts any iterable.
    shared_osd_journals = set(journal_disks).intersection(data_disks)
    # Set metric to warning state when the number of OSDs mapped to a given
    # journal disk exceeds the recommended limit. items() replaces the
    # Python 2-only iteritems() so this also runs under Python 3.
    non_optimal_disks = {
        key: val for key, val in journal_disks.items()
        if len(val) > OSD.OPTIMAL_OSD_PER_JOURNAL
    }
    return OSD._process_journal_status(base_result, shared_osd_journals,
                                       non_optimal_disks)