Example #1
0
    def check_status():
        """
        Report the overall health of the Ceph cluster ('ceph -s').
        """
        base_result = MetricData(
            name='cephlm.cluster.status',
            messages={
                'ok': 'Cluster is in healthy state.',
                'warn': 'Cluster is in warning state: {msg}.',
                'fail': 'Cluster is in error state: {msg}.',
                'unknown': 'Probe error: {msg}.'
            })

        try:
            output = Cluster.get_status()
        except (CephLMException, CephCommandException,
                CephCommandTimeoutException) as e:
            # Probe itself failed: report 'unknown' with the error text.
            probe_error = base_result.child(msgkeys={'msg': str(e)})
            probe_error.value = Severity.unknown
            return probe_error

        health = output['health']
        overall = health['overall_status']
        msgkeys = {
            'msg': Cluster._process_status_message(overall, health['summary'])
        }

        result = base_result.child(msgkeys=msgkeys)
        result.value = Cluster._process_status(overall)
        return result
Example #2
0
    def check_monitor_connectivity():
        """
        Report which Ceph Monitor hosts are reachable from this node.
        """
        base_result = MetricData(
            name='cephlm.connectivity.status',
            messages={
                'ok': 'Monitors {mons} are reachable.',
                'warn': 'Monitor(s) {mons} is/are unreachable.',
                'fail': 'Monitor(s) {mons} is/are unreachable.',
                'unknown': 'Probe error: {msg}.'
            })

        try:
            mons = Cluster.get_monitors()
            reachable, unreachable = \
                Cluster._verify_monitor_connectivity(mons)
        except (CephLMException, CephCommandException) as e:
            probe_error = base_result.child(msgkeys={'msg': str(e)})
            probe_error.value = Severity.unknown
            return probe_error

        if unreachable:
            result = base_result.child(
                msgkeys={'mons': ', '.join(unreachable)})
            # 'fail' only when every monitor is unreachable, else 'warn'.
            result.value = Severity.warn if reachable else Severity.fail
        else:
            result = base_result.child(msgkeys={'mons': ', '.join(reachable)})
            result.value = Severity.ok
        return result
Example #3
0
    def test_response_child(self):
        # A child inherits the parent's dimensions, messages and name prefix.
        parent = MetricData(name='name', messages={'a': 'b'})
        parent['test'] = 'test'

        child = parent.child(dimensions={'test2': 'test2'})
        self.assertIn('test', child)
        self.assertIn('test2', child)
        self.assertDictEqual({'a': 'b'}, child.messages)
        self.assertEqual('cephlm.name', child.name)

        # With no extra dimensions, only the parent's keys are present.
        bare = parent.child()
        self.assertIn('test', bare)
        self.assertNotIn('test2', bare)
Example #4
0
    def check_hpssacli():
        """
        Checks controller and drive information with hpssacli [Run as root]

        Returns a list of MetricData results, one per metric reported by
        the external swiftlm hpssacli module.

        Raises:
            CephLMException: if the hpssacli module fails for any reason.
        """
        base_result = MetricData(name='cephlm.hpssacli',
                                 messages=hpssacli.BASE_RESULT.messages)
        HPssaCli._override_plugin_settings()
        try:
            results = hpssacli.main()
        except Exception as e:
            # Unlike other parameters, we do not know the list of metrics here.
            # Hence there is no way to set each of them to error. Instead we
            # raise an exception which will be handled by the generic
            # cephlm-probe exception handler. Embed the original error so the
            # root cause is not lost.
            msg = ("Unknown exception occurred when "
                   "executing swiftlm hpssacli module: %s" % e)
            raise CephLMException(msg)

        ceph_results = list()
        for entry in results:
            # Extract the main metric name, and strip off the parent hierarchy
            # E.g., swiftlm.hp_hardware.hpssacli.smart_array to smart_array
            name = entry.name.split('hpssacli.', 1)[1]
            # Clone the dimensions excluding entries pointing to external
            # service references
            dimensions = {
                key: value
                for key, value in entry.dimensions.iteritems()
                if key not in ['service']
            }
            # Convert external metric class to cephlm metric class
            result = base_result.child(name=name, dimensions=dimensions)
            result.value = HPssaCli._get_severity_level(entry.value)
            ceph_results.append(result)
        return ceph_results
Example #5
0
 def quorum_status():
     """
     Reports the status of monitor quorum.

     Severity is 'ok' when all monitors are in quorum, 'warn' when some
     are missing (named in the message), 'fail' on command timeout (the
     configured mon_host is reported), and 'unknown' on command failure.
     """
     base_result = MetricData(
         name='cephlm.monitor.quorum',
         messages={
             'ok': 'Monitors are in quorum.',
             'warn': 'Monitors ({msg}) is/are not in quorum.',
             'fail': 'Monitors ({msg}) have not formed quorum.',
             'unknown': 'Probe error: {msg}.'
         })
     msg = ''
     value = Severity.ok
     try:
         output = Monitor.get_quorum_status()
         quorum = output['quorum']
         monitors = output['monmap']['mons']
         if len(quorum) < len(monitors):
             value = Severity.warn
             # Name every monitor whose rank is absent from the quorum,
             # joined with ', ' (replaces the old append-then-slice loop).
             msg = ', '.join(mon['name'] for mon in monitors
                             if mon['rank'] not in quorum)
     except CephCommandTimeoutException:
         # A timeout typically means no quorum could be formed at all.
         value = Severity.fail
         cluster_name, config, config_file = Ceph._get_ceph_config()
         msg = config.get('global', 'mon_host')
     except CephCommandException as e:
         value = Severity.unknown
         msg = str(e)
     result = base_result.child(msgkeys={'msg': msg})
     result.value = value
     return result
Example #6
0
    def check_nic_speed():
        """
        Checks for optimal nic speed requirement in a ceph node [Run as root]
        """
        base_result = MetricData(name='cephlm.perfscale.nic_speed',
                                 messages={
                                     'ok': '{msg}',
                                     'warn': '{msg}',
                                     'unknown': 'Probe error: {msg}'
                                 })
        try:
            nic_info = get_nic_info()
            ceph_bindings = get_ceph_bind_ips()
        except CephCommandException as e:
            probe_error = base_result.child(msgkeys={'msg': str(e)})
            probe_error.value = Severity.unknown
            return probe_error

        # Public IP always exists for a ceph node irrespective of the
        # network model: it is the network on which ceph client calls are
        # made. Private (cluster) IP exists only for OSD nodes deployed
        # under the multi-network model.
        public_ip = ceph_bindings.get('public_ip', None)
        private_ip = ceph_bindings.get('private_ip', None)

        nic_speeds = PerfScale._process_nic_speed(public_ip, private_ip,
                                                  nic_info)
        shared_external_net = PerfScale._has_shared_external_networks(
            public_ip, private_ip, nic_info)

        metrics = list()
        for binding, ip in ceph_bindings.items():
            severity, msg = PerfScale._format_nic_speed_status(
                ip, nic_speeds[ip], shared_external_net)
            metric = base_result.child(msgkeys={'msg': msg})
            # e.g. binding 'public_ip' -> 'cephlm.perfscale.nic_speed_public'
            metric.name = 'cephlm.perfscale.nic_speed_%s' \
                          % binding.replace('_ip', '')
            metric.value = severity
            metrics.append(metric)

        return metrics
Example #7
0
    def check_osd_node_ram():
        """
        Checks for optimal memory requirement in a Ceph OSD node [Run as root]

        Severity is 'warn' when host RAM violates the GiB-per-TiB-of-data
        guideline, 'ok' otherwise, and 'unknown' on probe failure. Returns
        an empty list on nodes with no OSD data disks.
        """
        base_result = MetricData(
            name='cephlm.perfscale.osd_node_ram',
            messages={
                'ok':
                'Host RAM({ram}GiB) meets %s GiB per TiB of data disk'
                '({total_osd_size}TiB) guideline.' %
                PerfScale.GiB_PER_TiB_DATA,
                'warn':
                'Host RAM({ram}GiB) violates %s GiB per TiB of data disk'  # noqa
                '({total_osd_size}TiB) guideline.' %
                PerfScale.GiB_PER_TiB_DATA,
                'unknown':
                'Probe error: {msg}'
            })

        try:
            journal_disks, data_disks = Ceph.get_ceph_disk_list()
            mem_info = get_system_memory_info()
            disks_info = get_system_disks_size()
        except (CephLMException, CephCommandException) as e:
            result = base_result.child(msgkeys={'msg': str(e)})
            result.value = Severity.unknown
            return result

        if not data_disks:
            # Ideally this check will not be run on non OSD nodes, but in case
            # it does, we return an empty list. Guard BEFORE processing so we
            # never compute RAM guidance against an empty disk set.
            return list()

        total_osd_size, ram = PerfScale._process_osd_ram_data(
            data_disks, disks_info, mem_info)

        result = base_result.child(msgkeys={
            'ram': '%s' % ram,
            'total_osd_size': '%s' % total_osd_size
        })
        result.value = PerfScale._process_osd_ram_status(total_osd_size, ram)
        return result
Example #8
0
    def test_child_msgkeys(self):
        # Both dimensions and msgkeys feed message interpolation.
        parent = MetricData(name='name',
                            messages={
                                'ok': 'test message',
                                'test':
                                'test with meta {test_value} and {test_value2}',
                            })

        child = parent.child(dimensions={'test_value': '123'},
                             msgkeys={'test_value2': '456'})
        child.message = 'test'

        self.assertEqual('test with meta 123 and 456', str(child))
Example #9
0
    def check_status():
        """
        Reports the status of the rados gateway service.
        """
        base_result = MetricData(
            name='cephlm.radosgw.status',
            messages={
                'ok': 'Radosgw ({ip_port}) is in healthy state.',
                'fail': 'Radosgw ({ip_port}) is in error state.',
                'unknown': 'Probe error: {msg}.'
            })

        try:
            endpoint = Radosgw._fetch_radosgw_ip_port()
            is_healthy = Radosgw.get_status(endpoint)
        except (CephLMException, CephCommandException) as e:
            probe_error = base_result.child(msgkeys={'msg': str(e)})
            probe_error.value = Severity.unknown
            return probe_error

        result = base_result.child(msgkeys={'ip_port': endpoint})
        if is_healthy:
            result.value = Severity.ok
        else:
            result.value = Severity.fail
        return result
Example #10
0
    def check_osd_journal_ratio():
        """
        Checks the ratio of osd disks mapped to journal disks.
        """
        base_result = MetricData(name='cephlm.osd.osd_journal_ratio',
                                 messages={
                                     'ok':
                                     'OSDs abide %s:1 OSD to Journal ratio' %
                                     OSD.OPTIMAL_OSD_PER_JOURNAL,
                                     'warn':
                                     '{msg}',
                                     'unknown':
                                     'Probe error: {msg}'
                                 })
        try:
            journal_disks, data_disks = OSD.get_ceph_disk_list()
        except (exc.CephLMException, exc.CephCommandException) as e:
            probe_error = base_result.child(msgkeys={'msg': str(e)})
            probe_error.value = Severity.unknown
            return probe_error

        # Warn on any disk that carries both a journal and a data partition.
        shared_osd_journals = \
            set(journal_disks.keys()) & set(data_disks.keys())

        # Warn on any journal disk serving more OSDs than the recommended
        # limit.
        non_optimal_disks = {
            journal: osds
            for journal, osds in journal_disks.iteritems()
            if len(osds) > OSD.OPTIMAL_OSD_PER_JOURNAL
        }

        return OSD._process_journal_status(base_result, shared_osd_journals,
                                           non_optimal_disks)