Example #1
0
 def quorum_status():
     """
     Report whether the Ceph monitors have formed a complete quorum.

     The severity is ``ok`` unless the quorum lists fewer members than the
     monmap, in which case it is ``warn`` and the message names the
     monitors whose rank is missing from the quorum. A command timeout
     yields ``fail`` (the message carries the configured ``mon_host``) and
     any other command failure yields ``unknown``.
     """
     base_result = MetricData(
         name='cephlm.monitor.quorum',
         messages={
             'ok': 'Monitors are in quorum.',
             'warn': 'Monitors ({msg}) is/are not in quorum.',
             'fail': 'Monitors ({msg}) have not formed quorum.',
             'unknown': 'Probe error: {msg}.'
         })
     severity = Severity.ok
     detail = ''
     try:
         status = Monitor.get_quorum_status()
         in_quorum = status['quorum']
         all_mons = status['monmap']['mons']
         if len(in_quorum) < len(all_mons):
             severity = Severity.warn
             # List every monitor whose rank is absent from the quorum
             detail = ', '.join(mon['name'] for mon in all_mons
                                if mon['rank'] not in in_quorum)
     except CephCommandTimeoutException:
         severity = Severity.fail
         # Only the parsed config object is needed from the returned triple
         _cluster, config, _conf_file = Ceph._get_ceph_config()
         detail = config.get('global', 'mon_host')
     except CephCommandException as e:
         severity = Severity.unknown
         detail = str(e)
     result = base_result.child(msgkeys={'msg': detail})
     result.value = severity
     return result
Example #2
0
    def check_status():
        """
        Report the overall health of the ceph cluster.

        The health section of ``Cluster.get_status()`` is translated into a
        severity and a message. If the status cannot be fetched, a metric
        with ``unknown`` severity carrying the error text is returned.
        """
        base_result = MetricData(name='cephlm.cluster.status',
                                 messages={
                                     'ok': 'Cluster is in healthy state.',
                                     'warn':
                                     'Cluster is in warning state: {msg}.',
                                     'fail':
                                     'Cluster is in error state: {msg}.',
                                     'unknown': 'Probe error: {msg}.'
                                 })

        try:
            cluster_status = Cluster.get_status()
        except (CephLMException, CephCommandException,
                CephCommandTimeoutException) as e:
            probe_error = base_result.child(msgkeys={'msg': str(e)})
            probe_error.value = Severity.unknown
            return probe_error

        health = cluster_status['health']
        overall = health['overall_status']
        detail = Cluster._process_status_message(overall, health['summary'])

        result = base_result.child(msgkeys={'msg': detail})
        result.value = Cluster._process_status(overall)
        return result
Example #3
0
    def check_hpssacli():
        """
        Checks controller and drive information with hpssacli [Run as root]

        Runs the external hpssacli plugin and converts every metric it
        returns into a cephlm metric: the name is reduced to the part after
        ``hpssacli.``, the ``service`` dimension is dropped and the value is
        mapped through ``HPssaCli._get_severity_level``.

        Returns a list of the converted metrics.
        Raises CephLMException if the plugin raises any exception; the
        message includes the original exception so the cause is not lost.
        """
        base_result = MetricData(name='cephlm.hpssacli',
                                 messages=hpssacli.BASE_RESULT.messages)
        HPssaCli._override_plugin_settings()
        try:
            results = hpssacli.main()
        except Exception as e:
            # Unlike other parameters, we do not know the list of metrics here.
            # Hence there is no way to set each of them to error. Instead we
            # raise an exception which will be handled by the generic
            # cephlm-probe exception handler. The repr of the original error
            # is appended so the failure can actually be diagnosed.
            msg = "Unknown exception occured when " \
                  "executing swiftlm hpssacli module: %r" % (e,)
            raise CephLMException(msg)

        ceph_results = list()
        for entry in results:
            # Extract the main metric name, and strip off the parent hierarchy
            # E.g., swiftlm.hp_hardware.hpssacli.smart_array to smart_array
            name = entry.name.split('hpssacli.', 1)[1]
            # Clone the dimensions excluding entries pointing to external
            # service references
            dimensions = {
                key: value
                for key, value in entry.dimensions.iteritems()
                if key != 'service'
            }
            # Convert external metric class to cephlm metric class
            result = base_result.child(name=name, dimensions=dimensions)
            result.value = HPssaCli._get_severity_level(entry.value)
            ceph_results.append(result)
        return ceph_results
Example #4
0
    def check_monitor_connectivity():
        """
        Report which Monitor hosts can be reached from this node.

        Severity is ``ok`` when no monitor is unreachable, ``warn`` when
        only some are unreachable and ``fail`` when none can be reached.
        Errors while listing or probing the monitors yield ``unknown``.
        """
        base_result = MetricData(name='cephlm.connectivity.status',
                                 messages={
                                     'ok': 'Monitors {mons} are reachable.',
                                     'warn':
                                     'Monitor(s) {mons} is/are unreachable.',
                                     'fail':
                                     'Monitor(s) {mons} is/are unreachable.',
                                     'unknown': 'Probe error: {msg}.'
                                 })

        try:
            monitors = Cluster.get_monitors()
            reachable, unreachable = \
                Cluster._verify_monitor_connectivity(monitors)
        except (CephLMException, CephCommandException) as e:
            probe_error = base_result.child(msgkeys={'msg': str(e)})
            probe_error.value = Severity.unknown
            return probe_error

        if unreachable:
            # At least one monitor is down; it is a total failure only when
            # nothing at all could be reached
            result = base_result.child(
                msgkeys={'mons': ', '.join(unreachable)})
            result.value = Severity.warn if reachable else Severity.fail
            return result

        result = base_result.child(msgkeys={'mons': ', '.join(reachable)})
        result.value = Severity.ok
        return result
Example #5
0
    def test_response_child(self):
        # A child carries over the parent's dimensions, messages and name
        parent = MetricData(name='name', messages={'a': 'b'})
        parent['test'] = 'test'

        with_extra = parent.child(dimensions={'test2': 'test2'})
        for key in ('test', 'test2'):
            self.assertIn(key, with_extra)
        self.assertDictEqual({'a': 'b'}, with_extra.messages)
        self.assertEqual('cephlm.name', with_extra.name)

        # Dimensions given to one child must not show up in a later child
        plain = parent.child()
        self.assertIn('test', plain)
        self.assertNotIn('test2', plain)
Example #6
0
    def test_child_msgkeys(self):
        # Message placeholders may be filled from both dimensions and msgkeys
        templates = {
            'ok': 'test message',
            'test': 'test with meta {test_value} and {test_value2}',
        }
        parent = MetricData(name='name', messages=templates)

        child = parent.child(dimensions={'test_value': '123'},
                             msgkeys={'test_value2': '456'})
        child.message = 'test'

        self.assertEqual('test with meta 123 and 456', str(child))
Example #7
0
def main():
    """
    Run every selected check and emit the collected metrics.

    Each callable in ``args.selected`` is invoked; a returned MetricData
    (or non-empty list whose first item is a MetricData) is converted with
    ``metric()``. Any exception raised by a check is turned into a
    ``cephlm.probe.failure`` metric with ``fail`` severity. Exactly one
    ``cephlm.probe.failure`` measurement is reported per run: the last
    failure if any check failed, otherwise an ``ok`` one. The metrics are
    finally handed to the formatter chosen by ``args.format``.
    """
    args = parse_args()
    metrics = []
    probe_dimensions = {
        'component': 'cephlm-probe',
        'service': 'ceph-storage'
    }

    for func in args.selected:
        try:
            r = func()
            if isinstance(r, list) and r and isinstance(r[0], MetricData):
                metrics.extend([result.metric() for result in r])
            elif isinstance(r, MetricData):
                metrics.append(r.metric())
        except:  # noqa
            # Deliberately catch everything: one broken check must never
            # prevent the remaining checks from being reported.
            # sys.exc_info() is unpacked directly into the call so that the
            # traceback object is not kept alive in a local variable.
            backtrace = ' '.join(
                traceback.format_exception(*sys.exc_info()))
            # Name the failing check first and the error after it; the
            # placeholders used to be reversed, producing
            # "<backtrace> failed with: <module>".
            r = MetricData.single('cephlm.probe.failure',
                                  Severity.fail,
                                  '{check} failed with: {error}',
                                  dimensions=dict(probe_dimensions),
                                  msgkeys={
                                      'check': func.__module__,
                                      'error': backtrace.replace('\n', ' ')
                                  })
            metrics.append(r.metric())

    # There is no point in reporting multiple measurements of
    # cephlm.probe.failure metric in same cycle.
    check_failures_found = [
        metric for metric in metrics
        if metric.get('metric') == 'cephlm.probe.failure'
    ]
    if check_failures_found:
        # Remove all except one instance
        for metric in check_failures_found[:-1]:
            metrics.remove(metric)
    else:
        r = MetricData.single('cephlm.probe.failure',
                              Severity.ok,
                              'ok',
                              dimensions=dict(probe_dimensions))
        metrics.append(r.metric())

    FORMATS[args.format](metrics, args.pretty)
Example #8
0
    def test_create_metricdata(self):
        # A freshly created metric gets the cephlm prefix, has no message
        # or value yet, and already carries a hostname dimension
        metric = MetricData(name='name', messages={})

        self.assertEqual('cephlm.name', metric.name)
        self.assertEqual('', metric.message)
        self.assertEqual(None, metric.value)
        self.assertIn('hostname', metric.dimensions)
Example #9
0
 def pool_stats():
     """
     Publish one metric per pool statistic.

     Returns a list of ``cephlm.pool.*`` metrics. When the pool statistics
     cannot be collected every metric gets the value -1 and the error text
     as its message.
     """
     INVALID_VALUE = -1
     # metric name suffix -> key passed on to the Pool helper
     metric_dict = {
         'count': 'count',
         'total_objects': 'objects',
         'usage_bytes': 'size_bytes',
         'top_three_by_usage_bytes': 'top_pools_by_size',
         'top_three_by_objects': 'top_pools_by_objects',
     }
     failure_msg = None
     try:
         pool_dict = Pool._stats()
     except (exc.CephLMException, exc.CephCommandException,
             exc.CephCommandTimeoutException) as e:
         failure_msg = str(e)

     result = []
     for metric_name, state in metric_dict.iteritems():
         if failure_msg is not None:
             msg, value = failure_msg, INVALID_VALUE
         else:
             # "top three" metrics are ranked per pool, the rest are totals
             if 'top_three' in metric_name:
                 handler = Pool._pools_by_metric
             else:
                 handler = Pool._return_total_metrics
             msg, value = handler(pool_dict, state)
         result.append(MetricData.single("cephlm.pool.%s" % metric_name,
                                         value, message=msg))
     return result
Example #10
0
    def osd_stats():
        """
        Publish one count metric per OSD state.

        Returns a list of ``cephlm.osd.<state>_count`` metrics. When the
        OSD statistics cannot be collected every metric gets the value -1
        and a fixed probe error message.
        """
        INVALID_VALUE = -1
        # state name -> helper returning (count, description) for that state
        counters = {
            'up': OSD._up_count,
            'up_out': OSD._up_out_count,
            'down': OSD._down_count,
            'down_in': OSD._down_in_count,
            'total': OSD._total_count
        }
        probe_ok = True
        try:
            stats = OSD._stats()
        except (exc.CephLMException, exc.CephCommandException,
                exc.CephCommandTimeoutException):
            probe_ok = False

        result = []
        for state, counter in counters.iteritems():
            if probe_ok:
                value, names = counter(stats)
                prefix = "OSD(s) %s" % names if names else "No OSD(s)"
                if state == 'total':
                    suffix = " is/are in cluster"
                else:
                    suffix = " is/are %s" % state
                msg = prefix + suffix
            else:
                value = INVALID_VALUE
                msg = "Probe error: Command 'ceph osd tree' failed"
            result.append(MetricData.single("cephlm.osd.%s_count" % state,
                                            value, message=msg))
        return result
Example #11
0
    def check_nic_speed():
        """
        Check the NIC speed behind each ceph bind address [Run as root]

        Returns one ``cephlm.perfscale.nic_speed_<binding>`` metric per
        entry of the ceph bind addresses, or a single ``unknown`` metric
        when the NIC or binding information cannot be collected.
        """
        base_result = MetricData(name='cephlm.perfscale.nic_speed',
                                 messages={
                                     'ok': '{msg}',
                                     'warn': '{msg}',
                                     'unknown': 'Probe error: {msg}'
                                 })
        try:
            nic_info = get_nic_info()
            ceph_bindings = get_ceph_bind_ips()
        except CephCommandException as e:
            probe_error = base_result.child(msgkeys={'msg': str(e)})
            probe_error.value = Severity.unknown
            return probe_error

        # Public IP will always exist for a ceph node irrespective of the
        # network model; it is the network on which ceph client calls are
        # made. Private (cluster) IP will exist only for OSD nodes provided
        # the deployment follows the multi-network model.
        public_ip = ceph_bindings.get('public_ip')
        private_ip = ceph_bindings.get('private_ip')

        speeds = PerfScale._process_nic_speed(public_ip, private_ip,
                                              nic_info)
        shared = PerfScale._has_shared_external_networks(
            public_ip, private_ip, nic_info)

        metrics = []
        for binding, ip in ceph_bindings.items():
            severity, msg = PerfScale._format_nic_speed_status(
                ip, speeds[ip], shared)
            metric = base_result.child(msgkeys={'msg': msg})
            # 'public_ip' -> 'nic_speed_public', and so on
            suffix = binding.replace('_ip', '')
            metric.name = 'cephlm.perfscale.nic_speed_%s' % suffix
            metric.value = severity
            metrics.append(metric)

        return metrics
Example #12
0
    def test_dict_behaviour(self):
        metric = MetricData(name='name', messages={})

        # Dimension values must be strings, so a number assigned through
        # item access has to come back converted
        metric['test'] = 1000
        self.assertEqual('1000', metric['test'])

        # Deleting the item removes the dimension again
        del metric['test']
        self.assertNotIn('test', metric)
Example #13
0
    def check_osd_node_ram():
        """
        Checks for optimal memory requirement in a Ceph OSD node [Run as root]

        Returns a single metric whose severity comes from
        ``PerfScale._process_osd_ram_status``, a single ``unknown`` metric
        when disk or memory information cannot be collected, or an empty
        list when the node has no data disks.
        """
        base_result = MetricData(
            name='cephlm.perfscale.osd_node_ram',
            messages={
                'ok':
                'Host RAM({ram}GiB) meets %s GiB per TiB of data disk'
                '({total_osd_size}TiB) guideline.' %
                PerfScale.GiB_PER_TiB_DATA,
                'warn':
                'Host RAM({ram}GiB) violates %s GiB per TiB of data disk'  # noqa
                '({total_osd_size}TiB) guideline.' %
                PerfScale.GiB_PER_TiB_DATA,
                'unknown':
                'Probe error: {msg}'
            })

        try:
            journal_disks, data_disks = Ceph.get_ceph_disk_list()
            mem_info = get_system_memory_info()
            disks_info = get_system_disks_size()
        except (CephLMException, CephCommandException) as e:
            result = base_result.child(msgkeys={'msg': str(e)})
            result.value = Severity.unknown
            return result

        if not data_disks:
            # Ideally this check will not be run on non OSD nodes, but in case
            # it does, we return an empty list. The guard comes before the
            # RAM/size computation so that the computation is never attempted
            # with an empty disk list.
            return list()

        total_osd_size, ram = PerfScale._process_osd_ram_data(
            data_disks, disks_info, mem_info)

        result = base_result.child(msgkeys={
            'ram': '%s' % ram,
            'total_osd_size': '%s' % total_osd_size
        })
        result.value = PerfScale._process_osd_ram_status(total_osd_size, ram)
        return result
Example #14
0
    def check_status():
        """
        Report whether the rados gateway service is healthy.

        Severity is ``ok`` when ``Radosgw.get_status`` reports success for
        the gateway address and ``fail`` otherwise. Errors while finding the
        address or querying the gateway yield ``unknown``.
        """
        base_result = MetricData(
            name='cephlm.radosgw.status',
            messages={
                'ok': 'Radosgw ({ip_port}) is in healthy state.',
                'fail': 'Radosgw ({ip_port}) is in error state.',
                'unknown': 'Probe error: {msg}.'
            })

        try:
            ip_port = Radosgw._fetch_radosgw_ip_port()
            healthy = Radosgw.get_status(ip_port)
        except (CephLMException, CephCommandException) as e:
            probe_error = base_result.child(msgkeys={'msg': str(e)})
            probe_error.value = Severity.unknown
            return probe_error

        result = base_result.child(msgkeys={'ip_port': ip_port})
        if healthy:
            result.value = Severity.ok
        else:
            result.value = Severity.fail
        return result
Example #15
0
class HPssaCliData:
    """Canned hpssacli plugin metrics for use as test fixtures."""

    # Parent metric from which all mock children below are derived
    MOCK_BASE_RESULT = MetricData(
        name='swiftlm.hpssacli',
        messages={
            'no_battery': 'No cache battery',
            'unknown': 'hpssacli command failed',
            'controller_status': '{sub_component} status is {status}',
            'in_hba_mode':
            'Controller is in HBA mode; performance will be poor',
            'physical_drive':
            'Drive {serial_number}: {box}:{bay} has status: {status}',
            'l_drive': 'Logical Drive {logical_drive} has status: {status}',
            'l_cache':
            'Logical Drive {logical_drive} has cache status: {caching}',
            'ok': 'OK',
            'fail': 'FAIL',
        }
    )

    # Child carrying a plain numeric value
    MOCK_CHILD_FLOAT = MOCK_BASE_RESULT.child()
    MOCK_CHILD_FLOAT.name = 'swiftlm.hp_hardware.hpssacli.smart_array.firmware'
    MOCK_CHILD_FLOAT.value = 3.0
    MOCK_CHILD_FLOAT.dimensions = dict(
        component='controller',
        controller_slot='1',
        hostname='ardana-ccp-ceph0001-clm',
        model='Smart HBA H240',
        service='object-storage')

    # Child carrying an ``ok`` severity
    MOCK_CHILD_OK = MOCK_BASE_RESULT.child()
    MOCK_CHILD_OK.name = 'swiftlm.hp_hardware.hpssacli.smart_array'
    MOCK_CHILD_OK.value = Severity.ok
    MOCK_CHILD_OK.dimensions = dict(
        component='controller',
        sub_component='controller_not_hba_mode',
        controller_slot='1',
        hostname='ardana-ccp-ceph0001-clm',
        model='Smart HBA H240',
        service='object-storage')

    # Child carrying a ``fail`` severity
    MOCK_CHILD_FAIL = MOCK_BASE_RESULT.child()
    MOCK_CHILD_FAIL.name = 'swiftlm.hp_hardware.hpssacli.smart_array'
    MOCK_CHILD_FAIL.value = Severity.fail
    MOCK_CHILD_FAIL.dimensions = dict(
        component='controller',
        sub_component='battery_capacitor_status',
        controller_slot='1',
        hostname='ardana-ccp-ceph0001-clm',
        model='Smart HBA H240',
        service='object-storage')

    # What the mocked plugin call returns
    MOCK_RESPONSE = [MOCK_CHILD_FLOAT, MOCK_CHILD_OK, MOCK_CHILD_FAIL]
Example #16
0
    def check_osd_journal_ratio():
        """
        Check how OSD data disks are mapped onto journal disks.

        Collects the disks that hold both a journal and a data partition,
        and the journal disks serving more than
        ``OSD.OPTIMAL_OSD_PER_JOURNAL`` entries, then delegates to
        ``OSD._process_journal_status``. A failure to list the disks yields
        a single ``unknown`` metric.
        """
        base_result = MetricData(name='cephlm.osd.osd_journal_ratio',
                                 messages={
                                     'ok':
                                     'OSDs abide %s:1 OSD to Journal ratio' %
                                     OSD.OPTIMAL_OSD_PER_JOURNAL,
                                     'warn':
                                     '{msg}',
                                     'unknown':
                                     'Probe error: {msg}'
                                 })
        try:
            journal_disks, data_disks = OSD.get_ceph_disk_list()
        except (exc.CephLMException, exc.CephCommandException) as e:
            probe_error = base_result.child(msgkeys={'msg': str(e)})
            probe_error.value = Severity.unknown
            return probe_error

        # Disks carrying both a journal and a data partition put the
        # metric into warning state
        shared_osd_journals = set(journal_disks).intersection(data_disks)

        # So do journal disks that serve more OSDs than the recommended
        # limit
        non_optimal_disks = {}
        for disk, osds in journal_disks.iteritems():
            if len(osds) > OSD.OPTIMAL_OSD_PER_JOURNAL:
                non_optimal_disks[disk] = osds

        return OSD._process_journal_status(base_result, shared_osd_journals,
                                           non_optimal_disks)
Example #17
0
 def pg_stats():
     """
     Publish the placement group count as a single metric.

     The value is the ``count`` entry of the PG statistics and the message
     lists the remaining entries as ``state=count`` pairs. When the
     statistics cannot be collected the value is -1 and the message holds
     the probe error.
     """
     INVALID_VALUE = -1
     try:
         stats = PG._stats()
     except (exc.CephLMException, exc.CephCommandException,
             exc.CephCommandTimeoutException) as e:
         value = INVALID_VALUE
         msg = 'Probe error: ' + str(e)
     else:
         # 'count' is the metric value; everything left is per-state detail
         value = stats.pop('count')
         msg = ', '.join('%s=%s' % (pg_state, count)
                         for pg_state, count in stats.iteritems())
     return MetricData.single("cephlm.pg.count", value, message=msg)
Example #18
0
 def capacity_stats():
     """
     Publish one metric per cluster capacity figure.

     Returns a list of ``cephlm.capacity.*`` metrics. When the capacity
     statistics cannot be collected every metric gets the value -1 and the
     error text as its message; otherwise the message is empty.
     """
     INVALID_VALUE = -1
     metric_names = ('total_bytes', 'used_bytes',
                     'available_bytes', 'perc_utilization')
     try:
         capacity = Capacity._stats()
     except (exc.CephLMException, exc.CephCommandException,
             exc.CephCommandTimeoutException) as e:
         failure = str(e)
         return [
             MetricData.single("cephlm.capacity.%s" % metric_name,
                               INVALID_VALUE, message=failure)
             for metric_name in metric_names
         ]
     return [
         MetricData.single("cephlm.capacity.%s" % metric_name,
                           capacity[metric_name], message='')
         for metric_name in metric_names
     ]
Example #19
0
    def test_message(self):
        templates = {
            'ok': 'test message',
            'test': 'test with meta {test_value} and {test_value2}',
        }
        metric = MetricData(name='name', messages=templates)

        # Assigning a Severity as the value selects the matching message
        # automatically; before that the message is empty
        self.assertEqual('', metric.message)
        metric.value = Severity.ok
        self.assertEqual('test message', metric.message)

        # Choosing a message whose placeholders cannot all be filled yet
        # must raise an error
        with self.assertRaisesRegexp(ValueError, 'requires a dimension or'):
            metric.message = 'test'

        # Once a dimension and a msgkey supply both placeholders, the
        # message can be selected and rendered
        metric['test_value'] = '123'
        metric.msgkey('test_value2', '456')
        metric.message = 'test'

        self.assertEqual('test with meta 123 and 456', str(metric))
Example #20
0
    def test_equality_behaviour(self):
        def identical_pair():
            # Two metrics built from exactly the same arguments
            return (MetricData('name', self.messages, self.dimensions),
                    MetricData('name', self.messages, self.dimensions))

        # Same name, messages and dimensions compare equal
        left, right = identical_pair()
        self.assertEqual(left, right)

        # A different name breaks equality
        left = MetricData('name', self.messages, self.dimensions)
        right = MetricData('not-name', self.messages, self.dimensions)
        self.assertNotEqual(left, right)

        # The message dictionaries are ignored
        left = MetricData('name', {'a': 1}, self.dimensions)
        right = MetricData('name', {'b': 2}, self.dimensions)
        self.assertEqual(
            left, right, 'Message dictionaries should not '
            'affect equality of MetricData instances')

        # Different dimensions break equality
        left = MetricData('name', self.messages, self.dimensions)
        right = MetricData('name', self.messages, {})
        self.assertNotEqual(left, right)

        # Different selected messages break equality
        left, right = identical_pair()
        left.message = 'ok'
        right.message = 'fail'
        self.assertNotEqual(left, right)

        # Different values break equality
        left, right = identical_pair()
        left.value = 1
        right.value = 2
        self.assertNotEqual(left, right)