import logging

# import path assumes the usual ocs-ci project layout
from ocs_ci.utility.prometheus import PrometheusAPI

logger = logging.getLogger(__name__)


def test_monitoring_shows_osd_down(measure_stop_ceph_osd):
    """
    Make sure simple problems with OSD daemons are reported via OCP
    Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60

    affected_osd = measure_stop_ceph_osd['result']
    # translate this into ceph daemon name
    ceph_daemon = "osd.{}".format(int(affected_osd[len('rook-ceph-osd-'):]))
    logger.info(
        f"affected osd was {affected_osd}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query='ceph_health_status',
        start=measure_stop_ceph_osd['start'],
        end=measure_stop_ceph_osd['stop'],
        step=15)
    health_validation = prometheus.check_query_range_result(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_delay=expected_delay)
    health_msg = "health status should be affected by missing osd"
    assert health_validation, health_msg

    logger.info("let's check that osd up value was affected")
    osd_up_result = prometheus.query_range(
        query='ceph_osd_up{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_osd['start'],
        end=measure_stop_ceph_osd['stop'],
        step=15)
    osd_up_validation = prometheus.check_query_range_result(
        result=osd_up_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_delay=expected_delay)
    osd_up_msg = "ceph_osd_up value should be affected by missing osd"
    assert osd_up_validation, osd_up_msg

    logger.info("let's check that osd in value was not affected")
    # the osd in value is not affected because we just stopped the osd,
    # we haven't removed it from the cluster
    osd_in_result = prometheus.query_range(
        query='ceph_osd_in{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_osd['start'],
        end=measure_stop_ceph_osd['stop'],
        step=15)
    osd_in_validation = prometheus.check_query_range_result(
        result=osd_in_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1)
    osd_in_msg = "ceph_osd_in value should not be affected by missing osd"
    assert osd_in_validation, osd_in_msg
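

# The assertions above rely on the semantics of
# PrometheusAPI.check_query_range_result(). A minimal sketch of what this
# test assumes those semantics to be is given below; it is an illustration,
# not the actual ocs-ci implementation. It assumes ``result`` uses the
# standard Prometheus "matrix" format:
# [{"metric": {...}, "values": [[timestamp, "value"], ...]}, ...]
def _sketch_check_query_range_result(
        result, good_values, bad_values, exp_metric_num=None, exp_delay=0):
    """
    Sketch: every sample of every matched time series must have a good
    value, except that bad values are tolerated during the first
    ``exp_delay`` seconds (the time monitoring needs to notice a change).
    """
    if exp_metric_num is not None and len(result) != exp_metric_num:
        return False
    for series in result:
        if not series["values"]:
            return False
        start_ts = series["values"][0][0]
        for timestamp, value in series["values"]:
            value = float(value)
            if value in good_values:
                continue
            # a bad value is acceptable only within the delay window
            if value in bad_values and timestamp - start_ts <= exp_delay:
                continue
            return False
    return True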


def test_monitoring_shows_mon_down(measure_stop_ceph_mon):
    """
    Make sure simple problems with MON daemons are reported via OCP
    Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60

    affected_mons = measure_stop_ceph_mon['result']
    # we asked to stop just a single mon ... make this assumption explicit
    assert len(affected_mons) == 1
    affected_mon = affected_mons[0]
    # translate this into ceph daemon name
    ceph_daemon = "mon.{}".format(affected_mon[len('rook-ceph-mon-'):])
    logger.info(
        f"affected mon was {affected_mon}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query='ceph_health_status',
        start=measure_stop_ceph_mon['start'],
        end=measure_stop_ceph_mon['stop'],
        step=15)
    health_validation = prometheus.check_query_range_result(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_delay=expected_delay)
    health_msg = "health status should be affected by missing mon"
    assert health_validation, health_msg

    logger.info("let's check that mon quorum status value was affected")
    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_mon['start'],
        end=measure_stop_ceph_mon['stop'],
        step=15)
    mon_validation = prometheus.check_query_range_result(
        result=mon_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_delay=expected_delay)
    mon_msg = "ceph_mon_quorum_status value should be affected by missing mon"
    assert mon_validation, mon_msg
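

# PrometheusAPI.query_range() is assumed to wrap the standard Prometheus
# /api/v1/query_range HTTP endpoint. A hypothetical standalone equivalent
# is sketched below to show what the tests above are actually asking
# Prometheus for; the URL and token handling here are assumptions, not
# ocs-ci code.
def _sketch_query_range(prometheus_url, token, query, start, end, step):
    """
    Sketch: run a range query and return the list of "matrix" series.
    """
    import requests

    response = requests.get(
        f"{prometheus_url}/api/v1/query_range",
        params={"query": query, "start": start, "end": end, "step": step},
        headers={"Authorization": f"Bearer {token}"},
        # OCP's Prometheus route is often served with a self-signed cert
        verify=False)
    response.raise_for_status()
    payload = response.json()
    assert payload["status"] == "success"
    return payload["data"]["result"]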


def test_monitoring_reporting_ok_when_idle(workload_idle):
    """
    When nothing is happening, OCP Prometheus reports OCS status as OK.

    If this test case fails, the status is either reported wrong or the
    cluster is in a broken state. Either way, a failure here is not good.
    """
    prometheus = PrometheusAPI()

    health_result = prometheus.query_range(
        query='ceph_health_status',
        start=workload_idle['start'],
        end=workload_idle['stop'],
        step=15)
    health_validation = prometheus.check_query_range_result(
        result=health_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1)
    health_msg = "ceph_health_status {} report 0 (health ok) as expected"
    if health_validation:
        health_msg = health_msg.format('does')
        logger.info(health_msg)
    else:
        health_msg = health_msg.format('should')
        logger.error(health_msg)

    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status',
        start=workload_idle['start'],
        end=workload_idle['stop'],
        step=15)
    mon_validation = prometheus.check_query_range_result(
        result=mon_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=workload_idle['result']['mon_num'])
    mon_msg = "ceph_mon_quorum_status {} indicate no problems with quorum"
    if mon_validation:
        mon_msg = mon_msg.format('does')
        logger.info(mon_msg)
    else:
        mon_msg = mon_msg.format('should')
        logger.error(mon_msg)

    osd_validations = []
    for metric in ("ceph_osd_up", "ceph_osd_in"):
        osd_result = prometheus.query_range(
            query=metric,
            start=workload_idle['start'],
            end=workload_idle['stop'],
            step=15)
        osd_validation = prometheus.check_query_range_result(
            result=osd_result,
            good_values=[1],
            bad_values=[0],
            exp_metric_num=workload_idle['result']['osd_num'])
        osd_validations.append(osd_validation)
        osd_msg = "{} metric {} indicate no problems with OSDs"
        if osd_validation:
            osd_msg = osd_msg.format(metric, 'does')
            logger.info(osd_msg)
        else:
            osd_msg = osd_msg.format(metric, 'should')
            logger.error(osd_msg)

    # after logging everything properly, make the test fail if necessary
    # see ERRORs reported in the test log for details
    assert health_validation, health_msg
    assert mon_validation, mon_msg
    osds_msg = "ceph_osd_{up,in} metrics should indicate no OSD issues"
    assert all(osd_validations), osds_msg
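

# For reference, test_monitoring_reporting_ok_when_idle() relies only on
# the following shape of the workload_idle fixture value. The concrete
# numbers below are made up for illustration, not taken from a real run.
_EXAMPLE_WORKLOAD_IDLE = {
    'start': 1585139000,  # unix timestamp of the idle window start
    'stop': 1585139600,   # unix timestamp of the idle window end
    'result': {
        'mon_num': 3,     # number of ceph mon daemons in the cluster
        'osd_num': 3,     # number of ceph osd daemons in the cluster
    },
}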