def test_monitoring_shows_osd_down(measure_stop_ceph_osd):
    """
    Make sure simple problems with OSD daemons are reported via OCP
    Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60

    affected_osd = measure_stop_ceph_osd['result']
    # translate this into ceph daemon name
    ceph_daemon = "osd.{}".format(int(affected_osd[len('rook-ceph-osd-'):]))
    logger.info(
        f"affected osd was {affected_osd}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query='ceph_health_status',
        start=measure_stop_ceph_osd['start'],
        end=measure_stop_ceph_osd['stop'],
        step=15)
    health_validation = check_query_range_result(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_delay=expected_delay)
    health_msg = "health status should be affected by missing osd"

    logger.info("let's check that osd up value was affected")
    osd_up_result = prometheus.query_range(
        query='ceph_osd_up{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_osd['start'],
        end=measure_stop_ceph_osd['stop'],
        step=15)
    osd_up_validation = check_query_range_result(
        result=osd_up_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_delay=expected_delay)
    osd_up_msg = "ceph_osd_up value should be affected by missing osd"

    logger.info("let's check that osd in value was not affected")
    # osd in value is not affected because we just stopped the osd, we
    # haven't removed it from the cluster
    osd_in_result = prometheus.query_range(
        query='ceph_osd_in{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_osd['start'],
        end=measure_stop_ceph_osd['stop'],
        step=15)
    osd_in_validation = check_query_range_result(
        result=osd_in_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1)
    osd_in_msg = "ceph_osd_in value should not be affected by missing osd"

    # checking validation results when all queries are performed makes sure
    # that there is evidence for all queries in the test case logs in case of
    # an assert failure
    assert health_validation, health_msg
    assert osd_up_validation, osd_up_msg
    assert osd_in_validation, osd_in_msg
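# The `check_query_range_result` helper used above is defined elsewhere in
# this repository. The sketch below (under a hypothetical name, based only on
# how the tests in this module call it) illustrates the assumed validation
# logic: every sampled value of every metric should be one of `good_values`,
# samples within `exp_delay` seconds of the start of the range are tolerated
# (monitoring may not have noticed the change yet), and the number of metrics
# should match `exp_metric_num`. This is an illustrative assumption, not the
# actual implementation.
def _check_query_range_result_sketch(
        result, good_values, bad_values, exp_metric_num=None, exp_delay=0):
    """
    Simplified, illustrative re-implementation of the validation helper.
    """
    ok = True
    if exp_metric_num is not None and len(result) != exp_metric_num:
        logger.error("unexpected number of metrics in query range result")
        ok = False
    for metric in result:
        samples = metric["values"]
        if not samples:
            continue
        start_ts = samples[0][0]
        for ts, value in samples:
            if ts - start_ts < exp_delay:
                # still within the grace period, ignore this sample
                continue
            value = int(float(value))
            if value in bad_values or value not in good_values:
                logger.error(f"unexpected value {value} at timestamp {ts}")
                ok = False
    return ok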
def test_monitoring_shows_mon_down(measure_stop_ceph_mon):
    """
    Make sure simple problems with MON daemons are reported via OCP
    Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60

    affected_mons = measure_stop_ceph_mon['result']
    # we asked to stop just a single mon ... make this assumption explicit
    assert len(affected_mons) == 1
    affected_mon = affected_mons[0]
    # translate this into ceph daemon name
    ceph_daemon = "mon.{}".format(affected_mon[len('rook-ceph-mon-'):])
    logger.info(
        f"affected mon was {affected_mon}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query='ceph_health_status',
        start=measure_stop_ceph_mon['start'],
        end=measure_stop_ceph_mon['stop'],
        step=15)
    health_validation = prometheus.check_query_range_result(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_delay=expected_delay)
    health_msg = "health status should be affected by missing mon"
    assert health_validation, health_msg

    logger.info("let's check that mon quorum status value was affected")
    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_mon['start'],
        end=measure_stop_ceph_mon['stop'],
        step=15)
    mon_validation = prometheus.check_query_range_result(
        result=mon_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_delay=expected_delay)
    mon_msg = "ceph_mon_quorum_status value should be affected by missing mon"
    assert mon_validation, mon_msg
def test_mcg_cpu_usage(workload_idle):
    """
    Without any IO workload, cpu utilization of MCG pods should be minimal.
    No pod should utilize more than 0.25 cpu units.
    """
    prometheus = PrometheusAPI()
    cpu_result = prometheus.query_range(
        query=CPU_USAGE_POD + '{namespace="openshift-storage",pod=~"^noobaa.*"}',
        start=workload_idle["start"],
        end=workload_idle["stop"],
        step=15,
    )
    validation = check_query_range_result_limits(
        result=cpu_result,
        good_min=0.0,
        good_max=0.25,
    )
    msg = "No NooBaa pod should utilize over 0.25 cpu units while idle."
    assert validation, msg
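# `check_query_range_result_limits` is another validation helper defined
# elsewhere in this repository. An illustrative sketch of the assumed
# behaviour (hypothetical name, not the actual implementation): every sampled
# value of every metric in the query range result should fall within the
# closed interval [good_min, good_max].
def _check_query_range_result_limits_sketch(result, good_min, good_max):
    ok = True
    for metric in result:
        for ts, value in metric["values"]:
            if not good_min <= float(value) <= good_max:
                logger.error(
                    f"value {value} at timestamp {ts} is outside of "
                    f"[{good_min}, {good_max}]")
                ok = False
    return ok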
def test_workload_rbd(workload_storageutilization_50p_rbd):
    """
    Purpose of this test is to make the workload fixture executed, and
    show how to query prometheus.

    Note that this test is valid only on 3 osd cluster with all pools using
    3 way replication.
    """
    prometheus = PrometheusAPI()
    # Asking for values of `ceph_osd_stat_bytes_used` for every 15s while
    # the workload fixture was utilizing 50% of the OCS storage.
    result_used = prometheus.query_range(
        query="ceph_osd_stat_bytes_used",
        start=workload_storageutilization_50p_rbd["start"],
        end=workload_storageutilization_50p_rbd["stop"],
        step=15,
    )
    # This time, we are asking for total OCS capacity, in the same format
    # as in the previous case (for each OSD).
    result_total = prometheus.query_range(
        query="ceph_osd_stat_bytes",
        start=workload_storageutilization_50p_rbd["start"],
        end=workload_storageutilization_50p_rbd["stop"],
        step=15,
    )
    # Check test assumption that ceph_osd_stat_bytes hasn't changed for each
    # OSD, and that each OSD has the same size.
    osd_stat_bytes = []
    for metric in result_total:
        values = []
        for ts, value in metric["values"]:
            values.append(value)
        assert all(value == values[0] for value in values)
        osd_stat_bytes.append(values[0])
    assert all(value == osd_stat_bytes[0] for value in osd_stat_bytes)
    # Compute expected value of `ceph_osd_stat_bytes_used`, based on the
    # percentage utilized by the fixture.
    percentage = workload_storageutilization_50p_rbd["result"]["target_p"]
    expected_value = int(osd_stat_bytes[0]) * percentage
    # Now we can check the actual usage values from Prometheus.
    at_least_one_value_out_of_range = False
    for metric in result_used:
        name = metric["metric"]["__name__"]
        daemon = metric["metric"]["ceph_daemon"]
        logger.info(f"metric {name} from {daemon}")
        # We are skipping the 1st 10% of the values, as it could take some
        # additional time for all the data to be written everywhere, and
        # during this time utilization value still grows.
        start_index = int(len(metric["values"]) * 0.1)
        logger.info(f"ignoring first {start_index} values")
        for ts, value in metric["values"][:start_index]:
            value = int(value)
            dt = datetime.utcfromtimestamp(ts)
            logger.info(f"ignoring value {value} B at {dt}")
        for ts, value in metric["values"][start_index:]:
            value = int(value)
            dt = datetime.utcfromtimestamp(ts)
            # checking the value, with 10% error margin in each direction
            if expected_value * 0.90 <= value <= expected_value * 1.10:
                logger.info(
                    f"value {value} B at {dt} is within expected range")
            else:
                logger.error(
                    (f"value {value} B at {dt} is outside of expected range"
                     f" {expected_value} B +- 10%"))
                at_least_one_value_out_of_range = True
    assert not at_least_one_value_out_of_range
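# For reference, the tests in this module iterate over `query_range()` output
# assuming the standard Prometheus range-query ("matrix") result structure:
# a list of metrics, each with a label dict under "metric" and a list of
# [timestamp, value] pairs under "values". A made-up example of the shape:
#
# [
#     {
#         "metric": {
#             "__name__": "ceph_osd_stat_bytes_used",
#             "ceph_daemon": "osd.0",
#         },
#         "values": [
#             [1585573900, "1073741824"],
#             [1585573915, "1073741824"],
#         ],
#     },
# ]
#
# Prometheus reports sample values as strings, which is why the code above
# converts them via int() before doing numeric comparisons.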
def test_monitoring_reporting_ok_when_idle(workload_idle):
    """
    When nothing is happening, OCP Prometheus reports OCS status as OK.

    If this test case fails, the status is either reported wrong or the
    cluster is in a broken state. Either way, a failure here is not good.
    """
    prometheus = PrometheusAPI()

    health_result = prometheus.query_range(
        query='ceph_health_status',
        start=workload_idle['start'],
        end=workload_idle['stop'],
        step=15)
    health_validation = check_query_range_result(
        result=health_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1)
    health_msg = "ceph_health_status {} report 0 (health ok) as expected"
    if health_validation:
        health_msg = health_msg.format('does')
        logger.info(health_msg)
    else:
        health_msg = health_msg.format('should')
        logger.error(health_msg)

    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status',
        start=workload_idle['start'],
        end=workload_idle['stop'],
        step=15)
    mon_validation = check_query_range_result(
        result=mon_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=workload_idle['result']['mon_num'])
    mon_msg = "ceph_mon_quorum_status {} indicate no problems with quorum"
    if mon_validation:
        mon_msg = mon_msg.format('does')
        logger.info(mon_msg)
    else:
        mon_msg = mon_msg.format('should')
        logger.error(mon_msg)

    osd_validations = []
    for metric in ("ceph_osd_up", "ceph_osd_in"):
        osd_result = prometheus.query_range(
            query=metric,
            start=workload_idle['start'],
            end=workload_idle['stop'],
            step=15)
        osd_validation = check_query_range_result(
            result=osd_result,
            good_values=[1],
            bad_values=[0],
            exp_metric_num=workload_idle['result']['osd_num'])
        osd_validations.append(osd_validation)
        osd_msg = "{} metric {} indicate no problems with OSDs"
        if osd_validation:
            osd_msg = osd_msg.format(metric, 'does')
            logger.info(osd_msg)
        else:
            osd_msg = osd_msg.format(metric, 'should')
            logger.error(osd_msg)

    # after logging everything properly, make the test fail if necessary
    # see ERRORs reported in the test log for details
    assert health_validation, health_msg
    assert mon_validation, mon_msg
    osds_msg = "ceph_osd_{up,in} metrics should indicate no OSD issues"
    assert all(osd_validations), osds_msg
def test_monitoring_shows_mon_down(measure_stop_ceph_mon):
    """
    Make sure simple problems with MON daemons are reported via OCP
    Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60
    # query resolution step used in this test case (number of seconds)
    query_step = 15

    affected_mons = measure_stop_ceph_mon["result"]
    # we asked to stop just a single mon ... make this assumption explicit
    assert len(affected_mons) == 1
    affected_mon = affected_mons[0]
    # translate this into ceph daemon name
    ceph_daemon = "mon.{}".format(affected_mon[len("rook-ceph-mon-"):])
    logger.info(
        f"affected mon was {affected_mon}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query="ceph_health_status",
        start=measure_stop_ceph_mon["start"],
        end=measure_stop_ceph_mon["stop"],
        step=query_step,
    )
    health_validation = check_query_range_result_enum(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_good_time=measure_stop_ceph_mon["min_downtime"],
        exp_delay=expected_delay,
    )
    health_msg = "health status should be affected by missing mon"

    logger.info("let's check that mon quorum status value was affected")
    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_mon["start"],
        end=measure_stop_ceph_mon["stop"],
        step=query_step,
        validate=False,
    )
    mon_validation = check_query_range_result_enum(
        result=mon_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_good_time=measure_stop_ceph_mon["min_downtime"],
        exp_delay=expected_delay,
    )
    mon_msg = "ceph_mon_quorum_status value should be affected by missing mon"

    # checking validation results when both queries are performed makes sure
    # that there is evidence for both mon and health queries in the test case
    # logs in case of an assert failure
    assert health_validation, health_msg
    assert mon_validation, mon_msg

    # since we don't do strict result validation in the previous query, we
    # are going to check the min. expected size of the reply explicitly,
    # taking into account the min. expected downtime of the affected ceph mon
    assert len(mon_result) == 1, "there should be one metric for one mon"
    min_mon_samples = measure_stop_ceph_mon["min_downtime"] / query_step
    mon_sample_size = len(mon_result[0]["values"])
    assert mon_sample_size >= min_mon_samples
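# The measurement and workload fixtures used in this module (e.g.
# measure_stop_ceph_mon, measure_stop_ceph_osd, workload_idle) are defined
# elsewhere in the repository. Based only on the keys accessed in the tests
# above, each is assumed to return a dict roughly of this shape (values are
# illustrative, and the content of "result" is fixture specific):
#
# {
#     "start": 1585573900,            # unix timestamp, start of measurement
#     "stop": 1585574500,             # unix timestamp, end of measurement
#     "result": ["rook-ceph-mon-a"],  # what was affected or measured
#     "min_downtime": 300,            # min. seconds the daemon was kept down
# }
#
# The "min_downtime" key is passed as `exp_good_time` to
# check_query_range_result_enum(), presumably so that the validation can
# require the "problem" value to be reported for at least that long.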