Example #1
def test_check_query_range_result_exp_metric_num(query_range_result_ok):
    """
    Check that exp_metric_num is checked as expected when specified.
    """
    result1 = check_query_range_result_enum(query_range_result_ok,
                                            good_values=[1],
                                            exp_metric_num=2)
    assert result1, "check should pass when exp_metric_num matches the data"
    result2 = check_query_range_result_enum(query_range_result_ok,
                                            good_values=[1],
                                            exp_metric_num=3)
    assert not result2, "check should fail when exp_metric_num doesn't match"
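For orientation, the query_range_result_ok fixture above stands for a Prometheus query_range matrix: a list of per-metric dicts, each carrying a "values" list of [timestamp, value] pairs, with the value returned as a string. A hypothetical fixture with two metrics, which is why exp_metric_num=2 matches, might look roughly like this (illustrative data only, not the actual fixture):

# hypothetical data, for illustration only: two metrics, every sample
# reporting the good value "1" over a time range sampled at a 15s step
query_range_result_ok = [
    {
        "metric": {"__name__": "ceph_osd_up", "ceph_daemon": "osd.0"},
        "values": [[1585573862, "1"], [1585573877, "1"], [1585573892, "1"]],
    },
    {
        "metric": {"__name__": "ceph_osd_up", "ceph_daemon": "osd.1"},
        "values": [[1585573862, "1"], [1585573877, "1"], [1585573892, "1"]],
    },
]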
Example #2
def test_check_query_range_result_single_error(
        query_range_result_single_error):
    """
    The function finds a single error in query_range_result_single_error data,
    assuming 1 is a good value.
    """
    result1 = check_query_range_result_enum(query_range_result_single_error,
                                            good_values=[1])
    assert not result1
    result2 = check_query_range_result_enum(query_range_result_single_error,
                                            good_values=[1, 0])
    assert result2, "assuming both 1 and 0 are good values, check should pass"
Example #3
def test_check_query_range_result_exp_delay(query_range_result_delay_60s):
    """
    Check that exp_delay is taken into account, so that initial bad values
    are ignored.
    """
    result1 = check_query_range_result_enum(query_range_result_delay_60s,
                                            good_values=[1],
                                            bad_values=[0])
    assert not result1, "without specifying exp_delay, validation should fail"
    result2 = check_query_range_result_enum(query_range_result_delay_60s,
                                            good_values=[1],
                                            bad_values=[0],
                                            exp_delay=60)
    assert result2, "taking exp_delay into account, validation should pass"
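Under the same assumed data shape, a hypothetical query_range_result_delay_60s would report the bad value for roughly the first 60 seconds of the range before settling on the good one, which is why the check only passes once exp_delay=60 tells it to tolerate that initial window:

# hypothetical data, for illustration only: bad value "0" during the first
# ~60s of the range, good value "1" afterwards
query_range_result_delay_60s = [
    {
        "metric": {"__name__": "ceph_health_status"},
        "values": [
            [1585573862, "0"], [1585573877, "0"], [1585573892, "0"],
            [1585573907, "0"], [1585573922, "1"], [1585573937, "1"],
        ],
    },
]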
Example #4
def test_check_query_range_result_simple_fail(query_range_result_ok):
    """
    Assuming 0 is a good value, the validation should fail.
    """
    result = check_query_range_result_enum(query_range_result_ok,
                                           good_values=[0])
    assert not result
Example #5
def test_check_query_range_result_simple(query_range_result_ok):
    """
    The function validates query_range_result_ok data assuming 1 is a good
    value.
    """
    assert check_query_range_result_enum(query_range_result_ok,
                                         good_values=[1])
Example #6
def test_check_query_range_result_exp_good_time(
        query_range_result_bad_last_90s):
    """
    Check that exp_good_time is taken into account, so that bad values are
    ignored if they appear after the good time passes.
    """
    result1 = check_query_range_result_enum(query_range_result_bad_last_90s,
                                            good_values=[1],
                                            bad_values=[0])
    assert not result1, "without exp_good_time, validation should fail"
    result2 = check_query_range_result_enum(
        query_range_result_bad_last_90s,
        good_values=[1],
        bad_values=[0],
        exp_good_time=150,
    )
    assert result2, "taking exp_good_time into account, validation should pass"
Example #7
def test_check_query_range_result_null():
    """
    The function doesn't throw any exception and returns True when executed
    with empty arguments.
    """
    assert check_query_range_result_enum({}, [])
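Taken together, the unit tests above pin down the behaviour of check_query_range_result_enum. The following is a minimal sketch of a checker with that behaviour, written here purely as an illustration; it is not the ocs-ci implementation, and the signature and edge-case handling are assumptions inferred from the calls in these tests:

def check_query_range_result_enum(
    result,
    good_values,
    bad_values=(),
    exp_metric_num=None,
    exp_delay=None,
    exp_good_time=None,
):
    """
    Illustrative sketch only (not the ocs-ci implementation): validate a
    Prometheus query_range result where every sample is expected to report
    one of the enum-like good values.
    """
    # when the expected number of metrics is given, it must match exactly
    if exp_metric_num is not None and len(result) != exp_metric_num:
        return False
    is_ok = True
    for metric in result:
        samples = metric["values"]
        if not samples:
            continue
        start_ts = samples[0][0]
        for ts, value in samples:
            offset = ts - start_ts
            # tolerate bad values during the initial monitoring delay
            if exp_delay is not None and offset < exp_delay:
                continue
            # ignore values once the expected good time has passed
            if exp_good_time is not None and offset >= exp_good_time:
                continue
            # bad_values is accepted for signature compatibility; in this
            # sketch anything outside good_values counts as a failure
            if int(value) not in good_values:
                is_ok = False
    return is_ok

The integration tests below exercise this same interface against live query_range data fetched via PrometheusAPI.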
Example #8
def test_monitoring_shows_osd_down(measure_stop_ceph_osd):
    """
    Make sure simple problems with OSD daemons are reported via OCP Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60

    affected_osd = measure_stop_ceph_osd["result"]
    # translate this into ceph daemon name
    ceph_daemon = "osd.{}".format(int(affected_osd[len("rook-ceph-osd-"):]))
    logger.info(
        f"affected osd was {affected_osd}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query="ceph_health_status",
        start=measure_stop_ceph_osd["start"],
        end=measure_stop_ceph_osd["stop"],
        step=15,
    )
    health_validation = check_query_range_result_enum(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_delay=expected_delay,
    )
    health_msg = "health status should be affected by missing osd"

    logger.info("let's check that osd up value was affected")
    osd_up_result = prometheus.query_range(
        query='ceph_osd_up{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_osd["start"],
        end=measure_stop_ceph_osd["stop"],
        step=15,
    )
    osd_up_validation = check_query_range_result_enum(
        result=osd_up_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_delay=expected_delay,
    )
    osd_up_msg = "ceph_osd_up value should be affected by missing osd"

    logger.info("let's check that osd in value was not affected")
    # osd in value is not affected because we just stopped the osd, we
    # haven't removed it from the cluster
    osd_in_result = prometheus.query_range(
        query='ceph_osd_in{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_osd["start"],
        end=measure_stop_ceph_osd["stop"],
        step=15,
    )
    osd_in_validation = check_query_range_result_enum(result=osd_in_result,
                                                      good_values=[1],
                                                      bad_values=[0],
                                                      exp_metric_num=1)
    osd_in_msg = "ceph_osd_in value should not be affected by missing osd"

    # checking validation results when all queries are performed makes sure
    # that there is evidence for all queries in the test case logs in case of
    # an assert failure
    assert health_validation, health_msg
    assert osd_up_validation, osd_up_msg
    assert osd_in_validation, osd_in_msg
Example #9
def test_monitoring_shows_mon_down(measure_stop_ceph_mon):
    """
    Make sure simple problems with MON daemons are reported via OCP Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60
    # query resolution step used in this test case (number of seconds)
    query_step = 15

    affected_mons = measure_stop_ceph_mon["result"]
    # we asked to stop just a single mon ... make this assumption explicit
    assert len(affected_mons) == 1
    affected_mon = affected_mons[0]
    # translate this into ceph daemon name
    ceph_daemon = "mon.{}".format(affected_mon[len("rook-ceph-mon-"):])
    logger.info(
        f"affected mon was {affected_mon}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query="ceph_health_status",
        start=measure_stop_ceph_mon["start"],
        end=measure_stop_ceph_mon["stop"],
        step=query_step,
    )
    health_validation = check_query_range_result_enum(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_good_time=measure_stop_ceph_mon["min_downtime"],
        exp_delay=expected_delay,
    )
    health_msg = "health status should be affected by missing mon"

    logger.info("let's check that mon quorum status value was affected")
    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_mon["start"],
        end=measure_stop_ceph_mon["stop"],
        step=query_step,
        validate=False,
    )
    mon_validation = check_query_range_result_enum(
        result=mon_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_good_time=measure_stop_ceph_mon["min_downtime"],
        exp_delay=expected_delay,
    )
    mon_msg = "ceph_mon_quorum_status value should be affected by missing mon"

    # checking validation results when both queries are performed makes sure
    # that there is evidence for both mon and health queries in the test case
    # logs in case of an assert failure
    assert health_validation, health_msg
    assert mon_validation, mon_msg

    # since we don't do strict result validation in the previous query, we
    # are going to check the min. expected size of the reply explicitly, taking
    # into account the min. expected downtime of the affected ceph mon
    assert len(mon_result) == 1, "there should be one metric for one mon"
    min_mon_samples = measure_stop_ceph_mon["min_downtime"] / query_step
    mon_sample_size = len(mon_result[0]["values"])
    assert mon_sample_size >= min_mon_samples
Example #10
def test_monitoring_reporting_ok_when_idle(workload_idle):
    """
    When nothing is happening, OCP Prometheus reports OCS status as OK.

    If this test case fails, the status is either reported wrong or the
    cluster is in a broken state. Either way, a failure here is not good.
    """
    prometheus = PrometheusAPI()

    health_result = prometheus.query_range(
        query="ceph_health_status",
        start=workload_idle["start"],
        end=workload_idle["stop"],
        step=15,
    )
    health_validation = check_query_range_result_enum(
        result=health_result, good_values=[0], bad_values=[1], exp_metric_num=1
    )
    health_msg = "ceph_health_status {} report 0 (health ok) as expected"
    if health_validation:
        health_msg = health_msg.format("does")
        logger.info(health_msg)
    else:
        health_msg = health_msg.format("should")
        logger.error(health_msg)

    mon_result = prometheus.query_range(
        query="ceph_mon_quorum_status",
        start=workload_idle["start"],
        end=workload_idle["stop"],
        step=15,
    )
    mon_validation = check_query_range_result_enum(
        result=mon_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=workload_idle["result"]["mon_num"],
    )
    mon_msg = "ceph_mon_quorum_status {} indicate no problems with quorum"
    if mon_validation:
        mon_msg = mon_msg.format("does")
        logger.info(mon_msg)
    else:
        mon_msg = mon_msg.format("should")
        logger.error(mon_msg)

    osd_validations = []
    for metric in ("ceph_osd_up", "ceph_osd_in"):
        osd_result = prometheus.query_range(
            query=metric,
            start=workload_idle["start"],
            end=workload_idle["stop"],
            step=15,
        )
        osd_validation = check_query_range_result_enum(
            result=osd_result,
            good_values=[1],
            bad_values=[0],
            exp_metric_num=workload_idle["result"]["osd_num"],
        )
        osd_validations.append(osd_validation)
        osd_msg = "{} metric {} indicate no problems with OSDs"
        if osd_validation:
            osd_msg = osd_msg.format(metric, "does")
            logger.info(osd_msg)
        else:
            osd_msg = osd_msg.format(metric, "should")
            logger.error(osd_msg)

    # after logging everything properly, make the test fail if necessary
    # see ERRORs reported in the test log for details
    assert health_validation, health_msg
    assert mon_validation, mon_msg
    osds_msg = "ceph_osd_{up,in} metrics should indicate no OSD issues"
    assert all(osd_validations), osds_msg