Example #1
0
def test_monitoring_shows_osd_down(measure_stop_ceph_osd):
    """
    Verify that a stopped OSD daemon is visible in OCP Prometheus metrics.

    The ``measure_stop_ceph_osd`` fixture is read via three keys:
    ``'result'`` (identifier of the stopped osd, expected to look like
    ``rook-ceph-osd-<number>``) and ``'start'``/``'stop'``, which bound the
    time range of every Prometheus query issued here.

    Three metrics are validated over that range:

    * ``ceph_health_status`` is expected to report 1 (and not 0)
    * ``ceph_osd_up`` of the affected osd is expected to report 0
    * ``ceph_osd_in`` of the affected osd is expected to stay at 1
    """
    prometheus = PrometheusAPI()
    # how long (in seconds) monitoring may need to notice the change
    notice_delay = 60

    affected_osd = measure_stop_ceph_osd['result']
    # the number following the 'rook-ceph-osd-' prefix is the osd id which
    # forms the ceph daemon name
    osd_id = int(affected_osd[len('rook-ceph-osd-'):])
    ceph_daemon = f"osd.{osd_id}"
    logger.info(
        f"affected osd was {affected_osd}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    # every query below covers the same interval reported by the fixture
    query_window = {
        'start': measure_stop_ceph_osd['start'],
        'end': measure_stop_ceph_osd['stop'],
        'step': 15,
    }
    health_samples = prometheus.query_range(
        query='ceph_health_status', **query_window)
    health_affected = prometheus.check_query_range_result(
        result=health_samples,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_delay=notice_delay)
    assert health_affected, (
        "health status should be affected by missing osd")

    logger.info("let's check that osd up value was affected")
    osd_up_samples = prometheus.query_range(
        query=f'ceph_osd_up{{ceph_daemon="{ceph_daemon}"}}', **query_window)
    osd_up_affected = prometheus.check_query_range_result(
        result=osd_up_samples,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_delay=notice_delay)
    assert osd_up_affected, (
        "ceph_osd_up value should be affected by missing osd")

    logger.info("let's check that osd in value was not affected")
    # the osd was only stopped, not removed from the cluster, so its "in"
    # value must remain unchanged (hence no delay is passed to the check)
    osd_in_samples = prometheus.query_range(
        query=f'ceph_osd_in{{ceph_daemon="{ceph_daemon}"}}', **query_window)
    osd_in_unaffected = prometheus.check_query_range_result(
        result=osd_in_samples,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1)
    assert osd_in_unaffected, (
        "ceph_osd_in value should not be affected by missing osd")
Example #2
0
def test_monitoring_shows_mon_down(measure_stop_ceph_mon):
    """
    Make sure simple problems with MON daemons are reported via OCP Prometheus.

    The ``measure_stop_ceph_mon`` fixture is read via three keys:
    ``'result'`` (list with identifiers of stopped mons, each expected to
    start with ``rook-ceph-mon-``) and ``'start'``/``'stop'``, which bound
    the time range of the Prometheus queries.

    Two metrics are validated over that range:

    * ``ceph_health_status`` is expected to report 1 (and not 0)
    * ``ceph_mon_quorum_status`` of the affected mon is expected to report 0
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60

    affected_mons = measure_stop_ceph_mon['result']
    # we asked to stop just a single mon ... make this assumption explicit
    assert len(affected_mons) == 1, (
        f"exactly one mon should have been stopped, got: {affected_mons}")
    affected_mon = affected_mons[0]
    # translate this into ceph daemon name: whatever follows the
    # 'rook-ceph-mon-' prefix is used as the mon id
    ceph_daemon = "mon.{}".format(affected_mon[len('rook-ceph-mon-'):])
    logger.info(
        f"affected mon was {affected_mon}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query='ceph_health_status',
        start=measure_stop_ceph_mon['start'],
        end=measure_stop_ceph_mon['stop'],
        step=15)
    health_validation = prometheus.check_query_range_result(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_delay=expected_delay)
    health_msg = "health status should be affected by missing mon"
    assert health_validation, health_msg

    logger.info("let's check that mon quorum status value was affected")
    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_mon['start'],
        end=measure_stop_ceph_mon['stop'],
        step=15)
    mon_validation = prometheus.check_query_range_result(
        result=mon_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_delay=expected_delay)
    # the failure message has to name the metric and daemon type which were
    # actually checked here (mon quorum status, not osd up)
    mon_msg = (
        "ceph_mon_quorum_status value should be affected by missing mon")
    assert mon_validation, mon_msg
def test_monitoring_reporting_ok_when_idle(workload_idle):
    """
    Check that OCP Prometheus reports OCS status as OK on an idle cluster.

    The ``workload_idle`` fixture is read via its ``'start'``/``'stop'``
    keys (bounds of the queried time range) and via ``'result'``, which
    provides the expected number of mons (``'mon_num'``) and osds
    (``'osd_num'``).

    Every metric is validated and its outcome logged first; the test is
    failed only at the very end, so that the test log covers all checks.
    A failure means that either the status is reported wrong, or the
    cluster is in a broken state.
    """
    prometheus = PrometheusAPI()
    # all metrics are queried over the same idle interval
    idle_window = {
        'start': workload_idle['start'],
        'end': workload_idle['stop'],
        'step': 15,
    }

    health_samples = prometheus.query_range(
        query='ceph_health_status', **idle_window)
    health_ok = prometheus.check_query_range_result(
        result=health_samples,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1)
    # success is logged as info ("does"), failure as error ("should")
    verb, report = (
        ('does', logger.info) if health_ok else ('should', logger.error))
    health_msg = (
        "ceph_health_status {} report 0 (health ok) as expected".format(verb))
    report(health_msg)

    quorum_samples = prometheus.query_range(
        query='ceph_mon_quorum_status', **idle_window)
    quorum_ok = prometheus.check_query_range_result(
        result=quorum_samples,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=workload_idle['result']['mon_num'])
    verb, report = (
        ('does', logger.info) if quorum_ok else ('should', logger.error))
    mon_msg = (
        "ceph_mon_quorum_status {} indicate no problems with quorum".format(
            verb))
    report(mon_msg)

    osd_outcomes = []
    for metric in ("ceph_osd_up", "ceph_osd_in"):
        osd_samples = prometheus.query_range(query=metric, **idle_window)
        osd_ok = prometheus.check_query_range_result(
            result=osd_samples,
            good_values=[1],
            bad_values=[0],
            exp_metric_num=workload_idle['result']['osd_num'])
        osd_outcomes.append(osd_ok)
        verb, report = (
            ('does', logger.info) if osd_ok else ('should', logger.error))
        report("{} metric {} indicate no problems with OSDs".format(
            metric, verb))

    # everything has been logged by now, so fail the test if any check did
    # not pass (ERRORs in the test log provide the details)
    assert health_ok, health_msg
    assert quorum_ok, mon_msg
    assert all(osd_outcomes), (
        "ceph_osd_{up,in} metrics should indicate no OSD issues")