def test_ceph_monitor_stopped(workload_stop_ceph_mon):
    """
    Test that there is appropriate alert related to ceph monitor quorum
    when there is even number of ceph monitors and that this alert
    is cleared when monitors are back online.
    """
    api = prometheus.PrometheusAPI()

    # alerts recorded during the period when the monitor was stopped
    alerts = workload_stop_ceph_mon.get('prometheus_alerts')
    measure_end_time = workload_stop_ceph_mon.get('stop')
    expected_alerts = [
        (
            constants.ALERT_MONQUORUMATRISK,
            'Storage quorum at risk',
            ['pending'],
            'error',
        ),
        (
            constants.ALERT_CLUSTERWARNINGSTATE,
            'Storage cluster is in degraded state',
            ['pending', 'firing'],
            'warning',
        ),
    ]
    # each expected alert must have been raised and must clear afterwards
    for label, msg, states, severity in expected_alerts:
        prometheus.check_alert_list(
            label=label,
            msg=msg,
            alerts=alerts,
            states=states,
            severity=severity,
        )
        api.check_alert_cleared(label=label, measure_end_time=measure_end_time)
# Example #2
def test_rgw_unavailable(measure_stop_rgw):
    """
    Test that there is appropriate alert when RGW is unavailable and that
    this alert is cleared when the RGW interface is back online.

    """
    api = prometheus.PrometheusAPI()

    # alerts recorded during the period when RGW was stopped
    alerts = measure_stop_rgw.get("prometheus_alerts")
    expected_label = constants.ALERT_CLUSTEROBJECTSTORESTATE
    expected_msg = (
        "Cluster Object Store is in unhealthy state for more than 15s. "
        "Please check Ceph cluster health or RGW connection."
    )

    # the alert must have been raised while RGW was down ...
    prometheus.check_alert_list(
        label=expected_label,
        msg=expected_msg,
        alerts=alerts,
        states=["pending", "firing"],
        severity="error",
    )
    # ... and must clear once RGW is back
    api.check_alert_cleared(
        label=expected_label,
        measure_end_time=measure_stop_rgw.get("stop"),
    )
def test_ceph_monitor_stopped(measure_stop_ceph_mon):
    """
    Test that there is appropriate alert related to ceph monitor quorum
    when there is even number of ceph monitors and that this alert
    is cleared when monitors are back online.
    """
    api = prometheus.PrometheusAPI()

    # alerts recorded during the period when the monitor was stopped
    alerts = measure_stop_ceph_mon.get("prometheus_alerts")
    measure_end_time = measure_stop_ceph_mon.get("stop")
    # both expected alerts are checked in the 'pending' state only
    expected = (
        (
            constants.ALERT_MONQUORUMATRISK,
            "Storage quorum at risk",
            "error",
        ),
        (
            constants.ALERT_CLUSTERWARNINGSTATE,
            "Storage cluster is in degraded state",
            "warning",
        ),
    )
    for label, msg, severity in expected:
        prometheus.check_alert_list(
            label=label,
            msg=msg,
            alerts=alerts,
            states=["pending"],
            severity=severity,
        )
        api.check_alert_cleared(label=label, measure_end_time=measure_end_time)
# Example #4
def test_noobaa_bucket_quota(measure_noobaa_exceed_bucket_quota):
    """
    Test that there are appropriate alerts when NooBaa Bucket Quota is reached.

    Checks that each expected quota-related alert was raised during the
    measurement window and that it is cleared afterwards.
    """
    api = prometheus.PrometheusAPI()

    # alerts recorded while the bucket quota was exceeded
    alerts = measure_noobaa_exceed_bucket_quota.get('prometheus_alerts')
    # the time to wait is increased because it takes more time for OCS
    # cluster to resolve its issues (hoisted out of the loop: invariant)
    pg_wait = 480
    for target_label, target_msg, target_states, target_severity in [
        (constants.ALERT_BUCKETREACHINGQUOTASTATE,
         'A NooBaa Bucket Is In Reaching Quota State', ['firing'], 'warning'),
        (constants.ALERT_BUCKETERRORSTATE, 'A NooBaa Bucket Is In Error State',
         ['pending', 'firing'], 'warning'),
        (constants.ALERT_BUCKETEXCEEDINGQUOTASTATE,
         'A NooBaa Bucket Is In Exceeding Quota State', ['firing'], 'warning')
    ]:
        prometheus.check_alert_list(label=target_label,
                                    msg=target_msg,
                                    alerts=alerts,
                                    states=target_states,
                                    severity=target_severity)
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_noobaa_exceed_bucket_quota.get('stop'),
            time_min=pg_wait)
# Example #5
def test_corrupt_pg_alerts(measure_corrupt_pg):
    """
    Test that there are appropriate alerts when a placement group
    on one OSD is corrupted and that these alerts are cleared when
    the corruption is resolved.
    """
    api = prometheus.PrometheusAPI()

    # alerts recorded while the placement group was corrupted
    alerts = measure_corrupt_pg.get('prometheus_alerts')
    # the time to wait is increased because it takes more time for Ceph
    # cluster to resolve its issues (hoisted out of the loop: invariant)
    pg_wait = 360
    for target_label, target_msg, target_states, target_severity in [
        (constants.ALERT_PGREPAIRTAKINGTOOLONG, 'Self heal problems detected',
         ['pending'], 'warning'),
        (constants.ALERT_CLUSTERERRORSTATE,
         'Storage cluster is in error state', ['pending', 'firing'], 'error')
    ]:
        prometheus.check_alert_list(label=target_label,
                                    msg=target_msg,
                                    alerts=alerts,
                                    states=target_states,
                                    severity=target_severity)
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_corrupt_pg.get('stop'),
            time_min=pg_wait)
def test_ceph_osd_stopped(measure_stop_ceph_osd):
    """
    Test that there is appropriate alert related to situation when ceph osd
    is down. Alert is cleared when osd disk is back online.
    """
    api = prometheus.PrometheusAPI()

    # alerts recorded while the OSD was down
    alerts = measure_stop_ceph_osd.get('prometheus_alerts')
    # the time to wait is increased because it takes more time for osd pod
    # to be ready than for other pods (hoisted out of the loop: invariant)
    osd_up_wait = 360
    # the last tuple item says whether repeated occurrences of the alert
    # are tolerated (data recovery may be reported more than once)
    for target_label, target_msg, target_states, target_severity, ignore in [
        (constants.ALERT_OSDDISKNOTRESPONDING, 'Disk not responding',
         ['pending', 'firing'], 'error', False),
        (constants.ALERT_DATARECOVERYTAKINGTOOLONG, 'Data recovery is slow',
         ['pending'], 'warning', True),
        (constants.ALERT_CLUSTERWARNINGSTATE,
         'Storage cluster is in degraded state', ['pending',
                                                  'firing'], 'warning', False)
    ]:
        prometheus.check_alert_list(label=target_label,
                                    msg=target_msg,
                                    alerts=alerts,
                                    states=target_states,
                                    severity=target_severity,
                                    ignore_more_occurences=ignore)
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_stop_ceph_osd.get('stop'),
            time_min=osd_up_wait)
# Example #7
def test_rgw_unavailable(measure_stop_rgw):
    """
    Test that there is appropriate alert when RGW is unavailable and that
    this alert is cleared when the RGW interface is back online.

    """
    api = prometheus.PrometheusAPI()

    # alerts recorded during the period when RGW was stopped
    alerts = measure_stop_rgw.get("prometheus_alerts")
    target_label = constants.ALERT_CLUSTEROBJECTSTORESTATE
    # The alert message is changed since OCS 4.7
    ocs_version = config.ENV_DATA["ocs_version"]
    if Version.coerce(ocs_version) >= Version.coerce("4.7"):
        target_msg = "Cluster Object Store is in unhealthy state. Please check Ceph cluster health."
    else:
        target_msg = (
            "Cluster Object Store is in unhealthy state for more than 15s. "
            "Please check Ceph cluster health or RGW connection."
        )

    prometheus.check_alert_list(
        label=target_label,
        msg=target_msg,
        alerts=alerts,
        states=["pending", "firing"],
        severity="error",
    )
    api.check_alert_cleared(
        label=target_label,
        measure_end_time=measure_stop_rgw.get("stop"),
        time_min=300,
    )
# Example #8
def test_rbd_capacity_workload_alerts(workload_storageutilization_95p_rbd):
    """
    Test that there are appropriate alerts when ceph cluster is utilized
    via RBD interface.

    Verifies both the near-full and critically-full alerts were raised
    during the 95% utilization measurement and that they clear afterwards.
    """
    api = prometheus.PrometheusAPI()
    measure_end_time = workload_storageutilization_95p_rbd.get("stop")

    # Check utilization on 95%
    alerts = workload_storageutilization_95p_rbd.get("prometheus_alerts")

    # alert messages were reworded in OCS 4.3
    if config.ENV_DATA.get("ocs_version") == "4.2":
        nearfull_message = "Storage cluster is nearing full. Expansion is required."
        critically_full_message = (
            "Storage cluster is critically full and needs immediate expansion"
        )
    else:
        # since OCS 4.3
        nearfull_message = (
            "Storage cluster is nearing full. Data deletion or cluster "
            "expansion is required."
        )
        critically_full_message = (
            "Storage cluster is critically full and needs immediate data "
            "deletion or cluster expansion."
        )

    # the time to wait is increased because it takes more time for Ceph
    # cluster to delete all data (hoisted out of the loop: invariant)
    pg_wait = 300
    for target_label, target_msg, target_states, target_severity in [
        (
            constants.ALERT_CLUSTERNEARFULL,
            nearfull_message,
            ["pending", "firing"],
            "warning",
        ),
        (
            constants.ALERT_CLUSTERCRITICALLYFULL,
            critically_full_message,
            ["pending", "firing"],
            "error",
        ),
    ]:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity,
            ignore_more_occurences=True,
        )
        api.check_alert_cleared(
            label=target_label, measure_end_time=measure_end_time, time_min=pg_wait
        )
def test_capacity_workload_alerts(workload_storageutilization_95p_rbd,
                                  workload_storageutilization_95p_cephfs,
                                  interface):
    """
    Test that there are appropriate alerts when ceph cluster is utilized.

    The ``interface`` parameter selects whether the RBD or CephFS
    utilization measurement is checked.
    """
    api = prometheus.PrometheusAPI()
    measure_end_time = max([
        workload_storageutilization_95p_rbd.get('stop'),
        workload_storageutilization_95p_cephfs.get('stop'),
    ])
    if interface == 'rbd':
        workload_storageutilization_95p = workload_storageutilization_95p_rbd
    elif interface == 'cephfs':
        workload_storageutilization_95p = workload_storageutilization_95p_cephfs
    else:
        # fail loudly instead of a NameError further below
        raise ValueError(
            f"Unsupported interface: {interface}")

    # Check utilization on 95%
    alerts = workload_storageutilization_95p.get('prometheus_alerts')
    # TODO(fbalak): it seems that CephFS utilization triggers only firing
    # alerts. This needs to be more investigated.

    # alert messages were reworded in OCS 4.3
    if config.ENV_DATA.get('ocs_version') == '4.2':
        nearfull_message = (
            'Storage cluster is nearing full. Expansion is required.')
        critically_full_message = (
            'Storage cluster is critically full and needs immediate expansion')
    else:
        # since OCS 4.3
        nearfull_message = (
            'Storage cluster is nearing full. Data deletion or cluster '
            'expansion is required.')
        critically_full_message = (
            'Storage cluster is critically full and needs immediate data '
            'deletion or cluster expansion.')

    # the time to wait is increased because it takes more time for Ceph
    # cluster to delete all data (hoisted out of the loop: invariant)
    pg_wait = 300
    for target_label, target_msg, target_states, target_severity in [
        (constants.ALERT_CLUSTERNEARFULL, nearfull_message,
         ['pending', 'firing'], 'warning'),
        (constants.ALERT_CLUSTERCRITICALLYFULL, critically_full_message,
         ['pending', 'firing'], 'error'),
    ]:
        prometheus.check_alert_list(label=target_label,
                                    msg=target_msg,
                                    alerts=alerts,
                                    states=target_states,
                                    severity=target_severity,
                                    ignore_more_occurences=True)
        api.check_alert_cleared(label=target_label,
                                measure_end_time=measure_end_time,
                                time_min=pg_wait)
# Example #10
def test_hpa_maxreplica_alert():
    """
    Test to verify that no HPA max replica alert is triggered.

    Waits briefly for the alert and asserts that no occurrence is found.
    """
    api = prometheus.PrometheusAPI()

    logger.info(f"Verifying whether {constants.ALERT_KUBEHPAREPLICASMISMATCH} "
                f"has not been triggered")
    # short timeout: the alert is expected to be absent, so do not wait long
    alerts = api.wait_for_alert(name=constants.ALERT_KUBEHPAREPLICASMISMATCH,
                                timeout=10,
                                sleep=1)
    # direct assertion instead of `if len(alerts) > 0: assert False`
    assert len(alerts) == 0, (
        f"Failed: There should be no {constants.ALERT_KUBEHPAREPLICASMISMATCH} alert"
    )
# Example #11
def test_ceph_health(measure_stop_ceph_mon, measure_corrupt_pg):
    """
    Test that there are appropriate alerts for Ceph health triggered.
    For this check of Ceph Warning state is used measure_stop_ceph_mon
    utilization monitor and for Ceph Error state is used measure_corrupt_pg
    utilization.
    """
    api = prometheus.PrometheusAPI()
    expected_states = ["pending", "firing"]

    # warning state: triggered by stopping a ceph monitor
    warning_alerts = measure_stop_ceph_mon.get("prometheus_alerts")
    prometheus.check_alert_list(
        label=constants.ALERT_CLUSTERWARNINGSTATE,
        msg="Storage cluster is in degraded state",
        alerts=warning_alerts,
        states=expected_states,
        severity="warning",
    )
    api.check_alert_cleared(
        label=constants.ALERT_CLUSTERWARNINGSTATE,
        measure_end_time=measure_stop_ceph_mon.get("stop"),
    )

    # error state: triggered by placement group corruption
    error_alerts = measure_corrupt_pg.get("prometheus_alerts")
    prometheus.check_alert_list(
        label=constants.ALERT_CLUSTERERRORSTATE,
        msg="Storage cluster is in error state",
        alerts=error_alerts,
        states=expected_states,
        severity="error",
    )
    # the time to wait is increased because it takes more time for Ceph
    # cluster to resolve its issues
    api.check_alert_cleared(
        label=constants.ALERT_CLUSTERERRORSTATE,
        measure_end_time=measure_corrupt_pg.get("stop"),
        time_min=360,
    )
# Example #12
def test_noobaa_ns_bucket(measure_noobaa_ns_target_bucket_deleted):
    """
    Test that there are appropriate alerts when target bucket used of
    namespace store used in namespace bucket is deleted.
    """
    api = prometheus.PrometheusAPI()

    # alerts recorded while the target bucket was deleted
    alerts = measure_noobaa_ns_target_bucket_deleted.get("prometheus_alerts")

    expected_alerts = [
        (
            constants.ALERT_NAMESPACEBUCKETERRORSTATE,
            "A NooBaa Namespace Bucket Is In Error State",
            ["pending", "firing"],
            "warning",
        ),
        (
            constants.ALERT_NAMESPACERESOURCEERRORSTATE,
            "A NooBaa Namespace Resource Is In Error State",
            ["pending", "firing"],
            "warning",
        ),
    ]

    # the time to wait is increased because it takes more time for NooBaa
    # to clear the alert (hoisted out of the loop: invariant)
    pg_wait = 600
    for target_label, target_msg, target_states, target_severity in expected_alerts:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity,
        )
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_noobaa_ns_target_bucket_deleted.get(
                "stop"),
            time_min=pg_wait,
        )
def test_ceph_manager_stopped(measure_stop_ceph_mgr):
    """
    Test that there is appropriate alert when ceph manager
    is unavailable and that this alert is cleared when the manager
    is back online.
    """
    api = prometheus.PrometheusAPI()

    # alerts recorded while the manager deployment was scaled down
    alerts = measure_stop_ceph_mgr.get('prometheus_alerts')
    expected_label = constants.ALERT_MGRISABSENT

    # the alert must have been raised while the manager was down ...
    prometheus.check_alert_list(
        label=expected_label,
        msg='Storage metrics collector service not available anymore.',
        alerts=alerts,
        states=['pending', 'firing'],
        severity='critical',
    )
    # ... and must clear once the manager is back
    api.check_alert_cleared(
        label=expected_label,
        measure_end_time=measure_stop_ceph_mgr.get('stop'),
    )
# Example #14
def test_ceph_mons_quorum_lost(measure_stop_ceph_mon):
    """
    Test to verify that CephMonQuorumLost alert is seen and
    that this alert is cleared when monitors are back online.
    """
    api = prometheus.PrometheusAPI()

    # alerts recorded while the monitors were stopped
    alerts = measure_stop_ceph_mon.get("prometheus_alerts")
    quorum_label = constants.ALERT_MONQUORUMLOST

    # the quorum-lost alert must have been raised while monitors were down
    prometheus.check_alert_list(
        label=quorum_label,
        msg="Storage quorum is lost",
        alerts=alerts,
        states=["pending", "firing"],
        severity="critical",
    )
    # and it must clear once the monitors are back online
    api.check_alert_cleared(
        label=quorum_label,
        measure_end_time=measure_stop_ceph_mon.get("stop"),
    )
# Example #15
def test_noobaa_bucket_quota(measure_noobaa_exceed_bucket_quota):
    """
    Test that there are appropriate alerts when NooBaa Bucket Quota is reached.

    Checks each expected quota-related alert in the states appropriate
    for the installed OCS version and verifies the alerts clear afterwards.
    """
    api = prometheus.PrometheusAPI()

    # alerts recorded while the bucket quota was exceeded
    alerts = measure_noobaa_exceed_bucket_quota.get("prometheus_alerts")

    # since version 4.5 all NooBaa alerts have defined Pending state;
    # only the quota alerts' states differ between versions, so the two
    # previously duplicated expected_alerts branches are collapsed here
    if version.get_semantic_ocs_version_from_config() < version.VERSION_4_5:
        quota_states = ["firing"]
    else:
        quota_states = ["pending", "firing"]

    expected_alerts = [
        (
            constants.ALERT_BUCKETREACHINGQUOTASTATE,
            "A NooBaa Bucket Is In Reaching Quota State",
            quota_states,
            "warning",
        ),
        (
            constants.ALERT_BUCKETERRORSTATE,
            "A NooBaa Bucket Is In Error State",
            ["pending", "firing"],
            "warning",
        ),
        (
            constants.ALERT_BUCKETEXCEEDINGQUOTASTATE,
            "A NooBaa Bucket Is In Exceeding Quota State",
            quota_states,
            "warning",
        ),
    ]

    # the time to wait is increased because it takes more time for OCS
    # cluster to resolve its issues (hoisted out of the loop: invariant)
    pg_wait = 480
    for target_label, target_msg, target_states, target_severity in expected_alerts:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity,
        )
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_noobaa_exceed_bucket_quota.get("stop"),
            time_min=pg_wait,
        )