def test_ceph_monitor_stopped(workload_stop_ceph_mon):
    """
    Check that stopping a ceph monitor (leaving an even number of running
    monitors) raises the expected quorum-related alerts, and that each alert
    clears once the monitors are back online.
    """
    api = prometheus.PrometheusAPI()
    # alerts collected while the monitor deployment was scaled down
    alerts = workload_stop_ceph_mon.get('prometheus_alerts')
    expected = (
        (
            constants.ALERT_MONQUORUMATRISK,
            'Storage quorum at risk',
            ['pending'],
            'error',
        ),
        (
            constants.ALERT_CLUSTERWARNINGSTATE,
            'Storage cluster is in degraded state',
            ['pending', 'firing'],
            'warning',
        ),
    )
    for label, message, states, severity in expected:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=alerts,
            states=states,
            severity=severity,
        )
        # each alert must be gone after the monitors were restored
        api.check_alert_cleared(
            label=label,
            measure_end_time=workload_stop_ceph_mon.get('stop'),
        )
def test_rgw_unavailable(measure_stop_rgw):
    """
    Verify that the expected alert fires while RGW is unavailable and that
    it clears again after the RGW interface comes back online.
    """
    api = prometheus.PrometheusAPI()
    # alerts recorded during the RGW outage window
    alerts = measure_stop_rgw.get("prometheus_alerts")
    label = constants.ALERT_CLUSTEROBJECTSTORESTATE
    message = (
        "Cluster Object Store is in unhealthy state for more than 15s. "
        "Please check Ceph cluster health or RGW connection."
    )
    prometheus.check_alert_list(
        label=label,
        msg=message,
        alerts=alerts,
        states=["pending", "firing"],
        severity="error",
    )
    # the alert must be gone after RGW was brought back up
    api.check_alert_cleared(
        label=label, measure_end_time=measure_stop_rgw.get("stop")
    )
def test_ceph_monitor_stopped(measure_stop_ceph_mon):
    """
    Check that an even number of running ceph monitors (one stopped) raises
    the expected quorum-related alerts, and that each alert clears when the
    monitors are back online.
    """
    api = prometheus.PrometheusAPI()
    # alerts collected while the monitor deployment was scaled down
    alerts = measure_stop_ceph_mon.get("prometheus_alerts")
    expected = (
        (
            constants.ALERT_MONQUORUMATRISK,
            "Storage quorum at risk",
            ["pending"],
            "error",
        ),
        (
            constants.ALERT_CLUSTERWARNINGSTATE,
            "Storage cluster is in degraded state",
            ["pending"],
            "warning",
        ),
    )
    for label, message, states, severity in expected:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=alerts,
            states=states,
            severity=severity,
        )
        # each alert must be gone after the monitors were restored
        api.check_alert_cleared(
            label=label,
            measure_end_time=measure_stop_ceph_mon.get("stop"),
        )
def test_noobaa_bucket_quota(measure_noobaa_exceed_bucket_quota):
    """
    Test that there are appropriate alerts when NooBaa Bucket Quota is
    reached.
    """
    api = prometheus.PrometheusAPI()
    alerts = measure_noobaa_exceed_bucket_quota.get('prometheus_alerts')
    expected = (
        (
            constants.ALERT_BUCKETREACHINGQUOTASTATE,
            'A NooBaa Bucket Is In Reaching Quota State',
            ['firing'],
            'warning',
        ),
        (
            constants.ALERT_BUCKETERRORSTATE,
            'A NooBaa Bucket Is In Error State',
            ['pending', 'firing'],
            'warning',
        ),
        (
            constants.ALERT_BUCKETEXCEEDINGQUOTASTATE,
            'A NooBaa Bucket Is In Exceeding Quota State',
            ['firing'],
            'warning',
        ),
    )
    for label, message, states, severity in expected:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=alerts,
            states=states,
            severity=severity,
        )
        # the time to wait is increased because it takes more time for OCS
        # cluster to resolve its issues
        pg_wait = 480
        api.check_alert_cleared(
            label=label,
            measure_end_time=measure_noobaa_exceed_bucket_quota.get('stop'),
            time_min=pg_wait,
        )
def test_corrupt_pg_alerts(measure_corrupt_pg):
    """
    Test that there are appropriate alerts when a placement group on one
    OSD is corrupted, and that these alerts are cleared when the cluster
    recovers.

    NOTE(review): the original docstring was a garbled copy-paste from the
    ceph manager test; it has been corrected to describe this test.
    """
    api = prometheus.PrometheusAPI()
    # alerts collected while the placement group was corrupted
    alerts = measure_corrupt_pg.get('prometheus_alerts')
    for target_label, target_msg, target_states, target_severity in [
        (
            constants.ALERT_PGREPAIRTAKINGTOOLONG,
            'Self heal problems detected',
            ['pending'],
            'warning',
        ),
        (
            constants.ALERT_CLUSTERERRORSTATE,
            'Storage cluster is in error state',
            ['pending', 'firing'],
            'error',
        ),
    ]:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity,
        )
        # the time to wait is increased because it takes more time for Ceph
        # cluster to resolve its issues
        pg_wait = 360
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_corrupt_pg.get('stop'),
            time_min=pg_wait,
        )
def test_ceph_osd_stopped(measure_stop_ceph_osd):
    """
    Check that stopping a ceph OSD raises the expected alerts and that each
    alert clears once the OSD disk is back online.
    """
    api = prometheus.PrometheusAPI()
    # alerts collected while the OSD deployment was scaled down
    alerts = measure_stop_ceph_osd.get('prometheus_alerts')
    expected = (
        (
            constants.ALERT_OSDDISKNOTRESPONDING,
            'Disk not responding',
            ['pending', 'firing'],
            'error',
            False,
        ),
        (
            constants.ALERT_DATARECOVERYTAKINGTOOLONG,
            'Data recovery is slow',
            ['pending'],
            'warning',
            True,
        ),
        (
            constants.ALERT_CLUSTERWARNINGSTATE,
            'Storage cluster is in degraded state',
            ['pending', 'firing'],
            'warning',
            False,
        ),
    )
    for label, message, states, severity, ignore in expected:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=alerts,
            states=states,
            severity=severity,
            ignore_more_occurences=ignore,
        )
        # the time to wait is increased because it takes more time for osd pod
        # to be ready than for other pods
        osd_up_wait = 360
        api.check_alert_cleared(
            label=label,
            measure_end_time=measure_stop_ceph_osd.get('stop'),
            time_min=osd_up_wait,
        )
def test_rgw_unavailable(measure_stop_rgw):
    """
    Verify that the expected alert fires while RGW is unavailable and that
    it clears again after the RGW interface comes back online.
    """
    api = prometheus.PrometheusAPI()
    # alerts recorded during the RGW outage window
    alerts = measure_stop_rgw.get("prometheus_alerts")
    label = constants.ALERT_CLUSTEROBJECTSTORESTATE
    # The alert message is changed since OCS 4.7
    ocs_version = config.ENV_DATA["ocs_version"]
    if Version.coerce(ocs_version) >= Version.coerce("4.7"):
        message = "Cluster Object Store is in unhealthy state. Please check Ceph cluster health."
    else:
        message = (
            "Cluster Object Store is in unhealthy state for more than 15s. "
            "Please check Ceph cluster health or RGW connection."
        )
    prometheus.check_alert_list(
        label=label,
        msg=message,
        alerts=alerts,
        states=["pending", "firing"],
        severity="error",
    )
    # the alert must be gone after RGW was brought back up
    api.check_alert_cleared(
        label=label,
        measure_end_time=measure_stop_rgw.get("stop"),
        time_min=300,
    )
def test_rbd_capacity_workload_alerts(workload_storageutilization_95p_rbd):
    """
    Test that there are appropriate alerts when ceph cluster is utilized
    via RBD interface.
    """
    api = prometheus.PrometheusAPI()
    measure_end_time = workload_storageutilization_95p_rbd.get("stop")
    # Check utilization on 95%
    alerts = workload_storageutilization_95p_rbd.get("prometheus_alerts")
    # alert wording differs between OCS 4.2 and later releases
    if config.ENV_DATA.get("ocs_version") == "4.2":
        nearfull_message = (
            "Storage cluster is nearing full. Expansion is required."
        )
        critically_full_message = (
            "Storage cluster is critically full and needs immediate expansion"
        )
    else:
        # since OCS 4.3
        nearfull_message = (
            "Storage cluster is nearing full. Data deletion or cluster "
            "expansion is required."
        )
        critically_full_message = (
            "Storage cluster is critically full and needs immediate data "
            "deletion or cluster expansion."
        )
    expected = (
        (
            constants.ALERT_CLUSTERNEARFULL,
            nearfull_message,
            ["pending", "firing"],
            "warning",
        ),
        (
            constants.ALERT_CLUSTERCRITICALLYFULL,
            critically_full_message,
            ["pending", "firing"],
            "error",
        ),
    )
    for label, message, states, severity in expected:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=alerts,
            states=states,
            severity=severity,
            ignore_more_occurences=True,
        )
        # the time to wait is increased because it takes more time for Ceph
        # cluster to delete all data
        pg_wait = 300
        api.check_alert_cleared(
            label=label, measure_end_time=measure_end_time, time_min=pg_wait
        )
def test_capacity_workload_alerts(
    workload_storageutilization_95p_rbd,
    workload_storageutilization_95p_cephfs,
    interface,
):
    """
    Test that there are appropriate alerts when ceph cluster is utilized.

    Args:
        workload_storageutilization_95p_rbd: measurement fixture for 95%
            utilization via the RBD interface
        workload_storageutilization_95p_cephfs: measurement fixture for 95%
            utilization via the CephFS interface
        interface: which measurement to check, either 'rbd' or 'cephfs'

    Raises:
        ValueError: if ``interface`` is neither 'rbd' nor 'cephfs'
        (the original code left the measurement variable unbound in that
        case and failed later with a confusing NameError)
    """
    api = prometheus.PrometheusAPI()
    measure_end_time = max([
        workload_storageutilization_95p_rbd.get('stop'),
        workload_storageutilization_95p_cephfs.get('stop'),
    ])
    if interface == 'rbd':
        workload_storageutilization_95p = workload_storageutilization_95p_rbd
    elif interface == 'cephfs':
        workload_storageutilization_95p = (
            workload_storageutilization_95p_cephfs
        )
    else:
        raise ValueError(
            f"Unsupported interface: {interface!r} (expected 'rbd' or 'cephfs')"
        )

    # Check utilization on 95%
    alerts = workload_storageutilization_95p.get('prometheus_alerts')
    # TODO(fbalak): it seems that CephFS utilization triggers only firing
    # alerts. This needs to be more investigated.

    # alert wording differs between OCS 4.2 and later releases
    if config.ENV_DATA.get('ocs_version') == '4.2':
        nearfull_message = (
            'Storage cluster is nearing full. Expansion is required.')
        critically_full_message = (
            'Storage cluster is critically full and needs immediate expansion')
    else:
        # since OCS 4.3
        nearfull_message = (
            'Storage cluster is nearing full. Data deletion or cluster '
            'expansion is required.')
        critically_full_message = (
            'Storage cluster is critically full and needs immediate data '
            'deletion or cluster expansion.')

    for target_label, target_msg, target_states, target_severity in [
        (
            constants.ALERT_CLUSTERNEARFULL,
            nearfull_message,
            ['pending', 'firing'],
            'warning',
        ),
        (
            constants.ALERT_CLUSTERCRITICALLYFULL,
            critically_full_message,
            ['pending', 'firing'],
            'error',
        ),
    ]:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity,
            ignore_more_occurences=True,
        )
        # the time to wait is increased because it takes more time for Ceph
        # cluster to delete all data
        pg_wait = 300
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_end_time,
            time_min=pg_wait,
        )
def test_hpa_maxreplica_alert():
    """
    Test to verify that no HPA max replica alert is triggered.

    Waits briefly for the alert to appear; the test passes only when no
    such alert is found within the timeout.
    """
    api = prometheus.PrometheusAPI()
    logger.info(
        f"Verifying whether {constants.ALERT_KUBEHPAREPLICASMISMATCH} "
        f"has not been triggered"
    )
    alerts = api.wait_for_alert(
        name=constants.ALERT_KUBEHPAREPLICASMISMATCH, timeout=10, sleep=1
    )
    # assert directly on the collected alerts instead of the original
    # `if len(alerts) > 0: assert False` anti-idiom
    assert not alerts, (
        f"Failed: There should be no "
        f"{constants.ALERT_KUBEHPAREPLICASMISMATCH} alert"
    )
def test_ceph_health(measure_stop_ceph_mon, measure_corrupt_pg):
    """
    Test that there are appropriate alerts for Ceph health triggered.
    The Ceph Warning state is exercised via the measure_stop_ceph_mon
    fixture, the Ceph Error state via the measure_corrupt_pg fixture.
    """
    api = prometheus.PrometheusAPI()

    # warning state: alerts gathered while a ceph monitor was stopped
    warning_alerts = measure_stop_ceph_mon.get("prometheus_alerts")
    warning_label = constants.ALERT_CLUSTERWARNINGSTATE
    prometheus.check_alert_list(
        label=warning_label,
        msg="Storage cluster is in degraded state",
        alerts=warning_alerts,
        states=["pending", "firing"],
        severity="warning",
    )
    api.check_alert_cleared(
        label=warning_label,
        measure_end_time=measure_stop_ceph_mon.get("stop"),
    )

    # error state: alerts gathered while a placement group was corrupted
    error_alerts = measure_corrupt_pg.get("prometheus_alerts")
    error_label = constants.ALERT_CLUSTERERRORSTATE
    prometheus.check_alert_list(
        label=error_label,
        msg="Storage cluster is in error state",
        alerts=error_alerts,
        states=["pending", "firing"],
        severity="error",
    )
    # the time to wait is increased because it takes more time for Ceph
    # cluster to resolve its issues
    pg_wait = 360
    api.check_alert_cleared(
        label=error_label,
        measure_end_time=measure_corrupt_pg.get("stop"),
        time_min=pg_wait,
    )
def test_noobaa_ns_bucket(measure_noobaa_ns_target_bucket_deleted):
    """
    Test that there are appropriate alerts when target bucket used of
    namespace store used in namespace bucket is deleted.
    """
    api = prometheus.PrometheusAPI()
    alerts = measure_noobaa_ns_target_bucket_deleted.get("prometheus_alerts")
    expected = (
        (
            constants.ALERT_NAMESPACEBUCKETERRORSTATE,
            "A NooBaa Namespace Bucket Is In Error State",
            ["pending", "firing"],
            "warning",
        ),
        (
            constants.ALERT_NAMESPACERESOURCEERRORSTATE,
            "A NooBaa Namespace Resource Is In Error State",
            ["pending", "firing"],
            "warning",
        ),
    )
    for label, message, states, severity in expected:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=alerts,
            states=states,
            severity=severity,
        )
        # the time to wait is increased because it takes more time for NooBaa
        # to clear the alert
        pg_wait = 600
        api.check_alert_cleared(
            label=label,
            measure_end_time=measure_noobaa_ns_target_bucket_deleted.get(
                "stop"),
            time_min=pg_wait,
        )
def test_ceph_manager_stopped(measure_stop_ceph_mgr):
    """
    Verify that an appropriate alert fires while the ceph manager is
    unavailable and clears again once the manager is back online.
    """
    api = prometheus.PrometheusAPI()
    # alerts collected while the manager deployment was scaled down
    alerts = measure_stop_ceph_mgr.get('prometheus_alerts')
    label = constants.ALERT_MGRISABSENT
    prometheus.check_alert_list(
        label=label,
        msg='Storage metrics collector service not available anymore.',
        alerts=alerts,
        states=['pending', 'firing'],
        severity='critical',
    )
    # the alert must be gone after the manager was restored
    api.check_alert_cleared(
        label=label,
        measure_end_time=measure_stop_ceph_mgr.get('stop'),
    )
def test_ceph_mons_quorum_lost(measure_stop_ceph_mon):
    """
    Verify that the CephMonQuorumLost alert fires while the monitors are
    down and clears again once they are back online.
    """
    api = prometheus.PrometheusAPI()
    # alerts collected while the monitor deployment was scaled down
    alerts = measure_stop_ceph_mon.get("prometheus_alerts")
    label = constants.ALERT_MONQUORUMLOST
    prometheus.check_alert_list(
        label=label,
        msg="Storage quorum is lost",
        alerts=alerts,
        states=["pending", "firing"],
        severity="critical",
    )
    # the alert must be gone after the monitors were restored
    api.check_alert_cleared(
        label=label,
        measure_end_time=measure_stop_ceph_mon.get("stop"),
    )
def test_noobaa_bucket_quota(measure_noobaa_exceed_bucket_quota):
    """
    Test that there are appropriate alerts when NooBaa Bucket Quota is
    reached.

    The expected alert table was previously duplicated in full for both
    version branches although only the quota-related states differ; the
    branches now share one table and vary only the states list.
    """
    api = prometheus.PrometheusAPI()
    alerts = measure_noobaa_exceed_bucket_quota.get("prometheus_alerts")
    # since version 4.5 all NooBaa alerts have defined Pending state
    if version.get_semantic_ocs_version_from_config() < version.VERSION_4_5:
        quota_states = ["firing"]
    else:
        quota_states = ["pending", "firing"]
    expected_alerts = [
        (
            constants.ALERT_BUCKETREACHINGQUOTASTATE,
            "A NooBaa Bucket Is In Reaching Quota State",
            quota_states,
            "warning",
        ),
        (
            constants.ALERT_BUCKETERRORSTATE,
            "A NooBaa Bucket Is In Error State",
            ["pending", "firing"],
            "warning",
        ),
        (
            constants.ALERT_BUCKETEXCEEDINGQUOTASTATE,
            "A NooBaa Bucket Is In Exceeding Quota State",
            quota_states,
            "warning",
        ),
    ]
    for target_label, target_msg, target_states, target_severity in expected_alerts:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity,
        )
        # the time to wait is increased because it takes more time for OCS
        # cluster to resolve its issues
        pg_wait = 480
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_noobaa_exceed_bucket_quota.get("stop"),
            time_min=pg_wait,
        )