def test_ceph_osd_stopped_pd(measure_stop_ceph_osd):
    """
    Test that there are appropriate incidents in PagerDuty when ceph osd
    is unavailable and that these incidents are cleared when the osd
    is back online.
    """
    api = pagerduty.PagerDutyAPI()

    # get incidents from time when osd deployment was scaled down
    incidents = measure_stop_ceph_osd.get("pagerduty_incidents")

    # check that incidents for the CephOSDDiskNotResponding and
    # CephClusterWarningState alerts are correctly raised
    for target_label in [
        constants.ALERT_OSDDISKNOTRESPONDING,
        constants.ALERT_CLUSTERWARNINGSTATE,
    ]:
        assert pagerduty.check_incident_list(
            summary=target_label,
            incidents=incidents,
            urgency="high",
        )
        api.check_incident_cleared(
            summary=target_label,
            measure_end_time=measure_stop_ceph_osd.get("stop"),
        )


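# For context, a minimal sketch of the kind of check that
# pagerduty.check_incident_list performs. This is an illustration only, not
# the actual ocs-ci helper: it assumes ``incidents`` is a list of incident
# dicts as returned by the PagerDuty REST API v2 ``GET /incidents`` endpoint,
# where each incident carries "title" and "urgency" fields.
def _example_check_incident_list(summary, incidents, urgency):
    """
    Illustrative sketch: return True if at least one incident whose title
    contains ``summary`` was raised with the expected ``urgency``.
    """
    matching = [
        incident
        for incident in incidents
        if summary in incident.get("title", "")
        and incident.get("urgency") == urgency
    ]
    return len(matching) > 0

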
def test_ceph_monitor_stopped_pd(measure_stop_ceph_mon):
    """
    Test that there are appropriate incidents in PagerDuty when ceph monitor
    is unavailable and that these incidents are cleared when the monitor
    is back online.
    """
    api = pagerduty.PagerDutyAPI()

    # get incidents from time when monitor deployment was scaled down
    incidents = measure_stop_ceph_mon.get("pagerduty_incidents")

    # check that incidents for the CephMonQuorumAtRisk and
    # CephClusterWarningState alerts are correctly raised
    for target_label in [
        constants.ALERT_MONQUORUMATRISK,
        constants.ALERT_CLUSTERWARNINGSTATE,
    ]:
        assert pagerduty.check_incident_list(
            summary=target_label,
            incidents=incidents,
            urgency="high",
        )
        api.check_incident_cleared(
            summary=target_label,
            measure_end_time=measure_stop_ceph_mon.get("stop"),
        )


def test_ceph_osd_stopped_pd(measure_stop_ceph_osd):
    """
    Test that there is an appropriate incident in PagerDuty when ceph osd
    is unavailable and that this incident is cleared when the osd
    is back online.
    """
    api = pagerduty.PagerDutyAPI()

    # get incidents from time when osd deployment was scaled down
    incidents = measure_stop_ceph_osd.get("pagerduty_incidents")
    target_label = constants.ALERT_OSDDISKNOTRESPONDING

    # TODO(fbalak): check the whole string in summary and incident alerts
    assert pagerduty.check_incident_list(
        summary=target_label,
        incidents=incidents,
        urgency="high",
    )
    api.check_incident_cleared(
        summary=target_label,
        measure_end_time=measure_stop_ceph_osd.get("stop"),
    )


def test_ceph_mons_quorum_lost_pd(measure_stop_ceph_mon):
    """
    Test that there are appropriate incidents in PagerDuty when all ceph
    monitors except one are unavailable and that these incidents are cleared
    when the monitors are back online.
    """
    api = pagerduty.PagerDutyAPI()

    # get incidents from time when monitor deployments were scaled down
    incidents = measure_stop_ceph_mon.get("pagerduty_incidents")

    # check that incident for the CephMonQuorumLost alert is correctly raised
    target_label = constants.ALERT_MONQUORUMLOST
    assert pagerduty.check_incident_list(
        summary=target_label,
        incidents=incidents,
        urgency="high",
    )
    api.check_incident_cleared(
        summary=target_label,
        measure_end_time=measure_stop_ceph_mon.get("stop"),
    )


def test_corrupt_pg_pd(measure_corrupt_pg):
    """
    Test that there is an appropriate incident in PagerDuty when a placement
    group on one OSD is corrupted and that this incident is cleared when the
    corrupted ceph pool is removed.
    """
    api = pagerduty.PagerDutyAPI()

    # get incidents from time when the placement group was corrupted
    incidents = measure_corrupt_pg.get("pagerduty_incidents")
    target_label = constants.ALERT_CLUSTERERRORSTATE

    # TODO(fbalak): check the whole string in summary and incident alerts
    assert pagerduty.check_incident_list(
        summary=target_label,
        incidents=incidents,
        urgency="high",
    )
    api.check_incident_cleared(
        summary=target_label,
        measure_end_time=measure_corrupt_pg.get("stop"),
    )


def test_stop_worker_nodes_pd(measure_stop_worker_nodes):
    """
    Test that there are appropriate incidents in PagerDuty when two worker
    nodes are unavailable and that these incidents are cleared when those
    nodes are back online.
    """
    api = pagerduty.PagerDutyAPI()

    # get incidents from time when the nodes were down
    incidents = measure_stop_worker_nodes.get("pagerduty_incidents")

    # check that incident for the CephNodeDown alert is correctly raised
    for target_label in [
        constants.ALERT_NODEDOWN,
    ]:
        assert pagerduty.check_incident_list(
            summary=target_label,
            incidents=incidents,
            urgency="high",
        )
        api.check_incident_cleared(
            summary=target_label,
            measure_end_time=measure_stop_worker_nodes.get("stop"),
        )


def test_ceph_osd_stopped_pd(measure_stop_ceph_osd):
    """
    Test that there are appropriate incidents in PagerDuty when ceph osd
    is unavailable and that these incidents are cleared when the osd
    is back online.
    """
    api = pagerduty.PagerDutyAPI()

    # get incidents from time when osd deployment was scaled down
    incidents = measure_stop_ceph_osd.get("pagerduty_incidents")

    # check that incident for the CephOSDDiskUnavailable alert is correctly raised
    for target_label in [
        constants.ALERT_OSDDISKUNAVAILABLE,
    ]:
        assert pagerduty.check_incident_list(
            summary=target_label,
            incidents=incidents,
            urgency="high",
        )
        api.check_incident_cleared(
            summary=target_label,
            measure_end_time=measure_stop_ceph_osd.get("stop"),
        )


def test_ceph_manager_stopped_pd(measure_stop_ceph_mgr):
    """
    Test that there is an appropriate incident in PagerDuty when ceph manager
    is unavailable and that this incident is cleared when the manager
    is back online.
    """
    api = pagerduty.PagerDutyAPI()

    # get incidents from time when manager deployment was scaled down
    incidents = measure_stop_ceph_mgr.get("pagerduty_incidents")

    for target_label in [
        constants.ALERT_MGRISABSENT,
        constants.ALERT_MGRISMISSINGREPLICAS,
    ]:
        # TODO(fbalak): check the whole string in summary and incident alerts
        assert pagerduty.check_incident_list(
            summary=target_label,
            incidents=incidents,
            urgency="high",
        )
        api.check_incident_cleared(
            summary=target_label,
            measure_end_time=measure_stop_ceph_mgr.get("stop"),
        )
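

# The "cleared" side of these tests is handled by
# PagerDutyAPI.check_incident_cleared. As a rough, illustrative sketch of that
# pattern (not the real ocs-ci implementation), the check can be thought of as
# polling the incident list until nothing matching the summary is left
# unresolved. ``get_incidents`` below is a hypothetical callable returning
# PagerDuty REST API v2 incident dicts with "title" and "status" fields.
import time


def _example_check_incident_cleared(summary, get_incidents, timeout=1200, sleep=60):
    """
    Illustrative sketch: wait until every incident whose title contains
    ``summary`` is resolved, or raise AssertionError on timeout.
    """
    end_time = time.time() + timeout
    while time.time() < end_time:
        open_incidents = [
            incident
            for incident in get_incidents()
            if summary in incident.get("title", "")
            and incident.get("status") != "resolved"
        ]
        if not open_incidents:
            return
        time.sleep(sleep)
    raise AssertionError(f"Incidents for {summary} were not cleared in time")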