def test_ceph_osd_stopped_pd(measure_stop_ceph_osd):
    """
    Verify PagerDuty incidents raised while a ceph osd is unavailable.

    For each expected alert the test asserts that a high-urgency incident
    with a matching summary was recorded during the measurement, and then
    asks the PagerDuty API wrapper to check that the incident was cleared
    after the osd came back online.

    Args:
        measure_stop_ceph_osd: measurement fixture result; read here through
            ``.get("pagerduty_incidents")`` and ``.get("stop")``.
    """
    pd_api = pagerduty.PagerDutyAPI()

    # incidents recorded while the osd deployment was scaled down
    recorded_incidents = measure_stop_ceph_osd.get("pagerduty_incidents")

    # both the ALERT_OSDDISKNOTRESPONDING and ALERT_CLUSTERWARNINGSTATE
    # incidents have to be raised and subsequently cleared
    expected_alerts = (
        constants.ALERT_OSDDISKNOTRESPONDING,
        constants.ALERT_CLUSTERWARNINGSTATE,
    )
    for alert_name in expected_alerts:
        incident_found = pagerduty.check_incident_list(
            summary=alert_name,
            incidents=recorded_incidents,
            urgency="high",
        )
        assert incident_found
        measurement_end = measure_stop_ceph_osd.get("stop")
        pd_api.check_incident_cleared(
            summary=alert_name,
            measure_end_time=measurement_end,
        )
def test_ceph_monitor_stopped_pd(measure_stop_ceph_mon):
    """
    Verify PagerDuty incidents raised while a ceph monitor is unavailable.

    Each expected alert must appear as a high-urgency incident in the
    measured incident list, and is then checked for being cleared once the
    monitor is back online.

    Args:
        measure_stop_ceph_mon: measurement fixture result; read here through
            ``.get("pagerduty_incidents")`` and ``.get("stop")``.
    """
    pd_api = pagerduty.PagerDutyAPI()

    # incidents recorded while the monitor deployment was scaled down
    recorded_incidents = measure_stop_ceph_mon.get("pagerduty_incidents")

    # ALERT_MONQUORUMATRISK and ALERT_CLUSTERWARNINGSTATE are both expected
    expected_alerts = (
        constants.ALERT_MONQUORUMATRISK,
        constants.ALERT_CLUSTERWARNINGSTATE,
    )
    for alert_name in expected_alerts:
        incident_found = pagerduty.check_incident_list(
            summary=alert_name,
            incidents=recorded_incidents,
            urgency="high",
        )
        assert incident_found
        measurement_end = measure_stop_ceph_mon.get("stop")
        pd_api.check_incident_cleared(
            summary=alert_name,
            measure_end_time=measurement_end,
        )
Example #3
0
def test_ceph_osd_stopped_pd(measure_stop_ceph_osd):
    """
    Verify the PagerDuty incident raised while a ceph osd is unavailable.

    Asserts that a high-urgency incident whose summary matches
    ``constants.ALERT_OSDDISKNOTRESPONDING`` was recorded, then checks that
    the incident is cleared when the osd is back online.

    Args:
        measure_stop_ceph_osd: measurement fixture result; read here through
            ``.get("pagerduty_incidents")`` and ``.get("stop")``.
    """
    pd_api = pagerduty.PagerDutyAPI()

    # incidents recorded during the osd measurement
    recorded_incidents = measure_stop_ceph_osd.get("pagerduty_incidents")
    alert_name = constants.ALERT_OSDDISKNOTRESPONDING

    # TODO(fbalak): check the whole string in summary and incident alerts
    incident_found = pagerduty.check_incident_list(
        summary=alert_name,
        incidents=recorded_incidents,
        urgency="high",
    )
    assert incident_found

    measurement_end = measure_stop_ceph_osd.get("stop")
    pd_api.check_incident_cleared(
        summary=alert_name,
        measure_end_time=measurement_end,
    )
def test_ceph_mons_quorum_lost_pd(measure_stop_ceph_mon):
    """
    Verify the PagerDuty incident raised when all ceph monitors but one
    are unavailable.

    Asserts that a high-urgency incident whose summary matches
    ``constants.ALERT_MONQUORUMLOST`` was recorded, then checks that the
    incident is cleared once the monitors are back online.

    Args:
        measure_stop_ceph_mon: measurement fixture result; read here through
            ``.get("pagerduty_incidents")`` and ``.get("stop")``.
    """
    pd_api = pagerduty.PagerDutyAPI()

    # incidents recorded while the monitor deployments were scaled down
    recorded_incidents = measure_stop_ceph_mon.get("pagerduty_incidents")

    # the quorum-lost incident must have been raised ...
    alert_name = constants.ALERT_MONQUORUMLOST
    incident_found = pagerduty.check_incident_list(
        summary=alert_name,
        incidents=recorded_incidents,
        urgency="high",
    )
    assert incident_found

    # ... and cleared again after the measurement finished
    measurement_end = measure_stop_ceph_mon.get("stop")
    pd_api.check_incident_cleared(
        summary=alert_name,
        measure_end_time=measurement_end,
    )
Example #5
0
def test_corrupt_pg_pd(measure_corrupt_pg):
    """
    Verify the PagerDuty incident raised when a Placement group on one OSD
    is corrupted.

    Asserts that a high-urgency incident whose summary matches
    ``constants.ALERT_CLUSTERERRORSTATE`` was recorded, then checks that the
    incident is cleared when the corrupted ceph pool is removed.

    Args:
        measure_corrupt_pg: measurement fixture result; read here through
            ``.get("pagerduty_incidents")`` and ``.get("stop")``.
    """
    pd_api = pagerduty.PagerDutyAPI()

    # incidents recorded during the corrupted placement group measurement
    recorded_incidents = measure_corrupt_pg.get("pagerduty_incidents")
    alert_name = constants.ALERT_CLUSTERERRORSTATE

    # TODO(fbalak): check the whole string in summary and incident alerts
    incident_found = pagerduty.check_incident_list(
        summary=alert_name,
        incidents=recorded_incidents,
        urgency="high",
    )
    assert incident_found

    measurement_end = measure_corrupt_pg.get("stop")
    pd_api.check_incident_cleared(
        summary=alert_name,
        measure_end_time=measurement_end,
    )
def test_stop_worker_nodes_pd(measure_stop_worker_nodes):
    """
    Verify the PagerDuty incident raised while two worker nodes are
    unavailable.

    Asserts that a high-urgency incident whose summary matches
    ``constants.ALERT_NODEDOWN`` was recorded, then checks that the incident
    is cleared when those nodes are back online.

    Args:
        measure_stop_worker_nodes: measurement fixture result; read here
            through ``.get("pagerduty_incidents")`` and ``.get("stop")``.
    """
    pd_api = pagerduty.PagerDutyAPI()

    # incidents recorded while the nodes were down
    recorded_incidents = measure_stop_worker_nodes.get("pagerduty_incidents")

    # only the node-down incident is expected for this scenario
    alert_name = constants.ALERT_NODEDOWN
    incident_found = pagerduty.check_incident_list(
        summary=alert_name,
        incidents=recorded_incidents,
        urgency="high",
    )
    assert incident_found

    measurement_end = measure_stop_worker_nodes.get("stop")
    pd_api.check_incident_cleared(
        summary=alert_name,
        measure_end_time=measurement_end,
    )
Example #7
0
def test_ceph_osd_stopped_pd(measure_stop_ceph_osd):
    """
    Verify the PagerDuty incident raised while a ceph osd is unavailable.

    Asserts that a high-urgency incident whose summary matches
    ``constants.ALERT_OSDDISKUNAVAILABLE`` was recorded, then checks that
    the incident is cleared when the osd is back online.

    Args:
        measure_stop_ceph_osd: measurement fixture result; read here through
            ``.get("pagerduty_incidents")`` and ``.get("stop")``.
    """
    pd_api = pagerduty.PagerDutyAPI()

    # incidents recorded while the osd deployment was scaled down
    recorded_incidents = measure_stop_ceph_osd.get("pagerduty_incidents")

    # only the osd-disk-unavailable incident is expected for this scenario
    alert_name = constants.ALERT_OSDDISKUNAVAILABLE
    incident_found = pagerduty.check_incident_list(
        summary=alert_name,
        incidents=recorded_incidents,
        urgency="high",
    )
    assert incident_found

    measurement_end = measure_stop_ceph_osd.get("stop")
    pd_api.check_incident_cleared(
        summary=alert_name,
        measure_end_time=measurement_end,
    )
Example #8
0
def test_ceph_manager_stopped_pd(measure_stop_ceph_mgr):
    """
    Verify PagerDuty incidents raised while the ceph manager is unavailable.

    Each expected alert must appear as a high-urgency incident in the
    measured incident list, and is then checked for being cleared once the
    manager is back online.

    Args:
        measure_stop_ceph_mgr: measurement fixture result; read here through
            ``.get("pagerduty_incidents")`` and ``.get("stop")``.
    """
    pd_api = pagerduty.PagerDutyAPI()

    # incidents recorded while the manager deployment was scaled down
    recorded_incidents = measure_stop_ceph_mgr.get("pagerduty_incidents")

    expected_alerts = (
        constants.ALERT_MGRISABSENT,
        constants.ALERT_MGRISMISSINGREPLICAS,
    )
    for alert_name in expected_alerts:
        # TODO(fbalak): check the whole string in summary and incident alerts
        incident_found = pagerduty.check_incident_list(
            summary=alert_name,
            incidents=recorded_incidents,
            urgency="high",
        )
        assert incident_found
        measurement_end = measure_stop_ceph_mgr.get("stop")
        pd_api.check_incident_cleared(
            summary=alert_name,
            measure_end_time=measurement_end,
        )