Example #1
def workload_idle(measurement_dir):
    """
    This workload represents a relatively long timeframe when nothing special
    is happening, for test cases checking the default status of various
    components (eg. no error alert is reported out of the blue, ceph should be
    healthy ...).

    Besides sheer waiting, this workload also checks that the number of ceph
    components (OSD and MON only) is the same at the start and end of this
    wait, and passes the numbers to the test. If the number changes, something
    not exactly expected was happening with the cluster (eg. some node went
    offline, or the cluster was expanded, ...) which doesn't match the idea of
    idle waiting and *invalidates the expectations of this workload*. Running
    test cases which expect an idle workload in such a case would be
    misleading, so we fail the workload instead.
    """
    def count_ceph_components():
        ct_pod = pod.get_ceph_tools_pod()
        ceph_osd_ls_list = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd ls")
        logger.debug(f"ceph osd ls output: {ceph_osd_ls_list}")
        # the "+ 1" is a WORKAROUND for a bug in exec_ceph_cmd()
        # https://github.com/red-hat-storage/ocs-ci/issues/1152
        osd_num = len(ceph_osd_ls_list) + 1
        mon_num = len(ct_pod.exec_ceph_cmd(ceph_cmd="ceph mon metadata"))
        logger.info(f"There are {osd_num} OSDs, {mon_num} MONs")
        return osd_num, mon_num

    def do_nothing():
        sleep_time = 60 * 15  # seconds
        logger.info(f"idle workload is about to sleep for {sleep_time} s")
        osd_num_1, mon_num_1 = count_ceph_components()
        time.sleep(sleep_time)
        osd_num_2, mon_num_2 = count_ceph_components()
        # If this fails, we are likely observing an infra error or unsolicited
        # interference with test cluster from the outside. It could also be a
        # product bug, but this is less likely. See also docstring of this
        # workload fixture.
        msg = ("Assumption that nothing serious is happening was not met, "
               "number of selected ceph components should be the same")
        assert osd_num_1 == osd_num_2, msg
        assert mon_num_1 == mon_num_2, msg
        assert osd_num_1 >= 3, "OCS cluster should have at least 3 OSDs"
        result = {'osd_num': osd_num_1, 'mon_num': mon_num_1}
        return result

    test_file = os.path.join(measurement_dir, 'measure_workload_idle.json')
    measured_op = measure_operation(do_nothing, test_file)
    return measured_op
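
# --- Illustrative sketch, not part of the fixture above ---
# All of these workload fixtures delegate to measure_operation() from
# ocs_ci.utility.workloadfixture. The following is only a rough sketch of what
# that helper is assumed to do (run the callable, record start/stop timestamps,
# and cache the result as JSON so that repeated runs reuse the first
# measurement); the real implementation differs in its details and options.
import json
import os
import time


def measure_operation_sketch(operation, result_file, minimal_time=None):
    # reuse a cached measurement if this fixture already ran in this session
    if os.path.exists(result_file):
        with open(result_file) as result_fd:
            cached = json.load(result_fd)
        cached["first_run"] = False
        return cached
    start = time.time()
    result = operation()
    if minimal_time is not None:
        # make sure the measured window is at least minimal_time seconds long
        additional_time = minimal_time - (time.time() - start)
        if additional_time > 0:
            time.sleep(additional_time)
    stop = time.time()
    measured = {"start": start, "stop": stop, "result": result, "first_run": True}
    with open(result_file, "w") as result_fd:
        json.dump(measured, result_fd)
    return measured
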
Example #2
def measure_stop_ceph_mgr(measurement_dir):
    """
    Downscales Ceph Manager deployment, measures the time when it was
    downscaled and monitors alerts that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            Ceph Manager pod
    """
    oc = ocp.OCP(
        kind=constants.DEPLOYMENT, namespace=config.ENV_DATA["cluster_namespace"]
    )
    mgr_deployments = oc.get(selector=constants.MGR_APP_LABEL)["items"]
    mgr = mgr_deployments[0]["metadata"]["name"]

    def stop_mgr():
        """
        Downscale Ceph Manager deployment for 6 minutes. For the first 5
        minutes the alert should be in 'Pending' state.
        After 5 minutes it should be 'Firing'.
        This configuration of monitoring can be observed in ceph-mixins which
        are used in the project:
            https://github.com/ceph/ceph-mixins/blob/d22afe8c0da34490cb77e52a202eefcf4f62a869/config.libsonnet#L25

        Returns:
            str: Name of downscaled deployment
        """
        # run_time of operation
        run_time = 60 * 6
        nonlocal oc
        nonlocal mgr
        logger.info(f"Downscaling deployment {mgr} to 0")
        oc.exec_oc_cmd(f"scale --replicas=0 deployment/{mgr}")
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return oc.get(mgr)

    test_file = os.path.join(measurement_dir, "measure_stop_ceph_mgr.json")
    measured_op = measure_operation(stop_mgr, test_file)
    logger.info(f"Upscaling deployment {mgr} back to 1")
    oc.exec_oc_cmd(f"scale --replicas=1 deployment/{mgr}")

    # wait for ceph to return into HEALTH_OK state after mgr deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op
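
# --- Illustrative sketch, not part of the fixture above ---
# A hypothetical test consuming the measured_op dict returned by this fixture.
# The prometheus_alerts_in_range fixture is an assumption made only for this
# sketch; the real ocs-ci tests use their own Prometheus API wrapper to fetch
# alerts between measured_op["start"] and measured_op["stop"], and the alert
# name checked here (CephMgrIsAbsent) comes from ceph-mixins.
def test_ceph_mgr_stopped_sketch(measure_stop_ceph_mgr, prometheus_alerts_in_range):
    measured_op = measure_stop_ceph_mgr
    alerts = prometheus_alerts_in_range(
        start=measured_op["start"], stop=measured_op["stop"]
    )
    alert_names = {alert["labels"]["alertname"] for alert in alerts}
    assert "CephMgrIsAbsent" in alert_names
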
Example #3
def measure_stop_worker_node(measurement_dir, nodes):
    """
    Stop one worker node, measure the time when it was stopped and monitor
    alerts that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            worker node

    """
    node = get_nodes(node_type="worker")[0]

    def stop_node():
        """
        Turn off one worker node for 6 minutes.

        Returns:
            str: Node that was turned down

        """
        # run_time of operation
        run_time = 60 * 6
        nonlocal node
        logger.info(f"Turning off node {node.name}")
        nodes.stop_nodes(nodes=[node])
        # Validate node reached NotReady state
        wait_for_nodes_status(node_names=[node.name], status=constants.NODE_NOT_READY)
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return node.name

    test_file = os.path.join(measurement_dir, "measure_stop_node.json")
    measured_op = measure_operation(stop_node, test_file)
    logger.info(f"Turning on node {node.name}")
    nodes.start_nodes(nodes=[node])
    # Validate all nodes are in READY state and up
    retry((CommandFailed, ResourceWrongStatusException,), tries=60, delay=15,)(
        wait_for_nodes_status
    )(timeout=900)

    # wait for ceph to return into HEALTH_OK state after the worker node
    # is back online
    ceph_health_check(tries=20, delay=15)

    return measured_op
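
# --- Illustrative sketch, not part of the fixture above ---
# The retry((...), tries=60, delay=15)(wait_for_nodes_status)(timeout=900) call
# above first builds a decorator, wraps wait_for_nodes_status with it, and then
# calls the wrapped function. This is a minimal reimplementation of that
# pattern for illustration only, not the real ocs_ci retry helper.
import functools
import time


def retry_sketch(exceptions, tries=3, delay=1):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, tries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    if attempt == tries:
                        raise
                    time.sleep(delay)
        return wrapper
    return decorator


# usage mirroring the fixture code: retry_sketch(excs, tries, delay)(func)(args)
# retry_sketch((OSError,), tries=3, delay=1)(os.listdir)("/tmp")
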
Example #4
def measure_stop_rgw(measurement_dir, request, rgw_deployments):
    """
    Downscales RGW deployments, measures the time when they were
    downscaled and monitors alerts that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            RGW pods

    """
    oc = ocp.OCP(
        kind=constants.DEPLOYMENT, namespace=config.ENV_DATA["cluster_namespace"]
    )

    def stop_rgw():
        """
        Downscale RGW interface deployments for 5 minutes.

        Returns:
            list: Downscaled RGW deployments

        """
        # run_time of operation
        run_time = 60 * 5
        nonlocal oc
        nonlocal rgw_deployments
        for rgw_deployment in rgw_deployments:
            rgw = rgw_deployment["metadata"]["name"]
            logger.info(f"Downscaling deployment {rgw} to 0")
            oc.exec_oc_cmd(f"scale --replicas=0 deployment/{rgw}")
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return rgw_deployments

    test_file = os.path.join(measurement_dir, "measure_stop_rgw.json")
    measured_op = measure_operation(stop_rgw, test_file)

    logger.info("Return RGW pods")
    for rgw_deployment in rgw_deployments:
        rgw = rgw_deployment["metadata"]["name"]
        logger.info(f"Upscaling deployment {rgw} to 1")
        oc.exec_oc_cmd(f"scale --replicas=1 deployment/{rgw}")

    return measured_op
Example #5
def workload_fio_storageutilization(
    fixture_name,
    project,
    fio_pvc_dict,
    fio_job_dict,
    fio_configmap_dict,
    measurement_dir,
    tmp_path,
    target_percentage=None,
    target_size=None,
    with_checksum=False,
):
    """
    This function implements core functionality of fio storage utilization
    workload fixtures. This is necessary because we can't parametrize single
    general fixture over multiple parameters (it would mess with test case id
    and polarion test case tracking).

    It works as a workload fixture, as understood by
    :py:mod:`ocs_ci.utility.workloadfixture` module.

    When ``target_percentage`` is specified, the goal of the fixture is to fill
    whatever is left so that total cluster utilization reaches the target
    percentage. This means that in this mode, the amount of data written depends
    on both total capacity and current utilization. If the current storage
    utilization already exceeds the target, the test is skipped.

    On the other hand with ``target_size``, you can specify the size of data
    written by fio directly.

    Args:
        fixture_name (str): name of the fixture using this function (for
            logging and k8s object labeling purposes)
        project (ocs_ci.ocs.ocp.OCP): OCP object of project in which the Job is
            deployed, as created by ``project_factory`` or ``project`` fixture
        fio_pvc_dict (dict): PVC k8s struct for fio target volume
        fio_job_dict (dict): Job k8s struct for fio job
        fio_configmap_dict (dict): configmap k8s struct with fio config file
        measurement_dir (str): reference to a fixture which represents a
            directory where measurement results are stored, see also
            :py:func:`ocs_ci.utility.workloadfixture.measure_operation()`
        tmp_path (pathlib.PosixPath): reference to pytest ``tmp_path`` fixture
        target_percentage (float): target utilization as percentage wrt all
            usable OCS space, eg. 0.50 means a request to reach 50% of total
            OCS storage utilization (wrt usable space)
        target_size (int): target size of the PVC for fio to use, eg. 10 means
            a request for fio to write 10GiB of data
        with_checksum (bool): if true, sha1 checksum of the data written by
            fio is stored on the volume, and reclaim policy of the volume is
            changed to ``Retain`` so that the volume is not removed during test
            teardown for later verification runs

    Returns:
        dict: measurement results with timestamps and other metadata from
            :py:func:`ocs_ci.utility.workloadfixture.measure_operation()`

    """
    val_err_msg = "Specify either target_size or target_percentage"
    if target_size is None and target_percentage is None:
        raise ValueError(
            val_err_msg +
            ", it's not clear how much storage space should be used.")
    if target_size is not None and target_percentage is not None:
        raise ValueError(val_err_msg + ", not both.")

    # TODO: move out storage class names
    if fixture_name.endswith("rbd"):
        storage_class_name = "ocs-storagecluster-ceph-rbd"
        ceph_pool_name = "ocs-storagecluster-cephblockpool"
    elif fixture_name.endswith("cephfs"):
        storage_class_name = "ocs-storagecluster-cephfs"
        ceph_pool_name = "ocs-storagecluster-cephfilesystem-data0"
    else:
        raise UnexpectedVolumeType(
            "unexpected volume type, ocs-ci code is wrong")

    # make sure we communicate what is going to happen
    logger.info((f"starting {fixture_name} fixture, "
                 f"using {storage_class_name} storage class "
                 f"backed by {ceph_pool_name} ceph pool"))

    # log ceph mon_osd_*_ratio values for QE team to understand behaviour of
    # ceph cluster during high utilization levels (for expected values, consult
    # BZ 1775432 and check that there is no more recent BZ or JIRA in this
    # area)
    ceph_full_ratios = [
        'mon_osd_full_ratio',
        'mon_osd_backfillfull_ratio',
        'mon_osd_nearfull_ratio',
    ]
    ct_pod = pod.get_ceph_tools_pod()
    for ceph_ratio in ceph_full_ratios:
        logger.info("checking value of %s", ceph_ratio)
        value = ct_pod.exec_ceph_cmd(f'ceph config get mon.* {ceph_ratio}')
        logger.info(f"{ceph_ratio} is {value}")

    if target_size is not None:
        pvc_size = target_size
    else:
        pvc_size = get_storageutilization_size(target_percentage,
                                               ceph_pool_name)

    # To handle the use case of test_workload_rbd_cephfs_minimal, which writes
    # data to reach only a small fraction of the total capacity (eg. 5%), the
    # test is going to increase the target 2x and try again.
    if pvc_size <= 0 and target_percentage is not None and target_percentage <= 0.10:
        new_target_percentage = 2 * target_percentage
        logger.info(
            "increasing storage utilization target percentage from %.2f to %.2f",
            target_percentage, new_target_percentage)
        target_percentage = new_target_percentage
        pvc_size = get_storageutilization_size(target_percentage,
                                               ceph_pool_name)
    # If this is still not enough, the test will be skipped, because the idea
    # of tests reaching a small total utilization is to do just that.
    # Moreover this will also skip this test case for any other utilization
    # level, which is easier to read in the test report than the actual
    # failure with negative pvc size.
    if pvc_size <= 0 and target_percentage is not None:
        skip_msg = (
            "current total storage utilization is too high, "
            f"the target utilization {target_percentage*100}% is already met")
        logger.warning(skip_msg)
        pytest.skip(skip_msg)

    fio_conf = textwrap.dedent("""
        [simple-write]
        readwrite=write
        buffered=1
        blocksize=4k
        ioengine=libaio
        directory=/mnt/target
        """)

    # When we ask for checksum to be generated for all files written in the
    # /mnt/target directory, we need to keep some space free so that the
    # checksum file would fit there. We overestimate this free space so that
    # it works both with CephFS and RBD volumes, as with RBD volumes the actual
    # usable capacity is smaller because of filesystem overhead (pvc size
    # defines the size of a block device, on which a local ext4 filesystem is
    # formatted).
    if with_checksum:
        # assume 4% fs overhead, and double it to make it safe
        fs_overhead = 0.08
        # size of file created by fio in MiB
        fio_size = int((pvc_size * (1 - fs_overhead)) * 2**10)
        fio_conf += f"size={fio_size}M\n"
    # Otherwise, we are trying to write as much data as possible and fill the
    # persistent volume entirely.
    # For cephfs we can't use fill_fs because of BZ 1763808 (the process
    # will get *Disk quota exceeded* error instead of *No space left on
    # device* error).
    # On the other hand, we can't use size={pvc_size} for rbd, as we can't
    # write pvc_size bytes to a filesystem on a block device of {pvc_size}
    # size (obviously, some space is used by filesystem metadata).
    elif fixture_name.endswith("rbd"):
        fio_conf += "fill_fs=1\n"
    else:
        fio_conf += f"size={pvc_size}G\n"

    # When we ask for checksum to be generated for all files written in the
    # /mnt/target directory, we change the command of the container to run
    # both fio and sha1 checksum tool in the target directory. To do that,
    # we use the '/bin/bash -c' hack.
    if with_checksum:
        container = fio_job_dict['spec']['template']['spec']['containers'][0]
        fio_command = " ".join(container['command'])
        sha_command = ("sha1sum /mnt/target/simple-write.*"
                       " > /mnt/target/fio.sha1sum"
                       " 2> /mnt/target/fio.stderr")
        shell_command = fio_command + " && " + sha_command
        container['command'] = ["/bin/bash", "-c", shell_command]

    # put the dicts together into yaml file of the Job
    fio_configmap_dict["data"]["workload.fio"] = fio_conf
    fio_pvc_dict["spec"]["storageClassName"] = storage_class_name
    fio_pvc_dict["spec"]["resources"]["requests"]["storage"] = f"{pvc_size}Gi"
    fio_objs = [fio_pvc_dict, fio_configmap_dict, fio_job_dict]
    fio_job_file = ObjectConfFile(fixture_name, fio_objs, project, tmp_path)

    fio_min_mbps = config.ENV_DATA['fio_storageutilization_min_mbps']
    write_timeout = get_timeout(fio_min_mbps, pvc_size)

    test_file = os.path.join(measurement_dir, f"{fixture_name}.json")

    measured_op = measure_operation(lambda: write_data_via_fio(
        fio_job_file, write_timeout, pvc_size, target_percentage),
                                    test_file,
                                    measure_after=True,
                                    minimal_time=480)

    # we don't need to delete anything if this fixture has already been
    # executed
    if not measured_op['first_run']:
        return measured_op

    # measure MAX AVAIL value just before reclamation of data written by fio
    _, max_avail_before_delete = get_ceph_storage_stats(ceph_pool_name)

    def is_storage_reclaimed():
        """
        Check whether data created by the Job were actually deleted.
        """
        _, max_avail = get_ceph_storage_stats(ceph_pool_name)
        reclaimed_size = round((max_avail - max_avail_before_delete) / 2**30)
        logger.info("%d Gi of %d Gi (PVC size) seems already reclaimed",
                    reclaimed_size, pvc_size)
        result = reclaimed_size >= pvc_size * 0.9
        if result:
            logger.info("Storage for the PVC was at least 90% reclaimed.")
        else:
            logger.info("Storage for the PVC was not yet reclaimed enough.")
        return result

    if with_checksum:
        # Let's get the name of the PV via the PVC.
        ocp_pvc = ocp.OCP(kind=constants.PVC, namespace=project.namespace)
        pvc_data = ocp_pvc.get()
        # Explicit list of assumptions: if these assumptions are not met, the
        # code won't work and it either means that something went terribly
        # wrong or that the code needs to be changed.
        assert pvc_data['kind'] == "List"
        assert len(pvc_data['items']) == 1
        pvc_dict = pvc_data['items'][0]
        assert pvc_dict['kind'] == constants.PVC
        pv_name = pvc_dict['spec']['volumeName']
        logger.info("Identified PV of the finished fio Job: %s", pv_name)
        # We change reclaim policy of the volume, so that we can reuse it
        # later, while everything but the volume will be deleted during project
        # teardown. Note that while a standard way of doing this would be via
        # custom storage class with redefined reclaim policy, we need to do
        # this on this single volume only here, so editing volume directly is
        # more straightforward.
        logger.info("Changing persistentVolumeReclaimPolicy of %s", pv_name)
        ocp_pv = ocp.OCP(kind=constants.PV)
        patch_success = ocp_pv.patch(
            resource_name=pv_name,
            params='{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}')
        if patch_success:
            logger.info('Reclaim policy of %s was changed.', pv_name)
        else:
            logger.error('Reclaim policy of %s failed to be changed.', pv_name)
        label = f'fixture={fixture_name}'
        ocp_pv.add_label(pv_name, label)
    else:
        # Without checksum, we just need to make sure that data were deleted
        # and wait for this to happen to avoid conflicts with tests executed
        # right after this one.
        delete_fio_data(fio_job_file, is_storage_reclaimed)

    return measured_op
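
# --- Illustrative sketch, not part of the fixture above ---
# Worked example of the sizing arithmetic used in this fixture. The fio_size
# formula is taken from the code above; the write_timeout formula is only an
# assumption about what get_timeout() roughly computes (time needed to write
# the whole PVC at the minimal expected throughput), not its implementation.
pvc_size = 10          # GiB, example value
fs_overhead = 0.08     # assumed 4% ext4 overhead, doubled to be safe
fio_size = int((pvc_size * (1 - fs_overhead)) * 2**10)
print(fio_size)        # 9420 MiB of data for fio to write on a 10 GiB PVC

fio_min_mbps = 30      # example value of fio_storageutilization_min_mbps
write_timeout = (pvc_size * 2**10) / fio_min_mbps
print(write_timeout)   # ~341 seconds to write 10 GiB at 30 MiB/s
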
Example #6
def measure_stop_worker_nodes(request, measurement_dir, nodes):
    """
    Stop worker nodes that don't contain RGW (so that alerts are triggered
    correctly), measure the time when they were stopped and monitor alerts that
    were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            worker nodes

    """
    mgr_pod = pod.get_mgr_pods()[0]
    mgr_node = pod.get_pod_node(mgr_pod)
    test_nodes = [
        worker_node
        for worker_node in get_nodes(node_type=constants.WORKER_MACHINE)
        if worker_node.name != mgr_node.name
    ]

    def stop_nodes():
        """
        Turn off test nodes for 5 minutes.

        Returns:
            list: Names of nodes that were turned down

        """
        # run_time of operation
        run_time = 60 * 5
        nonlocal test_nodes
        node_names = [node.name for node in test_nodes]
        logger.info(f"Turning off nodes {node_names}")
        nodes.stop_nodes(nodes=test_nodes)
        # Validate node reached NotReady state
        wait_for_nodes_status(node_names=node_names, status=constants.NODE_NOT_READY)
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return node_names

    def finalizer():
        nodes.restart_nodes_by_stop_and_start_teardown()
        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    request.addfinalizer(finalizer)

    test_file = os.path.join(measurement_dir, "measure_stop_nodes.json")
    if config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS:
        # It seems that it takes longer to propagate incidents to PagerDuty.
        # Adding 3 extra minutes
        measured_op = measure_operation(stop_nodes, test_file, minimal_time=60 * 8)
    else:
        measured_op = measure_operation(stop_nodes, test_file)
    logger.info("Turning on nodes")
    try:
        nodes.start_nodes(nodes=test_nodes)
    except CommandFailed:
        logger.warning(
            "Nodes were not found: they were probably recreated. Check ceph health below"
        )
    # Validate all nodes are in READY state and up
    retry((CommandFailed, ResourceWrongStatusException,), tries=60, delay=15,)(
        wait_for_nodes_status
    )(timeout=900)

    # wait for ceph to return into HEALTH_OK state after the worker nodes
    # are back online
    ceph_health_check(tries=20, delay=15)

    return measured_op
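
# --- Illustrative sketch, not part of the fixture above ---
# Minimal example of the request.addfinalizer() pattern used in this fixture:
# pytest runs the registered finalizer during fixture teardown, even when the
# fixture body or a test using the fixture fails after registration. The
# fixture name and the print placeholder below are hypothetical.
import pytest


@pytest.fixture
def cluster_disruption_sketch(request):
    def finalizer():
        # teardown: restore cluster state and verify health
        print("restarting nodes and checking ceph health")

    request.addfinalizer(finalizer)
    # the disruptive setup happens after the finalizer is registered, so the
    # cleanup runs even when the setup below raises
    return "measured data"
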
Example #7
def workload_idle(measurement_dir):
    """
    This workload represents a relatively long timeframe when nothing special
    is happening, for test cases checking the default status of various
    components (eg. no error alert is reported out of the blue, ceph should be
    healthy ...).

    Besides sheer waiting, this workload also checks that the number of ceph
    components (OSD and MON only) is the same at the start and end of this
    wait, and passes the numbers to the test. If the number changes, something
    not exactly expected was happening with the cluster (eg. some node went
    offline, or the cluster was expanded, ...) which doesn't match the idea of
    idle waiting and *invalidates the expectations of this workload*. Running
    test cases which expect an idle workload in such a case would be
    misleading, so we fail the workload instead.
    """

    def count_ceph_components():
        ct_pod = pod.get_ceph_tools_pod()
        ceph_osd_ls_list = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd ls")
        logger.debug(f"ceph osd ls output: {ceph_osd_ls_list}")
        # the "+ 1" is a WORKAROUND for a bug in exec_ceph_cmd()
        # https://github.com/red-hat-storage/ocs-ci/issues/1152
        osd_num = len(ceph_osd_ls_list) + 1
        mon_num = len(ct_pod.exec_ceph_cmd(ceph_cmd="ceph mon metadata"))
        logger.info(f"There are {osd_num} OSDs, {mon_num} MONs")
        return osd_num, mon_num

    def do_nothing():
        sleep_time = 60 * 15  # seconds
        logger.info(f"idle workload is about to sleep for {sleep_time} s")
        osd_num_1, mon_num_1 = count_ceph_components()
        time.sleep(sleep_time)
        osd_num_2, mon_num_2 = count_ceph_components()
        # If this fails, we are likely observing an infra error or unsolicited
        # interference with test cluster from the outside. It could also be a
        # product bug, but this is less likely. See also docstring of this
        # workload fixture.
        msg = (
            "Assumption that nothing serious is happening was not met, "
            "number of selected ceph components should be the same"
        )
        assert osd_num_1 == osd_num_2, msg
        assert mon_num_1 == mon_num_2, msg
        assert osd_num_1 >= 3, "OCS cluster should have at least 3 OSDs"
        result = {"osd_num": osd_num_1, "mon_num": mon_num_1}
        return result

    test_file = os.path.join(measurement_dir, "measure_workload_idle.json")

    # if io_in_bg is detected, request and wait for its temporary shutdown
    # but only if the fixture will actually run and measure the workload
    restart_io_in_bg = False
    if not is_measurement_done(test_file) and config.RUN.get("io_in_bg"):
        logger.info("io_in_bg detected, trying to pause it via load_status")
        config.RUN["load_status"] = "to_be_paused"
        restart_io_in_bg = True
        timeout = 600
        sleep_time = 60
        ts = TimeoutSampler(timeout, sleep_time, config.RUN.get, "load_status")
        try:
            for load_status in ts:
                if load_status == "paused":
                    logger.info("io_in_bg seems paused now")
                    break
        except ocs_ci.ocs.exceptions.TimeoutExpiredError as ex:
            error_msg = (
                f"io_in_bg failed to stop within the {timeout} s timeout, "
                "a bug in io_in_bg (of ocs-ci) prevents execution of "
                "test cases which use this fixture; rerun the affected "
                "test cases in a dedicated run and consider an ocs-ci fix"
            )
            logger.error(ex)
            logger.error(error_msg)
            raise Exception(error_msg)
    else:
        logger.debug("io_in_bg not detected, good")

    measured_op = measure_operation(do_nothing, test_file)
    if restart_io_in_bg:
        logger.info("reverting load_status to resume io_in_bg")
        config.RUN["load_status"] = "to_be_resumed"
    return measured_op
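
# --- Illustrative sketch, not part of the fixture above ---
# Rough sketch of the TimeoutSampler polling pattern used above to wait for
# load_status == "paused": call the function every `sleep` seconds, yield each
# result to the caller, and give up once `timeout` is exceeded. This is an
# illustrative reimplementation, not the real ocs_ci TimeoutSampler.
import time


class TimeoutSamplerSketch:
    def __init__(self, timeout, sleep, func, *func_args, **func_kwargs):
        self.timeout = timeout
        self.sleep = sleep
        self.func = func
        self.func_args = func_args
        self.func_kwargs = func_kwargs

    def __iter__(self):
        deadline = time.time() + self.timeout
        while time.time() < deadline:
            yield self.func(*self.func_args, **self.func_kwargs)
            time.sleep(self.sleep)
        raise TimeoutError(f"timed out after {self.timeout} seconds")


# usage mirroring the fixture: iterate until the sampled value matches
# for load_status in TimeoutSamplerSketch(600, 60, config.RUN.get, "load_status"):
#     if load_status == "paused":
#         break
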
Example #8
def measure_noobaa_exceed_bucket_quota(measurement_dir, request, mcg_obj, awscli_pod):
    """
    Create NooBaa bucket, set its capacity quota to 2GB and fill it with data.

    Returns:
        dict: Contains information about `start` and `stop` time for
        exceeding the NooBaa bucket quota
    """
    bucket_name = create_unique_resource_name(
        resource_description="bucket", resource_type="s3"
    )
    bucket = MCGS3Bucket(bucket_name, mcg=mcg_obj)
    mcg_obj.send_rpc_query(
        "bucket_api",
        "update_bucket",
        {"name": bucket_name, "quota": {"unit": "GIGABYTE", "size": 2}},
    )
    bucket_info = mcg_obj.get_bucket_info(bucket.name)
    logger.info(f"Bucket {bucket.name} storage: {bucket_info['storage']}")
    logger.info(f"Bucket {bucket.name} data: {bucket_info['data']}")

    def teardown():
        """
        Delete test bucket.
        """
        bucket.delete()

    request.addfinalizer(teardown)

    def exceed_bucket_quota():
        """
        Upload 5 files with 500MB size into bucket that has quota set to 2GB.

        Returns:
            str: Name of utilized bucket
        """
        nonlocal mcg_obj
        nonlocal bucket_name
        nonlocal awscli_pod
        # run_time of operation
        run_time = 60 * 14
        awscli_pod.exec_cmd_on_pod("dd if=/dev/zero of=/tmp/testfile bs=1M count=500")
        for i in range(1, 6):
            awscli_pod.exec_cmd_on_pod(
                craft_s3_command(
                    f"cp /tmp/testfile s3://{bucket_name}/testfile{i}", mcg_obj
                ),
                out_yaml_format=False,
                secrets=[
                    mcg_obj.access_key_id,
                    mcg_obj.access_key,
                    mcg_obj.s3_endpoint,
                ],
            )

        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return bucket_name

    test_file = os.path.join(
        measurement_dir, "measure_noobaa_exceed__bucket_quota.json"
    )
    measured_op = measure_operation(exceed_bucket_quota, test_file)

    bucket_info = mcg_obj.get_bucket_info(bucket.name)
    logger.info(f"Bucket {bucket.name} storage: {bucket_info['storage']}")
    logger.info(f"Bucket {bucket.name} data: {bucket_info['data']}")

    logger.info(f"Deleting data from bucket {bucket_name}")
    for i in range(1, 6):
        awscli_pod.exec_cmd_on_pod(
            craft_s3_command(f"rm s3://{bucket_name}/testfile{i}", mcg_obj),
            out_yaml_format=False,
            secrets=[mcg_obj.access_key_id, mcg_obj.access_key, mcg_obj.s3_endpoint],
        )
    return measured_op
Example #9
def measure_corrupt_pg(request, measurement_dir):
    """
    Create Ceph pool and corrupt Placement Group on one of OSDs, measures the
    time when it was corrupted and records alerts that were triggered during
    this event.

    Returns:
        dict: Contains information about `start` and `stop` time for
            corrupting Ceph Placement Group
    """
    osd_deployment = deployment.get_osd_deployments()[0]
    original_deployment_revision = osd_deployment.revision
    ct_pod = pod.get_ceph_tools_pod()
    pool_name = helpers.create_unique_resource_name("corrupted", "pool")
    ct_pod.exec_ceph_cmd(f"ceph osd pool create {pool_name} 1 1")
    ct_pod.exec_ceph_cmd(f"ceph osd pool application enable {pool_name} rbd")

    def teardown():
        """
        Make sure that corrupted pool is deleted and ceph health is ok
        """
        nonlocal pool_name
        nonlocal osd_deployment
        nonlocal original_deployment_revision
        logger.info(f"Deleting pool {pool_name}")
        ct_pod.exec_ceph_cmd(
            f"ceph osd pool delete {pool_name} {pool_name} "
            f"--yes-i-really-really-mean-it"
        )
        logger.info("Unsetting osd noout flag")
        ct_pod.exec_ceph_cmd("ceph osd unset noout")
        logger.info("Unsetting osd noscrub flag")
        ct_pod.exec_ceph_cmd("ceph osd unset noscrub")
        logger.info("Unsetting osd nodeep-scrub flag")
        ct_pod.exec_ceph_cmd("ceph osd unset nodeep-scrub")
        logger.info(f"Checking that pool {pool_name} is deleted")
        logger.info(
            f"Restoring deployment {osd_deployment.name} "
            f"to its original revision: {original_deployment_revision}"
        )
        if original_deployment_revision:
            osd_deployment.set_revision(original_deployment_revision)
            # unset original_deployment_revision because revision number is deleted when used
            original_deployment_revision = False
        # wait for ceph to return into HEALTH_OK state after osd deployment
        # is returned back to normal
        ceph_health_check(tries=20, delay=15)

    request.addfinalizer(teardown)
    logger.info("Setting osd noout flag")
    ct_pod.exec_ceph_cmd("ceph osd set noout")
    logger.info(f"Put object into {pool_name}")
    pool_object = "test_object"
    ct_pod.exec_ceph_cmd(f"rados -p {pool_name} put {pool_object} /etc/passwd")

    def corrupt_pg():
        """
        Corrupt PG on one OSD in Ceph pool for 14 minutes and measure it.
        There should be only CephPGRepairTakingTooLong Pending alert as
        it takes 2 hours for it to become Firing.
        This configuration of alert can be observed in ceph-mixins which
        is used in the project:
            https://github.com/ceph/ceph-mixins/blob/d22afe8c0da34490cb77e52a202eefcf4f62a869/config.libsonnet#L23
        There should be also CephClusterErrorState alert that takes 10
        minutes to start firing.

        Returns:
            str: Name of the OSD deployment with the corrupted PG
        """
        # run_time of operation
        run_time = 60 * 14
        nonlocal pool_name
        nonlocal pool_object
        nonlocal osd_deployment

        logger.info(f"Corrupting pool {pool_name} on {osd_deployment.name}")
        rados_utils.corrupt_pg(osd_deployment, pool_name, pool_object)
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return osd_deployment.name

    test_file = os.path.join(measurement_dir, "measure_corrupt_pg.json")

    if config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS:
        # It seems that it takes longer to propagate incidents to PagerDuty.
        # Adding 3 extra minutes
        measured_op = measure_operation(corrupt_pg, test_file, minimal_time=60 * 17)
    else:
        measured_op = measure_operation(corrupt_pg, test_file)

    teardown()

    return measured_op
Example #10
def measure_stop_ceph_osd(measurement_dir, threading_lock):
    """
    Downscales Ceph osd deployment, measures the time when it was
    downscaled and monitors alerts that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            Ceph osd pod
    """
    oc = ocp.OCP(
        kind=constants.DEPLOYMENT,
        namespace=config.ENV_DATA.get("cluster_namespace"),
        threading_lock=threading_lock,
    )
    osd_deployments = oc.get(selector=constants.OSD_APP_LABEL).get("items")
    osds = [deployment.get("metadata").get("name") for deployment in osd_deployments]

    # get osd deployments to stop, leave even number of osd
    osd_to_stop = osds[-1]
    logger.info(f"osd disks to stop: {osd_to_stop}")
    logger.info(f"osd disks left to run: {osds[:-1]}")

    def stop_osd():
        """
        Downscale the Ceph OSD deployment for 16 minutes. For the first minute
        the alert CephOSDDiskNotResponding should be in 'Pending' state.
        After 1 minute the alert turns into 'Firing' state.
        This configuration of the alert can be observed in ceph-mixins which
        is used in the project:
            https://github.com/ceph/ceph-mixins/blob/d22afe8c0da34490cb77e52a202eefcf4f62a869/config.libsonnet#L21
        There should be also CephClusterWarningState alert that takes 10
        minutes to start firing.

        Returns:
            str: Name of the downscaled deployment
        """
        # run_time of operation
        run_time = 60 * 16
        nonlocal oc
        nonlocal osd_to_stop
        logger.info(f"Downscaling deployment {osd_to_stop} to 0")
        oc.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_to_stop}")
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return osd_to_stop

    test_file = os.path.join(measurement_dir, "measure_stop_ceph_osd.json")
    if config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS:
        # It seems that it takes longer to propagate incidents to PagerDuty.
        # Adding 3 extra minutes
        measured_op = measure_operation(stop_osd, test_file, minimal_time=60 * 19)
    else:
        measured_op = measure_operation(stop_osd, test_file)
    logger.info(f"Upscaling deployment {osd_to_stop} back to 1")
    oc.exec_oc_cmd(f"scale --replicas=1 deployment/{osd_to_stop}")

    # wait for ceph to return into HEALTH_OK state after osd deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op
Example #11
def measure_stop_ceph_mon(measurement_dir, create_mon_quorum_loss, threading_lock):
    """
    Downscales Ceph Monitor deployment, measures the time when it was
    downscaled and monitors alerts that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            Ceph Monitor pod
    """
    oc = ocp.OCP(
        kind=constants.DEPLOYMENT,
        namespace=config.ENV_DATA["cluster_namespace"],
        threading_lock=threading_lock,
    )
    mon_deployments = oc.get(selector=constants.MON_APP_LABEL)["items"]
    mons = [deployment["metadata"]["name"] for deployment in mon_deployments]

    # get monitor deployments to stop,
    # if mon quorum to be lost split_index will be 1
    # else leave even number of monitors
    split_index = (
        1 if create_mon_quorum_loss else len(mons) // 2 if len(mons) > 3 else 2
    )
    mons_to_stop = mons[split_index:]
    logger.info(f"Monitors to stop: {mons_to_stop}")
    logger.info(f"Monitors left to run: {mons[:split_index]}")

    # run_time of operation
    run_time = 60 * 14

    def stop_mon():
        """
        Downscale Ceph Monitor deployments for 14 minutes. For the first 15
        minutes the alert CephMonQuorumAtRisk should be in 'Pending' state.
        After 15 minutes the alert turns into 'Firing' state.
        This configuration of monitoring can be observed in ceph-mixins which
        are used in the project:
            https://github.com/ceph/ceph-mixins/blob/d22afe8c0da34490cb77e52a202eefcf4f62a869/config.libsonnet#L16
        `Firing` state shouldn't actually happen because monitor should be
        automatically redeployed shortly after 10 minutes.

        Returns:
            list: Names of downscaled deployments
        """
        nonlocal oc
        nonlocal mons_to_stop
        for mon in mons_to_stop:
            logger.info(f"Downscaling deployment {mon} to 0")
            oc.exec_oc_cmd(f"scale --replicas=0 deployment/{mon}")
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return mons_to_stop

    test_file = os.path.join(
        measurement_dir, f"measure_stop_ceph_mon_{split_index}.json"
    )
    if config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS:
        # It seems that it takes longer to propagate incidents to PagerDuty.
        # Adding 6 extra minutes so that alert is actually triggered and
        # unscheduling worker nodes so that monitor is not replaced
        worker_node_names = [
            node.name for node in get_nodes(node_type=constants.WORKER_MACHINE)
        ]
        unschedule_nodes(worker_node_names)
        measured_op = measure_operation(stop_mon, test_file, minimal_time=60 * 20)
        schedule_nodes(worker_node_names)
    else:
        measured_op = measure_operation(stop_mon, test_file)

    # expected minimal downtime of a mon inflicted by this fixture
    measured_op["min_downtime"] = run_time - (60 * 2)

    # get new list of monitors to make sure that new monitors were deployed
    mon_deployments = oc.get(selector=constants.MON_APP_LABEL)["items"]
    mons = [deployment["metadata"]["name"] for deployment in mon_deployments]

    # check that downscaled monitors are removed as OCS should redeploy them
    # but only when we are running this for the first time
    check_old_mons_deleted = all(mon not in mons for mon in mons_to_stop)
    if measured_op["first_run"] and not check_old_mons_deleted:
        for mon in mons_to_stop:
            logger.info(f"Upscaling deployment {mon} back to 1")
            oc.exec_oc_cmd(f"scale --replicas=1 deployment/{mon}")
        if (
            not split_index == 1
            and config.ENV_DATA["platform"].lower()
            not in constants.MANAGED_SERVICE_PLATFORMS
        ):
            msg = f"Downscaled monitors {mons_to_stop} were not replaced"
            assert check_old_mons_deleted, msg

    # wait for ceph to return into HEALTH_OK state after mon deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op
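
# --- Illustrative sketch, not part of the fixture above ---
# Worked example of the split_index computation above for a typical 3-monitor
# cluster (the deployment names are example values):
mons = ["rook-ceph-mon-a", "rook-ceph-mon-b", "rook-ceph-mon-c"]
for create_mon_quorum_loss in (False, True):
    split_index = (
        1 if create_mon_quorum_loss else len(mons) // 2 if len(mons) > 3 else 2
    )
    print(create_mon_quorum_loss, mons[split_index:], mons[:split_index])
# False -> stop ['rook-ceph-mon-c'], keep ['rook-ceph-mon-a', 'rook-ceph-mon-b']
# True  -> stop ['rook-ceph-mon-b', 'rook-ceph-mon-c'], keep ['rook-ceph-mon-a']
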
Example #12
def measure_stop_ceph_mon(measurement_dir):
    """
    Downscales Ceph Monitor deployment, measures the time when it was
    downscaled and monitors alerts that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            Ceph Monitor pod
    """
    oc = ocp.OCP(kind=constants.DEPLOYMENT,
                 namespace=config.ENV_DATA['cluster_namespace'])
    mon_deployments = oc.get(selector=constants.MON_APP_LABEL)['items']
    mons = [deployment['metadata']['name'] for deployment in mon_deployments]

    # get monitor deployments to stop, leave even number of monitors
    split_index = len(mons) // 2 if len(mons) > 3 else 2
    mons_to_stop = mons[split_index:]
    logger.info(f"Monitors to stop: {mons_to_stop}")
    logger.info(f"Monitors left to run: {mons[:split_index]}")

    # run_time of operation
    run_time = 60 * 14

    def stop_mon():
        """
        Downscale Ceph Monitor deployments for 14 minutes. For the first 15
        minutes the alert CephMonQuorumAtRisk should be in 'Pending' state.
        After 15 minutes the alert turns into 'Firing' state.
        This configuration of monitoring can be observed in ceph-mixins which
        are used in the project:
            https://github.com/ceph/ceph-mixins/blob/d22afe8c0da34490cb77e52a202eefcf4f62a869/config.libsonnet#L16
        `Firing` state shouldn't actually happen because monitor should be
        automatically redeployed shortly after 10 minutes.

        Returns:
            list: Names of downscaled deployments
        """
        nonlocal oc
        nonlocal mons_to_stop
        for mon in mons_to_stop:
            logger.info(f"Downscaling deployment {mon} to 0")
            oc.exec_oc_cmd(f"scale --replicas=0 deployment/{mon}")
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return mons_to_stop

    test_file = os.path.join(measurement_dir, 'measure_stop_ceph_mon.json')
    measured_op = measure_operation(stop_mon, test_file)

    # expected minimal downtime of a mon inflicted by this fixture
    measured_op['min_downtime'] = run_time - (60 * 2)

    # get new list of monitors to make sure that new monitors were deployed
    mon_deployments = oc.get(selector=constants.MON_APP_LABEL)['items']
    mons = [deployment['metadata']['name'] for deployment in mon_deployments]

    # check that downscaled monitors are removed as OCS should redeploy them
    # but only when we are running this for the first time
    check_old_mons_deleted = all(mon not in mons for mon in mons_to_stop)
    if measured_op['first_run'] and not check_old_mons_deleted:
        for mon in mons_to_stop:
            logger.info(f"Upscaling deployment {mon} back to 1")
            oc.exec_oc_cmd(f"scale --replicas=1 deployment/{mon}")
        msg = f"Downscaled monitors {mons_to_stop} were not replaced"
        assert check_old_mons_deleted, msg

    return measured_op
Example #13
def measure_corrupt_pg(measurement_dir):
    """
    Create Ceph pool and corrupt Placement Group on one of OSDs, measures the
    time when it was corrupted and records alerts that were triggered during
    this event.

    Returns:
        dict: Contains information about `start` and `stop` time for
        corrupting Ceph Placement Group
    """
    oc = ocp.OCP(kind=constants.DEPLOYMENT,
                 namespace=config.ENV_DATA.get('cluster_namespace'))
    osd_deployments = oc.get(selector=constants.OSD_APP_LABEL).get('items')
    osd_deployment = osd_deployments[0].get('metadata').get('name')
    ct_pod = pod.get_ceph_tools_pod()
    pool_name = helpers.create_unique_resource_name('corrupted', 'pool')
    ct_pod.exec_ceph_cmd(f"ceph osd pool create {pool_name} 1 1")
    logger.info('Setting osd noout flag')
    ct_pod.exec_ceph_cmd('ceph osd set noout')
    logger.info(f"Put object into {pool_name}")
    pool_object = 'test_object'
    ct_pod.exec_ceph_cmd(f"rados -p {pool_name} put {pool_object} /etc/passwd")
    logger.info(f"Looking for Placement Group with {pool_object} object")
    pg = ct_pod.exec_ceph_cmd(
        f"ceph osd map {pool_name} {pool_object}")['pgid']
    logger.info(f"Found Placement Group: {pg}")

    dummy_deployment, dummy_pod = helpers.create_dummy_osd(osd_deployment)

    def corrupt_pg():
        """
        Corrupt PG on one OSD in Ceph pool for 12 minutes and measure it.
        There should be only CephPGRepairTakingTooLong Pending alert as
        it takes 2 hours for it to become Firing.
        This configuration of alert can be observed in ceph-mixins which
        is used in the project:
            https://github.com/ceph/ceph-mixins/blob/d22afe8c0da34490cb77e52a202eefcf4f62a869/config.libsonnet#L23
        There should be also CephClusterErrorState alert that takes 10
        minutes to start firing.

        Returns:
            str: Name of corrupted deployment
        """
        # run_time of operation
        run_time = 60 * 12
        nonlocal oc
        nonlocal pool_name
        nonlocal pool_object
        nonlocal dummy_pod
        nonlocal pg
        nonlocal osd_deployment
        nonlocal dummy_deployment

        logger.info(f"Corrupting {pg} PG on {osd_deployment}")
        dummy_pod.exec_sh_cmd_on_pod(
            f"ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-"
            f"{osd_deployment.split('-')[-1]} --pgid {pg} {pool_object} "
            f"set-bytes /etc/shadow --no-mon-config")
        logger.info('Unsetting osd noout flag')
        ct_pod.exec_ceph_cmd('ceph osd unset noout')
        ct_pod.exec_ceph_cmd(f"ceph pg deep-scrub {pg}")
        oc.exec_oc_cmd(f"scale --replicas=0 deployment/{dummy_deployment}")
        oc.exec_oc_cmd(f"scale --replicas=1 deployment/{osd_deployment}")
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return osd_deployment

    test_file = os.path.join(measurement_dir, 'measure_corrupt_pg.json')
    measured_op = measure_operation(corrupt_pg, test_file)
    logger.info(f"Deleting pool {pool_name}")
    ct_pod.exec_ceph_cmd(f"ceph osd pool delete {pool_name} {pool_name} "
                         f"--yes-i-really-really-mean-it")
    logger.info(f"Checking that pool {pool_name} is deleted")

    logger.info(f"Deleting deployment {dummy_deployment}")
    oc.delete(resource_name=dummy_deployment)

    return measured_op
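
# --- Illustrative sketch, not part of the fixture above ---
# Worked example of how the ceph-objectstore-tool data path is derived above,
# assuming a typical OSD deployment name (example value):
osd_deployment = "rook-ceph-osd-0"
osd_id = osd_deployment.split("-")[-1]
data_path = f"/var/lib/ceph/osd/ceph-{osd_id}"
print(data_path)   # /var/lib/ceph/osd/ceph-0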