Example #1
def write_empty_files_to_bucket(mcg_obj, awscli_pod_session, bucket_name,
                                test_directory_setup):
    """
    Write empty files to a bucket and verify that they are created.

    Args:
        mcg_obj (MCG) : An MCG object containing the MCG S3 connection credentials.
        awscli_pod_session : Fixture providing an AWS CLI pod for relaying commands.
        bucket_name (str) : Name of the bucket to which the files are written.
        test_directory_setup : Fixture to set up test directories.

    Raises:
        UnexpectedBehaviour : If the bucket object names do not match the expected file names.

    Returns:
        Set: A set of names of all bucket objects.

    """

    full_object_path = f"s3://{bucket_name}"
    data_dir = test_directory_setup.origin_dir

    # Use touch to create 1000 empty files in the local data directory
    command = f"for file_no in $(seq 1 1000); do touch {data_dir}/test$file_no; done"
    awscli_pod_session.exec_sh_cmd_on_pod(command=command, sh="sh")
    # Write all empty objects to the bucket
    sync_object_directory(awscli_pod_session, data_dir, full_object_path,
                          mcg_obj)

    log.info("Successfully created files.")

    obj_set = set(
        obj.key for obj in mcg_obj.s3_list_all_objects_in_bucket(bucket_name))
    test_set = {f"test{file_no}" for file_no in range(1, 1001)}

    if test_set != obj_set:
        raise ex.UnexpectedBehaviour("Bucket object names do not match the expected file names")
    log.info("Bucket object names match the expected file names")

    return obj_set
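
A minimal usage sketch for this helper from a pytest test; the test name and the `bucket_factory` fixture are illustrative assumptions, not part of the example above:

def test_write_empty_files(mcg_obj, awscli_pod_session, bucket_factory, test_directory_setup):
    # bucket_factory is assumed to return bucket objects exposing a .name attribute
    bucket_name = bucket_factory(amount=1)[0].name
    obj_names = write_empty_files_to_bucket(
        mcg_obj, awscli_pod_session, bucket_name, test_directory_setup
    )
    # the helper already validated the names; here we only check the count
    assert len(obj_names) == 1000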
Example #2
def get_job_pods(job_name, namespace, names_only=False):
    """
    Get list of pods of given job (via job-name pod selector).

    Args:
        job_name (str): name of the job whose pods to list
        namespace (str): name of the namespace where the job is running
        names_only (bool): if True, return only pod names instead of full item dicts

    Returns:
        list: list of pod names (if names_only is True) or full item dicts
    """
    ocp_pod = OCP(kind="Pod", namespace=namespace)
    oc_result = ocp_pod.get(selector=f"job-name={job_name}")
    if oc_result["kind"] != "List":
        error_msg = "oc get should return List item"
        log.error(error_msg)
        log.debug(oc_result)
        raise exceptions.UnexpectedBehaviour(error_msg)
    if names_only:
        result = [item["metadata"]["name"] for item in oc_result["items"]]
    else:
        result = oc_result["items"]
    return result
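
A short usage sketch; the job and namespace names below are hypothetical:

# list only the pod names of job "logreader" in namespace "my-namespace"
pod_names = get_job_pods("logreader", "my-namespace", names_only=True)
log.info("job pods: %s", ", ".join(pod_names))

# or fetch the full item dicts to inspect each pod's phase
for item in get_job_pods("logreader", "my-namespace"):
    log.info("%s is %s", item["metadata"]["name"], item["status"]["phase"])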
Example #3
    def log_reader_writer_parallel(self):
        """
        Write and read logfile stored on cephfs volume, from all worker nodes of a
        cluster via k8s Deployment.

        Raises:
            NotFoundError: When the given volume is not found in the given spec
            UnexpectedBehaviour: When an unexpected problem with starting the workload occurs

        """

        # get deployment dict for the reproducer logwriter workload
        with open(constants.LOGWRITER_CEPHFS_REPRODUCER,
                  "r") as deployment_file:
            self.deploy_dict = yaml.safe_load(deployment_file.read())
        # if we are running in disconnected environment, we need to mirror the
        # container image first, and then use the mirror instead of the original
        if config.DEPLOYMENT.get("disconnected"):
            update_container_with_mirrored_image(
                self.deploy_dict["spec"]["template"])
        # we need to match deployment replicas with number of worker nodes
        self.deploy_dict["spec"]["replicas"] = len(get_worker_nodes())
        # drop topology spread constraints related to zones
        topology.drop_topology_constraint(
            self.deploy_dict["spec"]["template"]["spec"], topology.ZONE_LABEL)
        # and link the deployment with the pvc
        try:
            link_spec_volume(
                self.deploy_dict["spec"]["template"]["spec"],
                "logwriter-cephfs-volume",
                self.pvc_dict["metadata"]["name"],
            )
        except (exceptions.NotFoundError, KeyError):
            logger.warning(
                "Failed to link the deployment with the pvc. We may need to check if the "
                "LOGWRITER_CEPHFS_REPRODUCER still matches the code of this test"
            )
            raise

        # prepare k8s yaml file for deployment
        self.workload_file = ObjectConfFile(
            "log_reader_writer_parallel",
            [self.pvc_dict, self.deploy_dict],
            self.project,
            self.tmp_path,
        )
        # deploy the workload, starting the log reader/writer pods
        logger.info(
            "starting log reader/writer workload via Deployment, one pod per worker"
        )
        self.workload_file.create()

        logger.info("waiting for all pods of the workload Deployment to run")
        self.ocp_pod = ocp.OCP(kind="Pod", namespace=self.project.namespace)
        try:
            self.ocp_pod.wait_for_resource(
                resource_count=self.deploy_dict["spec"]["replicas"],
                condition=constants.STATUS_RUNNING,
                error_condition=constants.STATUS_ERROR,
                timeout=300,
                sleep=30,
            )
        except Exception as ex:
            # this is not a problem with feature under test, but with infra,
            # cluster configuration or unrelated bug which must have happened
            # before this test case
            error_msg = "unexpected problem with start of the workload, cluster is either misconfigured or broken"
            logger.exception(error_msg)
            logger.debug(self.workload_file.describe())
            raise exceptions.UnexpectedBehaviour(error_msg) from ex
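
For context, `link_spec_volume` rewires the named volume of a pod template so it points at the given PVC. A simplified sketch of that behavior follows (an illustration under assumptions, reusing the `exceptions` module from the example above; this is not the actual ocs-ci implementation):

def link_spec_volume_sketch(pod_spec, volume_name, pvc_name):
    # find the volume entry with the given name and point it at the PVC
    for volume in pod_spec.get("volumes", []):
        if volume.get("name") == volume_name:
            volume["persistentVolumeClaim"] = {"claimName": pvc_name}
            return
    # mirrors the NotFoundError case handled in the try/except above
    raise exceptions.NotFoundError(f"volume {volume_name} not found in pod spec")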
Example #4
def test_log_reader_writer_parallel(project, tmp_path):
    """
    Write and read logfile stored on cephfs volume, from all worker nodes of a
    cluster via k8s Deployment, while fetching content of the stored data via
    oc rsync to check the data locally.

    Reproduces BZ 1989301. A test failure means a new high-priority blocker bug.
    """
    pvc_dict = get_pvc_dict()
    # we need to mount the volume on every worker node, so RWX/cephfs
    pvc_dict["metadata"]["name"] = "logwriter-cephfs-many"
    pvc_dict["spec"]["accessModes"] = [constants.ACCESS_MODE_RWX]
    if (
        config.ENV_DATA["platform"].lower() not in constants.MANAGED_SERVICE_PLATFORMS
    ) and storagecluster_independent_check():
        sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
    else:
        sc_name = constants.CEPHFILESYSTEM_SC
    pvc_dict["spec"]["storageClassName"] = sc_name
    # there is no need for a lot of storage capacity for this test
    pvc_dict["spec"]["resources"]["requests"]["storage"] = "1Gi"

    # get deployment dict for the reproducer logwriter workload
    with open(constants.LOGWRITER_CEPHFS_REPRODUCER, "r") as deployment_file:
        deploy_dict = yaml.safe_load(deployment_file.read())
    # if we are running in disconnected environment, we need to mirror the
    # container image first, and then use the mirror instead of the original
    if config.DEPLOYMENT.get("disconnected"):
        update_container_with_mirrored_image(deploy_dict["spec"]["template"])
    # we need to match deployment replicas with number of worker nodes
    deploy_dict["spec"]["replicas"] = len(get_worker_nodes())
    # drop topology spread constraints related to zones
    topology.drop_topology_constraint(
        deploy_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
    )
    # and link the deployment with the pvc
    try:
        link_spec_volume(
            deploy_dict["spec"]["template"]["spec"],
            "logwriter-cephfs-volume",
            pvc_dict["metadata"]["name"],
        )
    except Exception as ex:
        error_msg = "LOGWRITER_CEPHFS_REPRODUCER no longer matches code of this test"
        raise Exception(error_msg) from ex

    # prepare k8s yaml file for deployment
    workload_file = ObjectConfFile(
        "log_reader_writer_parallel", [pvc_dict, deploy_dict], project, tmp_path
    )
    # deploy the workload, starting the log reader/writer pods
    logger.info(
        "starting log reader/writer workload via Deployment, one pod per worker"
    )
    workload_file.create()

    logger.info("waiting for all pods of the workload Deployment to run")
    ocp_pod = ocp.OCP(kind="Pod", namespace=project.namespace)
    try:
        ocp_pod.wait_for_resource(
            resource_count=deploy_dict["spec"]["replicas"],
            condition=constants.STATUS_RUNNING,
            error_condition=constants.STATUS_ERROR,
            timeout=300,
            sleep=30,
        )
    except Exception as ex:
        # this is not a problem with feature under test, but with infra,
        # cluster configuration or unrelated bug which must have happened
        # before this test case
        error_msg = "unexpected problem with start of the workload, cluster is either misconfigured or broken"
        logger.exception(error_msg)
        logger.debug(workload_file.describe())
        raise exceptions.UnexpectedBehaviour(error_msg) from ex

    # while the workload is running, we will try to fetch and validate data
    # from the cephfs volume of the workload 120 times (this number of retries
    # is a bit larger than the usual number required to reproduce the bug from
    # BZ 1989301, but we need to be sure here)
    number_of_fetches = 120
    # if a given fetch fails, we will ignore the failure unless the number of
    # failures is too high (this has no direct impact on the feature under test;
    # we should be able to detect the bug even with 10% of rsync failures,
    # since data corruption doesn't simply go away ...)
    number_of_failures = 0
    allowed_failures = 12
    is_local_data_ok = True
    local_dir = tmp_path / "logwriter"
    local_dir.mkdir()
    workload_pods = ocp_pod.get()
    workload_pod_name = workload_pods["items"][0]["metadata"]["name"]
    logger.info(
        "while the workload is running, we will fetch and check data from the cephfs volume %d times",
        number_of_fetches,
    )
    for _ in range(number_of_fetches):
        # fetch data from cephfs volume into the local dir
        oc_cmd = [
            "oc",
            "rsync",
            "--loglevel=4",
            "-n",
            project.namespace,
            f"pod/{workload_pod_name}:/mnt/target",
            local_dir,
        ]
        try:
            run_cmd(cmd=oc_cmd, timeout=300)
        except Exception as ex:
            number_of_failures += 1
            # in case this fails, we are going to fetch extra evidence, since
            # such a failure is most likely related to OCP or the infrastructure
            error_msg = "oc rsync failed: something is wrong with the cluster"
            logger.exception(error_msg)
            logger.debug(workload_file.describe())
            oc_rpm_debug = [
                "oc",
                "rsh",
                "-n",
                project.namespace,
                f"pod/{workload_pod_name}",
                "bash",
                "-c",
                ";".join(
                    [
                        "rpm -qa",
                        "rpm -qaV",
                        "type -a tar",
                        "tar --version",
                        "type -a rsync",
                        "rsync --version",
                    ]
                ),
            ]
            try:
                run_cmd(cmd=oc_rpm_debug, timeout=600)
            except Exception:
                # if fetch of additional evidence fails, log and ignore the
                # exception (so that we can retry if needed)
                logger.exception("failed to fetch additional evidence")
            # in case the rsync run failed because of a container restart,
            # we assume the pod name hasn't changed, and just wait for the
            # container to be running again - unless the number of rsync
            # failures is too high
            if number_of_failures > allowed_failures:
                logger.error("number of ignored rsync failures is too high")
            else:
                ocp_pod.wait_for_resource(
                    resource_count=deploy_dict["spec"]["replicas"],
                    condition=constants.STATUS_RUNNING,
                    error_condition=constants.STATUS_ERROR,
                    timeout=300,
                    sleep=30,
                )
                continue
            logger.debug(
                "before this failure, we ignored %d previous failures",
                number_of_failures,
            )
            raise exceptions.UnexpectedBehaviour(error_msg) from ex
        # look for null bytes in the just fetched local files in target dir,
        # and if these binary bytes are found, the test failed (the bug
        # was reproduced)
        target_dir = os.path.join(local_dir, "target")
        for file_name in os.listdir(target_dir):
            with open(os.path.join(target_dir, file_name), "r") as fo:
                data = fo.read()
                if "\0" in data:
                    is_local_data_ok = False
                    logger.error(
                        "file %s is corrupted: null byte found in a text file",
                        file_name,
                    )
        assert is_local_data_ok, "data corruption detected"
        time.sleep(2)

    logger.debug("number of ignored rsync failures: %d", number_of_failures)

    # if no obvious problem was detected, run the logreader job to validate
    # checksums in the log files (so that we are 100% sure that nothing went
    # wrong with the IO or the data)
    with open(constants.LOGWRITER_CEPHFS_READER, "r") as job_file:
        job_dict = yaml.safe_load(job_file.read())
    # mirroring for disconnected environment, if necessary
    if config.DEPLOYMENT.get("disconnected"):
        update_container_with_mirrored_image(job_dict["spec"]["template"])
    # drop topology spread constraints related to zones
    topology.drop_topology_constraint(
        job_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
    )
    # we need to match number of jobs with the number used in the workload
    job_dict["spec"]["completions"] = deploy_dict["spec"]["replicas"]
    job_dict["spec"]["parallelism"] = deploy_dict["spec"]["replicas"]
    # and refer to the correct pvc name
    try:
        link_spec_volume(
            job_dict["spec"]["template"]["spec"],
            "logwriter-cephfs-volume",
            pvc_dict["metadata"]["name"],
        )
    except Exception as ex:
        error_msg = "LOGWRITER_CEPHFS_READER no longer matches code of this test"
        raise Exception(error_msg) from ex
    # prepare k8s yaml file for the job
    job_file = ObjectConfFile("log_reader", [job_dict], project, tmp_path)
    # deploy the job, starting the log reader pods
    logger.info(
        "starting log reader data validation job to fully check the log data",
    )
    job_file.create()
    # wait for the logreader job to complete (this should be rather quick)
    try:
        job.wait_for_job_completion(
            job_name=job_dict["metadata"]["name"],
            namespace=project.namespace,
            timeout=300,
            sleep_time=30,
        )
    except exceptions.TimeoutExpiredError:
        error_msg = (
            "verification failed to complete in time: data loss or broken cluster?"
        )
        logger.exception(error_msg)
    # and then check that the job completed with success
    logger.info("checking the result of data validation job")
    logger.debug(job_file.describe())
    ocp_job = ocp.OCP(
        kind="Job",
        namespace=project.namespace,
        resource_name=job_dict["metadata"]["name"],
    )
    job_status = ocp_job.get()["status"]
    logger.info("last status of data verification job: %s", job_status)
    if (
        "failed" in job_status
        or job_status.get("succeeded") != deploy_dict["spec"]["replicas"]
    ):
        error_msg = "possible data corruption: data verification job failed!"
        logger.error(error_msg)
        job.log_output_of_job_pods(
            job_name=job_dict["metadata"]["name"], namespace=project.namespace
        )
        raise Exception(error_msg)
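
The corruption check in the fetch loop above reduces to scanning each fetched text file for null bytes. A standalone sketch of that detection step (the directory path in the usage comment is hypothetical):

import os

def find_corrupted_files(target_dir):
    """Return names of files in target_dir that contain a null byte."""
    corrupted = []
    for file_name in os.listdir(target_dir):
        with open(os.path.join(target_dir, file_name), "r") as fo:
            if "\0" in fo.read():
                corrupted.append(file_name)
    return corrupted

# usage:
# assert not find_corrupted_files("/tmp/logwriter/target"), "data corruption detected"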
Example #5
    def factory(
        num_of_pvcs=100,
        pvc_size=2,
        bulk=False,
        project=None,
        measure=True,
        delete=True,
        file_name=None,
        fio_percentage=25,
        verify_fio=False,
        expand=False,
    ):
        """
        Args:
            num_of_pvcs (int) : Number of PVCs / PODs we want to create.
            pvc_size (int) : Size of each PVC in GB.
            bulk (bool) : True for bulk operations, False otherwise.
            project (obj) : Project obj inside which the PODs/PVCs are created.
            measure (bool) : True if we want to measure the PVC creation/deletion time and POD to PVC attach time,
                                False otherwise.
            delete (bool) : True if we want to delete PVCs and PODs, False otherwise
            file_name (str) : Name of the file on which FIO is performed.
            fio_percentage (float) : Percentage of PVC space we want to be utilized for FIO.
            verify_fio (bool) : True if we want to verify FIO, False otherwise.
            expand (bool) : True if md5sums should be collected (via dd on block
                                devices) for verification after PVC expansion,
                                False otherwise.

        Returns:
            tuple: (pvc_objs, pod_objs) if delete is False, None otherwise.

        """

        if not project:
            project = project_factory("longevity")
        pvc_objs = list()
        executor = ThreadPoolExecutor(max_workers=1)
        start_time = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
        for interface in (constants.CEPHFILESYSTEM, constants.CEPHBLOCKPOOL):
            if interface == constants.CEPHFILESYSTEM:
                access_modes = [
                    constants.ACCESS_MODE_RWO, constants.ACCESS_MODE_RWX
                ]
                num_of_pvc = num_of_pvcs // 2
            else:
                access_modes = [
                    constants.ACCESS_MODE_RWO,
                    constants.ACCESS_MODE_RWO + "-" +
                    constants.VOLUME_MODE_BLOCK,
                    constants.ACCESS_MODE_RWX + "-" +
                    constants.VOLUME_MODE_BLOCK,
                ]
                num_of_pvc = num_of_pvcs - num_of_pvcs // 2

            # Create PVCs
            if num_of_pvc > 0:
                pvc_objs_tmp = multi_pvc_factory(
                    interface=interface,
                    size=pvc_size,
                    project=project,
                    access_modes=access_modes,
                    status=constants.STATUS_BOUND,
                    num_of_pvc=num_of_pvc,
                    wait_each=not bulk,
                )
                log.info("PVC creation was successful.")
                pvc_objs.extend(pvc_objs_tmp)

                if measure:
                    # Measure PVC Creation Time
                    measure_pvc_creation_time(interface, pvc_objs_tmp,
                                              start_time)

            else:
                log.info(
                    f"Num of PVCs of interface - {interface} = {num_of_pvc}. So no PVCs created."
                )

        # PVC and PV Teardown
        for pvc_obj in pvc_objs:
            teardown_factory(pvc_obj)
            teardown_factory(pvc_obj.backed_pv_obj)

        # Create PODs
        pod_objs = list()
        for pvc_obj in pvc_objs:
            if pvc_obj.get_pvc_vol_mode == constants.VOLUME_MODE_BLOCK:
                if not bulk:
                    pod_objs.append(
                        pod_factory(
                            pvc=pvc_obj,
                            raw_block_pv=True,
                            status=constants.STATUS_RUNNING,
                            pod_dict_path=constants.PERF_BLOCK_POD_YAML,
                        ))
                else:
                    pod_objs.append(
                        pod_factory(
                            pvc=pvc_obj,
                            raw_block_pv=True,
                            pod_dict_path=constants.PERF_BLOCK_POD_YAML,
                        ))
            else:
                if not bulk:
                    pod_objs.append(
                        pod_factory(
                            pvc=pvc_obj,
                            status=constants.STATUS_RUNNING,
                            pod_dict_path=constants.PERF_POD_YAML,
                        ))
                else:
                    pod_objs.append(
                        pod_factory(pvc=pvc_obj,
                                    pod_dict_path=constants.PERF_POD_YAML))

            log.info(f"POD {pod_objs[-1].name} creation was successful.")
        log.info("All PODs are created.")

        if bulk:
            # submit all waits first, then block on each future so the success
            # message is logged only after the pod has actually reached Running
            futures = []
            for pod_obj in pod_objs:
                futures.append(
                    executor.submit(
                        helpers.wait_for_resource_state,
                        pod_obj,
                        constants.STATUS_RUNNING,
                        timeout=300,
                    ))
            for pod_obj, future in zip(pod_objs, futures):
                future.result()
                log.info(f"POD {pod_obj.name} reached Running State.")

            log.info("All PODs reached Running State.")

        if measure:
            # Measure POD to PVC attach time
            measure_pod_to_pvc_attach_time(pod_objs)

        # POD Teardown
        for pod_obj in pod_objs:
            teardown_factory(pod_obj)

        # Run FIO on PODs
        # fio_percentage % of pvc_size (GB), converted to an MB value for fio
        fio_size = int((fio_percentage / 100) * pvc_size * 1000)
        for pod_obj in pod_objs:
            storage_type = ("block" if pod_obj.pvc.get_pvc_vol_mode
                            == constants.VOLUME_MODE_BLOCK else "fs")
            pod_obj.wl_setup_done = True
            pod_obj.wl_obj = workload.WorkLoad(
                "test_workload_fio",
                pod_obj.get_storage_path(storage_type),
                "fio",
                storage_type,
                pod_obj,
                1,
            )
            if not file_name:
                pod_obj.run_io(storage_type, f"{fio_size}M")
            else:
                pod_obj.run_io(
                    storage_type=storage_type,
                    size=f"{fio_size}M",
                    runtime=20,
                    fio_filename=file_name,
                    end_fsync=1,
                )

        if verify_fio:
            log.info(
                "Waiting for IO to complete on all pods to utilise "
                f"{fio_percentage}% of PVC space"
            )

            for pod_obj in pod_objs:
                # Wait for IO to finish
                pod_obj.get_fio_results(3600)
                log.info(f"IO finished on pod {pod_obj.name}")
                is_block = pod_obj.pvc.get_pvc_vol_mode == constants.VOLUME_MODE_BLOCK
                file_name_pod = (file_name
                                 if not is_block else pod_obj.get_storage_path(
                                     storage_type="block"))
                # Verify presence of the file
                file_path = (file_name_pod if is_block else pod.get_file_path(
                    pod_obj, file_name_pod))
                log.info(f"Actual file path on the pod {file_path}")
                assert pod.check_file_existence(
                    pod_obj, file_path), f"File {file_name_pod} does not exist"
                log.info(f"File {file_name_pod} exists in {pod_obj.name}")

                if expand and is_block:
                if expand and is_block:
                    # Read the data back from the block device using dd with
                    # direct IO, pipe it to md5sum, and record the checksum.
                    pod_obj.pvc.md5sum = pod_obj.exec_sh_cmd_on_pod(
                        command=(f"dd iflag=direct if={file_path} bs=10M "
                                 f"count={fio_size // 10} | md5sum"))
                    log.info(
                        f"md5sum of {file_name_pod}: {pod_obj.pvc.md5sum}")
                else:
                    # Calculate md5sum of the file
                    pod_obj.pvc.md5sum = pod.cal_md5sum(pod_obj, file_name_pod)

        log.info("POD FIO was successful.")

        if delete:
            # Delete PODs
            pod_delete = executor.submit(delete_pods, pod_objs, wait=not bulk)
            pod_delete.result()

            log.info("Verified: Pods are deleted.")

            # Delete PVCs
            pvc_delete = executor.submit(delete_pvcs,
                                         pvc_objs,
                                         concurrent=bulk)
            res = pvc_delete.result()
            if not res:
                raise ex.UnexpectedBehaviour("Deletion of PVCs failed")
            log.info("PVC deletion was successful.")

            # Validate PV Deletion
            for pvc_obj in pvc_objs:
                helpers.validate_pv_delete(pvc_obj.backed_pv)
            log.info("PV deletion was successful.")

            if measure:
                # Measure PVC Deletion Time
                for interface in (constants.CEPHFILESYSTEM,
                                  constants.CEPHBLOCKPOOL):
                    if interface == constants.CEPHFILESYSTEM:
                        measure_pvc_deletion_time(
                            interface,
                            pvc_objs[:num_of_pvcs // 2],
                        )
                    else:
                        measure_pvc_deletion_time(
                            interface,
                            pvc_objs[num_of_pvcs // 2:],
                        )

            log.info(f"Successfully deleted {num_of_pvcs} PVCs")
        else:
            return pvc_objs, pod_objs
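
A hedged usage sketch of this factory from a test body; the fixture name `multi_pvc_pod_lifecycle_factory` is an illustrative assumption:

def test_pvc_pod_lifecycle(multi_pvc_pod_lifecycle_factory):
    # create 50 PVCs of 2 GB each, run FIO over 25% of each volume,
    # and keep the resources (delete=False) so the test can inspect them
    pvc_objs, pod_objs = multi_pvc_pod_lifecycle_factory(
        num_of_pvcs=50, pvc_size=2, delete=False
    )
    assert len(pvc_objs) == 50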