Example #1
def wait_for_replication_resources_deletion(namespace, timeout, check_state=True):
    """
    Wait for replication resources to be deleted

    Args:
        namespace (str): the namespace of the resources
        timeout (int): time in seconds to wait for resources to reach expected
            state or deleted
        check_state (bool): True for checking resources state before deletion, False otherwise

    Raises:
        TimeoutExpiredError: In case the replication resources are not deleted

    """
    if check_state:
        logger.info("Waiting for all VRs to reach secondary state")
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_vr_state,
            state="secondary",
            namespace=namespace,
        )
        if not sample.wait_for_func_status(result=True):
            error_msg = "One or more VR haven't reached expected state secondary within the time limit."
            logger.error(error_msg)
            raise TimeoutExpiredError(error_msg)

        logger.info("Waiting for VRG to reach secondary state")
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_vrg_state,
            state="secondary",
            namespace=namespace,
        )
        if not sample.wait_for_func_status(result=True):
            error_msg = (
                "VRG hasn't reached expected state secondary within the time limit."
            )
            logger.error(error_msg)
            raise TimeoutExpiredError(error_msg)

    logger.info("Waiting for VRG to be deleted")
    sample = TimeoutSampler(
        timeout=timeout, sleep=5, func=check_vrg_existence, namespace=namespace
    )
    if not sample.wait_for_func_status(result=False):
        error_msg = "VRG resource not deleted"
        logger.error(error_msg)
        raise TimeoutExpiredError(error_msg)

    logger.info("Waiting for all VRs to be deleted")
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=5,
        func=get_vr_count,
        namespace=namespace,
    )
    sample.wait_for_func_value(0)
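
A minimal usage sketch for the helper above; the namespace and timeout values are illustrative assumptions, not taken from the source:

# Hypothetical call site: wait up to 5 minutes for the replication resources
# of a DR-protected workload namespace to be removed, skipping the
# secondary-state check (namespace and timeout are placeholder values).
wait_for_replication_resources_deletion(
    namespace="busybox-workloads-1",
    timeout=300,
    check_state=False,
)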
Example #2
def wait_for_replication_resources_creation(vr_count, namespace, timeout):
    """
    Wait for replication resources to be created

    Args:
        vr_count (int): Expected number of VR resources
        namespace (str): the namespace of the VR resources
        timeout (int): time in seconds to wait for VR resources to be created
            or reach expected state

    Raises:
        TimeoutExpiredError: In case the replication resources are not created

    """
    logger.info("Waiting for VRG to be created")
    sample = TimeoutSampler(
        timeout=timeout, sleep=5, func=check_vrg_existence, namespace=namespace
    )
    if not sample.wait_for_func_status(result=True):
        error_msg = "VRG resource is not created"
        logger.error(error_msg)
        raise TimeoutExpiredError(error_msg)

    logger.info(f"Waiting for {vr_count} VRs to be created")
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=5,
        func=get_vr_count,
        namespace=namespace,
    )
    sample.wait_for_func_value(vr_count)

    logger.info(f"Waiting for {vr_count} VRs to reach primary state")
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=5,
        func=check_vr_state,
        state="primary",
        namespace=namespace,
    )
    if not sample.wait_for_func_status(result=True):
        error_msg = "One or more VR haven't reached expected state primary within the time limit."
        logger.error(error_msg)
        raise TimeoutExpiredError(error_msg)

    logger.info("Waiting for VRG to reach primary state")
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=5,
        func=check_vrg_state,
        state="primary",
        namespace=namespace,
    )
    if not sample.wait_for_func_status(result=True):
        error_msg = "VRG hasn't reached expected state primary within the time limit."
        logger.error(error_msg)
        raise TimeoutExpiredError(error_msg)
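
As a counterpart to the deletion helper in Example #1, a hedged sketch of calling the creation helper; the argument values are placeholders, not from the source:

# Hypothetical call site: expect one VR per RBD PVC in the workload namespace
# (vr_count, namespace and timeout are illustrative values).
wait_for_replication_resources_creation(
    vr_count=3,
    namespace="busybox-workloads-1",
    timeout=900,
)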
Example #3
    def verify_operator_succeeded(self,
                                  operator=OCS_OPERATOR,
                                  timeout_install=300,
                                  sleep=20):
        """
        Verify Operator Installation

        Args:
            operator (str): Name of the operator to verify
            timeout_install (int): Time in seconds to wait
            sleep (int): Sampling time in seconds

        """
        self.search_operator_installed_operators_page(operator=operator)
        time.sleep(5)
        sample = TimeoutSampler(
            timeout=timeout_install,
            sleep=sleep,
            func=self.check_element_text,
            expected_text="Succeeded",
        )
        if not sample.wait_for_func_status(result=True):
            err_msg = (
                f"{operator} Installation status is not Succeeded after {timeout_install} seconds"
            )
            logger.error(err_msg)
            self.take_screenshot()
            raise TimeoutExpiredError(err_msg)
        self.take_screenshot()
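
A hedged usage sketch for the verification method above; `validation_ui` stands in for an instance of whatever UI page-navigation class this method belongs to, and the longer timeout is an illustrative choice:

# Hypothetical caller: allow up to 10 minutes for the operator to report
# Succeeded on a slow cluster (validation_ui is a placeholder instance name).
validation_ui.verify_operator_succeeded(
    operator=OCS_OPERATOR,
    timeout_install=600,
    sleep=20,
)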
Example #4
    def check_new_pid(self, node_name=None):
        """
        Check if the pid of the daemon has changed from the initially selected pid (daemon_pid attribute)

        Args:
            node_name (str): Name of node in which the resource daemon is running

        """
        node_name = node_name or self.resource_obj[0].pod_data.get("spec").get(
            "nodeName")
        awk_print = "'{print $1}'"
        pid_cmd = (
            f"oc {self.kubeconfig_parameter()}debug node/{node_name} -- chroot /host ps ax | grep"
            f" ' ceph-{self.resource} --' | grep -v grep | awk {awk_print}")
        try:
            for pid_proc in TimeoutSampler(60, 2, run_async, command=pid_cmd):
                ret, pid, err = pid_proc.async_communicate()

                # Consider scenario where more than one self.resource pod
                # is running on one node. eg:More than one osd on same node
                pids = pid.strip().split()
                pids = [pid.strip() for pid in pids]
                if len(pids) != len(self.pids):
                    continue
                new_pid = [pid for pid in pids if pid not in self.pids]
                assert len(new_pid) == 1, "Found more than one new pid."
                new_pid = new_pid[0]
                if new_pid.isdigit() and (new_pid != self.daemon_pid):
                    log.info(f"New pid of ceph-{self.resource} is {new_pid}")
                    break
        except TimeoutExpiredError:
            raise TimeoutExpiredError(
                f"Timed out waiting for the new pid of ceph-{self.resource} on {node_name}")
Example #5
    def destroy(self, log_level="DEBUG"):
        """
        Destroy OCP cluster specific

        Args:
            log_level (str): log level for the openshift-installer (default: DEBUG)

        """
        cluster_details = ocm.get_cluster_details(self.cluster_name)
        cluster_id = cluster_details.get("id")
        delete_status = rosa.destroy_appliance_mode_cluster(self.cluster_name)
        if not delete_status:
            ocm.destroy_cluster(self.cluster_name)
        logger.info("Waiting for ROSA cluster to be uninstalled")
        sample = TimeoutSampler(
            timeout=7200,
            sleep=30,
            func=self.cluster_present,
            cluster_name=self.cluster_name,
        )
        if not sample.wait_for_func_status(result=False):
            err_msg = f"Failed to delete {self.cluster_name}"
            logger.error(err_msg)
            raise TimeoutExpiredError(err_msg)
        rosa.delete_operator_roles(cluster_id)
        rosa.delete_oidc_provider(cluster_id)
Example #6
    def kill_daemon(self, node_name=None, check_new_pid=True, kill_signal="9"):
        """
        Kill self.resource daemon

        Args:
            node_name (str): Name of node in which the resource daemon has
                to be killed
            check_new_pid (bool): True to check for new pid after killing the
                daemon. False to skip the check.
            kill_signal (str): kill signal type
        """
        node_name = node_name or self.resource_obj[0].pod_data.get("spec").get(
            "nodeName")
        if not self.daemon_pid:
            self.select_daemon(node_name=node_name)

        # Command to kill the daemon
        kill_cmd = (f"oc debug node/{node_name} -- chroot /host  "
                    f"kill -{kill_signal} {self.daemon_pid}")
        daemon_kill = run_cmd(kill_cmd)

        # 'daemon_kill' will be an empty string if the command succeeded
        assert isinstance(daemon_kill, str) and (not daemon_kill), (
            f"Failed to kill ceph-{self.resource} daemon in {node_name}. "
            f"Daemon kill command output - {daemon_kill}")
        log.info(f"Killed ceph-{self.resource} daemon on node {node_name}")

        if check_new_pid:
            awk_print = "'{print $1}'"
            pid_cmd = (
                f"oc debug node/{node_name} -- chroot /host ps ax | grep"
                f" ' ceph-{self.resource} --' | grep -v grep | awk {awk_print}"
            )
            try:
                for pid_proc in TimeoutSampler(60,
                                               2,
                                               run_async,
                                               command=pid_cmd):
                    ret, pid, err = pid_proc.async_communicate()

                    # Consider scenario where more than one self.resource pod
                    # is running on one node. eg:More than one osd on same node
                    pids = pid.strip().split()
                    pids = [pid.strip() for pid in pids]
                    if len(pids) != len(self.pids):
                        continue
                    new_pid = [pid for pid in pids if pid not in self.pids]
                    assert len(new_pid) == 1, "Found more than one new pid."
                    new_pid = new_pid[0]
                    if new_pid.isdigit() and (new_pid != self.daemon_pid):
                        log.info(
                            f"New pid of ceph-{self.resource} is {new_pid}")
                        break
            except TimeoutExpiredError:
                raise TimeoutExpiredError(
                    f"Timed out waiting for the new pid of ceph-{self.resource} on {node_name}")
Example #7
    def _create_nss(method, nss_dict):
        """
        Tracks creation and cleanup of all the namespace stores that were created in the current scope

        Args:
            method (str): String for selecting the method of namespace store creation (CLI/OC)
            nss_dict (dict): Dictionary containing the storage provider as key and a list of
                tuples as value.
                Namespace store dictionary example - 'CloudName': [(amount, region), (amount, region)]
                i.e. - 'aws': [(3, 'us-west-1'), (2, 'eu-west-2')]

        Returns:
            list: A list of the NamespaceStore objects created by the factory in the current scope

        """
        current_call_created_nss = []
        for platform, nss_lst in nss_dict.items():
            for nss_tup in nss_lst:
                # Create the actual namespace resource
                nss_name = create_unique_resource_name(constants.MCG_NSS, platform)

                target_bucket_name = cmdMap[method.lower()](
                    nss_name, nss_tup[1], cld_mgr, cloud_uls_factory, platform
                )

                # TODO: Check platform exists in endpointMap

                sample = TimeoutSampler(
                    timeout=60,
                    sleep=5,
                    func=mcg_obj.check_ns_resource_validity,
                    ns_resource_name=nss_name,
                    target_bucket_name=target_bucket_name,
                    endpoint=endpointMap[platform],
                )
                if not sample.wait_for_func_status(result=True):
                    err_msg = f"{nss_name} failed its verification check"
                    log.error(err_msg)
                    raise TimeoutExpiredError(err_msg)

                nss_obj = NamespaceStore(
                    name=nss_name,
                    method=method.lower(),
                    mcg_obj=mcg_obj,
                    uls_name=target_bucket_name,
                )

                nss_obj.verify_health()

                created_nss.append(nss_obj)
                current_call_created_nss.append(nss_obj)

        return current_call_created_nss
Example #8
def wait_for_mirroring_status_ok(replaying_images=None, timeout=300):
    """
    Wait for mirroring status to reach health OK and the expected number of replaying
    images on each of the ODF clusters

    Args:
        replaying_images (int): Expected number of images in replaying state
        timeout (int): time in seconds to wait for the mirroring status to reach OK

    Returns:
        bool: True if status contains expected health and states values

    Raises:
        TimeoutExpiredError: In case of unexpected mirroring status

    """
    restore_index = config.cur_index
    if not replaying_images:
        replaying_images = 0
        for cluster in get_non_acm_cluster_config():
            config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
            replaying_images += len(
                get_all_pvcs_in_storageclass(constants.CEPHBLOCKPOOL_SC)
            )
        replaying_images -= 2  # Exclude the db-noobaa-db-pg-0 PVC from each cluster

    for cluster in get_non_acm_cluster_config():
        config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
        logger.info(
            f"Validating mirroring status on cluster {cluster.ENV_DATA['cluster_name']}"
        )
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_mirroring_status_ok,
            replaying_images=replaying_images,
        )
        if not sample.wait_for_func_status(result=True):
            error_msg = (
                "The mirroring status does not have expected values within the time"
                f" limit on cluster {cluster.ENV_DATA['cluster_name']}"
            )
            logger.error(error_msg)
            raise TimeoutExpiredError(error_msg)

    config.switch_ctx(restore_index)
    return True
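
A minimal usage sketch for the helper above; letting it derive the replaying-image count itself and the 10-minute timeout are illustrative assumptions:

# Hypothetical call site: count replaying images from the RBD PVCs on the
# managed clusters and wait up to 10 minutes for mirroring health OK.
wait_for_mirroring_status_ok(timeout=600)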
Example #9
    def kill_daemon(self, node_name=None, check_new_pid=True):
        """
        Kill self.resource daemon

        Args:
            node_name (str): Name of node in which the resource daemon has
                to be killed
            check_new_pid (bool): True to check for new pid after killing the
                daemon. False to skip the check.
        """
        node_name = node_name or self.resource_obj[0].pod_data.get('spec').get(
            'nodeName')
        if not self.daemon_pid:
            self.select_daemon(node_name=node_name)

        # Command to kill the daemon
        kill_cmd = (f'oc debug node/{node_name} -- chroot /host  '
                    f'kill -9 {self.daemon_pid}')
        daemon_kill = run_cmd(kill_cmd)

        # 'daemon_kill' will be an empty string if the command succeeded
        assert isinstance(daemon_kill, str) and (not daemon_kill), (
            f"Failed to kill ceph-{self.resource} daemon in {node_name}. "
            f"Daemon kill command output - {daemon_kill}")
        log.info(f"Killed ceph-{self.resource} daemon on node {node_name}")

        if check_new_pid:
            awk_print = "'{print $1}'"
            pid_cmd = (
                f"oc debug node/{node_name} -- chroot /host ps ax | grep"
                f" ' ceph-{self.resource} --' | grep -v grep | awk {awk_print}"
            )
            try:
                for pid_proc in TimeoutSampler(60,
                                               2,
                                               run_async,
                                               command=pid_cmd):
                    ret, new_pid, err = pid_proc.async_communicate()
                    new_pid = new_pid.strip()
                    if new_pid and (new_pid != self.daemon_pid):
                        log.info(
                            f"New pid of ceph-{self.resource} is {new_pid}")
                        break
            except TimeoutExpiredError:
                raise TimeoutExpiredError(
                    f"Timed out waiting for the new pid of ceph-{self.resource} on {node_name}")
Example #10
        def _wait_for_pv_backingstore_resource_deleted(namespace=None):
            """
            Wait for the PV backing store resources to be deleted at the end of test teardown

            Args:
                namespace (str): the backing store's namespace

            """
            namespace = namespace or config.ENV_DATA["cluster_namespace"]
            sample = TimeoutSampler(
                timeout=120,
                sleep=15,
                func=_check_resources_deleted,
                namespace=namespace,
            )
            if not sample.wait_for_func_status(result=True):
                err_msg = f"{self.name} was not deleted properly, leftovers were found"
                log.error(err_msg)
                raise TimeoutExpiredError(err_msg)
Example #11
def run_io_in_bg(pod_obj, expect_to_fail=False, fedora_dc=False):
    """
    Run I/O in the background

    Args:
        pod_obj (Pod): The object of the pod
        expect_to_fail (bool): True for the command to be expected to fail
            (disruptive operations), False otherwise
        fedora_dc (bool): set to False by default. If set to True, it runs IO in
            background on a fedora dc pod.

    Returns:
        Thread: A thread of the I/O execution
    """
    logger.info(f"Running I/O on pod {pod_obj.name}")

    def exec_run_io_cmd(pod_obj, expect_to_fail, fedora_dc):
        """
        Execute I/O
        """
        try:
            # Writing content to a new file every 0.01 seconds.
            # Without sleep, the device will run out of space very quickly -
            # 5-10 seconds for a 5GB device
            if fedora_dc:
                FILE = FEDORA_TEST_FILE
            else:
                FILE = TEST_FILE
            pod_obj.exec_cmd_on_pod(
                command=f"bash -c \"let i=0; while true; do echo "
                f"{TEXT_CONTENT} >> {FILE}$i; let i++; sleep 0.01; done\"",
                timeout=2400)
        # Once the pod gets deleted, the I/O execution will get terminated.
        # Hence, catching this exception
        except CommandFailed as ex:
            if expect_to_fail:
                if re.search("code 137", str(ex)) or (re.search(
                        "code 143", str(ex))):
                    logger.info("I/O command got terminated as expected")
                    return
            raise ex

    thread = Thread(target=exec_run_io_cmd,
                    args=(pod_obj, expect_to_fail, fedora_dc))
    thread.start()
    time.sleep(2)

    # Checking file existence
    if fedora_dc:
        FILE = FEDORA_TEST_FILE
    else:
        FILE = TEST_FILE
    test_file = FILE + "1"

    # Check I/O started
    try:
        for sample in TimeoutSampler(timeout=20,
                                     sleep=1,
                                     func=check_file_existence,
                                     pod_obj=pod_obj,
                                     file_path=test_file):
            if sample:
                break
            logger.info(f"Waiting for I/O to start inside {pod_obj.name}")
    except TimeoutExpiredError:
        logger.error(
            f"Wait timeout: I/O failed to start inside {pod_obj.name}. "
            "Collect file list.")
        parent_dir = os.path.join(TEST_FILE, os.pardir)
        pod_obj.exec_cmd_on_pod(command=f'ls -l {os.path.abspath(parent_dir)}',
                                out_yaml_format=False)
        raise TimeoutExpiredError(f"I/O failed to start inside {pod_obj.name}")
    return thread
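
A hedged sketch of using the background I/O helper in a disruptive test; `pod_obj` and the pod deletion step are assumed context, not taken from the source:

# Hypothetical disruptive-test usage: start background I/O, delete the pod so
# the I/O command is expected to terminate, then join the worker thread.
io_thread = run_io_in_bg(pod_obj, expect_to_fail=True)
pod_obj.delete(wait=True)
io_thread.join()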
Example #12
    def test_noobaa_sts_host_node_failure(
        self,
        noobaa_sts,
        respin_noobaa_operator,
        mcg_obj,
        bucket_factory,
        nodes,
        node_restart_teardown,
    ):
        """
        Test case to fail node where NooBaa Statefulset pod (noobaa-core, noobaa-db)
        is hosted and verify the pod is rescheduled on a healthy node

        """
        executor = ThreadPoolExecutor(max_workers=1)
        pod_obj = OCP(kind=constants.POD,
                      namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)

        # Get noobaa statefulset pod and node where it is hosted
        noobaa_sts_pod = get_noobaa_pods(
            noobaa_label=self.labels_map[noobaa_sts])[0]
        noobaa_sts_pod_node = get_pod_node(noobaa_sts_pod)
        log.info(
            f"{noobaa_sts_pod.name} is running on {noobaa_sts_pod_node.name}")

        # Get the NooBaa operator pod and node where it is hosted
        # Check if NooBaa operator and statefulset pod are hosted on same node
        noobaa_operator_pod = get_noobaa_pods(noobaa_label=self.labels_map[
            constants.NOOBAA_OPERATOR_DEPLOYMENT])[0]
        noobaa_operator_pod_node = get_pod_node(noobaa_operator_pod)
        log.info(
            f"{noobaa_operator_pod.name} is running on {noobaa_operator_pod_node.name}"
        )
        if noobaa_sts_pod_node.name == noobaa_operator_pod_node.name:
            operator_on_same_node = True
            log.info(
                f"{noobaa_sts_pod.name} and {noobaa_operator_pod.name} are running on same node."
            )
        else:
            operator_on_same_node = False
            log.info(
                f"{noobaa_sts_pod.name} and {noobaa_operator_pod.name} are running on different node."
            )

        # Stop the node
        log.info(
            f"Stopping {noobaa_sts_pod_node.name} where {noobaa_sts_pod.name} is hosted"
        )
        stop_thread = executor.submit(nodes.stop_nodes,
                                      nodes=[noobaa_sts_pod_node])
        node.wait_for_nodes_status(node_names=[noobaa_sts_pod_node.name],
                                   status=constants.NODE_NOT_READY)

        # Disrupt NooBaa operator
        if respin_noobaa_operator:
            noobaa_operator_pod.delete(force=True)

        # Check result of 'stop_thread'
        stop_thread.result()

        # Wait for NooBaa operator pod to reach terminating state if on same node
        # and not respun
        if operator_on_same_node and not respin_noobaa_operator:
            wait_for_resource_state(
                resource=noobaa_operator_pod,
                state=constants.STATUS_TERMINATING,
                timeout=360,
            )

        # Wait for NooBaa operator pod to reach running state
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=self.labels_map[constants.NOOBAA_OPERATOR_DEPLOYMENT],
            resource_count=1,
        )

        # Verify NooBaa statefulset pod reschedules on another node
        try:
            for pod_list in TimeoutSampler(
                    60,
                    3,
                    get_noobaa_pods,
                    noobaa_label=self.labels_map[noobaa_sts],
            ):
                if len(pod_list) == 1:
                    pod_node = get_pod_node(pod_list[0])
                    if pod_node.name != noobaa_sts_pod_node.name:
                        log.info(
                            f"{pod_list[0].name} has been rescheduled on {pod_node.name}"
                        )
                        break
                    log.info(
                        f"Waiting for {noobaa_sts_pod.name} pod to be rescheduled"
                    )
        except TimeoutExpiredError:
            raise TimeoutExpiredError(
                f"{noobaa_sts_pod.name} pod not rescheduled within 60 seconds")

        # Wait for rescheduled pod to reach Running state.
        # For noobaa-db pod which is attached to a PV it may take more time (~8 minutes)
        # until the new pod can attach to the PV
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=self.labels_map[noobaa_sts],
            resource_count=1,
            timeout=800
            if noobaa_sts == constants.NOOBAA_DB_STATEFULSET else 60,
            sleep=30 if noobaa_sts == constants.NOOBAA_DB_STATEFULSET else 3,
        )

        # Start the node
        log.info(
            f"Starting {noobaa_sts_pod_node.name} where {noobaa_sts_pod.name} was hosted"
        )
        nodes.start_nodes(nodes=[noobaa_sts_pod_node])
        node.wait_for_nodes_status(node_names=[noobaa_sts_pod_node.name],
                                   status=constants.NODE_READY)

        log.info("Wait for all pods to be in running state")
        wait_for_pods_to_be_running(timeout=300)

        # Check cluster health
        self.sanity_helpers.health_check()

        # Creates bucket then writes, reads and deletes objects
        self.sanity_helpers.obc_put_obj_create_delete(mcg_obj, bucket_factory)
Example #13
    def delete(self, retry=True):
        """
        Deletes the current namespacestore by using OC/CLI commands

        Args:
            retry (bool): Whether to retry the deletion if it fails

        """
        log.info(f"Cleaning up namespacestore {self.name}")

        def _oc_deletion_flow():
            try:
                OCP(
                    kind="namespacestore",
                    namespace=config.ENV_DATA["cluster_namespace"],
                ).delete(resource_name=self.name)
                return True
            except CommandFailed as e:
                if "not found" in str(e).lower():
                    log.warning(
                        f"Namespacestore {self.name} was already deleted.")
                    return True
                elif all(err in e.args[0] for err in
                         ["cannot complete because pool", "in", "state"]):
                    if retry:
                        log.warning(
                            f"Deletion of {self.name} failed due to its state; Retrying"
                        )
                        return False
                    else:
                        raise
                else:
                    raise

        def _cli_deletion_flow():
            try:
                self.mcg_obj.exec_mcg_cmd(f"namespacestore delete {self.name}")
                return True
            except CommandFailed as e:
                if "being used by one or more buckets" in str(e).lower():
                    log.warning(
                        f"Deletion of {self.name} failed because it's being used by a bucket. "
                        "Retrying...")
                else:
                    log.warning(
                        f"Deletion of self.name failed. Error:\n{str(e)}")
                return False

        cmdMap = {
            "oc": _oc_deletion_flow,
            "cli": _cli_deletion_flow,
        }
        if retry:
            sample = TimeoutSampler(
                timeout=120,
                sleep=20,
                func=cmdMap[self.method],
            )
            if not sample.wait_for_func_status(result=True):
                err_msg = f"Failed to delete {self.name}"
                log.error(err_msg)
                raise TimeoutExpiredError(err_msg)
        else:
            cmdMap[self.method]()

        log.info(
            f"Verifying whether namespacestore {self.name} exists after deletion"
        )
        ns_deleted_successfully = False

        if self.method == "oc":
            try:
                OCP(
                    kind=constants.NAMESPACESTORE,
                    namespace=config.ENV_DATA["cluster_namespace"],
                    resource_name=self.name,
                ).get()
            except CommandFailed as e:
                if "not found" in str(e).lower():
                    log.info(f"Namespacestore {self.name} was deleted.")
                    ns_deleted_successfully = True
                else:
                    raise
        elif self.method == "cli":
            if self.name not in self.mcg_obj.exec_mcg_cmd(
                    "namespacestore list").stdout:
                ns_deleted_successfully = True

        assert (ns_deleted_successfully
                ), f"Namespacestore {self.name} was not deleted successfully"
Example #14
    def delete(self):
        """
        Deletes the current backingstore by using OC/CLI commands

        """
        log.info(f"Cleaning up backingstore {self.name}")
        # If the backingstore utilizes a PV, save its PV name for deletion verification
        if self.type == "pv":
            try:
                backingstore_pvc = OCP(
                    kind=constants.PVC,
                    selector=f"pool={self.name}",
                    namespace=config.ENV_DATA["cluster_namespace"],
                ).get()["items"][0]
            except IndexError:
                log.error(
                    f"Could not find the OCP object for {self.name}, proceeding without removal"
                )
                return True
            except Exception as e:
                raise e
            pv_name = backingstore_pvc["spec"]["volumeName"]

        if self.method == "oc":
            OCP(kind="backingstore",
                namespace=config.ENV_DATA["cluster_namespace"]).delete(
                    resource_name=self.name)
        elif self.method == "cli":

            def _cli_deletion_flow():
                try:
                    self.mcg_obj.exec_mcg_cmd(
                        f"backingstore delete {self.name}")
                    return True
                except CommandFailed as e:
                    if "being used by one or more buckets" in str(e).lower():
                        log.warning(
                            f"Deletion of {self.name} failed because it's being used by a bucket. "
                            "Retrying...")
                    else:
                        log.warning(
                            f"Deletion of {self.name} failed. Error:\n{str(e)}")
                    return False

            sample = TimeoutSampler(
                timeout=120,
                sleep=20,
                func=_cli_deletion_flow,
            )
            if not sample.wait_for_func_status(result=True):
                err_msg = f"Failed to {self.name}"
                log.error(err_msg)
                raise TimeoutExpiredError(err_msg)

        # Verify deletion was successful
        log.info(
            f"Verifying whether backingstore {self.name} exists after deletion"
        )
        bs_deleted_successfully = False

        try:
            if self.method == "oc":
                OCP(
                    kind="backingstore",
                    namespace=config.ENV_DATA["cluster_namespace"],
                    resource_name=self.name,
                ).get()
            elif self.method == "cli":
                self.mcg_obj.exec_mcg_cmd(f"backingstore status {self.name}")

        except CommandFailed as e:
            if "Not Found" in str(e) or "NotFound" in str(e):
                bs_deleted_successfully = True
            else:
                raise

        assert (bs_deleted_successfully
                ), f"Backingstore {self.name} was not deleted successfully"

        def _wait_for_pv_backingstore_resource_deleted(namespace=None):
            """
            Wait for the PV backing store resources to be deleted at the end of test teardown

            Args:
                namespace (str): the backing store's namespace

            """
            namespace = namespace or config.ENV_DATA["cluster_namespace"]
            sample = TimeoutSampler(
                timeout=120,
                sleep=15,
                func=_check_resources_deleted,
                namespace=namespace,
            )
            if not sample.wait_for_func_status(result=True):
                err_msg = f"{self.name} was not deleted properly, leftovers were found"
                log.error(err_msg)
                raise TimeoutExpiredError(err_msg)

        def _check_resources_deleted(namespace=None):
            """
            Check if the resources of the PV pool backingstore were deleted properly

            Args:
                namespace (str): backing store's namespace

            Returns:
                bool: True if no PV, PVC or pod leftovers were found

            """
            try:
                OCP(kind=constants.PV, resource_name=pv_name).get()
                log.warning(f"Found PV leftovers belonging to {self.name}")
                return False
            except CommandFailed as e:
                if "not found" in str(e):
                    pass
                else:
                    raise
            pvcs = get_all_pvcs(namespace=namespace,
                                selector=f"pool={self.name}")
            pods = get_pods_having_label(namespace=namespace,
                                         label=f"pool={self.name}")
            return len(pvcs["items"]) == 0 and len(pods) == 0

        if self.type == "pv":
            log.info(
                f"Waiting for backingstore {self.name} resources to be deleted"
            )
            _wait_for_pv_backingstore_resource_deleted()
Example #15
    def delete(self):
        """
        Deletes the current namespacestore by using OC/CLI commands

        """
        log.info(f"Cleaning up namespacestore {self.name}")

        if self.method == "oc":
            try:
                OCP(
                    kind="namespacestore",
                    namespace=config.ENV_DATA["cluster_namespace"],
                ).delete(resource_name=self.name)
            except CommandFailed as e:
                if "not found" in str(e).lower():
                    log.warning(f"Namespacestore {self.name} was already deleted.")
                else:
                    raise

        elif self.method == "cli":

            def _cli_deletion_flow():
                try:
                    self.mcg_obj.exec_mcg_cmd(f"namespacestore delete {self.name}")
                    return True
                except CommandFailed as e:
                    if "being used by one or more buckets" in str(e).lower():
                        log.warning(
                            f"Deletion of {self.name} failed because it's being used by a bucket. "
                            "Retrying..."
                        )
                    else:
                        log.warning(f"Deletion of self.name failed. Error:\n{str(e)}")
                    return False

            sample = TimeoutSampler(
                timeout=120,
                sleep=20,
                func=_cli_deletion_flow,
            )
            if not sample.wait_for_func_status(result=True):
                err_msg = f"Failed to {self.name}"
                log.error(err_msg)
                raise TimeoutExpiredError(err_msg)

        log.info(f"Verifying whether namespacestore {self.name} exists after deletion")
        ns_deleted_successfully = False

        if self.method == "oc":
            try:
                OCP(
                    kind="namespacestore",
                    namespace=config.ENV_DATA["cluster_namespace"],
                    resource_name=self.name,
                ).get()
            except CommandFailed as e:
                if "not found" in str(e).lower():
                    log.info(f"Namespacestore {self.name} was deleted.")
                    ns_deleted_successfully = True
                else:
                    raise
        elif self.method == "cli":
            if self.name not in self.mcg_obj.exec_mcg_cmd("namespacestore list"):
                ns_deleted_successfully = True

        assert (
            ns_deleted_successfully
        ), f"Namespacestore {self.name} was not deleted successfully"
Example #16
    def test_rbd_based_rwo_pvc(self, reclaim_policy):
        """
        Verifies RBD Based RWO Dynamic PVC creation with Reclaim policy set to
        Delete/Retain

        Steps:
        1. Create Storage Class with reclaimPolicy: Delete/Retain
        2. Create PVC with 'accessModes' 'ReadWriteOnce'
        3. Create two pods using same PVC
        4. Run IO on first pod
        5. Verify second pod is not getting into Running state
        6. Delete first pod
        7. Verify second pod is in Running state
        8. Verify usage of volume in second pod is matching with usage in
           first pod
        9. Run IO on second pod
        10. Delete second pod
        11. Delete PVC
        12. Verify PV associated with deleted PVC is also deleted/released
        """
        # Create Storage Class with reclaimPolicy: Delete
        sc_obj = helpers.create_storage_class(
            interface_type=constants.CEPHBLOCKPOOL,
            interface_name=self.cbp_obj.name,
            secret_name=self.rbd_secret_obj.name,
            reclaim_policy=reclaim_policy
        )

        # Create PVC with 'accessModes' 'ReadWriteOnce'
        pvc_data = templating.load_yaml_to_dict(constants.CSI_PVC_YAML)
        pvc_data['metadata']['name'] = helpers.create_unique_resource_name(
            'test', 'pvc'
        )
        pvc_data['metadata']['namespace'] = self.namespace
        pvc_data['spec']['storageClassName'] = sc_obj.name
        pvc_data['spec']['accessModes'] = ['ReadWriteOnce']
        pvc_obj = PVC(**pvc_data)
        pvc_obj.create()

        # Create first pod
        log.info(f"Creating two pods which use PVC {pvc_obj.name}")
        pod_data = templating.load_yaml_to_dict(constants.CSI_RBD_POD_YAML)
        pod_data['metadata']['name'] = helpers.create_unique_resource_name(
            'test', 'pod'
        )
        pod_data['metadata']['namespace'] = self.namespace
        pod_data['spec']['volumes'][0]['persistentVolumeClaim']['claimName'] = pvc_obj.name

        pod_obj = Pod(**pod_data)
        pod_obj.create()
        assert helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)

        node_pod1 = pod_obj.get()['spec']['nodeName']

        # Create second pod
        # Try creating pod until it is on a different node than first pod
        for retry in range(1, 6):
            pod_data = templating.load_yaml_to_dict(constants.CSI_RBD_POD_YAML)
            pod_data['metadata']['name'] = helpers.create_unique_resource_name(
                'test', 'pod'
            )
            pod_data['metadata']['namespace'] = self.namespace
            pod_data['spec']['volumes'][0]['persistentVolumeClaim']['claimName'] = pvc_obj.name
            pod_obj2 = Pod(**pod_data)
            pod_obj2.create()
            assert helpers.wait_for_resource_state(pod_obj2, constants.STATUS_PENDING)

            node_pod2 = pod_obj2.get()['spec']['nodeName']
            if node_pod1 != node_pod2:
                break
            log.info(
                f"Both pods are on the same node. Deleting the second pod and "
                f"creating another pod. Retry count: {retry}"
            )
            pod_obj2.delete()
            if retry == 5:
                raise UnexpectedBehaviour(
                    "Second pod was created on the same node as the first "
                    "pod even after trying 5 times."
                )

        # Run IO on first pod
        log.info(f"Running IO on first pod {pod_obj.name}")
        pod_obj.run_io('fs', '1G')
        logging.info(f"Waiting for IO results from pod {pod_obj.name}")
        fio_result = pod_obj.get_fio_results()
        logging.info("IOPs after FIO:")
        logging.info(
            f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}"
        )
        logging.info(
            f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}"
        )

        # Fetch usage details
        mount_point = pod_obj.exec_cmd_on_pod(command="df -kh")
        mount_point = mount_point.split()
        usage = mount_point[mount_point.index('/var/lib/www/html') - 1]

        # Verify that second pod is not getting into Running state. Check it
        # for some period of time.
        try:
            assert not pod_obj2.ocp.wait_for_resource(
                condition='Running', resource_name=pod_obj2.name,
            ), "Unexpected: Second pod is in Running state"
        except TimeoutExpiredError:
            log.info(
                f"Verified: Second pod {pod_obj2.name} is not in "
                f"Running state"
            )

        # Delete first pod
        pod_obj.delete(wait=True)

        # Verify pod is deleted
        try:
            pod_obj.get()
            raise UnexpectedBehaviour(
                f"First pod {pod_obj.name} is not deleted."
            )
        except CommandFailed as exp:
            assert "not found" in str(exp), (
                "Failed to fetch pod details"
            )
            log.info(f"First pod {pod_obj.name} is deleted.")

        # Wait for second pod to be in Running state
        try:
            pod_obj2.ocp.wait_for_resource(
                condition='Running', resource_name=pod_obj2.name, timeout=180
            )
        except TimeoutExpiredError as exp:
            raise TimeoutExpiredError(
                f"Second pod {pod_obj2.name} is not in Running state "
                f"after deleting first pod."
            ) from exp
        log.info(
            f"Second pod {pod_obj2.name} is in Running state after "
            f"deleting the first pod."
        )

        # Verify that volume usage in second pod is matching with the usage in
        # first pod
        mount_point = pod_obj2.exec_cmd_on_pod(command="df -kh")
        mount_point = mount_point.split()
        usage_re = mount_point[mount_point.index('/var/lib/www/html') - 1]
        assert usage_re == usage, (
            "Use percentage in new pod is not matching with old pod"
        )

        # Run IO on second pod
        log.info(f"Running IO on second pod {pod_obj2.name}")
        pod_obj2.run_io('fs', '1G')
        logging.info(f"Waiting for IO results from pod {pod_obj2.name}")
        fio_result = pod_obj2.get_fio_results()
        logging.info("IOPs after FIO:")
        logging.info(
            f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}"
        )
        logging.info(
            f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}"
        )

        # Delete second pod
        pod_obj2.delete()

        # Verify pod is deleted
        try:
            pod_obj2.get()
            raise UnexpectedBehaviour(
                f"Second pod {pod_obj2.name} is not deleted."
            )
        except CommandFailed as exp:
            assert "not found" in str(exp), (
                "Failed to fetch pod details"
            )
            log.info(f"Second pod {pod_obj2.name} is deleted.")

        # Get PV name
        pvc_obj.reload()
        pv_name = pvc_obj.backed_pv

        # Delete PVC
        pvc_obj.delete()

        # Verify PVC is deleted
        try:
            pvc_obj.get()
            raise UnexpectedBehaviour(
                f"PVC {pvc_obj.name} is not deleted."
            )
        except CommandFailed as exp:
            assert "not found" in str(exp), (
                "Failed to verify PVC deletion."
            )
            log.info(f"PVC {pvc_obj.name} is deleted.")

        pv_obj = OCP(
            kind=constants.PV, namespace=self.namespace
        )

        if reclaim_policy == "Delete":
            # Verify PV is deleted
            for pv_info in TimeoutSampler(
                    30, 2, pv_obj.get, out_yaml_format=False
            ):
                if pv_name not in pv_info:
                    break
                log.warning(
                    f"PV {pv_name} exists after deleting PVC {pvc_obj.name}. "
                    f"Checking again."
                )

            # TODO: Verify PV using ceph toolbox. PV should be deleted.
            # Blocked by bz 1723656

        elif reclaim_policy == "Retain":
            # Wait for PV to be in Released state
            assert pv_obj.wait_for_resource(
                condition='Released', resource_name=pv_name
            )
            log.info(f"PV {pv_name} is in Released state")

            # TODO: Delete PV from backend and verify
            # Blocked by bz 1723656
            pv_obj.delete(resource_name=pv_name)

        # Delete Storage Class
        sc_obj.delete()