Example 1
def validate_cluster_import(cluster_name):
    """
    Validate ACM status of managed cluster

    Args:
        cluster_name (str): cluster name to validate

    Assert:
        All conditions of the selected managed cluster should be "True"; fails otherwise

    Return:
        True, if no AssertionError was raised
    """
    config.switch_ctx(0)
    oc_obj = OCP(kind=ACM_MANAGED_CLUSTERS)
    conditions = oc_obj.exec_oc_cmd(
        f"get managedclusters {cluster_name} -ojsonpath='{{.status.conditions}}'"
    )
    log.debug(conditions)

    for dict_status in conditions:
        log.info(f"Message: {dict_status.get('message')}")
        log.info(f"Status: {dict_status.get('status')}")
        assert (dict_status.get("status") == "True"
                ), f"Status is not True, but: {dict_status.get('status')}"

    # Return true if Assertion error was not raised:
    return True
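
A minimal usage sketch for the helper above; the managed cluster names are hypothetical and would normally come from the run configuration:

# Hypothetical managed cluster names; real names come from the run configuration.
for name in ("ocs-cluster1", "ocs-cluster2"):
    validate_cluster_import(name)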
Example 2
def relocate(preferred_cluster, drpc_name, namespace):
    """
    Initiates Relocate action to the specified cluster

    Args:
        preferred_cluster (str): Cluster name to which the workload should be relocated
        drpc_name (str): Name of the DRPC resource to apply the patch
        namespace (str): Name of the namespace to use

    """
    prev_index = config.cur_index
    config.switch_acm_ctx()
    relocate_params = (
        f'{{"spec":{{"action":"Relocate","preferredCluster":"{preferred_cluster}"}}}}'
    )
    drpc_obj = ocp.OCP(
        kind=constants.DRPC, namespace=namespace, resource_name=drpc_name
    )
    drpc_obj._has_phase = True

    logger.info(f"Initiating relocate action to {preferred_cluster}")
    assert drpc_obj.patch(
        params=relocate_params, format_type="merge"
    ), f"Failed to patch {constants.DRPC}: {drpc_name}"

    logger.info(
        f"Wait for {constants.DRPC}: {drpc_name} to reach {constants.STATUS_RELOCATED} phase"
    )
    drpc_obj.wait_for_phase(constants.STATUS_RELOCATED)

    config.switch_ctx(prev_index)
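
Most helpers in these examples follow the same pattern: remember config.cur_index, switch to another cluster context, do the work, and switch back. A minimal sketch of that pattern as a hypothetical context manager (not part of ocs-ci) could look like this:

from contextlib import contextmanager

@contextmanager
def switched_ctx(index):
    """Hypothetical helper: temporarily switch to the given cluster context."""
    prev_index = config.cur_index
    config.switch_ctx(index)
    try:
        yield
    finally:
        config.switch_ctx(prev_index)

# Usage (cluster index assumed):
# with switched_ctx(1):
#     ...  # run commands against cluster index 1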
Example 3
    def finalizer():
        # Switching to provider cluster context will be done during the test case in certain cases.
        # Switch back to consumer cluster context after the test case.
        if self.provider_index:
            config.switch_ctx(self.initial_cluster_index)
        assert ceph_health_check(), "Ceph cluster health is not OK"
        log.info("Ceph cluster health is OK")
Example 4
def pytest_configure(config):
    """
    Load config files, and initialize ocs-ci library.

    Args:
        config (pytest.config): Pytest config object

    """
    set_log_level(config)
    # Somewhat hacky but this lets us differentiate between run-ci executions
    # and plain pytest unit test executions
    ocscilib_module = "ocs_ci.framework.pytest_customization.ocscilib"
    if ocscilib_module not in config.getoption("-p"):
        return
    for i in range(ocsci_config.nclusters):
        log.debug(f"Pytest configure switching to: cluster={i}")
        ocsci_config.switch_ctx(i)

        if not (config.getoption("--help") or config.getoption("collectonly")):
            process_cluster_cli_params(config)
            config_file = os.path.expanduser(
                os.path.join(
                    ocsci_config.RUN["log_dir"],
                    f"run-{ocsci_config.RUN['run_id']}-cl{i}-config.yaml",
                ))
            dump_config_to_file(config_file)
            log.info(f"Dump of the consolidated config file is located here: "
                     f"{config_file}")

            # Add OCS related versions to the html report and remove
            # extraneous metadata
            markers_arg = config.getoption("-m")

            # add logs url
            logs_url = ocsci_config.RUN.get("logs_url")
            if logs_url:
                config._metadata["Logs URL"] = logs_url

            if ocsci_config.RUN["cli_params"].get("teardown") or (
                    "deployment" in markers_arg
                    and ocsci_config.RUN["cli_params"].get("deploy")):
                log.info(
                    "Skipping versions collecting because: Deploy or destroy of "
                    "cluster is performed.")
                return
            elif ocsci_config.ENV_DATA["skip_ocs_deployment"]:
                log.info("Skipping version collection because we skipped "
                         "the OCS deployment")
                return
            elif ocsci_config.RUN["cli_params"].get("dev_mode"):
                log.info("Running in development mode")
                return
            print("Collecting Cluster versions")
            # remove extraneous metadata
            for extra_meta in ["Python", "Packages", "Plugins", "Platform"]:
                if config._metadata.get(extra_meta):
                    del config._metadata[extra_meta]

            config._metadata["Test Run Name"] = get_testrun_name()
            gather_version_info_for_report(config)
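
The -p guard above is what separates run-ci executions from plain pytest runs. A hedged illustration of the two invocations (the plugin path is taken from the snippet; the exact run-ci wiring is an assumption):

# run-ci loads the ocs-ci plugin explicitly, so the guard above is satisfied:
#     pytest -p ocs_ci.framework.pytest_customization.ocscilib tests/
# A plain unit-test invocation omits -p, so pytest_configure() returns early:
#     pytest tests/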
Example 5
def failover(failover_cluster, drpc_name, namespace):
    """
    Initiates Failover action to the specified cluster

    Args:
        failover_cluster (str): Cluster name to which the workload should be failed over
        drpc_name (str): Name of the DRPC resource to apply the patch
        namespace (str): Name of the namespace to use

    """
    prev_index = config.cur_index
    config.switch_acm_ctx()
    failover_params = (
        f'{{"spec":{{"action":"Failover","failoverCluster":"{failover_cluster}"}}}}'
    )
    drpc_obj = ocp.OCP(
        kind=constants.DRPC, namespace=namespace, resource_name=drpc_name
    )
    drpc_obj._has_phase = True

    logger.info(f"Initiating failover action to {failover_cluster}")
    assert drpc_obj.patch(
        params=failover_params, format_type="merge"
    ), f"Failed to patch {constants.DRPC}: {drpc_name}"

    logger.info(
        f"Wait for {constants.DRPC}: {drpc_name} to reach {constants.STATUS_FAILEDOVER} phase"
    )
    drpc_obj.wait_for_phase(constants.STATUS_FAILEDOVER)

    config.switch_ctx(prev_index)
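
A usage sketch that pairs this failover helper with the relocate helper from Example 2; the cluster names, DRPC name, and namespace are hypothetical:

# Fail the workload over to the secondary cluster, then move it back.
# (Real tests normally run checks and wait between the two actions.)
failover(
    failover_cluster="ocs-cluster2",   # hypothetical cluster name
    drpc_name="busybox-drpc",          # hypothetical DRPC resource name
    namespace="busybox-workloads",     # hypothetical workload namespace
)
relocate(
    preferred_cluster="ocs-cluster1",  # hypothetical cluster name
    drpc_name="busybox-drpc",
    namespace="busybox-workloads",
)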
Example 6
    def check_scale_pods_and_pvcs_created_on_consumers(self):
        for consumer_i, fio_scale in self.consumer_i_per_fio_scale.items():
            config.switch_ctx(consumer_i)
            c_name = config.ENV_DATA.get("cluster_name")
            ocp_pvc = OCP(kind=constants.PVC, namespace=fio_scale.namespace)
            ocp_pvc.wait_for_resource(
                timeout=30,
                condition=constants.STATUS_BOUND,
                resource_count=self.scale_count,
            )
            log.info(
                f"All the PVCs were created successfully on the consumer {c_name}"
            )

            ocp_pod = OCP(kind=constants.POD, namespace=fio_scale.namespace)
            ocp_pod.wait_for_resource(
                timeout=30,
                condition=constants.STATUS_COMPLETED,
                resource_count=self.expected_pod_num,
            )
            log.info(
                f"All the pods were created successfully on the consumer {c_name}"
            )

        log.info(
            "All the pods and PVCs were created successfully on the consumers")
Example 7
def wait_for_mirroring_status_ok(replaying_images=None, timeout=300):
    """
    Wait for mirroring status to reach health OK and expected number of replaying
    images for each of the ODF cluster

    Args:
        replaying_images (int): Expected number of images in replaying state
        timeout (int): time in seconds to wait for mirroring status reach OK

    Returns:
        bool: True if status contains expected health and states values

    Raises:
        AssertionError: In case of unexpected mirroring status

    """
    for cluster in get_non_acm_cluster_config():
        config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
        logger.info(
            f"Validating mirroring status on cluster {cluster.ENV_DATA['cluster_name']}"
        )
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_mirroring_status_ok,
            replaying_images=replaying_images,
        )
        assert sample.wait_for_func_status(result=True), (
            "The mirroring status does not have expected values within the time"
            f" limit on cluster {cluster.ENV_DATA['cluster_name']}"
        )
Example 8
    def test_create_scale_pods_and_pvcs_with_ms_consumers(
            self, create_scale_pods_and_pvcs_using_kube_job_on_ms_consumers):
        """
        Test create scale pods and PVCs using a kube job with MS consumers
        """
        self.orig_index = config.cur_index
        self.consumer_i_per_fio_scale = (
            create_scale_pods_and_pvcs_using_kube_job_on_ms_consumers(
                scale_count=self.scale_count,
                pvc_per_pod_count=self.pvc_per_pod_count,
            ))
        assert config.cur_index == self.orig_index, "The current index has changed"

        config.switch_to_provider()
        time_to_wait_for_io_running = 120
        log.info(f"Wait {time_to_wait_for_io_running} seconds for checking "
                 f"that the IO running as expected")
        sleep(time_to_wait_for_io_running)
        ceph_health_check()

        log.info("Checking the Ceph Health on the consumers")
        consumer_indexes = config.get_consumer_indexes_list()
        for i in consumer_indexes:
            config.switch_ctx(i)
            ceph_health_check()

        self.check_scale_pods_and_pvcs_created_on_consumers()
        log.info(
            "The scale pods and PVCs using a kube job with MS consumers created successfully"
        )
Example 9
    def test_automated_recovery_from_failed_nodes_reactive_ms(
        self,
        nodes,
        failure,
    ):
        """
        We have 3 test cases to check when running IO in the background:
            A) Automated recovery from stopped worker node
            B) Automated recovery from termination of a worker node
            C) Automated recovery from unscheduling and rescheduling a worker node.
        """
        self.create_resources()

        config.switch_to_provider()
        log.info("Start executing the node test function on the provider...")
        FAILURE_TYPE_FUNC_CALL_DICT[failure](nodes)

        # Verification steps after the automated recovery.
        assert check_pods_after_node_replacement(
        ), "Not all the pods are running"
        assert (verify_worker_nodes_security_groups()
                ), "Not all the worker nodes security groups set correctly"

        log.info("Checking that the ceph health is ok on the provider")
        ceph_health_check()

        log.info("Checking that the ceph health is ok on the consumers")
        consumer_indexes = config.get_consumer_indexes_list()
        for i in consumer_indexes:
            config.switch_ctx(i)
            ceph_health_check()
Example 10
        def finalizer():
            ocp_nodes = get_node_objs()
            for n in ocp_nodes:
                recover_node_to_ready_state(n)

            logger.info("Switch to the original cluster index")
            config.switch_ctx(self.orig_index)
            ceph_health_check()
Example 11
    def teardown():
        # ocs-operator pod deletion on consumer cluster will trigger rook-ceph-tools pod respin. Patching of
        # rook-ceph-tools pod is done in the test case after ocs-operator pod respin. But if the automatic
        # respin of rook-ceph-tools pod is delayed by a few seconds, the patching step in the test case will not
        # run. So doing patch at the end of the test to ensure that the rook-ceph-tools pod on consumers
        # can run ceph commands.
        for consumer_index in self.consumer_indexes:
            config.switch_ctx(consumer_index)
            patch_consumer_toolbox()
        # Switching cluster context will be done during the test case.
        # Switch back to current cluster context after the test case.
        config.switch_ctx(initial_cluster_index)
Example 12
def test_deployment(pvc_factory, pod_factory):
    deploy = config.RUN["cli_params"].get("deploy")
    teardown = config.RUN["cli_params"].get("teardown")
    if not teardown or deploy:
        log.info("Verifying OCP cluster is running")
        assert is_cluster_running(config.ENV_DATA["cluster_path"])
        if not config.ENV_DATA["skip_ocs_deployment"]:
            if config.multicluster:
                restore_ctx_index = config.cur_index
                for cluster in get_non_acm_cluster_config():
                    config.switch_ctx(
                        cluster.MULTICLUSTER["multicluster_index"])
                    log.info(
                        f"Sanity check for cluster: {cluster.ENV_DATA['cluster_name']}"
                    )
                    sanity_helpers = Sanity()
                    sanity_helpers.health_check()
                    sanity_helpers.delete_resources()
                config.switch_ctx(restore_ctx_index)
            else:
                ocs_registry_image = config.DEPLOYMENT.get(
                    "ocs_registry_image")
                if config.ENV_DATA["mcg_only_deployment"]:
                    mcg_only_install_verification(
                        ocs_registry_image=ocs_registry_image)
                    return
                else:
                    ocs_install_verification(
                        ocs_registry_image=ocs_registry_image)

                # Check basic cluster functionality by creating resources
                # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
                # run IO and delete the resources
                if config.DEPLOYMENT["external_mode"]:
                    sanity_helpers = SanityExternalCluster()
                else:
                    sanity_helpers = Sanity()
                if (config.ENV_DATA["platform"].lower()
                        in constants.MANAGED_SERVICE_PLATFORMS):
                    try:
                        sanity_helpers.health_check()
                    except exceptions.ResourceWrongStatusException as err_msg:
                        log.warning(err_msg)
                else:
                    sanity_helpers.health_check()
                sanity_helpers.delete_resources()
                # Verify ceph health
                log.info("Verifying ceph health after deployment")
                assert ceph_health_check(tries=10, delay=30)

    if teardown:
        log.info(
            "Cluster will be destroyed during teardown part of this test.")
Example 13
        def finalizer():
            config.switch_to_provider()
            log.info(
                "Verify that all the worker nodes are in a Ready state on the provider"
            )
            wnodes = get_nodes(node_type=constants.WORKER_MACHINE)
            for wnode in wnodes:
                is_recovered = recover_node_to_ready_state(wnode)
                if not is_recovered:
                    log.warning(f"The node {wnode.name} has failed to recover")

            log.info("Verify again that the ceph health is OK")
            ceph_health_check()

            config.switch_ctx(self.orig_index)
Example 14
def get_scheduling_interval(namespace):
    """
    Get scheduling interval for the workload in the given namespace

    Args:
        namespace (str): Name of the namespace

    Returns:
        int: scheduling interval value from DRPolicy

    """
    restore_index = config.cur_index
    drpolicy_obj = DRPC(namespace=namespace).drpolicy_obj
    interval_value = int(drpolicy_obj.get()["spec"]["schedulingInterval"][:-1])
    config.switch_ctx(restore_index)
    return interval_value
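
Callers usually convert the returned value into a wait. A minimal sketch, assuming the interval is expressed in minutes (the unit suffix is stripped above) and using a hypothetical workload namespace:

from time import sleep

interval = get_scheduling_interval(namespace="busybox-workloads")  # hypothetical namespace
sleep(2 * interval * 60)  # allow two scheduling intervals to elapse, assuming minutes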
Example 15
    def check_pods_and_pvcs_deleted_on_consumers(self):
        for consumer_i, fio_scale in self.consumer_i_per_fio_scale.items():
            config.switch_ctx(consumer_i)
            c_name = config.ENV_DATA.get("cluster_name")

            pvc_objs = get_all_pvcs(fio_scale.namespace)["items"]
            assert not pvc_objs, "There are still remaining PVCs"
            log.info(
                f"All the PVCs deleted successfully on the consumer {c_name}")

            pod_objs = get_all_pods(fio_scale.namespace)
            assert not pod_objs, "There are still remaining pods"
            log.info(
                f"All the pods deleted successfully on the consumer {c_name}")

        log.info(
            "All the pods and PVCs were deleted successfully on the consumers")
Example 16
    def post_deploy_ops(self):
        """
        1. Install ingress certificates on OCP clusters deployed through ACM
        2. Run post_ocp_deploy on OCP clusters

        """
        prev = config.cur_index
        for cluster in get_non_acm_cluster_config():
            config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
            ssl_key = config.DEPLOYMENT.get("ingress_ssl_key")
            ssl_cert = config.DEPLOYMENT.get("ingress_ssl_cert")
            for key in [ssl_key, ssl_cert]:
                if os.path.exists(key):
                    os.unlink(key)
            logger.info("Running post ocp deploy ops")
            self.post_ocp_deploy()
        config.switch_ctx(prev)
Example 17
    def post_destroy_ops(self):
        """
        Post destroy ops includes
        1. Deleting DNS entries
        2. Freeing the ips assigned

        """
        prev_ctx = config.cur_index
        config.switch_ctx(self.cluster_conf.MULTICLUSTER["multicluster_index"])
        vmware.delete_dns_records()
        ipam = IPAM(appiapp="address")
        hosts = [
            f"{config.ENV_DATA.get('cluster_name')}-{i}"
            for i in range(self.nvips)
        ]
        ipam.release_ips(hosts)
        config.switch_ctx(prev_ctx)
Example 18
    def setup(self, request, create_pvcs_and_pods):
        """
        Prepare pods for the test and add finalizer.

        """
        self.provider_cluster_index = config.get_provider_index()
        self.consumer_indexes = config.get_consumer_indexes_list()
        if config.ENV_DATA["platform"].lower(
        ) in constants.MANAGED_SERVICE_PLATFORMS:
            # Get the index of current cluster
            initial_cluster_index = config.cur_index

            def teardown():
                # ocs-operator pod deletion on consumer cluster will trigger rook-ceph-tools pod respin. Patching of
                # rook-ceph-tools pod is done in the test case after ocs-operator pod respin. But if the automatic
                # respin of rook-ceph-tools pod is delayed by a few seconds, the patching step in the test case will not
                # run. So doing patch at the end of the test to ensure that the rook-ceph-tools pod on consumers
                # can run ceph commands.
                for consumer_index in self.consumer_indexes:
                    config.switch_ctx(consumer_index)
                    patch_consumer_toolbox()
                # Switching cluster context will be done during the test case.
                # Switch back to current cluster context after the test case.
                config.switch_ctx(initial_cluster_index)

            request.addfinalizer(teardown)

        self.io_pods = list()
        for cluster_index in self.consumer_indexes:
            config.switch_ctx(cluster_index)
            consumer_cluster_kubeconfig = os.path.join(
                config.clusters[cluster_index].ENV_DATA["cluster_path"],
                config.clusters[cluster_index].RUN.get("kubeconfig_location"),
            )
            pvcs, io_pods = create_pvcs_and_pods(
                pvc_size=self.pvc_size,
                replica_count=1,
                pod_dict_path=constants.PERF_POD_YAML,
            )
            for pvc_obj in pvcs:
                pvc_obj.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
            for io_pod in io_pods:
                io_pod.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
            pvcs[0].project.cluster_kubeconfig = consumer_cluster_kubeconfig
            self.io_pods.extend(io_pods)
Example 19
def get_clusters_env():
    """
    Stores each cluster's kubeconfig location and cluster name, in case of a multi-cluster setup.
    Switches back to cluster index zero (the default context) before returning.

    Returns:
        dict: cluster names and kubeconfig locations, keyed by cluster index

    """
    clusters_env = {}
    for index in range(config.nclusters):
        config.switch_ctx(index=index)
        clusters_env[f"kubeconfig_location_c{index}"] = os.path.join(
            config.ENV_DATA["cluster_path"], config.RUN["kubeconfig_location"])
        clusters_env[f"cluster_name_{index}"] = config.ENV_DATA["cluster_name"]

    config.switch_ctx(index=0)

    return clusters_env
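
A small sketch of consuming the returned dictionary, reusing the config and log objects seen throughout these examples; the key layout follows the function above:

clusters_env = get_clusters_env()
for index in range(config.nclusters):
    name = clusters_env[f"cluster_name_{index}"]
    kubeconfig = clusters_env[f"kubeconfig_location_c{index}"]
    log.info(f"Cluster {index}: {name} -> KUBECONFIG={kubeconfig}")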
Example 20
def get_current_secondary_cluster_name(namespace):
    """
    Get current secondary cluster name based on workload namespace

    Args:
        namespace (str): Name of the namespace

    Returns:
        str: Current secondary cluster name

    """
    restore_index = config.cur_index
    primary_cluster_name = get_current_primary_cluster_name(namespace)
    drpolicy_data = DRPC(namespace=namespace).drpolicy_obj.get()
    config.switch_ctx(restore_index)
    for cluster_name in drpolicy_data["spec"]["drClusters"]:
        if cluster_name != primary_cluster_name:
            return cluster_name
Example 21
def get_current_primary_cluster_name(namespace):
    """
    Get current primary cluster name based on workload namespace

    Args:
        namespace (str): Name of the namespace

    Returns:
        str: Current primary cluster name

    """
    restore_index = config.cur_index
    drpc_data = DRPC(namespace=namespace).get()
    if drpc_data.get("spec").get("action") == constants.ACTION_FAILOVER:
        cluster_name = drpc_data["spec"]["failoverCluster"]
    else:
        cluster_name = drpc_data["spec"]["preferredCluster"]
    config.switch_ctx(restore_index)
    return cluster_name
Example 22
    def submariner_configure_upstream(self):
        """
        Deploy and Configure upstream submariner

        Raises:
            DRPrimaryNotFoundException: If there is no designated primary cluster found

        """
        if self.designated_broker_cluster_index < 0:
            raise DRPrimaryNotFoundException(
                "Designated primary cluster not found")

        # Deploy broker on designated cluster
        # follow this config switch statement carefully to be mindful
        # about the context with which we are performing the operations
        config.switch_ctx(self.designated_broker_cluster_index)
        logger.info(
            f"Switched context: {config.cluster_ctx.ENV_DATA['cluster_name']}")

        deploy_broker_cmd = "deploy-broker"
        try:
            run_subctl_cmd(deploy_broker_cmd)
        except CommandFailed:
            logger.exception("Failed to deploy submariner broker")
            raise

        # Label the gateway nodes on all non acm cluster
        restore_index = config.cur_index
        for cluster in get_non_acm_cluster_config():
            config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
            gateway_node = self.get_default_gateway_node()
            label_nodes([gateway_node],
                        constants.SUBMARINER_GATEWAY_NODE_LABEL)
        config.switch_ctx(restore_index)

        # Join all the clusters (except ACM cluster in case of hub deployment)
        for cluster in config.clusters:
            print(len(config.clusters))
            cluster_index = cluster.MULTICLUSTER["multicluster_index"]
            if cluster_index != config.get_acm_index():
                join_cmd = (f"join --kubeconfig {cluster.RUN['kubeconfig']} "
                            f"{config.ENV_DATA['submariner_info_file']} "
                            f"--clusterid c{self.cluster_seq} --natt=false")
                try:
                    run_subctl_cmd(join_cmd)
                    logger.info(
                        f"Subctl join succeeded for {cluster.ENV_DATA['cluster_name']}"
                    )
                except CommandFailed:
                    logger.exception("Cluster failed to join")
                    raise

                self.cluster_seq = self.cluster_seq + 1
                self.dr_only_list.append(cluster_index)
        # Verify submariner connectivity between clusters(excluding ACM)
        kubeconf_list = []
        for i in self.dr_only_list:
            kubeconf_list.append(config.clusters[i].RUN["kubeconfig"])
        connct_check = f"verify {' '.join(kubeconf_list)} --only connectivity"
        run_subctl_cmd(connct_check)
Example 23
def failover(failover_cluster, namespace):
    """
    Initiates Failover action to the specified cluster

    Args:
        failover_cluster (str): Cluster name to which the workload should be failed over
        namespace (str): Namespace where workload is running

    """
    restore_index = config.cur_index
    config.switch_acm_ctx()
    failover_params = f'{{"spec":{{"action":"{constants.ACTION_FAILOVER}","failoverCluster":"{failover_cluster}"}}}}'
    drpc_obj = DRPC(namespace=namespace)
    drpc_obj.wait_for_peer_ready_status()
    logger.info(f"Initiating Failover action with failoverCluster:{failover_cluster}")
    assert drpc_obj.patch(
        params=failover_params, format_type="merge"
    ), f"Failed to patch {constants.DRPC}: {drpc_obj.resource_name}"

    logger.info(
        f"Wait for {constants.DRPC}: {drpc_obj.resource_name} to reach {constants.STATUS_FAILEDOVER} phase"
    )
    drpc_obj.wait_for_phase(constants.STATUS_FAILEDOVER)
    config.switch_ctx(restore_index)
Example 24
def get_admin_key_from_provider():
    """
    Get admin key from rook-ceph-tools pod on provider

    Returns:
        str: The admin key obtained from rook-ceph-tools pod on provider.
            Return empty string if admin key is not obtained.

    """
    initial_cluster_index = config.cur_index
    config.switch_to_provider()
    admin_key = ""
    try:
        # Get the key from provider cluster rook-ceph-tools pod
        provider_tools_pod = get_ceph_tools_pod()
        admin_key = (provider_tools_pod.exec_cmd_on_pod(
            "grep key /etc/ceph/keyring").strip().split()[-1])
    except Exception as exc:
        logger.error(
            f"Couldn't find admin key from provider due to the error:\n{str(exc)}"
        )
    finally:
        config.switch_ctx(initial_cluster_index)
        return admin_key
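
Because the helper swallows errors and returns an empty string on failure, callers generally need to check the result. A minimal sketch:

admin_key = get_admin_key_from_provider()
assert admin_key, "Failed to obtain the admin key from the provider cluster"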
Example 25
def relocate(preferred_cluster, namespace):
    """
    Initiates Relocate action to the specified cluster

    Args:
        preferred_cluster (str): Cluster name to which the workload should be relocated
        namespace (str): Namespace where workload is running

    """
    restore_index = config.cur_index
    config.switch_acm_ctx()
    relocate_params = f'{{"spec":{{"action":"{constants.ACTION_RELOCATE}","preferredCluster":"{preferred_cluster}"}}}}'
    drpc_obj = DRPC(namespace=namespace)
    drpc_obj.wait_for_peer_ready_status()
    logger.info(f"Initiating Relocate action with preferredCluster:{preferred_cluster}")
    assert drpc_obj.patch(
        params=relocate_params, format_type="merge"
    ), f"Failed to patch {constants.DRPC}: {drpc_obj.resource_name}"

    logger.info(
        f"Wait for {constants.DRPC}: {drpc_obj.resource_name} to reach {constants.STATUS_RELOCATED} phase"
    )
    drpc_obj.wait_for_phase(constants.STATUS_RELOCATED)
    config.switch_ctx(restore_index)
Example 26
def wait_for_mirroring_status_ok(replaying_images=None, timeout=300):
    """
    Wait for mirroring status to reach health OK and expected number of replaying
    images for each of the ODF cluster

    Args:
        replaying_images (int): Expected number of images in replaying state
        timeout (int): time in seconds to wait for mirroring status reach OK

    Returns:
        bool: True if status contains expected health and states values

    Raises:
        TimeoutExpiredError: In case of unexpected mirroring status

    """
    restore_index = config.cur_index
    if not replaying_images:
        replaying_images = 0
        for cluster in get_non_acm_cluster_config():
            config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
            replaying_images += len(
                get_all_pvcs_in_storageclass(constants.CEPHBLOCKPOOL_SC)
            )
        replaying_images -= 2  # Ignore db-noobaa-db-pg-0 PVCs

    for cluster in get_non_acm_cluster_config():
        config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
        logger.info(
            f"Validating mirroring status on cluster {cluster.ENV_DATA['cluster_name']}"
        )
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_mirroring_status_ok,
            replaying_images=replaying_images,
        )
        if not sample.wait_for_func_status(result=True):
            error_msg = (
                "The mirroring status does not have expected values within the time"
                f" limit on cluster {cluster.ENV_DATA['cluster_name']}"
            )
            logger.error(error_msg)
            raise TimeoutExpiredError(error_msg)

    config.switch_ctx(restore_index)
    return True
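
A hedged end-to-end sketch chaining the namespace-based DR helpers shown above (Examples 20, 23 and 26); the workload namespace is hypothetical:

namespace = "busybox-workloads"  # hypothetical workload namespace

# Pick the current secondary cluster as the failover target.
secondary = get_current_secondary_cluster_name(namespace)
failover(failover_cluster=secondary, namespace=namespace)

# Confirm that mirroring on the managed clusters settles back to health OK.
wait_for_mirroring_status_ok(timeout=600)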
Example 27
    def test_pod_disruptions(self, create_pvcs_and_pods):
        """
        Test to perform pod disruption in consumer and provider cluster

        """
        # List of pods to be disrupted. Using different lists for consumer and provider for easier implementation
        pods_on_consumer = [
            "alertmanager_managed_ocs_alertmanager",
            "ocs_osd_controller_manager",
            "prometheus_managed_ocs_prometheus",
            "prometheus_operator",
            "ocs_operator",
        ]
        pods_on_provider = [
            "alertmanager_managed_ocs_alertmanager",
            "ocs_osd_controller_manager",
            "prometheus_managed_ocs_prometheus",
            "prometheus_operator",
            "ocs_provider_server",
            "ocs_operator",
        ]
        disruption_on_consumer = []
        disruption_on_provider = []

        # Start I/O
        log.info("Starting fio on all pods")
        for pod_obj in self.io_pods:
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK:
                storage_type = "block"
                direct = 1
            else:
                storage_type = "fs"
                direct = 0
            pod_obj.run_io(
                storage_type=storage_type,
                size="10G",
                fio_filename=f"{pod_obj.name}",
                runtime=320,
                end_fsync=1,
                direct=direct,
                invalidate=0,
                fio_installed=True,
            )

        consumer_index_iter = cycle(self.consumer_indexes)

        # Create Disruptions instance for each pod to be disrupted on consumer
        for pod_type in pods_on_consumer:
            consumer_index = next(consumer_index_iter)
            config.switch_ctx(consumer_index)
            disruption_obj = disruption_helpers.Disruptions()
            # Select each pod to be disrupted from different consumers
            disruption_obj.set_resource(resource=pod_type,
                                        cluster_index=consumer_index)
            disruption_obj.index_of_consumer = consumer_index
            disruption_on_consumer.append(disruption_obj)

        # Create Disruptions instance for each pod to be disrupted on provider
        config.switch_to_provider()
        for pod_type in pods_on_provider:
            disruption_obj = disruption_helpers.Disruptions()
            disruption_obj.set_resource(
                resource=pod_type, cluster_index=self.provider_cluster_index)
            disruption_on_provider.append(disruption_obj)

        # Delete pods on consumer one at a time
        log.info("Starting pod disruptions on consumer clusters")
        for disruptions_obj in disruption_on_consumer:
            disruptions_obj.delete_resource()
            # ocs-operator respin will trigger rook-ceph-tools pod respin.
            # Patch rook-ceph-tools pod to run ceph commands.
            if disruptions_obj.resource == "ocs_operator":
                config.switch_ctx(disruptions_obj.index_of_consumer)
                patch_consumer_toolbox()

        # Delete pods on provider one at a time
        log.info("Starting pod disruptions on provider cluster")
        for disruptions_obj in disruption_on_provider:
            disruptions_obj.delete_resource()

        log.info("Wait for IO to complete on pods")
        for pod_obj in self.io_pods:
            pod_obj.get_fio_results()
            log.info(f"Verified IO on pod {pod_obj.name}")
        log.info("IO is successful on all pods")

        # Performs different checks in the clusters
        for cluster_index in [self.provider_cluster_index
                              ] + self.consumer_indexes:
            config.switch_ctx(cluster_index)

            # Verify managedocs components are Ready
            log.info("Verifying managedocs components state")
            managedocs_obj = OCP(
                kind="managedocs",
                resource_name="managedocs",
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            for component in {"alertmanager", "prometheus", "storageCluster"}:
                assert (
                    managedocs_obj.get()["status"]["components"][component]
                    ["state"] == "Ready"
                ), f"{component} status is {managedocs_obj.get()['status']['components'][component]['state']}"

            # Verify storagecluster status
            log.info("Verifying storagecluster status")
            verify_storage_cluster()

            # Verify CSV status
            for managed_csv in {
                    constants.OCS_CSV_PREFIX,
                    constants.OSD_DEPLOYER,
                    constants.OSE_PROMETHEUS_OPERATOR,
            }:
                csvs = csv.get_csvs_start_with_prefix(
                    managed_csv, constants.OPENSHIFT_STORAGE_NAMESPACE)
                assert (
                    len(csvs) == 1
                ), f"Unexpected number of CSVs with {managed_csv} prefix: {len(csvs)}"
                csv_name = csvs[0]["metadata"]["name"]
                csv_obj = csv.CSV(
                    resource_name=csv_name,
                    namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
                )
                log.info(f"Check if {csv_name} is in Succeeded phase.")
                csv_obj.wait_for_phase(phase="Succeeded", timeout=600)

            # Verify the phase of ceph cluster
            log.info("Verify the phase of ceph cluster")
            cephcluster = OCP(kind="CephCluster",
                              namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
            cephcluster_yaml = cephcluster.get().get("items")[0]
            expected_phase = "Connected"
            if cluster_index == self.provider_cluster_index:
                expected_phase = "Ready"
            assert (
                cephcluster_yaml["status"]["phase"] == expected_phase
            ), f"Status of cephcluster {cephcluster_yaml['metadata']['name']} is {cephcluster_yaml['status']['phase']}"

        # Create PVC and pods on all consumer clusters
        log.info("Creating new PVCs and pods")
        pods = list()
        for cluster_index in self.consumer_indexes:
            config.switch_ctx(cluster_index)
            consumer_cluster_kubeconfig = os.path.join(
                config.clusters[cluster_index].ENV_DATA["cluster_path"],
                config.clusters[cluster_index].RUN.get("kubeconfig_location"),
            )
            pvcs, io_pods = create_pvcs_and_pods(
                pvc_size=self.pvc_size,
                replica_count=1,
                pod_dict_path=constants.PERF_POD_YAML,
            )
            for pvc_obj in pvcs:
                pvc_obj.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
            for io_pod in io_pods:
                io_pod.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
            pvcs[0].project.cluster_kubeconfig = consumer_cluster_kubeconfig
            pods.extend(io_pods)

        # Run I/O on new pods
        log.info("Running I/O on new pods")
        for pod_obj in pods:
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK:
                storage_type = "block"
                direct = 1
            else:
                storage_type = "fs"
                direct = 0
            pod_obj.run_io(
                storage_type=storage_type,
                size="10G",
                fio_filename=f"{pod_obj.name}",
                runtime=320,
                end_fsync=1,
                direct=direct,
                invalidate=0,
                fio_installed=True,
            )

        log.info("Wait for I/O to complete on new pods")
        for pod_obj in pods:
            pod_obj.get_fio_results()
            log.info(f"Verified IO on the new pod {pod_obj.name}")
        log.info("IO is successful on new pods")
Example 28
    def test_resource_deletion_during_pvc_clone(self, pvc_clone_factory,
                                                pod_factory):
        """
        Verify PVC clone will succeed if rook-ceph and csi pods are re-spun
        while creating the clone

        """
        pods_to_delete = [
            "rbdplugin_provisioner",
            "cephfsplugin_provisioner",
            "cephfsplugin",
            "rbdplugin",
            "osd",
            "mgr",
        ]
        executor = ThreadPoolExecutor(max_workers=len(self.pvcs) +
                                      len(pods_to_delete))
        disruption_ops = [
            disruption_helpers.Disruptions() for _ in pods_to_delete
        ]
        file_name = "file_clone"

        # Run IO
        log.info("Running fio on all pods to create a file")
        for pod_obj in self.pods:
            storage_type = ("block" if
                            (pod_obj.pvc.volume_mode
                             == constants.VOLUME_MODE_BLOCK) else "fs")
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=30,
                fio_filename=file_name,
                end_fsync=1,
            )

        log.info("Wait for IO to complete on pods")
        for pod_obj in self.pods:
            pod_obj.get_fio_results()
            log.info(f"Verified IO on pod {pod_obj.name}")
            # Calculate md5sum
            file_name_pod = (file_name if
                             (pod_obj.pvc.volume_mode
                              == constants.VOLUME_MODE_FILESYSTEM) else
                             pod_obj.get_storage_path(storage_type="block"))
            pod_obj.pvc.md5sum = cal_md5sum(
                pod_obj,
                file_name_pod,
                pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
            )
            log.info(f"md5sum obtained from pod {pod_obj.name}")
        log.info("IO is successful on all pods")

        # Select the pods to be deleted
        for disruption, pod_type in zip(disruption_ops, pods_to_delete):
            cluster_index = None
            # 'provider_index' will not be None if the platform is Managed Services
            if self.provider_index is not None:
                if pod_type in ["osd", "mgr"]:
                    cluster_index = self.provider_index
                    config.switch_to_provider()
                else:
                    cluster_index = self.consumer_index
                    config.switch_ctx(cluster_index)

            disruption.set_resource(resource=pod_type,
                                    cluster_index=cluster_index)

        # Switch cluster context if the platform is MS. 'provider_index' will not be None if platform is MS.
        if self.provider_index is not None:
            config.switch_ctx(self.consumer_index)

        # Clone PVCs
        log.info("Start creating clone of PVCs")
        for pvc_obj in self.pvcs:
            log.info(f"Creating clone of PVC {pvc_obj.name}")
            pvc_obj.clone_proc = executor.submit(
                pvc_clone_factory,
                pvc_obj=pvc_obj,
                status="",
                access_mode=pvc_obj.get_pvc_access_mode,
                volume_mode=pvc_obj.volume_mode,
            )
        log.info("Started creating clone")

        # Delete the pods 'pods_to_delete'
        log.info(f"Deleting pods {pods_to_delete}")
        for disruption in disruption_ops:
            disruption.delete_proc = executor.submit(
                disruption.delete_resource)

        # Wait for delete and recovery
        [disruption.delete_proc.result() for disruption in disruption_ops]

        # Get cloned PVCs
        clone_pvc_objs = []
        for pvc_obj in self.pvcs:
            clone_obj = pvc_obj.clone_proc.result()
            clone_pvc_objs.append(clone_obj)
            log.info(f"Created clone {clone_obj.name} of PVC {pvc_obj.name}")
        log.info("Created clone of all PVCs")

        # Confirm that the cloned PVCs are Bound
        log.info("Verifying the cloned PVCs are Bound")
        for pvc_obj in clone_pvc_objs:
            wait_for_resource_state(resource=pvc_obj,
                                    state=constants.STATUS_BOUND,
                                    timeout=300)
            pvc_obj.reload()
            pvc_obj.volume_mode = pvc_obj.data["spec"]["volumeMode"]
        log.info("Verified: Cloned PVCs are Bound.")

        clone_pod_objs = []

        # Attach the cloned PVCs to pods
        log.info("Attach the cloned PVCs to pods")
        for pvc_obj in clone_pvc_objs:
            if pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK:
                pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
            else:
                pod_dict_path = ""
            restore_pod_obj = pod_factory(
                interface=pvc_obj.interface,
                pvc=pvc_obj,
                status="",
                pod_dict_path=pod_dict_path,
                raw_block_pv=pvc_obj.volume_mode ==
                constants.VOLUME_MODE_BLOCK,
            )
            clone_pod_objs.append(restore_pod_obj)

        # Verify the new pods are running
        log.info("Verify the new pods are running")
        for pod_obj in clone_pod_objs:
            wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        log.info("Verified: New pods are running")

        # Verify md5sum
        log.info("Verify md5sum")
        for pod_obj in clone_pod_objs:
            file_name_pod = (file_name if
                             (pod_obj.pvc.volume_mode
                              == constants.VOLUME_MODE_FILESYSTEM) else
                             pod_obj.get_storage_path(storage_type="block"))
            verify_data_integrity(
                pod_obj,
                file_name_pod,
                pod_obj.pvc.parent.md5sum,
                pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
            )
            log.info(
                f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
                f"matches with the original md5sum")
        log.info("Data integrity check passed on all pods")

        # Run IO
        log.info("Running IO on new pods")
        for pod_obj in clone_pod_objs:
            storage_type = ("block" if
                            (pod_obj.pvc.volume_mode
                             == constants.VOLUME_MODE_BLOCK) else "fs")
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=20,
                fio_filename=file_name,
                end_fsync=1,
            )

        log.info("Wait for IO to complete on new pods")
        for pod_obj in clone_pod_objs:
            pod_obj.get_fio_results()
            log.info(f"Verified IO on new pod {pod_obj.name}")
        log.info("IO to completed on new pods")
Example 29
    def finalizer():
        # Switching to provider cluster context will be done during the test case.
        # Switch back to consumer cluster context after the test case.
        config.switch_ctx(initial_cluster_index)
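
For context, this kind of finalizer is normally registered from a fixture via request.addfinalizer, as in Example 18. A minimal sketch with an assumed fixture shape:

import pytest

@pytest.fixture(autouse=True)
def restore_cluster_context(request):
    """Hypothetical fixture: always switch back to the initial cluster context."""
    initial_cluster_index = config.cur_index

    def finalizer():
        # Switch back to the initial cluster context after the test case.
        config.switch_ctx(initial_cluster_index)

    request.addfinalizer(finalizer)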
Example 30
    def create_cluster_prereq(self, timeout=600):
        """
        Perform all prereqs before vsphere cluster creation from ACM

        Args:
            timeout (int): Timeout for any UI operations

        """
        # Create vsphere credentials
        # Click on 'Add credential' in 'Infrastructure provider' page
        self.navigate_create_clusters_page()
        self.refresh_page()
        hard_timeout = config.ENV_DATA.get("acm_ui_hard_deadline", 1200)
        remaining = hard_timeout
        while True:
            ret = self.check_element_presence(
                (By.XPATH,
                 self.acm_page_nav[PLATFORM_XPATH_MAP[self.platform]][0]),
                timeout=300,
            )
            if ret:
                log.info("Found platform icon")
                break
            else:
                if remaining < 0:
                    raise TimeoutException(
                        "Timedout while waiting for platform icon")
                else:
                    remaining -= timeout
                    self.navigate_create_clusters_page()
                    self.refresh_page()

        self.do_click(
            locator=self.acm_page_nav[PLATFORM_XPATH_MAP[self.platform]],
            timeout=100)

        # "Basic vsphere credential info"
        # 1. credential name
        # 2. Namespace
        # 3. Base DNS domain
        self.do_click(locator=self.acm_page_nav["cc_provider_credentials"],
                      timeout=100)
        parent_tab = self.driver.current_window_handle
        tabs = self.driver.window_handles
        self.driver.switch_to.window(tabs[1])
        self.do_click(locator=self.acm_page_nav["cc_provider_creds_vsphere"])

        basic_cred_dict = {
            self.acm_page_nav["cc_provider_creds_vsphere_cred_name"]:
            self.platform_credential_name,
            self.acm_page_nav["cc_provider_creds_vsphere_base_dns"]:
            f"{self.cluster_conf.ENV_DATA['base_domain']}",
        }
        self.fill_multiple_textbox(basic_cred_dict)
        # Credential Namespace is not a text box but a dropdown
        self.do_click(
            self.acm_page_nav["cc_provider_creds_vsphere_cred_namespace"])
        self.do_click(self.acm_page_nav["cc_provider_creds_default_namespace"])

        # click on 'Next' button at the bottom
        self.click_next_button()

        # Detailed VMWare credentials section
        # 1. vCenter server
        # 2. vCenter username
        # 3. vCenter password
        # 4. vCenter root CA certificate
        # 5. vSphere cluster name
        # 6. vSphere datacenter
        # 7. vSphere default Datastore
        with open(VSPHERE_CA_FILE_PATH, "r") as fp:
            vsphere_ca = fp.read()
        vsphere_creds_dict = {
            self.acm_page_nav["cc_provider_creds_vsphere_vcenter_server"]:
            f"{self.cluster_conf.ENV_DATA['vsphere_server']}",
            self.acm_page_nav["cc_provider_creds_vsphere_username"]:
            f"{self.cluster_conf.ENV_DATA['vsphere_user']}",
            self.acm_page_nav["cc_provider_creds_vsphere_password"]:
            f"{self.cluster_conf.ENV_DATA['vsphere_password']}",
            self.acm_page_nav["cc_provider_creds_vsphere_rootca"]:
            f"{vsphere_ca}",
            self.acm_page_nav["cc_provider_creds_vsphere_clustername"]:
            f"{self.cluster_conf.ENV_DATA['vsphere_cluster']}",
            self.acm_page_nav["cc_provider_creds_vsphere_dc"]:
            f"{self.cluster_conf.ENV_DATA['vsphere_datacenter']}",
            self.acm_page_nav["cc_provider_creds_vsphere_datastore"]:
            f"{self.cluster_conf.ENV_DATA['vsphere_datastore']}",
        }
        self.fill_multiple_textbox(vsphere_creds_dict)
        self.click_next_button()

        # Pull Secret and SSH
        # 1. Pull secret
        # 2. SSH Private key
        # 3. SSH Public key
        with open(os.path.join(DATA_DIR, "pull-secret"), "r") as fp:
            pull_secret = fp.read()
        ssh_pub_key_path = os.path.expanduser(
            self.cluster_conf.DEPLOYMENT["ssh_key"])
        ssh_priv_key_path = os.path.expanduser(
            self.cluster_conf.DEPLOYMENT["ssh_key_private"])

        with open(ssh_pub_key_path, "r") as fp:
            ssh_pub_key = fp.read()

        with open(ssh_priv_key_path, "r") as fp:
            ssh_priv_key = fp.read()

        pull_secret_and_ssh = {
            self.acm_page_nav["cc_provider_creds_vsphere_pullsecret"]:
            f"{pull_secret}",
            self.acm_page_nav["cc_provider_creds_vsphere_ssh_privkey"]:
            f"{ssh_priv_key}",
            self.acm_page_nav["cc_provider_creds_vsphere_ssh_pubkey"]:
            f"{ssh_pub_key}",
        }
        self.fill_multiple_textbox(pull_secret_and_ssh)
        self.click_next_button()
        self.do_click(
            locator=self.acm_page_nav["cc_provider_creds_vsphere_add_button"])
        # Go to credentials tab
        self.do_click(locator=self.acm_page_nav["Credentials"])
        credential_table_entry = format_locator(
            self.acm_page_nav["cc_table_entry"], self.platform_credential_name)
        if not self.check_element_presence(
            (By.XPATH, credential_table_entry[0]), timeout=20):
            raise ACMClusterDeployException(
                "Could not create credentials for vsphere")
        else:
            log.info(
                f"vsphere credential successfully created {self.platform_credential_name}"
            )
        # Get the ips in prereq itself
        from ocs_ci.deployment import vmware

        # Switch context to cluster which we are about to create
        prev_ctx = config.cur_index
        config.switch_ctx(self.cluster_conf.MULTICLUSTER["multicluster_index"])
        self.ips = vmware.assign_ips(self.nvips)
        vmware.create_dns_records(self.ips)
        config.switch_ctx(prev_ctx)
        self.driver.close()
        self.driver.switch_to.window(parent_tab)
        self.driver.switch_to.default_content()