Example #1
    def start_baremetal_machines_with_ipmi_ctx(self, ipmi_ctxs, wait=True):
        """
        Start Baremetal Machines using IPMI contexts

        Args:
            ipmi_ctxs (list): List of BM ipmi_ctx objects
            wait (bool): Wait for BMs to start

        """
        for ipmi_ctx in ipmi_ctxs:
            ipmi_ctx.chassis_control_power_up()

        if wait:
            for ipmi_ctx in ipmi_ctxs:
                for status in TimeoutSampler(600, 5, self.get_power_status, ipmi_ctx):
                    logger.info(
                        f"Waiting for Baremetal Machine to power on. "
                        f"Current Baremetal status: {status}"
                    )
                    if status == VM_POWERED_ON:
                        logger.info("Baremetal Machine reached poweredOn status")
                        break

        wait_for_cluster_connectivity(tries=400)
        wait_for_nodes_status(
            node_names=get_master_nodes(), status=constants.NODE_READY, timeout=800
        )
        wait_for_nodes_status(
            node_names=get_worker_nodes(), status=constants.NODE_READY, timeout=800
        )
Example #2
    def start_powernodes_machines(self,
                                  powernode_machines,
                                  timeout=900,
                                  wait=True,
                                  force=True):
        """
        Start PowerNode Machines

        Args:
            powernode_machines (list): List of PowerNode machines
            timeout (int): time in seconds to wait for nodes to reach the 'ready' state.
            wait (bool): Wait for PowerNodes to start - for future use
            force (bool): True for PowerNode ungraceful power off, False for
                graceful PowerNode shutdown - for future use
        """
        ocpversion = get_ocp_version("-")
        for node in powernode_machines:
            result = exec_cmd(
                f"sudo virsh start test-ocp{ocpversion}-{node.name}")
            logger.info(f"Result of shutdown {result}")

        wait_for_cluster_connectivity(tries=900)
        wait_for_nodes_status(node_names=get_master_nodes(),
                              status=constants.NODE_READY,
                              timeout=timeout)
        wait_for_nodes_status(node_names=get_worker_nodes(),
                              status=constants.NODE_READY,
                              timeout=timeout)
Example #3
    def health_check(self, cluster_check=True, tries=20):
        """
        Perform Ceph and cluster health checks

        Args:
            cluster_check (bool): Run the additional Ceph cluster health check
            tries (int): Number of retries for the Ceph health check

        """
        wait_for_cluster_connectivity(tries=400)
        logger.info("Checking cluster and Ceph health")
        node.wait_for_nodes_status(timeout=300)

        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=tries)
        if cluster_check:
            self.ceph_cluster.cluster_health_check(timeout=60)
Example #4
    def restart(self, node, timeout):
        """
        Restart the kubelet service using the parent service class, then ensure the
        corresponding OCP node is reachable and returns to the Ready state.

        Args:
            node (object): Node object
            timeout (int): time in seconds to wait for the service to restart and
                the node to reach the Ready state.

        """
        super().restart(node, timeout)
        wait_for_cluster_connectivity(tries=900)
        wait_for_nodes_status(node_names=[node.name],
                              status=constants.NODE_READY,
                              timeout=timeout)
Example #5
    def start_baremetal_machines(self, baremetal_machine, wait=True):
        """
        Start Baremetal Machines

        Args:
            baremetal_machine (list): BM objects
            wait (bool): Wait for BMs to start

        """
        for node in baremetal_machine:
            if self.mgmt_details[node.name]:
                ipmi_ctx = self.get_ipmi_ctx(
                    host=self.mgmt_details[node.name]["mgmt_console"],
                    user=self.mgmt_details[node.name]["mgmt_username"],
                    password=self.mgmt_details[node.name]["mgmt_password"],
                )
                logger.info(f"Powering On {node.name}")
                ipmi_ctx.chassis_control_power_up()
            if wait:
                if self.mgmt_details[node.name]:
                    ipmi_ctx = self.get_ipmi_ctx(
                        host=self.mgmt_details[node.name]["mgmt_console"],
                        user=self.mgmt_details[node.name]["mgmt_username"],
                        password=self.mgmt_details[node.name]["mgmt_password"],
                    )
                    for status in TimeoutSampler(
                        600, 5, self.get_power_status, ipmi_ctx
                    ):
                        logger.info(
                            f"Waiting for Baremetal Machine {node.name} to power on. "
                            f"Current Baremetal status: {status}"
                        )
                        if status == VM_POWERED_ON:
                            logger.info(
                                f"Baremetal Machine {node.name} reached poweredOn status"
                            )
                            ipmi_ctx.session.close()
                            break

        wait_for_cluster_connectivity(tries=400)
        wait_for_nodes_status(
            node_names=get_master_nodes(), status=constants.NODE_READY, timeout=800
        )
        wait_for_nodes_status(
            node_names=get_worker_nodes(), status=constants.NODE_READY, timeout=800
        )
Example #6
def wait_for_nodes_status_and_prometheus_health_check(pods):
    """
    Waits for all the nodes to reach the Ready state
    and also checks Prometheus health

    """

    # Validate all nodes are in READY state
    ocp.wait_for_cluster_connectivity(tries=400)
    wait_for_nodes_status(timeout=1800)

    # Check for the created pvc metrics after rebooting the master nodes
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
        )

    assert prometheus_health_check(), "Prometheus health is degraded"
Example #7
    def test_run_couchbase_node_reboot(self, cb_setup, nodes,
                                       pod_name_of_node):
        """
        Test couchbase workload with node reboot
        """
        # Check worker and master node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)
        get_node_resource_utilization_from_adm_top(node_type="master",
                                                   print_table=True)

        if pod_name_of_node == "couchbase":
            node_list = self.cb.get_couchbase_nodes()
        elif pod_name_of_node == "osd":
            node_list = get_osd_running_nodes()
        elif pod_name_of_node == "master":
            master_node = get_nodes(pod_name_of_node, num_of_nodes=1)

        # Restart relevant node
        if pod_name_of_node == "master":
            nodes.restart_nodes(master_node, wait=False)
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)
        else:
            restart_node = get_node_objs(node_list[random.randint(
                0,
                len(node_list) - 1)])
            nodes.restart_nodes(restart_node)

        # Validate all nodes and services are in READY state and up

        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)
        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        retry(CommandFailed, tries=60,
              delay=15)(bg_handler.wait_for_bg_operations)(bg_ops, timeout=3600)
        self.sanity_helpers.health_check(tries=40)
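A note on the retry(...) wrappers used here and in the following examples: retry(exceptions, tries, delay) returns a decorator, so it must be applied to the bare callable first, with the callable's own arguments supplied afterwards; wrapping an already-evaluated call would run the function once, outside any retry loop. Below is a minimal fixed-delay sketch of such a decorator factory under that assumption; it only illustrates the call order and is not the ocs-ci helper.

import functools
import time


def retry(exceptions, tries=3, delay=1):
    """Return a decorator that retries the wrapped callable on `exceptions`."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            remaining = tries
            while True:
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    remaining -= 1
                    if remaining <= 0:
                        raise
                    time.sleep(delay)
        return wrapper
    return decorator


# Correct call order, as in the examples above: decorate the callable first,
# then pass the callable's own arguments to the wrapped function, e.g.
#     retry((TimeoutError,), tries=60, delay=15)(wait_for_nodes_status)(timeout=1800)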
Example #8
    def test_amq_after_rebooting_node(self, node_type, nodes, amq_setup):
        """
        Test case to validate that rebooting a node shouldn't affect
        AMQ workloads running in the background

        """
        # Get all amq pods
        pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE)

        # Get the node list
        node = get_nodes(node_type, num_of_nodes=1)

        # Reboot one node of the requested type
        nodes.restart_nodes(node, wait=False)

        # Wait some time after rebooting the node
        waiting_time = 40
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate all nodes and services are in READY state and up
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check()

        # Check all amq pods are up and running
        assert POD.wait_for_resource(condition="Running",
                                     resource_count=len(pod_obj_list),
                                     timeout=300)

        # Validate the results
        log.info("Validate message run completely")
        for thread in self.threads:
            thread.result(timeout=1800)
Example #9
    def test_replication_with_disruptions(
        self,
        awscli_pod_session,
        mcg_obj_session,
        cld_mgr,
        bucket_factory,
        source_bucketclass,
        target_bucketclass,
        test_directory_setup,
        nodes,
    ):

        # check uni-directional bucket replication from a multi (aws+azure) namespace bucket to an s3-compatible namespace bucket
        target_bucket_name = bucket_factory(
            bucketclass=target_bucketclass)[0].name
        replication_policy = ("basic-replication-rule", target_bucket_name,
                              None)
        source_bucket_name = bucket_factory(
            bucketclass=source_bucketclass,
            replication_policy=replication_policy)[0].name
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            source_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=5,
            pattern="first-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Uni-directional bucket replication working as expected")

        # change from uni-directional to bi-directional replication policy
        logger.info(
            "Changing the replication policy from uni to bi-directional!")
        bi_replication_policy_dict = {
            "spec": {
                "additionalConfig": {
                    "replicationPolicy":
                    json.dumps([{
                        "rule_id": "basic-replication-rule-2",
                        "destination_bucket": source_bucket_name,
                    }])
                }
            }
        }
        OCP(
            namespace=config.ENV_DATA["cluster_namespace"],
            kind="obc",
            resource_name=target_bucket_name,
        ).patch(params=json.dumps(bi_replication_policy_dict),
                format_type="merge")
        logger.info(
            "Patch ran successfully! Changed the replication policy from uni to bi directional"
        )

        # write objects to the second bucket and see if it's replicated on the other
        logger.info("checking if bi-directional replication works!!")
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=3,
            pattern="second-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")
        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Bi directional bucket replication working as expected")

        # delete all the objects in the s3-compatible namespace bucket and then
        # recover them from the other namespace bucket on the next write
        logger.info(
            "checking replication when one of the bucket's objects are deleted!!"
        )
        try:
            mcg_obj_session.s3_resource.Bucket(
                target_bucket_name).objects.all().delete()
        except CommandFailed as e:
            logger.error(f"[Error] while deleting objects: {e}")
        assert (
            len(mcg_obj_session.s3_list_all_objects_in_bucket(target_bucket_name)) == 0
        ), f"[Error] Objects were unexpectedly not deleted from {target_bucket_name}"
        logger.info("All the objects in RGW namespace buckets are deleted!!!")

        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=1,
            pattern="third-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info(
            "All the objects retrieved back to s3-compatible bucket on new write!!"
        )

        # restart RGW pods and then see if object sync still works
        logger.info(
            "Checking if the replication works when there is RGW pod restarts!!"
        )
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=1,
            pattern="fourth-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        pod_names = get_pod_name_by_pattern(
            "rgw", namespace=config.ENV_DATA["cluster_namespace"])
        pod_objs = get_rgw_pods(namespace=config.ENV_DATA["cluster_namespace"])
        delete_pods(pod_objs=pod_objs)
        wait_for_pods_to_be_running(
            pod_names=pod_names,
            namespace=config.ENV_DATA["cluster_namespace"])

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Object sync works after the RGW pod restarted!!")

        # write some objects to one of the buckets, followed by an immediate cluster restart
        logger.info("Checking replication when there is a cluster reboot!!")
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=1,
            pattern="fifth-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        node_list = get_worker_nodes()
        node_objs = get_node_objs(node_list)
        nodes.restart_nodes(node_objs, timeout=500)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        wait_for_pods_to_be_running(
            namespace=config.ENV_DATA["cluster_namespace"], timeout=800)
        logger.info("Nodes rebooted successfully!!")

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Objects sync works even when the cluster is rebooted")