Example #1
    def test_rolling_reboot_node(self, node_type):
        """
        Test rolling reboot of nodes
        """

        # Get info from SCALE_DATA_FILE for validation
        if os.path.exists(SCALE_DATA_FILE):
            file_data = templating.load_yaml(SCALE_DATA_FILE)
            namespace = file_data.get("NAMESPACE")
            pod_scale_list = file_data.get("POD_SCALE_LIST")
            pvc_scale_list = file_data.get("PVC_SCALE_LIST")
        else:
            raise FileNotFoundError(f"{SCALE_DATA_FILE} does not exist")

        node_list = list()

        # Rolling reboot nodes
        if node_type == constants.WORKER_MACHINE:
            tmp_list = get_nodes(node_type=node_type)
            ocs_node_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
            for tmp in tmp_list:
                if tmp.name in ocs_node_list:
                    node_list.append(tmp)
        else:
            node_list = get_nodes(node_type=node_type)

        factory = platform_nodes.PlatformNodesFactory()
        nodes = factory.get_nodes_platform()

        for node in node_list:
            nodes.restart_nodes(nodes=[node])
            scale_lib.validate_node_and_oc_services_are_up_after_reboot()

        # Validate storage pods are running
        wait_for_storage_pods()

        # Validate cluster health ok and all pods are running
        assert utils.ceph_health_check(
            delay=180
        ), "Ceph health in bad state after node reboots"

        # Validate all PVCs from namespace are in Bound state
        assert scale_lib.validate_all_pvcs_and_check_state(
            namespace=namespace, pvc_scale_list=pvc_scale_list
        )

        # Validate all PODs from namespace are up and running
        assert scale_lib.validate_all_pods_and_check_state(
            namespace=namespace, pod_scale_list=pod_scale_list
        )
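
For reference, the test above reads only three keys from SCALE_DATA_FILE; a minimal sketch of such a file, with placeholder values, could be produced like this:

import yaml

# Hypothetical minimal contents of SCALE_DATA_FILE; the real file is written
# by the scale setup and the values below are placeholders.
scale_data = {
    "NAMESPACE": "scale-namespace",
    "POD_SCALE_LIST": ["scale-pod-1", "scale-pod-2"],
    "PVC_SCALE_LIST": ["scale-pvc-1", "scale-pvc-2"],
}

with open("scale_data_file.yaml", "w") as scale_file:
    yaml.safe_dump(scale_data, scale_file)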
Example #2
    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            kwargs (dict):
                Following kwargs are valid
                repo: Ripsaw repo to use - a GitHub link
                branch: branch to use from the repo
                namespace: namespace for the operator

        Example Usage:
            r1 = RipSaw()
            r1.apply_crd(crd='ripsaw_v1alpha1_ripsaw_crd.yaml')
            # use oc apply to apply custom modified bench
            my_custom_bench = my_custom_bench.yaml
            run_cmd('oc apply -f my_custom_bench')
        """
        self.args = kwargs
        self.repo = self.args.get(
            "repo", "https://github.com/cloud-bulldozer/benchmark-operator")
        self.branch = self.args.get("branch", "master")
        self.namespace = self.args.get("namespace", RIPSAW_NAMESPACE)
        self.pgsql_is_setup = False
        self.ocp = OCP()
        self.ns_obj = OCP(kind="namespace")
        self.pod_obj = OCP(namespace=RIPSAW_NAMESPACE, kind="pod")
        self._create_namespace()
        self._clone_ripsaw()
        self.worker_nodes = [node.name for node in get_nodes()]
        helpers.label_worker_node(self.worker_nodes,
                                  label_key="kernel-cache-dropper",
                                  label_value="yes")
Example #3
    def test_osd_balance(self, es):
        """
        Current pattern is:
            add 6 osds (9 total, 3 nodes)
            add 3 nodes
            add 9 osds (18 total, 6 nodes)
            add 3 nodes
            add 9 osds (27 total, 9 nodes)
        """
        crd_data = templating.load_yaml(constants.OSD_SCALE_BENCHMARK_YAML)
        our_uuid = uuid4().hex
        self.elastic_info = ElasticData(our_uuid, crd_data)
        self.elastic_info.es_connect()
        collect_stats(INITIAL_SETUP, self.elastic_info)
        for cntr in range(0, MAX_TIMES_ADDED):
            num_nodes = len(get_nodes(constants.WORKER_MACHINE))
            osd_incr = 3
            if cntr == 0 and num_nodes == START_NODE_NUM:
                osd_incr = 2
            if osd_incr == 3:
                scale_ocs_node()
                collect_stats("Three nodes have been added", self.elastic_info)
            cntval = 3 * osd_incr
            logging.info(f"Adding {cntval} osds to nodes")
            scale_capacity_with_deviceset(add_deviceset_count=osd_incr,
                                          timeout=900)
            collect_stats("OSD capacity increase", self.elastic_info)
        collect_stats(FINAL_REPORT, self.elastic_info)
Example #4
    def test_nodes_restart(self, nodes, node_type):
        """
        Test nodes restart (from the platform layer)

        """
        node_count = len(get_nodes(node_type=node_type))
        if node_type == constants.WORKER_MACHINE:
            ocp_nodes = get_nodes(node_type=node_type)
        else:
            ocp_nodes = get_nodes(node_type=node_type, num_of_nodes=2)

        nodes.restart_nodes(nodes=ocp_nodes, wait=False)
        wait_for_node_count_to_reach_status(node_count=node_count,
                                            node_type=node_type)
        self.sanity_helpers.health_check()
        self.create_resources()
Example #5
def initialize_data():
    """
    Initialize the data dictionary with cluster data

    Returns:
        dict: A dictionary contains the data to push to the dashboard
    """

    # worker type is relevant only for cloud instances.
    log.info("Initializing the dashboard data")
    worker_lbl = get_nodes(num_of_nodes=1)[0].data["metadata"]["labels"]
    if "beta.kubernetes.io/instance-type" in worker_lbl:
        worker_type = worker_lbl["beta.kubernetes.io/instance-type"]
    else:
        # TODO: Maybe for None cloud we can add the Arch ?
        #   worker_type = worker_lbl['kubernetes.io/arch']
        worker_type = ""
    log.info(f"The worker type is {worker_type}")

    (ocs_ver_info, _) = get_ocs_version()
    ocs_ver_full = ocs_ver_info["status"]["desired"]["version"]
    m = re.match(r"(\d.\d).(\d)", ocs_ver_full)
    if m and m.group(1) is not None:
        ocs_ver = m.group(1)
    log.info(f"ocs_ver is {ocs_ver_full}")
    platform = config.ENV_DATA["platform"].upper()
    if platform.lower() not in ["vsphere", "baremetal"]:
        platform = f"{platform.upper()} {worker_type}"
    data_template["commitid"] = ocs_ver_full
    data_template["project"] = f"OCS{ocs_ver}"
    data_template["branch"] = ocs_ver_info["spec"]["channel"]
    data_template["executable"] = ocs_ver
    data_template["environment"] = platform

    return data_template
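
The OCS version parsing above relies on a simple regex; a standalone sketch of what it extracts:

import re

# Same pattern as above; "4.6.0" is a made-up full version string.
m = re.match(r"(\d.\d).(\d)", "4.6.0")
if m and m.group(1) is not None:
    print(m.group(1))   # prints "4.6"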
Example #6
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type
        nodes = [
            get_nodes(node_type=node_type, num_of_nodes=1)[0]
            for node_type in ["worker", "master"]
        ]
        assert nodes, "Failed to find a nodes for the test"

        node_names = [typed_node.name for typed_node in nodes]

        # Maintenance the nodes (unschedule and drain)
        drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #7
    def get_node_name_where_jenkins_pod_not_hosted(
        self, node_type=constants.WORKER_MACHINE, num_of_nodes=1
    ):
        """
        Get the names of nodes that do not host a Jenkins pod

        Args:
            node_type (str): The node type  (e.g. worker, master)
            num_of_nodes (int): The number of nodes to be returned

        Returns:
            list: List of compute node names
        """
        if node_type == constants.MASTER_MACHINE:
            nodes_drain = [
                node.name for node in get_nodes(node_type=node_type,
                                                num_of_nodes=num_of_nodes)
            ]
        elif node_type == constants.WORKER_MACHINE:
            pod_objs = []
            for project in self.projects:
                pod_names = get_pod_name_by_pattern(pattern="jenkins",
                                                    namespace=project)
                pod_obj = [
                    get_pod_obj(name=pod_name, namespace=project)
                    for pod_name in pod_names
                ]
                pod_objs += pod_obj
            nodes_app_name = set(get_app_pod_running_nodes(pod_objs))
            nodes_worker_name = set(get_worker_nodes())
            nodes_drain = nodes_worker_name - nodes_app_name
        else:
            raise ValueError("The node type is worker or master")
        return list(nodes_drain)[:num_of_nodes]
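
The worker branch above reduces to a set difference; a standalone sketch with made-up node names:

# Workers not hosting a Jenkins pod are the drain candidates.
nodes_worker_name = {"worker-0", "worker-1", "worker-2"}
nodes_app_name = {"worker-1"}            # nodes running Jenkins pods
nodes_drain = nodes_worker_name - nodes_app_name
print(sorted(nodes_drain)[:1])           # e.g. ['worker-0']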
Example #8
    def test_pv_after_reboot_node(self, nodes):
        """
        Verify unexpected PV is not created after node reboot on LSO cluster

        """
        pv_before_reset = get_pv_names()
        worker_nodes = get_nodes(node_type=constants.WORKER_MACHINE,
                                 num_of_nodes=3)
        ocp_obj = OCP(kind=constants.PV)
        for worker_node in worker_nodes:
            # Restart one worker node
            nodes.restart_nodes(nodes=[worker_node], wait=True)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
            logger.info(f"Verify PV after reboot {worker_node}")
            pv_after_reset = get_pv_names()
            pv_diff = set(pv_after_reset) - set(pv_before_reset)
            pv_new = []
            for pv in pv_diff:
                pv_obj = ocp_obj.get(resource_name=pv)
                if pv_obj["spec"]["storageClassName"] == "localblock":
                    pv_new.append(pv)
            assert (
                not pv_new
            ), f"Unexpected PV {pv_new} created after reboot {worker_node}"
        logger.info("SUCCESS - No new PV was created.")
Example #9
    def get_node_by_attached_volume(self, volume):
        """
        Get the node by attached volume on IBM Cloud.

        Args:
            volume (str): volume id.

        Raises:
            NodeHasNoAttachedVolume: In case the volume is not attached to node

        Returns:
            str: worker id

        """
        cmd = f"ibmcloud is volume {volume} --output json"
        out = run_ibmcloud_cmd(cmd)
        out = json.loads(out)

        if not out["volume_attachments"]:
            logger.info("volume is not attached to node")
            raise NodeHasNoAttachedVolume("volume not attached to node")
        else:
            worker_id = out["volume_attachments"][0]["instance"]["name"]
            logger.info(f"volume is  attached to node: {worker_id}")
            worker_nodes = get_nodes(node_type="worker")
            for worker_node in worker_nodes:
                logger.info(
                    f"worker node id is:{worker_node.get()['metadata']['labels']['ibm-cloud.kubernetes.io/worker-id']}"
                )
                if (worker_node.get()["metadata"]["labels"]
                    ["ibm-cloud.kubernetes.io/worker-id"] == worker_id):
                    logger.info(f"return worker node is:{worker_id}")
                    return worker_node
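
For context, a trimmed sketch of the JSON shape this method reads (the worker id below is made up):

out = {
    "volume_attachments": [
        {"instance": {"name": "kube-abc123-cluster-default-00000123"}}
    ]
}
worker_id = out["volume_attachments"][0]["instance"]["name"]
print(worker_id)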
Example #10
    def node_operations_entry_criteria(
        self,
        node_type,
        number_of_nodes,
        operation_name="Node Operation",
        network_fail_time=None,
    ):
        """
        Entry criteria function for node related operations

        Args:
            node_type (str): Type of node
            number_of_nodes (int): Number of nodes
            operation_name (str): Name of the node operation
            network_fail_time (int): Total time to fail the network in a node

        Returns:
            tuple: containing the params used in Node operations

        """
        self.validate_cluster(node_status=True, operation_name=operation_name)

        logger.info(f"Getting parameters related to: {operation_name}")
        typed_nodes = node.get_nodes(node_type=node_type,
                                     num_of_nodes=number_of_nodes)
        if network_fail_time:
            return typed_nodes, network_fail_time
        else:
            return typed_nodes
Example #11
def test_rebootnodes():
    """
    Check basic consistency in platform handling.
    """
    ibmcloud = IBMCloud()
    worker_nodes = node.get_nodes(node_type="worker")
    ibmcloud.restart_nodes(worker_nodes)
Example #12
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        reboot_events_cmd = (
            f"get events -A --field-selector involvedObject.name="
            f"{typed_node_name},reason=Rebooted -o yaml")

        # Find the number of reboot events in 'typed_node_name'
        num_events = len(
            typed_nodes[0].ocp.exec_oc_cmd(reboot_events_cmd)["items"])

        # Maintenance the node (unschedule and drain). The function contains logging
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=False)

        try:
            wait_for_nodes_status(
                node_names=[typed_node_name],
                status=constants.NODE_NOT_READY_SCHEDULING_DISABLED,
            )
        except ResourceWrongStatusException:
            # Sometimes, the node will be back to running state quickly so
            # that the status change won't be detected. Verify the node was
            # actually restarted by checking the reboot events count
            new_num_events = len(
                typed_nodes[0].ocp.exec_oc_cmd(reboot_events_cmd)["items"])
            assert new_num_events > num_events, (
                f"Reboot event not found."
                f"Node {typed_node_name} did not restart.")

        wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_READY_SCHEDULING_DISABLED,
        )

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()
Example #13
    def test_node_maintenance(self, reduce_and_resume_cluster_load, node_type,
                              pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain)
        drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=90)
Example #14
def get_new_device_paths(device_sets_required, osd_size_capacity_requested):
    """
    Get new device paths to add capacity over Baremetal cluster

    Args:
        device_sets_required (int) : Count of device sets to be added
        osd_size_capacity_requested (int) : Requested OSD size capacity

    Returns:
        list : List containing added device paths

    """
    ocp_obj = OCP(kind="localvolume",
                  namespace=config.ENV_DATA["local_storage_namespace"])
    workers = get_nodes(node_type="worker")
    worker_names = [worker.name for worker in workers]
    config.ENV_DATA["worker_replicas"] = len(worker_names)
    output = ocp_obj.get(resource_name="local-block")
    # Fetch device paths present in the current LVCR
    cur_device_list = output["spec"]["storageClassDevices"][0]["devicePaths"]
    # Clone repo and run playbook to fetch all device paths from each node
    path = os.path.join(constants.EXTERNAL_DIR, "device-by-id-ocp")
    clone_repo(constants.OCP_QE_DEVICEPATH_REPO, path)
    os.chdir(path)
    run_cmd("ansible-playbook devices_by_id.yml")
    # Filter unused/unallocated device paths
    with open("local-storage-block.yaml", "r") as cloned_file:
        with open("local-block.yaml", "w") as our_file:
            device_from_worker = [1] * config.ENV_DATA["worker_replicas"]
            cur_line = cloned_file.readline()
            while "devicePaths:" not in cur_line:
                our_file.write(cur_line)
                cur_line = cloned_file.readline()
            our_file.write(cur_line)
            cur_line = cloned_file.readline()
            # Add required number of device path from each worker node
            while cur_line:
                if str(osd_size_capacity_requested) in cur_line:
                    for i in range(len(worker_names)):
                        if device_from_worker[i] and (str(worker_names[i])
                                                      in cur_line):
                            if not any(s in cur_line for s in cur_device_list):
                                our_file.write(cur_line)
                                device_from_worker[
                                    i] = device_from_worker[i] - 1
                cur_line = cloned_file.readline()
    with open("local-block.yaml") as local_block_yaml:
        lvcr = yaml.load(local_block_yaml, Loader=yaml.FullLoader)
    new_dev_paths = lvcr["spec"]["storageClassDevices"][0]["devicePaths"]
    logger.info(f"Newly added devices are: {new_dev_paths}")
    if new_dev_paths:
        assert len(new_dev_paths) == (
            len(worker_names) * device_sets_required
        ), f"Current devices available = {len(new_dev_paths)}"
        os.chdir(constants.TOP_DIR)
        shutil.rmtree(path)
        # Return list of old device paths and newly added device paths
        cur_device_list.extend(new_dev_paths)
    return cur_device_list
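
A hypothetical call, assuming one extra device set and a 2048 GiB requested OSD size (both values are placeholders):

# Returns the old device paths extended with the newly discovered ones.
device_paths = get_new_device_paths(
    device_sets_required=1, osd_size_capacity_requested=2048
)
logger.info(f"Device paths to use in the LocalVolume CR: {device_paths}")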
Example #15
    def test_registry_rolling_reboot_node(self, node_type, nodes):
        """
        Test registry workload when backed by OCS and reboot nodes one by one
        """

        # Get the node list
        node_list = get_nodes(node_type)

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(project_name=self.project_name)

        # Validate image exists in registries path
        validate_image_exists()

        for node in node_list:

            # Reboot node
            log.info(node.name)
            nodes.restart_nodes([node], wait=False)

            # Wait some time after rebooting node
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate all nodes and services are in READY state and up
            retry(
                (
                    CommandFailed,
                    TimeoutError,
                    AssertionError,
                    ResourceWrongStatusException,
                ),
                tries=60,
                delay=15,
            )(wait_for_cluster_connectivity)(tries=400)
            retry(
                (
                    CommandFailed,
                    TimeoutError,
                    AssertionError,
                    ResourceWrongStatusException,
                ),
                tries=60,
                delay=15,
            )(wait_for_nodes_status)(timeout=900)

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check(tries=40)

        # Validate storage pods are running
        wait_for_storage_pods()

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate image exists in registries path
        validate_image_exists()
Example #16
def test_attachvolume(get_volume):
    """
    Check basic consistency in platform handling.
    """
    ibmcloud = IBMCloud()

    worker_nodes = node.get_nodes(node_type="worker", num_of_nodes=1)
    ibmcloud.attach_volume(get_volume, worker_nodes)
Example #17
def verify_image_versions(old_images, upgrade_version, version_before_upgrade):
    """
    Verify if all the images of OCS objects got upgraded

    Args:
        old_images (set): set with old images
        upgrade_version (packaging.version.Version): version of OCS
        version_before_upgrade (float): version of OCS before upgrade

    """
    number_of_worker_nodes = len(get_nodes())
    osd_count = get_osd_count()
    verify_pods_upgraded(old_images, selector=constants.OCS_OPERATOR_LABEL)
    verify_pods_upgraded(old_images, selector=constants.OPERATOR_LABEL)
    # in 4.3 the noobaa app selector has 3 pods: noobaa-core-ID, noobaa-db-ID,
    # noobaa-operator-ID; in 4.2 it has only 2: noobaa-core-ID, noobaa-operator-ID
    nooba_pods = 2 if upgrade_version < parse_version("4.3") else 3
    verify_pods_upgraded(old_images,
                         selector=constants.NOOBAA_APP_LABEL,
                         count=nooba_pods)
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                         count=2)
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
                         count=2)
    verify_pods_upgraded(
        old_images,
        selector=constants.MON_APP_LABEL,
        count=3,
    )
    verify_pods_upgraded(old_images, selector=constants.MGR_APP_LABEL)
    osd_timeout = 600 if upgrade_version >= parse_version("4.5") else 750
    verify_pods_upgraded(
        old_images,
        selector=constants.OSD_APP_LABEL,
        count=osd_count,
        timeout=osd_timeout * osd_count,
    )
    verify_pods_upgraded(old_images, selector=constants.MDS_APP_LABEL, count=2)
    if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        rgw_count = get_rgw_count(upgrade_version.base_version, True,
                                  version_before_upgrade)
        verify_pods_upgraded(
            old_images,
            selector=constants.RGW_APP_LABEL,
            count=rgw_count,
        )
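
A hypothetical call after an upgrade, reusing the parse_version helper already imported by this module; the image name below is made up and old_images is assumed to have been collected before the upgrade:

old_images = {"example.registry/ocs/rook-ceph-operator:4.5"}   # placeholder
verify_image_versions(
    old_images=old_images,
    upgrade_version=parse_version("4.6.0"),
    version_before_upgrade=4.5,
)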
Example #18
def add_worker_based_on_pods_count_per_node(
    node_count, expected_count, role_type=None, machineset_name=None
):
    """
    Function to evaluate the number of pods running on each node and add new nodes accordingly.

    Args:
        machineset_name (list): Machineset_names to add more nodes if required.
        node_count (int): Additional nodes to be added
        expected_count (int): Expected pod count in one node
        role_type (str): To add type to the nodes getting added

    Returns:
        bool: True if nodes get added, else False.

    """
    # Check the running pod count on each node
    if (
        config.ENV_DATA["deployment_type"] == "ipi"
        and config.ENV_DATA["platform"].lower() == "aws"
    ):
        app_nodes = node.get_nodes(node_type=role_type)
        pod_count_dict = node.get_running_pod_count_from_node(node_type=role_type)
        high_count_nodes, less_count_nodes = ([] for i in range(2))
        for node_obj in app_nodes:
            count = pod_count_dict[f"{node_obj.name}"]
            if count >= expected_count:
                high_count_nodes.append(node_obj.name)
            else:
                less_count_nodes.append(node_obj.name)
        if len(less_count_nodes) <= 1:
            for name in machineset_name:
                count = machine.get_replica_count(machine_set=name)
                machine.add_node(machine_set=name, count=(count + node_count))
                machine.wait_for_new_node_to_be_ready(name)
            return True
        else:
            logging.info(
                f"Enough pods can be created with available nodes {pod_count_dict}"
            )
            return False
    elif (
        config.ENV_DATA["deployment_type"] == "upi"
        and config.ENV_DATA["platform"].lower() == "vsphere"
    ):
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
    elif (
        config.ENV_DATA["deployment_type"] == "upi"
        and config.ENV_DATA["platform"].lower() == "baremetal"
    ):
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
    elif (
        config.ENV_DATA["deployment_type"] == "upi"
        and config.ENV_DATA["platform"].lower() == "azure"
    ):
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
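
A hypothetical call on an AWS IPI cluster; the machineset name, expected count, and role below are placeholders:

added = add_worker_based_on_pods_count_per_node(
    node_count=1,
    expected_count=100,
    role_type="worker",
    machineset_name=["cluster-abc12-worker-us-east-2a"],
)
logging.info(f"Worker node added: {added}")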
Example #19
def add_worker_based_on_cpu_utilization(
    node_count, expected_percent, role_type=None, machineset_name=None
):
    """
    Function to evaluate CPU utilization of nodes and add node if required.

    Args:
        machineset_name (list): Machineset_names to add more nodes if required.
        node_count (int): Additional nodes to be added
        expected_percent (int): Expected utilization percent
        role_type (str): To add type to the nodes getting added

    Returns:
        bool: True if nodes get added, else False.

    """
    # Check CPU utilization on each node
    if (
        config.ENV_DATA["deployment_type"] == "ipi"
        and config.ENV_DATA["platform"].lower() == "aws"
    ):
        app_nodes = node.get_nodes(node_type=role_type)
        uti_dict = node.get_node_resource_utilization_from_oc_describe(
            node_type=role_type
        )
        uti_high_nodes, uti_less_nodes = ([] for i in range(2))
        for node_obj in app_nodes:
            utilization_percent = uti_dict[f"{node_obj.name}"]["cpu"]
            if utilization_percent > expected_percent:
                uti_high_nodes.append(node_obj.name)
            else:
                uti_less_nodes.append(node_obj.name)
        if len(uti_less_nodes) <= 1:
            for name in machineset_name:
                count = machine.get_replica_count(machine_set=name)
                machine.add_node(machine_set=name, count=(count + node_count))
                machine.wait_for_new_node_to_be_ready(name)
            return True
        else:
            logging.info(f"Enough resource available for more pod creation {uti_dict}")
            return False
    elif (
        config.ENV_DATA["deployment_type"] == "upi"
        and config.ENV_DATA["platform"].lower() == "vsphere"
    ):
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
    elif (
        config.ENV_DATA["deployment_type"] == "upi"
        and config.ENV_DATA["platform"].lower() == "baremetal"
    ):
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
    elif (
        config.ENV_DATA["deployment_type"] == "upi"
        and config.ENV_DATA["platform"].lower() == "azure"
    ):
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
Example #20
def get_environment_info():
    """
    Get the environment information. Information that will be collected:

    Versions:
        OCP - version / build / channel
        OCS - version / build
        Ceph - version
        Rook - version

    Platform:
        BM / VmWare / Cloud provider etc.
        Instance type / architecture
        Cluster name
        User name that run the test

    Returns:
        dict: dictionary that contains the environment information

    """
    results = {}
    # Get the name and email of the user running the test.
    try:
        user = utils.run_cmd("git config --get user.name").strip()
        email = utils.run_cmd("git config --get user.email").strip()
        results["user"] = f"{user} <{email}>"
    except CommandFailed:
        # if no git user is defined, the default user is empty
        results["user"] = ""

    results["clustername"] = ocp.get_clustername()
    results["platform"] = node.get_provider()
    if results["platform"].lower() not in constants.ON_PREM_PLATFORMS:
        results["platform"] = results["platform"].upper()

    results["ocp_build"] = ocp.get_build()
    results["ocp_channel"] = ocp.get_ocp_channel()
    results["ocp_version"] = utils.get_ocp_version()

    results["ceph_version"] = utils.get_ceph_version()
    results["rook_version"] = utils.get_rook_version()

    results["ocs_build"] = ocp.get_ocs_version()
    # Extracting the version number x.y.z from full build name
    m = re.match(r"(\d.\d).(\d)", results["ocs_build"])
    if m and m.group(1) is not None:
        results["ocs_version"] = m.group(1)

    # Get the instance type for cloud, or the arch type for non-cloud
    worker_lbl = node.get_nodes(num_of_nodes=1)[0].data["metadata"]["labels"]
    if "beta.kubernetes.io/instance-type" in worker_lbl:
        results["worker_type"] = worker_lbl["beta.kubernetes.io/instance-type"]
    else:
        results["worker_type"] = worker_lbl["kubernetes.io/arch"]

    return results
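
A minimal usage sketch; the keys read below are among those populated above:

env_info = get_environment_info()
print(env_info["ocs_build"], env_info["platform"], env_info["ocp_version"])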
Example #21
    def test_registry_shutdown_and_recovery_node(self, nodes):
        """
        Test registry workload when backed by OCS and
        its impact when a node is shut down and recovered

        """

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(
            project_name=self.project_name,
            template="eap-cd-basic-s2i",
            image=
            "registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest",
            pattern="eap-app",
        )

        # Get the node list
        node_list = get_nodes(node_type="worker")

        for node in node_list:

            # Stop node
            nodes.stop_nodes(nodes=[node])

            # Validate node reached NotReady state
            wait_for_nodes_status(node_names=[node.name],
                                  status=constants.NODE_NOT_READY)

            # Start node
            nodes.start_nodes(nodes=[node])

            # Validate all nodes are in READY state and up
            retry(
                (
                    CommandFailed,
                    TimeoutError,
                    AssertionError,
                    ResourceWrongStatusException,
                ),
                tries=60,
                delay=15,
            )(wait_for_nodes_status)(timeout=900)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check(tries=40)

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate image exists in registries path
        validate_image_exists(namespace=self.project_name)
Example #22
    def test_run_pgsql_node_drain(self,
                                  pgsql,
                                  transactions=5600,
                                  node_type="worker"):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)

        # Select a node where pgbench is not running for drain
        typed_nodes = [
            node1.name for node1 in node.get_nodes(node_type=node_type)
        ]
        filter_list = pgsql.filter_pgbench_nodes_from_nodeslist(typed_nodes)
        typed_node_name = filter_list[random.randint(0, len(filter_list) - 1)]
        log.info(f"Selected node {typed_node_name} for node drain operation")

        # Node maintenance - to gracefully terminate all pods on the node
        node.drain_nodes([typed_node_name])

        # Make the node schedulable again
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached to completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)
Example #23
    def test_rolling_nodes_restart(self, nodes, node_type):
        """
        Test restart nodes one after the other and check health status in between

        """
        ocp_nodes = get_nodes(node_type=node_type)
        for node in ocp_nodes:
            nodes.restart_nodes(nodes=[node], wait=False)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)

        self.create_resources()
Example #24
def collect_stats(action_text, elastic_info):
    """
    Write the current configuration information into the REPORT file.
    This information includes the osd, nodes and which osds are on which
    nodes.  The minimum and maximum numbers of osds per node are also
    computed and saved.

    Args:
        action_text (str): Title of last action taken
                (usually adding nodes or adding osds)
        elastic_info (es): ElasticData object for stat collection

    Raises:
        AssertionError: OSD layout is unbalanced
    """
    output_info = {"title": action_text}
    pod_obj = ocp.OCP(kind=constants.POD,
                      namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
    osd_list = pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]
    node_stats = {}
    for osd_ent in osd_list:
        osd_node = osd_ent["spec"]["nodeName"]
        if osd_node in node_stats:
            node_stats[osd_node].append(osd_ent)
        else:
            node_stats[osd_node] = [osd_ent]
    osds_per_node = []
    for entry in node_stats:
        osds_per_node.append(len(node_stats[entry]))
    wnodes = get_nodes(constants.WORKER_MACHINE)
    for wnode in wnodes:
        if wnode.name not in node_stats:
            osds_per_node.append(0)
    maxov = max(osds_per_node)
    minov = min(osds_per_node)
    this_skew = maxov - minov
    logging.info(f"Skew found is {this_skew}")
    output_info["osds"] = osd_list
    output_info["worker_nodes"] = wnodes
    output_info["pairings"] = {}
    for entry in osd_list:
        output_info["pairings"][entry["metadata"]
                                ["name"]] = entry["spec"]["nodeName"]
    output_info["maxov"] = maxov
    output_info["minov"] = minov
    output_info["skew_value"] = this_skew
    elastic_info.add_key(elastic_info.record_counter, output_info)
    elastic_info.log_recent_activity()
    elastic_info.record_counter += 1
    ceph_health_check(tries=30, delay=60)
    assert is_balanced(this_skew, maxov), NOT_BALANCED
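
The skew bookkeeping above reduces to a max/min over per-node OSD counts; a standalone sketch with a made-up layout:

node_stats = {"worker-0": ["osd-a", "osd-b"], "worker-1": ["osd-c"]}
osds_per_node = [len(osds) for osds in node_stats.values()]
osds_per_node.append(0)            # a labeled worker with no OSDs yet
this_skew = max(osds_per_node) - min(osds_per_node)
print(this_skew)                   # 2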
Example #25
    def test_run_couchbase_node_reboot(self, cb_setup, nodes,
                                       pod_name_of_node):
        """
        Test couchbase workload with node reboot
        """
        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)
        get_node_resource_utilization_from_adm_top(node_type="master",
                                                   print_table=True)

        if pod_name_of_node == "couchbase":
            node_list = self.cb.get_couchbase_nodes()
        elif pod_name_of_node == "osd":
            node_list = get_osd_running_nodes()
        elif pod_name_of_node == "master":
            master_node = get_nodes(pod_name_of_node, num_of_nodes=1)

        # Restart relevant node
        if pod_name_of_node == "master":
            nodes.restart_nodes(master_node, wait=False)
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)
        else:
            restart_node = get_node_objs(node_list[random.randint(
                0,
                len(node_list) - 1)])
            nodes.restart_nodes(restart_node)

        # Validate all nodes and services are in READY state and up

        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)
        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        retry((CommandFailed), tries=60,
              delay=15)(bg_handler.wait_for_bg_operations)(bg_ops, timeout=3600)
        self.sanity_helpers.health_check(tries=40)
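
The fixes above hinge on how retry is invoked: retry(exceptions, tries=..., delay=...) returns a wrapper that must be applied to the callable itself, and the wrapped callable is then called with its own arguments (the same pattern appears in the registry examples). A self-contained sketch with a simplified stand-in for the helper:

import time


def retry(exceptions, tries=3, delay=1):
    """Simplified stand-in for the retry helper used in these tests."""
    def wrapper(func):
        def inner(*args, **kwargs):
            for attempt in range(tries):
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    if attempt == tries - 1:
                        raise
                    time.sleep(delay)
        return inner
    return wrapper


def wait_for_nodes_status_stub(timeout):
    # Placeholder for the real wait_for_nodes_status helper.
    print(f"checking node status with timeout={timeout}")


# Pass the callable to the wrapper, then call the wrapped function:
retry((TimeoutError,), tries=3, delay=1)(wait_for_nodes_status_stub)(timeout=900)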
Example #26
    def test_run_pgsql_node_drain(self,
                                  pgsql,
                                  transactions=900,
                                  node_type="master"):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)

        # Node drain with specific node type
        typed_nodes = node.get_nodes(node_type=node_type, num_of_nodes=1)
        typed_node_name = typed_nodes[0].name

        # Node maintenance - to gracefully terminate all pods on the node
        node.drain_nodes([typed_node_name])

        # Make the node schedulable again
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached to completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)
Example #27
    def test_registry_reboot_node(self, node_type, nodes):
        """
        Test registry workload when backed by OCS and reboot node
        """

        # Get one node of the given type
        node = get_nodes(node_type, num_of_nodes=1)

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(
            project_name=self.project_name,
            template="eap-cd-basic-s2i",
            image=
            "registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest",
            pattern="eap-app",
        )

        # Validate image exists in registries path
        validate_image_exists(namespace=self.project_name)

        # Reboot one node
        nodes.restart_nodes(node, wait=False)

        # Validate all nodes and services are in READY state and up
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=900)

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check(tries=40)

        # Validate storage pods are running
        wait_for_storage_pods()

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate image exists in registries path
        validate_image_exists(namespace=self.project_name)
Example #28
        def finalizer():
            config.switch_to_provider()
            log.info(
                "Verify that all the worker nodes are in a Ready state on the provider"
            )
            wnodes = get_nodes(node_type=constants.WORKER_MACHINE)
            for wnode in wnodes:
                is_recovered = recover_node_to_ready_state(wnode)
                if not is_recovered:
                    log.warning(f"The node {wnode.name} has failed to recover")

            log.info("Verify again that the ceph health is OK")
            ceph_health_check()

            config.switch_ctx(self.orig_index)
Example #29
def get_attached_volume(request, get_volume):
    """
    Attach the volume to a worker node and detach it at teardown
    """
    def finalizer():
        worker_nodes = node.get_nodes(node_type="worker", num_of_nodes=1)
        ibmcloud.detach_volume(get_volume, worker_nodes)

    request.addfinalizer(finalizer)
    ibmcloud = IBMCloud()

    worker_nodes = node.get_nodes(node_type="worker", num_of_nodes=1)
    ibmcloud.attach_volume(get_volume, worker_nodes)

    worker_id = ibmcloud.get_node_by_attached_volume(get_volume)
    return worker_id
Example #30
def get_max_pvc_count():
    """
    Return the maximum number of PVCs to test for.
    This value is 500 times the number of OCS-labeled worker nodes.
    """
    worker_nodes = get_nodes(node_type="worker")
    count = 0
    for wnode in worker_nodes:
        wdata = wnode.data
        labellist = wdata["metadata"]["labels"].keys()
        if "node-role.kubernetes.io/worker" not in labellist:
            continue
        if "cluster.ocs.openshift.io/openshift-storage" not in labellist:
            continue
        count += 1
    pvc_count = count * constants.SCALE_MAX_PVCS_PER_NODE
    return pvc_count
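
A worked example of the arithmetic above, assuming the documented value of SCALE_MAX_PVCS_PER_NODE (500) and six OCS-labeled workers:

SCALE_MAX_PVCS_PER_NODE = 500
ocs_labeled_worker_count = 6
print(ocs_labeled_worker_count * SCALE_MAX_PVCS_PER_NODE)   # 3000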