Code example #1
0
File: node.py Project: mirekdlugosz/ocs-ci
def add_new_node_and_label_it(machineset_name):
    """
    Add a new node by scaling up the given machine set and label the
    newly spawned node with the OCS storage label.

    Args:
        machineset_name (str): Name of the machine set

    """
    # Get the initial nodes list
    initial_nodes = get_worker_nodes()
    log.info(f"Current available worker nodes are {initial_nodes}")

    # get machineset replica count
    machineset_replica_count = machine.get_replica_count(machineset_name)

    # Increase its replica count by one
    machine.add_node(machineset_name, count=machineset_replica_count + 1)
    # The replica count was increased *to* the new value, not *by* it,
    # so log "to" (the old message claimed "by {replica + 1}")
    log.info(f"Increased {machineset_name} count "
             f"to {machineset_replica_count + 1}")

    # wait for the new node to come to ready state
    log.info("Waiting for the new node to be in ready state")
    machine.wait_for_new_node_to_be_ready(machineset_name)

    # Get the node name of new spun node by diffing against the initial list
    nodes_after_new_spun_node = get_worker_nodes()
    new_spun_node = list(set(nodes_after_new_spun_node) - set(initial_nodes))
    log.info(f"New spun node is {new_spun_node}")

    # Label it
    node_obj = ocp.OCP(kind='node')
    node_obj.add_label(resource_name=new_spun_node[0],
                       label=constants.OPERATOR_NODE_LABEL)
    log.info(f"Successfully labeled {new_spun_node} with OCS storage label")
Code example #2
0
    def dynamic_pvc_base(self, interface_type, reclaim_policy):
        """
        Base function for Dynamic PVC creation tests.
        Fetches the worker node names, then creates a StorageClass and a
        PVC on the instance.
        """
        self.interface_type = interface_type
        self.reclaim_policy = reclaim_policy
        self.worker_nodes_list = helpers.get_worker_nodes()

        # Resolve the pool/secret pair that matches the requested interface
        if interface_type == constants.CEPHBLOCKPOOL:
            self.interface_name = self.cbp_obj.name
            self.secret_name = self.rbd_secret_obj.name
        elif interface_type == constants.CEPHFILESYSTEM:
            self.interface_name = helpers.get_cephfs_data_pool_name()
            self.secret_name = self.cephfs_secret_obj.name

        logger.info(
            f"Creating Storage Class with reclaimPolicy: {self.reclaim_policy}"
        )
        self.sc_obj = helpers.create_storage_class(
            interface_type=self.interface_type,
            interface_name=self.interface_name,
            secret_name=self.secret_name,
            reclaim_policy=self.reclaim_policy,
        )

        logger.info(f"Creating PVC with accessModes: {self.access_mode}")
        self.pvc_obj = helpers.create_pvc(
            sc_name=self.sc_obj.name,
            namespace=self.namespace,
            size=self.pvc_size,
            wait=True,
            access_mode=self.access_mode,
        )
Code example #3
0
    def get_node_name_where_jenkins_pod_not_hosted(
        self, node_type=constants.WORKER_MACHINE, num_of_nodes=1
    ):
        """
        Find node names where no jenkins pod is hosted.

        Args:
            node_type (str): The node type  (e.g. worker, master)
            num_of_nodes (int): The number of nodes to be returned

        Returns:
            list: List of compute node names
        """
        if node_type == constants.MASTER_MACHINE:
            typed_nodes = get_typed_nodes(
                node_type=node_type, num_of_nodes=num_of_nodes
            )
            nodes_drain = [node.name for node in typed_nodes]
        elif node_type == constants.WORKER_MACHINE:
            # Gather every jenkins pod object across the tracked projects
            pod_objs = []
            for project in self.projects:
                for pod_name in get_pod_name_by_pattern(
                    pattern='jenkins', namespace=project
                ):
                    pod_objs.append(
                        get_pod_obj(name=pod_name, namespace=project)
                    )
            # Workers minus the nodes currently hosting a jenkins pod
            busy_nodes = set(get_app_pod_running_nodes(pod_objs))
            nodes_drain = set(get_worker_nodes()) - busy_nodes
        else:
            raise ValueError('The node type is worker or master')
        return list(nodes_drain)[:num_of_nodes]
 def finalizer():
     # Teardown: remove the "dc" label added to the workers, wait for the
     # background IO threads to finish, then verify ceph health.
     worker_nodes = get_worker_nodes()
     # Removing created label on all worker nodes
     remove_label_from_worker_node(worker_nodes, label_key="dc")
     # NOTE(review): self.threads is read via closure from the enclosing
     # (not visible) scope -- presumably the background IO threads started
     # by the test; confirm against the enclosing fixture
     for thread in self.threads:
         thread.join()
     ceph_health_check()
Code example #5
0
def label_nodes(request):
    """
    Fixture to label the node(s) that will run the application pod.
    That will be all worker nodes that do not run the OCS cluster.

    Args:
        request: pytest fixture request object, used to register the
            teardown finalizer

    """
    def teardown():
        # Remove the application label this fixture added
        log.info('Clear label from worker (Application) nodes')
        # Getting all Application nodes
        app_nodes = machine.get_labeled_nodes(constants.APP_NODE_LABEL)
        helpers.remove_label_from_worker_node(app_nodes,
                                              constants.APP_NODE_LABEL)

    request.addfinalizer(teardown)

    # Getting all OCS nodes (to verify app pod will not run on)
    ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    # Add label to the worker nodes
    worker_nodes = helpers.get_worker_nodes()
    # Getting list of free nodes
    free_nodes = list(set(worker_nodes) - set(ocs_nodes))

    log.info('Adding the app-node label to Non-OCS workers')
    log.debug(f'The Workers nodes are : {worker_nodes}')
    log.debug(f'The OCS nodes are : {ocs_nodes}')
    log.debug(f'The free nodes are : {free_nodes}')

    # Typos fixed in the failure message ("Did not found" / "pleas")
    assert free_nodes, \
        'Did not find any worker to run on, please deploy another worker'

    helpers.label_worker_node(free_nodes, constants.APP_NODE_LABEL,
                              constants.VDBENCH_NODE_LABEL)
Code example #6
0
    def test_rwx_pvc_assign_pod_node(self, pvc_factory, teardown_factory):
        """
        Test assign nodeName to a pod using RWX pvc

        Creates one RWX CephFS PVC, schedules a pod on each of two randomly
        selected workers via nodeName, verifies placement, then runs fio on
        all pods concurrently and checks the IO results.
        """
        interface = constants.CEPHFILESYSTEM
        worker_nodes_list = helpers.get_worker_nodes()

        # Create a RWX PVC
        pvc_obj = pvc_factory(
            interface=interface, access_mode=constants.ACCESS_MODE_RWX,
            status=constants.STATUS_BOUND
        )

        # Create two pods on selected nodes
        pod_list = []
        selected_nodes = random.sample(worker_nodes_list, k=2)
        logger.info(
            f"Creating {len(selected_nodes)} pods with pvc {pvc_obj.name}"
        )
        for node in selected_nodes:
            logger.info(f"Creating pod on node: {node}")
            pod_obj = helpers.create_pod(
                interface_type=interface, pvc_name=pvc_obj.name,
                namespace=pvc_obj.namespace, node_name=node,
                pod_dict_path=constants.NGINX_POD_YAML
            )
            pod_list.append(pod_obj)
            teardown_factory(pod_obj)

        # Confirm that both pods are running on the selected_nodes.
        # zip pairs each pod with the node it was scheduled on, replacing
        # the index-based range(len(...)) loop.
        logger.info('Checking whether pods are running on the selected nodes')
        for pod_obj, selected_node in zip(pod_list, selected_nodes):
            helpers.wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING,
                timeout=120
            )
            pod_obj.reload()
            assert pod.verify_node_name(pod_obj, selected_node), (
                f"Pod {pod_obj.name} is running on a different node "
                f"than the selected node"
            )

        # Run IOs on all pods. FIO Filename is kept same as pod name
        with ThreadPoolExecutor() as p:
            for pod_obj in pod_list:
                logger.info(f"Running IO on pod {pod_obj.name}")
                p.submit(
                    pod_obj.run_io, storage_type='fs', size='512M',
                    runtime=30, fio_filename=pod_obj.name
                )

        # Check IO from all pods
        for pod_obj in pod_list:
            pod.get_fio_rw_iops(pod_obj)
Code example #7
0
    def test_add_node(self):
        """
        Test for adding worker nodes to the cluster while IOs
        """
        # Only IPI deployments support scaling via machinesets here
        deployment_type = config.ENV_DATA['deployment_type']
        if deployment_type != 'ipi':
            pytest.skip("UPI not yet supported")

        count = 2
        machines = machine_utils.get_machinesets()
        before_replica_counts = {
            machine_set: machine_utils.get_replica_count(machine_set)
            for machine_set in machines
        }
        worker_nodes_before = helpers.get_worker_nodes()
        logger.info(
            f'The worker nodes number before adding a new node is {len(worker_nodes_before)}'
        )
        after_replica_counts = {}
        for machine_set in machines:
            machine_utils.add_node(machine_set, count=count)
            after_replica_counts[machine_set] = (
                machine_utils.get_replica_count(machine_set)
            )
        logger.info(after_replica_counts)
        # Poll until the expected total number of workers shows up
        for sample in TimeoutSampler(timeout=300,
                                     sleep=3,
                                     func=helpers.get_worker_nodes):
            if len(sample) == count * len(machines):
                break

        worker_nodes_after = helpers.get_worker_nodes()
        logger.info(
            f'The worker nodes number after adding a new node is {len(worker_nodes_after)}'
        )
        wait_for_nodes_status(node_names=worker_nodes_after,
                              status=constants.NODE_READY)
Code example #8
0
    def test_nodereplacement_proactive(self, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-894 Node Replacement proactive

        """

        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        # Pick the node of a random OSD pod as the node to replace
        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node == osd_node_name:
                continue
            rbd_dc_pod = dc_pod_factory(
                interface=constants.CEPHBLOCKPOOL, node_name=worker_node, size=20
            )
            pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node == osd_node_name:
                continue
            cephfs_dc_pod = dc_pod_factory(
                interface=constants.CEPHFILESYSTEM, node_name=worker_node, size=20
            )
            pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True)

        platform = config.ENV_DATA['platform'].lower()
        deployment_type = config.ENV_DATA['deployment_type']
        if platform == constants.AWS_PLATFORM:
            if deployment_type == 'ipi':
                node.delete_and_create_osd_node_aws_ipi(osd_node_name)
            elif deployment_type == 'upi':
                node.delete_and_create_osd_node_aws_upi(osd_node_name)
            else:
                pytest.fail(
                    f"ocs-ci config 'deployment_type' value '{config.ENV_DATA['deployment_type']}' is not valid, "
                    f"results of this test run are all invalid.")
        elif platform == constants.VSPHERE_PLATFORM:
            pytest.skip("Skipping add node in Vmware platform due to "
                        "https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
                        )

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()
        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=30)
Code example #9
0
    def test_nodereplacement_proactive_with_io_running(self, pvc_factory,
                                                       pod_factory,
                                                       dc_pod_factory):
        """
        Knip-894 Node Replacement proactive when IO running in the background

        """

        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_node_name = select_osd_node_name()

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node == osd_node_name:
                continue
            rbd_dc_pod = dc_pod_factory(
                interface=constants.CEPHBLOCKPOOL,
                node_name=worker_node,
                size=20,
            )
            pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node == osd_node_name:
                continue
            cephfs_dc_pod = dc_pod_factory(
                interface=constants.CEPHFILESYSTEM,
                node_name=worker_node,
                size=20,
            )
            pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False,
                             fedora_dc=True)

        delete_and_create_osd_node(osd_node_name)

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info(
            "Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)
    def dynamic_pvc_base(self, interface_type, reclaim_policy):
        """
        Base function for Dynamic PVC creation tests.
        Fetches the worker node names, creates a StorageClass, a PVC, and a
        first pod that consumes the PVC on the first worker node.
        """
        self.interface_type = interface_type
        self.reclaim_policy = reclaim_policy
        self.worker_nodes_list = helpers.get_worker_nodes()

        # Resolve the pool/secret pair that matches the requested interface
        if interface_type == constants.CEPHBLOCKPOOL:
            self.interface_name = self.cbp_obj.name
            self.secret_name = self.rbd_secret_obj.name
        elif interface_type == constants.CEPHFILESYSTEM:
            self.interface_name = helpers.get_cephfs_data_pool_name()
            self.secret_name = self.cephfs_secret_obj.name

        logger.info(
            f"Creating Storage Class with reclaimPolicy: {self.reclaim_policy}"
        )
        self.sc_obj = helpers.create_storage_class(
            interface_type=self.interface_type,
            interface_name=self.interface_name,
            secret_name=self.secret_name,
            reclaim_policy=self.reclaim_policy,
        )

        logger.info(f"Creating PVC with accessModes: {self.access_mode}")
        self.pvc_obj = helpers.create_pvc(
            sc_name=self.sc_obj.name,
            namespace=self.namespace,
            size=self.pvc_size,
            access_mode=self.access_mode,
        )
        helpers.wait_for_resource_state(self.pvc_obj, constants.STATUS_BOUND)
        self.pvc_obj.reload()

        logger.info(
            f"Creating first pod on node: {self.worker_nodes_list[0]}"
            f" with pvc {self.pvc_obj.name}"
        )
        self.pod_obj1 = helpers.create_pod(
            interface_type=self.interface_type,
            pvc_name=self.pvc_obj.name,
            namespace=self.namespace,
            node_name=self.worker_nodes_list[0],
            pod_dict_path=constants.NGINX_POD_YAML,
        )
        helpers.wait_for_resource_state(self.pod_obj1, constants.STATUS_RUNNING)
        self.pod_obj1.reload()
Code example #11
0
def create_instance_in_clusterlogging():
    """
    Creation of instance for clusterlogging that creates PVCs,
    ElasticSearch, curator fluentd and kibana pods and checks for all
    the pods and PVCs

    Returns:
        dict: Contains all detailed information of the
            instance such as pods that got created, its resources and limits
            values, storage class and size details etc.

    """
    # Expected pod count scales with cluster size: one fluentd per node,
    # plus the ES nodes, plus the fixed "2" (presumably kibana + curator --
    # confirm against the clusterlogging operator)
    num_of_worker_nodes = len(helpers.get_worker_nodes())
    num_of_master_nodes = len(helpers.get_master_nodes())
    nodes_in_cluster = num_of_worker_nodes + num_of_master_nodes
    inst_data = templating.load_yaml(constants.CL_INSTANCE_YAML)
    es_node_count = inst_data['spec']['logStore']['elasticsearch']['nodeCount']
    helpers.create_resource(wait=False, **inst_data)
    oc = ocp.OCP('v1', 'ClusterLogging', 'openshift-logging')
    # Pass a real boolean instead of the string 'True' (the string is always
    # truthy, so the old call only worked by accident)
    logging_instance = oc.get(resource_name='instance', out_yaml_format=True)
    if logging_instance:
        logger.info("Successfully created instance for cluster-logging")
        logger.debug(logging_instance)
    else:
        # Only logged, not raised -- the waits below will fail if the
        # instance is really missing
        logger.error("Instance for clusterlogging is not created properly")

    pod_obj = ocp.OCP(kind=constants.POD, namespace='openshift-logging')
    pod_status = pod_obj.wait_for_resource(condition=constants.STATUS_RUNNING,
                                           resource_count=2 + es_node_count +
                                           nodes_in_cluster,
                                           timeout=300,
                                           sleep=5)
    assert pod_status, "Pods are not in Running state."
    logger.info("All pods are in Running state")
    pvc_obj = ocp.OCP(kind=constants.PVC, namespace='openshift-logging')
    pvc_status = pvc_obj.wait_for_resource(condition=constants.STATUS_BOUND,
                                           resource_count=es_node_count,
                                           timeout=150,
                                           sleep=5)
    assert pvc_status, "PVCs are not in bound state."
    logger.info("PVCs are Bound")
    return logging_instance
Code example #12
0
 def finalizer():
     """
     Finalizer to stop memory leak data capture thread and cleanup the files
     """
     # Signal the capture thread to stop, then wait (up to 90s, polling
     # every 3s) for the flag to read back as terminated
     set_flag_status('terminated')
     try:
         for status in TimeoutSampler(90, 3, get_flag_status):
             if status == 'terminated':
                 break
     except TimeoutExpiredError:
         # Fixed missing space between the implicitly concatenated string
         # parts ("...beforememory leak..." in the original)
         log.warning("Background test execution still in progress before "
                     "memory leak thread terminated")
     if thread:
         thread.join()
     # Remove the per-worker top output files written by the capture thread
     for worker in helpers.get_worker_nodes():
         if os.path.exists(f"/tmp/{worker}-top-output.txt"):
             os.remove(f"/tmp/{worker}-top-output.txt")
     # Dropped the pointless f-prefix (no placeholders)
     log.info("Memory leak capture has stopped")
Code example #13
0
    def test_rwo_pvc_assign_pod_node(
        self, interface, pvc_factory, teardown_factory
    ):
        """
        Test assign nodeName to a pod using RWO pvc
        """
        worker_nodes_list = helpers.get_worker_nodes()

        # Create a RWO PVC
        pvc_obj = pvc_factory(
            interface=interface,
            access_mode=constants.ACCESS_MODE_RWO,
            status=constants.STATUS_BOUND,
        )

        # Pick a random worker and schedule the pod there via nodeName
        selected_node = random.choice(worker_nodes_list)
        logger.info(
            f"Creating a pod on node: {selected_node} with pvc {pvc_obj.name}"
        )
        pod_obj = helpers.create_pod(
            interface_type=interface,
            pvc_name=pvc_obj.name,
            namespace=pvc_obj.namespace,
            node_name=selected_node,
            pod_dict_path=constants.NGINX_POD_YAML,
        )
        teardown_factory(pod_obj)

        # The pod must reach Running state on the node we picked
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=120
        )
        pod_obj.reload()
        assert pod.verify_node_name(pod_obj, selected_node), (
            'Pod is running on a different node than the selected node'
        )

        # Run IO
        logger.info(f"Running IO on pod {pod_obj.name}")
        pod_obj.run_io(storage_type='fs', size='512M', runtime=30)
        pod.get_fio_rw_iops(pod_obj)
Code example #14
0
    def setup(self, interface_type, reclaim_policy, storageclass_factory):
        """
        Create a storage class with the specified interface and reclaim
        policy, and fetch all worker nodes.

        Args:
            interface_type (str): The type of the interface
                (e.g. CephBlockPool, CephFileSystem)
            reclaim_policy (str): The type of reclaim policy
                (eg., 'Delete', 'Retain')
            storageclass_factory: A fixture to create new storage class

        Returns:
            tuple: containing the storage class instance and list of worker nodes

        """
        worker_nodes_list = helpers.get_worker_nodes()
        # Create storage class
        sc_obj = storageclass_factory(
            interface=interface_type, reclaim_policy=reclaim_policy
        )
        return sc_obj, worker_nodes_list
Code example #15
0
 def run_memory_leak_in_bg():
     """
     Function to run memory leak in background thread
     Memory leak data is written in below format
     date time PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
     """
     oc = ocp.OCP(namespace=config.ENV_DATA['cluster_namespace'])
     # Keep sampling until the control flag leaves the 'running' state
     while get_flag_status() == 'running':
         for worker in helpers.get_worker_nodes():
             filename = f"/tmp/{worker}-top-output.txt"
             # NOTE(review): the trailing 'b' looks like it was meant to be
             # top's '-b' (batch) flag -- confirm before changing the command
             top_cmd = f"debug nodes/{worker} -- chroot /host top -n 2 b"
             # Buffer the full top output in a scratch file, then filter it
             with open("/tmp/file.txt", "w+") as temp:
                 temp.write(
                     str(
                         oc.exec_oc_cmd(command=top_cmd,
                                        out_yaml_format=False)))
                 temp.seek(0)
                 for line in temp:
                     # Idiomatic membership test instead of
                     # line.__contains__("ceph-osd")
                     if "ceph-osd" in line:
                         with open(filename, "a+") as f:
                             f.write(str(datetime.now()))
                             f.write(' ')
                             f.write(line)
Code example #16
0
    def setup(self, request, scenario, nodes, multi_pvc_factory,
              service_account_factory, dc_pod_factory):
        """
        Identify the nodes and start multiple dc pods for the test

        Args:
            request: pytest fixture request object, used to register the
                finalizer that removes the node labels
            scenario (str): Scenario of app pods running on OCS or dedicated nodes
                (eg., 'colocated', 'dedicated')
            nodes: A fixture to get instance of the relevant platform nodes class
            multi_pvc_factory: A fixture create a set of new PVCs
            service_account_factory: A fixture to create a service account
            dc_pod_factory: A fixture to create dc pod

        Returns:
            list: dc pod objs

        """
        # Split workers into OCS-labeled nodes and the remaining (non-OCS) ones
        worker_nodes = helpers.get_worker_nodes()
        ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
        non_ocs_nodes = list(set(worker_nodes) - set(ocs_nodes))

        def finalizer():
            # Remove the 'nodetype' label added below from every worker
            helpers.remove_label_from_worker_node(node_list=worker_nodes,
                                                  label_key="nodetype")

            # Check ceph health
            ceph_health_check(tries=80)

        request.addfinalizer(finalizer)

        # 'dedicated' needs at least one non-OCS worker; add one if none exist
        if (scenario == 'dedicated') and len(non_ocs_nodes) == 0:
            if config.ENV_DATA.get('deployment_type').lower() == 'ipi':
                machines = machine.get_machinesets()
                node.add_new_node_and_label_it(machines[0],
                                               num_nodes=1,
                                               mark_for_ocs_label=False)
            else:
                if config.ENV_DATA.get(
                        'platform').lower() == constants.VSPHERE_PLATFORM:
                    pytest.skip(
                        "Skipping add node in VSPHERE due to https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
                    )
                is_rhel = config.ENV_DATA.get(
                    'rhel_workers') or config.ENV_DATA.get('rhel_user')
                node_type = constants.RHEL_OS if is_rhel else constants.RHCOS
                node.add_new_node_and_label_upi(node_type=node_type,
                                                num_nodes=1,
                                                mark_for_ocs_label=False)
            non_ocs_nodes = list(
                set(helpers.get_worker_nodes()) - set(ocs_nodes))

        # App pods run on OCS nodes when colocated, otherwise on free workers
        app_pod_nodes = ocs_nodes if (scenario
                                      == "colocated") else non_ocs_nodes

        # Label nodes to be able to run app pods
        helpers.label_worker_node(node_list=app_pod_nodes,
                                  label_key="nodetype",
                                  label_value="app-pod")

        # RBD: one RWO PVC and one raw-block RWX PVC
        access_modes_rbd = [
            constants.ACCESS_MODE_RWO, f'{constants.ACCESS_MODE_RWX}-Block'
        ]

        # CephFS: one RWO PVC and one RWX PVC
        access_modes_cephfs = [
            constants.ACCESS_MODE_RWO, constants.ACCESS_MODE_RWX
        ]

        pvcs_rbd = multi_pvc_factory(interface=constants.CEPHBLOCKPOOL,
                                     size=self.pvc_size,
                                     access_modes=access_modes_rbd,
                                     status=constants.STATUS_BOUND,
                                     num_of_pvc=len(access_modes_rbd))

        # Reuse the project of the RBD PVCs for the CephFS PVCs
        project = pvcs_rbd[0].project

        pvcs_cephfs = multi_pvc_factory(interface=constants.CEPHFILESYSTEM,
                                        project=project,
                                        size=self.pvc_size,
                                        access_modes=access_modes_cephfs,
                                        status=constants.STATUS_BOUND,
                                        num_of_pvc=len(access_modes_cephfs))

        pvcs = pvcs_cephfs + pvcs_rbd
        # Set volume mode on PVC objects
        for pvc_obj in pvcs:
            pvc_info = pvc_obj.get()
            setattr(pvc_obj, 'volume_mode', pvc_info['spec']['volumeMode'])

        sa_obj = service_account_factory(project=project)
        pods = []

        # Create pods
        for pvc_obj in pvcs:
            # Pick the interface from the PVC's storage class name
            if constants.CEPHFS_INTERFACE in pvc_obj.storageclass.name:
                interface = constants.CEPHFILESYSTEM
            else:
                interface = constants.CEPHBLOCKPOOL

            # RWX PVCs get two pods each, RWO PVCs get one
            num_pods = 2 if pvc_obj.access_mode == constants.ACCESS_MODE_RWX else 1
            logger.info("Creating app pods")
            for _ in range(num_pods):
                pods.append(
                    dc_pod_factory(interface=interface,
                                   pvc=pvc_obj,
                                   node_selector={'nodetype': 'app-pod'},
                                   raw_block_pv=pvc_obj.volume_mode == 'Block',
                                   sa_obj=sa_obj))

        logger.info(
            f"Created {len(pods)} pods using {len(pvcs_cephfs)} cephfs, {len(pvcs_rbd)} rbd PVCs."
        )

        return pods
Code example #17
0
    def test_pvc_rwx_writeable_after_pod_deletions(
        self, pvc_factory, teardown_factory
    ):
        """
        Test assign nodeName to a pod using RWX pvc

        1. Create a new project.
        2. Create a RWX CEPHFS based PVC
        3. Attach the same PVC to multiple PODs and start IO on all the PODs
        4. Delete all but one pod.
        5. Verify mount point is still write-able.
             - Start IO again on the Running pod.
        6. Also, access the data written by deleted pods from the Running pod

        """
        worker_nodes_list = helpers.get_worker_nodes()

        # Create a RWX PVC
        pvc_obj = pvc_factory(
            interface=constants.CEPHFILESYSTEM, access_mode=constants.ACCESS_MODE_RWX,
            size=10, status=constants.STATUS_BOUND
        )
        logger.info(
            f"Creating pods on all worker nodes backed"
            f"with same pvc {pvc_obj.name}"
        )

        # One pod per worker node, all sharing the same RWX PVC
        pod_list = []

        for each_node in worker_nodes_list:
            pod_obj = helpers.create_pod(
                interface_type=constants.CEPHFILESYSTEM, pvc_name=pvc_obj.name,
                namespace=pvc_obj.namespace, node_name=each_node,
                pod_dict_path=constants.NGINX_POD_YAML
            )
            pod_list.append(pod_obj)
            teardown_factory(pod_obj)

        # Confirm pods are created and are running on designated nodes.
        # zip pairs each pod with the node it was scheduled on, replacing
        # the manual node_count index bookkeeping.
        for pod_obj, scheduled_node in zip(pod_list, worker_nodes_list):
            helpers.wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING,
                timeout=120
            )
            pod_obj.reload()
            assert pod.verify_node_name(pod_obj, scheduled_node), (
                f'Pod {pod_obj.name} is running on a different node '
                f'than the selected node'
            )

        # Run IOs on all pods. FIO Filename is kept same as pod name
        with ThreadPoolExecutor() as p:
            for pod_obj in pod_list:
                logger.info(f"Running IO on pod {pod_obj.name}")
                p.submit(
                    pod_obj.run_io, storage_type='fs', size='512M',
                    runtime=30, fio_filename=pod_obj.name
                )

        # Check IO from all pods
        for pod_obj in pod_list:
            pod.get_fio_rw_iops(pod_obj)

        # Calculate md5sum of each file
        md5sum_pod_data = []
        for pod_obj in pod_list:
            md5sum_pod_data.append(pod.cal_md5sum(
                pod_obj=pod_obj, file_name=pod_obj.name
            ))

        # Delete all but the last app pod.
        for index in range(len(pod_list) - 1):
            pod_list[index].delete()
            pod_list[index].ocp.wait_for_delete(
                resource_name=pod_list[index].name
            )

        # Verify presence of files written by each pod
        logger.info(
            f"Verify existence of each file from app pod "
            f"{pod_list[-1].name} "
        )
        for pod_obj in pod_list:
            file_path = pod.get_file_path(pod_list[-1], pod_obj.name)
            assert pod.check_file_existence(pod_list[-1], file_path), (
                f"File {pod_obj.name} doesnt exist"
            )
            logger.info(
                f"File {pod_obj.name} exists in {pod_list[-1].name}"
            )

        # From surviving pod, verify data integrity of files
        # written by deleted pods
        logger.info(f"verify all data from {pod_list[-1].name}")

        for index, pod_obj in enumerate(pod_list):
            assert pod.verify_data_integrity(
                pod_obj=pod_list[-1], file_name=pod_obj.name,
                original_md5sum=md5sum_pod_data[index]
            )

        # From surviving pod, confirm mount point is still write-able
        logger.info(f"Re-running IO on pod {pod_list[-1].name}")
        fio_new_file = f"{pod_list[-1].name}-new-file"
        pod_list[-1].run_io(
            storage_type='fs', size='512M', runtime=30,
            fio_filename=fio_new_file
        )
        pod.get_fio_rw_iops(pod_list[-1])
        file_path = pod.get_file_path(pod_list[-1], fio_new_file)
        assert pod.check_file_existence(pod_list[-1], file_path), (
            f"File {fio_new_file} doesnt exist"
        )
        logger.info(
            f"File {fio_new_file} exists in {pod_list[-1].name} "
        )
Code example #18
0
def label_nodes(request, with_ocs):
    """
    Fixture to label the node(s) that will run the application pod.
    That will be all workers node that do not run the OCS cluster.
    """

    # Name of the machineset created for application nodes, if any.
    # Read by the teardown closure to decide how to clean up.
    m_set = ''

    def teardown():

        ceph_health_check()

        if with_ocs:
            return

        if m_set:
            # A dedicated machineset was created for the app pods — remove it
            log.info(f'Destroy {m_set}')
            machine.delete_custom_machineset(m_set)
            return

        # Otherwise only labels were added to existing workers — strip them
        log.info('Clear label form worker (Application) nodes')
        # Getting all Application nodes
        app_nodes = machine.get_labeled_nodes(constants.APP_NODE_LABEL)
        log.debug(f'The application nodes are : {app_nodes}')
        helpers.remove_label_from_worker_node(
            app_nodes, constants.VDBENCH_NODE_LABEL
        )

    request.addfinalizer(teardown)

    if with_ocs:
        return

    # App pods must not land on OCS nodes, so label only the workers
    # that do not already carry the OCS storage label.
    ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    worker_nodes = helpers.get_worker_nodes()
    free_nodes = list(set(worker_nodes).difference(ocs_nodes))

    if not free_nodes:
        # No free worker exists — create a fresh machineset for app pods
        log.info('Adding new machineset, with worker for application pod')
        m_set = machine.create_custom_machineset(
            label=constants.APP_NODE_LABEL
        )
        machine.wait_for_new_node_to_be_ready(m_set)

        free_nodes = machine.get_labeled_nodes(
            f'node-role.kubernetes.io/app={constants.APP_NODE_LABEL}'
        )

        # TODO: implement this for VMWare as well.

    log.info('Adding the app-node label to Non-OCS workers')
    log.debug(f'The Workers nodes are : {worker_nodes}')
    log.debug(f'The OCS nodes are : {ocs_nodes}')
    log.debug(f'The free nodes are : {free_nodes}')

    assert free_nodes, \
        'Did not found any worker to run on, pleas deploy another worker'

    helpers.label_worker_node(free_nodes, constants.APP_NODE_LABEL,
                              constants.VDBENCH_NODE_LABEL)

    return
コード例 #19
0
    def test_pvc_reattach_time_performance(self, pvc_factory,
                                           teardown_factory):
        """
        Measure the time it takes to re-attach a PVC to a pod on a
        second worker node.

        A Linux kernel tarball is downloaded, copied into an nginx pod
        on node one (with several copies to make the file tree
        non-trivial), that pod is deleted, and a new pod on node two is
        started with the same PVC.

        Args:
            pvc_factory: fixture that creates a bound PVC (RWX, or RWO
                for the RBD interface)
            teardown_factory: fixture that registers resources for cleanup

        Raises:
            ex.PerformanceException: if the second pod takes more than
                60 seconds to reach the Running state
        """
        # NOTE(review): a module-level import would be preferable; kept
        # function-scoped so this block stays self-contained.
        import os

        kernel_url = 'https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.19.5.tar.gz'
        download_path = 'tmp'
        # Number of times we copy the kernel
        copies = 3

        # Download a linux Kernel into a local scratch directory
        dir_path = os.path.join(os.getcwd(), download_path)
        file_path = os.path.join(dir_path, 'file.gz')
        # exist_ok avoids the check-then-create race of the original
        # `if not exists: makedirs` sequence
        os.makedirs(dir_path, exist_ok=True)
        urllib.request.urlretrieve(kernel_url, file_path)

        worker_nodes_list = helpers.get_worker_nodes()
        # Two distinct nodes are needed to measure cross-node re-attach
        assert (len(worker_nodes_list) > 1)
        node_one = worker_nodes_list[0]
        node_two = worker_nodes_list[1]

        # Create a PVC (RBD block pool only supports RWO here)
        accessmode = constants.ACCESS_MODE_RWX
        if self.interface == constants.CEPHBLOCKPOOL:
            accessmode = constants.ACCESS_MODE_RWO
        pvc_obj = pvc_factory(
            interface=self.interface,
            access_mode=accessmode,
            status=constants.STATUS_BOUND,
            size='15',
        )

        # Create a pod on one node
        logging.info(
            f"Creating Pod with pvc {pvc_obj.name} on node {node_one}")

        helpers.pull_images('nginx')
        pod_obj1 = helpers.create_pod(interface_type=self.interface,
                                      pvc_name=pvc_obj.name,
                                      namespace=pvc_obj.namespace,
                                      node_name=node_one,
                                      pod_dict_path=constants.NGINX_POD_YAML)

        # Confirm that pod is running on the selected_nodes
        logging.info('Checking whether pods are running on the selected nodes')
        helpers.wait_for_resource_state(resource=pod_obj1,
                                        state=constants.STATUS_RUNNING,
                                        timeout=120)

        pod_name = pod_obj1.name
        pod_path = '/var/lib/www/html'

        _ocp = OCP(namespace=pvc_obj.namespace)

        # Install rsync inside the pod so the tarball can be copied in;
        # install errors are tolerated (ignore_error) since the image
        # may already provide it
        rsh_cmd = f"exec {pod_name} -- apt-get update"
        _ocp.exec_oc_cmd(rsh_cmd)
        rsh_cmd = f"exec {pod_name} -- apt-get install -y rsync"
        _ocp.exec_oc_cmd(rsh_cmd, ignore_error=True, out_yaml_format=False)

        # Copy the downloaded tarball into the pod and unpack it
        rsh_cmd = f"rsync {dir_path} {pod_name}:{pod_path}"
        _ocp.exec_oc_cmd(rsh_cmd)

        rsh_cmd = f"exec {pod_name} -- tar xvf {pod_path}/tmp/file.gz -C /var/lib/www/html/tmp"
        _ocp.exec_oc_cmd(rsh_cmd)

        # Multiply the data on the PVC so re-attach is non-trivial
        for x in range(copies):
            rsh_cmd = f"exec {pod_name} -- mkdir -p {pod_path}/folder{x}"
            _ocp.exec_oc_cmd(rsh_cmd)
            rsh_cmd = f"exec {pod_name} -- cp -r {pod_path}/tmp {pod_path}/folder{x}"
            _ocp.exec_oc_cmd(rsh_cmd)

        # Delete the first pod so the PVC can attach elsewhere
        rsh_cmd = f"delete pod {pod_name}"
        _ocp.exec_oc_cmd(rsh_cmd)

        logging.info(
            f"Creating Pod with pvc {pvc_obj.name} on node {node_two}")

        pod_obj2 = helpers.create_pod(interface_type=self.interface,
                                      pvc_name=pvc_obj.name,
                                      namespace=pvc_obj.namespace,
                                      node_name=node_two,
                                      pod_dict_path=constants.NGINX_POD_YAML)

        # Measure from just after pod creation to Running state
        start_time = time.time()

        pod_name = pod_obj2.name
        helpers.wait_for_resource_state(resource=pod_obj2,
                                        state=constants.STATUS_RUNNING,
                                        timeout=120)
        end_time = time.time()
        total_time = end_time - start_time
        if total_time > 60:
            raise ex.PerformanceException(
                f"Pod creation time is {total_time} and "
                f"greater than 60 seconds")
        logging.info(f"Pod {pod_name} creation time took {total_time} seconds")

        teardown_factory(pod_obj2)
        # Local scratch cleanup; rmdir is safe as the dir only held file.gz
        os.remove(file_path)
        os.rmdir(dir_path)
コード例 #20
0
ファイル: test_raw_block_pv.py プロジェクト: xenolinux/ocs-ci
    def raw_block_pv(self):
        """
        Testing basic creation of app pod with RBD RWX raw block pv support
        """
        worker_nodes = helpers.get_worker_nodes()

        # One raw-block PVC per size class: MB, GB and TB scale
        pvcs = [
            helpers.create_pvc(
                sc_name=self.sc_obj.name,
                size=size,
                access_mode=constants.ACCESS_MODE_RWX,
                namespace=self.namespace,
                volume_mode='Block',
            )
            for size in ('500Mi', '10Gi', '1Ti')
        ]
        pvc_mb, pvc_gb, pvc_tb = pvcs

        for pvc in pvcs:
            helpers.wait_for_resource_state(
                resource=pvc,
                state=constants.STATUS_BOUND,
                timeout=120,
            )

        pvs = [pvc.backed_pv_obj for pvc in pvcs]

        # Three app pods per PVC, each pinned to a random worker node
        pod_dict = constants.CSI_RBD_RAW_BLOCK_POD_YAML
        pods = [
            helpers.create_pod(
                interface_type=constants.CEPHBLOCKPOOL,
                pvc_name=pvc.name,
                namespace=self.namespace,
                raw_block_pv=True,
                pod_dict_path=pod_dict,
                node_name=random.choice(worker_nodes),
            )
            for pvc in (pvc_mb, pvc_gb, pvc_tb)
            for _ in range(3)
        ]
        pvc_mb_pods, pvc_gb_pods, pvc_tb_pods = pods[0:3], pods[3:6], pods[6:9]

        for pod_obj in pods:
            helpers.wait_for_resource_state(
                resource=pod_obj,
                state=constants.STATUS_RUNNING,
                timeout=120,
            )

        storage_type = 'block'

        # IO size scales with the PVC's size class; each group gets its
        # own random-size generator
        io_groups = (
            (pvc_mb_pods, lambda: f'{random.randint(10,200)}M'),
            (pvc_gb_pods, lambda: f'{random.randint(1,5)}G'),
            (pvc_tb_pods, lambda: f'{random.randint(10,15)}G'),
        )
        with ThreadPoolExecutor() as executor:
            for group, pick_size in io_groups:
                for pod_obj in group:
                    logging.info(f'running io on pod {pod_obj.name}')
                    executor.submit(
                        pod_obj.run_io,
                        storage_type=storage_type,
                        size=pick_size(),
                    )

        for pod_obj in pods:
            get_fio_rw_iops(pod_obj)
        return pods, pvcs, pvs
コード例 #21
0
    def identify_and_add_nodes(self, scenario, num_of_nodes):
        """
        Fetches info about the worker nodes and add nodes (if required)

        Args:
            scenario (str): Scenario of app pods running on OCS or dedicated nodes
                (eg., 'colocated', 'dedicated')
            num_of_nodes (int): number of nodes required for running test

        Returns:
            tuple: tuple containing:
                list: list of OCS nodes name
                list: list of non-OCS nodes name

        """
        nodes_to_add = 0
        initial_worker_nodes = helpers.get_worker_nodes()
        # Workers already carrying the OCS storage label
        ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
        non_ocs_nodes = list(set(initial_worker_nodes) - set(ocs_nodes))

        # NOTE(review): the colocated shortfall is computed against the
        # total worker count rather than len(ocs_nodes) — confirm intended
        if 'colocated' in scenario and len(ocs_nodes) < num_of_nodes:
            nodes_to_add = num_of_nodes - len(initial_worker_nodes)

        if 'dedicated' in scenario and len(non_ocs_nodes) < num_of_nodes:
            nodes_to_add = num_of_nodes - len(non_ocs_nodes)

        if nodes_to_add > 0:
            logger.info(f"{nodes_to_add} extra workers nodes needed")

            if config.ENV_DATA['deployment_type'] == 'ipi':
                # IPI: scale up the machineset of a randomly picked worker
                machine_name = machine.get_machine_from_node_name(
                    random.choice(initial_worker_nodes))
                machineset_name = machine.get_machineset_from_machine_name(
                    machine_name)
                machineset_replica_count = machine.get_replica_count(
                    machineset_name)
                machine.add_node(machineset_name,
                                 count=machineset_replica_count + nodes_to_add)
                logger.info("Waiting for the new node(s) to be in ready state")
                machine.wait_for_new_node_to_be_ready(machineset_name)
            else:
                if config.ENV_DATA.get(
                        'platform').lower() == constants.VSPHERE_PLATFORM:
                    pytest.skip(
                        "Skipping add node in VSPHERE due to https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
                    )
                # UPI: add RHEL or RHCOS workers depending on configuration
                is_rhel = config.ENV_DATA.get(
                    'rhel_workers') or config.ENV_DATA.get('rhel_user')
                node_type = constants.RHEL_OS if is_rhel else constants.RHCOS
                node.add_new_node_and_label_upi(node_type=node_type,
                                                num_nodes=nodes_to_add,
                                                mark_for_ocs_label=False)

            new_worker_nodes = helpers.get_worker_nodes()
            new_nodes_added = list(
                set(new_worker_nodes) - set(initial_worker_nodes))
            assert len(
                new_nodes_added
            ) == nodes_to_add, 'Extra nodes not added in the cluster'
            # Freshly added nodes are unlabeled, so they count as non-OCS
            non_ocs_nodes += new_nodes_added

        if 'colocated' in scenario and len(ocs_nodes) < num_of_nodes:
            logger.info('Adding OCS storage label to Non-OCS workers')
            node_obj = ocp.OCP(kind=constants.NODE)
            # Label only as many workers as needed to reach num_of_nodes
            nodes_to_label = non_ocs_nodes[0:(num_of_nodes - len(ocs_nodes))]
            for node_name in nodes_to_label:
                node_obj.add_label(resource_name=node_name,
                                   label=constants.OPERATOR_NODE_LABEL)
                ocs_nodes.append(node_name)
            non_ocs_nodes = list(set(non_ocs_nodes) - set(ocs_nodes))

        logger.info(f"The OCS nodes are : {ocs_nodes}")
        logger.info(f"The Non-OCS nodes are: {non_ocs_nodes}")
        return ocs_nodes, non_ocs_nodes
コード例 #22
0
ファイル: scale_lib.py プロジェクト: phlogistonjohn/ocs-ci
def check_and_add_enough_worker(worker_count):
    """
    Function to check if there is enough workers available to scale pods.
    If there are not enough workers, workers will be added on supported
    platforms. The function also adds the scale label to the respective
    worker nodes.

    Args:
        worker_count (int): Expected worker count to be present in the setup

    Returns:
        bool: True if there is enough worker count, else an exception is raised.

    Raises:
        UnsupportedPlatformError: on UPI vSphere/baremetal/Azure deployments
        UnavailableResourceException: when workers cannot be added
    """
    # Check either to use OCS workers for scaling app pods
    # Further continue to label the worker with scale label else not
    worker_list = helpers.get_worker_nodes()
    ocs_worker_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    scale_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if config.RUN.get('use_ocs_worker_for_scale'):
        if not scale_worker:
            helpers.label_worker_node(
                node_list=worker_list, label_key='scale-label', label_value='app-scale'
            )
    else:
        if not scale_worker:
            for node_item in ocs_worker_list:
                # Guard the removal: an OCS worker missing from the current
                # worker list would otherwise raise ValueError (matches the
                # guarded pattern used in the AWS branch below)
                if node_item in worker_list:
                    worker_list.remove(node_item)
            if worker_list:
                helpers.label_worker_node(
                    node_list=worker_list, label_key='scale-label', label_value='app-scale'
                )
    scale_worker_list = machine.get_labeled_nodes(constants.SCALE_LABEL)
    logging.info(f"Print existing scale worker {scale_worker_list}")

    # Check if there is enough nodes to continue scaling of app pods
    if len(scale_worker_list) >= worker_count:
        logging.info(
            f"Setup has expected worker count {worker_count} "
            "to continue scale of pods"
        )
        return True

    logging.info(
        "There is no enough worker in the setup, will add enough worker "
        "for the automation supported platforms"
    )
    # Add enough worker for AWS
    if config.ENV_DATA['deployment_type'] == 'ipi' and config.ENV_DATA['platform'].lower() == 'aws':
        # Create machineset for app worker nodes on each aws zone
        # Each zone will have one app worker node
        ms_name = list()
        for obj in machine.get_machineset_objs():
            if 'app' in obj.name:
                ms_name.append(obj.name)
        if not ms_name:
            # Three machinesets means one per AZ; otherwise use zone 'a' only
            if len(machine.get_machineset_objs()) == 3:
                for zone in ['a', 'b', 'c']:
                    ms_name.append(
                        machine.create_custom_machineset(instance_type='m5.4xlarge', zone=zone)
                    )
            else:
                ms_name.append(
                    machine.create_custom_machineset(instance_type='m5.4xlarge', zone='a')
                )
            for ms in ms_name:
                machine.wait_for_new_node_to_be_ready(ms)
        # Split the expected count evenly across the zones
        if len(ms_name) == 3:
            exp_count = int(worker_count / 3)
        else:
            exp_count = worker_count
        for name in ms_name:
            machine.add_node(machine_set=name, count=exp_count)
        for ms in ms_name:
            machine.wait_for_new_node_to_be_ready(ms)
        # Re-read the node lists and label the new non-OCS workers
        worker_list = helpers.get_worker_nodes()
        ocs_worker_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
        scale_label_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
        ocs_worker_list.extend(scale_label_worker)
        final_list = list(dict.fromkeys(ocs_worker_list))
        for node_item in final_list:
            if node_item in worker_list:
                worker_list.remove(node_item)
        if worker_list:
            helpers.label_worker_node(
                node_list=worker_list, label_key='scale-label', label_value='app-scale'
            )
        return True
    elif (
        config.ENV_DATA['deployment_type'] == 'upi'
        and config.ENV_DATA['platform'].lower() in ('vsphere', 'baremetal', 'azure')
    ):
        # Automated node addition is not implemented for these UPI platforms
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
    else:
        raise UnavailableResourceException(
            "There is no enough worker nodes to continue app pod scaling"
        )
コード例 #23
0
    def test_node_replacement_reactive_aws_ipi(
        self, nodes, pvc_factory, pod_factory, dc_pod_factory,
        failure, interface
    ):
        """
        Knip-894 Node replacement - AWS-IPI-Reactive

        Induces a failure (power off or network failure) on a node that
        runs both an OSD and an app pod, deletes its machine so the
        machineset replaces it, labels the replacement node for OCS and
        verifies that app pods and OCS pods recover and the cluster
        stays healthy.
        """
        # Get worker nodes
        initial_nodes = get_worker_nodes()

        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}"
        )

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "power off":
            # Power off AWS worker node instance
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: {failure_node_obj[0].name}")
        elif failure == "network failure":
            # Induce Network failure
            node_network_failure([failure_node_obj[0].name])

        # Add annotation to the failed node so machine deletion does not
        # wait for a drain that cannot complete on a dead node
        annotation = "machine.openshift.io/exclude-node-draining=''"
        machine.add_annotation_to_machine(
            annotation=annotation, machine_name=machine_name
        )

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Wait for the new machine to spin
        log.info("Waiting for the new node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)

        # Get the node name of new spun node
        nodes_after_new_spun_node = get_worker_nodes()
        new_spun_node = list(
            set(nodes_after_new_spun_node) - set(initial_nodes)
        )
        log.info(f"New spun node is {new_spun_node}")

        # Label it
        node_obj = ocp.OCP(kind='node')
        node_obj.add_label(
            resource_name=new_spun_node[0],
            label=constants.OPERATOR_NODE_LABEL
        )
        log.info(
            f"Successfully labeled {new_spun_node} with OCS storage label"
        )

        # DC app pods on the failed node will get automatically created on other
        # running node. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(
            dc_pod_obj, timeout=1200
        )
        log.info("All the dc pods reached running state")

        # Check all OCS pods status, they should be in running state
        all_pod_obj = pod.get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        for pod_obj in all_pod_obj:
            # Skip completed deployer pods and OSD deviceset pods.
            # Bug fix: the original condition was
            # `'-1-deploy' and 'ocs-deviceset' not in pod_obj.name`,
            # where the truthy literal '-1-deploy' made the first test a
            # no-op, so deployer pods were never skipped.
            if ('-1-deploy' not in pod_obj.name
                    and 'ocs-deviceset' not in pod_obj.name):
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj, state=constants.STATUS_RUNNING,
                        timeout=1800
                    )
                except ResourceWrongStatusException:
                    # 'rook-ceph-crashcollector' on the failed node stucks at
                    # pending state. BZ 1810014 tracks it.
                    # Ignoring 'rook-ceph-crashcollector' pod health check as
                    # WA and deleting its deployment so that the pod
                    # disappears. Will revert this WA once the BZ is fixed
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE
                        )
                        pod_name = pod_obj.name
                        deployment_name = '-'.join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
 def finalizer():
     """Teardown: remove the 'dc' label from every worker node."""
     worker_nodes = get_worker_nodes()
     # Removing created label on all worker nodes
     remove_label_from_worker_node(worker_nodes, label_key="dc")
コード例 #25
0
    def test_nodereplacement_proactive(self, pvc_factory, pod_factory,
                                       dc_pod_factory):
        """
        Knip-894 Node Replacement proactive

        Picks a node running an OSD, drains and replaces it via its
        machineset, labels the replacement node for OCS and verifies
        cluster health afterwards.
        """

        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        # Pick the node of a random OSD pod as the replacement target
        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        # App pods go on every worker except the one being replaced
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                rbd_dc_pod = dc_pod_factory(interface=constants.CEPHBLOCKPOOL,
                                            node_name=worker_node,
                                            size=20)
                pod.run_io_in_bg(rbd_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                cephfs_dc_pod = dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM,
                    node_name=worker_node,
                    size=20)
                pod.run_io_in_bg(cephfs_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        # Unscheduling node
        node.unschedule_nodes([osd_node_name])
        # Draining Node
        node.drain_nodes([osd_node_name])
        log.info("Getting machine name from specified node name")
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"Node {osd_node_name} associated machine is {machine_name}")
        log.info(
            f"Deleting machine {machine_name} and waiting for new machine to come up"
        )
        machine.delete_machine_and_check_state_of_new_spinned_machine(
            machine_name)
        new_machine_list = machine.get_machines()
        # NOTE(review): if no machine matches the prefix below,
        # new_machine_name stays unbound and the next line raises
        # NameError — confirm the machineset always respins a machine
        for machines in new_machine_list:
            # Trimming is done to get just machine name
            # eg:- machine_name:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b-nlgkr
            # After trimming:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b
            if re.match(machines.name[:-6], machine_name):
                new_machine_name = machines.name
        machineset_name = machine.get_machineset_from_machine_name(
            new_machine_name)
        log.info("Waiting for new worker node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)
        new_node_name = node.get_node_from_machine_name(new_machine_name)
        log.info("Adding ocs label to newly created worker node")
        node_obj = ocp.OCP(kind='node')
        node_obj.add_label(resource_name=new_node_name,
                           label=constants.OPERATOR_NODE_LABEL)
        log.info(
            f"Successfully labeled {new_node_name} with OCS storage label")
        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()
        # Verify everything running fine
        log.info(
            "Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check()
コード例 #26
0
    def identify_and_add_nodes(self, scenario, num_of_nodes):
        """
        Collect worker-node info for the given scenario and, when the
        cluster is short of workers, add and/or label nodes as needed.

        Args:
            scenario (str): Scenario of app pods running on OCS or dedicated nodes
                (eg., 'colocated', 'dedicated')
            num_of_nodes (int): number of nodes required for running test

        Returns:
            tuple: tuple containing:
                list: list of OCS nodes name
                list: list of non-OCS nodes name

        """
        initial_worker_nodes = helpers.get_worker_nodes()
        ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
        non_ocs_nodes = list(set(initial_worker_nodes) - set(ocs_nodes))

        # Work out how many extra workers this scenario needs
        nodes_to_add = 0
        if 'colocated' in scenario and len(ocs_nodes) < num_of_nodes:
            nodes_to_add = num_of_nodes - len(initial_worker_nodes)
        if 'dedicated' in scenario and len(non_ocs_nodes) < num_of_nodes:
            nodes_to_add = num_of_nodes - len(non_ocs_nodes)

        if nodes_to_add > 0:
            logger.info(f"{nodes_to_add} extra workers nodes needed")
            if config.ENV_DATA['deployment_type'] == 'ipi':
                # Scale up the machineset that backs a random existing worker
                seed_machine = machine.get_machine_from_node_name(
                    random.choice(initial_worker_nodes)
                )
                target_machineset = machine.get_machineset_from_machine_name(
                    seed_machine
                )
                current_replicas = machine.get_replica_count(
                    target_machineset
                )
                machine.add_node(
                    target_machineset,
                    count=current_replicas + nodes_to_add
                )
                logger.info("Waiting for the new node(s) to be in ready state")
                machine.wait_for_new_node_to_be_ready(target_machineset)
            else:
                # TODO: Add required num of nodes instead of skipping
                # https://github.com/red-hat-storage/ocs-ci/issues/1291
                pytest.skip("Add node not implemented for UPI, github issue #1291")

            current_workers = helpers.get_worker_nodes()
            new_nodes_added = [
                name for name in current_workers
                if name not in initial_worker_nodes
            ]
            assert len(new_nodes_added) > 0, 'Extra nodes not added in the cluster'
            non_ocs_nodes += new_nodes_added

        if 'colocated' in scenario and len(ocs_nodes) < num_of_nodes:
            logger.info('Adding OCS storage label to Non-OCS workers')
            node_obj = ocp.OCP(kind=constants.NODE)
            # Only label enough workers to close the gap to num_of_nodes
            for node_name in non_ocs_nodes[:num_of_nodes - len(ocs_nodes)]:
                node_obj.add_label(
                    resource_name=node_name, label=constants.OPERATOR_NODE_LABEL
                )
                ocs_nodes.append(node_name)
            non_ocs_nodes = list(set(non_ocs_nodes) - set(ocs_nodes))

        logger.info(f"The OCS nodes are : {ocs_nodes}")
        logger.info(f"The Non-OCS nodes are: {non_ocs_nodes}")
        return ocs_nodes, non_ocs_nodes
コード例 #27
0
def add_worker_node(instance_type=None):
    """
    Label existing workers for scale workloads and, on AWS IPI, add
    dedicated app worker nodes (one 'app' machineset per zone) and label
    them as well.

    Args:
        instance_type (str): AWS instance type for the new app workers.
            Defaults to 'm5.4xlarge' when not provided.

    Returns:
        bool: True when app workers were added/labeled on AWS IPI.
            Implicitly returns None on the UPI platforms that reuse
            existing workers.

    Raises:
        UnsupportedPlatformError: For UPI deployments on Azure.

    """
    # ms_name is kept global so teardown/removal helpers elsewhere in the
    # module can delete the machinesets created here.
    global ms_name
    ms_name = list()
    worker_list = helpers.get_worker_nodes()
    ocs_worker_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    scale_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if config.RUN.get('use_ocs_worker_for_scale'):
        # Scale workloads may share OCS workers: label every worker
        if not scale_worker:
            helpers.label_worker_node(node_list=worker_list,
                                      label_key='scale-label',
                                      label_value='app-scale')
    else:
        # Keep scale workloads off the OCS workers
        if not scale_worker:
            for node_item in ocs_worker_list:
                worker_list.remove(node_item)
            if worker_list:
                helpers.label_worker_node(node_list=worker_list,
                                          label_key='scale-label',
                                          label_value='app-scale')
    scale_worker_list = machine.get_labeled_nodes(constants.SCALE_LABEL)
    # Use the module logger consistently (was logging.info on root logger)
    log.info(f"Print existing scale worker {scale_worker_list}")

    if config.ENV_DATA['deployment_type'] == 'ipi' and config.ENV_DATA[
            'platform'].lower() == 'aws':
        log.info("Adding worker nodes on the current cluster")
        # Reuse any 'app' machinesets that already exist
        for obj in machine.get_machineset_objs():
            if 'app' in obj.name:
                ms_name.append(obj.name)
        # Fix: original code assigned instance_type to itself in the
        # non-None branch; simply default when the caller passed nothing.
        if instance_type is None:
            instance_type = 'm5.4xlarge'
        if not ms_name:
            # 3 existing machinesets implies a 3-AZ cluster: create one
            # app machineset per zone; otherwise a single one in zone 'a'.
            if len(machine.get_machineset_objs()) == 3:
                for zone in ['a', 'b', 'c']:
                    ms_name.append(
                        machine.create_custom_machineset(
                            instance_type=instance_type, zone=zone))
            else:
                ms_name.append(
                    machine.create_custom_machineset(
                        instance_type=instance_type, zone='a'))
            for ms in ms_name:
                machine.wait_for_new_node_to_be_ready(ms)

        # Label every worker that is neither OCS nor already scale-labeled
        worker_list = helpers.get_worker_nodes()
        ocs_worker_list = machine.get_labeled_nodes(
            constants.OPERATOR_NODE_LABEL)
        scale_label_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
        ocs_worker_list.extend(scale_label_worker)
        # dict.fromkeys de-duplicates while preserving order
        final_list = list(dict.fromkeys(ocs_worker_list))
        for node_item in final_list:
            if node_item in worker_list:
                worker_list.remove(node_item)
        if worker_list:
            helpers.label_worker_node(node_list=worker_list,
                                      label_key='scale-label',
                                      label_value='app-scale')
        return True
    elif config.ENV_DATA['deployment_type'] == 'upi' and config.ENV_DATA[
            'platform'].lower() == 'vsphere':
        log.info('Running pgsql on existing worker nodes')
    elif config.ENV_DATA['deployment_type'] == 'upi' and config.ENV_DATA[
            'platform'].lower() == 'baremetal':
        log.info('Running pgsql on existing worker nodes')
    elif config.ENV_DATA['deployment_type'] == 'upi' and config.ENV_DATA[
            'platform'].lower() == 'azure':
        raise UnsupportedPlatformError("Unsupported Platform")
# --- Code example #28 (scrape separator) ---
    def test_all_worker_nodes_short_network_failure(self, nodes, setup,
                                                    node_restart_teardown):
        """
        OCS-1432/OCS-1433:
        - Start DeploymentConfig based app pods
        - Make all the worker nodes unresponsive by doing abrupt network failure
        - Reboot the unresponsive node after short duration of ~300 seconds
        - When unresponsive node recovers, app pods and ceph cluster should recover
        - Again run IOs from app pods

        Args:
            nodes: Platform nodes object used to stop/start/restart nodes
            setup: Fixture providing the list of pre-created app pod objects
            node_restart_teardown: Fixture restoring node state on test exit
        """
        pod_objs = setup
        worker_nodes = helpers.get_worker_nodes()

        # Run IO on pods
        logger.info(f"Starting IO on {len(pod_objs)} app pods")
        with ThreadPoolExecutor() as executor:
            for pod_obj in pod_objs:
                logger.info(f"Starting IO on pod {pod_obj.name}")
                # Block-mode PVCs need 'block' fio; filesystem PVCs use 'fs'
                storage_type = ('block' if pod_obj.pvc.volume_mode == 'Block'
                                else 'fs')
                executor.submit(pod_obj.run_io,
                                storage_type=storage_type,
                                size='2G',
                                runtime=30,
                                fio_filename=f'{pod_obj.name}_io_f1')

        logger.info(f"IO started on all {len(pod_objs)} app pods")

        # Wait for IO results
        for pod_obj in pod_objs:
            pod.get_fio_rw_iops(pod_obj)

        # Induce network failure on all worker nodes
        # NOTE(review): third positional arg is False here — presumably a
        # "wait" flag so the failures are induced in parallel; confirm
        # against node.node_network_failure's signature.
        with ThreadPoolExecutor() as executor:
            for node_name in worker_nodes:
                executor.submit(node.node_network_failure, node_name, False)

        node.wait_for_nodes_status(node_names=worker_nodes,
                                   status=constants.NODE_NOT_READY)

        # Keep the nodes unresponsive only for a short window (~300s)
        logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
        sleep(self.short_nw_fail_time)

        # Reboot the worker nodes
        logger.info(f"Stop and start the worker nodes: {worker_nodes}")
        nodes.restart_nodes_by_stop_and_start(node.get_node_objs(worker_nodes))

        try:
            node.wait_for_nodes_status(node_names=worker_nodes,
                                       status=constants.NODE_READY)
            logger.info(
                "Verifying StorageCluster pods are in running/completed state")
            pod.wait_for_storage_pods(timeout=720)
        except ResourceWrongStatusException:
            # Restart nodes
            # NOTE(review): after this fallback restart the storage-pod
            # check is not repeated; the ceph_health_check below is the
            # only remaining gate — confirm this is intentional.
            nodes.restart_nodes(node.get_node_objs(worker_nodes))

        assert ceph_health_check(tries=80), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

        # Get current info of app pods
        new_pod_objs = list()
        for pod_obj in pod_objs:
            # Re-resolve each DC's live pod: the original pod may have
            # been rescheduled/replaced during the outage
            pod_label = pod_obj.labels.get('deploymentconfig')
            pods_data = pod.get_pods_having_label(
                f'deploymentconfig={pod_label}', pod_obj.namespace)
            current_pods = [
                pod_data.get('metadata').get('name') for pod_data in pods_data
                if '-deploy' not in pod_data.get('metadata').get('name')
            ]
            logger.info(f'Pods with label {pod_label}: {current_pods}')

            # Remove the older pod from the list if pod is rescheduled
            if len(current_pods) > 1:
                current_pods.remove(pod_obj.name)

            new_pod_obj = pod.get_pod_obj(current_pods.pop(),
                                          pod_obj.namespace)
            # Carry the PVC reference over so storage_type can be derived
            new_pod_obj.pvc = pod_obj.pvc
            new_pod_objs.append(new_pod_obj)

        logger.info("Wait for app pods are in running state")
        for pod_obj in new_pod_objs:
            pod_obj.ocp.wait_for_resource(condition=constants.STATUS_RUNNING,
                                          resource_name=pod_obj.name,
                                          timeout=720,
                                          sleep=20)
        logger.info("All the app pods reached running state")

        # Run more IOs on app pods
        with ThreadPoolExecutor() as executor:
            for pod_obj in new_pod_objs:
                logger.info(f"Starting IO on pod {pod_obj.name}")
                # Force fio workload re-setup on the (possibly new) pod
                pod_obj.wl_setup_done = False
                storage_type = ('block' if pod_obj.pvc.volume_mode == 'Block'
                                else 'fs')
                executor.submit(pod_obj.run_io,
                                storage_type=storage_type,
                                size='1G',
                                runtime=30,
                                fio_filename=f'{pod_obj.name}_io_f2')

        for pod_obj in new_pod_objs:
            pod.get_fio_rw_iops(pod_obj)
# --- Code example #29 (scrape separator) ---
    def test_node_replacement_reactive_aws_ipi(self, nodes, pvc_factory,
                                               pod_factory, dc_pod_factory,
                                               failure, interface):
        """
        Knip-894 Node replacement - AWS-IPI-Reactive

        Pick a worker that hosts both an OSD and an app pod, induce the
        parameterized failure on it, delete its machine so the machineset
        spins a replacement, label the new node for OCS, and verify the
        app pods and cluster recover.

        Args:
            nodes: Platform nodes object used to power off instances
            pvc_factory: Fixture for creating PVCs in the sanity check
            pod_factory: Fixture for creating pods in the sanity check
            dc_pod_factory: Fixture for creating DeploymentConfig app pods
            failure (str): Failure mode, "power off" or "network failure"
            interface (str): Storage interface, "rbd" or "cephfs"

        """
        # Get worker nodes
        initial_nodes = get_worker_nodes()

        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            # node_selector pins the app pods onto the OSD nodes labeled above
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "power off":
            # Power off AWS worker node instance
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully powered off node: {failure_node_obj[0].name}")
        elif failure == "network failure":
            # Induce Network failure
            node_network_failure([failure_node_obj[0].name])

        # Add annotation to the failed node
        # Skip draining: the node is already unresponsive, so draining
        # would block machine deletion
        annotation = "machine.openshift.io/exclude-node-draining=''"
        machine.add_annotation_to_machine(annotation=annotation,
                                          machine_name=machine_name)

        # Delete the machine
        # The machineset keeps its replica count, so deleting the machine
        # triggers creation of a replacement node
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Wait for the new machine to spin
        log.info("Waiting for the new node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)

        # Get the node name of new spun node
        nodes_after_new_spun_node = get_worker_nodes()
        new_spun_node = list(
            set(nodes_after_new_spun_node) - set(initial_nodes))
        log.info(f"New spun node is {new_spun_node}")

        # Label it
        node_obj = ocp.OCP(kind='node')
        node_obj.add_label(resource_name=new_spun_node[0],
                           label=constants.OPERATOR_NODE_LABEL)
        log.info(
            f"Successfully labeled {new_spun_node} with OCS storage label")

        # DC app pods on the failed node will get automatically created on other
        # running node. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                        timeout=1200)
        log.info("All the dc pods reached running state")

        nodes_helpers.wait_for_all_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()