Example #1
def add_capacity_test():
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD,
              namespace=config.ENV_DATA["cluster_namespace"])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * 3,
    )

    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # Commented out these lines as a workaround for bug 1842500

    # Verify OSDs are encrypted.
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=80)
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=5400), "Data re-balance failed to complete"
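Example #1 above, like most of the examples that follow, builds an OCP object scoped to a kind and namespace and then calls wait_for_resource with the expected condition, a label selector or resource name, a resource count, and a timeout. The following is a minimal sketch of that recurring pattern, not taken from ocs-ci itself; the helper name is made up and the import paths are assumed from the ocs_ci framework used in these examples.

import logging

from ocs_ci.framework import config
from ocs_ci.ocs import constants
from ocs_ci.ocs.ocp import OCP

log = logging.getLogger(__name__)


def wait_for_osd_pods(expected_count, timeout=300):
    # Illustrative helper, not part of ocs-ci; the import paths above are assumed.
    # Scope the OCP helper to pods in the storage cluster namespace.
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    log.info("Waiting for %s OSD pods to reach Running", expected_count)
    # Block until the expected number of OSD pods report Running, or time out.
    pod.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=expected_count,
    )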
Example #2
    def wait_for_wl_to_finish(self, fio_client_pod):
        """
        Waiting until the workload is finished

        Args:
            fio_client_pod (obj): the FIO client pod object

        Raises:
            IOError: in case FIO failed to finish
        Returns:
            str: the end time of the workload

        """
        log.info("Waiting for fio_client to complete")
        pod_obj = OCP(kind="pod")
        pod_obj.wait_for_resource(
            condition="Completed",
            resource_name=fio_client_pod,
            timeout=18000,
            sleep=300,
        )

        # Getting the end time of the test
        end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

        output = run_cmd(f"oc logs {fio_client_pod}")
        log.info(f"The Test log is : {output}")

        if "Fio failed to execute" not in output:
            log.info("FIO has completed successfully")
        else:
            raise IOError("FIO failed to complete")

        return end_time
Example #3
def verify_provider_resources():
    """
    Verify resources specific to managed OCS provider:
    1. Ocs-provider-server pod is Running
    2. cephcluster is Ready and its hostNetworking is set to True
    3. Security groups are set up correctly
    """
    # Verify ocs-provider-server pod is Running
    pod_obj = OCP(
        kind="pod",
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    pod_obj.wait_for_resource(condition="Running",
                              selector="app=ocsProviderApiServer",
                              resource_count=1)

    # Verify that cephcluster is Ready and hostNetworking is True
    cephcluster = OCP(kind="CephCluster",
                      namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    cephcluster_yaml = cephcluster.get().get("items")[0]
    log.info("Verifying that cephcluster is Ready and hostNetworking is True")
    assert (
        cephcluster_yaml["status"]["phase"] == "Ready"
    ), f"Status of cephcluster ocs-storagecluster-cephcluster is {cephcluster_yaml['status']['phase']}"
    assert cephcluster_yaml["spec"]["network"][
        "hostNetwork"], f"hostNetwork is {cephcluster_yaml['spec']['network']['hostNetwork']}"

    assert verify_worker_nodes_security_groups()
Example #4
    def check_scale_pods_and_pvcs_created_on_consumers(self):
        for consumer_i, fio_scale in self.consumer_i_per_fio_scale.items():
            config.switch_ctx(consumer_i)
            c_name = config.ENV_DATA.get("cluster_name")
            ocp_pvc = OCP(kind=constants.PVC, namespace=fio_scale.namespace)
            ocp_pvc.wait_for_resource(
                timeout=30,
                condition=constants.STATUS_BOUND,
                resource_count=self.scale_count,
            )
            log.info(
                f"All the PVCs were created successfully on the consumer {c_name}"
            )

            ocp_pod = OCP(kind=constants.POD, namespace=fio_scale.namespace)
            ocp_pod.wait_for_resource(
                timeout=30,
                condition=constants.STATUS_COMPLETED,
                resource_count=self.expected_pod_num,
            )
            log.info(
                f"All the pods were created successfully on the consumer {c_name}"
            )

        log.info(
            "All the pods and PVCs were created successfully on the consumers")
Example #5
    def test_scale_osds_reboot_nodes(self, interface, project_factory,
                                     multi_pvc_factory, dc_pod_factory):
        """
        Check storage utilization; if it is less than 50%, run IO,
        scale OSDs from 3 to 6, check for rebalance and reboot workers
        """
        current_osd_count = count_cluster_osd()
        proj_obj = project_factory()
        if current_osd_count == 3:
            while not validate_osd_utilization(osd_used=50):
                # Create pvc
                pvc_objs = multi_pvc_factory(project=proj_obj,
                                             interface=interface,
                                             size=self.pvc_size,
                                             num_of_pvc=self.num_of_pvcs)

                dc_pod_objs = list()
                for pvc_obj in pvc_objs:
                    dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj))

                wait_for_dc_app_pods_to_reach_running_state(dc_pod_objs)

                for pod_obj in dc_pod_objs:
                    pod_obj.run_io(storage_type='fs',
                                   size='3G',
                                   runtime='60',
                                   fio_filename=f'{pod_obj.name}_io')

        # Add capacity
        osd_size = storage_cluster.get_osd_size()
        count = storage_cluster.add_capacity(osd_size)
        pod = OCP(kind=constants.POD,
                  namespace=config.ENV_DATA['cluster_namespace'])
        pod.wait_for_resource(timeout=300,
                              condition=constants.STATUS_RUNNING,
                              selector='app=rook-ceph-osd',
                              resource_count=count * 3)
        assert ceph_health_check(), "New OSDs failed to reach running state"

        cluster = CephCluster()

        # Get rebalance status
        rebalance_status = cluster.get_rebalance_status()
        logger.info(rebalance_status)
        if rebalance_status:
            time_taken = cluster.time_taken_to_complete_rebalance()
            logger.info(f"The time taken to complete rebalance {time_taken}")

        # Rolling reboot on worker nodes
        worker_nodes = get_typed_nodes(node_type='worker')

        factory = platform_nodes.PlatformNodesFactory()
        nodes = factory.get_nodes_platform()

        for node in worker_nodes:
            nodes.restart_nodes(nodes=[node])
            wait_for_nodes_status()

        assert ceph_health_check(
            delay=180), "Failed, Ceph health bad after nodes reboot"
Example #6
    def test_rgw_host_node_failure(
        self, nodes, node_restart_teardown, mcg_obj, bucket_factory
    ):
        """
        Test case to fail the node hosting the RGW pod and noobaa-db-0,
        and verify that a new pod spins up on a healthy node

        """
        # Get rgw pods
        rgw_pod_obj = get_rgw_pods()

        # Get noobaa pods
        noobaa_pod_obj = get_noobaa_pods()

        # Get the node where noobaa-db is hosted
        for noobaa_pod in noobaa_pod_obj:
            if noobaa_pod.name == "noobaa-db-0":
                noobaa_pod_node = get_pod_node(noobaa_pod)

        for rgw_pod in rgw_pod_obj:
            pod_node = rgw_pod.get().get("spec").get("nodeName")
            if pod_node == noobaa_pod_node.name:
                # Stop the node
                log.info(
                    f"Stopping node {pod_node} where"
                    f" rgw pod {rgw_pod.name} and noobaa-db-0 hosted"
                )
                node_obj = get_node_objs(node_names=[pod_node])
                nodes.stop_nodes(node_obj)

                # Validate that the old rgw pod went into Terminating state
                wait_for_resource_state(
                    resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
                )

                # Validate that a new rgw pod spun up
                ocp_obj = OCP(
                    kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
                )
                ocp_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    resource_count=len(rgw_pod_obj),
                    selector=constants.RGW_APP_LABEL,
                )

                # Create OBC and read and write
                self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-1")

                # Start the node
                nodes.start_nodes(node_obj)

                # Create OBC and read and write
                self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

        # Verify cluster health
        self.sanity_helpers.health_check()

        # Verify all storage pods are running
        wait_for_storage_pods()
Example #7
    def test_fio_workload_simple(self, ripsaw, interface, io_pattern):
        """
        This is a basic fio perf test
        """
        # Deploy ripsaw operator
        log.info("Deploying ripsaw operator")
        ripsaw.apply_crd(
            'resources/crds/'
            'ripsaw_v1alpha1_ripsaw_crd.yaml'
        )
        sc = 'ocs-storagecluster-ceph-rbd' if interface == 'CephBlockPool' else 'ocs-storagecluster-cephfs'

        # Create fio benchmark
        log.info("Create resource file for fio workload")
        fio_cr = templating.load_yaml(constants.FIO_CR_YAML)
        # Todo: have pvc_size set to 'get_osd_pods_memory_sum * 5'
        #  once pr-2037 is merged
        fio_cr['spec']['clustername'] = config.ENV_DATA['platform'] + get_build() + get_ocs_version()
        fio_cr['spec']['test_user'] = get_ocs_version() + interface + io_pattern
        fio_cr['spec']['workload']['args']['storageclass'] = sc
        if io_pattern == 'sequential':
            fio_cr['spec']['workload']['args']['jobs'] = ['write', 'read']
        log.info(f'fio_cr: {fio_cr}')
        fio_cr_obj = OCS(**fio_cr)
        fio_cr_obj.create()

        # Wait for fio client pod to be created
        for fio_pod in TimeoutSampler(
            300, 20, get_pod_name_by_pattern, 'fio-client', 'my-ripsaw'
        ):
            try:
                if fio_pod[0] is not None:
                    fio_client_pod = fio_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        # Wait for fio pod to initialize and complete
        log.info("Waiting for fio_client to complete")
        pod_obj = OCP(kind='pod')
        pod_obj.wait_for_resource(
            condition='Completed',
            resource_name=fio_client_pod,
            timeout=18000,
            sleep=300,
        )

        output = run_cmd(f'oc logs {fio_client_pod}')

        if 'Fio failed to execute' not in output:
            log.info("FIO has completed successfully")
        else:
            log.error("FIO failed to complete")

        # Clean up fio benchmark
        log.info("Deleting FIO benchmark")
        fio_cr_obj.delete()
        analyze_regression(io_pattern, sc, es_username=fio_cr['spec']['test_user'])
Example #8
    def test_add_capacity_node_restart(
        self,
        nodes,
        multi_pvc_factory,
        pod_factory,
        workload_storageutilization_rbd,
        num_of_nodes,
    ):
        """
        Test add capacity when one of the worker nodes gets restarted in the middle of the process
        """
        logging.info(
            "Condition 1 to start the test is met: storageutilization is completed"
        )
        # Please note: when the branch 'wip-add-capacity-e_e' is merged into master,
        # the test will include much more data both before and after calling the 'add_capacity' function.

        node_list = get_ocs_nodes(num_of_nodes=num_of_nodes)
        assert node_list, "Condition 2 to start test failed: No node to restart"

        max_osds = 15
        osd_pods_before = pod_helpers.get_osd_pods()
        assert (
            len(osd_pods_before) < max_osds
        ), "Condition 3 to start test failed: We have maximum of osd's in the cluster"
        logging.info("All start conditions are met!")

        osd_size = storage_cluster.get_osd_size()
        logging.info("Calling add_capacity function...")
        result = storage_cluster.add_capacity(osd_size)
        if result:
            logging.info("add capacity finished successfully")
        else:
            logging.info("add capacity failed")

        # Restart nodes while additional storage is being added
        logging.info("Restart nodes:")
        logging.info([n.name for n in node_list])
        nodes.restart_nodes(nodes=node_list, wait=True)
        logging.info("Finished restarting the node list")

        # The exit criteria verification conditions here are not complete. Once the branch
        # 'wip-add-capacity-e_e' is merged into master, the functions from that branch will be used.

        pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
        pod.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector="app=rook-ceph-osd",
            resource_count=result * 3,
        )

        # Verify OSDs are encrypted
        if config.ENV_DATA.get("encryption_at_rest"):
            osd_encryption_verification()

        logging.info("Finished verifying add capacity osd storage with node restart")
        logging.info("Waiting for ceph health check to finished...")
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=180)
Example #9
def add_capacity_test():
    osd_size = storage_cluster.get_osd_size()
    existing_osd_pods = get_osd_pods()
    existing_osd_pod_names = [pod.name for pod in existing_osd_pods]
    if ui_add_capacity_conditions():
        try:
            result = ui_add_capacity(osd_size)
        except Exception as e:
            logging.error(
                f"Add capacity via UI is not applicable and CLI method will be done. The error is {e}"
            )
            result = storage_cluster.add_capacity(osd_size)
    else:
        result = storage_cluster.add_capacity(osd_size)
    osd_pods_post_expansion = get_osd_pods()
    osd_pod_names_post_expansion = [
        pod.name for pod in osd_pods_post_expansion
    ]
    restarted_osds = list()
    logger.info(
        "Checking if existing OSD pods were restarted (deleted) post add capacity (bug 1931601)"
    )

    for pod in existing_osd_pod_names:
        if pod not in osd_pod_names_post_expansion:
            restarted_osds.append(pod)
    assert (
        len(restarted_osds) == 0
    ), f"The following OSD pods were restarted (deleted) post add capacity: {restarted_osds}"

    pod = OCP(kind=constants.POD,
              namespace=config.ENV_DATA["cluster_namespace"])
    if is_flexible_scaling_enabled():
        replica_count = 1
    else:
        replica_count = 3
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * replica_count,
    )

    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # Commented out these lines as a workaround for bug 1842500

    # Verify OSDs are encrypted.
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    check_ceph_health_after_add_capacity(ceph_rebalance_timeout=3600)
Example #10
    def test_delete_rook_ceph_osd_deployment(self):
        osd_deployments = get_osd_deployments()
        deployment_obj = OCP(kind=constants.DEPLOYMENT,
                             namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        pod_obj = OCP(kind=constants.POD,
                      namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        for osd_deployment in osd_deployments:
            # Get rook-ceph-osd pod name associated with the deployment
            osd_deployment_name = osd_deployment.name
            old_osd_pod = get_pod_name_by_pattern(
                pattern=osd_deployment_name,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )[0]

            logger.info(f"Deleting OSD deployment: {osd_deployment_name}")
            try:
                deployment_obj.delete(resource_name=osd_deployment_name)
                deployment_obj.wait_for_resource(
                    condition="0/1",
                    resource_name=osd_deployment_name,
                    column="READY")
            except CommandFailed as err:
                if "NotFound" not in str(err):
                    raise

            # Wait for new OSD deployment to be Ready
            deployment_obj.wait_for_resource(condition="1/1",
                                             resource_name=osd_deployment_name,
                                             column="READY")

            # Check if a new OSD pod is created
            new_osd_pod = get_pod_name_by_pattern(
                pattern=osd_deployment_name,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )[0]
            assert old_osd_pod != new_osd_pod, "New OSD pod not created"

            # Check if new OSD pod is up and running
            logger.info(
                "Waiting for a new OSD pod to get created and reach Running state"
            )
            assert pod_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=new_osd_pod,
                column="STATUS",
            ), f"New OSD pod {new_osd_pod} is not in {constants.STATUS_RUNNING} state"

        # If clusterwide encryption is enabled, verify that the new OSDs are encrypted
        if config.ENV_DATA.get("encryption_at_rest"):
            osd_encryption_verification()

        assert ceph_health_check(delay=120,
                                 tries=50), "Ceph health check failed"
Example #11
    def test_add_capacity(self):
        """
        Test to add variable capacity to the OSD cluster while IOs running
        """
        self.ceph_cluster = CephCluster()
        osd_size = storage_cluster.get_osd_size()
        result = storage_cluster.add_capacity(osd_size)
        pod = OCP(kind=constants.POD,
                  namespace=config.ENV_DATA['cluster_namespace'])
        pod.wait_for_resource(timeout=300,
                              condition=constants.STATUS_RUNNING,
                              selector='app=rook-ceph-osd',
                              resource_count=result * 3)
        self.ceph_cluster.cluster_health_check(timeout=1200)
Example #12
    def test_sql_workload_simple(self, ripsaw):
        """
        This is a basic pgsql workload
        """
        # Deploy postgres database
        log.info("Deploying postgres database")
        ripsaw.apply_crd('resources/crds/' 'ripsaw_v1alpha1_ripsaw_crd.yaml')
        ripsaw.setup_postgresql()

        # Create pgbench benchmark
        log.info("Create resource file for pgbench workload")
        pg_data = templating.load_yaml(constants.PGSQL_BENCHMARK_YAML)
        pg_obj = OCS(**pg_data)
        pg_obj.create()

        # Wait for pgbench pod to be created
        for pgbench_pod in TimeoutSampler(300, 3, get_pod_name_by_pattern,
                                          'pgbench-1-dbs-client', 'my-ripsaw'):
            try:
                if pgbench_pod[0] is not None:
                    pgbench_client_pod = pgbench_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        # Wait for pgbench pod to initialize and complete
        log.info("Waiting for pgbench_client to complete")
        pod_obj = OCP(kind='pod')
        pod_obj.wait_for_resource(
            condition='Completed',
            resource_name=pgbench_client_pod,
            timeout=800,
            sleep=10,
        )

        # Running pgbench and parsing logs
        output = run_cmd(f'oc logs {pgbench_client_pod}')
        pg_output = utils.parse_pgsql_logs(output)
        log.info("*******PGBench output log*********\n" f"{pg_output}")
        for data in pg_output:
            latency_avg = data['latency_avg']
            if not latency_avg:
                raise UnexpectedBehaviour("PGBench failed to run, "
                                          "no data found on latency_avg")
        log.info("PGBench has completed successfully")

        # Clean up pgbench benchmark
        log.info("Deleting PG bench benchmark")
        pg_obj.delete()
Example #13
    def _deploy_es(self):
        log.info('Deploy the PVC for the ElasticSearch cluster')
        self.ocp.apply(self.pvc)

        log.info('Deploy the ElasticSearch cluster')
        self.ocp.apply(self.crd)

        for es_pod in TimeoutSampler(
            300, 20, get_pod_name_by_pattern, 'quickstart-es-default', self.namespace
        ):
            try:
                if es_pod[0] is not None:
                    self.espod = es_pod[0]
                    log.info(f'The ElasticSearch pod {self.espod} Started')
                    break
            except IndexError:
                log.info('elasticsearch pod not ready yet')

        es_pod = OCP(kind='pod', namespace=self.namespace)
        log.info('Waiting for ElasticSearch to Run')
        assert es_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=self.espod,
            sleep=30,
            timeout=600
        )
        log.info('Elastic Search is ready !!!')
Example #14
    def test_rgw_pod_existence(self):
        if (config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS
                or storagecluster_independent_check()):
            if (not config.ENV_DATA["platform"] == constants.AZURE_PLATFORM
                    and not config.ENV_DATA["platform"]
                    == constants.IBMCLOUD_PLATFORM
                    and (version.get_semantic_ocs_version_from_config() >
                         version.VERSION_4_5)):
                logger.info("Checking whether RGW pod is not present")
                assert (
                    not pod.get_rgw_pods()
                ), "RGW pods should not exist in the current platform/cluster"

        elif (config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS
              and not config.ENV_DATA["mcg_only_deployment"]):
            rgw_count = get_rgw_count(config.ENV_DATA["ocs_version"],
                                      check_if_cluster_was_upgraded(), None)
            logger.info(
                f'Checking for RGW pod/s on {config.ENV_DATA.get("platform")} platform'
            )
            rgw_pod = OCP(kind=constants.POD,
                          namespace=config.ENV_DATA["cluster_namespace"])
            assert rgw_pod.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.RGW_APP_LABEL,
                resource_count=rgw_count,
                timeout=60,
            )
Example #15
def add_capacity_test():
    osd_size = storage_cluster.get_osd_size()
    existing_osd_pods = get_osd_pods()
    existing_osd_pod_names = [pod.name for pod in existing_osd_pods]
    result = storage_cluster.add_capacity(osd_size)
    osd_pods_post_expansion = get_osd_pods()
    osd_pod_names_post_expansion = [
        pod.name for pod in osd_pods_post_expansion
    ]
    restarted_osds = list()
    logger.info(
        "Checking if existing OSD pods were restarted (deleted) post add capacity (bug 1931601)"
    )

    for pod in existing_osd_pod_names:
        if pod not in osd_pod_names_post_expansion:
            restarted_osds.append(pod)
    assert (
        len(restarted_osds) == 0
    ), f"The following OSD pods were restarted (deleted) post add capacity: {restarted_osds}"

    pod = OCP(kind=constants.POD,
              namespace=config.ENV_DATA["cluster_namespace"])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * 3,
    )

    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # Commented out these lines as a workaround for bug 1842500

    # Verify OSDs are encrypted.
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=80)
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=5400), "Data re-balance failed to complete"
Example #16
def validate_monitoring_pods_are_respinned_and_running_state(pods_list):
    """
    Validate monitoring pods are respinned and in Running state

    Args:
        pods_list (list): List of the pods where PVCs are mounted
    """
    ocp = OCP(api_version='v1', kind='Pod', namespace='openshift-monitoring')
    assert ocp.wait_for_resource(
        condition=constants.STATUS_PENDING, resource_name=pods_list[0]
    ), (
        f"failed to reach pod {pods_list[0]} "
        f"desired status {constants.STATUS_PENDING}"
    )
    for pod in pods_list:
        assert ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING, resource_name=pod
        ), (
            f"failed to reach pod {pod} "
            f"desired status {constants.STATUS_RUNNING}"
        )
Example #17
    def wait_for_osd_pods_to_be_running(self, storagedeviceset_count):
        """
        The function gets the number of storage device sets in the cluster and waits
        for the osd pods to be in Running status.

        Args:
            storagedeviceset_count (int): the number of storage device set in the cluster

        """
        logging.info("starting function 'wait_for_osd_pods_to_be_running'")
        pod = OCP(kind=constants.POD,
                  namespace=config.ENV_DATA['cluster_namespace'])

        pod.wait_for_resource(timeout=420,
                              condition=constants.STATUS_RUNNING,
                              selector='app=rook-ceph-osd',
                              resource_count=storagedeviceset_count * 3)
        self.new_pods_in_status_running = True
Example #18
    def test_add_capacity_osd_pod_delete(self, workload_storageutilization_rbd):
        """
        Test add capacity when one of the osd pods gets deleted
        in the middle of the process.
        """
        used_percentage = get_percent_used_capacity()
        logging.info(f"storageutilization is completed. used capacity = {used_percentage}")

        max_osds = 15
        osd_pods_before = pod_helpers.get_osd_pods()
        number_of_osd_pods_before = len(osd_pods_before)
        if number_of_osd_pods_before >= max_osds:
            pytest.skip("We have maximum of osd's in the cluster")

        d = Disruptions()
        d.set_resource('osd')

        osd_size = storage_cluster.get_osd_size()
        logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
        storagedeviceset_count = storage_cluster.add_capacity(osd_size)
        logging.info("Adding one new set of OSDs was issued without problems")

        # The OSD count goes down by one and then gradually goes back up,
        # and finally the OSD count will be storagedeviceset_count * 3
        pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
        logging.info("Delete an osd pod while storage capacity is getting increased")
        d.delete_resource(1)

        pod = OCP(
            kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']
        )

        pod.wait_for_resource(
            timeout=420,
            condition=constants.STATUS_RUNNING,
            selector='app=rook-ceph-osd',
            resource_count=storagedeviceset_count * 3
        )

        logging.info("Finished verifying add capacity when one of the osd pods gets deleted")
        logging.info("Waiting for ceph health check to finished...")
        ceph_health_check(
            namespace=config.ENV_DATA['cluster_namespace'], tries=80
        )
Example #19
    def wait_for_wl_to_finish(self, fio_client_pod):
        """
        Waiting until the workload is finished

        Args:
            fio_client_pod (obj): the FIO client pod object

        Returns:
            str: the end time of the workload

        """
        if dev_mode:
            timeout = 3600
            sleeptime = 30
        else:
            timeout = 18000
            sleeptime = 300

        log.info("Waiting for fio_client to complete")
        pod_obj = OCP(kind="pod")
        pod_obj.wait_for_resource(
            condition="Completed",
            resource_name=fio_client_pod,
            timeout=timeout,
            sleep=sleeptime,
        )

        # Getting the end time of the test
        end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

        output = run_cmd(f"oc logs {fio_client_pod}")
        log_file_name = f"{self.full_log_path}/test-pod.log"
        with open(log_file_name, "w") as f:
            f.write(output)
        log.info(f"The Test log is can be found at : {log_file_name}")

        if "Fio failed to execute" not in output:
            log.info("FIO has completed successfully")
        else:
            log.error("FIO failed to complete")

        return end_time
Example #20
    def _deploy_es(self):
        """
        Deploying the Elasticsearch server

        """

        # Create a PVC for the elasticsearch server and wait until it is bound
        log.info("Creating a 10 GiB PVC for the ElasticSearch cluster")
        try:
            self.pvc_obj = create_pvc(
                sc_name=self.args.get("sc") or constants.CEPHBLOCKPOOL_SC,
                namespace=self.namespace,
                pvc_name="elasticsearch-data-quickstart-es-default-0",
                access_mode=constants.ACCESS_MODE_RWO,
                size="10Gi",
            )

            # Make sure the PVC is Bound; otherwise log an error and return False
            wait_for_resource_state(self.pvc_obj, constants.STATUS_BOUND)
        except ResourceWrongStatusException:
            log.error("The PVC couldn't created")
            return False

        self.pvc_obj.reload()

        log.info("Deploy the ElasticSearch cluster")
        self.ocp.apply(self.crd)

        sample = TimeoutSampler(
            timeout=300,
            sleep=10,
            func=self._pod_is_found,
            pattern="quickstart-es-default",
        )
        if not sample.wait_for_func_status(True):
            log.error("The ElasticSearch pod deployment Failed")
            return False

        self.espod = get_pod_name_by_pattern("quickstart-es-default",
                                             self.namespace)[0]
        log.info(f"The ElasticSearch pod {self.espod} Started")

        es_pod = OCP(kind="pod", namespace=self.namespace)
        log.info("Waiting for ElasticSearch to Run")
        if not es_pod.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=self.espod,
                sleep=30,
                timeout=600,
        ):
            log.error("TThe ElasticSearch pod is not running !")
            return False
        else:
            log.info("Elastic Search is ready !!!")
            return True
Example #21
def add_capacity_test():
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD,
              namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(timeout=300,
                          condition=constants.STATUS_RUNNING,
                          selector='app=rook-ceph-osd',
                          resource_count=result * 3)

    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # Commented out these lines as a workaround for bug 1842500

    ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'], tries=80)
Example #22
    def __init__(self, mcg, obc):
        """
        Initializer function

        Args:
            mcg (obj): Multi cloud gateway object
            obc (str): Name of the Object Bucket Claim
        """
        self.obc_name = obc
        self.namespace = config.ENV_DATA['cluster_namespace']
        obc_obj = OCP(namespace=self.namespace, kind='ObjectBucketClaim')
        assert obc_obj.wait_for_resource(
            condition=constants.STATUS_BOUND,
            resource_name=self.obc_name,
            column='PHASE',
            resource_count=1,
            timeout=60
        ), "OBC did not reach BOUND Phase, cannot initialize OBC credentials"
        obc_resource = OCP(namespace=self.namespace,
                           kind='ObjectBucketClaim',
                           resource_name=self.obc_name)
        obc_results = obc_resource.get()
        self.ob_name = obc_results.get('spec').get('ObjectBucketName')
        self.bucket_name = obc_results.get('spec').get('bucketName')
        ob_obj = OCP(namespace=self.namespace,
                     kind='ObjectBucket',
                     resource_name=self.ob_name).get()
        self.obc_account = ob_obj.get('spec').get('additionalState').get(
            'account')
        secret_obc_obj = OCP(kind='secret',
                             namespace=self.namespace,
                             resource_name=self.obc_name).get()

        self.access_key_id = base64.b64decode(
            secret_obc_obj.get('data').get('AWS_ACCESS_KEY_ID')).decode(
                'utf-8')
        self.access_key = base64.b64decode(
            secret_obc_obj.get('data').get('AWS_SECRET_ACCESS_KEY')).decode(
                'utf-8')
        self.s3_endpoint = mcg.s3_endpoint

        self.s3_resource = boto3.resource(
            's3',
            verify=False,
            endpoint_url=self.s3_endpoint,
            aws_access_key_id=self.access_key_id,
            aws_secret_access_key=self.access_key)

        self.s3_client = boto3.client('s3',
                                      verify=False,
                                      endpoint_url=self.s3_endpoint,
                                      aws_access_key_id=self.access_key_id,
                                      aws_secret_access_key=self.access_key)
Example #23
    def test_add_capacity(self):
        """
        Test to add variable capacity to the OSD cluster while IOs running
        """
        osd_size = storage_cluster.get_osd_size()
        result = storage_cluster.add_capacity(osd_size)
        pod = OCP(kind=constants.POD,
                  namespace=config.ENV_DATA['cluster_namespace'])
        pod.wait_for_resource(timeout=300,
                              condition=constants.STATUS_RUNNING,
                              selector='app=rook-ceph-osd',
                              resource_count=result * 3)

        # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
        pod.wait_for_resource(timeout=300,
                              condition=constants.STATUS_COMPLETED,
                              selector=constants.OSD_PREPARE_APP_LABEL,
                              resource_count=result * 3)

        ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'],
                          tries=80)
Example #24
    def run(self):
        """
        Run the benchmark and wait until it completes

        """
        # Create the benchmark object
        self.sf_obj = OCS(**self.crd_data)
        self.sf_obj.create()

        # Wait for benchmark pods to get created - takes a while
        for bench_pod in TimeoutSampler(
                240,
                10,
                get_pod_name_by_pattern,
                "smallfile-client",
                benchmark_operator.BMO_NAME,
        ):
            try:
                if bench_pod[0] is not None:
                    small_file_client_pod = bench_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        bench_pod = OCP(kind="pod", namespace=benchmark_operator.BMO_NAME)
        log.info("Waiting for SmallFile benchmark to Run")
        assert bench_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=small_file_client_pod,
            sleep=30,
            timeout=600,
        )
        log.info("The SmallFiles benchmark is running, wait for completion")
        bench_pod.wait_for_resource(
            condition=constants.STATUS_COMPLETED,
            resource_name=small_file_client_pod,
            timeout=18000,
            sleep=60,
        )
        log.info("The SmallFiles benchmark is completed")
Example #25
    def wait_for_osd_pods_to_be_running(self, storagedeviceset_count):
        """
        The function gets the number of storage device sets in the cluster and waits
        for the osd pods to be in Running status.

        Args:
            storagedeviceset_count (int): the number of storage device set in the cluster

        """
        logging.info("starting function 'wait_for_osd_pods_to_be_running'")
        pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
        if is_flexible_scaling_enabled():
            replica_count = 1
        else:
            replica_count = 3
        pod.wait_for_resource(
            timeout=420,
            condition=constants.STATUS_RUNNING,
            selector="app=rook-ceph-osd",
            resource_count=storagedeviceset_count * replica_count,
        )
        self.new_pods_in_status_running = True
Example #26
    def create_kafkadrop(self, wait=True):
        """
        Create kafkadrop pod, service and routes

        Args:
            wait (bool): If True, waits until the kafkadrop pod is running

        Returns:
            tuple: Contains objects of kafkadrop pod, service and route

        """
        # Create kafkadrop pod
        try:
            kafkadrop = list(
                templating.load_yaml(constants.KAFKADROP_YAML,
                                     multi_document=True))
            self.kafkadrop_pod = OCS(**kafkadrop[0])
            self.kafkadrop_svc = OCS(**kafkadrop[1])
            self.kafkadrop_route = OCS(**kafkadrop[2])
            self.kafkadrop_pod.create()
            self.kafkadrop_svc.create()
            self.kafkadrop_route.create()
        except (CommandFailed, CalledProcessError) as cf:
            log.error("Failed during creation of kafkadrop which kafka UI")
            raise cf

        # Validate the kafkadrop pod is running
        if wait:
            ocp_obj = OCP(kind=constants.POD,
                          namespace=constants.AMQ_NAMESPACE)
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector="app=kafdrop",
                timeout=120,
                sleep=5,
            )

        return self.kafkadrop_pod, self.kafkadrop_svc, self.kafkadrop_route
Example #27
        def finalizer():
            op_obj = OCP(
                kind=constants.DEPLOYMENT,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            pod_obj = OCP(
                kind=constants.POD, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            operator_obj = op_obj.get(resource_name=constants.ROOK_CEPH_OPERATOR)
            if operator_obj.get("spec").get("replicas") != 1:
                assert modify_deployment_replica_count(
                    deployment_name=constants.ROOK_CEPH_OPERATOR, replica_count=1
                ), "Failed to scale up rook-ceph-operator to 1"

            log.info("Validate all mons are up and running")
            try:
                pod_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.MON_APP_LABEL,
                    resource_count=3,
                    timeout=60,
                    sleep=5,
                )
            except (TimeoutExpiredError, ResourceWrongStatusException) as ex:
                log.warning(ex)
                op_obj.delete(resource_name=constants.ROOK_CEPH_OPERATOR)
                for pod in get_mon_pods():
                    pod.delete()
                pod_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.MON_APP_LABEL,
                    resource_count=3,
                    timeout=360,
                    sleep=5,
                )
            log.info("All mons are up and running")
Example #28
    def _deploy_es(self):
        """
        Deploying the Elasticsearch server

        """

        # Create a PVC for the elasticsearch server and wait until it is bound
        log.info("Creating a 10 GiB PVC for the ElasticSearch cluster")
        self.pvc_obj = create_pvc(
            sc_name=constants.CEPHBLOCKPOOL_SC,
            namespace=self.namespace,
            pvc_name="elasticsearch-data-quickstart-es-default-0",
            access_mode=constants.ACCESS_MODE_RWO,
            size="10Gi",
        )
        wait_for_resource_state(self.pvc_obj, constants.STATUS_BOUND)
        self.pvc_obj.reload()

        log.info("Deploy the ElasticSearch cluster")
        self.ocp.apply(self.crd)

        sample = TimeoutSampler(
            timeout=300,
            sleep=10,
            func=self._pod_is_found,
            pattern="quickstart-es-default",
        )
        if not sample.wait_for_func_status(True):
            self.cleanup()
            raise Exception("The ElasticSearch pod deployment Failed")
        self.espod = get_pod_name_by_pattern("quickstart-es-default",
                                             self.namespace)[0]
        log.info(f"The ElasticSearch pod {self.espod} Started")

        es_pod = OCP(kind="pod", namespace=self.namespace)
        log.info("Waiting for ElasticSearch to Run")
        assert es_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=self.espod,
            sleep=30,
            timeout=600,
        )
        log.info("Elastic Search is ready !!!")
Example #29
    def test_smallfile_workload(self, ripsaw):
        """
        Run SmallFile Workload
        """
        log.info("Apply Operator CRD")
        ripsaw.apply_crd('resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml')

        log.info("Running SmallFile bench")
        sf_data = templating.load_yaml_to_dict(
            constants.SMALLFILE_BENCHMARK_YAML)
        sf_obj = OCS(**sf_data)
        sf_obj.create()
        # wait for benchmark pods to get created - takes a while
        for bench_pod in TimeoutSampler(40, 3, get_pod_name_by_pattern,
                                        'smallfile-client', 'my-ripsaw'):
            try:
                if bench_pod[0] is not None:
                    small_file_client_pod = bench_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        bench_pod = OCP(kind='pod', namespace='my-ripsaw')
        log.info("Waiting for SmallFile benchmark to Run")
        assert bench_pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                           resource_name=small_file_client_pod,
                                           sleep=30,
                                           timeout=600)
        start_time = time.time()
        timeout = 900
        while True:
            logs = bench_pod.exec_oc_cmd(f'logs {small_file_client_pod}',
                                         out_yaml_format=False)
            if "RUN STATUS DONE" in logs:
                log.info("SmallFile Benchmark Completed Successfully")
                break

            if timeout < (time.time() - start_time):
                raise TimeoutError(
                    f"Timed out waiting for benchmark to complete")
            time.sleep(30)
Example #30
def setup_ceph_toolbox(force_setup=False):
    """
    Set up ceph-toolbox. Also checks whether the toolbox already exists; if it
    does, this behaves as a no-op.

    Args:
        force_setup (bool): force setup toolbox pod

    """
    ocs_version = version.get_semantic_ocs_version_from_config()
    if ocsci_config.ENV_DATA["mcg_only_deployment"]:
        log.info("Skipping Ceph toolbox setup due to running in MCG only mode")
        return
    namespace = ocsci_config.ENV_DATA["cluster_namespace"]
    ceph_toolbox = get_pod_name_by_pattern("rook-ceph-tools", namespace)
    # setup toolbox for external mode
    # Refer bz: 1856982 - invalid admin secret
    if len(ceph_toolbox) == 1:
        log.info("Ceph toolbox already exists, skipping")
        if force_setup:
            log.info("Running force setup for Ceph toolbox!")
        else:
            return
    external_mode = ocsci_config.DEPLOYMENT.get("external_mode")

    if ocs_version == version.VERSION_4_2:
        tool_box_data = templating.load_yaml(constants.TOOL_POD_YAML)
        tool_box_data["spec"]["template"]["spec"]["containers"][0][
            "image"] = get_rook_version()
        rook_toolbox = OCS(**tool_box_data)
        rook_toolbox.create()
    else:
        if external_mode:
            toolbox = templating.load_yaml(constants.TOOL_POD_YAML)
            toolbox["spec"]["template"]["spec"]["containers"][0][
                "image"] = get_rook_version()
            toolbox["metadata"]["name"] += "-external"
            keyring_dict = ocsci_config.EXTERNAL_MODE.get("admin_keyring")
            if ocs_version >= version.VERSION_4_10:
                toolbox["spec"]["template"]["spec"]["containers"][0][
                    "command"] = ["/bin/bash"]
                toolbox["spec"]["template"]["spec"]["containers"][0]["args"][
                    0] = "-m"
                toolbox["spec"]["template"]["spec"]["containers"][0]["args"][
                    1] = "-c"
                toolbox["spec"]["template"]["spec"]["containers"][0][
                    "tty"] = True
            env = toolbox["spec"]["template"]["spec"]["containers"][0]["env"]
            # replace secret
            env = [
                item for item in env
                if not (item["name"] == "ROOK_CEPH_SECRET")
            ]
            env.append({
                "name": "ROOK_CEPH_SECRET",
                "value": keyring_dict["key"]
            })
            toolbox["spec"]["template"]["spec"]["containers"][0]["env"] = env
            # add ceph volumeMounts
            ceph_volume_mount_path = {
                "mountPath": "/etc/ceph",
                "name": "ceph-config"
            }
            ceph_volume = {"name": "ceph-config", "emptyDir": {}}
            toolbox["spec"]["template"]["spec"]["containers"][0][
                "volumeMounts"].append(ceph_volume_mount_path)
            toolbox["spec"]["template"]["spec"]["volumes"].append(ceph_volume)
            rook_toolbox = OCS(**toolbox)
            rook_toolbox.create()
            return

        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1982721
        # TODO: Remove workaround when bug 1982721 is fixed
        # https://github.com/red-hat-storage/ocs-ci/issues/4585
        if ocsci_config.ENV_DATA.get("is_multus_enabled"):
            toolbox = templating.load_yaml(constants.TOOL_POD_YAML)
            toolbox["spec"]["template"]["spec"]["containers"][0][
                "image"] = get_rook_version()
            toolbox["metadata"]["name"] += "-multus"
            toolbox["spec"]["template"]["metadata"]["annotations"] = {
                "k8s.v1.cni.cncf.io/networks": "openshift-storage/ocs-public"
            }
            toolbox["spec"]["template"]["spec"]["hostNetwork"] = False
            rook_toolbox = OCS(**toolbox)
            rook_toolbox.create()
            return

        # for OCS >= 4.3 there is new toolbox pod deployment done here:
        # https://github.com/openshift/ocs-operator/pull/207/
        log.info("starting ceph toolbox pod")
        run_cmd(
            "oc patch ocsinitialization ocsinit -n openshift-storage --type "
            'json --patch  \'[{ "op": "replace", "path": '
            '"/spec/enableCephTools", "value": true }]\'')
        toolbox_pod = OCP(kind=constants.POD, namespace=namespace)
        toolbox_pod.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-tools",
            resource_count=1,
            timeout=120,
        )