Example #1
    def cleanup(self):
        """
        Function to tear down
        """
        # Delete all pods, pvcs and namespaces
        for namespace in self.namespace_list:
            delete_objs_parallel(
                obj_list=pod.get_all_pods(namespace=namespace.namespace),
                namespace=namespace.namespace,
                kind=self.kind,
            )
            delete_objs_parallel(
                obj_list=pvc.get_all_pvc_objs(namespace=namespace.namespace),
                namespace=namespace.namespace,
                kind=constants.PVC,
            )
            ocp = OCP(kind=constants.NAMESPACE)
            ocp.delete(resource_name=namespace.namespace)

        # Remove scale label from worker nodes in cleanup
        scale_workers = machine.get_labeled_nodes(constants.SCALE_LABEL)
        helpers.remove_label_from_worker_node(node_list=scale_workers,
                                              label_key="scale-label")

        # Delete the machinesets, which also deletes the respective nodes, on the aws-ipi platform
        if self.ms_name:
            for name in self.ms_name:
                machine.delete_custom_machineset(name)
Example #2
    def test_delete_rook_ceph_osd_deployment(self):
        osd_deployments = get_osd_deployments()
        deployment_obj = OCP(kind=constants.DEPLOYMENT,
                             namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        pod_obj = OCP(kind=constants.POD,
                      namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        for osd_deployment in osd_deployments:
            # Get rook-ceph-osd pod name associated with the deployment
            osd_deployment_name = osd_deployment.name
            old_osd_pod = get_pod_name_by_pattern(
                pattern=osd_deployment_name,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )[0]

            logger.info(f"Deleting OSD deployment: {osd_deployment_name}")
            try:
                deployment_obj.delete(resource_name=osd_deployment_name)
                deployment_obj.wait_for_resource(
                    condition="0/1",
                    resource_name=osd_deployment_name,
                    column="READY")
            except CommandFailed as err:
                if "NotFound" not in str(err):
                    raise

            # Wait for new OSD deployment to be Ready
            deployment_obj.wait_for_resource(condition="1/1",
                                             resource_name=osd_deployment_name,
                                             column="READY")

            # Check if a new OSD pod is created
            new_osd_pod = get_pod_name_by_pattern(
                pattern=osd_deployment_name,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )[0]
            assert old_osd_pod != new_osd_pod, "New OSD pod not created"

            # Check if new OSD pod is up and running
            logger.info(
                "Waiting for a new OSD pod to get created and reach Running state"
            )
            assert pod_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=new_osd_pod,
                column="STATUS",
            ), f"New OSD pod {new_osd_pod} is not in {constants.STATUS_RUNNING} state"

        # If clusterwide encryption is enabled, verify that the new OSDs are encrypted
        if config.ENV_DATA.get("encryption_at_rest"):
            osd_encryption_verification()

        assert ceph_health_check(delay=120,
                                 tries=50), "Ceph health check failed"
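
A minimal sketch of the tolerant-delete pattern used in this example, assuming CommandFailed is importable from the ocs_ci exceptions module: deletion failures are ignored only when the resource is already gone.

from ocs_ci.ocs.exceptions import CommandFailed


def delete_ignore_not_found(ocp_obj, resource_name):
    """Delete a resource, tolerating the case where it no longer exists."""
    try:
        ocp_obj.delete(resource_name=resource_name)
    except CommandFailed as err:
        if "NotFound" not in str(err):
            raise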
Example #3
def delete_machine(machine_name):
    """
    Deletes a machine

    Args:
        machine_name (str): Name of the machine you want to delete

    Raises:
        CommandFailed: In case yaml_file and resource_name weren't provided
    """
    machine_obj = OCP(kind="machine",
                      namespace=constants.OPENSHIFT_MACHINE_API_NAMESPACE)
    log.info(f"Deleting machine {machine_name}")
    machine_obj.delete(resource_name=machine_name)
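
A minimal usage sketch for the helper above; the machine name is hypothetical and the import path assumes delete_machine lives in the ocs_ci machine module.

from ocs_ci.ocs import machine

# Hypothetical machine name from the openshift-machine-api namespace
machine.delete_machine("cluster-abc12-worker-us-east-1a-xyz9")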
Example #4
def delete_all_pvcs(namespace=None):
    """
    Deletes all PVCs in the given namespace

    Args:
        namespace (str): Name of namespace

    Returns:
        bool: True if deletion is successful
    """
    if not namespace:
        namespace = config.ENV_DATA['cluster_namespace']
    ocp_pvc_obj = OCP(kind=constants.PVC, namespace=namespace)
    ocp_pvc_list = get_all_pvcs(namespace=namespace)
    pvc_list = ocp_pvc_list['items']
    for item in pvc_list:
        ocp_pvc_obj.delete(resource_name=item.get('metadata').get('name'))

    return True
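
A minimal usage sketch, assuming the helper above is importable from the ocs_ci PVC helpers; with no namespace given it falls back to config.ENV_DATA['cluster_namespace'].

from ocs_ci.ocs.resources.pvc import delete_all_pvcs

assert delete_all_pvcs(namespace="my-test-namespace")  # hypothetical namespace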
Example #5
        def finalizer():
            op_obj = OCP(
                kind=constants.DEPLOYMENT,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            pod_obj = OCP(
                kind=constants.POD, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            operator_obj = op_obj.get(resource_name=constants.ROOK_CEPH_OPERATOR)
            if operator_obj.get("spec").get("replicas") != 1:
                assert modify_deployment_replica_count(
                    deployment_name=constants.ROOK_CEPH_OPERATOR, replica_count=1
                ), "Failed to scale up rook-ceph-operator to 1"

            log.info("Validate all mons are up and running")
            try:
                pod_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.MON_APP_LABEL,
                    resource_count=3,
                    timeout=60,
                    sleep=5,
                )
            except (TimeoutExpiredError, ResourceWrongStatusException) as ex:
                log.warning(ex)
                op_obj.delete(resource_name=constants.ROOK_CEPH_OPERATOR)
                for pod in get_mon_pods():
                    pod.delete()
                pod_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.MON_APP_LABEL,
                    resource_count=3,
                    timeout=360,
                    sleep=5,
                )
            log.info("All mons are up and running")
Example #6
class ElasticSearch(object):
    """
    ElasticSearch Environment
    """
    def __init__(self):
        """
        Initializer function

        """
        log.info("Initializing the Elastic-Search environment object")
        self.namespace = "elastic-system"
        self.eck_file = "ocs_ci/templates/app-pods/eck.1.3.1-all-in-one.yaml"
        self.dumper_file = "ocs_ci/templates/app-pods/esclient.yaml"
        self.pvc = "ocs_ci/templates/app-pods/es-pvc.yaml"
        self.crd = "ocs_ci/templates/app-pods/esq.yaml"

        # Creating some different types of OCP objects
        self.ocp = OCP(kind="pod",
                       resource_name="elastic-operator-0",
                       namespace=self.namespace)
        self.ns_obj = OCP(kind="namespace", namespace=self.namespace)
        self.es = OCP(resource_name="quickstart-es-http",
                      namespace=self.namespace)
        self.elasticsearch = OCP(namespace=self.namespace,
                                 kind="elasticsearch")
        self.password = OCP(
            kind="secret",
            resource_name="quickstart-es-elastic-user",
            namespace=self.namespace,
        )

        # Deploy the ECK all-in-one.yaml file
        self._deploy_eck()
        # Deploy the Elastic-Search server
        self._deploy_es()

        # Verify that ES is Up & Running
        timeout = 600
        while timeout > 0:
            if self.get_health():
                log.info("The ElasticSearch server is ready !")
                break
            else:
                log.warning("The ElasticSearch server is not ready yet")
                log.info("going to sleep for 30 sec. before next check")
                time.sleep(30)
                timeout -= 30

        self._deploy_data_dumper_client()

        # Connect to the server
        self.con = self._es_connect()

    def _deploy_eck(self):
        """
        Deploy the ECK environment for the Elasticsearch cluster, and make sure
        it is in Running state

        """

        log.info("Deploying the ECK environment for the ES cluster")
        self.ocp.apply(self.eck_file)

        for es_pod in TimeoutSampler(300, 10, get_pod_name_by_pattern,
                                     "elastic-operator", self.namespace):
            try:
                if es_pod[0] is not None:
                    self.eckpod = es_pod[0]
                    log.info(f"The ECK pod {self.eckpod} is ready !")
                    break
            except IndexError:
                log.info("ECK operator pod not ready yet")

    def _deploy_data_dumper_client(self):
        """
        Deploy an Elasticsearch client pod with a utility that dumps all the
        data from the server to a .tgz file

        """

        log.info("Deploying the es client for dumping all data")
        self.ocp.apply(self.dumper_file)

        for dmp_pod in TimeoutSampler(300, 10, get_pod_name_by_pattern,
                                      "es-dumper", self.namespace):
            try:
                if dmp_pod[0] is not None:
                    self.dump_pod = dmp_pod[0]
                    log.info(
                        f"The dumper client pod {self.dump_pod} is ready !")
                    break
            except IndexError:
                log.info("Dumper pod not ready yet")

    def get_ip(self):
        """
        Return the IP address of the Elasticsearch cluster.
        This IP is for use inside the OCP cluster.

        Returns:
            str: The IP address.

        """
        return self.es.get()["spec"]["clusterIP"]

    def get_port(self):
        """
        Return the port of the Elasticsearch cluster.

        Returns:
            str: The port.

        """
        return self.es.get()["spec"]["ports"][0]["port"]

    def _deploy_es(self):
        log.info("Deploy the PVC for the ElasticSearch cluster")
        self.ocp.apply(self.pvc)

        log.info("Deploy the ElasticSearch cluster")
        self.ocp.apply(self.crd)

        for es_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                     "quickstart-es-default", self.namespace):
            try:
                if es_pod[0] is not None:
                    self.espod = es_pod[0]
                    log.info(f"The ElasticSearch pod {self.espod} Started")
                    break
            except IndexError:
                log.info("elasticsearch pod not ready yet")

        es_pod = OCP(kind="pod", namespace=self.namespace)
        log.info("Waiting for ElasticSearch to Run")
        assert es_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=self.espod,
            sleep=30,
            timeout=600,
        )
        log.info("Elastic Search is ready !!!")

    def get_health(self):
        """
        Return the health status of the Elasticsearch cluster.

        Returns:
            bool: True if the status is green (OK), otherwise False

        """
        return self.elasticsearch.get(
        )["items"][0]["status"]["health"] == "green"

    def get_password(self):
        """
        Return the password used to connect to Elasticsearch.

        Returns:
            str : The password as text

        """
        return base64.b64decode(
            self.password.get()["data"]["elastic"]).decode("utf-8")

    def cleanup(self):
        """
        Cleanup the environment from all Elasticsearch components, and from the
        port forwarding process.

        """
        log.info("Teardown the Elasticsearch environment")
        log.info("Deleting all resources")
        log.info("Deleting the dumper client pod")
        self.ocp.delete(yaml_file=self.dumper_file)
        log.info("Deleting the es resource")
        self.ocp.delete(yaml_file=self.crd)
        log.info("Deleting the es project")
        self.ns_obj.delete_project(project_name=self.namespace)
        self.ns_obj.wait_for_delete(resource_name=self.namespace, timeout=180)

    def _es_connect(self):
        """
        Create a connection to the local ES

        Returns:
            Elasticsearch: elasticsearch connection object

        Raises:
            ConnectionError: if it cannot connect to the server

        """
        try:
            es = Elasticsearch([{
                "host": self.get_ip(),
                "port": self.get_port()
            }])
        except esexp.ConnectionError:
            log.error("Can not connect to ES server in the LocalServer")
            raise
        return es

    def get_indices(self):
        """
        Get a list of all indices in the ES server. All of them were created by
        the test, since the ES installation had no indices pre-installed.

        Returns:
            list : list of all indices defined in the ES server

        """
        results = []
        log.info("Getting all indices")
        for ind in self.con.indices.get_alias("*"):
            results.append(ind)
        return results

    def _copy(self, es):
        """
        Copy all data from the internal ES server to the main ES.

        **This function is deprecated**; use the dump function and load
        the data from the files into the main ES server

        Args:
            es (obj): elasticsearch object which connected to the main ES
        """

        query = {"size": 1000, "query": {"match_all": {}}}
        for ind in self.get_indices():
            log.info(f"Reading {ind} from internal ES server")
            try:
                result = self.con.search(index=ind, body=query)
            except esexp.NotFoundError:
                log.warning(f"{ind} Not found in the Internal ES.")
                continue

            log.debug(f"The results from internal ES for {ind} are :{result}")
            log.info(f"Writing {ind} into main ES server")
            for doc in result["hits"]["hits"]:
                log.debug(f"Going to write : {doc}")
                es.index(index=ind, doc_type="_doc", body=doc["_source"])

    def dumping_all_data(self, target_path):
        """
        Dump all data from the internal ES server to a .tgz file.

        Args:
            target_path (str): the path the results file will be copied to

        Returns:
            bool: True if the dump operation succeeded and the results data was
                  copied to the host, otherwise False
        """

        log.info("dumping data from ES server to .tgz file")
        rsh_cmd = f"rsh {self.dump_pod} /elasticsearch-dump/esdumper.py --ip {self.get_ip()} --port {self.get_port()}"
        result = self.ocp.exec_oc_cmd(rsh_cmd,
                                      out_yaml_format=False,
                                      timeout=1200)
        if "ES dump is done." not in result:
            log.error("There is no data in the Elasticsearch server")
            return False
        else:
            src_file = result.split()[-1]
            log.info(f"Copy {src_file} from the client pod")

            cp_command = f"cp {self.dump_pod}:{src_file} {target_path}/FullResults.tgz"
            result = self.ocp.exec_oc_cmd(cp_command, timeout=120)
            log.info(f"The output from the POD is {result}")
            log.info("Extracting the FullResults.tgz file")
            kwargs = {"cwd": target_path}
            results = run_command(f"tar zxvf {target_path}/FullResults.tgz",
                                  **kwargs)
            log.debug(f"The untar results is {results}")
            if "Error in command" in results:
                log.warning("Can not untar the dumped file")
                return False

        return True
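
A minimal usage sketch for the class above, limited to methods the class itself exposes; the target directory is hypothetical.

es = ElasticSearch()
log.info(f"ES reachable at {es.get_ip()}:{es.get_port()}, healthy: {es.get_health()}")
es.dumping_all_data(target_path="/tmp/es-results")
es.cleanup()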
Example #7
class OCS(object):
    """
    Base OCSClass
    """
    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            kwargs (dict):
                1) For existing resource, use OCP.reload() to get the
                resource's dictionary and use it to pass as **kwargs
                2) For new resource, use yaml files templates under
                /templates/CSI like:
                obj_dict = load_yaml(
                    os.path.join(
                        TEMPLATE_DIR, "some_resource.yaml"
                        )
                    )
        """
        self.data = kwargs
        self._api_version = self.data.get('api_version')
        self._kind = self.data.get('kind')
        self._namespace = None
        if 'metadata' in self.data:
            self._namespace = self.data.get('metadata').get('namespace')
            self._name = self.data.get('metadata').get('name')
        self.ocp = OCP(api_version=self._api_version,
                       kind=self.kind,
                       namespace=self._namespace)
        self.temp_yaml = tempfile.NamedTemporaryFile(mode='w+',
                                                     prefix=self._kind,
                                                     delete=False)
        # This _is_deleted flag is set to True if the delete method was called
        # on an object of this class and was successful.
        self._is_deleted = False

    @property
    def api_version(self):
        return self._api_version

    @property
    def kind(self):
        return self._kind

    @property
    def namespace(self):
        return self._namespace

    @property
    def name(self):
        return self._name

    @property
    def is_deleted(self):
        return self._is_deleted

    def reload(self):
        """
        Reload the OCS instance with the new information from its actual
        data.
        After a resource is created from a yaml file, the actual yaml file is
        changed and more information about the resource is added.
        """
        self.data = self.get()
        self.__init__(**self.data)

    def get(self, out_yaml_format=True):
        return self.ocp.get(resource_name=self.name,
                            out_yaml_format=out_yaml_format)

    def describe(self):
        return self.ocp.describe(resource_name=self.name)

    def create(self, do_reload=True):
        log.info(f"Adding {self.kind} with name {self.name}")
        templating.dump_data_to_temp_yaml(self.data, self.temp_yaml.name)
        status = self.ocp.create(yaml_file=self.temp_yaml.name)
        if do_reload:
            self.reload()
        return status

    def delete(self, wait=True, force=False):
        """
        Delete the OCS object if it's not already deleted
        (using the internal is_deleted flag)

        Args:
            wait (bool): Wait for object to be deleted
            force (bool): Force delete object

        Returns:
            bool: True if deleted, False otherwise

        """
        # Avoid accidental delete of default storageclass and secret
        if (self.name == constants.DEFAULT_STORAGECLASS_CEPHFS
                or self.name == constants.DEFAULT_STORAGECLASS_RBD):
            log.info(f"Attempt to delete default Secret or StorageClass")
            return

        if self._is_deleted:
            log.info(f"Attempt to remove resource: {self.name} which is"
                     f"already deleted! Skipping delete of this resource!")
            result = True
        else:
            result = self.ocp.delete(resource_name=self.name,
                                     wait=wait,
                                     force=force)
            self._is_deleted = True
        return result

    def apply(self, **data):
        with open(self.temp_yaml.name, 'w') as yaml_file:
            yaml.dump(data, yaml_file)
        assert self.ocp.apply(
            yaml_file=self.temp_yaml.name), (f"Failed to apply changes {data}")
        self.reload()

    def add_label(self, label):
        """
        Adds a new label

        Args:
            label (str): New label to be assigned for this pod
                E.g: "label=app='rook-ceph-mds'"
        """
        status = self.ocp.add_label(resource_name=self.name, label=label)
        self.reload()
        return status

    def delete_temp_yaml_file(self):
        utils.delete_file(self.temp_yaml.name)
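
A minimal usage sketch following the class's own __init__ docstring: load a YAML template, create the resource, and delete it later. The template directory and file name are placeholders taken from that docstring, and the load_yaml import path is an assumption.

import os
from ocs_ci.utility.templating import load_yaml

TEMPLATE_DIR = "templates/CSI"  # hypothetical template directory
obj_dict = load_yaml(os.path.join(TEMPLATE_DIR, "some_resource.yaml"))
resource = OCS(**obj_dict)
resource.create(do_reload=True)
resource.delete(wait=True)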
Example #8
    def test_multiple_mon_pod_stays_on_same_node(self):
        """
        A testcase to verify multiple mon pods staying on the same node

        1. Edit the rook-ceph-mon-endpoints configmap
           say, assign mon-a to another node that would be on
           the same node as another mon (compute-1 instead of compute-0)
        2. Delete the mon-a deployment
        3. Edit the mon-b deployment to remove the required mon anti-affinity
        4. Restart the operator
        5. Edit the mon-a deployment to remove the required mon anti-affinity
        6. See mon-a start on compute-1 with mon-b
        7. Soon after, see the operator failover one of these mons onto the
        node that doesn't currently have a mon (compute-0) and start mon-d

        """
        ocs_version = config.ENV_DATA["ocs_version"]
        # Check that we have LSO cluster and OCS version is 4.8 and below
        # This is a workaround due to issue https://github.com/red-hat-storage/ocs-ci/issues/4937
        if not (is_lso_cluster()
                and Version.coerce(ocs_version) <= Version.coerce("4.8")):
            pytest.skip(
                "Skip the test because mons do not get node assignment from Rook if the cluster "
                "is not LSO based. Also, currently we want to run the test only with OCS 4.8 and "
                "below. This is a workaround due to issue "
                "https://github.com/red-hat-storage/ocs-ci/issues/4937")
        # Initialize
        rook_ceph_mon = "rook-ceph-mon"

        # Get mons running on pod
        mon_pods = get_mon_pods()
        mon_name_to_del = mon_pods[0].get().get("metadata").get("labels").get(
            "mon")
        mon_name_to_edit = mon_pods[1].get().get("metadata").get("labels").get(
            "mon")
        mon_node = get_pod_node(mon_pods[1])

        # Edit the rook-ceph-mon-endpoints
        log.info(f"Edit the configmap {ROOK_CEPH_MON_ENDPOINTS}")
        configmap_obj = OCP(kind=CONFIGMAP,
                            namespace=OPENSHIFT_STORAGE_NAMESPACE)
        rook_ceph_mon_configmap = configmap_obj.get(
            resource_name=ROOK_CEPH_MON_ENDPOINTS)
        json_val = json.loads(rook_ceph_mon_configmap["data"]["mapping"])
        json_val["node"][mon_name_to_del].update(
            json_val["node"][mon_name_to_edit])
        rook_ceph_mon_configmap["data"]["mapping"] = json.dumps(json_val)
        new_data = rook_ceph_mon_configmap["data"]
        params = f'{{"data": {json.dumps(new_data)}}}'
        configmap_obj.patch(
            resource_name=ROOK_CEPH_MON_ENDPOINTS,
            params=params,
            format_type="strategic",
        )
        log.info(f"Configmap {ROOK_CEPH_MON_ENDPOINTS} edited successfully")
        log.info(
            f"Rook-ceph-mon-endpoints updated configmap: {rook_ceph_mon_configmap}"
        )

        # Delete one mon deployment which had been edited
        dep_obj = OCP(kind=DEPLOYMENT, namespace=OPENSHIFT_STORAGE_NAMESPACE)
        mon_deployment_name_to_del = f"{rook_ceph_mon}-{mon_name_to_del}"
        log.info(f"Deleting mon {mon_deployment_name_to_del} deployments")
        dep_obj.delete(resource_name=mon_deployment_name_to_del)

        # Edit other mon deployment to remove mon anti-affinity
        mon_deployment_name_to_edit = f"{rook_ceph_mon}-{mon_name_to_edit}"
        log.info(f"Edit mon {mon_deployment_name_to_edit} deployment "
                 "to remove the required mon anti-affinity")
        params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
        dep_obj.patch(resource_name=mon_deployment_name_to_edit,
                      params=params,
                      format_type="json")
        log.info(
            f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_edit}"
        )

        # Restart operator
        operator_pod_obj = get_operator_pods()
        delete_pods(pod_objs=operator_pod_obj)
        POD_OBJ.wait_for_resource(condition=STATUS_RUNNING,
                                  selector=OPERATOR_LABEL)

        # Validate the mon whose deployment was deleted comes up in Pending state
        # Initially the mon is stuck in Pending state; remove the defined anti-affinity
        POD_OBJ.wait_for_resource(
            condition=STATUS_PENDING,
            resource_count=1,
            selector=MON_APP_LABEL,
            timeout=1200,
        )
        # Edit mon deployment to remove mon anti-affinity
        log.info(f"Edit mon {mon_deployment_name_to_del} deployment "
                 "to remove the required mon anti-affinity")
        params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
        dep_obj.patch(resource_name=mon_deployment_name_to_del,
                      params=params,
                      format_type="json")
        log.info(
            f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_del}"
        )

        # Validate mon pod moved to another node such that 2 mons are running on same node
        log.info("Waiting for 5 seconds for mon recovery")
        time.sleep(5)
        new_mon_pods = get_mon_pods()
        new_node = [
            get_pod_node(mon) for mon in new_mon_pods if mon.get().get(
                "metadata").get("labels").get("mon") == mon_name_to_del
        ]
        assert (
            new_node[0].name == mon_node.name
        ), f"Mon did not move to node {mon_node.name}, so 2 mons are not running on the same node"

        # Verify rook deletes one of the mon and move to another node
        timeout = 60
        log.info(f"Waiting for {timeout} seconds for mon recovery")
        time.sleep(timeout)

        POD_OBJ.wait_for_resource(
            condition=STATUS_RUNNING,
            resource_count=len(mon_pods),
            selector=MON_APP_LABEL,
            timeout=3600,
            sleep=5,
        )
        log.info(
            "Mons are up and in Running state; validating that they are running on different nodes"
        )
        mon_pods_running_on_same_node()
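
A small self-contained illustration (hypothetical node names) of the configmap mapping edit in this example: the mon to be deleted inherits the other mon's node assignment, so after re-creation both mons land on the same node.

import json

mapping = '{"node": {"a": {"Name": "compute-0"}, "b": {"Name": "compute-1"}}}'
json_val = json.loads(mapping)
json_val["node"]["a"].update(json_val["node"]["b"])
print(json.dumps(json_val))
# {"node": {"a": {"Name": "compute-1"}, "b": {"Name": "compute-1"}}}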
Example #9
    def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
        """
        1. Delete one mon service
        2. Edit the configmap rook-ceph-endpoints
           remove all the deleted mon services entries
        3. Delete deployment, pvc of deleted mon service
        4. Restart rook-ceph-operator
        5. Make sure all mon pods are running
        6. Make sure ceph health Ok and storage pods are running
        7. Sleep for 300 seconds before deleting another mon
        8. Repeat above steps for all mons and at the
           end each mon should contain different endpoints
        9. Create PVC, which should succeed.

        """

        pod_obj = pod_factory(interface=interface)
        run_io_in_bg(pod_obj)

        # Get all mon services
        mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()
        mon_count = len(mon_pods)

        list_old_svc = []
        for svc in mon_svc:

            # Get rook-ceph-operator pod obj
            operator_pod_obj = get_operator_pods()
            operator_name = operator_pod_obj[0].name

            # Scale down rook-ceph-operator
            log.info("Scale down rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=0
            ), "Failed to scale down rook-ceph-operator to 0"
            log.info("Successfully scaled down rook-ceph-operator to 0")

            # Validate rook-ceph-operator pod not running
            POD_OBJ.wait_for_delete(resource_name=operator_name)

            svc_name = svc["metadata"]["name"]
            cluster_ip = svc["spec"]["clusterIP"]
            port = svc["spec"]["ports"][0]["port"]
            mon_endpoint = f"{cluster_ip}:{port}"
            mon_id = svc["spec"]["selector"]["mon"]
            list_old_svc.append(cluster_ip)

            # Delete deployment
            log.info("Delete mon deployments")
            del_obj = OCP(
                kind=constants.DEPLOYMENT,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            mon_info = del_obj.get(resource_name=svc_name)
            del_obj.delete(resource_name=svc_name)

            # Delete pvc
            if is_lso_cluster():
                mon_data_path = f"/var/lib/rook/mon-{mon_id}"
                mon_node = mon_info["spec"]["template"]["spec"]["nodeSelector"][
                    "kubernetes.io/hostname"
                ]
                log.info(f"Delete the directory `{mon_data_path}` from {mon_node}")
                cmd = f"rm -rf {mon_data_path}"
                ocp_obj = OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
                ocp_obj.exec_oc_debug_cmd(node=mon_node, cmd_list=[cmd])
            else:
                log.info("Delete mon PVC")
                pvc_name = svc["metadata"]["labels"]["pvc_name"]
                pvc_obj = OCP(
                    kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
                )
                pvc_obj.delete(resource_name=pvc_name)

            # Delete the mon service
            log.info("Delete mon service")
            svc_obj = OCP(
                kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            svc_obj.delete(resource_name=svc_name)

            # Edit the cm
            log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
            configmap_obj = OCP(
                kind=constants.CONFIGMAP,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            output_get = configmap_obj.get(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS
            )
            new_data = output_get["data"]
            new_data["csi-cluster-config-json"] = (
                new_data["csi-cluster-config-json"].replace(f'"{mon_endpoint}",', "")
                if new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",') != 1
                else new_data["csi-cluster-config-json"].replace(
                    f',"{mon_endpoint}"', ""
                )
            )
            new_data["data"] = ",".join(
                [
                    value
                    for value in new_data["data"].split(",")
                    if f"{mon_id}=" not in value
                ]
            )
            new_data["mapping"] = (
                new_data["mapping"].replace(f'"{mon_id}":null,', "")
                if new_data["mapping"].find(f'"{mon_id}":null,') != -1
                else new_data["mapping"].replace(f',"{mon_id}":null', "")
            )
            params = f'{{"data": {json.dumps(new_data)}}}'
            log.info(f"Removing {mon_id} entries from configmap")
            configmap_obj.patch(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
                params=params,
                format_type="strategic",
            )
            log.info(
                f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
            )

            # Scale up rook-ceph-operator
            log.info("Scale up rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=1
            ), "Failed to scale up rook-ceph-operator to 1"
            log.info("Successfully scaled up rook-ceph-operator to 1")
            log.info("Validate rook-ceph-operator pod is running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.OPERATOR_LABEL,
                resource_count=1,
                timeout=600,
                sleep=5,
            )

            # Validate all mons are running
            log.info("Validate all mons are up and running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MON_APP_LABEL,
                resource_count=mon_count,
                timeout=1200,
                sleep=5,
            )
            log.info("All mons are up and running")

            # Check the ceph health OK
            ceph_health_check(tries=90, delay=15)

            # Validate all storage pods are running
            wait_for_storage_pods()

            # Sleep for some seconds before deleting another mon
            sleep_time = 300
            log.info(f"Waiting for {sleep_time} seconds before deleting another mon")
            time.sleep(sleep_time)

        # Check the endpoints are different
        log.info("Validate the mon endpoints are changed")
        new_mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        list_new_svc = []
        for new_svc in new_mon_svc:
            cluster_ip = new_svc["spec"]["clusterIP"]
            list_new_svc.append(cluster_ip)
        diff = set(list_new_svc) ^ set(list_old_svc)
        assert len(diff) == len(list_old_svc + list_new_svc), (
            f"Not all endpoints are changed. Set of old "
            f"endpoints {list_old_svc} and new endpoints {list_new_svc}"
        )
        log.info(f"All new mon endpoints are created {list_new_svc}")

        # Create PVC and pods
        log.info(f"Create {interface} PVC")
        pod_obj = pod_factory(interface=interface)
        pod_obj.run_io(storage_type="fs", size="500M")
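
A small self-contained illustration (hypothetical endpoints) of the csi-cluster-config-json edit referenced in the comment above: the deleted mon's endpoint is stripped together with its adjacent comma, whichever side that comma is on.

cfg = '["10.0.0.1:6789","10.0.0.2:6789","10.0.0.3:6789"]'
mon_endpoint = "10.0.0.2:6789"
cfg = (
    cfg.replace(f'"{mon_endpoint}",', "")
    if cfg.find(f'"{mon_endpoint}",') != -1
    else cfg.replace(f',"{mon_endpoint}"', "")
)
print(cfg)  # ["10.0.0.1:6789","10.0.0.3:6789"]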
Example #10
    def test_del_mon_svc(
        self, multi_pvc_factory, validate_all_mon_svc_are_up_at_teardown
    ):
        """
        Test to verify that the same mons come up, run, and join the quorum
        after the mon services are deleted manually

        1. Delete the mon services
        2. Restart the rook operator
        3. Make sure all mon pods are running,
        and same service or endpoints are running
        4. Make sure ceph health Ok and storage pods are running
        5. Create PVC, which should succeed.

        """

        self.sanity_helpers = Sanity()

        # Get all mon services
        mon_svc_before = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()

        # Delete the mon services one by one
        svc_obj = OCP(
            kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        mon_svc_ip_before = []
        for svc in mon_svc_before:
            svc_name = svc["metadata"]["name"]
            mon_svc_ip_before.append(svc["spec"]["clusterIP"])
            log.info(f"Delete mon service {svc_name}")
            svc_obj.delete(resource_name=svc_name)
            # Verify mon services deleted
            svc_obj.wait_for_delete(resource_name=svc_name)

        # Restart the rook-operator pod
        operator_pod_obj = get_operator_pods()
        delete_pods(pod_objs=operator_pod_obj)
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING, selector=constants.OPERATOR_LABEL
        )

        # Verify same mon services are created again
        for svc in mon_svc_before:
            svc_name = svc["metadata"]["name"]
            svc_obj.check_resource_existence(
                should_exist=True, timeout=300, resource_name=svc_name
            )
        log.info("Same old mon services are recreated")

        # Validate all mons are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=len(mon_pods),
            timeout=600,
            sleep=3,
        )

        # Validate same mon services are running
        log.info("Validate same mon services are running")
        mon_svc_after = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_svc_ip_after = [svc["spec"]["clusterIP"] for svc in mon_svc_after]
        assert len(set(mon_svc_ip_after) ^ set(mon_svc_ip_before)) == 0, (
            "Different mon services are running. "
            f"Before mon services list: {mon_svc_ip_before}, "
            f"After mon services list: {mon_svc_ip_after}"
        )
        log.info("Same old mon services are running and all mons are in running state")

        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Create and delete resources
        self.sanity_helpers.create_pvc_delete(multi_pvc_factory=multi_pvc_factory)
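
A tiny illustration (hypothetical ClusterIPs) of the symmetric-difference check used above: an empty result means the two service IP lists are identical, ignoring order.

before = ["172.30.0.10", "172.30.0.11", "172.30.0.12"]
after = ["172.30.0.12", "172.30.0.10", "172.30.0.11"]
assert len(set(after) ^ set(before)) == 0, "Different mon services are running"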
Example #11
    def test_noobaa_rebuild(self, bucket_factory):
        """
        Test case to verify noobaa rebuild. Verifies KCS: https://access.redhat.com/solutions/5948631

        1. Stop the noobaa-operator by setting the replicas of noobaa-operator deployment to 0.
        2. Delete the noobaa deployments/statefulsets.
        3. Delete the PVC db-noobaa-db-0.
        4. Patch existing backingstores and bucketclasses to remove finalizer
        5. Delete the backingstores/bucketclass.
        6. Delete the noobaa secrets.
        7. Restart noobaa-operator by setting the replicas back to 1.
        8. Monitor the pods in openshift-storage for noobaa pods to be Running.

        """

        dep_ocp = OCP(kind=constants.DEPLOYMENT,
                      namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        state_ocp = OCP(kind=constants.STATEFULSET,
                        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        noobaa_pvc_obj = get_pvc_objs(pvc_names=["db-noobaa-db-pg-0"])

        # Scale down noobaa operator
        logger.info(
            f"Scaling down {constants.NOOBAA_OPERATOR_DEPLOYMENT} deployment to replica: 0"
        )
        dep_ocp.exec_oc_cmd(
            f"scale deployment {constants.NOOBAA_OPERATOR_DEPLOYMENT} --replicas=0"
        )

        # Delete noobaa deployments and statefulsets
        logger.info("Deleting noobaa deployments and statefulsets")
        dep_ocp.delete(resource_name=constants.NOOBAA_ENDPOINT_DEPLOYMENT)
        state_ocp.delete(resource_name=constants.NOOBAA_DB_STATEFULSET)
        state_ocp.delete(resource_name=constants.NOOBAA_CORE_STATEFULSET)

        # Delete noobaa-db pvc
        pvc_obj = OCP(kind=constants.PVC,
                      namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        logger.info("Deleting noobaa-db pvc")
        pvc_obj.delete(resource_name=noobaa_pvc_obj[0].name, wait=True)
        pvc_obj.wait_for_delete(resource_name=noobaa_pvc_obj[0].name,
                                timeout=300)

        # Patch and delete existing backingstores
        params = '{"metadata": {"finalizers":null}}'
        bs_obj = OCP(kind=constants.BACKINGSTORE,
                     namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        for bs in bs_obj.get()["items"]:
            assert bs_obj.patch(
                resource_name=bs["metadata"]["name"],
                params=params,
                format_type="merge",
            ), "Failed to change the parameter in backingstore"
            logger.info(f"Deleting backingstore: {bs['metadata']['name']}")
            bs_obj.delete(resource_name=bs["metadata"]["name"])

        # Patch and delete existing bucketclass
        bc_obj = OCP(kind=constants.BUCKETCLASS,
                     namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        for bc in bc_obj.get()["items"]:
            assert bc_obj.patch(
                resource_name=bc["metadata"]["name"],
                params=params,
                format_type="merge",
            ), "Failed to change the parameter in bucketclass"
            logger.info(f"Deleting bucketclass: {bc['metadata']['name']}")
            bc_obj.delete(resource_name=bc["metadata"]["name"])

        # Delete noobaa secrets
        logger.info("Deleting noobaa related secrets")
        dep_ocp.exec_oc_cmd(
            "delete secrets noobaa-admin noobaa-endpoints noobaa-operator noobaa-server"
        )

        # Scale back noobaa-operator deployment
        logger.info(
            f"Scaling back {constants.NOOBAA_OPERATOR_DEPLOYMENT} deployment to replica: 1"
        )
        dep_ocp.exec_oc_cmd(
            f"scale deployment {constants.NOOBAA_OPERATOR_DEPLOYMENT} --replicas=1"
        )

        # Wait and validate noobaa PVC is in bound state
        pvc_obj.wait_for_resource(
            condition=constants.STATUS_BOUND,
            resource_name=noobaa_pvc_obj[0].name,
            timeout=600,
            sleep=120,
        )

        # Validate noobaa pods are up and running
        pod_obj = OCP(kind=constants.POD,
                      namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        noobaa_pods = get_noobaa_pods()
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_count=len(noobaa_pods),
            selector=constants.NOOBAA_APP_LABEL,
            timeout=900,
        )

        # Verify everything running fine
        logger.info(
            "Verifying all resources are Running and match the expected result")
        self.sanity_helpers.health_check(tries=120)

        # Verify default backingstore/bucketclass
        default_bs = OCP(kind=constants.BACKINGSTORE,
                         namespace=constants.OPENSHIFT_STORAGE_NAMESPACE).get(
                             resource_name=DEFAULT_NOOBAA_BACKINGSTORE)
        default_bc = OCP(kind=constants.BUCKETCLASS,
                         namespace=constants.OPENSHIFT_STORAGE_NAMESPACE).get(
                             resource_name=DEFAULT_NOOBAA_BUCKETCLASS)
        assert (default_bs["status"]["phase"] == default_bc["status"]["phase"]
                == constants.STATUS_READY
                ), "Failed: Default bs/bc are not in ready state"

        # Create OBCs
        logger.info("Creating OBCs after noobaa rebuild")
        bucket_factory(amount=3, interface="OC", verify_health=True)
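
A minimal sketch of the finalizer-removal merge patch used above (the backingstore name is hypothetical): clearing metadata.finalizers lets the custom resource be deleted while the noobaa operator is scaled down and cannot process its finalizer itself.

params = '{"metadata": {"finalizers":null}}'
bs_obj = OCP(kind=constants.BACKINGSTORE,
             namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
bs_obj.patch(resource_name="my-backingstore", params=params, format_type="merge")
bs_obj.delete(resource_name="my-backingstore")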
Example #12
class TestPvcMultiSnapshotPerformance(PASTest):
    """
    Tests to measure PVC snapshot creation performance & scale
    The test tries to take the maximal number of snapshots for one PVC
    """
    def setup(self):
        """
        Setting up the test environment :
            Calculating the amount of storage which is available for the test
            Creating a namespace (project) for the test

        """
        log.info("Setting up the test environment")

        super(TestPvcMultiSnapshotPerformance, self).setup()

        self.total_creation_time = 0
        self.total_csi_creation_time = 0
        self.total_creation_speed = 0

        # Getting the total Storage capacity
        try:
            self.ceph_capacity = int(self.ceph_cluster.get_ceph_capacity())
        except Exception as err:
            err_msg = f"Failed to get Storage capacity : {err}"
            log.error(err_msg)
            raise Exception(err_msg)

        # Use 70% of the storage capacity in the test
        self.capacity_to_use = int(self.ceph_capacity * 0.7)

        # Creating new namespace for the test
        self.nss_name = "pas-test-namespace"
        log.info(f"Creating new namespace ({self.nss_name}) for the test")
        try:
            self.proj = helpers.create_project(project_name=self.nss_name)
        except CommandFailed as ex:
            if str(ex).find("(AlreadyExists)") != -1:
                log.warning("The namespace already exists !")
            log.error("Cannot create new project")
            raise CommandFailed(f"{self.nss_name} was not created")

        # Initialize a general Snapshot object to use in the test
        self.snapshot = OCP(kind="volumesnapshot", namespace=self.nss_name)

    def teardown(self):
        """
        Cleaning up the environment :
            Delete all snapshot
            Delete the POD
            Delete the PVC and the PV
            Delete the StorageClass
            Delete the VolumeSnapshotClass
            Delete the data pool
            Switch to the default namespace
            Delete the tested namespace

        """
        log.info("Cleanup the test environment")

        if self.full_teardown:
            # Getting the name of the PVC's backing PV
            try:
                pv = self.pvc_obj.get("spec")["spec"]["volumeName"]
            except KeyError:
                log.error(
                    f"Cannot find key in the PVC object {json.dumps(self.pvc_obj.get('spec').get('spec'), indent=3)}"
                )
                )

            # Getting the list of all snapshots
            try:
                snapshot_list = self.snapshot.get(all_namespaces=True)["items"]
            except Exception as err:
                log.error(f"Cannot get the list of snapshots : {err}")
                snapshot_list = []

            # Deleting all snapshots from the cluster
            log.info(f"Trying to delete all ({len(snapshot_list)}) Snapshots")
            log.debug(
                f"The list of all snapshots is : {json.dumps(snapshot_list, indent=3)}"
            )
            for vs in snapshot_list:
                snap_name = vs["metadata"]["name"]
                log.info(f"Try to delete {snap_name}")
                try:
                    self.snapshot.delete(resource_name=snap_name)
                except Exception as err:
                    log.error(f"Cannot delete {snap_name} : {err}")

            # Deleting the pod which wrote data to the pvc
            log.info(f"Deleting the test POD : {self.pod_obj.name}")
            try:
                self.pod_obj.delete()
                log.info("Wait until the pod is deleted.")
                self.pod_obj.ocp.wait_for_delete(
                    resource_name=self.pod_obj.name)
            except Exception as ex:
                log.error(f"Cannot delete the test pod : {ex}")

            # Deleting the PVC which used in the test.
            try:
                log.info(f"Delete the PVC : {self.pvc_obj.name}")
                self.pvc_obj.delete()
                log.info("Wait until the pvc is deleted.")
                self.pvc_obj.ocp.wait_for_delete(
                    resource_name=self.pvc_obj.name)
            except Exception as ex:
                log.error(f"Cannot delete the test pvc : {ex}")

            # Delete the backend PV of the PVC
            log.info(f"Try to delete the backend PV : {pv}")
            try:
                run_oc_command(f"delete pv {pv}")
            except Exception as ex:
                err_msg = f"cannot delete PV {pv} - [{ex}]"
                log.error(err_msg)

            # Deleting the StorageClass used in the test
            log.info(f"Deleting the test StorageClass : {self.sc_obj.name}")
            try:
                self.sc_obj.delete()
                log.info("Wait until the SC is deleted.")
                self.sc_obj.ocp.wait_for_delete(resource_name=self.sc_obj.name)
            except Exception as ex:
                log.error(f"Can not delete the test sc : {ex}")

            # Deleting the VolumeSnapshotClass used in the test
            log.info(
                f"Deleting the test Snapshot Class : {self.snap_class.name}")
            try:
                self.snap_class.delete()
                log.info("Wait until the VSC is deleted.")
                self.snap_class.ocp.wait_for_delete(
                    resource_name=self.snap_class.name)
            except Exception as ex:
                log.error(f"Can not delete the test vsc : {ex}")

            # Deleting the Data pool
            log.info(f"Deleting the test storage pool : {self.sc_name}")
            self.delete_ceph_pool(self.sc_name)
            # Verify deletion by checking the backend CEPH pools using the toolbox
            results = self.ceph_cluster.toolbox.exec_cmd_on_pod(
                "ceph osd pool ls")
            log.debug(f"Existing pools are : {results}")
            if self.sc_name in results.split():
                log.warning(
                    "The pool was not deleted by CSI, forcing manual deletion"
                )
                self.ceph_cluster.toolbox.exec_cmd_on_pod(
                    f"ceph osd pool delete {self.sc_name} {self.sc_name} "
                    "--yes-i-really-really-mean-it")
            else:
                log.info(f"The pool {self.sc_name} was deleted successfully")

            # Deleting the namespace used by the test
            log.info(f"Deleting the test namespace : {self.nss_name}")
            switch_to_default_rook_cluster_project()
            try:
                self.proj.delete(resource_name=self.nss_name)
                self.proj.wait_for_delete(resource_name=self.nss_name,
                                          timeout=60,
                                          sleep=10)
            except CommandFailed:
                log.error(f"Can not delete project {self.nss_name}")
                raise CommandFailed(f"{self.nss_name} was not deleted")

            # After deleting all data from the cluster, we need to wait until it re-balances
            ceph_health_check(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
                              tries=30,
                              delay=60)

        super(TestPvcMultiSnapshotPerformance, self).teardown()

    def init_full_results(self, full_results):
        """
        Initialize the full results object which will send to the ES server

        Args:
            full_results (obj): an empty FIOResultsAnalyse object

        Returns:
            FIOResultsAnalyse (obj): the input object fill with data

        """
        for key in self.environment:
            full_results.add_key(key, self.environment[key])
        full_results.add_key("index", full_results.new_index)
        full_results.add_key("snapshot_num", self.num_of_snaps)
        full_results.add_key("pvc_size", self.pvc_size)
        full_results.add_key("storageclass", self.sc_name.split("-")[-1])
        full_results.add_key("dataset", self.capacity_to_use)
        return full_results

    def get_csi_pod(self, namespace):
        """
        Getting the pod list in a specific namespace, for the provisioning logs

        Args:
            namespace (str): the namespace where the pod is deployed.

        Returns:
            list : list of lines from the output of the command.

        """
        results = run_oc_command(cmd="get pod", namespace=namespace)
        if ERRMSG in results:
            err_msg = "Can not get the CSI controller pod"
            log.error(err_msg)
            raise Exception(err_msg)
        return results

    def build_fio_command(self):
        """
        Building the FIO command that will be run on the pod before each snapshot

        """
        # Find the path that the PVC is mounted within the POD
        path = (self.pod_obj.get("spec").get("spec").get("containers")[0].get(
            "volumeMounts")[0].get("mountPath"))
        self.fio_cmd = (
            "fio --name=fio-fillup --rw=write --bs=4m --direct=1 --numjobs=1"
            " --time_based=0 --runtime=36000 --ioengine=libaio --end_fsync=1"
            f" --filename={path}/{self.file_name} --size={self.file_size}"
            " --output-format=json")
        log.info(f"The FIO command is : {self.fio_cmd}")

    def create_snapshotclass(self, interface):
        """
        Creates own VolumeSnapshotClass

        Args:
            interface (str): Interface type used

        Returns:
            ocs_obj (obj): SnapshotClass obj instances

        """
        if interface == constants.CEPHFILESYSTEM:
            snapclass_name = "pas-test-cephfs-snapshot-class"
        else:
            snapclass_name = "pas-test-rbd-snapshot-class"

        yaml_files = {
            constants.CEPHBLOCKPOOL: constants.CSI_RBD_SNAPSHOTCLASS_YAML,
            constants.CEPHFILESYSTEM: constants.CSI_CEPHFS_SNAPSHOTCLASS_YAML,
        }
        snapshotclass_data = templating.load_yaml(yaml_files[interface])

        snapshotclass_data["metadata"]["name"] = snapclass_name
        ocs_obj = ocs.OCS(**snapshotclass_data)
        log.info(f"Creating new snapshot class : {snapclass_name}")
        try:
            created_snapclass = ocs_obj.create(do_reload=True)
            log.debug(created_snapclass)
        except Exception as ex:
            err_msg = f"Failed to create new snapshot class : {snapclass_name} [{ex}]"
            log.error(err_msg)
            raise Exception(err_msg)
        return ocs_obj

    def create_snapshot(self, snap_num):
        """
        Create a snapshot of the volume and measure the total snapshot
        creation time and the CSI creation time

        Args:
            snap_num (int): the number of the snapshot to create

        Returns:
            tuple: the total creation time and the CSI creation time of the
                snapshot (in sec.)

        """
        log.info(f"Taking snapshot number {snap_num}")
        # Getting UTC time before test starting for log retrieve
        start_time = self.get_time("csi")
        snap_name = f"pvc-snap-{snap_num}-"
        snap_name += self.pvc_obj.name.split("-")[-1]
        self.snap_templ["metadata"]["name"] = snap_name
        self.snap_templ["spec"][
            "volumeSnapshotClassName"] = self.snap_class.name

        fd, tmpfile = tempfile.mkstemp(suffix=".yaml", prefix="Snap")
        log.debug(f"Going to create {tmpfile}")
        with open(tmpfile, "w") as f:
            yaml.dump(self.snap_templ, f, default_flow_style=False)

        res = run_oc_command(cmd=f"create -f {tmpfile}",
                             namespace=self.nss_name)
        if ERRMSG in res[0]:
            err_msg = f"Failed to create snapshot : {res}"
            log.error(err_msg)
            raise Exception(err_msg)

        # wait until snapshot is ready
        timeout = 720
        sleep_time = 10
        snap_con_name = None
        snap_uid = None
        while timeout > 0:
            res = run_oc_command(f"get volumesnapshot {snap_name} -o yaml",
                                 namespace=self.nss_name)

            if ERRMSG not in res[0]:
                res = yaml.safe_load("\n".join(res))
                log.debug(f"The command output is : {yaml.dump(res)}")
                try:
                    if res["status"]["readyToUse"]:
                        log.info(f"{snap_name} Created and ready to use")
                        snap_con_name = res["status"][
                            "boundVolumeSnapshotContentName"]
                        snap_uid = res["metadata"]["uid"]
                        break
                    else:
                        log.info(
                            f"{snap_name} is not ready yet, sleep {sleep_time} sec before re-check"
                        )
                        time.sleep(sleep_time)
                        timeout -= sleep_time
                except Exception:
                    log.info(
                        f"{snap_name} is not ready yet, sleep {sleep_time} sec before re-check"
                    )
                    time.sleep(sleep_time)
                    timeout -= sleep_time

            else:
                err_msg = f"Can not get snapshot status {res}"
                log.error(err_msg)
                raise Exception(err_msg)
        if snap_con_name:
            creation_time = performance_lib.measure_total_snapshot_creation_time(
                snap_name, start_time)
            csi_creation_time = performance_lib.measure_csi_snapshot_creation_time(
                self.interface, snap_uid, start_time)
            return (creation_time, csi_creation_time)
        else:
            err_msg = "Snapshot was not created on time"
            log.error(err_msg)
            raise TimeoutError(err_msg)

    def run(self):
        """
        Running the test
            for each snapshot : write data on the pod and take snapshot
        """
        results = []
        for test_num in range(1, self.num_of_snaps + 1):
            log.info(f"Starting test number {test_num}")

            # Running IO on the POD - (re)-write data on the PVC
            self.pod_obj.exec_cmd_on_pod(self.fio_cmd,
                                         out_yaml_format=False,
                                         timeout=3600)

            # Taking Snapshot of the PVC
            ct, sci_ct = self.create_snapshot(test_num)
            speed = self.filesize / ct
            self.total_creation_time += ct
            self.total_csi_creation_time += sci_ct
            self.total_creation_speed += speed

            results.append({
                "Snap Num": test_num,
                "time": ct,
                "csi_time": sci_ct,
                "speed": speed
            })
            log.info(f"Results for snapshot number {test_num} are : "
                     f"Creation time is {ct} , Creation speed {speed}, "
                     f"Csi creation time is {sci_ct}")

        log.debug(f"All results are : {json.dumps(results, indent=3)}")
        return results

    @pytest.mark.polarion_id("OCS-2623")
    @pytest.mark.parametrize(
        argnames=["interface_type", "snap_number"],
        argvalues=[
            pytest.param(*[constants.CEPHBLOCKPOOL, 512]),
            pytest.param(*[constants.CEPHFILESYSTEM, 100]),
        ],
    )
    def test_pvc_multiple_snapshot_performance(
        self,
        pvc_factory,
        pod_factory,
        secret_factory,
        interface_type,
        snap_number,
    ):
        """
        1. Create a PVC
           the size depends on the storage capacity, but is not less than 1 GiB;
           it will use ~75% of the storage capacity, minimum storage capacity is 1 TiB
        2. Fill the PVC with 80% of data
        3. Take a snapshot of the PVC and measure the total and CSI creation times.
        4. Re-write the data on the PVC
        5. Take a snapshot of the PVC and measure the total and CSI creation times.
        6. Repeat steps 4-5 for the number of snapshots we want to take: 512
           (this will be run by an outside script for low memory consumption)
        7. Print all information.

        Raises:
            StorageNotSufficientException: in case of not enough capacity

        """

        # Getting the full path for the test logs
        self.results_path = get_full_test_logs_path(cname=self)
        self.full_log_path = f"{self.results_path}-{interface_type}-{snap_number}"
        log.info(f"Logs file path name is : {self.full_log_path}")
        log.info(f"Reslut path is : {self.results_path}")

        self.full_teardown = True
        self.num_of_snaps = snap_number
        if self.dev_mode:
            self.num_of_snaps = 2

        log.info(
            f"Going to create {self.num_of_snaps} {interface_type} snapshots")

        # since we do not want to use more than 65%, we add 35% to the needed
        # capacity, and the minimum PVC size is 1 GiB
        self.need_capacity = int((self.num_of_snaps + 2) * 1.35)

        # Test will run only on system with enough capacity
        if self.capacity_to_use < self.need_capacity:
            err_msg = (f"The system have only {self.ceph_capacity} GiB, "
                       f"we want to use only {self.capacity_to_use} GiB, "
                       f"and we need {self.need_capacity} GiB to run the test")
            log.error(err_msg)
            raise exceptions.StorageNotSufficientException(err_msg)

        # Calculating the PVC size in GiB
        self.pvc_size = int(self.capacity_to_use / (self.num_of_snaps + 2))
        if self.dev_mode:
            self.pvc_size = 5

        self.interface = interface_type
        self.sc_name = "pas-testing-rbd"
        pool_name = self.sc_name
        if self.interface == constants.CEPHFILESYSTEM:
            self.sc_name = "pas-testing-cephfs"
            pool_name = f"{self.sc_name}-data0"

        # Creating new storage pool
        self.create_new_pool(self.sc_name)

        # Creating new StorageClass (pool) for the test.
        secret = secret_factory(interface=self.interface)
        self.sc_obj = helpers.create_storage_class(
            interface_type=self.interface,
            interface_name=pool_name,
            secret_name=secret.name,
            sc_name=self.sc_name,
            fs_name=self.sc_name,
        )
        log.info(f"The new SC is : {self.sc_obj.name}")
        log.debug(f"All SC data is {json.dumps(self.sc_obj.data, indent=3)}")

        # Create new VolumeSnapshotClass
        self.snap_class = self.create_snapshotclass(self.interface)

        # Create new PVC
        log.info(f"Creating {self.pvc_size} GiB PVC of {interface_type}")
        self.pvc_obj = pvc_factory(
            interface=self.interface,
            storageclass=self.sc_obj,
            size=self.pvc_size,
            status=constants.STATUS_BOUND,
            project=self.proj,
        )

        # Create a POD which will attach to the new PVC
        log.info("Creating A POD")
        self.pod_obj = pod_factory(
            interface=self.interface,
            pvc=self.pvc_obj,
            status=constants.STATUS_RUNNING,
            pod_dict_path=constants.PERF_POD_YAML,
        )

        # Calculating the file size as 80% of the PVC size
        self.filesize = self.pvc_obj.size * 0.80
        # Change the file size to MB for the FIO function
        self.file_size = f"{int(self.filesize * constants.GB2MB)}M"
        self.file_name = self.pod_obj.name

        log.info(
            f"Total capacity size is : {self.ceph_capacity} GiB, "
            f"Going to use {self.need_capacity} GiB, "
            f"With {self.num_of_snaps} Snapshots to {self.pvc_size} GiB PVC. "
            f"File size to be written is : {self.file_size} "
            f"with the name of {self.file_name}")

        # Reading basic snapshot yaml file
        self.snap_yaml = constants.CSI_CEPHFS_SNAPSHOT_YAML
        self.sc = constants.DEFAULT_VOLUMESNAPSHOTCLASS_CEPHFS
        self.fs_type = "cephfs"
        if interface_type == constants.CEPHBLOCKPOOL:
            self.snap_yaml = constants.CSI_RBD_SNAPSHOT_YAML
            self.fs_type = "rbd"
            self.sc = constants.DEFAULT_VOLUMESNAPSHOTCLASS_RBD
        with open(self.snap_yaml, "r") as stream:
            try:
                self.snap_templ = yaml.safe_load(stream)
                self.snap_templ["spec"]["volumeSnapshotClassName"] = self.sc
                self.snap_templ["spec"]["source"][
                    "persistentVolumeClaimName"] = self.pvc_obj.name
            except yaml.YAMLError as exc:
                log.error(f"Can not read template yaml file {exc}")
        log.debug(
            f"Snapshot yaml file : {self.snap_yaml} "
            f"Content of snapshot yaml file {json.dumps(self.snap_templ, indent=4)}"
        )

        self.build_fio_command()
        self.start_time = self.get_time()

        # Initialize the results doc file.
        full_results = self.init_full_results(
            ResultsAnalyse(self.uuid, self.crd_data, self.full_log_path,
                           "multiple_snapshots"))
        full_results.all_results = self.run()
        self.end_time = self.get_time()
        full_results.add_key(
            "avg_creation_time",
            f"{float(self.total_creation_time / self.num_of_snaps):.2f}",
        )
        full_results.add_key(
            "avg_csi_creation_time",
            f"{float(self.total_csi_creation_time / self.num_of_snaps):.2f}",
        )
        full_results.add_key(
            "avg_creation_speed",
            f"{float(self.total_creation_speed / self.num_of_snaps):.2f}",
        )
        full_results.add_key("test_time", {
            "start": self.start_time,
            "end": self.end_time
        })

        # Writing the analyzed test results to the Elastic-Search server
        if full_results.es_write():
            res_link = full_results.results_link()
            log.info(f"The Result can be found at : {res_link}")

            # Create text file with results of all subtests (2 - according to the parameters)
            self.write_result_to_file(res_link)

    def test_pvc_multiple_snapshot_performance_results(self):
        """
        This is not a test - it only checks that the previous tests were completed and finished
        as expected, and reports the full results (links in the ES) of the previous 2 tests
        """

        self.full_teardown = False
        self.number_of_tests = 2
        results_path = get_full_test_logs_path(
            cname=self, fname="test_pvc_multiple_snapshot_performance")
        self.results_file = os.path.join(results_path, "all_results.txt")
        log.info(f"Check results in {self.results_file}.")
        self.check_tests_results()

        self.push_to_dashboard(test_name="PVC Multiple Snapshots Creation")
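
# A minimal, standalone sketch (not part of the class above) of how the per-snapshot
# results returned by run() could be aggregated into the averages that are pushed to
# Elasticsearch. The helper name and the sample numbers below are hypothetical.
def summarize_snapshot_results(results):
    """Return the average creation time, CSI creation time and speed of a results list."""
    count = len(results)
    return {
        "avg_creation_time": round(sum(r["time"] for r in results) / count, 2),
        "avg_csi_creation_time": round(sum(r["csi_time"] for r in results) / count, 2),
        "avg_creation_speed": round(sum(r["speed"] for r in results) / count, 2),
    }

# summarize_snapshot_results([
#     {"Snap Num": 1, "time": 3.2, "csi_time": 1.1, "speed": 25.0},
#     {"Snap Num": 2, "time": 2.8, "csi_time": 0.9, "speed": 28.6},
# ])  # -> {"avg_creation_time": 3.0, "avg_csi_creation_time": 1.0, "avg_creation_speed": 26.8}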
Beispiel #13
0
class ElasticSearch(object):
    """
    ElasticSearch Environment
    """
    def __init__(self, **kwargs):
        """
        Initializer function

        """
        log.info("Initializing the Elastic-Search environment object")
        self.args = kwargs
        self.namespace = "elastic-system"
        self.repo = self.args.get("repo", constants.OCS_WORKLOADS)
        self.branch = self.args.get("branch", "master")
        self.dir = tempfile.mkdtemp(prefix="eck_")

        # Clone the ECK repo locally
        self._clone()

        self.eck_path = os.path.join(self.dir, "ocs-workloads/eck")
        self.eck_file = os.path.join(self.eck_path, "crds.yaml")
        self.dumper_file = os.path.join(constants.TEMPLATE_APP_POD_DIR,
                                        "esclient.yaml")
        self.crd = os.path.join(constants.TEMPLATE_APP_POD_DIR, "esq.yaml")

        # Creating some different types of OCP objects
        self.ocp = OCP(kind="pod",
                       resource_name="elastic-operator-0",
                       namespace=self.namespace)
        self.ns_obj = OCP(kind="namespace", namespace=self.namespace)
        self.es = OCP(resource_name="quickstart-es-http",
                      namespace=self.namespace)
        self.elasticsearch = OCP(namespace=self.namespace,
                                 kind="elasticsearch")
        self.password = OCP(
            kind="secret",
            resource_name="quickstart-es-elastic-user",
            namespace=self.namespace,
        )

        # Deploy the ECK all-in-one.yaml file
        self._deploy_eck()
        # Deploy the Elastic-Search server
        self._deploy_es()

        # Verify that ES is Up & Running
        sample = TimeoutSampler(timeout=180, sleep=10, func=self.get_health)
        if not sample.wait_for_func_status(True):
            raise Exception("Elasticsearch deployment Failed")

        # Deploy the elasticsearch dumper pod
        self._deploy_data_dumper_client()

        # Connect to the server
        self.con = self._es_connect()

    def _clone(self):
        """
        clone the ECK repo into temp directory

        """
        try:
            log.info(f"Cloning ECK in {self.dir}")
            git_clone_cmd = f"git clone -b {self.branch} {self.repo} --depth 1"
            run(git_clone_cmd, shell=True, cwd=self.dir, check=True)
        except (CommandFailed, CalledProcessError) as cf:
            log.error("Error during cloning of ECK repository")
            raise cf

    def _pod_is_found(self, pattern):
        """
        Boolean function which checks if a pod (found by the pattern) exists.

        Args:
            pattern (str): the pattern of the pod to look for

        Returns:
            bool : True if pod found, otherwise False
        """
        return len(get_pod_name_by_pattern(pattern, self.namespace)) > 0

    def _deploy_eck(self):
        """
        Deploying the ECK environment for the Elasticsearch cluster, and making
        sure it is in Running mode

        """

        log.info("Deploying the ECK environment for the ES cluster")
        log.info("Deploy the ECK CRD's")
        self.ocp.apply(self.eck_file)
        log.info("deploy the ECK operator")
        self.ocp.apply(f"{self.eck_path}/operator.yaml")

        sample = TimeoutSampler(timeout=300,
                                sleep=10,
                                func=self._pod_is_found,
                                pattern="elastic-operator")
        if not sample.wait_for_func_status(True):
            err_msg = "ECK deployment Failed"
            log.error(err_msg)
            self.cleanup()
            raise Exception(err_msg)

        log.info("The ECK pod is ready !")

    def _deploy_data_dumper_client(self):
        """
        Deploying the Elasticsearch client pod with a utility which dumps all the data
        from the server to a .tgz file

        """

        log.info("Deploying the es client for dumping all data")
        self.ocp.apply(self.dumper_file)

        sample = TimeoutSampler(timeout=300,
                                sleep=10,
                                func=self._pod_is_found,
                                pattern="es-dumper")
        if not sample.wait_for_func_status(True):
            self.cleanup()
            raise Exception("Dumper pod deployment Failed")
        self.dump_pod = get_pod_name_by_pattern("es-dumper", self.namespace)[0]
        log.info(f"The dumper client pod {self.dump_pod} is ready !")

    def get_ip(self):
        """
        This function returns the IP address of the Elasticsearch cluster.
        This IP is to be used inside the OCP cluster.

        Returns:
            str : String that represents the IP address.

        """
        return self.es.get()["spec"]["clusterIP"]

    def get_port(self):
        """
        This function returns the port of the Elasticsearch cluster.

        Returns:
            str : String that represents the port.

        """
        return self.es.get()["spec"]["ports"][0]["port"]

    def _deploy_es(self):
        """
        Deploying the Elasticsearch server

        """

        # Creating a PVC for the elasticsearch server and waiting until it is bound
        log.info("Creating a 10 GiB PVC for the ElasticSearch cluster")
        self.pvc_obj = create_pvc(
            sc_name=constants.CEPHBLOCKPOOL_SC,
            namespace=self.namespace,
            pvc_name="elasticsearch-data-quickstart-es-default-0",
            access_mode=constants.ACCESS_MODE_RWO,
            size="10Gi",
        )
        wait_for_resource_state(self.pvc_obj, constants.STATUS_BOUND)
        self.pvc_obj.reload()

        log.info("Deploy the ElasticSearch cluster")
        self.ocp.apply(self.crd)

        sample = TimeoutSampler(
            timeout=300,
            sleep=10,
            func=self._pod_is_found,
            pattern="quickstart-es-default",
        )
        if not sample.wait_for_func_status(True):
            self.cleanup()
            raise Exception("The ElasticSearch pod deployment Failed")
        self.espod = get_pod_name_by_pattern("quickstart-es-default",
                                             self.namespace)[0]
        log.info(f"The ElasticSearch pod {self.espod} Started")

        es_pod = OCP(kind="pod", namespace=self.namespace)
        log.info("Waiting for ElasticSearch to Run")
        assert es_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=self.espod,
            sleep=30,
            timeout=600,
        )
        log.info("Elastic Search is ready !!!")

    def get_health(self):
        """
        This method returns the health status of the Elasticsearch cluster.

        Returns:
            bool : True if the status is green (OK), otherwise False

        """
        return self.elasticsearch.get(
        )["items"][0]["status"]["health"] == "green"

    def get_password(self):
        """
        This method returns the password used to connect to the Elasticsearch cluster.

        Returns:
            str : The password as text

        """
        return base64.b64decode(
            self.password.get()["data"]["elastic"]).decode("utf-8")

    def cleanup(self):
        """
        Cleanup the environment from all Elasticsearch components, and from the
        port forwarding process.

        """
        log.info("Teardown the Elasticsearch environment")
        log.info("Deleting all resources")
        log.info("Deleting the dumper client pod")
        self.ocp.delete(yaml_file=self.dumper_file)
        log.info("Deleting the es resource")
        self.ocp.delete(yaml_file=self.crd)
        log.info("Deleting the es project")
        # self.ns_obj.delete_project(project_name=self.namespace)
        self.ocp.delete(f"{self.eck_path}/operator.yaml")
        self.ocp.delete(yaml_file=self.eck_file)
        self.ns_obj.wait_for_delete(resource_name=self.namespace, timeout=180)

    def _es_connect(self):
        """
        Create a connection to the local ES

        Returns:
            Elasticsearch: elasticsearch connection object, or None if it cannot connect to ES

        """
        try:
            es = Elasticsearch([{
                "host": self.get_ip(),
                "port": self.get_port()
            }])
        except esexp.ConnectionError:
            log.warning("Cannot connect to ES server in the LocalServer")
            es = None
        return es

    def get_indices(self):
        """
        Getting the list of all indices in the ES server - all were created by the test,
        since the ES installation had no pre-installed indices.

        Returns:
            list : list of all indices defined in the ES server

        """
        results = []
        log.info("Getting all indices")
        for ind in self.con.indices.get_alias("*"):
            results.append(ind)
        return results

    def dumping_all_data(self, target_path):
        """
        Dump All data from the internal ES server to .tgz file.

        Args:
            target_path (str): the path the results file will be copied into

        Returns:
            bool: True if the dump operation succeeded and the results data was returned to the host,
                  otherwise False
        """

        log.info("dumping data from ES server to .tgz file")
        rsh_cmd = f"rsh {self.dump_pod} /elasticsearch-dump/esdumper.py --ip {self.get_ip()} --port {self.get_port()}"
        result = self.ocp.exec_oc_cmd(rsh_cmd,
                                      out_yaml_format=False,
                                      timeout=1200)
        if "ES dump is done." not in result:
            log.error("There is no data in the Elasticsearch server")
            return False
        else:
            src_file = result.split()[-1]
            log.info(f"Copy {src_file} from the client pod")

            cp_command = f"cp {self.dump_pod}:{src_file} {target_path}/FullResults.tgz"
            result = self.ocp.exec_oc_cmd(cp_command, timeout=120)
            log.info(f"The output from the POD is {result}")
            log.info("Extracting the FullResults.tgz file")
            kwargs = {"cwd": target_path}
            results = run_command(f"tar zxvf {target_path}/FullResults.tgz",
                                  **kwargs)
            log.debug(f"The untar results is {results}")
            if "Error in command" in results:
                log.warning("Cannot untar the dumped file")
                return False

        return True
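
# A minimal usage sketch for the ElasticSearch wrapper above, assuming an OCP cluster
# context is already configured. The target directory "/tmp/es-results" is hypothetical.
def example_es_session():
    es = ElasticSearch()
    log.info(f"Internal ES endpoint is {es.get_ip()}:{es.get_port()}")
    log.info(f"Indices collected so far : {es.get_indices()}")
    # Dump all collected data into /tmp/es-results/FullResults.tgz
    if not es.dumping_all_data(target_path="/tmp/es-results"):
        log.warning("No data was dumped from the internal ES server")
    es.cleanup()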
Beispiel #14
0
class TestPvcMultiSnapshotPerformance(PASTest):
    """
    Tests to measure PVC snapshots creation performance & scale
    The test tries to take the maximal number of snapshots for one PVC
    """
    def setup(self):
        """
        Setting up the test environment :
            Calculating the amount of storage which is available for the test
            Creating namespace (project) for the test

        """
        log.info("Setting up the test environment")

        super(TestPvcMultiSnapshotPerformance, self).setup()

        # Getting the total Storage capacity
        try:
            self.ceph_capacity = int(self.ceph_cluster.get_ceph_capacity())
        except Exception as err:
            err_msg = f"Failed to get Storage capacity : {err}"
            log.error(err_msg)
            raise Exception(err_msg)

        # Use 70% of the storage capacity in the test
        self.capacity_to_use = int(self.ceph_capacity * 0.7)

        # Creating new namespace for the test
        self.nss_name = "pas-test-namespace"
        log.info(f"Creating new namespace ({self.nss_name}) for the test")
        try:
            self.proj = helpers.create_project(project_name=self.nss_name)
        except CommandFailed as ex:
            if "(AlreadyExists)" in str(ex):
                log.warning("The namespace already exists!")
            log.error("Can not create new project")
            raise CommandFailed(f"{self.nss_name} was not created")

        # Initialize a general Snapshot object to use in the test
        self.snapshot = OCP(kind="volumesnapshot", namespace=self.nss_name)

    def teardown(self):
        """
        Cleaning up the environment :
            Delete all snapshots
            Delete the POD
            Delete the PVC and the PV
            Delete the StorageClass
            Delete the VolumeSnapshotClass
            Delete the data pool
            Switch to the default namespace
            Delete the tested namespace

        """
        log.info("Cleanup the test environment")

        # Getting the name of the PVC's backing PV
        try:
            pv = self.pvc_obj.get("spec")["spec"]["volumeName"]
        except KeyError:
            pv = None
            log.error(
                f"Cannot find the volumeName key in the PVC object {json.dumps(self.pvc_obj.get('spec').get('spec'), indent=3)}"
            )

        # Getting the list of all snapshots
        try:
            snapshot_list = self.snapshot.get(all_namespaces=True)["items"]
        except Exception as err:
            log.error(f"Cannot get the list of snapshots : {err}")
            snapshot_list = []

        # Deleting all snapshots from the cluster
        log.info(f"Trying to delete all ({len(snapshot_list)}) Snapshots")
        log.debug(
            f"The list of all snapshots is : {json.dumps(snapshot_list, indent=3)}"
        )
        for vs in snapshot_list:
            snap_name = vs["metadata"]["name"]
            log.info(f"Try to delete {snap_name}")
            try:
                self.snapshot.delete(resource_name=snap_name)
            except Exception as err:
                log.error(f"Can not delete {snap_name} : {err}")

        # Deleting the pod which wrote data to the pvc
        log.info(f"Deleting the test POD : {self.pod_obj.name}")
        try:
            self.pod_obj.delete()
            log.info("Wait until the pod is deleted.")
            self.pod_obj.ocp.wait_for_delete(resource_name=self.pod_obj.name)
        except Exception as ex:
            log.error(f"Can not delete the test pod : {ex}")

        # Deleting the PVC which used in the test.
        log.info(f"Delete the PVC : {self.pvc_obj.name}")
        try:
            self.pvc_obj.delete()
            log.info("Wait until the pvc is deleted.")
            self.pvc_obj.ocp.wait_for_delete(resource_name=self.pvc_obj.name)
        except Exception as ex:
            log.error(f"Can not delete the test pvc : {ex}")

        # Delete the backend PV of the PVC
        log.info(f"Try to delete the backend PV : {pv}")
        try:
            run_oc_command(f"delete pv {pv}")
        except Exception as ex:
            err_msg = f"can not delete PV {pv} - [{ex}]"
            log.error(err_msg)

        # Deleting the StorageClass used in the test
        log.info(f"Deleting the test StorageClass : {self.sc_obj.name}")
        try:
            self.sc_obj.delete()
            log.info("Wait until the SC is deleted.")
            self.sc_obj.ocp.wait_for_delete(resource_name=self.sc_obj.name)
        except Exception as ex:
            log.error(f"Can not delete the test sc : {ex}")

        # Deleting the VolumeSnapshotClass used in the test
        log.info(f"Deleting the test Snapshot Class : {self.snap_class.name}")
        try:
            self.snap_class.delete()
            log.info("Wait until the VSC is deleted.")
            self.snap_class.ocp.wait_for_delete(
                resource_name=self.snap_class.name)
        except Exception as ex:
            log.error(f"Can not delete the test vsc : {ex}")

        # Deleting the Data pool
        log.info(f"Deleting the test storage pool : {self.sc_name}")
        self.delete_ceph_pool(self.sc_name)
        # Verify deletion by checking the backend CEPH pools using the toolbox
        results = self.ceph_cluster.toolbox.exec_cmd_on_pod("ceph osd pool ls")
        log.debug(f"Existing pools are : {results}")
        if self.sc_name in results.split():
            log.warning(
                "The pool did not deleted by CSI, forcing delete it manually")
            self.ceph_cluster.toolbox.exec_cmd_on_pod(
                f"ceph osd pool delete {self.sc_name} {self.sc_name} "
                "--yes-i-really-really-mean-it")
        else:
            log.info(f"The pool {self.sc_name} was deleted successfully")

        # Deleting the namespace used by the test
        log.info(f"Deleting the test namespace : {self.nss_name}")
        switch_to_default_rook_cluster_project()
        try:
            self.proj.delete(resource_name=self.nss_name)
            self.proj.wait_for_delete(resource_name=self.nss_name,
                                      timeout=60,
                                      sleep=10)
        except CommandFailed:
            log.error(f"Can not delete project {self.nss_name}")
            raise CommandFailed(f"{self.nss_name} was not created")

        super(TestPvcMultiSnapshotPerformance, self).teardown()

    def get_csi_pod(self, namespace):
        """
        Getting the pod list in a specific namespace, for the provisioner logs

        Args:
            namespace (str): the namespace where the pod is deployed.

        Returns:
            list : list of lines from the output of the command.

        """
        results = run_oc_command(cmd="get pod", namespace=namespace)
        if ERRMSG in results:
            err_msg = "Can not get the CSI controller pod"
            log.error(err_msg)
            raise Exception(err_msg)
        return results

    def get_log_names(self):
        """
        Finding the names of the pods that hold the snapshot logs:
        the start time is in the 'csi-snapshot-controller' pod, and
        the end time is in the provisioner pod (csi-snapshotter container)

        """
        self.log_names = {"start": [], "end": []}
        log.info("Looking for logs pod name")

        # Getting csi log name for snapshot start creation messages
        results = self.get_csi_pod(
            namespace="openshift-cluster-storage-operator")
        for line in results:
            if "csi-snapshot-controller" in line and "operator" not in line:
                self.log_names["start"].append(line.split()[0])

        # Getting csi log name for snapshot end creation messages
        results = self.get_csi_pod(namespace="openshift-storage")
        for line in results:
            if "prov" in line and self.fs_type in line:
                self.log_names["end"].append(line.split()[0])

        log.info(
            f"The CSI logs for the test are : {json.dumps(self.log_names, indent=4)}"
        )

    def build_fio_command(self):
        """
        Building the FIO command that will be run on the pod before each snapshot

        """
        # Find the path that the PVC is mounted within the POD
        path = (self.pod_obj.get("spec").get("spec").get("containers")[0].get(
            "volumeMounts")[0].get("mountPath"))
        self.fio_cmd = (
            "fio --name=fio-fillup --rw=write --bs=4m --direct=1 --numjobs=1"
            " --time_based=0 --runtime=36000 --ioengine=libaio --end_fsync=1"
            f" --filename={path}/{self.file_name} --size={self.file_size}"
            " --output-format=json")
        log.info(f"The FIO command is : {self.fio_cmd}")

    def create_snapshotclass(self, interface):
        """
        Creates own VolumeSnapshotClass

        Args:
            interface (str): Interface type used

        Returns:
            ocs_obj (obj): SnapshotClass obj instance

        """
        if interface == constants.CEPHFILESYSTEM:
            snapclass_name = "pas-test-cephfs-snapshot-class"
        else:
            snapclass_name = "pas-test-rbd-snapshot-class"

        yaml_files = {
            constants.CEPHBLOCKPOOL: constants.CSI_RBD_SNAPSHOTCLASS_YAML,
            constants.CEPHFILESYSTEM: constants.CSI_CEPHFS_SNAPSHOTCLASS_YAML,
        }
        snapshotclass_data = templating.load_yaml(yaml_files[interface])

        snapshotclass_data["metadata"]["name"] = snapclass_name
        ocs_obj = ocs.OCS(**snapshotclass_data)
        log.info(f"Creating new snapshot class : {snapclass_name}")
        try:
            created_snapclass = ocs_obj.create(do_reload=True)
            log.debug(created_snapclass)
        except Exception as ex:
            err_msg = f"Failed to create new snapshot class : {snapclass_name} [{ex}]"
            log.error(err_msg)
            raise Exception(err_msg)
        return ocs_obj

    def create_snapshot(self, snap_num):
        """
        Creating a snapshot of the volume and measuring the creation time

        Args:
            snap_num (int): the number of the snapshot to create

        Returns:
            int: the creation time of the snapshot (in sec.)

        """
        log.info(f"Taking snapshot number {snap_num}")
        # Getting UTC time before test starting for log retrieve
        UTC_datetime = datetime.datetime.utcnow().strftime(
            "%Y-%m-%dT%H:%M:%SZ")
        snap_name = f"pvc-snap-{snap_num}-"
        snap_name += self.pvc_obj.name.split("-")[-1]
        self.snap_templ["metadata"]["name"] = snap_name
        self.snap_templ["spec"][
            "volumeSnapshotClassName"] = self.snap_class.name

        fd, tmpfile = tempfile.mkstemp(suffix=".yaml", prefix="Snap")
        log.debug(f"Going to create {tmpfile}")
        with open(tmpfile, "w") as f:
            yaml.dump(self.snap_templ, f, default_flow_style=False)

        res = run_oc_command(cmd=f"create -f {tmpfile}",
                             namespace=self.nss_name)
        if ERRMSG in res[0]:
            err_msg = f"Failed to create snapshot : {res}"
            log.error(err_msg)
            raise Exception(err_msg)

        # wait until snapshot is ready
        timeout = 720
        sleep_time = 10
        snap_con_name = None
        while timeout > 0:
            res = run_oc_command(f"get volumesnapshot {snap_name} -o yaml",
                                 namespace=self.nss_name)

            if ERRMSG not in res[0]:
                res = yaml.safe_load("\n".join(res))
                log.debug(f"The command output is : {yaml.dump(res)}")
                try:
                    if res["status"]["readyToUse"]:
                        log.info(f"{snap_name} Created and ready to use")
                        snap_con_name = res["status"][
                            "boundVolumeSnapshotContentName"]
                        break
                    else:
                        log.info(
                            f"{snap_name} is not ready yet, sleep {sleep_time} sec before re-check"
                        )
                        time.sleep(sleep_time)
                        timeout -= sleep_time
                except Exception:
                    log.info(
                        f"{snap_name} is not ready yet, sleep {sleep_time} sec before re-check"
                    )
                    time.sleep(sleep_time)
                    timeout -= sleep_time

            else:
                err_msg = f"Can not get snapshot status {res}"
                log.error(err_msg)
                raise Exception(err_msg)
        if snap_con_name:
            return self.get_creation_time(snap_name, snap_con_name,
                                          UTC_datetime)
        else:
            err_msg = "Snapshot did not created on time"
            log.error(err_msg)
            raise TimeoutError(err_msg)

    def read_logs(self, kind, namespace, start_time):
        """
        Reading the csi-driver logs. Since we use different logs for the start time
        and for the end time of snapshot creation, we call this function twice.

        Args:
            kind (str): the kind of logs to read, 'start' or 'end'
            namespace (str): the namespace in which the pod exists
            start_time (time): the start time of the specific test,
               so we don't need to read the full log

        Returns:
            list : the content of all read log(s) - can be more than one log

        """
        logs = []
        # The pod with the logs for 'start' creation time has only one container
        container = ""
        if kind == "end":
            # The pod with the logs for 'end' creation time has more than one container
            container = "-c csi-snapshotter"
        for l in self.log_names[kind]:
            logs.append(
                run_oc_command(
                    f"logs {l} {container} --since-time={start_time}",
                    namespace=namespace,
                ))
        return logs

    def get_creation_time(self, snap_name, content_name, start_time):
        """
        Calculate the creation time of the snapshot:
        find the start / end time in the logs, and calculate the total time.

        Args:
            snap_name (str): the name of the snapshot that was created
            content_name (str): the content name of the snapshot; the end time
             is logged on the content name and not on the snap name.
            start_time (time): the test start time, so that log retrieval is as short as possible

        Returns:
            int: creation time in seconds

        Raises:
            General exception : cannot find the start/end creation time

        """

        # Start and End snapshot creation time
        times = {"start": None, "end": None}
        logs_info = {
            "start": {
                "ns": "openshift-cluster-storage-operator",
                "log_line": "Creating content for snapshot",
            },
            "end": {
                "ns": "openshift-storage",
                "log_line": "readyToUse true"
            },
        }

        for op in ["start", "end"]:
            logs = self.read_logs(op, logs_info[op]["ns"], start_time)
            for sublog in logs:
                for line in sublog:
                    if (snap_name in line or content_name
                            in line) and logs_info[op]["log_line"] in line:
                        times[op] = line.split(" ")[1]
                        times[op] = datetime.datetime.strptime(
                            times[op], time_format)
            if times[op] is None:
                err_msg = f"Can not find {op} time of {snap_name}"
                log.error(err_msg)
                raise Exception(err_msg)

        results = (times["end"] - times["start"]).total_seconds()
        log.debug(
            f"Start creation time is : {times['start']}, End creation time is : {times['end']}"
            f" and Total creation time is {results}")

        return results

    def run(self):
        """
        Running the test
            for each snapshot : write data on the pod and take snapshot
        """
        results = []
        for test_num in range(1, self.num_of_snaps + 1):
            log.info(f"Starting test number {test_num}")

            # Running IO on the POD - (re)-write data on the PVC
            self.pod_obj.exec_cmd_on_pod(self.fio_cmd, out_yaml_format=False)

            # Taking Snapshot of the PVC
            ct = self.create_snapshot(test_num)
            speed = self.filesize / ct
            results.append({"Snap Num": test_num, "time": ct, "speed": speed})
            log.info(f"Results for snapshot number {test_num} are : "
                     f"Creation time is {ct} , Creation speed {speed}")

        log.debug(f"All results are : {json.dumps(results, indent=3)}")
        return results

    @ignore_leftovers
    @pytest.mark.polarion_id("OCS-2623")
    @pytest.mark.parametrize(
        argnames=["interface_type", "snap_number"],
        argvalues=[
            pytest.param(*[constants.CEPHBLOCKPOOL, 512]),
            pytest.param(*[constants.CEPHFILESYSTEM, 100]),
        ],
    )
    def test_pvc_multiple_snapshot_performance(
        self,
        pvc_factory,
        pod_factory,
        secret_factory,
        interface_type,
        snap_number,
    ):
        """
        1. Create a PVC
           the size depends on the storage capacity, but is not less than 1 GiB;
           it will use ~75% of the storage capacity, minimum storage capacity is 1 TiB
        2. Fill the PVC with 80% of data
        3. Take a snapshot of the PVC and measure the creation time.
        4. Re-write the data on the PVC
        5. Take a snapshot of the PVC and measure the creation time.
        6. Repeat steps 4-5 for the number of snapshots we want to take: 512
           (this will be run by an outside script for low memory consumption)
        7. Print all information.

        Raises:
            StorageNotSufficientException: in case of not enough capacity

        """

        self.num_of_snaps = snap_number
        if self.dev_mode:
            self.num_of_snaps = 2

        log.info(
            f"Going to Create {self.num_of_snaps} {interface_type} snapshots")

        # since we do not want to use more than 65%, we add 35% to the needed
        # capacity, and the minimum PVC size is 1 GiB
        self.need_capacity = int((self.num_of_snaps + 2) * 1.35)

        # Test will run only on system with enough capacity
        if self.capacity_to_use < self.need_capacity:
            err_msg = (f"The system have only {self.ceph_capacity} GiB, "
                       f"we want to use only {self.capacity_to_use} GiB, "
                       f"and we need {self.need_capacity} GiB to run the test")
            log.error(err_msg)
            raise exceptions.StorageNotSufficientException(err_msg)

        # Calculating the PVC size in GiB
        self.pvc_size = int(self.capacity_to_use / (self.num_of_snaps + 2))
        if self.dev_mode:
            self.pvc_size = 5

        self.interface = interface_type
        self.sc_name = "pas-testing-rbd"
        pool_name = self.sc_name
        if self.interface == constants.CEPHFILESYSTEM:
            self.sc_name = "pas-testing-cephfs"
            pool_name = f"{self.sc_name}-data0"

        # Creating new storage pool
        self.create_new_pool(self.sc_name)

        # Creating new StorageClass (pool) for the test.
        secret = secret_factory(interface=self.interface)
        self.sc_obj = helpers.create_storage_class(
            interface_type=self.interface,
            interface_name=pool_name,
            secret_name=secret.name,
            sc_name=self.sc_name,
            fs_name=self.sc_name,
        )
        log.info(f"The new SC is : {self.sc_obj.name}")
        log.debug(f"All Sc data is {json.dumps(self.sc_obj.data, indent=3)}")

        # Create new VolumeSnapshotClass
        self.snap_class = self.create_snapshotclass(self.interface)

        # Create new PVC
        log.info(f"Creating {self.pvc_size} GiB PVC of {interface_type}")
        self.pvc_obj = pvc_factory(
            interface=self.interface,
            storageclass=self.sc_obj,
            size=self.pvc_size,
            status=constants.STATUS_BOUND,
            project=self.proj,
        )

        # Create a POD which will attach to the new PVC
        log.info("Creating A POD")
        self.pod_obj = pod_factory(
            interface=self.interface,
            pvc=self.pvc_obj,
            status=constants.STATUS_RUNNING,
            pod_dict_path=constants.PERF_POD_YAML,
        )

        # Calculating the file size as 80% of the PVC size
        self.filesize = self.pvc_obj.size * 0.80
        # Change the file size to MB for the FIO function
        self.file_size = f"{int(self.filesize * constants.GB2MB)}M"
        self.file_name = self.pod_obj.name

        log.info(
            f"Total capacity size is : {self.ceph_capacity} GiB, "
            f"Going to use {self.need_capacity} GiB, "
            f"With {self.num_of_snaps} Snapshots to {self.pvc_size} GiB PVC. "
            f"File size to be written is : {self.file_size} "
            f"with the name of {self.file_name}")

        # Reading basic snapshot yaml file
        self.snap_yaml = constants.CSI_CEPHFS_SNAPSHOT_YAML
        self.sc = constants.DEFAULT_VOLUMESNAPSHOTCLASS_CEPHFS
        self.fs_type = "cephfs"
        if interface_type == constants.CEPHBLOCKPOOL:
            self.snap_yaml = constants.CSI_RBD_SNAPSHOT_YAML
            self.fs_type = "rbd"
            self.sc = constants.DEFAULT_VOLUMESNAPSHOTCLASS_RBD
        with open(self.snap_yaml, "r") as stream:
            try:
                self.snap_templ = yaml.safe_load(stream)
                self.snap_templ["spec"]["volumeSnapshotClassName"] = self.sc
                self.snap_templ["spec"]["source"][
                    "persistentVolumeClaimName"] = self.pvc_obj.name
            except yaml.YAMLError as exc:
                log.error(f"Can not read template yaml file {exc}")
        log.debug(
            f"Snapshot yaml file : {self.snap_yaml} "
            f"Content of snapshot yaml file {json.dumps(self.snap_templ, indent=4)}"
        )

        self.get_log_names()
        self.build_fio_command()

        self.run()
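
# A standalone sketch of the log-based timing used by get_creation_time() above: pull the
# timestamp field out of the matching start/end log lines and take the difference. The log
# line layout and the "%H:%M:%S.%f" time format used here are assumptions for illustration.
import datetime

def creation_seconds(start_line, end_line, time_fmt="%H:%M:%S.%f"):
    """Return the number of seconds between the timestamps of two CSI log lines."""
    start = datetime.datetime.strptime(start_line.split(" ")[1], time_fmt)
    end = datetime.datetime.strptime(end_line.split(" ")[1], time_fmt)
    return (end - start).total_seconds()

# creation_seconds(
#     "I0101 10:00:01.000000 ... Creating content for snapshot pvc-snap-1-abcde",
#     "I0101 10:00:04.500000 ... readyToUse true",
# )  # -> 3.5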
Beispiel #15
0
class OCS(object):
    """
    Base OCSClass
    """
    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            kwargs (dict):
                1) For existing resource, use OCP.reload() to get the
                resource's dictionary and use it to pass as **kwargs
                2) For new resource, use yaml files templates under
                /templates/CSI like:
                obj_dict = load_yaml_to_dict(
                    os.path.join(
                        TEMPLATE_DIR, "some_resource.yaml"
                        )
                    )
        """
        self.data = kwargs
        self._api_version = self.data.get('api_version')
        self._kind = self.data.get('kind')
        self._namespace = None
        if 'metadata' in self.data:
            self._namespace = self.data.get('metadata').get('namespace')
            self._name = self.data.get('metadata').get('name')
        self.ocp = OCP(api_version=self._api_version,
                       kind=self.kind,
                       namespace=self._namespace)
        self.temp_yaml = tempfile.NamedTemporaryFile(mode='w+',
                                                     prefix=self._kind,
                                                     delete=False)

    @property
    def api_version(self):
        return self._api_version

    @property
    def kind(self):
        return self._kind

    @property
    def namespace(self):
        return self._namespace

    @property
    def name(self):
        return self._name

    def reload(self):
        """
        Reloading the OCS instance with the new information from its actual
        data.
        After creating a resource from a yaml file, the actual yaml file is
        being changed and more information about the resource is added.
        """
        self.data = self.get()
        self.__init__(**self.data)

    def get(self, out_yaml_format=True):
        return self.ocp.get(resource_name=self.name,
                            out_yaml_format=out_yaml_format)

    def create(self):
        log.info(f"Adding {self.kind} with name {self.name}")
        templating.dump_dict_to_temp_yaml(self.data, self.temp_yaml.name)
        status = self.ocp.create(yaml_file=self.temp_yaml.name)
        self.reload()
        return status

    def delete(self, wait=True):
        return self.ocp.delete(resource_name=self.name, wait=wait)

    def apply(self, **data):
        with open(self.temp_yaml.name, 'w') as yaml_file:
            yaml.dump(data, yaml_file)
        assert self.ocp.apply(
            yaml_file=self.temp_yaml.name), (f"Failed to apply changes {data}")
        self.reload()

    def add_label(self, label):
        """
        Adds a new label

        Args:
            label (str): New label to be assigned for this pod
                E.g: "label=app='rook-ceph-mds'"
        """
        status = self.ocp.add_label(resource_name=self.name, label=label)
        self.reload()
        return status

    def delete_temp_yaml_file(self):
        utils.delete_file(self.temp_yaml.name)
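
# A minimal usage sketch for the OCS wrapper above, reusing the templating/constants names
# that already appear in these examples; only methods defined on OCS are called, and the
# snapshot class name below is hypothetical.
def example_ocs_resource():
    snapclass_data = templating.load_yaml(constants.CSI_RBD_SNAPSHOTCLASS_YAML)
    snapclass_data["metadata"]["name"] = "example-rbd-snapshot-class"
    snap_class = OCS(**snapclass_data)
    snap_class.create()                    # dump the dict to a temp yaml and 'oc create' it
    snap_class.add_label("usage=example")  # label the live resource and reload its data
    snap_class.delete()                    # remove the resource from the cluster
    snap_class.delete_temp_yaml_file()     # clean up the temporary yaml file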
Beispiel #16
0
    def test_rbd_based_rwo_pvc(self, reclaim_policy):
        """
        Verifies RBD Based RWO Dynamic PVC creation with Reclaim policy set to
        Delete/Retain

        Steps:
        1. Create Storage Class with reclaimPolicy: Delete/Retain
        2. Create PVC with 'accessModes' 'ReadWriteOnce'
        3. Create two pods using same PVC
        4. Run IO on first pod
        5. Verify second pod is not getting into Running state
        6. Delete first pod
        7. Verify second pod is in Running state
        8. Verify usage of volume in second pod is matching with usage in
           first pod
        9. Run IO on second pod
        10. Delete second pod
        11. Delete PVC
        12. Verify PV associated with deleted PVC is also deleted/released
        """
        # Create Storage Class with reclaimPolicy: Delete
        sc_obj = helpers.create_storage_class(
            interface_type=constants.CEPHBLOCKPOOL,
            interface_name=self.cbp_obj.name,
            secret_name=self.rbd_secret_obj.name,
            reclaim_policy=reclaim_policy
        )

        # Create PVC with 'accessModes' 'ReadWriteOnce'
        pvc_data = templating.load_yaml_to_dict(constants.CSI_PVC_YAML)
        pvc_data['metadata']['name'] = helpers.create_unique_resource_name(
            'test', 'pvc'
        )
        pvc_data['metadata']['namespace'] = self.namespace
        pvc_data['spec']['storageClassName'] = sc_obj.name
        pvc_data['spec']['accessModes'] = ['ReadWriteOnce']
        pvc_obj = PVC(**pvc_data)
        pvc_obj.create()

        # Create first pod
        log.info(f"Creating two pods which use PVC {pvc_obj.name}")
        pod_data = templating.load_yaml_to_dict(constants.CSI_RBD_POD_YAML)
        pod_data['metadata']['name'] = helpers.create_unique_resource_name(
            'test', 'pod'
        )
        pod_data['metadata']['namespace'] = self.namespace
        pod_data['spec']['volumes'][0]['persistentVolumeClaim']['claimName'] = pvc_obj.name

        pod_obj = Pod(**pod_data)
        pod_obj.create()
        assert helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)

        node_pod1 = pod_obj.get()['spec']['nodeName']

        # Create second pod
        # Try creating pod until it is on a different node than first pod
        for retry in range(1, 6):
            pod_data = templating.load_yaml_to_dict(constants.CSI_RBD_POD_YAML)
            pod_data['metadata']['name'] = helpers.create_unique_resource_name(
                'test', 'pod'
            )
            pod_data['metadata']['namespace'] = self.namespace
            pod_data['spec']['volumes'][0]['persistentVolumeClaim']['claimName'] = pvc_obj.name
            pod_obj2 = Pod(**pod_data)
            pod_obj2.create()
            assert helpers.wait_for_resource_state(pod_obj2, constants.STATUS_PENDING)

            node_pod2 = pod_obj2.get()['spec']['nodeName']
            if node_pod1 != node_pod2:
                break
            log.info(
                f"Both pods are on same node. Deleting second pod and "
                f"creating another pod. Retry count:{retry}"
            )
            pod_obj2.delete()
            if retry == 5:
                raise UnexpectedBehaviour(
                    "Second pod is always created on same node as of first "
                    "pod even after trying 5 times."
                )

        # Run IO on first pod
        log.info(f"Running IO on first pod {pod_obj.name}")
        pod_obj.run_io('fs', '1G')
        logging.info(f"Waiting for IO results from pod {pod_obj.name}")
        fio_result = pod_obj.get_fio_results()
        logging.info("IOPs after FIO:")
        logging.info(
            f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}"
        )
        logging.info(
            f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}"
        )

        # Fetch usage details
        mount_point = pod_obj.exec_cmd_on_pod(command="df -kh")
        mount_point = mount_point.split()
        usage = mount_point[mount_point.index('/var/lib/www/html') - 1]

        # Verify that second pod is not getting into Running state. Check it
        # for some period of time.
        try:
            assert not pod_obj2.ocp.wait_for_resource(
                condition='Running', resource_name=pod_obj2.name,
            ), "Unexpected: Second pod is in Running state"
        except TimeoutExpiredError:
            log.info(
                f"Verified: Second pod {pod_obj2.name} is not in "
                f"Running state"
            )

        # Delete first pod
        pod_obj.delete(wait=True)

        # Verify pod is deleted
        try:
            pod_obj.get()
            raise UnexpectedBehaviour(
                f"First pod {pod_obj.name} is not deleted."
            )
        except CommandFailed as exp:
            assert "not found" in str(exp), (
                "Failed to fetch pod details"
            )
            log.info(f"First pod {pod_obj.name} is deleted.")

        # Wait for second pod to be in Running state
        try:
            pod_obj2.ocp.wait_for_resource(
                condition='Running', resource_name=pod_obj2.name, timeout=180
            )
        except TimeoutExpiredError as exp:
            raise TimeoutExpiredError(
                f"Second pod {pod_obj2.name} is not in Running state "
                f"after deleting first pod."
            ) from exp
        log.info(
            f"Second pod {pod_obj2.name} is in Running state after "
            f"deleting the first pod."
        )

        # Verify that volume usage in second pod is matching with the usage in
        # first pod
        mount_point = pod_obj2.exec_cmd_on_pod(command="df -kh")
        mount_point = mount_point.split()
        usage_re = mount_point[mount_point.index('/var/lib/www/html') - 1]
        assert usage_re == usage, (
            "Use percentage in new pod is not matching with old pod"
        )

        # Run IO on second pod
        log.info(f"Running IO on second pod {pod_obj2.name}")
        pod_obj2.run_io('fs', '1G')
        logging.info(f"Waiting for IO results from pod {pod_obj2.name}")
        fio_result = pod_obj2.get_fio_results()
        logging.info("IOPs after FIO:")
        logging.info(
            f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}"
        )
        logging.info(
            f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}"
        )

        # Delete second pod
        pod_obj2.delete()

        # Verify pod is deleted
        try:
            pod_obj2.get()
            raise UnexpectedBehaviour(
                f"Second pod {pod_obj2.name} is not deleted."
            )
        except CommandFailed as exp:
            assert "not found" in str(exp), (
                "Failed to fetch pod details"
            )
            log.info(f"Second pod {pod_obj2.name} is deleted.")

        # Get PV name
        pvc_obj.reload()
        pv_name = pvc_obj.backed_pv

        # Delete PVC
        pvc_obj.delete()

        # Verify PVC is deleted
        try:
            pvc_obj.get()
            raise UnexpectedBehaviour(
                f"PVC {pvc_obj.name} is not deleted."
            )
        except CommandFailed as exp:
            assert "not found" in str(exp), (
                "Failed to verify PVC deletion."
            )
            log.info(f"PVC {pvc_obj.name} is deleted.")

        pv_obj = OCP(
            kind=constants.PV, namespace=self.namespace
        )

        if reclaim_policy == "Delete":
            # Verify PV is deleted
            for pv_info in TimeoutSampler(
                    30, 2, pv_obj.get, out_yaml_format=False
            ):
                if pv_name not in pv_info:
                    break
                log.warning(
                    f"PV {pv_name} exists after deleting PVC {pvc_obj.name}. "
                    f"Checking again."
                )

            # TODO: Verify PV using ceph toolbox. PV should be deleted.
            # Blocked by bz 1723656

        elif reclaim_policy == "Retain":
            # Wait for PV to be in Released state
            assert pv_obj.wait_for_resource(
                condition='Released', resource_name=pv_name
            )
            log.info(f"PV {pv_name} is in Released state")

            # TODO: Delete PV from backend and verify
            # Blocked by bz 1723656
            pv_obj.delete(resource_name=pv_name)

        # Delete Storage Class
        sc_obj.delete()
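
# A small, standalone sketch of the "df -kh" parsing trick used twice in the test above:
# the usage value is the token immediately before the mount path in the flattened output.
# The helper name and the sample output below are hypothetical.
def usage_for_mount(df_output, mount_path='/var/lib/www/html'):
    """Return the Use% column value for the given mount path."""
    tokens = df_output.split()
    return tokens[tokens.index(mount_path) - 1]

# usage_for_mount(
#     "Filesystem Size Used Avail Use% Mounted on "
#     "/dev/rbd0 10G 1.1G 8.9G 11% /var/lib/www/html"
# )  # -> '11%'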