Example no. 1
    def test_rgw_pod_existence(self):
        if (config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS
                or storagecluster_independent_check()):
            if (not config.ENV_DATA["platform"] == constants.AZURE_PLATFORM
                    and not config.ENV_DATA["platform"]
                    == constants.IBMCLOUD_PLATFORM
                    and (version.get_semantic_ocs_version_from_config() >
                         version.VERSION_4_5)):
                logger.info("Checking whether RGW pod is not present")
                assert (
                    not pod.get_rgw_pods()
                ), "RGW pods should not exist in the current platform/cluster"

        elif (config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS
              and not config.ENV_DATA["mcg_only_deployment"]):
            rgw_count = get_rgw_count(config.ENV_DATA["ocs_version"],
                                      check_if_cluster_was_upgraded(), None)
            logger.info(
                f'Checking for RGW pod/s on {config.ENV_DATA.get("platform")} platform'
            )
            rgw_pod = OCP(kind=constants.POD,
                          namespace=config.ENV_DATA["cluster_namespace"])
            assert rgw_pod.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.RGW_APP_LABEL,
                resource_count=rgw_count,
                timeout=60,
            )
Example no. 2
def template_pvc(
    name,
    namespace=config.ENV_DATA["cluster_namespace"],
    storageclass=constants.CEPHFILESYSTEM_SC,
    access_mode=constants.ACCESS_MODE_RWX,
    size="20Gi",
):
    """
    Template a PVC for use with the MCG CLI, based on the CSI PVC YAML

    Args:
        name (str): Name of the PVC
        namespace (str): Namespace to create the PVC in
        storageclass (str): Storage class to use for the PVC
        access_mode (str): Access mode for the PVC
        size (str): Size of the PVC, e.g. "20Gi"

    Returns:
        dict: The templated PVC data

    """
    pvc_data = templating.load_yaml(constants.CSI_PVC_YAML)
    pvc_data["metadata"]["name"] = name
    pvc_data["metadata"]["namespace"] = namespace
    pvc_data["spec"]["accessModes"] = [access_mode]
    pvc_data["spec"]["resources"]["requests"]["storage"] = size
    pvc_data["spec"]["storageClassName"] = (
        constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
        if storagecluster_independent_check()
        else storageclass
    )
    return pvc_data
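A minimal usage sketch (not part of the original code; the PVC name and size are example values, and the OCS wrapper is the one used in the later examples):

pvc_data = template_pvc(name="templated-pvc", size="10Gi")  # assumed example values
pvc_obj = OCS(**pvc_data)  # OCS resource wrapper, as in the Couchbase/AMQ examples
pvc_obj.create()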
Example no. 3
    def create_couchbase_worker(self, replicas=1, sc_name=None):
        """
        Deploy a Couchbase server and pillowfight workload using operator

        The couchbase workers do not come up unless there is an admission controller
        running.  The admission controller is started from the default project prior
        to bringing up the operator.  Secrets, rolebindings and serviceaccounts
        need to also be generated.

        Once the couchbase operator is running, we need to wait for the three
        worker pods to also be up.  Then a pillowfight task is started.

        After the pillowfight task has finished, the log is collected and
        analyzed.

        Raises:
            Exception: If pillowfight results indicate that a minimum performance
                level is not reached (1 second response time, less than 1000 ops
                per second)

        """
        logging.info("Creating pods..")
        cb_example = templating.load_yaml(constants.COUCHBASE_WORKER_EXAMPLE)
        if storagecluster_independent_check():
            cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
                "storageClassName"
            ] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        cb_example["spec"]["servers"][0]["size"] = replicas
        if sc_name:
            cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
                "storageClassName"
            ] = sc_name
        self.cb_examples = OCS(**cb_example)
        self.cb_examples.create()

        # Wait for last of three workers to be running.

        logging.info("Waiting for the pods to Running")
        for cb_wrk_pods in TimeoutSampler(
            self.WAIT_FOR_TIME,
            3,
            get_pod_name_by_pattern,
            "cb-example",
            constants.COUCHBASE_OPERATOR,
        ):
            try:
                if len(cb_wrk_pods) == replicas:
                    counter = 0
                    for cb_pod in cb_wrk_pods:
                        if self.is_up_and_running(cb_pod, self.up_check):
                            counter += 1
                            logging.info(f"Couchbase worker {cb_pod} is up")
                    if counter == replicas:
                        break
            except IndexError:
                logging.info(
                    f"Expected number of couchbase pods are {replicas} "
                    f"but only found {len(cb_wrk_pods)}"
                )
Example no. 4
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        if storagecluster_independent_check():
            self.sanity_helpers = SanityExternalCluster()
        else:
            self.sanity_helpers = Sanity()
Example no. 5
    def setup_postgresql(self, replicas, node_selector=None):
        # Node selector for postgresql
        pgsql_sset = templating.load_yaml(constants.PGSQL_STATEFULSET_YAML)
        if node_selector is not None:
            pgsql_sset["spec"]["template"]["spec"][
                "nodeSelector"] = node_selector
        if helpers.storagecluster_independent_check():
            pgsql_sset["spec"]["volumeClaimTemplates"][0]["metadata"][
                "annotations"][
                    "volume.beta.kubernetes.io/storage-class"] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        Postgresql.setup_postgresql(self, replicas=replicas)
Example no. 6
def create_instance_in_clusterlogging():
    """
    Create an instance of clusterlogging, which creates PVCs and the
    ElasticSearch, curator, fluentd and kibana pods, and check that all
    the pods and PVCs come up

    Returns:
        dict: Contains all detailed information of the
            instance such as pods that got created, its resources and limits
            values, storage class and size details etc.

    """

    nodes_in_cluster = len(get_all_nodes())
    inst_data = templating.load_yaml(constants.CL_INSTANCE_YAML)
    es_node_count = inst_data["spec"]["logStore"]["elasticsearch"]["nodeCount"]
    if helpers.storagecluster_independent_check():
        inst_data["spec"]["logStore"]["elasticsearch"]["storage"][
            "storageClassName"
        ] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
    helpers.create_resource(wait=False, **inst_data)
    oc = ocp.OCP("v1", "ClusterLogging", "openshift-logging")
    logging_instance = oc.get(resource_name="instance", out_yaml_format="True")
    if logging_instance:
        logger.info("Successfully created instance for cluster-logging")
        logger.debug(logging_instance)
    else:
        logger.error("Instance for clusterlogging is not created properly")

    pod_obj = ocp.OCP(kind=constants.POD, namespace="openshift-logging")
    pod_status = pod_obj.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_count=2 + es_node_count + nodes_in_cluster,
        timeout=500,
        sleep=2,
    )
    assert pod_status, "Pods are not in Running state."
    logger.info("All pods are in Running state")
    pvc_obj = ocp.OCP(kind=constants.PVC, namespace="openshift-logging")
    pvc_status = pvc_obj.wait_for_resource(
        condition=constants.STATUS_BOUND,
        resource_count=es_node_count,
        timeout=150,
        sleep=5,
    )
    assert pvc_status, "PVCs are not in bound state."
    logger.info("PVCs are Bound")
    return logging_instance
Example no. 7
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        obc_data = templating.load_yaml(constants.MCG_OBC_YAML)
        if self.name is None:
            self.name = create_unique_resource_name("oc", "obc")
        obc_data["metadata"]["name"] = self.name
        obc_data["spec"]["bucketName"] = self.name
        if storagecluster_independent_check():
            obc_data["spec"][
                "storageClassName"] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RGW
        else:
            obc_data["spec"][
                "storageClassName"] = constants.DEFAULT_STORAGECLASS_RGW
        obc_data["metadata"]["namespace"] = self.namespace
        create_resource(**obc_data)
Example no. 8
    def create_ocs_jenkins_template(self):
        """

        Create OCS Jenkins Template
        """
        log.info("Create Jenkins Template, jenkins-persistent-ocs")
        ocp_obj = OCP(namespace="openshift", kind="template")
        tmp_dict = ocp_obj.get(resource_name="jenkins-persistent",
                               out_yaml_format=True)
        tmp_dict["labels"]["app"] = "jenkins-persistent-ocs"
        tmp_dict["labels"]["template"] = "jenkins-persistent-ocs-template"
        tmp_dict["metadata"]["name"] = "jenkins-persistent-ocs"
        # Find Kind: 'PersistentVolumeClaim' position in the objects list, differs in OCP 4.5 and OCP 4.6.
        sc_name = (constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
                   if storagecluster_independent_check() else
                   constants.DEFAULT_STORAGECLASS_RBD)
        for i in range(len(tmp_dict["objects"])):
            if tmp_dict["objects"][i]["kind"] == constants.PVC:
                tmp_dict["objects"][i]["metadata"]["annotations"] = {
                    "volume.beta.kubernetes.io/storage-class": sc_name
                }

        tmp_dict["parameters"][4]["value"] = "10Gi"
        tmp_dict["parameters"].append({
            "description":
            "Override jenkins options to speed up slave spawning",
            "displayName":
            "Override jenkins options to speed up slave spawning",
            "name":
            "JAVA_OPTS",
            "value":
            "-Dhudson.slaves.NodeProvisioner.initialDelay=0 "
            "-Dhudson.slaves.NodeProvisioner.MARGIN=50 -Dhudson."
            "slaves.NodeProvisioner.MARGIN0=0.85",
        })
        if Version.coerce(self.ocp_version) >= Version.coerce("4.8"):
            # Added "Pipeline Utility Steps" plugin via Jenkins Template
            # OCP team changed the default plugin list on OCP4.9
            tmp_dict["objects"][3]["spec"]["template"]["spec"]["containers"][
                0]["env"].append({
                    "name":
                    "INSTALL_PLUGINS",
                    "value":
                    "scm-api:2.6.5,pipeline-utility-steps:2.12.0,workflow-step-api:622."
                    "vb_8e7c15b_c95a_,workflow-cps:2648.va9433432b33c,workflow-api:2.47",
                })
        ocs_jenkins_template_obj = OCS(**tmp_dict)
        ocs_jenkins_template_obj.create()
Example no. 9
    def setup_amq_kafka_persistent(self, sc_name, size=100, replicas=3):
        """
        Function to set up amq-kafka-persistent. The file is pulled from GitHub;
        it creates a resource of kind: Kafka and makes sure its status is Running

        Args:
            sc_name (str): Name of sc
            size (int): Size of the storage in Gi
            replicas (int): Number of kafka and zookeeper pods to be created

        Returns:
            kafka_persistent

        """
        if storagecluster_independent_check():
            sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        try:
            kafka_persistent = templating.load_yaml(
                os.path.join(self.dir, self.amq_kafka_pers_yaml)
            )
            kafka_persistent["spec"]["kafka"]["replicas"] = replicas
            kafka_persistent["spec"]["kafka"]["storage"]["volumes"][0][
                "class"
            ] = sc_name
            kafka_persistent["spec"]["kafka"]["storage"]["volumes"][0][
                "size"
            ] = f"{size}Gi"

            kafka_persistent["spec"]["zookeeper"]["replicas"] = replicas
            kafka_persistent["spec"]["zookeeper"]["storage"]["class"] = sc_name
            kafka_persistent["spec"]["zookeeper"]["storage"]["size"] = f"{size}Gi"
            self.kafka_persistent = OCS(**kafka_persistent)
            self.kafka_persistent.create()

        except (CommandFailed, CalledProcessError) as cf:
            log.error("Failed during setup of AMQ Kafka-persistent")
            raise cf
        time.sleep(40)

        if self.is_amq_pod_running(
            pod_pattern="my-cluster", expected_pods=(replicas * 2) + 1
        ):
            return self.kafka_persistent
        else:
            raise ResourceWrongStatusException(
                "my-cluster-kafka and my-cluster-zookeeper "
                "Pod is not getting to running state"
            )
Example no. 10
    def get_credentials(self,
                        secret_name=constants.NOOBAA_OBJECTSTOREUSER_SECRET):
        """
        Get Endpoint, Access key and Secret key from OCS secret. Endpoint is
        taken from rgw exposed service. Use rgw_endpoint fixture in test to get
        it exposed.

        Args:
            secret_name (str): Name of secret to be used
                for getting RGW credentials

        Returns:
            tuple: Endpoint, Access key, Secret key

        """
        secret_ocp_obj = OCP(kind=constants.SECRET, namespace=self.namespace)
        route_ocp_obj = OCP(kind=constants.ROUTE,
                            namespace=config.ENV_DATA["cluster_namespace"])

        if storagecluster_independent_check():
            if version.get_semantic_ocs_version_from_config(
            ) < version.VERSION_4_7:
                endpoint = route_ocp_obj.get(
                    resource_name=constants.RGW_SERVICE_EXTERNAL_MODE)
            else:
                endpoint = route_ocp_obj.get(
                    resource_name=constants.RGW_ROUTE_EXTERNAL_MODE)
            if secret_name == constants.NOOBAA_OBJECTSTOREUSER_SECRET:
                secret_name = constants.EXTERNAL_MODE_NOOBAA_OBJECTSTOREUSER_SECRET
            elif secret_name == constants.CEPH_OBJECTSTOREUSER_SECRET:
                secret_name = constants.CEPH_EXTERNAL_OBJECTSTOREUSER_SECRET
        else:
            if version.get_semantic_ocs_version_from_config(
            ) < version.VERSION_4_7:
                endpoint = route_ocp_obj.get(
                    resource_name=constants.RGW_SERVICE_INTERNAL_MODE)
            else:
                endpoint = route_ocp_obj.get(
                    resource_name=constants.RGW_ROUTE_INTERNAL_MODE)

        creds_secret_obj = secret_ocp_obj.get(secret_name)
        endpoint = f"http://{endpoint['status']['ingress'][0]['host']}"
        access_key = base64.b64decode(
            creds_secret_obj.get("data").get("AccessKey")).decode("utf-8")
        secret_key = base64.b64decode(
            creds_secret_obj.get("data").get("SecretKey")).decode("utf-8")
        return (endpoint, access_key, secret_key)
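A hedged usage sketch (not from the original source; rgw_obj is an assumed instance name and verify=False is an assumption, the MCG example below uses retrieve_verification_mode() instead): the returned tuple can be wired into a boto3 client.

import boto3

endpoint, access_key, secret_key = rgw_obj.get_credentials()
rgw_s3_resource = boto3.resource(
    "s3",
    verify=False,  # assumption; see retrieve_verification_mode() in the MCG example
    endpoint_url=endpoint,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
)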
Example no. 11
    def setup_postgresql(self, replicas, sc_name=None):
        """
        Deploy postgres sql server

        Args:
            replicas (int): Number of postgresql pods to be deployed
            sc_name (str): Name of the storage class to use for the PVCs

        Raises:
            CommandFailed: If PostgreSQL server setup fails

        """
        log.info("Deploying postgres database")
        try:
            pgsql_service = templating.load_yaml(constants.PGSQL_SERVICE_YAML)
            pgsql_cmap = templating.load_yaml(constants.PGSQL_CONFIGMAP_YAML)
            pgsql_sset = templating.load_yaml(constants.PGSQL_STATEFULSET_YAML)
            pgsql_sset["spec"]["replicas"] = replicas
            if (
                storagecluster_independent_check()
                and config.ENV_DATA["platform"].lower()
                not in constants.MANAGED_SERVICE_PLATFORMS
            ):
                pgsql_sset["spec"]["volumeClaimTemplates"][0]["spec"][
                    "storageClassName"
                ] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
            if sc_name:
                pgsql_sset["spec"]["volumeClaimTemplates"][0]["spec"][
                    "storageClassName"
                ] = sc_name
            self.pgsql_service = OCS(**pgsql_service)
            self.pgsql_service.create()
            self.pgsql_cmap = OCS(**pgsql_cmap)
            self.pgsql_cmap.create()
            self.pgsql_sset = OCS(**pgsql_sset)
            self.pgsql_sset.create()
            self.pod_obj.wait_for_resource(
                condition="Running",
                selector="app=postgres",
                resource_count=replicas,
                timeout=3600,
            )
        except (CommandFailed, CalledProcessError) as cf:
            log.error("Failed during setup of PostgreSQL server")
            raise cf
        self.pgsql_is_setup = True
        log.info("Successfully deployed postgres database")
Example no. 12
    def __init__(self):
        """
        Initializer function

        """
        self.namespace = constants.OPENSHIFT_OPERATORS
        self.quay_operator = None
        self.quay_registry = None
        self.quay_pod_obj = OCP(kind=constants.POD, namespace=self.namespace)
        self.quay_registry_name = ""
        self.quay_operator_csv = ""
        self.sc_default = False
        self.sc_name = (
            constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
            if storagecluster_independent_check()
            else constants.DEFAULT_STORAGECLASS_RBD
        )
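The storage class selection above recurs in several of these examples; a possible refactoring, sketched here and not present in the original code, is to keep it in a small helper:

def get_rbd_sc_name():
    """Return the RBD storage class name, honoring external mode clusters."""
    return (
        constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        if storagecluster_independent_check()
        else constants.DEFAULT_STORAGECLASS_RBD
    )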
Example no. 13
    def search_file_path(self):
        """
        Search the gathered output under the root directory for the expected
        files and record the path of each file that is found

        """
        version = get_ocs_parsed_version()
        if self.type_log == "OTHERS" and storagecluster_independent_check():
            files = GATHER_COMMANDS_VERSION[version]["OTHERS_EXTERNAL"]
        else:
            files = GATHER_COMMANDS_VERSION[version][self.type_log]
        for file in files:
            self.files_not_exist.append(file)
            for dir_name, subdir_list, files_list in os.walk(self.root):
                if file in files_list:
                    self.files_path[file] = os.path.join(dir_name, file)
                    self.files_not_exist.remove(file)
                    break
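A hedged usage sketch (collector is an assumed instance name): after the walk, any entry left in files_not_exist is an expected file that was not found under the root directory.

collector.search_file_path()  # collector: assumed instance of the class above
assert not collector.files_not_exist, (
    f"Expected files missing from the gathered output: {collector.files_not_exist}"
)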
Example no. 14
    def create_cb_cluster(self, replicas=1, sc_name=None):
        """
        Deploy a Couchbase server using Couchbase operator

        Once the couchbase operator is running, we need to wait for the
        worker pods to be up.  Once the Couchbase worker pods are up, pillowfight
        task is started.

        After the pillowfight task has finished, the log is collected and
        analyzed.

        Raises:
            Exception: If pillowfight results indicate that a minimum performance
                level is not reached (1 second response time, less than 1000 ops
                per second)

        """
        log.info("Creating Couchbase worker pods...")
        cb_example = templating.load_yaml(constants.COUCHBASE_WORKER_EXAMPLE)

        if (storagecluster_independent_check()
                and config.ENV_DATA["platform"].lower()
                not in constants.MANAGED_SERVICE_PLATFORMS):
            cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
                "storageClassName"] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        cb_example["spec"]["servers"][0]["size"] = replicas
        if sc_name:
            cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
                "storageClassName"] = sc_name
        self.cb_example = OCS(**cb_example)
        self.cb_example.create()
        self.cb_create_cb_cluster = True

        # Wait for the Couchbase workers to be running.

        log.info("Waiting for the Couchbase pods to be Running")
        self.pod_obj.wait_for_resource(
            condition="Running",
            selector="app=couchbase",
            resource_count=replicas,
            timeout=900,
        )
        log.info(
            f"Expected number: {replicas} of couchbase workers reached running state"
        )
Example no. 15
    def search_file_path(self):
        """
        Search the gathered output under the root directory for the expected
        files and record the path of each file that is found

        """
        ocs_version = float(
            f"{version.get_ocs_version_from_csv(only_major_minor=True)}")
        if self.type_log == "OTHERS" and storagecluster_independent_check():
            files = GATHER_COMMANDS_VERSION[ocs_version]["OTHERS_EXTERNAL"]
        else:
            files = GATHER_COMMANDS_VERSION[ocs_version][self.type_log]
        for file in files:
            self.files_not_exist.append(file)
            for dir_name, subdir_list, files_list in os.walk(self.root):
                if file in files_list:
                    self.files_path[file] = os.path.join(dir_name, file)
                    self.files_not_exist.remove(file)
                    break
Example no. 16
def test_monitoring_enabled():
    """
    OCS Monitoring is enabled after OCS installation (which is why this test
    has a post deployment marker) by asking for values of one ceph and one
    noobaa related metrics.
    """
    prometheus = PrometheusAPI()

    if (
        storagecluster_independent_check()
        and float(config.ENV_DATA["ocs_version"]) < 4.6
    ):
        logger.info(
            f"Skipping ceph metrics because it is not enabled for external "
            f"mode for OCS {float(config.ENV_DATA['ocs_version'])}"
        )

    else:
        # ask for values of ceph_pool_stored metric
        logger.info("Checking that ceph data are provided in OCS monitoring")
        result = prometheus.query("ceph_pool_stored")
        msg = "check that we actually received some values for a ceph query"
        assert len(result) > 0, msg
        for metric in result:
            _, value = metric["value"]
            assert_msg = "number of bytes in a pool isn't a positive integer or zero"
            assert int(value) >= 0, assert_msg
        # additional check that the values make at least some sense
        logger.info(
            "Checking that size of ceph_pool_stored result matches number of pools"
        )
        ct_pod = pod.get_ceph_tools_pod()
        ceph_pools = ct_pod.exec_ceph_cmd("ceph osd pool ls")
        assert len(result) == len(ceph_pools)

    # again for a noobaa metric
    logger.info("Checking that MCG/NooBaa data are provided in OCS monitoring")
    result = prometheus.query("NooBaa_bucket_status")
    msg = "check that we actually received some values for a MCG/NooBaa query"
    assert len(result) > 0, msg
    for metric in result:
        _, value = metric["value"]
        assert int(value) >= 0, "bucket status isn't a positive integer or zero"
Example no. 17
    def __init__(self, namespace=None):
        self.namespace = (namespace if namespace else
                          config.ENV_DATA["cluster_namespace"])

        if storagecluster_independent_check():
            sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RGW
        else:
            sc_name = constants.DEFAULT_STORAGECLASS_RGW

        self.storageclass = OCP(kind="storageclass",
                                namespace=namespace,
                                resource_name=sc_name)
        self.s3_internal_endpoint = (
            self.storageclass.get().get("parameters").get("endpoint"))
        self.region = self.storageclass.get().get("parameters").get("region")
        # Todo: Implement retrieval in cases where CephObjectStoreUser is available
        self.key_id = None
        self.secret_key = None
        self.s3_resource = None
Example no. 18
    def setup_amq_cluster(
        self, sc_name, namespace=constants.AMQ_NAMESPACE, size=100, replicas=3
    ):
        """
        Creates amq cluster with persistent storage.

        Args:
            sc_name (str): Name of sc
            namespace (str): Namespace for amq cluster
            size (int): Size of the storage
            replicas (int): Number of kafka and zookeeper pods to be created

        """
        if storagecluster_independent_check():
            sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        self.setup_amq_cluster_operator(namespace)
        self.setup_amq_kafka_persistent(sc_name, size, replicas)
        self.setup_amq_kafka_connect()
        self.setup_amq_kafka_bridge()
        self.amq_is_setup = True
        return self
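A hedged usage sketch (the class name AMQ and the storage class value are assumptions): the helper wires together the cluster operator, Kafka, Kafka Connect and the Kafka bridge in one call.

amq_obj = AMQ()  # assumption: the class defining these setup methods
amq_obj.setup_amq_cluster(
    sc_name=constants.DEFAULT_STORAGECLASS_RBD,
    namespace=constants.AMQ_NAMESPACE,
    size=100,
    replicas=3,
)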
Example no. 19
    def create_jenkins_pvc(self):
        """
        Create Jenkins PVCs, one per project

        Returns:
            list: The created PVC objects
        """
        pvc_objs = []
        sc_name = (constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
                   if storagecluster_independent_check() else
                   constants.DEFAULT_STORAGECLASS_RBD)
        for project in self.projects:
            log.info(f"create jenkins pvc on project {project}")
            pvc_obj = create_pvc(
                pvc_name="dependencies",
                size="10Gi",
                sc_name=sc_name,
                namespace=project,
            )
            pvc_objs.append(pvc_obj)
        return pvc_objs
Example no. 20
    def __init__(
        self,
        project,
        tmp_path,
        storage_size=2,
    ):
        """
        Init of the LogReaderWriterParallel object

        Args:
            project (pytest fixture): The project fixture.
            tmp_path (pytest fixture): The tmp_path fixture.
            storage_size (int): The size of the storage in GiB. The default value is 2 GiB.

        """
        self.project = project
        self.tmp_path = tmp_path

        self.pvc_dict = get_pvc_dict()
        # we need to mount the volume on every worker node, so RWX/cephfs
        self.pvc_dict["metadata"]["name"] = "logwriter-cephfs-many"
        self.pvc_dict["spec"]["accessModes"] = [constants.ACCESS_MODE_RWX]
        if storagecluster_independent_check(
        ) and not is_managed_service_cluster():
            sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
        else:
            sc_name = constants.CEPHFILESYSTEM_SC
        logger.info(f"Storage class name = {sc_name}")
        self.pvc_dict["spec"]["storageClassName"] = sc_name
        self.pvc_dict["spec"]["resources"]["requests"][
            "storage"] = f"{storage_size}Gi"

        self.deploy_dict = {}
        self.workload_file = None
        self.ocp_pod = None

        self.local_dir = self.tmp_path / "logwriter"
        self.local_dir.mkdir()
Example no. 21
    def setup(
        self,
        request,
        scenario,
        num_of_nodes,
        num_of_fail_nodes,
        disrupt_provisioner,
        project_factory,
        multi_pvc_factory,
        dc_pod_factory,
    ):
        """
        Identify the nodes and start DeploymentConfig based app pods using
        PVC with ReadWriteOnce (RWO) access mode on selected nodes

        Args:
            scenario (str): Scenario of app pods running on OCS or dedicated nodes
                (eg., 'colocated', 'dedicated')
            num_of_nodes (int): number of nodes required for running test
            num_of_fail_nodes (int): number of nodes to make unresponsive during test
            disrupt_provisioner (bool): True to disrupt the leader provisioner
                pods if not running on selected nodes, else False
            project_factory: A fixture to create new project
            multi_pvc_factory: A fixture to create a set of new PVCs
            dc_pod_factory: A fixture to create deploymentconfig pods

        Returns:
            tuple: containing the params used in test cases

        """
        ocs_nodes, non_ocs_nodes = self.identify_and_add_nodes(
            scenario, num_of_nodes)
        test_nodes = ocs_nodes if (scenario == "colocated") else non_ocs_nodes
        logger.info(f"Using nodes {test_nodes} for running test")

        def finalizer():
            helpers.remove_label_from_worker_node(node_list=test_nodes,
                                                  label_key="nodetype")

            # Check ceph health
            ceph_health_check(tries=40)

        request.addfinalizer(finalizer)

        project = project_factory()

        if helpers.storagecluster_independent_check():
            ceph_cluster = CephClusterExternal()
        else:
            ceph_cluster = CephCluster()
            # Wait for mon pods to reach expected count
            # Bug 1778273 - [RFE]: Configure 5 MONs for OCS cluster with 5 or more nodes
            # This wait is required for some of the previous OCS versions (< 4.5)
            current_mon_count = int(
                ceph_cluster.CEPHCLUSTER.get_resource(resource_name="",
                                                      column="MONCOUNT"))
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MON_APP_LABEL,
                resource_count=current_mon_count,
                timeout=900,
            )
            ceph_cluster.mons = []
            ceph_cluster.scan_cluster()

        # Select nodes for running app pods and inducing network failure later
        app_pod_nodes = self.select_nodes_for_app_pods(scenario, ceph_cluster,
                                                       ocs_nodes,
                                                       non_ocs_nodes,
                                                       num_of_fail_nodes)

        # Create multiple RBD and CephFS backed PVCs with RWO accessmode
        num_of_pvcs = self.num_of_app_pods_per_node * num_of_fail_nodes
        rbd_pvcs = multi_pvc_factory(
            interface=constants.CEPHBLOCKPOOL,
            project=project,
            size=self.pvc_size,
            access_modes=[constants.ACCESS_MODE_RWO],
            num_of_pvc=num_of_pvcs,
        )
        cephfs_pvcs = multi_pvc_factory(
            interface=constants.CEPHFILESYSTEM,
            project=project,
            size=self.pvc_size,
            access_modes=[constants.ACCESS_MODE_RWO],
            num_of_pvc=num_of_pvcs,
        )

        # Create deploymentconfig based pods
        dc_pods = []
        # Start app-pods on selected node(s)
        for node_name in app_pod_nodes:
            logger.info(f"Starting app pods on the node {node_name}")
            helpers.label_worker_node(node_list=[node_name],
                                      label_key="nodetype",
                                      label_value="app-pod")

            for num in range(self.num_of_app_pods_per_node):
                dc_pods.append(
                    dc_pod_factory(
                        interface=constants.CEPHBLOCKPOOL,
                        pvc=rbd_pvcs.pop(0),
                        node_selector={"nodetype": "app-pod"},
                    ))
                assert pod.verify_node_name(
                    dc_pods[-1], node_name
                ), f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
                dc_pods.append(
                    dc_pod_factory(
                        interface=constants.CEPHFILESYSTEM,
                        pvc=cephfs_pvcs.pop(0),
                        node_selector={"nodetype": "app-pod"},
                    ))
                assert pod.verify_node_name(
                    dc_pods[-1], node_name
                ), f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
            helpers.remove_label_from_worker_node(node_list=[node_name],
                                                  label_key="nodetype")

        # Label other test nodes to be able to run app pods later
        helpers.label_worker_node(node_list=test_nodes,
                                  label_key="nodetype",
                                  label_value="app-pod")

        # Get ceph mon,osd pods running on selected node if colocated scenario
        # and extra OCS nodes are present
        # Recovery steps for MON and OSDS not required from OCS 4.4 onwards
        # Refer to BZ 1830015 and BZ 1835908
        ceph_pods = []
        if float(config.ENV_DATA["ocs_version"]) < 4.4 and (
                scenario == "colocated" and len(test_nodes) > 3):
            pods_to_check = ceph_cluster.osds
            # Skip mon pods if mon_count is 5 as there may not be enough nodes
            # for all mons to run after multiple node failures
            if ceph_cluster.mon_count == 3:
                pods_to_check.extend(ceph_cluster.mons)
            for pod_obj in pods_to_check:
                if pod.get_pod_node(pod_obj).name in app_pod_nodes[0]:
                    ceph_pods.append(pod_obj)
            logger.info(
                f"Colocated Mon, OSD pods: {[pod_obj.name for pod_obj in ceph_pods]}"
            )

        disruptor = []
        if disrupt_provisioner:
            disruptor = self.disrupt_plugin_provisioner_pods(app_pod_nodes)

        return ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor
Example no. 22
    def __init__(self, *args, **kwargs):
        """
        Constructor for the MCG class
        """
        self.namespace = config.ENV_DATA["cluster_namespace"]
        self.operator_pod = Pod(**get_pods_having_label(
            constants.NOOBAA_OPERATOR_POD_LABEL, self.namespace)[0])
        self.core_pod = Pod(**get_pods_having_label(
            constants.NOOBAA_CORE_POD_LABEL, self.namespace)[0])

        self.retrieve_noobaa_cli_binary()
        """
        The certificate will be copied on each mcg_obj instantiation since
        the process is so light and quick, that the time required for the redundant
        copy is negligible in comparison to the time a hash comparison will take.
        """
        retrieve_default_ingress_crt()

        get_noobaa = OCP(kind="noobaa", namespace=self.namespace).get()

        self.s3_endpoint = (get_noobaa.get("items")[0].get("status").get(
            "services").get("serviceS3").get("externalDNS")[0])
        self.s3_internal_endpoint = (get_noobaa.get("items")[0].get(
            "status").get("services").get("serviceS3").get("internalDNS")[0])
        self.mgmt_endpoint = (get_noobaa.get("items")[0].get("status").get(
            "services").get("serviceMgmt").get("externalDNS")[0]) + "/rpc"
        self.region = config.ENV_DATA["region"]

        creds_secret_name = (get_noobaa.get("items")[0].get("status").get(
            "accounts").get("admin").get("secretRef").get("name"))
        secret_ocp_obj = OCP(kind="secret", namespace=self.namespace)
        creds_secret_obj = secret_ocp_obj.get(creds_secret_name)

        self.access_key_id = base64.b64decode(
            creds_secret_obj.get("data").get("AWS_ACCESS_KEY_ID")).decode(
                "utf-8")
        self.access_key = base64.b64decode(
            creds_secret_obj.get("data").get("AWS_SECRET_ACCESS_KEY")).decode(
                "utf-8")

        self.noobaa_user = base64.b64decode(
            creds_secret_obj.get("data").get("email")).decode("utf-8")
        self.noobaa_password = base64.b64decode(
            creds_secret_obj.get("data").get("password")).decode("utf-8")

        self.noobaa_token = self.retrieve_nb_token()

        self.s3_resource = boto3.resource(
            "s3",
            verify=retrieve_verification_mode(),
            endpoint_url=self.s3_endpoint,
            aws_access_key_id=self.access_key_id,
            aws_secret_access_key=self.access_key,
        )

        self.s3_client = self.s3_resource.meta.client

        if config.ENV_DATA["platform"].lower() == "aws" and kwargs.get(
                "create_aws_creds"):
            (
                self.cred_req_obj,
                self.aws_access_key_id,
                self.aws_access_key,
            ) = self.request_aws_credentials()

            self.aws_s3_resource = boto3.resource(
                "s3",
                endpoint_url="https://s3.amazonaws.com",
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_access_key,
            )

        if (config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS
                or storagecluster_independent_check()):
            if not config.ENV_DATA["platform"] == constants.AZURE_PLATFORM and (
                    float(config.ENV_DATA["ocs_version"]) > 4.5):
                logger.info("Checking whether RGW pod is not present")
                pods = pod.get_pods_having_label(label=constants.RGW_APP_LABEL,
                                                 namespace=self.namespace)
                assert (
                    not pods
                ), "RGW pods should not exist in the current platform/cluster"

        elif config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
            rgw_count = get_rgw_count(config.ENV_DATA["ocs_version"],
                                      check_if_cluster_was_upgraded(), None)
            logger.info(
                f'Checking for RGW pod/s on {config.ENV_DATA.get("platform")} platform'
            )
            rgw_pod = OCP(kind=constants.POD, namespace=self.namespace)
            assert rgw_pod.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.RGW_APP_LABEL,
                resource_count=rgw_count,
                timeout=60,
            )
Example no. 23
    def test_rwo_pvc_fencing_node_prolonged_and_short_network_failure(
            self, nodes, setup, node_restart_teardown):
        """
        OCS-1431/OCS-1436:
        - Start DeploymentConfig based app pods on 1 node
        - Make the node (where app pods are running) unresponsive
            by bringing its main network interface down
        - Disrupt the leader provisioner pods if not running on above selected
            node
        - Check new app pods and/or mon, osd pods scheduled on another node
            are stuck due to Multi-Attach error.
        - Power off the unresponsive node
        - Force delete the app pods and/or mon,osd pods on the unresponsive node
        - Check new app pods and/or mon, osd pods scheduled on another node comes
            into Running state
        - Run IOs on new app pods
        - Again make the node (where app pods are running) unresponsive
            by bringing its main network interface down
        - Check new app pods scheduled on another node are stuck due to
            Multi-Attach error.
        - Reboot the unresponsive node
        - When unresponsive node recovers, run IOs on new app pods

        """
        ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

        external_mode = helpers.storagecluster_independent_check()
        extra_nodes = list(set(test_nodes) - set(app_pod_nodes))
        helpers.remove_label_from_worker_node(node_list=extra_nodes[:-1],
                                              label_key="nodetype")

        # Run IO on pods
        md5sum_data = self.run_and_verify_io(pod_list=dc_pods,
                                             fio_filename="io_file1",
                                             run_io_in_bg=True)

        # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
        if disruptor:
            [disruption.delete_resource() for disruption in disruptor]

        # Induce network failure on the nodes
        node.node_network_failure(app_pod_nodes)
        logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
        sleep(self.prolong_nw_fail_time)

        # Wait for pods to be rescheduled
        for pod_obj in dc_pods + ceph_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_TERMINATING,
                resource_name=pod_obj.name)

        # Fetch info of new pods and verify Multi-Attach error
        new_dc_pods = self.get_new_pods(dc_pods)
        assert len(new_dc_pods) == len(
            dc_pods), "Unexpected number of app pods"
        self.verify_multi_attach_error(new_dc_pods)

        new_ceph_pods = []
        if ceph_pods:
            new_ceph_pods = self.get_new_pods(ceph_pods)
            assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
            self.verify_multi_attach_error(new_ceph_pods)

        logger.info("Executing manual recovery steps")
        # Power off the unresponsive node
        logger.info(f"Powering off the unresponsive node: {app_pod_nodes}")
        nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

        # Force delete the app pods and/or mon,osd pods on the unresponsive node
        for pod_obj in dc_pods + ceph_pods:
            pod_obj.delete(force=True)

        # Wait for new app pods to reach Running state
        for pod_obj in new_dc_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=1200,
                sleep=30,
            ), (f"App pod with name {pod_obj.name} did not reach Running state"
                )

        if not external_mode:
            # Wait for mon and osd pods to reach Running state
            selectors_to_check = {
                constants.MON_APP_LABEL: self.expected_mon_count,
                constants.OSD_APP_LABEL: ceph_cluster.osd_count,
            }
            for selector, count in selectors_to_check.items():
                assert ceph_cluster.POD.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=selector,
                    resource_count=count,
                    timeout=1800,
                    sleep=60,
                ), f"{count} expected pods with selector {selector} are not in Running state"

            if ceph_cluster.mon_count == self.expected_mon_count:
                # Check ceph health
                toolbox_status = ceph_cluster.POD.get_resource_status(
                    ceph_cluster.toolbox.name)
                if toolbox_status == constants.STATUS_TERMINATING:
                    ceph_cluster.toolbox.delete(force=True)

                assert ceph_health_check(), "Ceph cluster health is not OK"
                logger.info("Ceph cluster health is OK")

        # Verify data integrity from new pods
        for num, pod_obj in enumerate(new_dc_pods):
            pod.verify_data_integrity(pod_obj=pod_obj,
                                      file_name="io_file1",
                                      original_md5sum=md5sum_data[num])

        # Run IO on new pods
        md5sum_data2 = self.run_and_verify_io(pod_list=new_dc_pods,
                                              fio_filename="io_file2",
                                              run_io_in_bg=True)

        helpers.label_worker_node(node_list=extra_nodes[:-1],
                                  label_key="nodetype",
                                  label_value="app-pod")

        # Induce network failure on the node
        node.node_network_failure(extra_nodes[-1])
        logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
        sleep(self.short_nw_fail_time)

        # Wait for pods to be rescheduled
        for pod_obj in new_dc_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_TERMINATING,
                resource_name=pod_obj.name,
                timeout=600,
                sleep=30,
            )

        # Fetch info of new pods and verify Multi-Attach error
        new_dc_pods2 = self.get_new_pods(new_dc_pods)
        assert len(new_dc_pods2) == len(
            new_dc_pods), "Unexpected number of app pods"
        self.verify_multi_attach_error(new_dc_pods2)

        # Reboot the unresponsive node
        logger.info(f"Rebooting the unresponsive node: {extra_nodes[-1]}")
        nodes.restart_nodes_by_stop_and_start(
            node.get_node_objs([extra_nodes[-1]]))
        node.wait_for_nodes_status(node_names=[extra_nodes[-1]],
                                   status=constants.NODE_READY)

        # Wait for new app pods to reach Running state
        for pod_obj in new_dc_pods2:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=1200,
                sleep=30,
            ), (f"App pod with name {pod_obj.name} did not reach Running state"
                )

        if not external_mode:
            # Wait for mon and osd pods to reach Running state
            for selector, count in selectors_to_check.items():
                assert ceph_cluster.POD.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=selector,
                    resource_count=count,
                    timeout=1800,
                    sleep=60,
                ), f"{count} expected pods with selector {selector} are not in Running state"

            if ceph_cluster.mon_count == 3:
                # Check ceph health
                assert ceph_health_check(), "Ceph cluster health is not OK"
                logger.info("Ceph cluster health is OK")

        # Verify data integrity from new pods
        for num, pod_obj in enumerate(new_dc_pods2):
            pod.verify_data_integrity(pod_obj=pod_obj,
                                      file_name="io_file2",
                                      original_md5sum=md5sum_data2[num])

        for num, pod_obj in enumerate(new_dc_pods2):
            pod.verify_data_integrity(pod_obj=pod_obj,
                                      file_name="io_file1",
                                      original_md5sum=md5sum_data[num])

        # Run IO on new pods
        self.run_and_verify_io(pod_list=new_dc_pods2,
                               fio_filename="io_file3",
                               return_md5sum=False)
Example no. 24
    def test_rwo_pvc_fencing_node_prolonged_network_failure(
            self, nodes, setup, node_restart_teardown):
        """
        OCS-1427/OCS-1429:
        - Start DeploymentConfig based app pods on 1 OCS/Non-OCS node
        - Make the node (where app pods are running) unresponsive
            by bringing its main network interface down
        - Check new app pods and/or mon, osd pods scheduled on another node
            are stuck due to Multi-Attach error.
        - Power off the unresponsive node
        - Force delete the app pods and/or mon,osd pods on the unresponsive node
        - Check new app pods and/or mon, osd pods scheduled on another node comes
            into Running state
        - Run IOs on new app pods

        OCS-1430/OCS-1435:
        - Start DeploymentConfig based app pods on multiple node
            Colocated scenario: Select 1 node where osd and/or mon is running,
                select other 2 nodes where mon/osd are not running
            Dedicated scenario: 3 Non-OCS nodes
        - Disrupt the leader provisioner pods if not running on above selected
            nodes
        - Make the nodes (where app pods are running) unresponsive
            by bringing their main network interface down
        - Check new app pods and/or mon, osd pods scheduled on another node
            are stuck due to Multi-Attach error.
        - Power off the unresponsive nodes
        - Force delete the app pods and/or mon,osd pods on the unresponsive node
        - Check new app pods and/or mon, osd pods scheduled on another node comes
            into Running state
        - Run IOs on new app pods

        """
        ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

        external_mode = helpers.storagecluster_independent_check()
        # Run IO on pods
        md5sum_data = self.run_and_verify_io(pod_list=dc_pods,
                                             fio_filename="io_file1",
                                             run_io_in_bg=True)

        # OCS-1430/OCS-1435
        # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
        if disruptor:
            [disruption.delete_resource() for disruption in disruptor]

        # Induce network failure on the nodes
        node.node_network_failure(app_pod_nodes)
        logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
        sleep(self.prolong_nw_fail_time)

        # Wait for pods to be rescheduled
        for pod_obj in dc_pods + ceph_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_TERMINATING,
                resource_name=pod_obj.name)

        # Fetch info of new pods and verify Multi-Attach error
        new_dc_pods = self.get_new_pods(dc_pods)
        assert len(new_dc_pods) == len(
            dc_pods), "Unexpected number of app pods"
        self.verify_multi_attach_error(new_dc_pods)

        if ceph_pods:
            new_ceph_pods = self.get_new_pods(ceph_pods)
            assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
            self.verify_multi_attach_error(new_ceph_pods)

        logger.info("Executing manual recovery steps")
        # Power off the unresponsive node(s)
        logger.info(f"Powering off the unresponsive node(s): {app_pod_nodes}")
        nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

        # Force delete the app pods and/or mon,osd pods on the unresponsive node
        if float(config.ENV_DATA["ocs_version"]
                 ) < 4.4 and ceph_cluster.mon_count == 5:
            for pod_obj in ceph_cluster.mons:
                if pod.get_pod_node(pod_obj).name in app_pod_nodes:
                    ceph_pods.append(pod_obj)

        for pod_obj in dc_pods + ceph_pods:
            pod_obj.delete(force=True)

        # Wait for new app pods to reach Running state
        for pod_obj in new_dc_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=1200,
                sleep=30,
            ), (f"App pod with name {pod_obj.name} did not reach Running state"
                )

        if not external_mode:
            # Wait for mon and osd pods to reach Running state
            selectors_to_check = {
                constants.MON_APP_LABEL: self.expected_mon_count,
                constants.OSD_APP_LABEL: ceph_cluster.osd_count,
            }
            for selector, count in selectors_to_check.items():
                assert ceph_cluster.POD.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=selector,
                    resource_count=count,
                    timeout=1800,
                    sleep=60,
                ), f"{count} expected pods with selector {selector} are not in Running state"

            if ceph_cluster.mon_count == self.expected_mon_count:
                # Check ceph health
                toolbox_status = ceph_cluster.POD.get_resource_status(
                    ceph_cluster.toolbox.name)
                if toolbox_status == constants.STATUS_TERMINATING:
                    ceph_cluster.toolbox.delete(force=True)

                assert ceph_health_check(), "Ceph cluster health is not OK"
                logger.info("Ceph cluster health is OK")

        # Verify data integrity from new pods
        for num, pod_obj in enumerate(new_dc_pods):
            pod.verify_data_integrity(pod_obj=pod_obj,
                                      file_name="io_file1",
                                      original_md5sum=md5sum_data[num])

        # Run IO on new pods
        self.run_and_verify_io(pod_list=new_dc_pods,
                               fio_filename="io_file2",
                               return_md5sum=False)
Example no. 25
def test_log_reader_writer_parallel(project, tmp_path):
    """
    Write and read logfile stored on cephfs volume, from all worker nodes of a
    cluster via k8s Deployment, while fetching content of the stored data via
    oc rsync to check the data locally.

    Reproduces BZ 1989301. A test failure means a new high-priority blocker bug.
    """
    pvc_dict = get_pvc_dict()
    # we need to mount the volume on every worker node, so RWX/cephfs
    pvc_dict["metadata"]["name"] = "logwriter-cephfs-many"
    pvc_dict["spec"]["accessModes"] = [constants.ACCESS_MODE_RWX]
    if (
        config.ENV_DATA["platform"].lower() not in constants.MANAGED_SERVICE_PLATFORMS
    ) and storagecluster_independent_check():
        sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
    else:
        sc_name = constants.CEPHFILESYSTEM_SC
    pvc_dict["spec"]["storageClassName"] = sc_name
    # there is no need for lot of storage capacity for this test
    pvc_dict["spec"]["resources"]["requests"]["storage"] = "1Gi"

    # get deployment dict for the reproducer logwriter workload
    with open(constants.LOGWRITER_CEPHFS_REPRODUCER, "r") as deployment_file:
        deploy_dict = yaml.safe_load(deployment_file.read())
    # if we are running in disconnected environment, we need to mirror the
    # container image first, and then use the mirror instead of the original
    if config.DEPLOYMENT.get("disconnected"):
        update_container_with_mirrored_image(deploy_dict["spec"]["template"])
    # we need to match deployment replicas with number of worker nodes
    deploy_dict["spec"]["replicas"] = len(get_worker_nodes())
    # drop topology spread constraints related to zones
    topology.drop_topology_constraint(
        deploy_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
    )
    # and link the deployment with the pvc
    try:
        link_spec_volume(
            deploy_dict["spec"]["template"]["spec"],
            "logwriter-cephfs-volume",
            pvc_dict["metadata"]["name"],
        )
    except Exception as ex:
        error_msg = "LOGWRITER_CEPHFS_REPRODUCER no longer matches code of this test"
        raise Exception(error_msg) from ex

    # prepare k8s yaml file for deployment
    workload_file = ObjectConfFile(
        "log_reader_writer_parallel", [pvc_dict, deploy_dict], project, tmp_path
    )
    # deploy the workload, starting the log reader/writer pods
    logger.info(
        "starting log reader/writer workload via Deployment, one pod per worker"
    )
    workload_file.create()

    logger.info("waiting for all pods of the workload Deployment to run")
    ocp_pod = ocp.OCP(kind="Pod", namespace=project.namespace)
    try:
        ocp_pod.wait_for_resource(
            resource_count=deploy_dict["spec"]["replicas"],
            condition=constants.STATUS_RUNNING,
            error_condition=constants.STATUS_ERROR,
            timeout=300,
            sleep=30,
        )
    except Exception as ex:
        # this is not a problem with feature under test, but with infra,
        # cluster configuration or unrelated bug which must have happened
        # before this test case
        error_msg = "unexpected problem with start of the workload, cluster is either misconfigured or broken"
        logger.exception(error_msg)
        logger.debug(workload_file.describe())
        raise exceptions.UnexpectedBehaviour(error_msg) from ex

    # while the workload is running, we will try to fetch and validate data
    # from the cephfs volume of the workload 120 times (this number of retries
    # is a bit larger than usual number required to reproduce bug from
    # BZ 1989301, but we need to be sure here)
    number_of_fetches = 120
    # if given fetch fail, we will ignore the failure unless the number of
    # failures is too high (this has no direct impact on feature under test,
    # we should be able to detect the bug even with 10% of rsync failures,
    # since data corruption doesn't simply go away ...)
    number_of_failures = 0
    allowed_failures = 12
    is_local_data_ok = True
    local_dir = tmp_path / "logwriter"
    local_dir.mkdir()
    workload_pods = ocp_pod.get()
    workload_pod_name = workload_pods["items"][0]["metadata"]["name"]
    logger.info(
        "while the workload is running, we will fetch and check data from the cephfs volume %d times",
        number_of_fetches,
    )
    for _ in range(number_of_fetches):
        # fetch data from cephfs volume into the local dir
        oc_cmd = [
            "oc",
            "rsync",
            "--loglevel=4",
            "-n",
            project.namespace,
            f"pod/{workload_pod_name}:/mnt/target",
            local_dir,
        ]
        try:
            run_cmd(cmd=oc_cmd, timeout=300)
        except Exception as ex:
            number_of_failures += 1
            # in case this fails, we are going to fetch extra evidence, that
            # said such failure is most likely related to OCP or infrastructure
            error_msg = "oc rsync failed: something is wrong with the cluster"
            logger.exception(error_msg)
            logger.debug(workload_file.describe())
            oc_rpm_debug = [
                "oc",
                "rsh",
                "-n",
                project.namespace,
                f"pod/{workload_pod_name}",
                "bash",
                "-c",
                ";".join(
                    [
                        "rpm -qa",
                        "rpm -qaV",
                        "type -a tar",
                        "tar --version",
                        "type -a rsync",
                        "rsync --version",
                    ]
                ),
            ]
            try:
                run_cmd(cmd=oc_rpm_debug, timeout=600)
            except Exception:
                # if fetch of additional evidence fails, log and ignore the
                # exception (so that we can retry if needed)
                logger.exception("failed to fetch additional evidence")
            # in case the rsync run failed because of a container restart,
            # we assume the pod name hasn't changed, and just wait for the
            # container to be running again - unless the number of rsync
            # failures is too high
            if number_of_failures > allowed_failures:
                logger.error("number of ignored rsync failures is too high")
            else:
                ocp_pod.wait_for_resource(
                    resource_count=deploy_dict["spec"]["replicas"],
                    condition=constants.STATUS_RUNNING,
                    error_condition=constants.STATUS_ERROR,
                    timeout=300,
                    sleep=30,
                )
                continue
            logger.debug(
                "before this failure, we ignored %d previous failures",
                number_of_failures,
            )
            raise exceptions.UnexpectedBehaviour(error_msg) from ex
        # look for null bytes in the just fetched local files in the target
        # dir; if any null bytes are found, the test fails (the bug was
        # reproduced)
        target_dir = os.path.join(local_dir, "target")
        for file_name in os.listdir(target_dir):
            with open(os.path.join(target_dir, file_name), "r") as fo:
                data = fo.read()
                if "\0" in data:
                    is_local_data_ok = False
                    logger.error(
                        "file %s is corrupted: null byte found in a text file",
                        file_name,
                    )
        assert is_local_data_ok, "data corruption detected"
        time.sleep(2)

    logger.debug("number of ignored rsync failures: %d", number_of_failures)

    # if no obvious problem was detected, run the logreader job to validate
    # checksums in the log files (so that we are 100% sure that nothing went
    # wrong with the IO or the data)
    with open(constants.LOGWRITER_CEPHFS_READER, "r") as job_file:
        job_dict = yaml.safe_load(job_file.read())
    # mirroring for disconnected environment, if necessary
    if config.DEPLOYMENT.get("disconnected"):
        update_container_with_mirrored_image(job_dict["spec"]["template"])
    # drop topology spread constraints related to zones
    topology.drop_topology_constraint(
        job_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
    )
    # we need to match the number of job completions with the number of
    # replicas used in the workload
    job_dict["spec"]["completions"] = deploy_dict["spec"]["replicas"]
    job_dict["spec"]["parallelism"] = deploy_dict["spec"]["replicas"]
    # and refer to the correct pvc name
    try:
        link_spec_volume(
            job_dict["spec"]["template"]["spec"],
            "logwriter-cephfs-volume",
            pvc_dict["metadata"]["name"],
        )
    except Exception as ex:
        error_msg = "LOGWRITER_CEPHFS_READER no longer matches code of this test"
        raise Exception(error_msg) from ex
    # prepare k8s yaml file for the job
    job_file = ObjectConfFile("log_reader", [job_dict], project, tmp_path)
    # deploy the job, starting the log reader pods
    logger.info(
        "starting log reader data validation job to fully check the log data",
    )
    job_file.create()
    # wait for the logreader job to complete (this should be rather quick)
    try:
        job.wait_for_job_completion(
            job_name=job_dict["metadata"]["name"],
            namespace=project.namespace,
            timeout=300,
            sleep_time=30,
        )
    except exceptions.TimeoutExpiredError:
        error_msg = (
            "verification failed to complete in time: data loss or broken cluster?"
        )
        logger.exception(error_msg)
    # and then check that the job completed with success
    logger.info("checking the result of data validation job")
    logger.debug(job_file.describe())
    ocp_job = ocp.OCP(
        kind="Job",
        namespace=project.namespace,
        resource_name=job_dict["metadata"]["name"],
    )
    job_status = ocp_job.get()["status"]
    logger.info("last status of data verification job: %s", job_status)
    if (
        "failed" in job_status
        or job_status["succeeded"] != deploy_dict["spec"]["replicas"]
    ):
        error_msg = "possible data corruption: data verification job failed!"
        logger.error(error_msg)
        job.log_output_of_job_pods(
            job_name=job_dict["metadata"]["name"], namespace=project.namespace
        )
        raise Exception(error_msg)
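The corruption check in the loop above boils down to scanning every fetched log file for null bytes. A minimal standalone sketch of the same idea, with a hypothetical local directory standing in for the rsync target:

import os


def find_corrupted_files(target_dir):
    """Return names of files in target_dir that contain a null byte."""
    corrupted = []
    for file_name in os.listdir(target_dir):
        with open(os.path.join(target_dir, file_name), "r") as log_file:
            if "\0" in log_file.read():
                corrupted.append(file_name)
    return corrupted


# usage sketch against a hypothetical fetch directory
assert not find_corrupted_files("/tmp/logwriter/target"), "data corruption detected"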
Example no. 26
0
    def test_rwo_pvc_fencing_node_short_network_failure(
            self, nodes, setup, node_restart_teardown):
        """
        OCS-1423/OCS-1428/OCS-1426:
        - Start DeploymentConfig based app pods on 1 OCS/Non-OCS node
        - Make the node (where app pods are running) unresponsive
            by bringing its main network interface down
        - Check new app pods and/or mon, osd pods scheduled on another node
            are stuck due to Multi-Attach error.
        - Reboot the unresponsive node
        - When unresponsive node recovers, run IOs on new app pods

        OCS-1424/OCS-1434:
        - Start DeploymentConfig based app pods on multiple nodes
            Colocated scenario: Select 1 node where osd and/or mon is running,
                select other 2 nodes where mon/osd are not running
            Dedicated scenario: 3 Non-OCS nodes
        - Disrupt the leader provisioner pods if not running on above selected
            nodes
        - Make the nodes (where app pods are running) unresponsive
            by bringing their main network interface down
        - Check new app pods and/or mon, osd pods scheduled on another node
            are stuck due to Multi-Attach error.
        - Reboot the unresponsive nodes
        - When unresponsive nodes recover, run IOs on new app pods

        """
        ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

        # Run IO on pods
        md5sum_data = self.run_and_verify_io(pod_list=dc_pods,
                                             fio_filename="io_file1",
                                             run_io_in_bg=True)

        # OCS-1424/OCS-1434
        # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
        if disruptor:
            for disruption in disruptor:
                disruption.delete_resource()

        # Induce network failure on the nodes
        node.node_network_failure(app_pod_nodes)
        logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
        sleep(self.short_nw_fail_time)

        # Wait for pods to be rescheduled
        for pod_obj in dc_pods + ceph_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_TERMINATING,
                resource_name=pod_obj.name,
                timeout=600,
                sleep=30,
            )

        # Fetch info of new pods and verify Multi-Attach error
        new_dc_pods = self.get_new_pods(dc_pods)
        assert len(new_dc_pods) == len(
            dc_pods), "Unexpected number of app pods"
        self.verify_multi_attach_error(new_dc_pods)

        if ceph_pods:
            new_ceph_pods = self.get_new_pods(ceph_pods)
            assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
            self.verify_multi_attach_error(new_ceph_pods)

        # Reboot the unresponsive node(s)
        logger.info(f"Rebooting the unresponsive node(s): {app_pod_nodes}")
        nodes.restart_nodes_by_stop_and_start(
            node.get_node_objs(app_pod_nodes))
        node.wait_for_nodes_status(node_names=app_pod_nodes,
                                   status=constants.NODE_READY)

        # Wait for new app pods to reach Running state
        for pod_obj in new_dc_pods:
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=1200,
                sleep=30,
            ), f"App pod with name {pod_obj.name} did not reach Running state"

        if not helpers.storagecluster_independent_check():
            # Wait for mon and osd pods to reach Running state
            selectors_to_check = {
                constants.MON_APP_LABEL: ceph_cluster.mon_count,
                constants.OSD_APP_LABEL: ceph_cluster.osd_count,
            }
            for selector, count in selectors_to_check.items():
                assert ceph_cluster.POD.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=selector,
                    resource_count=count,
                    timeout=1800,
                    sleep=60,
                ), f"{count} expected pods with selector {selector} are not in Running state"

            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

        # Verify data integrity from new pods
        for num, pod_obj in enumerate(new_dc_pods):
            assert pod.verify_data_integrity(pod_obj=pod_obj,
                                             file_name="io_file1",
                                             original_md5sum=md5sum_data[num]
                                             ), "Data integrity check failed"

        # Run IO on new pods
        self.run_and_verify_io(pod_list=new_dc_pods,
                               fio_filename="io_file2",
                               return_md5sum=False)
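The data integrity checks above compare md5 checksums captured before the network failure with checksums computed after the pods are rescheduled. A minimal sketch of that comparison, assuming a plain local file stands in for the file written by fio inside the pod:

import hashlib


def md5sum(path):
    """Compute the md5 hex digest of a file, reading it in chunks."""
    digest = hashlib.md5()
    with open(path, "rb") as data_file:
        for chunk in iter(lambda: data_file.read(4096), b""):
            digest.update(chunk)
    return digest.hexdigest()


# checksum captured before the failure ...
original_md5sum = md5sum("io_file1")
# ... and compared again once the new pods are up and running
assert md5sum("io_file1") == original_md5sum, "Data integrity check failed"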
Example no. 27
0
    def _create_backingstore(method, uls_dict):
        """
        Tracks creation and cleanup of all the backing stores that were created in the scope

        Args:
            method (str): String for selecting method of backing store creation (CLI/OC)
            uls_dict (dict): Dictionary containing storage provider as key and
                a list of tuples as value.
                Cloud backing stores form - 'CloudName': [(amount, region), (amount, region)]
                i.e. - 'aws': [(3, 'us-west-1'), (2, 'eu-west-2')]
                PV form - 'pv': [(amount, size_in_gb, storagecluster), ...]
                i.e. - 'pv': [(3, 32, 'ocs-storagecluster-ceph-rbd'), (2, 100, 'ocs-storagecluster-ceph-rbd')]

        Returns:
            list: A list of backingstore names.

        """
        if method.lower() not in cmdMap:
            raise RuntimeError(f"Invalid method type received: {method}. "
                               f'available types: {", ".join(cmdMap.keys())}')
        for cloud, uls_lst in uls_dict.items():
            for uls_tup in uls_lst:
                # TODO: Replace multiple .append calls, create names in advance, according to amount
                if cloud.lower() not in cmdMap[method.lower()]:
                    raise RuntimeError(
                        f"Invalid cloud type received: {cloud}. "
                        f'available types: {", ".join(cmdMap[method.lower()].keys())}'
                    )
                if cloud == "pv":
                    vol_num, size, storagecluster = uls_tup
                    if (storagecluster == constants.DEFAULT_STORAGECLASS_RBD
                            and storagecluster_independent_check()):
                        storagecluster = (
                            constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD)
                    backingstore_name = create_unique_resource_name(
                        resource_description="backingstore",
                        resource_type=cloud.lower())
                    created_backingstores.append(
                        BackingStore(
                            name=backingstore_name,
                            method=method.lower(),
                            type="pv",
                            mcg_obj=mcg_obj,
                            vol_num=vol_num,
                            vol_size=size,
                        ))
                    if method.lower() == "cli":
                        cmdMap[method.lower()][cloud.lower()](
                            mcg_obj, backingstore_name, vol_num, size,
                            storagecluster)
                    else:
                        cmdMap[method.lower()][cloud.lower()](
                            backingstore_name, vol_num, size, storagecluster)
                else:
                    _, region = uls_tup
                    # create the underlying storage for this tuple; use a
                    # separate name so the uls_dict argument is not shadowed
                    created_uls = cloud_uls_factory({cloud: [uls_tup]})
                    for uls_name in created_uls[cloud.lower()]:
                        backingstore_name = create_unique_resource_name(
                            resource_description="backingstore",
                            resource_type=cloud.lower(),
                        )
                        created_backingstores.append(
                            BackingStore(
                                name=backingstore_name,
                                method=method.lower(),
                                type="cloud",
                                uls_name=uls_name,
                                mcg_obj=mcg_obj,
                            ))
                        if method.lower() == "cli":
                            cmdMap[method.lower()][cloud.lower()](
                                mcg_obj, cld_mgr, backingstore_name, uls_name,
                                region)
                        elif method.lower() == "oc":
                            cmdMap[method.lower()][cloud.lower()](
                                cld_mgr, backingstore_name, uls_name, region)
                        mcg_obj.check_backingstore_state(
                            backingstore_name, constants.BS_OPTIMAL)
                        # TODO: Verify OC/CLI BS health by using the appropriate methods

        return created_backingstores
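For context, the uls_dict argument documented above pairs each provider with a list of request tuples. A hypothetical example of the two supported forms is shown below; the backingstore_factory name in the commented call is an assumption about how the fixture is typically consumed:

# cloud tuples are (amount, region); PV tuples are (amount, size_in_gb, storagecluster)
uls_dict = {
    "aws": [(1, "us-west-1"), (1, "eu-west-2")],
    "pv": [(1, 32, "ocs-storagecluster-ceph-rbd")],
}
# inside a test this dictionary would be passed to the factory returned by the
# fixture, e.g.:
# backingstores = backingstore_factory("oc", uls_dict)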