    def test_scale_osds_reboot_nodes(self, interface, project_factory, multi_pvc_factory, dc_pod_factory):
        """
        Check storage utilization; if it is below 50%, run IO until that level
        is reached, then scale OSDs from 3 to 6, check for rebalance and reboot
        the worker nodes
        """
        current_osd_count = count_cluster_osd()
        proj_obj = project_factory()
        if current_osd_count == 3:
            while not validate_osd_utilization(osd_used=50):
                # Create pvc
                pvc_objs = multi_pvc_factory(
                    project=proj_obj,
                    interface=interface, size=self.pvc_size,
                    num_of_pvc=self.num_of_pvcs
                )

                dc_pod_objs = list()
                for pvc_obj in pvc_objs:
                    dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj))

                wait_for_dc_app_pods_to_reach_running_state(
                    dc_pod_objs, timeout=1200
                )

                for pod_obj in dc_pod_objs:
                    pod_obj.run_io(
                        storage_type='fs', size='3G', runtime='60',
                        fio_filename=f'{pod_obj.name}_io'
                    )

        # Add capacity
        osd_size = storage_cluster.get_osd_size()
        count = storage_cluster.add_capacity(osd_size)
        pod = OCP(
            kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']
        )
        pod.wait_for_resource(
            timeout=300,
            condition=constants.STATUS_RUNNING,
            selector='app=rook-ceph-osd',
            resource_count=count * 3
        )
        assert ceph_health_check(), "New OSDs failed to reach running state"

        cluster = CephCluster()

        # Get rebalance status
        rebalance_status = cluster.get_rebalance_status()
        logger.info(rebalance_status)
        if rebalance_status:
            time_taken = cluster.time_taken_to_complete_rebalance()
            logger.info(f"The time taken to complete rebalance {time_taken}")

        # Rolling reboot on worker nodes
        worker_nodes = get_typed_nodes(node_type='worker')

        factory = platform_nodes.PlatformNodesFactory()
        nodes = factory.get_nodes_platform()

        for node in worker_nodes:
            nodes.restart_nodes(nodes=[node])
            wait_for_nodes_status()

        assert ceph_health_check(delay=180), "Ceph health check failed after rebooting the worker nodes"
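
A minimal standalone sketch of the capacity-add wait pattern used above, assuming the usual ocs_ci module paths for the helpers that appear in the test (add_capacity returns the new storage device set count, and each set maps to three OSD pods):

from ocs_ci.framework import config
from ocs_ci.ocs import constants
from ocs_ci.ocs.ocp import OCP
from ocs_ci.ocs.resources import storage_cluster
from ocs_ci.utility.utils import ceph_health_check


def add_capacity_and_wait():
    # Add one storage device set of the current OSD size
    osd_size = storage_cluster.get_osd_size()
    new_set_count = storage_cluster.add_capacity(osd_size)

    # Each device set contributes three OSD pods, so wait for count * 3
    pod = OCP(
        kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']
    )
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=new_set_count * 3,
    )
    assert ceph_health_check(), "Ceph is unhealthy after adding capacity"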
Example #2
    def __init__(self, *args, **kwargs):
        """
        Constructor for the MCG class
        """
        self.namespace = config.ENV_DATA['cluster_namespace']
        self.operator_pod = Pod(**get_pods_having_label(
            constants.NOOBAA_OPERATOR_POD_LABEL, self.namespace)[0])
        self.core_pod = Pod(**get_pods_having_label(
            constants.NOOBAA_CORE_POD_LABEL, self.namespace)[0])

        self.retrieve_noobaa_cli_binary()
        """
        The certificate is copied on each mcg_obj instantiation since the
        process is so lightweight that the time required for the redundant
        copy is negligible compared to the time a hash comparison would take.
        """
        retrieve_default_ingress_crt()

        get_noobaa = OCP(kind='noobaa', namespace=self.namespace).get()

        self.s3_endpoint = (get_noobaa.get('items')[0].get('status').get(
            'services').get('serviceS3').get('externalDNS')[0])
        self.s3_internal_endpoint = (get_noobaa.get('items')[0].get(
            'status').get('services').get('serviceS3').get('internalDNS')[0])
        self.mgmt_endpoint = (get_noobaa.get('items')[0].get('status').get(
            'services').get('serviceMgmt').get('externalDNS')[0]) + '/rpc'
        self.region = config.ENV_DATA['region']

        creds_secret_name = (get_noobaa.get('items')[0].get('status').get(
            'accounts').get('admin').get('secretRef').get('name'))
        secret_ocp_obj = OCP(kind='secret', namespace=self.namespace)
        creds_secret_obj = secret_ocp_obj.get(creds_secret_name)

        self.access_key_id = base64.b64decode(
            creds_secret_obj.get('data').get('AWS_ACCESS_KEY_ID')).decode(
                'utf-8')
        self.access_key = base64.b64decode(
            creds_secret_obj.get('data').get('AWS_SECRET_ACCESS_KEY')).decode(
                'utf-8')

        self.noobaa_user = base64.b64decode(
            creds_secret_obj.get('data').get('email')).decode('utf-8')
        self.noobaa_password = base64.b64decode(
            creds_secret_obj.get('data').get('password')).decode('utf-8')

        self.noobaa_token = self.send_rpc_query(
            'auth_api',
            'create_auth',
            params={
                'role': 'admin',
                'system': 'noobaa',
                'email': self.noobaa_user,
                'password': self.noobaa_password
            }).json().get('reply').get('token')

        self.s3_resource = boto3.resource(
            's3',
            verify=constants.DEFAULT_INGRESS_CRT_LOCAL_PATH,
            endpoint_url=self.s3_endpoint,
            aws_access_key_id=self.access_key_id,
            aws_secret_access_key=self.access_key)

        self.s3_client = self.s3_resource.meta.client

        if (config.ENV_DATA['platform'].lower() == 'aws'
                and kwargs.get('create_aws_creds')):
            (self.cred_req_obj, self.aws_access_key_id,
             self.aws_access_key) = self.request_aws_credentials()

            self.aws_s3_resource = boto3.resource(
                's3',
                endpoint_url="https://s3.amazonaws.com",
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_access_key)

        if (config.ENV_DATA['platform'].lower() in constants.CLOUD_PLATFORMS
                or storagecluster_independent_check()):
            logger.info(
                'Verifying that no RGW pods are present on this cloud/external mode cluster')
            pods = pod.get_pods_having_label(label=constants.RGW_APP_LABEL,
                                             namespace=self.namespace)
            assert not pods, 'RGW pods should not exist in the current platform/cluster'

        elif config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS:
            rgw_count = 2 if float(
                config.ENV_DATA['ocs_version']) >= 4.5 else 1
            logger.info(
                f'Checking for RGW pod/s on {config.ENV_DATA.get("platform")} platform'
            )
            rgw_pod = OCP(kind=constants.POD, namespace=self.namespace)
            assert rgw_pod.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.RGW_APP_LABEL,
                resource_count=rgw_count,
                timeout=60)
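
A hedged usage sketch for the object built by this constructor: once the NooBaa S3 endpoint and admin credentials are resolved, the boto3 resource can be exercised directly. The helper and bucket name below are illustrative, not part of the MCG class:

import uuid


def smoke_test_mcg_s3(mcg_obj):
    """Create a bucket, write one object, list it back, then clean up."""
    bucket_name = f"smoke-{uuid.uuid4().hex[:8]}"
    mcg_obj.s3_resource.create_bucket(Bucket=bucket_name)

    bucket = mcg_obj.s3_resource.Bucket(bucket_name)
    bucket.put_object(Key="hello.txt", Body=b"hello from ocs-ci")
    assert "hello.txt" in [obj.key for obj in bucket.objects.all()]

    # Clean up the test object and the bucket itself
    bucket.objects.all().delete()
    bucket.delete()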
Example #3
class PASTest(BaseTest):
    """
    Base class for QPAS team - Performance and Scale tests

    This class contains functions used by performance and scale tests,
    and can also be used by E2E tests that use the benchmark-operator (ripsaw)
    """
    def setup(self):
        """
        Setting up the environment for each performance and scale test

        """
        log.info("Setting up test environment")
        self.crd_data = None  # place holder for Benchmark CRD data
        self.backup_es = None  # place holder for the elasticsearch backup info
        self.main_es = None  # place holder for the main elasticsearch object
        self.benchmark_obj = None  # place holder for the benchmark object
        self.client_pod = None  # Place holder for the client pod object
        self.dev_mode = config.RUN["cli_params"].get("dev_mode")
        self.pod_obj = OCP(kind="pod")

        # Collecting all Environment configuration Software & Hardware
        # for the performance report.
        self.environment = get_environment_info()
        self.environment["clusterID"] = get_running_cluster_id()

        self.get_osd_info()

        self.get_node_info(node_type="master")
        self.get_node_info(node_type="worker")

    def get_osd_info(self):
        """
        Get the OSD information and update the main environment
        dictionary.

        """
        ct_pod = pod.get_ceph_tools_pod()
        osd_info = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd df")
        self.environment["osd_size"] = osd_info.get("nodes")[0].get(
            "crush_weight")
        self.environment["osd_num"] = len(osd_info.get("nodes"))
        self.environment["total_capacity"] = osd_info.get("summary").get(
            "total_kb_avail")
        self.environment["ocs_nodes_num"] = len(node.get_ocs_nodes())

    def get_node_info(self, node_type="master"):
        """
        Get hardware information for the given node type and update the main
        environment dictionary.

        Args:
            node_type (str): the node type to collect data about;
              can be master or worker - the default is master

        """
        if node_type == "master":
            nodes = node.get_master_nodes()
        elif node_type == "worker":
            nodes = node.get_worker_nodes()
        else:
            log.warning(f"Node type ({node_type}) is invalid")
            return

        oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        self.environment[f"{node_type}_nodes_num"] = len(nodes)
        self.environment[
            f"{node_type}_nodes_cpu_num"] = oc_cmd.exec_oc_debug_cmd(
                node=nodes[0],
                cmd_list=["lscpu | grep '^CPU(s):' | awk '{print $NF}'"],
            ).rstrip()
        self.environment[
            f"{node_type}_nodes_memory"] = oc_cmd.exec_oc_debug_cmd(
                node=nodes[0],
                cmd_list=["free | grep Mem | awk '{print $2}'"]).rstrip()

    def ripsaw_deploy(self, ripsaw):
        """
        Deploy the benchmark operator (formerly ripsaw) CRD

        Args:
            ripsaw (obj): benchmark operator object

        """
        log.info("Deploying benchmark operator (ripsaw)")
        ripsaw.apply_crd("resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml")

    def es_info_backup(self, elasticsearch):
        """
        Save the original elastic-search IP and port - if defined in the yaml

        Args:
            elasticsearch (obj): elasticsearch object

        """

        self.crd_data["spec"]["elasticsearch"] = {}

        # for development mode use the Dev ES server
        if self.dev_mode and config.PERF.get("dev_lab_es"):
            log.info("Using the development ES server")
            self.crd_data["spec"]["elasticsearch"] = {
                "server": config.PERF.get("dev_es_server"),
                "port": config.PERF.get("dev_es_port"),
                "url":
                f"http://{config.PERF.get('dev_es_server')}:{config.PERF.get('dev_es_port')}",
                "parallel": True,
            }

        # for production mode use the Lab ES server
        if not self.dev_mode and config.PERF.get("production_es"):
            self.crd_data["spec"]["elasticsearch"] = {
                "server": config.PERF.get("production_es_server"),
                "port": config.PERF.get("production_es_port"),
                "url":
                f"http://{config.PERF.get('production_es_server')}:{config.PERF.get('production_es_port')}",
                "parallel": True,
            }

        # backup the Main ES info (if exists)
        if not self.crd_data["spec"]["elasticsearch"] == {}:
            self.backup_es = self.crd_data["spec"]["elasticsearch"]
            log.info(
                f"Creating object for the Main ES server on {self.backup_es['url']}"
            )
            self.main_es = Elasticsearch([self.backup_es["url"]],
                                         verify_certs=True)
        else:
            log.warning(
                "Elastic Search information does not exists for this test")

        # Use the internally defined elastic-search server in the test - if it exists
        if elasticsearch:

            if not isinstance(elasticsearch, dict):
                # elasticsearch is an internally deployed server (obj)
                ip = elasticsearch.get_ip()
                port = elasticsearch.get_port()
            else:
                # elasticsearch is an existing server (dict)
                ip = elasticsearch.get("server")
                port = elasticsearch.get("port")

            self.crd_data["spec"]["elasticsearch"] = {
                "server": ip,
                "port": port,
                "url": f"http://{ip}:{port}",
                "parallel": True,
            }
            log.info(
                f"Going to use the ES : {self.crd_data['spec']['elasticsearch']}"
            )
        elif config.PERF.get("internal_es_server"):
            # use an in-cluster elastic-search (not deployed by the test)
            self.crd_data["spec"]["elasticsearch"] = {
                "server": config.PERF.get("internal_es_server"),
                "port": config.PERF.get("internal_es_port"),
                "url":
                f"http://{config.PERF.get('internal_es_server')}:{config.PERF.get('internal_es_port')}",
                "parallel": True,
            }

    def set_storageclass(self, interface):
        """
        Setting the benchmark CRD storageclass

        Args:
            interface (str): The interface which will used in the test

        """
        if interface == constants.CEPHBLOCKPOOL:
            storageclass = constants.DEFAULT_STORAGECLASS_RBD
        else:
            storageclass = constants.DEFAULT_STORAGECLASS_CEPHFS
        log.info(f"Using [{storageclass}] Storageclass")
        self.crd_data["spec"]["workload"]["args"][
            "storageclass"] = storageclass

    def get_env_info(self):
        """
        Get the environment information and update the workload CR if
        necessary.

        """
        if not self.environment["user"] == "":
            self.crd_data["spec"]["test_user"] = self.environment["user"]
        else:
            # since the full results object needs this parameter, initialize it from the CR file
            self.environment["user"] = self.crd_data["spec"]["test_user"]
        self.crd_data["spec"]["clustername"] = self.environment["clustername"]

        log.debug(f"Environment information is : {self.environment}")

    def deploy_and_wait_for_wl_to_start(self, timeout=300, sleep=20):
        """
        Deploy the workload and wait until it starts working

        Args:
            timeout (int): time in seconds to wait until the benchmark starts
            sleep (int): Sleep interval seconds

        """
        log.debug(f"The {self.benchmark_name} CR file is {self.crd_data}")
        self.benchmark_obj = OCS(**self.crd_data)
        self.benchmark_obj.create()

        # This time is only for reporting - when the benchmark started.
        self.start_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

        # Wait for benchmark client pod to be created
        log.info(f"Waiting for {self.client_pod_name} to Start")
        for bm_pod in TimeoutSampler(
                timeout,
                sleep,
                get_pod_name_by_pattern,
                self.client_pod_name,
                constants.RIPSAW_NAMESPACE,
        ):
            try:
                if bm_pod[0] is not None:
                    self.client_pod = bm_pod[0]
                    break
            except IndexError:
                log.info("Bench pod is not ready yet")
        # Sleeping for 15 sec for the client pod to be fully accessible
        time.sleep(15)
        log.info(f"The benchmark pod {self.client_pod_name} is Running")

    def wait_for_wl_to_finish(self, timeout=18000, sleep=300):
        """
        Wait until the workload has finished and collect the test log

        Args:
            timeout (int): time in seconds to wait until the benchmark finishes
            sleep (int): Sleep interval seconds

        Raises:
            Exception: if the benchmark pod restarted too many times.

        """
        log.info(f"Waiting for {self.client_pod_name} to complete")

        Finished = 0
        restarts = 0
        while not Finished:
            results = run_oc_command(
                "get pod --no-headers -o custom-columns=:metadata.name,:status.phase",
                namespace="my-ripsaw",
            )
            fname = ""
            for name in results:
                if re.search(self.client_pod_name, name):
                    (fname, status) = name.split()
                    continue
            if fname != self.client_pod:
                log.info(
                    f"The pod {self.client_pod} was restarted. The new client pod is {fname}"
                )
                self.client_pod = fname
                restarts += 1
            if restarts > 3:
                err_msg = f"Too much restarts of the benchmark ({restarts})"
                log.error(err_msg)
                raise Exception(err_msg)
            if status == "Succeeded":
                self.end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT",
                                              time.gmtime())
                self.test_logs = self.pod_obj.exec_oc_cmd(
                    f"logs {self.client_pod}", out_yaml_format=False)
                log.info(f"{self.client_pod} completed successfully")
                Finished = 1
            else:
                log.info(
                    f"{self.client_pod} is in {status} state, waiting for the Succeeded state."
                    f" Waiting another {sleep} sec. for the benchmark to complete")
                time.sleep(sleep)

        self.pod_obj.wait_for_resource(
            condition=constants.STATUS_COMPLETED,
            resource_name=self.client_pod,
            timeout=timeout,
            sleep=sleep,
        )

        # Getting the end time of the benchmark - for reporting.
        self.end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
        self.test_logs = self.pod_obj.exec_oc_cmd(f"logs {self.client_pod}",
                                                  out_yaml_format=False)
        # Saving the benchmark internal log into a file at the logs directory
        log_file_name = f"{self.full_log_path}/test-pod.log"
        try:
            with open(log_file_name, "w") as f:
                f.write(self.test_logs)
            log.info(f"The Test log can be found at : {log_file_name}")
        except Exception:
            log.warning(f"Cannot write the log to the file {log_file_name}")
        log.info(f"The {self.benchmark_name} benchmark complete")

    def copy_es_data(self, elasticsearch):
        """
        Copy data from the internal ES (if it exists) to the main ES

        Args:
            elasticsearch (obj): elasticsearch object (if it exists)

        """
        log.info(f"In copy_es_data Function - {elasticsearch}")
        if elasticsearch:
            log.info("Copy all data from Internal ES to Main ES")
            log.info("Dumping data from the Internal ES to tar ball file")
            elasticsearch.dumping_all_data(self.full_log_path)
            es_connection = self.backup_es
            es_connection["host"] = es_connection.pop("server")
            es_connection.pop("url")
            if elasticsearch_load(self.main_es, self.full_log_path):
                # Adding this sleep between the copy and the analyzing of the results
                # since sometimes the results of the read (just after write) are empty
                time.sleep(10)
                log.info(
                    f"All raw data for tests results can be found at : {self.full_log_path}"
                )
                return True
            else:
                log.warning("Cannot upload data into the Main ES server")
                return False

    def read_from_es(self, es, index, uuid):
        """
        Reading all results from elasticsearch server

        Args:
            es (dict): dictionary with elasticsearch info  {server, port}
            index (str): the index name to read from the elasticsearch server
            uuid (str): the test UUID to find in the elasticsearch server

        Returns:
            list : list of all results

        """

        con = Elasticsearch([{"host": es["server"], "port": es["port"]}])
        query = {"size": 1000, "query": {"match": {"uuid": uuid}}}

        try:
            results = con.search(index=index, body=query)
            full_data = []
            for res in results["hits"]["hits"]:
                full_data.append(res["_source"])
            return full_data

        except Exception as e:
            log.warning(f"{index} Not found in the Internal ES. ({e})")
            return []

    def es_connect(self):
        """
        Create elasticsearch connection to the server

        Return:
            bool : True if there is a connection to the ES, False if not.

        """

        OK = True  # the return value
        try:
            log.info(
                f"try to connect the ES : {self.es['server']}:{self.es['port']}"
            )
            self.es_con = Elasticsearch([{
                "host": self.es["server"],
                "port": self.es["port"]
            }])
        except Exception:
            log.error(f"Cannot connect to ES server {self.es}")
            OK = False

        # Testing the connection to the elastic-search
        if OK and not self.es_con.ping():
            log.error(f"Cannot connect to ES server {self.es}")
            OK = False

        return OK
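
A hedged sketch of how a test might string the PASTest helpers together, assuming the ripsaw and elasticsearch fixtures from the framework and a FIO benchmark CR template; the attribute values set before the helpers are called are illustrative assumptions:

from ocs_ci.ocs import constants
from ocs_ci.utility import templating


class SketchBenchmarkTest(PASTest):
    """Hypothetical subclass that shows the intended call order of the helpers above."""

    def test_benchmark(self, ripsaw, es):
        self.setup()
        self.benchmark_name = "fio"                 # assumed benchmark name
        self.client_pod_name = "fio-client"         # assumed client pod name pattern
        self.full_log_path = "/tmp/benchmark-logs"  # assumed log directory

        # Load a benchmark CR template (constant taken from the fio test further below)
        self.crd_data = templating.load_yaml(constants.FIO_CR_YAML)

        self.ripsaw_deploy(ripsaw)
        self.es_info_backup(es)
        self.set_storageclass(interface=constants.CEPHBLOCKPOOL)
        self.get_env_info()

        self.deploy_and_wait_for_wl_to_start(timeout=600)
        self.wait_for_wl_to_finish(timeout=3600, sleep=60)
        self.copy_es_data(es)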
Example #4
class CouchBase(PillowFight):
    """
    CouchBase workload operation
    """

    WAIT_FOR_TIME = 1800
    admission_parts = [
        constants.COUCHBASE_ADMISSION_SERVICE_ACCOUNT_YAML,
        constants.COUCHBASE_ADMISSION_CLUSTER_ROLE_YAML,
        constants.COUCHBASE_ADMISSION_CLUSTER_ROLE_BINDING_YAML,
        constants.COUCHBASE_ADMISSION_SECRET_YAML,
        constants.COUCHBASE_ADMISSION_DEPLOYMENT_YAML,
        constants.COUCHBASE_ADMISSION_SERVICE_YAML,
        constants.COUCHBASE_MUTATING_WEBHOOK_YAML,
        constants.COUCHBASE_VALIDATING_WEBHOOK_YAML,
    ]
    pod_obj = OCP(kind="pod")
    ns_obj = OCP(kind="namespace")
    couchbase_pod = OCP(kind="pod")
    secretsadder = OCP(kind="pod")
    admission_pod = []
    cb_worker = OCS()
    cb_examples = OCS()

    def __init__(self, **kwargs):
        """
        Initializer function

        """
        super().__init__(**kwargs)

    def is_up_and_running(self, pod_name, ocp_value):
        """
        Test if the pod specified is up and running.

        Args:
            pod_name (str): Name of pod being checked.
            ocp_value (object): object used for running oc commands

        Returns:
            bool: True if pod is running, False otherwise

        """
        if not pod_name:
            return False
        pod_info = ocp_value.exec_oc_cmd(f"get pods {pod_name} -o json")
        if pod_info["status"]["containerStatuses"][0]["ready"]:
            if "running" in pod_info["status"]["containerStatuses"][0][
                    "state"]:
                return True
        return False

    def setup_cb(self):
        """
        Create the admission parts, couchbase operator pod and couchbase worker secret

        """
        # Create admission controller
        log.info("Create admission controller process for Couchbase")
        self.up_adm_chk = OCP(namespace="default")
        self.up_check = OCP(namespace=constants.COUCHBASE_OPERATOR)
        self.adm_objects = []
        for adm_yaml in self.admission_parts:
            adm_data = templating.load_yaml(adm_yaml)
            adm_obj = OCS(**adm_data)
            adm_obj.create()
            self.adm_objects.append(adm_obj)

        # Wait for admission pod to be created
        for adm_pod in TimeoutSampler(
                self.WAIT_FOR_TIME,
                3,
                get_pod_name_by_pattern,
                "couchbase-operator-admission",
                "default",
        ):
            try:
                if self.is_up_and_running(adm_pod[0], self.up_adm_chk):
                    self.admission_pod = adm_pod[0]
                    break
            except IndexError:
                log.info("Admission pod is not ready yet")

        # Wait for admission pod to be running
        log.info("Waiting for admission pod to be running")
        admission_pod_obj = get_pod_obj(self.admission_pod,
                                        namespace="default")
        wait_for_resource_state(
            resource=admission_pod_obj,
            state=constants.STATUS_RUNNING,
            timeout=self.WAIT_FOR_TIME,
        )

        self.ns_obj.new_project(constants.COUCHBASE_OPERATOR)
        couchbase_data = templating.load_yaml(constants.COUCHBASE_CRD_YAML)
        self.couchbase_obj = OCS(**couchbase_data)
        self.couchbase_obj.create()
        op_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_ROLE)
        self.operator_role = OCS(**op_data)
        self.operator_role.create()
        self.serviceaccount = OCP(namespace=constants.COUCHBASE_OPERATOR)
        self.serviceaccount.exec_oc_cmd(
            "create serviceaccount couchbase-operator")

        dockercfgs = self.serviceaccount.exec_oc_cmd("get secrets")
        startloc = dockercfgs.find("couchbase-operator-dockercfg")
        newdockerstr = dockercfgs[startloc:]
        endloc = newdockerstr.find(" ")
        dockerstr = newdockerstr[:endloc]
        self.secretsadder.exec_oc_cmd(
            f"secrets link serviceaccount/couchbase-operator secrets/{dockerstr}"
        )
        self.rolebinding = OCP(namespace=constants.COUCHBASE_OPERATOR)
        rolebind_cmd = "".join([
            "create rolebinding couchbase-operator-rolebinding ",
            "--role couchbase-operator ",
            "--serviceaccount couchbase-operator-namespace:couchbase-operator",
        ])
        self.rolebinding.exec_oc_cmd(rolebind_cmd)
        dep_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_DEPLOY)
        self.cb_deploy = OCS(**dep_data)
        self.cb_deploy.create()
        # Wait for couchbase operator pod to be running
        for couchbase_pod in TimeoutSampler(
                self.WAIT_FOR_TIME,
                3,
                get_pod_name_by_pattern,
                "couchbase-operator",
                constants.COUCHBASE_OPERATOR,
        ):
            try:
                if self.is_up_and_running(couchbase_pod[0], self.up_check):
                    break
            except IndexError:
                log.info("Couchbase operator is not up")

        cb_work = templating.load_yaml(constants.COUCHBASE_WORKER_SECRET)
        self.cb_worker = OCS(**cb_work)
        self.cb_worker.create()

    def create_couchbase_worker(self, replicas=1, sc_name=None):
        """
        Deploy a Couchbase server and pillowfight workload using operator

        The couchbase workers do not come up unless there is an admission controller
        running.  The admission controller is started from the default project prior
        to bringing up the operator.  Secrets, rolebindings and serviceaccounts
        need to also be generated.

        Once the couchbase operator is running, we need to wait for the three
        worker pods to also be up.  Then a pillowfight task is started.

        After the pillowfight task has finished, the log is collected and
        analyzed.

        Raises:
            Exception: If pillowfight results indicate that a minimum performance
                level is not reached (1 second response time, less than 1000 ops
                per second)

        """
        logging.info("Creating pods..")
        cb_example = templating.load_yaml(constants.COUCHBASE_WORKER_EXAMPLE)
        if storagecluster_independent_check():
            cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
                "storageClassName"] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        cb_example["spec"]["servers"][0]["size"] = replicas
        if sc_name:
            cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
                "storageClassName"] = sc_name
        self.cb_examples = OCS(**cb_example)
        self.cb_examples.create()

        # Wait for last of three workers to be running.

        logging.info("Waiting for the pods to Running")
        for cb_wrk_pods in TimeoutSampler(
                self.WAIT_FOR_TIME,
                3,
                get_pod_name_by_pattern,
                "cb-example",
                constants.COUCHBASE_OPERATOR,
        ):
            try:
                if len(cb_wrk_pods) == replicas:
                    counter = 0
                    for cb_pod in cb_wrk_pods:
                        if self.is_up_and_running(cb_pod, self.up_check):
                            counter += 1
                            logging.info(f"Couchbase worker {cb_pod} is up")
                    if counter == replicas:
                        break
            except IndexError:
                logging.info(
                    f"Expected number of couchbase pods are {replicas} "
                    f"but only found {len(cb_wrk_pods)}")

    def run_workload(self,
                     replicas,
                     num_items=None,
                     num_threads=None,
                     run_in_bg=False):
        """
        Running the workload with the pillowfight operator

        Args:
            replicas (int): Number of pods
            num_items (int): Number of items to be loaded to the cluster
            num_threads (int): Number of threads
            run_in_bg (bool) : Optional run IOs in background

        """
        self.result = None
        logging.info("Running IOs...")
        if run_in_bg:
            executor = ThreadPoolExecutor(1)
            self.result = executor.submit(
                PillowFight.run_pillowfights,
                self,
                replicas=replicas,
                num_items=num_items,
                num_threads=num_threads,
            )
            return self.result
        PillowFight.run_pillowfights(self,
                                     replicas=replicas,
                                     num_items=num_items,
                                     num_threads=num_threads)

    def analyze_run(self, skip_analyze=False):
        """
        Analyzing the workload run logs

        Args:
            skip_analyze (bool): Option to skip logs analysis

        """
        if not skip_analyze:
            logging.info("Analyzing  workload run logs..")
            PillowFight.analyze_all(self)

    def respin_couchbase_app_pod(self):
        """
        Respin the couchbase app pod

        Returns:
            pod status

        """
        app_pod_list = get_pod_name_by_pattern("cb-example",
                                               constants.COUCHBASE_OPERATOR)
        app_pod = app_pod_list[random.randint(0, len(app_pod_list) - 1)]
        logging.info(f"respin pod {app_pod}")
        app_pod_obj = get_pod_obj(app_pod,
                                  namespace=constants.COUCHBASE_OPERATOR)
        app_pod_obj.delete(wait=True, force=False)
        wait_for_resource_state(resource=app_pod_obj,
                                state=constants.STATUS_RUNNING,
                                timeout=300)

    def get_couchbase_nodes(self):
        """
        Get nodes that contain a couchbase app pod

        Returns:
            list: List of nodes

        """
        app_pods_list = get_pod_name_by_pattern("cb-example",
                                                constants.COUCHBASE_OPERATOR)
        app_pod_objs = list()
        for pod in app_pods_list:
            app_pod_objs.append(
                get_pod_obj(pod, namespace=constants.COUCHBASE_OPERATOR))

        log.info("Create a list of nodes that contain a couchbase app pod")
        nodes_set = set()
        for pod in app_pod_objs:
            logging.info(f"pod {pod.name} located on "
                         f"node {pod.get().get('spec').get('nodeName')}")
            nodes_set.add(pod.get().get("spec").get("nodeName"))
        return list(nodes_set)

    def teardown(self):
        """
        Delete objects created in roughly reverse order of how they were created.

        """
        self.cb_examples.delete()
        self.cb_worker.delete()
        self.cb_deploy.delete()
        self.pod_obj.exec_oc_cmd(
            command=
            "delete rolebinding couchbase-operator-rolebinding -n couchbase-operator-namespace"
        )
        self.pod_obj.exec_oc_cmd(
            command=
            "delete serviceaccount couchbase-operator -n couchbase-operator-namespace"
        )
        self.operator_role.delete()
        self.couchbase_obj.delete()
        switch_to_project("default")
        self.ns_obj.delete_project(constants.COUCHBASE_OPERATOR)
        self.ns_obj.wait_for_delete(resource_name=constants.COUCHBASE_OPERATOR,
                                    timeout=90)
        for adm_obj in self.adm_objects:
            adm_obj.delete()

        # Wait until no couchbase pods are left in the default project;
        # without this wait, the teardown would sometimes fail because it
        # could still see one of the couchbase pods.
        for admin_pod in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                        get_pod_name_by_pattern, "couchbase",
                                        "default"):
            if admin_pod:
                continue
            else:
                break
        PillowFight.cleanup(self)
        switch_to_default_rook_cluster_project()
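
A short, hypothetical end-to-end flow for the CouchBase workload class above; the argument values are illustrative only:

def run_couchbase_workload():
    """Deploy couchbase, run pillowfight against it, analyze, then tear down."""
    cb = CouchBase()
    try:
        cb.setup_cb()
        cb.create_couchbase_worker(replicas=3)
        cb.run_workload(replicas=3, num_items=50000, num_threads=4)
        cb.analyze_run()
    finally:
        cb.teardown()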
Example #5
class ElasticSearch(object):
    """
    ElasticSearch Environment
    """
    def __init__(self):
        """
        Initializer function

        """
        log.info("Initializing the Elastic-Search environment object")
        self.namespace = "elastic-system"
        self.eck_path = "https://download.elastic.co/downloads/eck/1.1.2"
        self.eck_file = "all-in-one.yaml"
        self.pvc = "ocs_ci/templates/app-pods/es-pvc.yaml"
        self.crd = "ocs_ci/templates/app-pods/esq.yaml"
        self.lspid = None

        # Creating some different types of OCP objects
        self.ocp = OCP(kind="pod",
                       resource_name="elastic-operator-0",
                       namespace=self.namespace)
        self.ns_obj = OCP(kind="namespace", namespace=self.namespace)
        self.es = OCP(resource_name="quickstart-es-http",
                      namespace=self.namespace)
        self.elasticsearch = OCP(namespace=self.namespace,
                                 kind="elasticsearch")
        self.password = OCP(
            kind="secret",
            resource_name="quickstart-es-elastic-user",
            namespace=self.namespace,
        )

        # Fetch the all-in-one.yaml from the official repository
        self._get_eck_file()
        # Deploy the ECK all-in-one.yaml file
        self._deploy_eck()
        # Deploy the Elastic-Search server
        self._deploy_es()

        # Verify that ES is Up & Running
        timeout = 600
        while timeout > 0:
            if self.get_health():
                log.info("The ElasticSearch server is ready !")
                break
            else:
                log.warning("The ElasticSearch server is not ready yet")
                log.info("going to sleep for 30 sec. before next check")
                time.sleep(30)
                timeout -= 30

        # Starting LocalServer process - port forwarding
        self.local_server()

        # Connect to the server
        self.con = self._es_connect()

    def _get_eck_file(self):
        """
        Get the ECK file from the official Elasticsearch web site and store
        it as a temporary file.

        The current version is 1.1.2; it needs to be updated (after testing)
        when new versions are released, and the CRD file (esq.yaml) may need
        to be updated to the new version as well.

        """

        self.dir = tempfile.mkdtemp(prefix="elastic-system_")
        src_file = f"{self.eck_path}/{self.eck_file}"
        trg_file = f"{self.dir}/{self.eck_file}"
        log.info(f"Retrieving the ECK CR file from {src_file} into {trg_file}")
        try:
            urllib.request.urlretrieve(src_file, trg_file)
        except urllib.error.HTTPError as e:
            log.error(f"Can not connect to {src_file} : {e}")
            raise e

    def _deploy_eck(self):
        """
        Deploy the ECK environment for Elasticsearch and make sure the
        operator pod is in the Running state

        """

        log.info("Deploying the ECK environment for the ES cluster")
        self.ocp.apply(f"{self.dir}/{self.eck_file}")

        for es_pod in TimeoutSampler(300, 10, get_pod_name_by_pattern,
                                     "elastic-operator", self.namespace):
            try:
                if es_pod[0] is not None:
                    self.eckpod = es_pod[0]
                    log.info(f"The ECK pod {self.eckpod} is ready !")
                    break
            except IndexError:
                log.info("ECK operator pod not ready yet")

    def get_ip(self):
        """
        Return the IP address of the Elasticsearch cluster.
        This IP is for use inside the OCP cluster.

        Return
            str : String that represents the IP address.

        """
        return self.es.get()["spec"]["clusterIP"]

    def get_port(self):
        """
        Return the port of the Elasticsearch cluster.

        Return
            str : String that represents the port.

        """
        return self.es.get()["spec"]["ports"][0]["port"]

    def _deploy_es(self):
        log.info("Deploy the PVC for the ElasticSearch cluster")
        self.ocp.apply(self.pvc)

        log.info("Deploy the ElasticSearch cluster")
        self.ocp.apply(self.crd)

        for es_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                     "quickstart-es-default", self.namespace):
            try:
                if es_pod[0] is not None:
                    self.espod = es_pod[0]
                    log.info(f"The ElasticSearch pod {self.espod} Started")
                    break
            except IndexError:
                log.info("elasticsearch pod not ready yet")

        es_pod = OCP(kind="pod", namespace=self.namespace)
        log.info("Waiting for ElasticSearch to Run")
        assert es_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=self.espod,
            sleep=30,
            timeout=600,
        )
        log.info("Elastic Search is ready !!!")

    def get_health(self):
        """
        Return the health status of Elasticsearch.

        Returns:
            bool : True if the status is green (OK) otherwise - False

        """
        return self.elasticsearch.get(
        )["items"][0]["status"]["health"] == "green"

    def get_password(self):
        """
        Return the password used to connect to Elasticsearch.

        Returns:
            str : The password as text

        """
        return base64.b64decode(
            self.password.get()["data"]["elastic"]).decode("utf-8")

    def cleanup(self):
        """
        Cleanup the environment from all Elasticsearch components, and from the
        port forwarding process.

        """
        log.info("Teardown the Elasticsearch environment")
        log.info(f"Killing the local server process ({self.lspid})")
        os.kill(self.lspid, signal.SIGKILL)
        log.info("Deleting all resources")
        subprocess.run(f"oc delete -f {self.crd}", shell=True)
        subprocess.run(f"oc delete -f {self.eck_file}",
                       shell=True,
                       cwd=self.dir)
        self.ns_obj.wait_for_delete(resource_name=self.namespace)

    def local_server(self):
        """
        Start a sub-process that does port-forwarding, to allow access to the
        Elasticsearch server from outside the OpenShift cluster.

        """
        cmd = f"oc -n {self.namespace } "
        cmd += f"port-forward service/quickstart-es-http {self.get_port()}"
        log.info(f"Going to run : {cmd}")
        proc = subprocess.Popen(cmd, shell=True)
        log.info(f"Starting LocalServer with PID of {proc.pid}")
        self.lspid = proc.pid

    def _es_connect(self):
        """
        Create a connection to the ES via the localhost port-fwd

        Returns:
            Elasticsearch: elasticsearch connection object

        Raise:
            ConnectionError: if it cannot connect to the server

        """
        try:
            es = Elasticsearch([{
                "host": "localhost",
                "port": self.get_port()
            }])
        except esexp.ConnectionError:
            log.error("Can not connect to ES server in the LocalServer")
            raise
        return es

    def get_indices(self):
        """
        Get a list of all indices in the ES server. All of them were created by
        the test, since the ES installation comes with no pre-installed indices.

        Returns:
            list : list of all indices defined in the ES server

        """
        results = []
        log.info("Getting all indices")
        for ind in self.con.indices.get_alias("*"):
            results.append(ind)
        return results

    def _copy(self, es):
        """
        Copy All data from the internal ES server to the main ES

        Args:
            es (obj): elasticsearch object which connected to the main ES

        """

        query = {"size": 1000, "query": {"match_all": {}}}
        for ind in self.get_indices():
            log.info(f"Reading {ind} from internal ES server")
            try:
                result = self.con.search(index=ind, body=query)
            except esexp.NotFoundError:
                log.warning(f"{ind} Not found in the Internal ES.")
                continue

            log.debug(f"The results from internal ES for {ind} are :{result}")
            log.info(f"Writing {ind} into main ES server")
            for doc in result["hits"]["hits"]:
                log.debug(f"Going to write : {doc}")
                es.index(index=ind, doc_type="_doc", body=doc["_source"])
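
A hedged usage sketch of the ElasticSearch helper above: deploy the test instance, inspect it, copy its indices to a main ES connection and clean up. main_es is assumed to be an already-connected Elasticsearch client and log a module logger:

def collect_and_forward_results(main_es):
    """Deploy the test ES, dump its indices into the main ES, then clean up."""
    es = ElasticSearch()  # deploys ECK + the quickstart cluster and port-forwards it
    try:
        log.info(f"Internal ES reachable at {es.get_ip()}:{es.get_port()}")
        log.info(f"Indices created by the test: {es.get_indices()}")
        es._copy(main_es)  # push every test index to the main ES server
    finally:
        es.cleanup()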
Example #6
    def delete(self, retry=True):
        """
        Deletes the current namespacestore by using OC/CLI commands

        Args:
            retry (bool): Whether to retry the deletion if it fails

        """
        log.info(f"Cleaning up namespacestore {self.name}")

        def _oc_deletion_flow():
            try:
                OCP(
                    kind="namespacestore",
                    namespace=config.ENV_DATA["cluster_namespace"],
                ).delete(resource_name=self.name)
                return True
            except CommandFailed as e:
                if "not found" in str(e).lower():
                    log.warning(f"Namespacestore {self.name} was already deleted.")
                    return True
                elif all(
                    err in e.args[0]
                    for err in ["cannot complete because pool", "in", "state"]
                ):
                    if retry:
                        log.warning(
                            f"Deletion of {self.name} failed due to its state; Retrying"
                        )
                        return False
                    else:
                        raise
                else:
                    raise

        def _cli_deletion_flow():
            try:
                self.mcg_obj.exec_mcg_cmd(f"namespacestore delete {self.name}")
                return True
            except CommandFailed as e:
                if "being used by one or more buckets" in str(e).lower():
                    log.warning(
                        f"Deletion of {self.name} failed because it's being used by a bucket. "
                        "Retrying..."
                    )
                else:
                    log.warning(f"Deletion of self.name failed. Error:\n{str(e)}")
                return False

        cmdMap = {
            "oc": _oc_deletion_flow,
            "cli": _cli_deletion_flow,
        }
        if retry:
            sample = TimeoutSampler(
                timeout=120,
                sleep=20,
                func=cmdMap[self.method],
            )
            if not sample.wait_for_func_status(result=True):
                err_msg = f"Failed to delete {self.name}"
                log.error(err_msg)
                raise TimeoutExpiredError(err_msg)
        else:
            cmdMap[self.method]()

        log.info(f"Verifying whether namespacestore {self.name} exists after deletion")
        ns_deleted_successfully = False

        if self.method == "oc":
            try:
                OCP(
                    kind=constants.NAMESPACESTORE,
                    namespace=config.ENV_DATA["cluster_namespace"],
                    resource_name=self.name,
                ).get()
            except CommandFailed as e:
                if "not found" in str(e).lower():
                    log.info(f"Namespacestore {self.name} was deleted.")
                    ns_deleted_successfully = True
                else:
                    raise
        elif self.method == "cli":
            if self.name not in self.mcg_obj.exec_mcg_cmd("namespacestore list").stdout:
                ns_deleted_successfully = True

        assert (
            ns_deleted_successfully
        ), f"Namespacestore {self.name} was not deleted successfully"
Example #7
def test_upgrade():
    ceph_cluster = CephCluster()
    with CephHealthMonitor(ceph_cluster):
        namespace = config.ENV_DATA['cluster_namespace']
        version_before_upgrade = config.ENV_DATA.get("ocs_version")
        upgrade_version = config.UPGRADE.get("upgrade_ocs_version",
                                             version_before_upgrade)
        ocs_registry_image = config.UPGRADE.get('upgrade_ocs_registry_image')
        if ocs_registry_image:
            upgrade_version = get_ocs_version_from_image(ocs_registry_image)
        parsed_version_before_upgrade = parse_version(version_before_upgrade)
        parsed_upgrade_version = parse_version(upgrade_version)
        assert parsed_upgrade_version >= parsed_version_before_upgrade, (
            f"Version you would like to upgrade to: {upgrade_version} "
            f"is not higher or equal to the version you currently running: "
            f"{version_before_upgrade}")
        operator_selector = get_selector_for_ocs_operator()
        package_manifest = PackageManifest(
            resource_name=OCS_OPERATOR_NAME,
            selector=operator_selector,
        )
        channel = config.DEPLOYMENT.get('ocs_csv_channel')
        csv_name_pre_upgrade = package_manifest.get_current_csv(channel)
        log.info(f"CSV name before upgrade is: {csv_name_pre_upgrade}")
        csv_pre_upgrade = CSV(resource_name=csv_name_pre_upgrade,
                              namespace=namespace)
        pre_upgrade_images = get_images(csv_pre_upgrade.get())
        version_change = parsed_upgrade_version > parsed_version_before_upgrade
        if version_change:
            version_config_file = os.path.join(constants.CONF_DIR,
                                               'ocs_version',
                                               f'ocs-{upgrade_version}.yaml')
            load_config_file(version_config_file)
        ocs_catalog = CatalogSource(
            resource_name=constants.OPERATOR_CATALOG_SOURCE_NAME,
            namespace=constants.MARKETPLACE_NAMESPACE,
        )
        upgrade_in_current_source = config.UPGRADE.get(
            'upgrade_in_current_source', False)
        if not upgrade_in_current_source:
            if not ocs_catalog.is_exist():
                log.info("OCS catalog source doesn't exist. Creating new one.")
                create_catalog_source(ocs_registry_image, ignore_upgrade=True)
            image_url = ocs_catalog.get_image_url()
            image_tag = ocs_catalog.get_image_name()
            log.info(f"Current image is: {image_url}, tag: {image_tag}")
            if ocs_registry_image:
                image_url, new_image_tag = ocs_registry_image.split(':')
            elif (config.UPGRADE.get('upgrade_to_latest', True)
                  or version_change):
                new_image_tag = get_latest_ds_olm_tag()
            else:
                new_image_tag = get_next_version_available_for_upgrade(
                    image_tag)
            cs_data = deepcopy(ocs_catalog.data)
            image_for_upgrade = ':'.join([image_url, new_image_tag])
            log.info(f"Image: {image_for_upgrade} will be used for upgrade.")
            cs_data['spec']['image'] = image_for_upgrade

            with NamedTemporaryFile() as cs_yaml:
                dump_data_to_temp_yaml(cs_data, cs_yaml.name)
                ocs_catalog.apply(cs_yaml.name)
        # Wait for the new package manifest for upgrade.
        operator_selector = get_selector_for_ocs_operator()
        package_manifest = PackageManifest(
            resource_name=OCS_OPERATOR_NAME,
            selector=operator_selector,
        )
        package_manifest.wait_for_resource()
        channel = config.DEPLOYMENT.get('ocs_csv_channel')
        if not channel:
            channel = package_manifest.get_default_channel()

        # update subscription
        subscription = OCP(
            resource_name=constants.OCS_SUBSCRIPTION,
            kind='subscription',
            namespace=config.ENV_DATA['cluster_namespace'],
        )
        current_ocs_source = subscription.data['spec']['source']
        log.info(f"Current OCS subscription source: {current_ocs_source}")
        ocs_source = current_ocs_source if upgrade_in_current_source else (
            constants.OPERATOR_CATALOG_SOURCE_NAME)
        patch_subscription_cmd = (
            f'oc patch subscription {constants.OCS_SUBSCRIPTION} '
            f'-n {namespace} --type merge -p \'{{"spec":{{"channel": '
            f'"{channel}", "source": "{ocs_source}"}}}}\'')
        run_cmd(patch_subscription_cmd)

        subscription_plan_approval = config.DEPLOYMENT.get(
            'subscription_plan_approval')
        if subscription_plan_approval == 'Manual':
            wait_for_install_plan_and_approve(namespace)
        attempts = 145
        for attempt in range(1, attempts + 1):
            log.info(f"Attempt {attempt}/{attempts} to check CSV upgraded.")
            csv_name_post_upgrade = package_manifest.get_current_csv(channel)
            if csv_name_post_upgrade == csv_name_pre_upgrade:
                log.info(f"CSV is still: {csv_name_post_upgrade}")
                sleep(5)
            else:
                log.info(f"CSV now upgraded to: {csv_name_post_upgrade}")
                break
            if attempts == attempt:
                raise TimeoutException("No new CSV found after upgrade!")
        csv_post_upgrade = CSV(resource_name=csv_name_post_upgrade,
                               namespace=namespace)
        log.info(
            f"Waiting for CSV {csv_name_post_upgrade} to be in succeeded state"
        )
        if version_before_upgrade == '4.2' and upgrade_version == '4.3':
            log.info("Force creating Ceph toolbox after upgrade 4.2 -> 4.3")
            setup_ceph_toolbox(force_setup=True)
        csv_post_upgrade.wait_for_phase("Succeeded", timeout=600)
        post_upgrade_images = get_images(csv_post_upgrade.get())
        old_images, _, _ = get_upgrade_image_info(pre_upgrade_images,
                                                  post_upgrade_images)
        verify_image_versions(old_images, parsed_upgrade_version)
        ocs_install_verification(
            timeout=600,
            skip_osd_distribution_check=True,
            ocs_registry_image=ocs_registry_image,
            post_upgrade_verification=True,
        )
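
The CSV polling loop inside the upgrade test can be isolated into a small helper; a sketch under the same assumptions (package_manifest, log, sleep and TimeoutException as used in the test above):

def wait_for_new_csv(package_manifest, channel, old_csv, attempts=145, sleep_sec=5):
    """Poll the package manifest until the current CSV differs from the pre-upgrade one."""
    for attempt in range(1, attempts + 1):
        current_csv = package_manifest.get_current_csv(channel)
        if current_csv != old_csv:
            log.info(f"CSV now upgraded to: {current_csv}")
            return current_csv
        log.info(f"Attempt {attempt}/{attempts}: CSV is still {current_csv}")
        sleep(sleep_sec)
    raise TimeoutException("No new CSV found after upgrade!")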
Example #8
    def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
        """
        This is a basic fio perf test

        """

        # Deployment ripsaw
        log.info("Deploying ripsaw operator")
        ripsaw.apply_crd("resources/crds/" "ripsaw_v1alpha1_ripsaw_crd.yaml")
        if interface == "CephBlockPool":
            sc = constants.CEPHBLOCKPOOL_SC
        else:
            sc = constants.CEPHFILESYSTEM_SC

        # Create fio benchmark
        log.info("Create resource file for fio workload")
        fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

        # Save the original elastic-search IP and port - if defined in the yaml
        if "elasticsearch" in fio_cr["spec"]:
            backup_es = fio_cr["spec"]["elasticsearch"]
        else:
            log.warning(
                "Elastic Search information does not exist in the YAML file")
            backup_es = {}
            fio_cr["spec"]["elasticsearch"] = {}

        # Use the internal define elastic-search server in the test - if exist
        if es:
            fio_cr["spec"]["elasticsearch"] = {
                "server": es.get_ip(),
                "port": es.get_port(),
            }

        # Setting the data set to 40% of the total storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = ceph_cluster.get_ceph_capacity()
        total_data_set = int(ceph_capacity * 0.4)
        filesize = int(fio_cr["spec"]["workload"]["args"]["filesize"].replace(
            "GiB", ""))
        # To make sure the number of app pods will not be more than 50 in case
        # of a large data set, change the size of the file each pod will work on
        if total_data_set > 500:
            filesize = int(ceph_capacity * 0.008)
            fio_cr["spec"]["workload"]["args"]["filesize"] = f"{filesize}GiB"
            # make sure that the storage size is larger than the file size
            fio_cr["spec"]["workload"]["args"][
                "storagesize"] = f"{int(filesize * 1.2)}Gi"
        fio_cr["spec"]["workload"]["args"]["servers"] = int(total_data_set /
                                                            filesize)
        log.info(f"Total Data set to work on is : {total_data_set} GiB")

        environment = get_environment_info()
        if not environment["user"] == "":
            fio_cr["spec"]["test_user"] = environment["user"]
        fio_cr["spec"]["clustername"] = environment["clustername"]

        log.debug(f"Environment information is : {environment}")

        fio_cr["spec"]["workload"]["args"]["storageclass"] = sc
        if io_pattern == "sequential":
            fio_cr["spec"]["workload"]["args"]["jobs"] = ["write", "read"]
            fio_cr["spec"]["workload"]["args"]["iodepth"] = 1
        log.info(f"The FIO CR file is {fio_cr}")
        fio_cr_obj = OCS(**fio_cr)
        fio_cr_obj.create()

        # Wait for fio client pod to be created
        for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                      "fio-client",
                                      constants.RIPSAW_NAMESPACE):
            try:
                if fio_pod[0] is not None:
                    fio_client_pod = fio_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        # Getting the start time of the test
        start_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

        # Getting the UUID from inside the benchmark pod
        uuid = ripsaw.get_uuid(fio_client_pod)
        # Setting back the original elastic-search information
        fio_cr["spec"]["elasticsearch"] = backup_es

        full_results = FIOResultsAnalyse(uuid, fio_cr)

        # Initialize the results doc file.
        for key in environment:
            full_results.add_key(key, environment[key])

        # Setting the global parameters of the test
        full_results.add_key("io_pattern", io_pattern)
        full_results.add_key("dataset", f"{total_data_set}GiB")
        full_results.add_key("file_size",
                             fio_cr["spec"]["workload"]["args"]["filesize"])
        full_results.add_key("servers",
                             fio_cr["spec"]["workload"]["args"]["servers"])
        full_results.add_key("samples",
                             fio_cr["spec"]["workload"]["args"]["samples"])
        full_results.add_key("operations",
                             fio_cr["spec"]["workload"]["args"]["jobs"])
        full_results.add_key("block_sizes",
                             fio_cr["spec"]["workload"]["args"]["bs"])
        full_results.add_key("io_depth",
                             fio_cr["spec"]["workload"]["args"]["iodepth"])
        full_results.add_key("jobs",
                             fio_cr["spec"]["workload"]["args"]["numjobs"])
        full_results.add_key(
            "runtime",
            {
                "read": fio_cr["spec"]["workload"]["args"]["read_runtime"],
                "write": fio_cr["spec"]["workload"]["args"]["write_runtime"],
            },
        )
        full_results.add_key(
            "storageclass", fio_cr["spec"]["workload"]["args"]["storageclass"])
        full_results.add_key("vol_size",
                             fio_cr["spec"]["workload"]["args"]["storagesize"])

        # Wait for the fio pod to initialize and complete
        log.info("Waiting for fio_client to complete")
        pod_obj = OCP(kind="pod")
        pod_obj.wait_for_resource(
            condition="Completed",
            resource_name=fio_client_pod,
            timeout=18000,
            sleep=300,
        )

        # Getting the end time of the test
        end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
        full_results.add_key("test_time", {
            "start": start_time,
            "end": end_time
        })

        output = run_cmd(f"oc logs {fio_client_pod}")
        log.info(f"The Test log is : {output}")

        if "Fio failed to execute" not in output:
            log.info("FIO has completed successfully")
        else:
            log.error("FIO failed to complete")

        # Clean up fio benchmark
        log.info("Deleting FIO benchmark")
        fio_cr_obj.delete()

        log.debug(f"Full results is : {full_results.results}")

        # If an internal ES exists, copy all data from the internal ES to the main ES
        if es:
            log.info("Copy all data from Internal ES to Main ES")
            es._copy(full_results.es)
        # Sleep between the copy and the analysis of the results, since the
        # read results (taken just after the write) are sometimes still empty
        time.sleep(30)
        full_results.analyze_results()  # Analyze the results
        # Writing the analyzed test results to the Elastic-Search server
        full_results.es_write()
        full_results.codespeed_push()  # Push results to codespeed
        # Creating full link to the results on the ES server
        log.info(f"The results can be found at: {full_results.results_link()}")

    def test_replication_with_disruptions(
        self,
        awscli_pod_session,
        mcg_obj_session,
        cld_mgr,
        bucket_factory,
        source_bucketclass,
        target_bucketclass,
        test_directory_setup,
        nodes,
    ):

        # Check uni-directional bucket replication from a multi (aws+azure)
        # namespace bucket to an s3-compatible namespace bucket
        target_bucket_name = bucket_factory(
            bucketclass=target_bucketclass)[0].name
        replication_policy = ("basic-replication-rule", target_bucket_name,
                              None)
        source_bucket_name = bucket_factory(
            bucketclass=source_bucketclass,
            replication_policy=replication_policy)[0].name
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            source_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=5,
            pattern="first-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Uni-directional bucket replication working as expected")

        # change from uni-directional to bi-directional replication policy
        logger.info(
            "Changing the replication policy from uni to bi-directional!")
        bi_replication_policy_dict = {
            "spec": {
                "additionalConfig": {
                    "replicationPolicy":
                    json.dumps([{
                        "rule_id": "basic-replication-rule-2",
                        "destination_bucket": source_bucket_name,
                    }])
                }
            }
        }
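        # The patch below appends a reverse replication rule to the target
        # bucket's OBC, pointing back at the source bucket, which turns the
        # existing uni-directional setup into bi-directional replication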
        OCP(
            namespace=config.ENV_DATA["cluster_namespace"],
            kind="obc",
            resource_name=target_bucket_name,
        ).patch(params=json.dumps(bi_replication_policy_dict),
                format_type="merge")
        logger.info(
            "Patch ran successfully! Changed the replication policy from uni to bi directional"
        )

        # Write objects to the second bucket and check that they are replicated to the other
        logger.info("checking if bi-directional replication works!!")
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=3,
            pattern="second-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")
        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Bi directional bucket replication working as expected")

        # Delete all the objects from the s3-compatible namespace bucket and then
        # recover them from the other namespace bucket on the next write
        logger.info(
            "checking replication when one of the bucket's objects are deleted!!"
        )
        try:
            mcg_obj_session.s3_resource.Bucket(
                target_bucket_name).objects.all().delete()
        except CommandFailed as e:
            logger.error(f"[Error] while deleting objects: {e}")
        assert (
            len(mcg_obj_session.s3_list_all_objects_in_bucket(target_bucket_name)) == 0
        ), f"[Error] Objects were unexpectedly not deleted from {target_bucket_name}"
        logger.info("All the objects in RGW namespace buckets are deleted!!!")

        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=1,
            pattern="third-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info(
            "All the objects retrieved back to s3-compatible bucket on new write!!"
        )

        # restart RGW pods and then see if object sync still works
        logger.info(
            "Checking if the replication works when there are RGW pod restarts!!"
        )
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=1,
            pattern="fourth-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        pod_names = get_pod_name_by_pattern(
            "rgw", namespace=config.ENV_DATA["cluster_namespace"])
        pod_objs = get_rgw_pods(namespace=config.ENV_DATA["cluster_namespace"])
        delete_pods(pod_objs=pod_objs)
        wait_for_pods_to_be_running(
            pod_names=pod_names,
            namespace=config.ENV_DATA["cluster_namespace"])

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Object sync works after the RGW pod restarted!!")

        # Write some objects to one of the buckets, followed by an immediate cluster restart
        logger.info("Checking replication when there is a cluster reboot!!")
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=1,
            pattern="fifth-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        node_list = get_worker_nodes()
        node_objs = get_node_objs(node_list)
        nodes.restart_nodes(node_objs, timeout=500)
        # Retry the connectivity check until the API server is reachable again
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        wait_for_pods_to_be_running(
            namespace=config.ENV_DATA["cluster_namespace"], timeout=800)
        logger.info("Nodes rebooted successfully!!")

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Objects sync works even when the cluster is rebooted")
Example #10
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled
    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data['spec']['version']
    ocs_version = config.ENV_DATA['ocs_version']
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}")
    assert ocs_version in csv_version, (
        f"OCS version: {ocs_version} mismatch with CSV version {csv_version}")
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        'ocs_registry_image')
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}")
        ignore_csv_mismatch = config.DEPLOYMENT.get('ignore_csv_mismatch')
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV")
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}")

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in "
             f"Ready phase")
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    if not config.DEPLOYMENT['external_mode']:
        osd_count = (int(
            storage_cluster.data['spec']['storageDeviceSets'][0]['count']) *
                     int(storage_cluster.data['spec']['storageDeviceSets'][0]
                         ['replica']))
    rgw_count = None
    if config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS:
        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1857802 - RGW count is 1
        # post upgrade to OCS 4.5. Tracked with
        # https://github.com/red-hat-storage/ocs-ci/issues/2532
        rgw_count = 2 if float(config.ENV_DATA['ocs_version']) >= 4.5 and not (
            post_upgrade_verification) else 1

    # With 4.4 OCS cluster deployed over Azure, RGW is the default backingstore
    if float(config.ENV_DATA['ocs_version']) == 4.4 and config.ENV_DATA.get(
            'platform') == constants.AZURE_PLATFORM:
        rgw_count = 1
    if float(config.ENV_DATA['ocs_version']) == 4.5 and config.ENV_DATA.get(
            'platform'
    ) == constants.AZURE_PLATFORM and post_upgrade_verification:
        rgw_count = 1

    # Fetch the min and max Noobaa endpoints from the run config
    if check_nodes_specs(min_cpu=constants.MIN_NODE_CPU,
                         min_memory=constants.MIN_NODE_MEMORY):
        min_eps = config.DEPLOYMENT.get('min_noobaa_endpoints')
        max_eps = config.DEPLOYMENT.get('max_noobaa_endpoints')
    else:
        min_eps = 1
        max_eps = 1 if float(config.ENV_DATA['ocs_version']) < 4.6 else 2

    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps
    }
    if not config.DEPLOYMENT['external_mode']:
        resources_dict.update({
            constants.MON_APP_LABEL: 3,
            constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
            constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
            constants.OSD_APP_LABEL: osd_count,
            constants.MGR_APP_LABEL: 1,
            constants.MDS_APP_LABEL: 2,
            constants.RGW_APP_LABEL: rgw_count
        })

    for label, count in resources_dict.items():
        if label == constants.RGW_APP_LABEL:
            if config.ENV_DATA.get(
                    'platform') not in constants.ON_PREM_PLATFORMS:
                continue
        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector=label,
                                     resource_count=count,
                                     timeout=timeout)

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})")

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs', f'{storage_cluster_name}-ceph-rbd'
    }
    if config.DEPLOYMENT['external_mode']:
        required_storage_classes.update({
            f'{storage_cluster_name}-ceph-rgw',
            f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io'
        })
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name']
        for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not config.DEPLOYMENT['external_mode']:
        if not skip_osd_distribution_check:
            log.info(
                "Verifying OSDs are distributed evenly across worker nodes")
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
            deviceset_count = get_deviceset_count()
            node_names = [osd['spec']['nodeName'] for osd in osds]
            for node in node_names:
                assert not node_names.count(node) > deviceset_count, (
                    "OSDs are not distributed evenly across worker nodes")

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = ({
        item['metadata']['name']
        for item in csi_driver.get()['items']
    })
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if config.DEPLOYMENT['external_mode']:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(resource_name=(
            constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS))
    else:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    if not config.DEPLOYMENT['external_mode']:
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC names "
            "in the output.")

        if (config.DEPLOYMENT.get('local_storage')
                and config.ENV_DATA['platform'] !=
                constants.BAREMETALPSI_PLATFORM):
            deviceset_pvcs = get_compute_node_names()
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

        ct_pod = get_ceph_tools_pod()
        osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree',
                                        format='json')
        schemas = {
            'root': constants.OSD_TREE_ROOT,
            'rack': constants.OSD_TREE_RACK,
            'host': constants.OSD_TREE_HOST,
            'osd': constants.OSD_TREE_OSD,
            'region': constants.OSD_TREE_REGION,
            'zone': constants.OSD_TREE_ZONE
        }
        schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
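        # Constrain the 'host' schema so each host entry's name must be one of
        # the expected device set PVC names (or compute node names for LSO);
        # matched names are removed from the list as the tree is validated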
        for item in osd_tree['nodes']:
            validate(instance=item, schema=schemas[item['type']])
            if item['type'] == 'host':
                deviceset_pvcs.remove(item['name'])
        assert not deviceset_pvcs, (
            f"These device set PVCs are not listed in the ceph osd tree output "
            f"- {deviceset_pvcs}")
        log.info(
            "Verified ceph osd tree output. Device set PVC names are listed in the "
            "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if float(config.ENV_DATA['ocs_version']) < 4.6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            ])
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ('snapshot' not in container) and (
                    'snapshot' not in image
                ), (f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
        deployments = ocs_csv.get()['spec']['install']['spec']['deployments']
        rook_ceph_operator_deployment = [
            deployment_val for deployment_val in deployments
            if deployment_val['name'] == 'rook-ceph-operator'
        ]
        assert {
            'name': 'CSI_ENABLE_SNAPSHOTTER',
            'value': 'false'
        } in (rook_ceph_operator_deployment[0]['spec']['template']['spec']
              ['containers'][0]['env']
              ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
        log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd crush dump',
                                          format='')
        pool_names = [
            constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [
            rule for rule in crush_dump['rules']
            if rule['rule_name'] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule['steps']
                if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")

    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(namespace, health_check_tries,
                                   health_check_delay)
    if config.ENV_DATA.get('fips'):
        # In case that fips is enabled when deploying,
        # a verification of the installation of it will run
        # on all running state pods
        check_fips_enabled()
Example #11
    def test_smallfile_workload(
        self, ripsaw, es, file_size, files, threads, samples, interface
    ):
        """
        Run SmallFile Workload
        """

        # Loading the main template yaml file for the benchmark
        sf_data = templating.load_yaml(constants.SMALLFILE_BENCHMARK_YAML)

        # Saving the Original elastic-search IP and PORT - if defined in yaml
        if "elasticsearch" in sf_data["spec"]:
            sf_data["spec"]["elasticsearch"][
                "url"
            ] = f"http://{sf_data['spec']['elasticsearch']['server']}:{sf_data['spec']['elasticsearch']['port']}"
            backup_es = sf_data["spec"]["elasticsearch"]
        else:
            log.warning("Elastic Search information does not exist in the YAML file")
            backup_es = None
            sf_data["spec"]["elasticsearch"] = {}

        # Use the internally defined elastic-search server in the test, if it exists
        if es:
            sf_data["spec"]["elasticsearch"] = {
                "url": f"http://{es.get_ip()}:{es.get_port()}",
                "server": es.get_ip(),
                "port": es.get_port(),
            }

        log.info("Apply Operator CRD")
        ripsaw.apply_crd("resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml")
        if interface == constants.CEPHBLOCKPOOL:
            storageclass = constants.DEFAULT_STORAGECLASS_RBD
        else:
            storageclass = constants.DEFAULT_STORAGECLASS_CEPHFS
        log.info(f"Using {storageclass} Storageclass")
        sf_data["spec"]["workload"]["args"]["storageclass"] = storageclass
        log.info("Running SmallFile bench")

        """
            Setting up the parameters for this test
        """
        sf_data["spec"]["workload"]["args"]["file_size"] = file_size
        sf_data["spec"]["workload"]["args"]["files"] = files
        sf_data["spec"]["workload"]["args"]["threads"] = threads
        sf_data["spec"]["workload"]["args"]["samples"] = samples
        """
        Calculating the size of the volume that needs to be tested; it should
        be at least twice the total size of the files, and at least 100Gi.

        Since file_size is in KB and vol_size needs to be in GB, a unit
        conversion is needed.
        """
        vol_size = int(files * threads * file_size * 3)
        vol_size = int(vol_size / constants.GB2KB)
        if vol_size < 100:
            vol_size = 100
        sf_data["spec"]["workload"]["args"]["storagesize"] = f"{vol_size}Gi"
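        # Worked example with assumed parameters: files=50000, threads=4 and
        # file_size=16 KB give 50000 * 4 * 16 * 3 = 9,600,000 KB (~9 GB),
        # which is below the 100Gi floor, so storagesize is set to "100Gi"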
        environment = get_environment_info()
        if environment["user"] != "":
            sf_data["spec"]["test_user"] = environment["user"]
        else:
            # since the full results object needs this parameter, initialize it from the CR file
            environment["user"] = sf_data["spec"]["test_user"]

        sf_data["spec"]["clustername"] = environment["clustername"]

        sf_obj = OCS(**sf_data)
        sf_obj.create()
        log.info(f"The smallfile yaml file is {sf_data}")

        # wait for benchmark pods to get created - takes a while
        for bench_pod in TimeoutSampler(
            240,
            10,
            get_pod_name_by_pattern,
            "smallfile-client",
            constants.RIPSAW_NAMESPACE,
        ):
            try:
                if bench_pod[0] is not None:
                    small_file_client_pod = bench_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        bench_pod = OCP(kind="pod", namespace=constants.RIPSAW_NAMESPACE)
        log.info("Waiting for SmallFile benchmark to Run")
        assert bench_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=small_file_client_pod,
            sleep=30,
            timeout=600,
        )
        # Getting the start time of the test
        start_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

        test_start_time = time.time()

        # Timeout value chosen based on manual test runs
        timeout = 3600

        # Getting the UUID from inside the benchmark pod
        uuid = ripsaw.get_uuid(small_file_client_pod)
        # Setting back the original elastic-search information
        if backup_es:
            sf_data["spec"]["elasticsearch"] = backup_es

        full_results = SmallFileResultsAnalyse(uuid, sf_data)

        # Initialize the results doc file.
        for key in environment:
            full_results.add_key(key, environment[key])

        # Calculating the total size of the working data set - in GB
        full_results.add_key(
            "dataset",
            file_size
            * files
            * threads
            * full_results.results["clients"]
            / constants.GB2KB,
        )

        full_results.add_key(
            "global_options",
            {
                "files": files,
                "file_size": file_size,
                "storageclass": sf_data["spec"]["workload"]["args"]["storageclass"],
                "vol_size": sf_data["spec"]["workload"]["args"]["storagesize"],
            },
        )

        while True:
            logs = bench_pod.exec_oc_cmd(
                f"logs {small_file_client_pod}", out_yaml_format=False
            )
            if "RUN STATUS DONE" in logs:
                # Getting the end time of the test
                end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
                full_results.add_key(
                    "test_time", {"start": start_time, "end": end_time}
                )
                # If an internal ES exists, copy all data from the internal ES to the main ES
                if es:
                    log.info("Copy all data from Internal ES to Main ES")
                    es._copy(full_results.es)
                full_results.read()
                if not full_results.dont_check:
                    full_results.add_key("hosts", full_results.get_clients_list())
                    full_results.init_full_results()
                    full_results.aggregate_host_results()
                    test_status = full_results.aggregate_samples_results()
                    full_results.es_write()

                    # Creating full link to the results on the ES server
                    log.info(
                        f"The results can be found at: {full_results.results_link()}"
                    )
                else:
                    test_status = True

                break

            if timeout < (time.time() - test_start_time):
                raise TimeoutError("Timed out waiting for benchmark to complete")
            time.sleep(30)
        assert not get_logs_with_errors() and test_status, "Test Failed"
Example #12
def add_capacity(osd_size_capacity_requested):
    """
    Add storage capacity to the cluster

    Args:
        osd_size_capacity_requested(int): Requested osd size capacity

    Returns:
        int: The new storage device set count after the capacity is added

    Note:
    "StoragedeviceSets->count" represents the set of 3 OSDs.
    That is, if there are 3 OSDs in the system then count will be 1.
    If there are 6 OSDs then count is 2 and so on.
    By changing this value, we can add extra devices to the cluster.
    For example, if we want to expand the cluster by 3 more osds in a cluster that already has 3 osds,
    we can set count as 2. So, with each increase of count by 1,
    we get 3 OSDs extra added to the cluster.
    This is how we are going to 'add capacity' via automation.
    As we know that OCS has 3 way replica. That is, same data is placed in 3 OSDs.
    Because of this, the total usable capacity for apps from 3 OSDs
    will be the size of one OSD (all osds are of same size).
    If we want to add more capacity to the cluster then we need to add 3 OSDs of same size
    as that of the original OSD. add_capacity needs to accept the 'capacity_to_add' as an argument.
    From this we need to arrive at storagedeviceSets -> count and then
    "Patch" this count to get the required capacity to add.
    To do so, we use following formula:
    storageDeviceSets->count = (capacity requested / osd capacity) + existing storageDeviceSets count

    """
    osd_size_existing = get_osd_size()
    device_sets_required = int(osd_size_capacity_requested / osd_size_existing)
    old_storage_devices_sets_count = get_deviceset_count()
    new_storage_devices_sets_count = int(device_sets_required +
                                         old_storage_devices_sets_count)
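    # Worked example with assumed numbers: requesting 1024Gi with 512Gi OSDs
    # gives device_sets_required = 2; with an existing count of 1, the cluster
    # is patched below to a count of 3, i.e. 9 OSDs in total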
    lvpresent = localstorage.check_local_volume()
    if lvpresent:
        ocp_obj = OCP(kind='localvolume',
                      namespace=constants.LOCAL_STORAGE_NAMESPACE)
        localvolume_data = ocp_obj.get(resource_name='local-block')
        device_list = localvolume_data['spec']['storageClassDevices'][0][
            'devicePaths']
        final_device_list = localstorage.get_new_device_paths(
            device_sets_required, osd_size_capacity_requested)
        device_list.sort()
        final_device_list.sort()
        if device_list == final_device_list:
            raise ResourceNotFoundError("No Extra device found")
        param = f"""[{{ "op": "replace", "path": "/spec/storageClassDevices/0/devicePaths",
                                                 "value": {final_device_list}}}]"""
        log.info(f"Final device list : {final_device_list}")
        lvcr = localstorage.get_local_volume_cr()
        log.info("Patching Local Volume CR...")
        lvcr.patch(resource_name=lvcr.get()['items'][0]['metadata']['name'],
                   params=param.strip('\n'),
                   format_type='json')
        localstorage.check_pvs_created(
            int(len(final_device_list) / new_storage_devices_sets_count))
    sc = get_storage_cluster()
    # adding the storage capacity to the cluster
    params = f"""[{{ "op": "replace", "path": "/spec/storageDeviceSets/0/count",
                "value": {new_storage_devices_sets_count}}}]"""
    sc.patch(resource_name=sc.get()['items'][0]['metadata']['name'],
             params=params.strip('\n'),
             format_type='json')
    return new_storage_devices_sets_count
Example #13
    def run_pillowfights(self,
                         replicas=1,
                         num_items=None,
                         num_threads=None,
                         timeout=1800):
        """
        loop through all the yaml files extracted from the pillowfight repo
        and run them.  Run oc logs on the results and save the logs in self.logs
        directory

        Args:
            replicas (int): Number of pod replicas
            num_items (int): Number of items to be loaded to the cluster
            num_threads (int): Number of threads
            timeout (int): Time in seconds to wait for the pillowfight pods
                to complete

        """
        ocp_local = OCP(namespace=self.namespace)
        self.replicas = replicas
        for i in range(self.replicas):
            # for basic-pillowfight.yaml
            pfight = templating.load_yaml(constants.COUCHBASE_PILLOWFIGHT)
            pfight["metadata"]["name"] = "pillowfight-rbd-simple" + f"{i}"
            # change the name
            pfight["spec"]["template"]["spec"]["containers"][0]["command"][2] = (
                f"couchbase://cb-example-000{i}.cb-example."
                f"couchbase-operator-namespace.svc:8091/default?select_bucket=true"
            )
            # num of items
            pfight["spec"]["template"]["spec"]["containers"][0]["command"][
                4] = (str(num_items) if num_items else "20000")
            # num of threads
            pfight["spec"]["template"]["spec"]["containers"][0]["command"][
                13] = (str(num_threads) if num_threads else "20")
            lpillowfight = OCS(**pfight)
            lpillowfight.create()
        self.pods_info = {}

        for pillowfight_pods in TimeoutSampler(
                timeout,
                9,
                get_pod_name_by_pattern,
                "pillowfight",
                constants.COUCHBASE_OPERATOR,
        ):
            try:
                counter = 0
                for pf_pod in pillowfight_pods:
                    pod_info = self.up_check.exec_oc_cmd(
                        f"get pods {pf_pod} -o json")
                    pf_status = pod_info["status"]["containerStatuses"][0][
                        "state"]
                    if "terminated" in pf_status:
                        pf_completion_info = pf_status["terminated"]["reason"]
                        if pf_completion_info == constants.STATUS_COMPLETED:
                            counter += 1
                            self.pods_info.update({pf_pod: pf_completion_info})
                    elif "running" in pf_status:
                        pass
                if counter == self.replicas:
                    break
            except IndexError:
                log.info("Pillowfight not yet completed")

        log.info(self.pods_info)
        for pod, pf_completion_info in self.pods_info.items():
            if pf_completion_info == "Completed":
                pf_endlog = f"{pod}.log"
                pf_log = join(self.logs, pf_endlog)
                data_from_log = ocp_local.exec_oc_cmd(
                    f"logs -f {pod} --ignore-errors", out_yaml_format=False)
                data_from_log = data_from_log.replace("\x00", "")
                with open(pf_log, "w") as fd:
                    fd.write(data_from_log)

            elif pf_completion_info == "Error":
                raise Exception("Pillowfight failed to complete")
Example #14
class PillowFight(object):
    """
    Workload operation using PillowFight
    This class was modelled after the RipSaw class in this directory.
    """

    MIN_ACCEPTABLE_OPS_PER_SEC = 2000
    MAX_ACCEPTABLE_RESPONSE_TIME = 2000

    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            kwargs (dict):
                Following kwargs are valid
                repo: PillowFight repo to used - a github link
                branch: branch to use from the repo
                namespace: namespace for the operator

        Example Usage:
            r1 = PillowFight()
            r1.run_pillowfights()
            # To run a private yaml
            my_custom_bench = my_custom_bench.yaml
            run_cmd('oc apply -f my_custom_bench')
            # To get pillowfight data from a log file
            data = r1.parse_pillowfight_log(data_from_log)
            # To do basic sanity checking of data
            r1.sanity_check(data)

        """
        self.args = kwargs
        self.namespace = self.args.get("namespace",
                                       "couchbase-operator-namespace")
        self.ocp = OCP()
        self.up_check = OCP(namespace=constants.COUCHBASE_OPERATOR)
        self.logs = tempfile.mkdtemp(prefix="pf_logs_")

    def run_pillowfights(self,
                         replicas=1,
                         num_items=None,
                         num_threads=None,
                         timeout=1800):
        """
        loop through all the yaml files extracted from the pillowfight repo
        and run them.  Run oc logs on the results and save the logs in self.logs
        directory

        Args:
            replicas (int): Number of pod replicas
            num_items (int): Number of items to be loaded to the cluster
            num_threads (int): Number of threads
            timeout (int): Time in seconds to wait for the pillowfight pods
                to complete

        """
        ocp_local = OCP(namespace=self.namespace)
        self.replicas = replicas
        for i in range(self.replicas):
            # for basic-pillowfight.yaml
            pfight = templating.load_yaml(constants.COUCHBASE_PILLOWFIGHT)
            pfight["metadata"]["name"] = "pillowfight-rbd-simple" + f"{i}"
            # change the name
            pfight["spec"]["template"]["spec"]["containers"][0]["command"][2] = (
                f"couchbase://cb-example-000{i}.cb-example."
                f"couchbase-operator-namespace.svc:8091/default?select_bucket=true"
            )
            # num of items
            pfight["spec"]["template"]["spec"]["containers"][0]["command"][
                4] = (str(num_items) if num_items else "20000")
            # num of threads
            pfight["spec"]["template"]["spec"]["containers"][0]["command"][
                13] = (str(num_threads) if num_threads else "20")
            lpillowfight = OCS(**pfight)
            lpillowfight.create()
        self.pods_info = {}

        for pillowfight_pods in TimeoutSampler(
                timeout,
                9,
                get_pod_name_by_pattern,
                "pillowfight",
                constants.COUCHBASE_OPERATOR,
        ):
            try:
                counter = 0
                for pf_pod in pillowfight_pods:
                    pod_info = self.up_check.exec_oc_cmd(
                        f"get pods {pf_pod} -o json")
                    pf_status = pod_info["status"]["containerStatuses"][0][
                        "state"]
                    if "terminated" in pf_status:
                        pf_completion_info = pf_status["terminated"]["reason"]
                        if pf_completion_info == constants.STATUS_COMPLETED:
                            counter += 1
                            self.pods_info.update({pf_pod: pf_completion_info})
                    elif "running" in pf_status:
                        pass
                if counter == self.replicas:
                    break
            except IndexError:
                log.info("Pillowfight not yet completed")

        log.info(self.pods_info)
        for pod, pf_completion_info in self.pods_info.items():
            if pf_completion_info == "Completed":
                pf_endlog = f"{pod}.log"
                pf_log = join(self.logs, pf_endlog)
                data_from_log = ocp_local.exec_oc_cmd(
                    f"logs -f {pod} --ignore-errors", out_yaml_format=False)
                data_from_log = data_from_log.replace("\x00", "")
                with open(pf_log, "w") as fd:
                    fd.write(data_from_log)

            elif pf_completion_info == "Error":
                raise Exception("Pillowfight failed to complete")

    def analyze_all(self):
        """
        Analyze the data extracted into self.logs files

        """
        for path in listdir(self.logs):
            full_path = join(self.logs, path)
            log.info(f"Analyzing {full_path}")
            with open(full_path, "r") as fdesc:
                data_from_log = fdesc.read()
            log_data = self.parse_pillowfight_log(data_from_log)
            self.sanity_check(log_data)

    def sanity_check(self, stats):
        """
        Make sure the worst cases for ops per second and response times are
        within an acceptable range.

        """
        stat1 = min(stats["opspersec"])
        if stat1 < self.MIN_ACCEPTABLE_OPS_PER_SEC:
            raise Exception(f"Worst OPS/SEC value reported is {stat1}")
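        # resptimes keys are stored in microseconds (ms buckets are scaled to
        # us during parsing), so divide by 1000 to compare in milliseconds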
        stat2 = max(stats["resptimes"].keys()) / 1000
        if stat2 > self.MAX_ACCEPTABLE_RESPONSE_TIME:
            raise Exception(
                f"Worst response time reported is {stat2} milliseconds")

    def parse_pillowfight_log(self, data_from_log):
        """
        Parse the oc logs output of the pillowfight pod passed in.  Clean up
        the output to handle peculiarities in the couchbase log results,
        and generate a summary of the results.

        The dictionary returned has two keys: 'opspersec' and 'resptimes'.
        opspersec is a list of the ops per second numbers reported.
        resptimes is a dictionary indexed by the max response time of a range.
        Each entry in resptimes contains a minimum response time for that range,
        and a count of how many messages fall within that range.

        Args:
            data_from_log (str): log data

        Returns:
            dict: ops per sec and response time information

        """
        # The data in the couchbase logs is kind of abnormal.
        # It contains histograms with invalid unicode characters for yaml
        # output (which is why out_yaml_format=False is used).
        # It also seems to write a block of text inside another block at
        # an unpredictable location.  The parsing below simply skips any
        # line that contains such unparseable data.
        #
        # So what's left is a list of OPS/SEC values and a histogram of
        # response times.  This routine organizes that data.
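        # Illustrative (assumed) samples: a line "OPS/SEC: 2500" appends 2500
        # to ops_per_sec, while a histogram line like "[0 - 1]ms |## - 120"
        # becomes resp_hist[1000] = {"minindx": 0, "number": 120}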

        ops_per_sec = []
        resp_hist = {}
        log.info("*******Couchbase raw output log*********\n"
                 f"{data_from_log}")
        lines = data_from_log.split("\n")
        for dline in lines:
            try:
                if dline.startswith("OPS/SEC"):
                    dfields = dline.split(" ")
                    dnumb = int(dfields[-1].strip())
                    ops_per_sec.append(dnumb)
                if re.match("^\\[\\d+ +- \\d+ *\\][um]s \\|#* - \\d+", dline):
                    for element in ["[", "]", "|", "-", "#"]:
                        dline = dline.replace(element, " ")
                    parts = dline.split()
                    i1 = int(parts[0])
                    i2 = int(parts[1])
                    if parts[2] == "ms":
                        i1 *= 1000
                        i2 *= 1000
                    resp_hist[i2] = {"minindx": i1, "number": int(parts[3])}
            except ValueError:
                log.info(f"{dline} -- contains invalid data")
        ret_data = {"opspersec": ops_per_sec, "resptimes": resp_hist}
        return ret_data

    def export_pfoutput_to_googlesheet(self, sheet_name, sheet_index):
        """
        Collect pillowfight output to google spreadsheet

        Args:
            sheet_name (str): Name of the sheet
            sheet_index (int): Index of sheet

        """
        # Collect data and export to Google doc spreadsheet
        g_sheet = GoogleSpreadSheetAPI(sheet_name=sheet_name,
                                       sheet_index=sheet_index)
        log.info("Exporting pf data to google spreadsheet")
        for path in listdir(self.logs):
            full_path = join(self.logs, path)
            with open(full_path, "r") as fdesc:
                data_from_log = fdesc.read()
            log_data = self.parse_pillowfight_log(data_from_log)

            g_sheet.insert_row(
                [
                    f"{path}",
                    min(log_data["opspersec"]),
                    max(log_data["resptimes"].keys()) / 1000,
                ],
                2,
            )
        g_sheet.insert_row(["", "opspersec", "resptimes"], 2)

        # Capturing versions(OCP, OCS and Ceph) and test run name
        g_sheet.insert_row(
            [
                f"ocp_version:{utils.get_cluster_version()}",
                f"ocs_build_number:{utils.get_ocs_build_number()}",
                f"ceph_version:{utils.get_ceph_version()}",
                f"test_run_name:{utils.get_testrun_name()}",
            ],
            2,
        )

    def cleanup(self):
        """
        Remove pillowfight pods and temp files

        """
        rmtree(self.logs)
Example #15
def teardown_module():
    ocs_obj = OCP()
    ocs_obj.login_as_sa()
Example #16
class RipSaw(object):
    """
    Workload operation using RipSaw
    """
    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            kwargs (dict):
                Following kwargs are valid
                repo: Ripsaw repo to used - a github link
                branch: branch to use from the repo
                namespace: namespace for the operator

        Example Usage:
            r1 = RipSaw()
            r1.apply_crd(crd='ripsaw_v1alpha1_ripsaw_crd.yaml')
            # use oc apply to apply custom modified bench
            my_custom_bench = my_custom_bench.yaml
            run_cmd('oc apply -f my_custom_bench')
        """
        self.args = kwargs
        self.repo = self.args.get(
            "repo", "https://github.com/cloud-bulldozer/benchmark-operator")
        self.branch = self.args.get("branch", "master")
        self.namespace = self.args.get("namespace", RIPSAW_NAMESPACE)
        self.pgsql_is_setup = False
        self.ocp = OCP()
        self.ns_obj = OCP(kind="namespace")
        self.pod_obj = OCP(namespace=RIPSAW_NAMESPACE, kind="pod")
        self._create_namespace()
        self._clone_ripsaw()
        self.worker_nodes = [node.name for node in get_nodes()]
        helpers.label_worker_node(self.worker_nodes,
                                  label_key="kernel-cache-dropper",
                                  label_value="yes")

    def _create_namespace(self):
        """
        create namespace for RipSaw
        """
        self.ocp.new_project(self.namespace)

    def _clone_ripsaw(self):
        """
        clone the ripsaw repo
        """
        self.dir = tempfile.mkdtemp(prefix="ripsaw_")
        try:
            log.info(f"cloning ripsaw in {self.dir}")
            git_clone_cmd = f"git clone -b {self.branch} {self.repo} "
            run(git_clone_cmd, shell=True, cwd=self.dir, check=True)
            self.crd = "resources/crds/"
            self.operator = "resources/operator.yaml"
        except (CommandFailed, CalledProcessError) as cf:
            log.error("Error during cloning of ripsaw repository")
            raise cf

    def apply_crd(self, crd):
        """
        Apply the CRD

        Args:
            crd (str): Name of file to apply
        """
        self.dir += "/benchmark-operator"
        run("oc apply -f deploy", shell=True, check=True, cwd=self.dir)
        run(f"oc apply -f {crd}", shell=True, check=True, cwd=self.dir)
        run(f"oc apply -f {self.operator}",
            shell=True,
            check=True,
            cwd=self.dir)
        run(
            "oc create -f resources/kernel-cache-drop-clusterrole.yaml",
            shell=True,
            check=True,
            cwd=self.dir,
        )

    def get_uuid(self, benchmark):
        """
        Getting the UUID of the test.
           When ripsaw is used to run benchmark tests, each run gets its own
           UUID so the results on the elastic-search server can be sorted.

        Args:
            benchmark (str): the name of the main pod in the test

        Return:
            str: the UUID of the test

        """
        output = None
        count = 0
        while count <= 5:
            try:
                output = self.pod_obj.exec_oc_cmd(f"exec {benchmark} -- env")
                break
            except CommandFailed:
                time.sleep(3)
                count += 1
        uuid = ""
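        # The captured environment is expected to contain a line such as
        # "uuid=<run-id>" (illustrative value); the text after '=' is returned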
        if output:
            for line in output.split():
                if "uuid=" in line:
                    uuid = line.split("=")[1]
                    break
            log.info(f"The UUID of the test is : {uuid}")
        else:
            log.error(f"Can not get the UUID from {benchmark}")

        return uuid

    def cleanup(self):
        run(f"oc delete -f {self.crd}", shell=True, cwd=self.dir)
        run(f"oc delete -f {self.operator}", shell=True, cwd=self.dir)
        run("oc delete -f deploy", shell=True, cwd=self.dir)
        run_cmd(f"oc delete project {self.namespace}")
        run(
            "oc delete -f resources/kernel-cache-drop-clusterrole.yaml",
            shell=True,
            check=True,
            cwd=self.dir,
        )
        self.ns_obj.wait_for_delete(resource_name=self.namespace, timeout=180)
        # Reset namespace to default
        switch_to_default_rook_cluster_project()
        helpers.remove_label_from_worker_node(self.worker_nodes,
                                              label_key="kernel-cache-dropper")
Example #17
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    operator_selector = get_selector_for_ocs_operator()
    ocs_package_manifest = PackageManifest(
        resource_name=defaults.OCS_OPERATOR_NAME,
        selector=operator_selector,
    )
    channel = config.DEPLOYMENT.get('ocs_csv_channel')
    ocs_csv_name = ocs_package_manifest.get_current_csv(channel=channel)
    ocs_csv = CSV(resource_name=ocs_csv_name, namespace=namespace)
    log.info(f"Check if OCS operator: {ocs_csv_name} is in Succeeded phase.")
    ocs_csv.wait_for_phase(phase="Succeeded", timeout=timeout)
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data['spec']['version']
    ocs_version = config.ENV_DATA['ocs_version']
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}")
    assert ocs_version in csv_version, (
        f"OCS version: {ocs_version} mismatch with CSV version {csv_version}")
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        'ocs_registry_image')
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}")
        ignore_csv_mismatch = config.DEPLOYMENT.get('ignore_csv_mismatch')
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV")
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}")

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in "
             f"Ready phase")
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    # ocs-operator
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OCS_OPERATOR_LABEL,
                                 timeout=timeout)
    # rook-ceph-operator
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OPERATOR_LABEL,
                                 timeout=timeout)
    # noobaa
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.NOOBAA_APP_LABEL,
                                 resource_count=2,
                                 timeout=timeout)
    # mons
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MON_APP_LABEL,
                                 resource_count=3,
                                 timeout=timeout)
    # csi-cephfsplugin
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.CSI_CEPHFSPLUGIN_LABEL,
                                 resource_count=number_of_worker_nodes,
                                 timeout=timeout)
    # csi-cephfsplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout)
    # csi-rbdplugin
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.CSI_RBDPLUGIN_LABEL,
                                 resource_count=number_of_worker_nodes,
                                 timeout=timeout)
    # csi-rbdplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout)
    # osds
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count']) *
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica']))
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OSD_APP_LABEL,
                                 resource_count=osd_count,
                                 timeout=timeout)
    # mgr
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MGR_APP_LABEL,
                                 timeout=timeout)
    # mds
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MDS_APP_LABEL,
                                 resource_count=2,
                                 timeout=timeout)

    # rgw check only for VMware
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector=constants.RGW_APP_LABEL,
                                     resource_count=1,
                                     timeout=timeout)

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs', f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name']
        for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSDs are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        deviceset_count = get_deviceset_count()
        node_names = [osd['spec']['nodeName'] for osd in osds]
        for node in node_names:
            assert not node_names.count(node) > deviceset_count, (
                "OSD's are not distributed evenly across worker nodes")

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == ({
        item['metadata']['name']
        for item in csi_driver.get()['items']
    })

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    sc_rbd = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_RBD)
    sc_cephfs = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    log.info(
        "Verifying ceph osd tree output and checking for device set PVC names "
        "in the output.")

    if config.DEPLOYMENT.get('local_storage'):
        deviceset_pvcs = get_compute_node_names()
    else:
        deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

    ct_pod = get_ceph_tools_pod()
    osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')
    schemas = {
        'root': constants.OSD_TREE_ROOT,
        'rack': constants.OSD_TREE_RACK,
        'host': constants.OSD_TREE_HOST,
        'osd': constants.OSD_TREE_OSD,
        'region': constants.OSD_TREE_REGION,
        'zone': constants.OSD_TREE_ZONE
    }
    schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
    for item in osd_tree['nodes']:
        validate(instance=item, schema=schemas[item['type']])
        if item['type'] == 'host':
            deviceset_pvcs.remove(item['name'])
    assert not deviceset_pvcs, (
        f"These device set PVCs are not given in ceph osd tree output "
        f"- {deviceset_pvcs}")
    log.info(
        "Verified ceph osd tree output. Device set PVC names are given in the "
        "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    log.info("Verifying CSI snapshotter is not present.")
    provisioner_pods = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        ])
    for pod_obj in provisioner_pods:
        pod_info = pod_obj.get()
        for container, image in get_images(data=pod_info).items():
            assert ('snapshot' not in container) and (
                'snapshot' not in image), (
                    f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
    deployments = ocs_csv.get()['spec']['install']['spec']['deployments']
    rook_ceph_operator_deployment = [
        deployment_val for deployment_val in deployments
        if deployment_val['name'] == 'rook-ceph-operator'
    ]
    assert {
        'name': 'CSI_ENABLE_SNAPSHOTTER',
        'value': 'false'
    } in (rook_ceph_operator_deployment[0]['spec']['template']['spec']
          ['containers'][0]['env']
          ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
    log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd crush dump',
                                          format='')
        pool_names = [
            constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [
            rule for rule in crush_dump['rules']
            if rule['rule_name'] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule['steps']
                if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(namespace, health_check_tries,
                                   health_check_delay)
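
A minimal usage sketch for the verification helper above; the timeout values are illustrative and the post-upgrade flag only triggers the longer health-check loop shown at the end of the function.

# Illustrative only: typical invocations of the helper defined above.
ocs_install_verification(timeout=600, skip_osd_distribution_check=False)

# After an upgrade, request the longer ceph health-check window.
ocs_install_verification(timeout=600, post_upgrade_verification=True)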
Beispiel #18
    def __init__(self, *args, **kwargs):
        """
        Constructor for the MCG class
        """
        self.namespace = config.ENV_DATA["cluster_namespace"]
        self.operator_pod = Pod(**get_pods_having_label(
            constants.NOOBAA_OPERATOR_POD_LABEL, self.namespace)[0])
        self.core_pod = Pod(**get_pods_having_label(
            constants.NOOBAA_CORE_POD_LABEL, self.namespace)[0])

        self.retrieve_noobaa_cli_binary()
        """
        The certificate will be copied on each mcg_obj instantiation since
        the process is so light and quick, that the time required for the redundant
        copy is negligible in comparison to the time a hash comparison will take.
        """
        retrieve_default_ingress_crt()

        get_noobaa = OCP(kind="noobaa", namespace=self.namespace).get()

        self.s3_endpoint = (get_noobaa.get("items")[0].get("status").get(
            "services").get("serviceS3").get("externalDNS")[0])
        self.s3_internal_endpoint = (get_noobaa.get("items")[0].get(
            "status").get("services").get("serviceS3").get("internalDNS")[0])
        self.mgmt_endpoint = (get_noobaa.get("items")[0].get("status").get(
            "services").get("serviceMgmt").get("externalDNS")[0]) + "/rpc"
        self.region = config.ENV_DATA["region"]

        creds_secret_name = (get_noobaa.get("items")[0].get("status").get(
            "accounts").get("admin").get("secretRef").get("name"))
        secret_ocp_obj = OCP(kind="secret", namespace=self.namespace)
        creds_secret_obj = secret_ocp_obj.get(creds_secret_name)

        self.access_key_id = base64.b64decode(
            creds_secret_obj.get("data").get("AWS_ACCESS_KEY_ID")).decode(
                "utf-8")
        self.access_key = base64.b64decode(
            creds_secret_obj.get("data").get("AWS_SECRET_ACCESS_KEY")).decode(
                "utf-8")

        self.noobaa_user = base64.b64decode(
            creds_secret_obj.get("data").get("email")).decode("utf-8")
        self.noobaa_password = base64.b64decode(
            creds_secret_obj.get("data").get("password")).decode("utf-8")

        self.noobaa_token = self.retrieve_nb_token()

        self.s3_resource = boto3.resource(
            "s3",
            verify=retrieve_verification_mode(),
            endpoint_url=self.s3_endpoint,
            aws_access_key_id=self.access_key_id,
            aws_secret_access_key=self.access_key,
        )

        self.s3_client = self.s3_resource.meta.client

        if config.ENV_DATA["platform"].lower() == "aws" and kwargs.get(
                "create_aws_creds"):
            (
                self.cred_req_obj,
                self.aws_access_key_id,
                self.aws_access_key,
            ) = self.request_aws_credentials()

            self.aws_s3_resource = boto3.resource(
                "s3",
                endpoint_url="https://s3.amazonaws.com",
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_access_key,
            )

        if (config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS
                or storagecluster_independent_check()):
            if not config.ENV_DATA["platform"] == constants.AZURE_PLATFORM and (
                    version.get_semantic_ocs_version_from_config() >
                    version.VERSION_4_5):
                logger.info("Checking whether RGW pod is not present")
                pods = pod.get_pods_having_label(label=constants.RGW_APP_LABEL,
                                                 namespace=self.namespace)
                assert (
                    not pods
                ), "RGW pods should not exist in the current platform/cluster"

        elif config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
            rgw_count = get_rgw_count(config.ENV_DATA["ocs_version"],
                                      check_if_cluster_was_upgraded(), None)
            logger.info(
                f'Checking for RGW pod/s on {config.ENV_DATA.get("platform")} platform'
            )
            rgw_pod = OCP(kind=constants.POD, namespace=self.namespace)
            assert rgw_pod.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.RGW_APP_LABEL,
                resource_count=rgw_count,
                timeout=60,
            )
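
A hedged sketch of how the boto3 handles initialised in this constructor might be exercised; mcg_obj stands for an already instantiated MCG object and the bucket name is hypothetical.

# Sketch only: create a bucket and push one object via the NooBaa S3 endpoint.
bucket_name = "illustrative-test-bucket"
mcg_obj.s3_resource.create_bucket(Bucket=bucket_name)
mcg_obj.s3_client.put_object(
    Bucket=bucket_name, Key="hello.txt", Body=b"hello world"
)
# List object keys to confirm the upload went through.
keys = [obj.key for obj in mcg_obj.s3_resource.Bucket(bucket_name).objects.all()]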
Beispiel #19
class RipSaw(object):
    """
      Workload operation using RipSaw
    """
    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            kwargs (dict):
                Following kwargs are valid
                repo: Ripsaw repo to used - a github link
                branch: branch to use from the repo
                namespace: namespace for the operator

        Example Usage:
            r1 = RipSaw()
            r1.apply_crd(crd='ripsaw_v1alpha1_ripsaw_crd.yaml')
            # use oc apply to apply custom modified bench
            my_custom_bench = my_custom_bench.yaml
            run_cmd('oc apply -f my_custom_bench')
        """
        self.args = kwargs
        self.repo = self.args.get('repo',
                                  'https://github.com/cloud-bulldozer/ripsaw')
        self.branch = self.args.get('branch', 'master')
        self.namespace = self.args.get('namespace', RIPSAW_NAMESPACE)
        self.pgsql_is_setup = False
        self.ocp = OCP()
        self.ns_obj = OCP(kind='namespace')
        self.pod_obj = OCP(kind='pod')
        self._create_namespace()
        self._clone_ripsaw()

    def _create_namespace(self):
        """
        create namespace for RipSaw
        """
        self.ocp.new_project(self.namespace)

    def _clone_ripsaw(self):
        """
        clone the ripsaw repo
        """
        self.dir = tempfile.mkdtemp(prefix='ripsaw_')
        try:
            log.info(f'cloning ripsaw in {self.dir}')
            git_clone_cmd = f'git clone -b {self.branch} {self.repo} '
            run(git_clone_cmd, shell=True, cwd=self.dir, check=True)
            self.crd = 'resources/crds/'
            self.operator = 'resources/operator.yaml'
        except (CommandFailed, CalledProcessError) as cf:
            log.error('Error during cloning of ripsaw repository')
            raise cf

    def apply_crd(self, crd):
        """
        Apply the CRD

        Args:
            crd (str): Name of file to apply
        """
        self.dir += '/ripsaw'
        run('oc apply -f deploy', shell=True, check=True, cwd=self.dir)
        run(f'oc apply -f {crd}', shell=True, check=True, cwd=self.dir)
        run(f'oc apply -f {self.operator}',
            shell=True,
            check=True,
            cwd=self.dir)

    def cleanup(self):
        run(f'oc delete -f {self.crd}', shell=True, cwd=self.dir)
        run(f'oc delete -f {self.operator}', shell=True, cwd=self.dir)
        run('oc delete -f deploy', shell=True, cwd=self.dir)
        run_cmd(f'oc delete project {self.namespace}')
        self.ns_obj.wait_for_delete(resource_name=self.namespace)
        # Reset namespace to default
        switch_to_default_rook_cluster_project()
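
A short sketch of the RipSaw lifecycle defined above, kept deliberately minimal; the CRD path matches the one used by the smallfile test later in this file and the try/finally wrapping is an assumption, not project convention.

# Sketch only: clone, apply the operator CRD, then always clean up.
ripsaw = RipSaw()
try:
    ripsaw.apply_crd(crd='resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml')
    # A benchmark CR would normally be applied here with `oc apply -f <cr>.yaml`
finally:
    ripsaw.cleanup()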
Beispiel #20
def ocs_install_verification(timeout=600, skip_osd_distribution_check=False):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_package_manifest = PackageManifest(
        resource_name=defaults.OCS_OPERATOR_NAME
    )
    ocs_csv_name = ocs_package_manifest.get_current_csv()
    ocs_csv = CSV(
        resource_name=ocs_csv_name, namespace=namespace
    )
    log.info(f"Check if OCS operator: {ocs_csv_name} is in Succeeded phase.")
    ocs_csv.wait_for_phase(phase="Succeeded", timeout=timeout)

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(
        f"Check if StorageCluster: {storage_cluster_name} is in"
        f"Succeeded phase"
    )
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(
        kind=constants.POD, namespace=namespace
    )
    # ocs-operator
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.OCS_OPERATOR_LABEL,
        timeout=timeout
    )
    # rook-ceph-operator
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.OPERATOR_LABEL,
        timeout=timeout
    )
    # noobaa
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.NOOBAA_APP_LABEL,
        resource_count=2,
        timeout=timeout
    )
    # mons
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.MON_APP_LABEL,
        resource_count=3,
        timeout=timeout
    )
    # csi-cephfsplugin
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        resource_count=number_of_worker_nodes,
        timeout=timeout
    )
    # csi-cephfsplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout
    )
    # csi-rbdplugin
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        resource_count=number_of_worker_nodes,
        timeout=timeout
    )
    # csi-rbdplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout
    )
    # osds
    osd_count = storage_cluster.data['spec']['storageDeviceSets'][0]['count']
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_count,
        timeout=timeout
    )
    # mgr
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.MGR_APP_LABEL,
        timeout=timeout
    )
    # mds
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.MDS_APP_LABEL,
        resource_count=2,
        timeout=timeout
    )

    # rgw check only for VmWare
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.RGW_APP_LABEL,
            resource_count=1,
            timeout=timeout
        )

    # Verify ceph health
    log.info("Verifying ceph health")
    assert utils.ceph_health_check(namespace=namespace)

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(
        kind=constants.STORAGECLASS, namespace=namespace
    )
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs',
        f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name'] for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSDs are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        node_names = [osd['spec']['nodeName'] for osd in osds]
        for node in node_names:
            assert not node_names.count(node) > 1, (
                "OSD's are not distributed evenly across worker nodes"
            )
    def test_smallfile_workload(self, ripsaw, es, file_size, files, threads,
                                samples, interface):
        """
        Run SmallFile Workload
        """

        # Loading the main template yaml file for the benchmark
        sf_data = templating.load_yaml(constants.SMALLFILE_BENCHMARK_YAML)

        # Getting the name and email of the user running the test.
        try:
            user = run_cmd('git config --get user.name').strip()
            email = run_cmd('git config --get user.email').strip()
        except CommandFailed:
            # if no git user is defined, use the default user from the CR file
            user = sf_data['spec']['test_user']
            email = ''

        # Saving the Original elastic-search IP and PORT - if defined in yaml
        es_server = ""
        es_port = ""
        if 'elasticsearch' in sf_data['spec']:
            if 'server' in sf_data['spec']['elasticsearch']:
                es_server = sf_data['spec']['elasticsearch']['server']
            if 'port' in sf_data['spec']['elasticsearch']:
                es_port = sf_data['spec']['elasticsearch']['port']
        else:
            sf_data['spec']['elasticsearch'] = {}

        # Use the internally defined elastic-search server in the test
        sf_data['spec']['elasticsearch'] = {
            'server': es.get_ip(),
            'port': es.get_port()
        }

        log.info("Apply Operator CRD")
        ripsaw.apply_crd('resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml')
        if interface == constants.CEPHBLOCKPOOL:
            storageclass = constants.DEFAULT_STORAGECLASS_RBD
        else:
            storageclass = constants.DEFAULT_STORAGECLASS_CEPHFS
        log.info(f"Using {storageclass} Storageclass")
        sf_data['spec']['workload']['args']['storageclass'] = storageclass
        log.info("Running SmallFile bench")
        """
            Setting up the parameters for this test
        """
        sf_data['spec']['workload']['args']['file_size'] = file_size
        sf_data['spec']['workload']['args']['files'] = files
        sf_data['spec']['workload']['args']['threads'] = threads
        sf_data['spec']['workload']['args']['samples'] = samples
        sf_data['spec']['clustername'] = get_clustername()
        sf_data['spec']['test_user'] = f'{user}<{email}>'
        """
        Calculating the size of the volume that needs to be tested; it should
        be at least twice the size of the files, and at least 100Gi.

        Since the file_size is in KB and the vol_size needs to be in GB, an
        additional unit conversion is needed.
        """
        vol_size = int(files * threads * file_size * 3)
        vol_size = int(vol_size / constants.GB2KB)
        if vol_size < 100:
            vol_size = 100
        sf_data['spec']['workload']['args']['storagesize'] = f"{vol_size}Gi"

        sf_obj = OCS(**sf_data)
        sf_obj.create()
        log.info(f'The smallfile yaml file is {sf_data}')

        # wait for benchmark pods to get created - takes a while
        for bench_pod in TimeoutSampler(240, 10, get_pod_name_by_pattern,
                                        'smallfile-client',
                                        constants.RIPSAW_NAMESPACE):
            try:
                if bench_pod[0] is not None:
                    small_file_client_pod = bench_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        bench_pod = OCP(kind='pod', namespace=constants.RIPSAW_NAMESPACE)
        log.info("Waiting for SmallFile benchmark to Run")
        assert bench_pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                           resource_name=small_file_client_pod,
                                           sleep=30,
                                           timeout=600)
        start_time = time.time()

        # After testing manually, changing the timeout
        timeout = 3600

        # Getting the UUID from inside the benchmark pod
        output = bench_pod.exec_oc_cmd(f'exec {small_file_client_pod} -- env')
        for line in output.split():
            if 'uuid=' in line:
                uuid = line.split('=')[1]
        log.info(f'The UUID of the test is: {uuid}')

        # Setting back the original elastic-search information
        sf_data['spec']['elasticsearch'] = {
            'server': es_server,
            'port': es_port
        }

        full_results = SmallFileResultsAnalyse(uuid, sf_data)

        # Initialize the results doc file.
        full_results.add_key('user', sf_data['spec']['test_user'])
        full_results.add_key('ocp_version', get_ocp_version())
        full_results.add_key('ocp_build', get_build())
        full_results.add_key('ocp_channel', get_ocp_channel())

        # Getting the OCS version
        (ocs_ver_info, _) = get_ocs_version()
        ocs_ver_full = ocs_ver_info['status']['desired']['version']
        m = re.match(r"(\d.\d).(\d)", ocs_ver_full)
        if m and m.group(1) is not None:
            ocs_ver = m.group(1)

        full_results.add_key('ocs_version', ocs_ver)
        full_results.add_key('vendor', get_provider())
        full_results.add_key(
            'start_time', time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime()))

        # Calculating the total size of the working data set - in GB
        full_results.add_key(
            'dataset', file_size * files * threads *
            full_results.results['clients'] / constants.GB2KB)

        full_results.add_key(
            'global_options', {
                'files': files,
                'file_size': file_size,
                'storageclass':
                sf_data['spec']['workload']['args']['storageclass'],
                'vol_size': sf_data['spec']['workload']['args']['storagesize']
            })

        while True:
            logs = bench_pod.exec_oc_cmd(f'logs {small_file_client_pod}',
                                         out_yaml_format=False)
            if "RUN STATUS DONE" in logs:
                full_results.add_key(
                    'end_time',
                    time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime()))
                full_results.read()
                if not full_results.dont_check:
                    full_results.add_key('hosts',
                                         full_results.get_clients_list())
                    full_results.init_full_results()
                    full_results.aggregate_host_results()
                    test_status = full_results.aggregate_samples_results()
                    full_results.write()

                    # Creating full link to the results on the ES server
                    res_link = 'http://'
                    res_link += f'{full_results.server}:{full_results.port}/'
                    res_link += f'{full_results.new_index}/_search?q='
                    res_link += f'uuid:{full_results.uuid}'
                    log.info(f'Full results can be found at: {res_link}')
                else:
                    test_status = True

                break

            if timeout < (time.time() - start_time):
                raise TimeoutError(
                    "Timed out waiting for benchmark to complete")
            time.sleep(30)
        assert (not get_logs_with_errors() and test_status), 'Test Failed'
Beispiel #22
class OCS(object):
    """
    Base OCSClass
    """

    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            kwargs (dict):
                1) For existing resource, use OCP.reload() to get the
                resource's dictionary and use it to pass as **kwargs
                2) For new resource, use yaml files templates under
                /templates/CSI like:
                obj_dict = load_yaml(
                    os.path.join(
                        TEMPLATE_DIR, "some_resource.yaml"
                        )
                    )
        """
        self.data = kwargs
        self._api_version = self.data.get('api_version')
        self._kind = self.data.get('kind')
        self._namespace = None
        if 'metadata' in self.data:
            self._namespace = self.data.get('metadata').get('namespace')
            self._name = self.data.get('metadata').get('name')
        self.ocp = OCP(
            api_version=self._api_version, kind=self.kind,
            namespace=self._namespace
        )
        self.temp_yaml = tempfile.NamedTemporaryFile(
            mode='w+', prefix=self._kind, delete=False
        )
        # This _is_deleted flag is set to True if the delete method was called
        # on an object of this class and was successful.
        self._is_deleted = False

    @property
    def api_version(self):
        return self._api_version

    @property
    def kind(self):
        return self._kind

    @property
    def namespace(self):
        return self._namespace

    @property
    def name(self):
        return self._name

    @property
    def is_deleted(self):
        return self._is_deleted

    def reload(self):
        """
        Reload the OCS instance with the new information from its actual
        data.
        After a resource is created from a yaml file, the underlying resource
        changes and more information about it becomes available.
        """
        self.data = self.get()
        self.__init__(**self.data)

    def get(self, out_yaml_format=True):
        return self.ocp.get(
            resource_name=self.name, out_yaml_format=out_yaml_format
        )

    def describe(self):
        return self.ocp.describe(resource_name=self.name)

    def create(self, do_reload=True):
        log.info(f"Adding {self.kind} with name {self.name}")
        templating.dump_data_to_temp_yaml(self.data, self.temp_yaml.name)
        status = self.ocp.create(yaml_file=self.temp_yaml.name)
        if do_reload:
            self.reload()
        return status

    def delete(self, wait=True, force=False):
        """
        Delete the OCS object if it's not already deleted
        (using the internal is_deleted flag)

        Args:
            wait (bool): Wait for object to be deleted
            force (bool): Force delete object

        Returns:
            bool: True if deleted, False otherwise

        """
        if self._is_deleted:
            log.info(
                f"Attempt to remove resource: {self.name} which is"
                f"already deleted! Skipping delete of this resource!"
            )
            result = True
        else:
            result = self.ocp.delete(
                resource_name=self.name, wait=wait, force=force
            )
            self._is_deleted = True
        return result

    def apply(self, **data):
        with open(self.temp_yaml.name, 'w') as yaml_file:
            yaml.dump(data, yaml_file)
        assert self.ocp.apply(yaml_file=self.temp_yaml.name), (
            f"Failed to apply changes {data}"
        )
        self.reload()

    def add_label(self, label):
        """
        Adds a new label

        Args:
            label (str): New label to be assigned for this pod
                E.g: "label=app='rook-ceph-mds'"
        """
        status = self.ocp.add_label(resource_name=self.name, label=label)
        self.reload()
        return status

    def delete_temp_yaml_file(self):
        utils.delete_file(self.temp_yaml.name)
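
A hedged usage sketch for the OCS base class; the template path and load_yaml call mirror the constructor docstring above, but the exact yaml file is hypothetical.

# Sketch only: build an OCS object from a (hypothetical) yaml template.
obj_dict = load_yaml(os.path.join(TEMPLATE_DIR, "some_resource.yaml"))
resource = OCS(**obj_dict)
resource.create()           # dumps self.data to a temp yaml and creates it
resource.reload()           # refresh self.data from the live cluster object
resource.delete(wait=True)  # a second call is skipped via the _is_deleted flag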
Beispiel #23
    def setup_cb(self):
        """
        Creating admission parts, couchbase operator pod, couchbase worker secret

        """
        # Create admission controller
        log.info("Create admission controller process for Couchbase")
        self.up_adm_chk = OCP(namespace="default")
        self.up_check = OCP(namespace=constants.COUCHBASE_OPERATOR)
        self.adm_objects = []
        for adm_yaml in self.admission_parts:
            adm_data = templating.load_yaml(adm_yaml)
            adm_obj = OCS(**adm_data)
            adm_obj.create()
            self.adm_objects.append(adm_obj)

        # Wait for admission pod to be created
        for adm_pod in TimeoutSampler(
                self.WAIT_FOR_TIME,
                3,
                get_pod_name_by_pattern,
                "couchbase-operator-admission",
                "default",
        ):
            try:
                if self.is_up_and_running(adm_pod[0], self.up_adm_chk):
                    self.admission_pod = adm_pod[0]
                    break
            except IndexError:
                log.info("Admission pod is not ready yet")

        # Wait for admission pod to be running
        log.info("Waiting for admission pod to be running")
        admission_pod_obj = get_pod_obj(self.admission_pod,
                                        namespace="default")
        wait_for_resource_state(
            resource=admission_pod_obj,
            state=constants.STATUS_RUNNING,
            timeout=self.WAIT_FOR_TIME,
        )

        self.ns_obj.new_project(constants.COUCHBASE_OPERATOR)
        couchbase_data = templating.load_yaml(constants.COUCHBASE_CRD_YAML)
        self.couchbase_obj = OCS(**couchbase_data)
        self.couchbase_obj.create()
        op_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_ROLE)
        self.operator_role = OCS(**op_data)
        self.operator_role.create()
        self.serviceaccount = OCP(namespace=constants.COUCHBASE_OPERATOR)
        self.serviceaccount.exec_oc_cmd(
            "create serviceaccount couchbase-operator")

        dockercfgs = self.serviceaccount.exec_oc_cmd("get secrets")
        startloc = dockercfgs.find("couchbase-operator-dockercfg")
        newdockerstr = dockercfgs[startloc:]
        endloc = newdockerstr.find(" ")
        dockerstr = newdockerstr[:endloc]
        self.secretsadder.exec_oc_cmd(
            f"secrets link serviceaccount/couchbase-operator secrets/{dockerstr}"
        )
        self.rolebinding = OCP(namespace=constants.COUCHBASE_OPERATOR)
        rolebind_cmd = "".join([
            "create rolebinding couchbase-operator-rolebinding ",
            "--role couchbase-operator ",
            "--serviceaccount couchbase-operator-namespace:couchbase-operator",
        ])
        self.rolebinding.exec_oc_cmd(rolebind_cmd)
        dep_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_DEPLOY)
        self.cb_deploy = OCS(**dep_data)
        self.cb_deploy.create()
        # Wait for couchbase operator pod to be running
        for couchbase_pod in TimeoutSampler(
                self.WAIT_FOR_TIME,
                3,
                get_pod_name_by_pattern,
                "couchbase-operator",
                constants.COUCHBASE_OPERATOR,
        ):
            try:
                if self.is_up_and_running(couchbase_pod[0], self.up_check):
                    break
            except IndexError:
                log.info("Couchbase operator is not up")

        cb_work = templating.load_yaml(constants.COUCHBASE_WORKER_SECRET)
        self.cb_worker = OCS(**cb_work)
        self.cb_worker.create()
Beispiel #24
    def test_pvpool_cpu_and_memory_modifications(
        self,
        awscli_pod_session,
        backingstore_factory,
        bucket_factory,
        test_directory_setup,
        mcg_obj_session,
    ):
        """
        Test to modify the CPU and Memory resource limits for BS and see if it is reflected
        """
        bucketclass_dict = {
            "interface": "OC",
            "backingstore_dict": {
                "pv": [(
                    1,
                    MIN_PV_BACKINGSTORE_SIZE_IN_GB,
                    "ocs-storagecluster-ceph-rbd",
                )]
            },
        }
        bucket = bucket_factory(1, "OC", bucketclass=bucketclass_dict)[0]
        bucket_name = bucket.name
        pv_backingstore = bucket.bucketclass.backingstores[0]
        pv_bs_name = pv_backingstore.name
        pv_pod_label = f"pool={pv_bs_name}"
        pv_pod_info = get_pods_having_label(
            label=pv_pod_label,
            namespace=config.ENV_DATA["cluster_namespace"])[0]
        pv_pod_obj = Pod(**pv_pod_info)
        pv_pod_name = pv_pod_obj.name
        logger.info(
            f"Pod created for PV Backingstore {pv_bs_name}: {pv_pod_name}")
        new_cpu = "500m"
        new_mem = "500Mi"
        new_resource_patch = {
            "spec": {
                "pvPool": {
                    "resources": {
                        "limits": {
                            "cpu": f"{new_cpu}",
                            "memory": f"{new_mem}",
                        },
                        "requests": {
                            "cpu": f"{new_cpu}",
                            "memory": f"{new_mem}",
                        },
                    }
                }
            }
        }
        try:
            OCP(
                namespace=config.ENV_DATA["cluster_namespace"],
                kind="backingstore",
                resource_name=pv_bs_name,
            ).patch(params=json.dumps(new_resource_patch), format_type="merge")
        except CommandFailed as e:
            logger.error(f"[ERROR] Failed to patch: {e}")
        else:
            logger.info("Patched new resource limits")
        wait_for_pods_to_be_running(
            namespace=config.ENV_DATA["cluster_namespace"],
            pod_names=[pv_pod_name])
        pv_pod_ocp_obj = OCP(namespace=config.ENV_DATA["cluster_namespace"],
                             kind="pod").get(resource_name=pv_pod_name)
        resource_dict = pv_pod_ocp_obj["spec"]["containers"][0]["resources"]
        assert (
            resource_dict["limits"]["cpu"] == new_cpu
            and resource_dict["limits"]["memory"] == new_mem
            and resource_dict["requests"]["cpu"] == new_cpu
            and resource_dict["requests"]["memory"] == new_mem
        ), "New resource modification in Backingstore is not reflected in PV Backingstore Pod!!"
        logger.info(
            "Resource modification reflected in the PV Backingstore Pod!!")

        # push some data to the bucket
        file_dir = test_directory_setup.origin_dir
        copy_random_individual_objects(
            podobj=awscli_pod_session,
            file_dir=file_dir,
            target=f"s3://{bucket_name}",
            amount=1,
            s3_obj=OBC(bucket_name),
        )
Beispiel #25
    def request_aws_credentials(self):
        """
        Uses a CredentialsRequest CR to create an AWS IAM that allows the program
        to interact with S3

        Returns:
            OCS: The CredentialsRequest resource
        """
        awscreds_data = templating.load_yaml(constants.MCG_AWS_CREDS_YAML)
        req_name = create_unique_resource_name('awscredreq',
                                               'credentialsrequests')
        awscreds_data['metadata']['name'] = req_name
        awscreds_data['metadata']['namespace'] = self.namespace
        awscreds_data['spec']['secretRef']['name'] = req_name
        awscreds_data['spec']['secretRef']['namespace'] = self.namespace

        creds_request = create_resource(**awscreds_data)
        sleep(5)

        secret_ocp_obj = OCP(kind='secret', namespace=self.namespace)
        try:
            cred_req_secret_dict = secret_ocp_obj.get(
                resource_name=creds_request.name, retry=5)
        except CommandFailed:
            logger.error('Failed to retrieve credentials request secret')
            raise CredReqSecretNotFound(
                'Please make sure that the cluster used is an AWS cluster, '
                'or that the `platform` var in your config is correct.')

        aws_access_key_id = base64.b64decode(
            cred_req_secret_dict.get('data').get('aws_access_key_id')).decode(
                'utf-8')

        aws_access_key = base64.b64decode(
            cred_req_secret_dict.get('data').get(
                'aws_secret_access_key')).decode('utf-8')

        def _check_aws_credentials():
            try:
                sts = boto3.client('sts',
                                   aws_access_key_id=aws_access_key_id,
                                   aws_secret_access_key=aws_access_key)
                sts.get_caller_identity()

                return True

            except ClientError:
                logger.info('Credentials are still not active. Retrying...')
                return False

        try:
            for api_test_result in TimeoutSampler(120, 5,
                                                  _check_aws_credentials):
                if api_test_result:
                    logger.info('AWS credentials created successfully.')
                    break

        except TimeoutExpiredError:
            logger.error('Failed to create credentials')
            assert False

        return creds_request, aws_access_key_id, aws_access_key
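
A minimal sketch of consuming the tuple returned above; mcg is assumed to be an instantiated MCG object and the bucket listing is purely illustrative.

# Sketch only: build a plain AWS S3 client from the newly created IAM creds.
creds_request, key_id, secret = mcg.request_aws_credentials()
aws_s3 = boto3.client(
    's3',
    aws_access_key_id=key_id,
    aws_secret_access_key=secret,
)
bucket_names = [b['Name'] for b in aws_s3.list_buckets().get('Buckets', [])]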
Beispiel #26
def verify_image_versions(old_images, upgrade_version, version_before_upgrade):
    """
    Verify if all the images of OCS objects got upgraded

    Args:
        old_images (set): set with old images
        upgrade_version (packaging.version.Version): version of OCS
        version_before_upgrade (float): version of OCS before upgrade

    """
    number_of_worker_nodes = len(get_nodes())
    verify_pods_upgraded(old_images, selector=constants.OCS_OPERATOR_LABEL)
    verify_pods_upgraded(old_images, selector=constants.OPERATOR_LABEL)
    default_noobaa_pods = 3
    noobaa_pods = default_noobaa_pods
    if upgrade_version >= parse_version("4.7"):
        noobaa = OCP(kind="noobaa",
                     namespace=config.ENV_DATA["cluster_namespace"])
        resource = noobaa.get()["items"][0]
        endpoints = resource.get("spec", {}).get("endpoints", {})
        max_endpoints = endpoints.get("maxCount",
                                      constants.MAX_NB_ENDPOINT_COUNT)
        min_endpoints = endpoints.get(
            "minCount", constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT)
        noobaa_pods = default_noobaa_pods + min_endpoints
    try:
        verify_pods_upgraded(
            old_images,
            selector=constants.NOOBAA_APP_LABEL,
            count=noobaa_pods,
        )
    except TimeoutException as ex:
        if upgrade_version >= parse_version("4.7"):
            log.info(
                "Nooba pods didn't match. Trying once more with max noobaa endpoints!"
                f"Exception: {ex}")
            noobaa_pods = default_noobaa_pods + max_endpoints
            verify_pods_upgraded(
                old_images,
                selector=constants.NOOBAA_APP_LABEL,
                count=noobaa_pods,
                timeout=60,
            )
        else:
            raise
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                         count=2)
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
                         count=2)
    if not config.DEPLOYMENT.get("external_mode"):
        verify_pods_upgraded(
            old_images,
            selector=constants.MON_APP_LABEL,
            count=3,
        )
        verify_pods_upgraded(old_images, selector=constants.MGR_APP_LABEL)
        osd_timeout = 600 if upgrade_version >= parse_version("4.5") else 750
        osd_count = get_osd_count()
        verify_pods_upgraded(
            old_images,
            selector=constants.OSD_APP_LABEL,
            count=osd_count,
            timeout=osd_timeout * osd_count,
        )
        verify_pods_upgraded(old_images,
                             selector=constants.MDS_APP_LABEL,
                             count=2)
        if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
            rgw_count = get_rgw_count(upgrade_version.base_version, True,
                                      version_before_upgrade)
            verify_pods_upgraded(
                old_images,
                selector=constants.RGW_APP_LABEL,
                count=rgw_count,
            )
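
A hedged sketch of how the old_images set consumed above might be collected before an upgrade and verified afterwards; get_all_pods and get_images are used the same way elsewhere in this file, while the version values are only illustrative.

# Sketch only: snapshot pre-upgrade images, then verify after upgrading.
old_images = set()
for pod_obj in get_all_pods(namespace=config.ENV_DATA['cluster_namespace']):
    old_images.update(get_images(data=pod_obj.get()).values())

# ... upgrade the cluster here ...

verify_image_versions(
    old_images,
    upgrade_version=parse_version("4.7"),
    version_before_upgrade=4.6,
)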
Beispiel #27
class RGW(object):
    """
    Wrapper class for interaction with a cluster's RGW service
    """

    def __init__(self, namespace=None):
        self.namespace = (
            namespace if namespace else config.ENV_DATA["cluster_namespace"]
        )

        if storagecluster_independent_check():
            sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RGW
        else:
            sc_name = constants.DEFAULT_STORAGECLASS_RGW

        self.storageclass = OCP(
            kind="storageclass", namespace=namespace, resource_name=sc_name
        )
        self.s3_internal_endpoint = (
            self.storageclass.get().get("parameters").get("endpoint")
        )
        self.region = self.storageclass.get().get("parameters").get("region")
        # Todo: Implement retrieval in cases where CephObjectStoreUser is available
        self.key_id = None
        self.secret_key = None
        self.s3_resource = None

    def get_credentials(self, secret_name=constants.NOOBAA_OBJECTSTOREUSER_SECRET):
        """
        Get Endpoint, Access key and Secret key from OCS secret. Endpoint is
        taken from rgw exposed service. Use rgw_endpoint fixture in test to get
        it exposed.

        Args:
            secret_name (str): Name of secret to be used
                for getting RGW credentials

        Returns:
            tuple: Endpoint, Access key, Secret key

        """
        if (
            secret_name == constants.NOOBAA_OBJECTSTOREUSER_SECRET
            and storagecluster_independent_check()
        ):
            secret_name = constants.EXTERNAL_MODE_NOOBAA_OBJECTSTOREUSER_SECRET
        secret_ocp_obj = OCP(kind=constants.SECRET, namespace=self.namespace)
        route_ocp_obj = OCP(
            kind=constants.ROUTE, namespace=config.ENV_DATA["cluster_namespace"]
        )
        creds_secret_obj = secret_ocp_obj.get(secret_name)
        if config.DEPLOYMENT["external_mode"]:
            endpoint = route_ocp_obj.get(
                resource_name=constants.RGW_SERVICE_EXTERNAL_MODE
            )
        else:
            endpoint = route_ocp_obj.get(
                resource_name=constants.RGW_SERVICE_INTERNAL_MODE
            )
        endpoint = f"http://{endpoint['status']['ingress'][0]['host']}"
        access_key = base64.b64decode(
            creds_secret_obj.get("data").get("AccessKey")
        ).decode("utf-8")
        secret_key = base64.b64decode(
            creds_secret_obj.get("data").get("SecretKey")
        ).decode("utf-8")
        return (endpoint, access_key, secret_key)
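
A small illustrative sketch of wiring the credentials returned by get_credentials into a boto3 resource; rgw_obj stands for an instantiated RGW object.

# Sketch only: point boto3 at the RGW endpoint retrieved above.
endpoint, access_key, secret_key = rgw_obj.get_credentials()
rgw_obj.s3_resource = boto3.resource(
    's3',
    endpoint_url=endpoint,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
)
rgw_buckets = [b.name for b in rgw_obj.s3_resource.buckets.all()]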
Beispiel #28
    def delete(self):
        log.info(f"Cleaning up backingstore {self.name}")
        # If the backingstore utilizes a PV, save its PV name for deletion verification
        if self.type == "pv":
            backingstore_pvc = OCP(
                kind=constants.PVC,
                selector=f"pool={self.name}",
                namespace=config.ENV_DATA["cluster_namespace"],
            ).get()["items"][0]
            pv_name = backingstore_pvc["spec"]["volumeName"]

        if self.method == "oc":
            OCP(
                kind="backingstore", namespace=config.ENV_DATA["cluster_namespace"]
            ).delete(resource_name=self.name)
        elif self.method == "cli":

            def _cli_deletion_flow():
                try:
                    self.mcg_obj.exec_mcg_cmd(f"backingstore delete {self.name}")
                    return True
                except CommandFailed as e:
                    if "being used by one or more buckets" in str(e).lower():
                        log.warning(
                            f"Deletion of {self.name} failed because it's being used by a bucket. "
                            "Retrying..."
                        )
                        return False

            sample = TimeoutSampler(
                timeout=120,
                sleep=20,
                func=_cli_deletion_flow,
            )
            if not sample.wait_for_func_status(result=True):
                log.error(f"Failed to {self.name}")
                raise TimeoutExpiredError

        # Verify deletion was successful
        log.info(f"Verifying whether backingstore {self.name} exists after deletion")
        bs_deleted_successfully = False

        try:
            if self.method == "oc":
                OCP(
                    kind="backingstore",
                    namespace=config.ENV_DATA["cluster_namespace"],
                    resource_name=self.name,
                ).get()
            elif self.method == "cli":
                self.mcg_obj.exec_mcg_cmd(f"backingstore status {self.name}")

        except CommandFailed as e:
            if "Not Found" in str(e) or "NotFound" in str(e):
                bs_deleted_successfully = True
            else:
                raise

        assert (
            bs_deleted_successfully
        ), f"Backingstore {self.name} was not deleted successfully"

        def _wait_for_pv_backingstore_resource_deleted(namespace=None):
            """
            wait for pv backing store resources to be deleted at the end of test teardown

            Args:
                namespace (str): backing store's namespace

            """
            namespace = namespace or config.ENV_DATA["cluster_namespace"]
            sample = TimeoutSampler(
                timeout=120,
                sleep=15,
                func=_check_resources_deleted,
                namespace=namespace,
            )
            if not sample.wait_for_func_status(result=True):
                log.error(f"{self.name} was not deleted properly, leftovers were found")
                raise TimeoutExpiredError

        def _check_resources_deleted(namespace=None):
            """
            check if resources of the pv pool backingstore deleted properly

            Args:
                namespace (str): backing store's namespace

            Returns:
                bool: True if pvc(s) were deleted

            """
            try:
                OCP(kind=constants.PV, resource_name=pv_name).get()
                log.warning(f"Found PV leftovers belonging to {self.name}")
                return False
            except CommandFailed as e:
                if "not found" in str(e):
                    pass
                else:
                    raise
            pvcs = get_all_pvcs(namespace=namespace, selector=f"pool={self.name}")
            pods = get_pods_having_label(namespace=namespace, label=f"pool={self.name}")
            return len(pvcs["items"]) == 0 and len(pods) == 0

        if self.type == "pv":
            log.info(f"Waiting for backingstore {self.name} resources to be deleted")
            _wait_for_pv_backingstore_resource_deleted()
class CouchBase(PillowFight):
    """
    CouchBase workload operation
    """
    WAIT_FOR_TIME = 600
    admission_parts = [
        constants.COUCHBASE_ADMISSION_SERVICE_ACCOUNT_YAML,
        constants.COUCHBASE_ADMISSION_CLUSTER_ROLE_YAML,
        constants.COUCHBASE_ADMISSION_CLUSTER_ROLE_BINDING_YAML,
        constants.COUCHBASE_ADMISSION_SECRET_YAML,
        constants.COUCHBASE_ADMISSION_DEPLOYMENT_YAML,
        constants.COUCHBASE_ADMISSION_SERVICE_YAML,
        constants.COUCHBASE_MUTATING_WEBHOOK_YAML,
        constants.COUCHBASE_VALIDATING_WEBHOOK_YAML
    ]
    pod_obj = OCP(kind='pod')
    couchbase_pod = OCP(kind='pod')
    secretsadder = OCP(kind='pod')
    admission_pod = []
    cb_worker = OCS()
    cb_examples = OCS()

    def __init__(self, **kwargs):
        """
        Initializer function

        """
        super().__init__(**kwargs)

    def is_up_and_running(self, pod_name, ocp_value):
        """
        Test if the pod specified is up and running.

        Args:
            pod_name (str): Name of pod being checked.
            ocp_value (object): object used for running oc commands

        Returns:
            bool: True if pod is running, False otherwise

        """
        if not pod_name:
            return False
        pod_info = ocp_value.exec_oc_cmd(f"get pods {pod_name} -o json")
        if pod_info['status']['containerStatuses'][0]['ready']:
            if 'running' in pod_info['status']['containerStatuses'][0][
                    'state']:
                return True
        return False

    def setup_cb(self):
        """
        Creating admission parts, couchbase operator pod, couchbase worker secret

        """
        # Create admission controller
        log.info("Create admission controller process for Couchbase")
        switch_to_project('default')
        self.up_adm_chk = OCP(namespace="default")
        self.up_check = OCP(namespace=constants.COUCHBASE_OPERATOR)
        for adm_yaml in self.admission_parts:
            adm_data = templating.load_yaml(adm_yaml)
            adm_obj = OCS(**adm_data)
            adm_obj.create()

        # Wait for admission pod to be created
        for adm_pod in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                      get_pod_name_by_pattern,
                                      'couchbase-operator-admission',
                                      'default'):
            try:
                if self.is_up_and_running(adm_pod[0], self.up_adm_chk):
                    self.admission_pod = adm_pod[0]
                    break
            except IndexError:
                log.info("Admission pod is not ready yet")

        # Wait for admission pod to be running
        log.info("Waiting for admission pod to be running")
        self.pod_obj.wait_for_resource(
            condition='Running',
            resource_name=self.admission_pod,
            timeout=self.WAIT_FOR_TIME,
            sleep=10,
        )
        self.pod_obj.new_project(constants.COUCHBASE_OPERATOR)
        couchbase_data = templating.load_yaml(constants.COUCHBASE_CRD_YAML)
        self.couchbase_obj = OCS(**couchbase_data)
        self.couchbase_obj.create()
        op_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_ROLE)
        self.operator_role = OCS(**op_data)
        self.operator_role.create()
        self.serviceaccount = OCP(namespace=constants.COUCHBASE_OPERATOR)
        self.serviceaccount.exec_oc_cmd(
            "create serviceaccount couchbase-operator")

        dockercfgs = self.serviceaccount.exec_oc_cmd("get secrets")
        startloc = dockercfgs.find('couchbase-operator-dockercfg')
        newdockerstr = dockercfgs[startloc:]
        endloc = newdockerstr.find(' ')
        dockerstr = newdockerstr[:endloc]
        self.secretsadder.exec_oc_cmd(
            f"secrets link serviceaccount/couchbase-operator secrets/{dockerstr}"
        )
        self.rolebinding = OCP(namespace=constants.COUCHBASE_OPERATOR)
        rolebind_cmd = "".join([
            "create rolebinding couchbase-operator-rolebinding ",
            "--role couchbase-operator ",
            "--serviceaccount couchbase-operator-namespace:couchbase-operator"
        ])
        self.rolebinding.exec_oc_cmd(rolebind_cmd)
        dep_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_DEPLOY)
        self.cb_deploy = OCS(**dep_data)
        self.cb_deploy.create()
        # Wait for couchbase operator pod to be running
        for couchbase_pod in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                            get_pod_name_by_pattern,
                                            'couchbase-operator',
                                            constants.COUCHBASE_OPERATOR):
            try:
                if self.is_up_and_running(couchbase_pod[0], self.up_check):
                    break
            except IndexError:
                log.info("Couchbase operator is not up")

        cb_work = templating.load_yaml(constants.COUCHBASE_WORKER_SECRET)
        self.cb_worker = OCS(**cb_work)
        self.cb_worker.create()

    def create_couchbase_worker(self, replicas=1):
        """
        Deploy a Couchbase server and pillowfight workload using operator

        The couchbase workers do not come up unless there is an admission controller
        running.  The admission controller is started from the default project prior
        to bringing up the operator.  Secrets, rolebindings and serviceaccounts
        need to also be generated.

        Once the couchbase operator is running, we need to wait for the three
        worker pods to also be up.  Then a pillowfight task is started.

        After the pillowfight task has finished, the log is collected and
        analyzed.

        Raises:
            Exception: If pillowfight results indicate that a minimum performance
                level is not reached (1 second response time, less than 1000 ops
                per second)

        """
        logging.info('Creating pods..')
        cb_example = templating.load_yaml(constants.COUCHBASE_WORKER_EXAMPLE)
        cb_example['spec']['servers'][0]['size'] = replicas
        self.cb_examples = OCS(**cb_example)
        self.cb_examples.create()

        # Wait for all of the requested worker pods to be running.

        logging.info('Waiting for the Couchbase worker pods to reach Running state')
        for cb_wrk_pods in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                          get_pod_name_by_pattern,
                                          'cb-example',
                                          constants.COUCHBASE_OPERATOR):
            try:
                if len(cb_wrk_pods) == replicas:
                    counter = 0
                    for cb_pod in cb_wrk_pods:
                        if self.is_up_and_running(cb_pod, self.up_check):
                            counter += 1
                            logging.info(f'Couchbase worker {cb_pod} is up')
                    if counter == replicas:
                        break
            except IndexError:
                logging.info(
                    f'Expected {replicas} couchbase worker pods '
                    f'but only found {len(cb_wrk_pods)}')

    def run_workload(self, replicas):
        """
        Run the workload using pillowfight

        Args:
            replicas (int): Number of pods

        """
        logging.info('Running IOs...')
        PillowFight.run_pillowfights(self, replicas=replicas)

    def analyze_run(self):
        """
        Analyzing the workload run logs

        """
        logging.info('Analyzing workload run logs...')
        PillowFight.analyze_all(self)

    def teardown(self):
        """
        Delete objects created in roughly reverse order of how they were created.

        """
        self.cb_examples.delete()
        self.cb_worker.delete()
        self.cb_deploy.delete()
        self.pod_obj.exec_oc_cmd(
            command="delete rolebinding couchbase-operator-rolebinding")
        self.pod_obj.exec_oc_cmd(
            command="delete serviceaccount couchbase-operator")
        self.operator_role.delete()
        self.couchbase_obj.delete()
        switch_to_project('default')
        self.pod_obj.delete_project(constants.COUCHBASE_OPERATOR)
        for adm_yaml in self.admission_parts:
            adm_data = templating.load_yaml(adm_yaml)
            adm_obj = OCS(**adm_data)
            adm_obj.delete()
        # Before the wait below was added, the teardown task would sometimes
        # fail with leftover objects because it could still see one of the
        # couchbase pods in the default project.
        for admin_pod in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                        get_pod_name_by_pattern, 'couchbase',
                                        'default'):
            if not admin_pod:
                break
        PillowFight.cleanup(self)
        switch_to_default_rook_cluster_project()

    def test_rgw_host_node_failure(
        self, nodes, node_restart_teardown, node_drain_teardown, mcg_obj, bucket_factory
    ):
        """
        Test case that fails the node where the RGW and NooBaa DB pods are
        hosted and verifies that the new pods spin up on a healthy node

        """

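        # High-level flow: find the node hosting the NooBaa DB pod, ensure an
        # RGW pod runs on that same node, stop that node, verify a new RGW pod
        # spins up and the cluster recovers, then exercise an OBC.
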
        # Get noobaa pods
        noobaa_pod_obj = get_noobaa_pods()

        # Get the node where the noobaa-db pod is hosted
        noobaa_pod_node = None
        for noobaa_pod in noobaa_pod_obj:
            if noobaa_pod.name in [
                constants.NB_DB_NAME_46_AND_BELOW,
                constants.NB_DB_NAME_47_AND_ABOVE,
            ]:
                noobaa_pod_node = get_pod_node(noobaa_pod)
        if noobaa_pod_node is None:
            assert False, "Could not find the NooBaa DB pod"

        # Validate that the RGW pod and noobaa-db are hosted on the same node.
        # If not, make sure both pods end up on the same node.
        log.info(
            "Validating that the RGW pod and noobaa-db are hosted on the same node"
        )
        rgw_pod_obj = get_rgw_pods()
        rgw_pod_node_list = [
            rgw_pod.get().get("spec").get("nodeName") for rgw_pod in rgw_pod_obj
        ]
        if not list(set(rgw_pod_node_list).intersection(noobaa_pod_node.name.split())):
            log.info(
                "Cordon the other worker nodes so that the RGW "
                "pod moves to the node where the NooBaa DB pod is hosted"
            )
            worker_node_list = get_worker_nodes()
            node_names = list(set(worker_node_list) - set(noobaa_pod_node.name.split()))
            unschedule_nodes(node_names=node_names)
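            # With the other workers cordoned, deleting the RGW pod forces the
            # scheduler to place the replacement pod on the NooBaa DB node.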
            ocp_obj = OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
            rgw_pod_obj[0].delete()
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_count=len(rgw_pod_obj),
                selector=constants.RGW_APP_LABEL,
                timeout=300,
                sleep=5,
            )
            log.info("Make those nodes schedulable again")
            schedule_nodes(node_names=node_names)

            # Check that ceph health is OK
            ceph_health_check(tries=90, delay=15)

            # Verify all storage pods are running
            wait_for_storage_pods()

            # Check again that the RGW pod moved to the node hosting the NooBaa DB pod
            rgw_pod_obj_list = get_rgw_pods()
            rgw_pod_node_list = [
                get_pod_node(rgw_pod_obj).name for rgw_pod_obj in rgw_pod_obj_list
            ]
            value = [
                rgw_pod_node == noobaa_pod_node.name
                for rgw_pod_node in rgw_pod_node_list
            ]
            assert all(value), (
                "RGW pod didn't move to the node where the NooBaa DB pod is"
                " hosted even after cordoning and uncordoning nodes. "
                f"RGW pods hosted on: {rgw_pod_node_list}, "
                f"NooBaa DB pod hosted on: {noobaa_pod_node.name}"
            )

        log.info(
            "RGW and noobaa-db are hosted on the same node; starting the test execution"
        )
        rgw_pod_obj = get_rgw_pods()
        for rgw_pod in rgw_pod_obj:
            pod_node = rgw_pod.get().get("spec").get("nodeName")
            if pod_node == noobaa_pod_node.name:
                # Stop the node
                log.info(
                    f"Stopping node {pod_node} where"
                    f" rgw pod {rgw_pod.name} and NooBaa DB are hosted"
                )
                node_obj = get_node_objs(node_names=[pod_node])
                nodes.stop_nodes(node_obj)

                # Validate that the old rgw pod went into Terminating state
                wait_for_resource_state(
                    resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
                )

                # Validate that a new rgw pod spun up
                ocp_obj = OCP(
                    kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
                )
                ocp_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    resource_count=len(rgw_pod_obj),
                    selector=constants.RGW_APP_LABEL,
                )

                # Start the node
                nodes.start_nodes(node_obj)

                # Check that ceph health is OK
                ceph_health_check(tries=90, delay=15)

                # Verify all storage pods are running
                wait_for_storage_pods()

                # Create OBC and perform read and write
                self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

        # Verify cluster health
        self.sanity_helpers.health_check()