def test_scale_osds_reboot_nodes(
    self, interface, project_factory, multi_pvc_factory, dc_pod_factory
):
    """
    Check storage utilization; if it is below the threshold, run IO.
    Scale OSDs from 3 to 6, check for rebalance and reboot the workers.
    """
    current_osd_count = count_cluster_osd()
    proj_obj = project_factory()
    if current_osd_count == 3:
        while not validate_osd_utilization(osd_used=50):
            # Create PVCs
            pvc_objs = multi_pvc_factory(
                project=proj_obj,
                interface=interface,
                size=self.pvc_size,
                num_of_pvc=self.num_of_pvcs,
            )
            dc_pod_objs = list()
            for pvc_obj in pvc_objs:
                dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj))
            wait_for_dc_app_pods_to_reach_running_state(dc_pod_objs, timeout=1200)
            for pod_obj in dc_pod_objs:
                pod_obj.run_io(
                    storage_type='fs',
                    size='3G',
                    runtime='60',
                    fio_filename=f'{pod_obj.name}_io',
                )

    # Add capacity
    osd_size = storage_cluster.get_osd_size()
    count = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=count * 3,
    )
    assert ceph_health_check(), "New OSDs failed to reach running state"

    cluster = CephCluster()

    # Get rebalance status
    rebalance_status = cluster.get_rebalance_status()
    logger.info(rebalance_status)
    if rebalance_status:
        time_taken = cluster.time_taken_to_complete_rebalance()
        logger.info(f"The time taken to complete rebalance: {time_taken}")

    # Rolling reboot on worker nodes
    worker_nodes = get_typed_nodes(node_type='worker')
    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()
    for node in worker_nodes:
        nodes.restart_nodes(nodes=[node])
        wait_for_nodes_status()
    assert ceph_health_check(delay=180), "Ceph health is bad after the node reboots"
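# --- Illustrative sketch (not part of the original test) ---
# A minimal, hedged example of the "add capacity and wait for OSDs" pattern
# used above, factored into a standalone helper. The helper name is
# hypothetical; all other names (storage_cluster, OCP, constants, config,
# ceph_health_check) are the same ones the test above already relies on.
def add_capacity_and_wait(timeout=300):
    """Add one deviceset worth of OSDs and wait until they are running."""
    osd_size = storage_cluster.get_osd_size()
    count = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    # add_capacity() returns the new deviceset count; each deviceset
    # contributes 3 OSD replicas, hence count * 3 below.
    pod.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=count * 3,
    )
    assert ceph_health_check(), "New OSDs failed to reach running state"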
def __init__(self, *args, **kwargs):
    """
    Constructor for the MCG class
    """
    self.namespace = config.ENV_DATA['cluster_namespace']
    self.operator_pod = Pod(
        **get_pods_having_label(
            constants.NOOBAA_OPERATOR_POD_LABEL, self.namespace
        )[0]
    )
    self.core_pod = Pod(
        **get_pods_having_label(
            constants.NOOBAA_CORE_POD_LABEL, self.namespace
        )[0]
    )
    self.retrieve_noobaa_cli_binary()

    """
    The certificate will be copied on each mcg_obj instantiation since
    the process is so light and quick that the time required for the
    redundant copy is negligible in comparison to the time a hash
    comparison would take.
    """
    retrieve_default_ingress_crt()

    get_noobaa = OCP(kind='noobaa', namespace=self.namespace).get()
    self.s3_endpoint = (
        get_noobaa.get('items')[0]
        .get('status')
        .get('services')
        .get('serviceS3')
        .get('externalDNS')[0]
    )
    self.s3_internal_endpoint = (
        get_noobaa.get('items')[0]
        .get('status')
        .get('services')
        .get('serviceS3')
        .get('internalDNS')[0]
    )
    self.mgmt_endpoint = (
        get_noobaa.get('items')[0]
        .get('status')
        .get('services')
        .get('serviceMgmt')
        .get('externalDNS')[0]
    ) + '/rpc'
    self.region = config.ENV_DATA['region']

    creds_secret_name = (
        get_noobaa.get('items')[0]
        .get('status')
        .get('accounts')
        .get('admin')
        .get('secretRef')
        .get('name')
    )
    secret_ocp_obj = OCP(kind='secret', namespace=self.namespace)
    creds_secret_obj = secret_ocp_obj.get(creds_secret_name)

    self.access_key_id = base64.b64decode(
        creds_secret_obj.get('data').get('AWS_ACCESS_KEY_ID')
    ).decode('utf-8')
    self.access_key = base64.b64decode(
        creds_secret_obj.get('data').get('AWS_SECRET_ACCESS_KEY')
    ).decode('utf-8')
    self.noobaa_user = base64.b64decode(
        creds_secret_obj.get('data').get('email')
    ).decode('utf-8')
    self.noobaa_password = base64.b64decode(
        creds_secret_obj.get('data').get('password')
    ).decode('utf-8')

    self.noobaa_token = self.send_rpc_query(
        'auth_api',
        'create_auth',
        params={
            'role': 'admin',
            'system': 'noobaa',
            'email': self.noobaa_user,
            'password': self.noobaa_password,
        },
    ).json().get('reply').get('token')

    self.s3_resource = boto3.resource(
        's3',
        verify=constants.DEFAULT_INGRESS_CRT_LOCAL_PATH,
        endpoint_url=self.s3_endpoint,
        aws_access_key_id=self.access_key_id,
        aws_secret_access_key=self.access_key,
    )
    self.s3_client = self.s3_resource.meta.client

    if (
        config.ENV_DATA['platform'].lower() == 'aws'
        and kwargs.get('create_aws_creds')
    ):
        (
            self.cred_req_obj,
            self.aws_access_key_id,
            self.aws_access_key,
        ) = self.request_aws_credentials()
        self.aws_s3_resource = boto3.resource(
            's3',
            endpoint_url="https://s3.amazonaws.com",
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_access_key,
        )

    if (
        config.ENV_DATA['platform'].lower() in constants.CLOUD_PLATFORMS
        or storagecluster_independent_check()
    ):
        logger.info('Checking that no RGW pod is present on a cloud platform')
        pods = pod.get_pods_having_label(
            label=constants.RGW_APP_LABEL, namespace=self.namespace
        )
        assert not pods, 'RGW pods should not exist in the current platform/cluster'
    elif config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS:
        rgw_count = 2 if float(config.ENV_DATA['ocs_version']) >= 4.5 else 1
        logger.info(
            f'Checking for RGW pod/s on {config.ENV_DATA.get("platform")} platform'
        )
        rgw_pod = OCP(kind=constants.POD, namespace=self.namespace)
        assert rgw_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.RGW_APP_LABEL,
            resource_count=rgw_count,
            timeout=60,
        )
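# --- Illustrative sketch (not part of the original class) ---
# Hedged usage example: once an MCG object is constructed, its boto3
# resource/client can be used directly for S3 calls against the NooBaa
# endpoint. The bucket name below is hypothetical, and constructing MCG()
# assumes a reachable OCS cluster with the run config loaded.
mcg_obj = MCG()
bucket = mcg_obj.s3_resource.Bucket('example-bucket')  # hypothetical bucket name
bucket.create()
bucket.put_object(Key='hello.txt', Body=b'hello noobaa')
for obj in bucket.objects.all():
    print(obj.key)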
class PASTest(BaseTest):
    """
    Base class for the QPAS team - Performance and Scale tests

    This class contains functions used by performance and scale tests,
    and can also be used by E2E tests which use the benchmark-operator
    (ripsaw)
    """

    def setup(self):
        """
        Setting up the environment for each performance and scale test
        """
        log.info("Setting up test environment")
        self.crd_data = None  # placeholder for the Benchmark CRD data
        self.backup_es = None  # placeholder for the elasticsearch backup
        self.main_es = None  # placeholder for the main elasticsearch object
        self.benchmark_obj = None  # placeholder for the benchmark object
        self.client_pod = None  # placeholder for the client pod object
        self.dev_mode = config.RUN["cli_params"].get("dev_mode")
        self.pod_obj = OCP(kind="pod")

        # Collecting all Environment configuration, Software & Hardware,
        # for the performance report.
        self.environment = get_environment_info()
        self.environment["clusterID"] = get_running_cluster_id()
        self.get_osd_info()
        self.get_node_info(node_type="master")
        self.get_node_info(node_type="worker")

    def get_osd_info(self):
        """
        Getting the OSD information and updating the main environment
        dictionary.
        """
        ct_pod = pod.get_ceph_tools_pod()
        osd_info = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd df")
        self.environment["osd_size"] = osd_info.get("nodes")[0].get("crush_weight")
        self.environment["osd_num"] = len(osd_info.get("nodes"))
        self.environment["total_capacity"] = osd_info.get("summary").get(
            "total_kb_avail"
        )
        self.environment["ocs_nodes_num"] = len(node.get_ocs_nodes())

    def get_node_info(self, node_type="master"):
        """
        Getting the hardware information for the given node type and
        updating the main environment dictionary.

        Args:
            node_type (str): the node type to collect data about,
                can be: master / worker - the default is master

        """
        if node_type == "master":
            nodes = node.get_master_nodes()
        elif node_type == "worker":
            nodes = node.get_worker_nodes()
        else:
            log.warning(f"Node type ({node_type}) is invalid")
            return

        oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        self.environment[f"{node_type}_nodes_num"] = len(nodes)
        self.environment[f"{node_type}_nodes_cpu_num"] = oc_cmd.exec_oc_debug_cmd(
            node=nodes[0],
            cmd_list=["lscpu | grep '^CPU(s):' | awk '{print $NF}'"],
        ).rstrip()
        self.environment[f"{node_type}_nodes_memory"] = oc_cmd.exec_oc_debug_cmd(
            node=nodes[0], cmd_list=["free | grep Mem | awk '{print $2}'"]
        ).rstrip()

    def ripsaw_deploy(self, ripsaw):
        """
        Deploy the benchmark operator (formerly ripsaw) CRD

        Args:
            ripsaw (obj): benchmark operator object

        """
        log.info("Deploying benchmark operator (ripsaw)")
        ripsaw.apply_crd("resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml")

    def es_info_backup(self, elasticsearch):
        """
        Saving the original elastic-search IP and PORT - if defined in yaml

        Args:
            elasticsearch (obj): elasticsearch object

        """
        self.crd_data["spec"]["elasticsearch"] = {}

        # for development mode use the Dev ES server
        if self.dev_mode and config.PERF.get("dev_lab_es"):
            log.info("Using the development ES server")
            self.crd_data["spec"]["elasticsearch"] = {
                "server": config.PERF.get("dev_es_server"),
                "port": config.PERF.get("dev_es_port"),
                "url": f"http://{config.PERF.get('dev_es_server')}:{config.PERF.get('dev_es_port')}",
                "parallel": True,
            }

        # for production mode use the Lab ES server
        if not self.dev_mode and config.PERF.get("production_es"):
            self.crd_data["spec"]["elasticsearch"] = {
                "server": config.PERF.get("production_es_server"),
                "port": config.PERF.get("production_es_port"),
                "url": f"http://{config.PERF.get('production_es_server')}:{config.PERF.get('production_es_port')}",
                "parallel": True,
            }

        # backup the Main ES info (if it exists)
        if not self.crd_data["spec"]["elasticsearch"] == {}:
            self.backup_es = self.crd_data["spec"]["elasticsearch"]
            log.info(
                f"Creating object for the Main ES server on {self.backup_es['url']}"
            )
            self.main_es = Elasticsearch([self.backup_es["url"]], verify_certs=True)
        else:
            log.warning("Elastic Search information does not exist for this test")

        # Use the internally defined elastic-search server in the test - if it exists
        if elasticsearch:
            if not isinstance(elasticsearch, dict):
                # elasticsearch is an internally deployed server (obj)
                ip = elasticsearch.get_ip()
                port = elasticsearch.get_port()
            else:
                # elasticsearch is an existing server (dict)
                ip = elasticsearch.get("server")
                port = elasticsearch.get("port")
            self.crd_data["spec"]["elasticsearch"] = {
                "server": ip,
                "port": port,
                "url": f"http://{ip}:{port}",
                "parallel": True,
            }
            log.info(f"Going to use the ES : {self.crd_data['spec']['elasticsearch']}")
        elif config.PERF.get("internal_es_server"):
            # use an in-cluster elastic-search (not deployed by the test)
            self.crd_data["spec"]["elasticsearch"] = {
                "server": config.PERF.get("internal_es_server"),
                "port": config.PERF.get("internal_es_port"),
                "url": f"http://{config.PERF.get('internal_es_server')}:{config.PERF.get('internal_es_port')}",
                "parallel": True,
            }

    def set_storageclass(self, interface):
        """
        Setting the benchmark CRD storageclass

        Args:
            interface (str): The interface which will be used in the test

        """
        if interface == constants.CEPHBLOCKPOOL:
            storageclass = constants.DEFAULT_STORAGECLASS_RBD
        else:
            storageclass = constants.DEFAULT_STORAGECLASS_CEPHFS
        log.info(f"Using [{storageclass}] Storageclass")
        self.crd_data["spec"]["workload"]["args"]["storageclass"] = storageclass

    def get_env_info(self):
        """
        Getting the environment information and updating the workload CR
        if necessary.
        """
        if not self.environment["user"] == "":
            self.crd_data["spec"]["test_user"] = self.environment["user"]
        else:
            # since the full results object needs this parameter,
            # initialize it from the CR file
            self.environment["user"] = self.crd_data["spec"]["test_user"]
        self.crd_data["spec"]["clustername"] = self.environment["clustername"]
        log.debug(f"Environment information is : {self.environment}")

    def deploy_and_wait_for_wl_to_start(self, timeout=300, sleep=20):
        """
        Deploy the workload and wait until it starts working

        Args:
            timeout (int): time in seconds to wait until the benchmark starts
            sleep (int): Sleep interval in seconds

        """
        log.debug(f"The {self.benchmark_name} CR file is {self.crd_data}")
        self.benchmark_obj = OCS(**self.crd_data)
        self.benchmark_obj.create()

        # This time is only for reporting - when the benchmark started.
        self.start_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

        # Wait for the benchmark client pod to be created
        log.info(f"Waiting for {self.client_pod_name} to start")
        for bm_pod in TimeoutSampler(
            timeout,
            sleep,
            get_pod_name_by_pattern,
            self.client_pod_name,
            constants.RIPSAW_NAMESPACE,
        ):
            try:
                if bm_pod[0] is not None:
                    self.client_pod = bm_pod[0]
                    break
            except IndexError:
                log.info("Bench pod is not ready yet")
        # Sleeping for 15 sec for the client pod to be fully accessible
        time.sleep(15)
        log.info(f"The benchmark pod {self.client_pod_name} is Running")

    def wait_for_wl_to_finish(self, timeout=18000, sleep=300):
        """
        Waiting until the workload is finished, then fetching the test log

        Args:
            timeout (int): time in seconds to wait until the benchmark finishes
            sleep (int): Sleep interval in seconds

        Raises:
            Exception: if the test restarted too many times.

        """
        log.info(f"Waiting for {self.client_pod_name} to complete")

        finished = False
        restarts = 0
        while not finished:
            results = run_oc_command(
                "get pod --no-headers -o custom-columns=:metadata.name,:status.phase",
                namespace="my-ripsaw",
            )
            # Initialize both values so a missing pod does not raise NameError
            fname = ""
            status = ""
            for name in results:
                if re.search(self.client_pod_name, name):
                    (fname, status) = name.split()
                    continue
            if not fname == self.client_pod:
                log.info(
                    f"The pod {self.client_pod} was restarted. "
                    f"The new client pod is {fname}"
                )
                self.client_pod = fname
                restarts += 1
                if restarts > 3:
                    err_msg = f"Too many restarts of the benchmark ({restarts})"
                    log.error(err_msg)
                    raise Exception(err_msg)
            if status == "Succeeded":
                self.end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
                self.test_logs = self.pod_obj.exec_oc_cmd(
                    f"logs {self.client_pod}", out_yaml_format=False
                )
                log.info(f"{self.client_pod} completed successfully")
                finished = True
            else:
                log.info(
                    f"{self.client_pod} is in {status} state, waiting for the "
                    f"Succeeded state. Waiting another {sleep} sec. for the "
                    "benchmark to complete"
                )
                time.sleep(sleep)
                self.pod_obj.wait_for_resource(
                    condition=constants.STATUS_COMPLETED,
                    resource_name=self.client_pod,
                    timeout=timeout,
                    sleep=sleep,
                )

        # Getting the end time of the benchmark - for reporting.
        self.end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
        self.test_logs = self.pod_obj.exec_oc_cmd(
            f"logs {self.client_pod}", out_yaml_format=False
        )
        # Saving the benchmark internal log into a file in the logs directory
        log_file_name = f"{self.full_log_path}/test-pod.log"
        try:
            with open(log_file_name, "w") as f:
                f.write(self.test_logs)
            log.info(f"The test log can be found at : {log_file_name}")
        except Exception:
            log.warning(f"Cannot write the log to the file {log_file_name}")
        log.info(f"The {self.benchmark_name} benchmark is complete")

    def copy_es_data(self, elasticsearch):
        """
        Copy data from the internal ES (if it exists) to the main ES

        Args:
            elasticsearch (obj): elasticsearch object (if it exists)

        """
        log.info(f"In the copy_es_data function - {elasticsearch}")
        if elasticsearch:
            log.info("Copying all data from the internal ES to the main ES")
            log.info("Dumping data from the internal ES to a tarball file")
            elasticsearch.dumping_all_data(self.full_log_path)
            es_connection = self.backup_es
            es_connection["host"] = es_connection.pop("server")
            es_connection.pop("url")
            if elasticsearch_load(self.main_es, self.full_log_path):
                # Adding this sleep between the copy and the analysis of the
                # results, since sometimes the results of the read (just
                # after write) are empty
                time.sleep(10)
                log.info(
                    f"All raw data for the test results can be found at : "
                    f"{self.full_log_path}"
                )
                return True
            else:
                log.warning("Cannot upload data into the main ES server")
                return False

    def read_from_es(self, es, index, uuid):
        """
        Reading all results from the elasticsearch server

        Args:
            es (dict): dictionary with elasticsearch info {server, port}
            index (str): the index name to read from the elasticsearch server
            uuid (str): the test UUID to find in the elasticsearch server

        Returns:
            list : list of all results

        """
        con = Elasticsearch([{"host": es["server"], "port": es["port"]}])
        query = {"size": 1000, "query": {"match": {"uuid": uuid}}}
        try:
            results = con.search(index=index, body=query)
            full_data = []
            for res in results["hits"]["hits"]:
                full_data.append(res["_source"])
            return full_data
        except Exception as e:
            log.warning(f"{index} was not found in the internal ES. ({e})")
            return []

    def es_connect(self):
        """
        Create an elasticsearch connection to the server

        Returns:
            bool : True if there is a connection to the ES, False if not

        """
        ok = True  # the return value
        try:
            log.info(
                f"Trying to connect to the ES : {self.es['server']}:{self.es['port']}"
            )
            self.es_con = Elasticsearch(
                [{"host": self.es["server"], "port": self.es["port"]}]
            )
        except Exception:
            log.error(f"Cannot connect to ES server {self.es}")
            ok = False

        # Testing the connection to the elastic-search
        if not self.es_con.ping():
            log.error(f"Cannot connect to ES server {self.es}")
            ok = False

        return ok
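# --- Illustrative sketch (not part of the original class) ---
# A hedged example of how a benchmark test might drive PASTest: load a CR,
# back up the ES info, deploy, wait, and copy the results. The class name,
# the pod-name pattern, and the fixture wiring are hypothetical; the method
# calls are the ones defined above.
class MyBenchTest(PASTest):
    def test_bench(self, ripsaw, elasticsearch):
        self.benchmark_name = "fio"
        self.client_pod_name = "fio-client"  # hypothetical pod name pattern
        self.crd_data = templating.load_yaml(constants.FIO_CR_YAML)
        self.ripsaw_deploy(ripsaw)
        self.es_info_backup(elasticsearch)
        self.set_storageclass(constants.CEPHBLOCKPOOL)
        self.get_env_info()
        self.deploy_and_wait_for_wl_to_start(timeout=600)
        self.wait_for_wl_to_finish()
        self.copy_es_data(elasticsearch)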
class CouchBase(PillowFight):
    """
    CouchBase workload operation
    """

    WAIT_FOR_TIME = 1800
    admission_parts = [
        constants.COUCHBASE_ADMISSION_SERVICE_ACCOUNT_YAML,
        constants.COUCHBASE_ADMISSION_CLUSTER_ROLE_YAML,
        constants.COUCHBASE_ADMISSION_CLUSTER_ROLE_BINDING_YAML,
        constants.COUCHBASE_ADMISSION_SECRET_YAML,
        constants.COUCHBASE_ADMISSION_DEPLOYMENT_YAML,
        constants.COUCHBASE_ADMISSION_SERVICE_YAML,
        constants.COUCHBASE_MUTATING_WEBHOOK_YAML,
        constants.COUCHBASE_VALIDATING_WEBHOOK_YAML,
    ]
    pod_obj = OCP(kind="pod")
    ns_obj = OCP(kind="namespace")
    couchbase_pod = OCP(kind="pod")
    secretsadder = OCP(kind="pod")
    admission_pod = []
    cb_worker = OCS()
    cb_examples = OCS()

    def __init__(self, **kwargs):
        """
        Initializer function
        """
        super().__init__(**kwargs)

    def is_up_and_running(self, pod_name, ocp_value):
        """
        Test if the specified pod is up and running.

        Args:
            pod_name (str): Name of the pod being checked.
            ocp_value (object): object used for running oc commands

        Returns:
            bool: True if the pod is running, False otherwise

        """
        if not pod_name:
            return False
        pod_info = ocp_value.exec_oc_cmd(f"get pods {pod_name} -o json")
        if pod_info["status"]["containerStatuses"][0]["ready"]:
            if "running" in pod_info["status"]["containerStatuses"][0]["state"]:
                return True
        return False

    def setup_cb(self):
        """
        Creating the admission parts, the couchbase operator pod and the
        couchbase worker secret
        """
        # Create admission controller
        log.info("Create admission controller process for Couchbase")
        self.up_adm_chk = OCP(namespace="default")
        self.up_check = OCP(namespace=constants.COUCHBASE_OPERATOR)
        self.adm_objects = []
        for adm_yaml in self.admission_parts:
            adm_data = templating.load_yaml(adm_yaml)
            adm_obj = OCS(**adm_data)
            adm_obj.create()
            self.adm_objects.append(adm_obj)

        # Wait for the admission pod to be created
        for adm_pod in TimeoutSampler(
            self.WAIT_FOR_TIME,
            3,
            get_pod_name_by_pattern,
            "couchbase-operator-admission",
            "default",
        ):
            try:
                if self.is_up_and_running(adm_pod[0], self.up_adm_chk):
                    self.admission_pod = adm_pod[0]
                    break
            except IndexError:
                log.info("Admission pod is not ready yet")

        # Wait for the admission pod to be running
        log.info("Waiting for the admission pod to be running")
        admission_pod_obj = get_pod_obj(self.admission_pod, namespace="default")
        wait_for_resource_state(
            resource=admission_pod_obj,
            state=constants.STATUS_RUNNING,
            timeout=self.WAIT_FOR_TIME,
        )

        self.ns_obj.new_project(constants.COUCHBASE_OPERATOR)
        couchbase_data = templating.load_yaml(constants.COUCHBASE_CRD_YAML)
        self.couchbase_obj = OCS(**couchbase_data)
        self.couchbase_obj.create()

        op_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_ROLE)
        self.operator_role = OCS(**op_data)
        self.operator_role.create()
        self.serviceaccount = OCP(namespace=constants.COUCHBASE_OPERATOR)
        self.serviceaccount.exec_oc_cmd("create serviceaccount couchbase-operator")

        dockercfgs = self.serviceaccount.exec_oc_cmd("get secrets")
        startloc = dockercfgs.find("couchbase-operator-dockercfg")
        newdockerstr = dockercfgs[startloc:]
        endloc = newdockerstr.find(" ")
        dockerstr = newdockerstr[:endloc]
        self.secretsadder.exec_oc_cmd(
            f"secrets link serviceaccount/couchbase-operator secrets/{dockerstr}"
        )
        self.rolebinding = OCP(namespace=constants.COUCHBASE_OPERATOR)
        rolebind_cmd = "".join(
            [
                "create rolebinding couchbase-operator-rolebinding ",
                "--role couchbase-operator ",
                "--serviceaccount couchbase-operator-namespace:couchbase-operator",
            ]
        )
        self.rolebinding.exec_oc_cmd(rolebind_cmd)

        dep_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_DEPLOY)
        self.cb_deploy = OCS(**dep_data)
        self.cb_deploy.create()

        # Wait for the couchbase operator pod to be running
        for couchbase_pod in TimeoutSampler(
            self.WAIT_FOR_TIME,
            3,
            get_pod_name_by_pattern,
            "couchbase-operator",
            constants.COUCHBASE_OPERATOR,
        ):
            try:
                if self.is_up_and_running(couchbase_pod[0], self.up_check):
                    break
            except IndexError:
                log.info("Couchbase operator is not up")

        cb_work = templating.load_yaml(constants.COUCHBASE_WORKER_SECRET)
        self.cb_worker = OCS(**cb_work)
        self.cb_worker.create()

    def create_couchbase_worker(self, replicas=1, sc_name=None):
        """
        Deploy a Couchbase server and pillowfight workload using the operator

        The couchbase workers do not come up unless there is an admission
        controller running. The admission controller is started from the
        default project prior to bringing up the operator. Secrets,
        rolebindings and serviceaccounts also need to be generated.

        Once the couchbase operator is running, we need to wait for the
        worker pods to also be up. Then a pillowfight task is started.

        After the pillowfight task has finished, the log is collected and
        analyzed.

        Raises:
            Exception: If the pillowfight results indicate that a minimum
                performance level is not reached (1 second response time,
                less than 1000 ops per second)

        """
        logging.info("Creating pods..")
        cb_example = templating.load_yaml(constants.COUCHBASE_WORKER_EXAMPLE)
        if storagecluster_independent_check():
            cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
                "storageClassName"
            ] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        cb_example["spec"]["servers"][0]["size"] = replicas
        if sc_name:
            cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
                "storageClassName"
            ] = sc_name
        self.cb_examples = OCS(**cb_example)
        self.cb_examples.create()

        # Wait for the last of the workers to be running.
        logging.info("Waiting for the pods to be Running")
        for cb_wrk_pods in TimeoutSampler(
            self.WAIT_FOR_TIME,
            3,
            get_pod_name_by_pattern,
            "cb-example",
            constants.COUCHBASE_OPERATOR,
        ):
            try:
                if len(cb_wrk_pods) == replicas:
                    counter = 0
                    for cb_pod in cb_wrk_pods:
                        if self.is_up_and_running(cb_pod, self.up_check):
                            counter += 1
                            logging.info(f"Couchbase worker {cb_pod} is up")
                    if counter == replicas:
                        break
            except IndexError:
                logging.info(
                    f"The expected number of couchbase pods is {replicas}, "
                    f"but only {len(cb_wrk_pods)} were found"
                )

    def run_workload(self, replicas, num_items=None, num_threads=None,
                     run_in_bg=False):
        """
        Running the workload with the pillowfight operator

        Args:
            replicas (int): Number of pods
            num_items (int): Number of items to be loaded to the cluster
            num_threads (int): Number of threads
            run_in_bg (bool): Optionally run the IOs in the background

        """
        self.result = None
        logging.info("Running IOs...")
        if run_in_bg:
            executor = ThreadPoolExecutor(1)
            self.result = executor.submit(
                PillowFight.run_pillowfights,
                self,
                replicas=replicas,
                num_items=num_items,
                num_threads=num_threads,
            )
            return self.result
        PillowFight.run_pillowfights(
            self, replicas=replicas, num_items=num_items, num_threads=num_threads
        )

    def analyze_run(self, skip_analyze=False):
        """
        Analyzing the workload run logs

        Args:
            skip_analyze (bool): Option to skip the log analysis

        """
        if not skip_analyze:
            logging.info("Analyzing workload run logs..")
            PillowFight.analyze_all(self)

    def respin_couchbase_app_pod(self):
        """
        Respin a randomly chosen couchbase app pod

        Returns:
            pod status

        """
        app_pod_list = get_pod_name_by_pattern(
            "cb-example", constants.COUCHBASE_OPERATOR
        )
        app_pod = app_pod_list[random.randint(0, len(app_pod_list) - 1)]
        logging.info(f"respin pod {app_pod}")
        app_pod_obj = get_pod_obj(app_pod, namespace=constants.COUCHBASE_OPERATOR)
        app_pod_obj.delete(wait=True, force=False)
        wait_for_resource_state(
            resource=app_pod_obj, state=constants.STATUS_RUNNING, timeout=300
        )

    def get_couchbase_nodes(self):
        """
        Get the nodes that contain a couchbase app pod

        Returns:
            list: List of nodes

        """
        app_pods_list = get_pod_name_by_pattern(
            "cb-example", constants.COUCHBASE_OPERATOR
        )
        app_pod_objs = list()
        for pod in app_pods_list:
            app_pod_objs.append(
                get_pod_obj(pod, namespace=constants.COUCHBASE_OPERATOR)
            )
        log.info("Creating a list of nodes that contain a couchbase app pod")
        nodes_set = set()
        for pod in app_pod_objs:
            logging.info(
                f"pod {pod.name} located on "
                f"node {pod.get().get('spec').get('nodeName')}"
            )
            nodes_set.add(pod.get().get("spec").get("nodeName"))
        return list(nodes_set)

    def teardown(self):
        """
        Delete the objects created, in roughly the reverse order of how
        they were created.
        """
        self.cb_examples.delete()
        self.cb_worker.delete()
        self.cb_deploy.delete()
        self.pod_obj.exec_oc_cmd(
            command="delete rolebinding couchbase-operator-rolebinding "
            "-n couchbase-operator-namespace"
        )
        self.pod_obj.exec_oc_cmd(
            command="delete serviceaccount couchbase-operator "
            "-n couchbase-operator-namespace"
        )
        self.operator_role.delete()
        self.couchbase_obj.delete()
        switch_to_project("default")
        self.ns_obj.delete_project(constants.COUCHBASE_OPERATOR)
        self.ns_obj.wait_for_delete(
            resource_name=constants.COUCHBASE_OPERATOR, timeout=90
        )
        for adm_obj in self.adm_objects:
            adm_obj.delete()

        # Before the code below was added, the teardown task would sometimes
        # fail with leftover objects because it would still see one of the
        # couchbase pods.
        for admin_pod in TimeoutSampler(
            self.WAIT_FOR_TIME, 3, get_pod_name_by_pattern, "couchbase", "default"
        ):
            if admin_pod:
                continue
            else:
                break
        PillowFight.cleanup(self)
        switch_to_default_rook_cluster_project()
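# --- Illustrative sketch (not part of the original class) ---
# A hedged example of the intended lifecycle of the CouchBase helper:
# admission/operator setup, worker creation, a pillowfight run, analysis,
# and teardown. The replica and workload numbers are arbitrary examples.
cb = CouchBase()
cb.setup_cb()
cb.create_couchbase_worker(replicas=3)
cb.run_workload(replicas=3, num_items=10000, num_threads=2)
cb.analyze_run()
cb.teardown()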
class ElasticSearch(object):
    """
    ElasticSearch Environment
    """

    def __init__(self):
        """
        Initializer function
        """
        log.info("Initializing the Elastic-Search environment object")
        self.namespace = "elastic-system"
        self.eck_path = "https://download.elastic.co/downloads/eck/1.1.2"
        self.eck_file = "all-in-one.yaml"
        self.pvc = "ocs_ci/templates/app-pods/es-pvc.yaml"
        self.crd = "ocs_ci/templates/app-pods/esq.yaml"
        self.lspid = None

        # Creating some different types of OCP objects
        self.ocp = OCP(
            kind="pod", resource_name="elastic-operator-0", namespace=self.namespace
        )
        self.ns_obj = OCP(kind="namespace", namespace=self.namespace)
        self.es = OCP(resource_name="quickstart-es-http", namespace=self.namespace)
        self.elasticsearch = OCP(namespace=self.namespace, kind="elasticsearch")
        self.password = OCP(
            kind="secret",
            resource_name="quickstart-es-elastic-user",
            namespace=self.namespace,
        )

        # Fetch the all-in-one.yaml from the official repository
        self._get_eck_file()
        # Deploy the ECK all-in-one.yaml file
        self._deploy_eck()
        # Deploy the Elastic-Search server
        self._deploy_es()

        # Verify that ES is Up & Running
        timeout = 600
        while timeout > 0:
            if self.get_health():
                log.info("The ElasticSearch server is ready!")
                break
            else:
                log.warning("The ElasticSearch server is not ready yet")
                log.info("Going to sleep for 30 sec. before the next check")
                time.sleep(30)
                timeout -= 30

        # Starting the LocalServer process - port forwarding
        self.local_server()

        # Connect to the server
        self.con = self._es_connect()

    def _get_eck_file(self):
        """
        Getting the ECK file from the official Elasticsearch web site and
        storing it as a temporary file.

        The current version is 1.1.2; this needs to be updated for new
        versions, after testing them, and the CRD file (esq.yaml) may need
        to be updated for the new version as well.
        """
        self.dir = tempfile.mkdtemp(prefix="elastic-system_")
        src_file = f"{self.eck_path}/{self.eck_file}"
        trg_file = f"{self.dir}/{self.eck_file}"
        log.info(f"Retrieving the ECK CR file from {src_file} into {trg_file}")
        try:
            urllib.request.urlretrieve(src_file, trg_file)
        except urllib.error.HTTPError as e:
            log.error(f"Cannot connect to {src_file} : {e}")
            raise e

    def _deploy_eck(self):
        """
        Deploying the ECK environment for Elasticsearch, and making sure
        it is in Running mode
        """
        log.info("Deploying the ECK environment for the ES cluster")
        self.ocp.apply(f"{self.dir}/{self.eck_file}")

        for es_pod in TimeoutSampler(
            300, 10, get_pod_name_by_pattern, "elastic-operator", self.namespace
        ):
            try:
                if es_pod[0] is not None:
                    self.eckpod = es_pod[0]
                    log.info(f"The ECK pod {self.eckpod} is ready!")
                    break
            except IndexError:
                log.info("ECK operator pod not ready yet")

    def get_ip(self):
        """
        This function returns the IP address of the Elasticsearch cluster.
        This IP is for use inside the OCP cluster.

        Returns:
            str : String that represents the IP address

        """
        return self.es.get()["spec"]["clusterIP"]

    def get_port(self):
        """
        This function returns the port of the Elasticsearch cluster.

        Returns:
            str : String that represents the port

        """
        return self.es.get()["spec"]["ports"][0]["port"]

    def _deploy_es(self):
        log.info("Deploying the PVC for the ElasticSearch cluster")
        self.ocp.apply(self.pvc)

        log.info("Deploying the ElasticSearch cluster")
        self.ocp.apply(self.crd)

        for es_pod in TimeoutSampler(
            300, 20, get_pod_name_by_pattern, "quickstart-es-default", self.namespace
        ):
            try:
                if es_pod[0] is not None:
                    self.espod = es_pod[0]
                    log.info(f"The ElasticSearch pod {self.espod} started")
                    break
            except IndexError:
                log.info("elasticsearch pod not ready yet")

        es_pod = OCP(kind="pod", namespace=self.namespace)
        log.info("Waiting for ElasticSearch to run")
        assert es_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=self.espod,
            sleep=30,
            timeout=600,
        )
        log.info("Elastic Search is ready!")

    def get_health(self):
        """
        This method returns the health status of the Elasticsearch.

        Returns:
            bool : True if the status is green (OK), otherwise False

        """
        return self.elasticsearch.get()["items"][0]["status"]["health"] == "green"

    def get_password(self):
        """
        This method returns the password used to connect to the Elasticsearch.

        Returns:
            str : The password as text

        """
        return base64.b64decode(self.password.get()["data"]["elastic"]).decode(
            "utf-8"
        )

    def cleanup(self):
        """
        Cleanup the environment from all Elasticsearch components, and from
        the port-forwarding process.
        """
        log.info("Teardown of the Elasticsearch environment")
        log.info(f"Killing the local server process ({self.lspid})")
        os.kill(self.lspid, signal.SIGKILL)
        log.info("Deleting all resources")
        subprocess.run(f"oc delete -f {self.crd}", shell=True)
        subprocess.run(f"oc delete -f {self.eck_file}", shell=True, cwd=self.dir)
        self.ns_obj.wait_for_delete(resource_name=self.namespace)

    def local_server(self):
        """
        Starting a sub-process that will do port-forwarding, to allow access
        from outside the OpenShift cluster to the Elasticsearch server.
        """
        cmd = f"oc -n {self.namespace} "
        cmd += f"port-forward service/quickstart-es-http {self.get_port()}"
        log.info(f"Going to run : {cmd}")
        proc = subprocess.Popen(cmd, shell=True)
        log.info(f"Starting LocalServer with PID of {proc.pid}")
        self.lspid = proc.pid

    def _es_connect(self):
        """
        Create a connection to the ES via the localhost port-forward

        Returns:
            Elasticsearch: elasticsearch connection object

        Raises:
            ConnectionError: if it cannot connect to the server

        """
        try:
            es = Elasticsearch([{"host": "localhost", "port": self.get_port()}])
        except esexp.ConnectionError:
            log.error("Cannot connect to the ES server in the LocalServer")
            raise
        return es

    def get_indices(self):
        """
        Getting the list of all indices in the ES server - all created by
        the test, since the ES installation has no pre-installed indices.

        Returns:
            list : list of all indices defined in the ES server

        """
        results = []
        log.info("Getting all indices")
        for ind in self.con.indices.get_alias("*"):
            results.append(ind)
        return results

    def _copy(self, es):
        """
        Copy all data from the internal ES server to the main ES

        Args:
            es (obj): elasticsearch object connected to the main ES

        """
        query = {"size": 1000, "query": {"match_all": {}}}
        for ind in self.get_indices():
            log.info(f"Reading {ind} from the internal ES server")
            try:
                result = self.con.search(index=ind, body=query)
            except esexp.NotFoundError:
                log.warning(f"{ind} was not found in the internal ES.")
                continue
            log.debug(f"The results from the internal ES for {ind} are: {result}")
            log.info(f"Writing {ind} into the main ES server")
            for doc in result["hits"]["hits"]:
                log.debug(f"Going to write: {doc}")
                es.index(index=ind, doc_type="_doc", body=doc["_source"])
def delete(self, retry=True):
    """
    Deletes the current namespacestore by using OC/CLI commands

    Args:
        retry (bool): Whether to retry the deletion if it fails

    """
    log.info(f"Cleaning up namespacestore {self.name}")

    def _oc_deletion_flow():
        try:
            OCP(
                kind="namespacestore",
                namespace=config.ENV_DATA["cluster_namespace"],
            ).delete(resource_name=self.name)
            return True
        except CommandFailed as e:
            if "not found" in str(e).lower():
                log.warning(f"Namespacestore {self.name} was already deleted.")
                return True
            elif all(
                err in e.args[0]
                for err in ["cannot complete because pool", "in", "state"]
            ):
                if retry:
                    log.warning(
                        f"Deletion of {self.name} failed due to its state; retrying"
                    )
                    return False
                else:
                    raise
            else:
                raise

    def _cli_deletion_flow():
        try:
            self.mcg_obj.exec_mcg_cmd(f"namespacestore delete {self.name}")
            return True
        except CommandFailed as e:
            if "being used by one or more buckets" in str(e).lower():
                log.warning(
                    f"Deletion of {self.name} failed because it's being "
                    "used by a bucket. Retrying..."
                )
            else:
                log.warning(f"Deletion of {self.name} failed. Error:\n{str(e)}")
            return False

    cmdMap = {
        "oc": _oc_deletion_flow,
        "cli": _cli_deletion_flow,
    }
    if retry:
        sample = TimeoutSampler(
            timeout=120,
            sleep=20,
            func=cmdMap[self.method],
        )
        if not sample.wait_for_func_status(result=True):
            err_msg = f"Failed to delete {self.name}"
            log.error(err_msg)
            raise TimeoutExpiredError(err_msg)
    else:
        cmdMap[self.method]()

    log.info(f"Verifying whether namespacestore {self.name} exists after deletion")
    ns_deleted_successfully = False

    if self.method == "oc":
        try:
            OCP(
                kind=constants.NAMESPACESTORE,
                namespace=config.ENV_DATA["cluster_namespace"],
                resource_name=self.name,
            ).get()
        except CommandFailed as e:
            if "not found" in str(e).lower():
                log.info(f"Namespacestore {self.name} was deleted.")
                ns_deleted_successfully = True
            else:
                raise
    elif self.method == "cli":
        if self.name not in self.mcg_obj.exec_mcg_cmd("namespacestore list").stdout:
            ns_deleted_successfully = True

    assert (
        ns_deleted_successfully
    ), f"Namespacestore {self.name} was not deleted successfully"
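# --- Illustrative sketch (not part of the original method) ---
# Hedged usage example: 'namespacestore' stands for an instance of the
# class this method belongs to, with its 'method' attribute set to either
# "oc" or "cli" at creation time; both names are assumptions here.
namespacestore.delete()             # default: retry transient failures for up to 120s
# or, for a single attempt that raises immediately on stateful failures:
# namespacestore.delete(retry=False)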
def test_upgrade():
    ceph_cluster = CephCluster()
    with CephHealthMonitor(ceph_cluster):
        namespace = config.ENV_DATA['cluster_namespace']
        version_before_upgrade = config.ENV_DATA.get("ocs_version")
        upgrade_version = config.UPGRADE.get(
            "upgrade_ocs_version", version_before_upgrade
        )
        ocs_registry_image = config.UPGRADE.get('upgrade_ocs_registry_image')
        if ocs_registry_image:
            upgrade_version = get_ocs_version_from_image(ocs_registry_image)
        parsed_version_before_upgrade = parse_version(version_before_upgrade)
        parsed_upgrade_version = parse_version(upgrade_version)
        assert parsed_upgrade_version >= parsed_version_before_upgrade, (
            f"The version you would like to upgrade to: {upgrade_version} "
            f"is not higher than or equal to the version you are currently "
            f"running: {version_before_upgrade}"
        )
        operator_selector = get_selector_for_ocs_operator()
        package_manifest = PackageManifest(
            resource_name=OCS_OPERATOR_NAME,
            selector=operator_selector,
        )
        channel = config.DEPLOYMENT.get('ocs_csv_channel')
        csv_name_pre_upgrade = package_manifest.get_current_csv(channel)
        log.info(f"CSV name before upgrade is: {csv_name_pre_upgrade}")
        csv_pre_upgrade = CSV(
            resource_name=csv_name_pre_upgrade, namespace=namespace
        )
        pre_upgrade_images = get_images(csv_pre_upgrade.get())

        version_change = parsed_upgrade_version > parsed_version_before_upgrade
        if version_change:
            version_config_file = os.path.join(
                constants.CONF_DIR, 'ocs_version', f'ocs-{upgrade_version}.yaml'
            )
            load_config_file(version_config_file)

        ocs_catalog = CatalogSource(
            resource_name=constants.OPERATOR_CATALOG_SOURCE_NAME,
            namespace=constants.MARKETPLACE_NAMESPACE,
        )
        upgrade_in_current_source = config.UPGRADE.get(
            'upgrade_in_current_source', False
        )
        if not upgrade_in_current_source:
            if not ocs_catalog.is_exist():
                log.info("OCS catalog source doesn't exist. Creating a new one.")
                create_catalog_source(ocs_registry_image, ignore_upgrade=True)
            image_url = ocs_catalog.get_image_url()
            image_tag = ocs_catalog.get_image_name()
            log.info(f"Current image is: {image_url}, tag: {image_tag}")
            if ocs_registry_image:
                image_url, new_image_tag = ocs_registry_image.split(':')
            elif config.UPGRADE.get('upgrade_to_latest', True) or version_change:
                new_image_tag = get_latest_ds_olm_tag()
            else:
                new_image_tag = get_next_version_available_for_upgrade(image_tag)
            cs_data = deepcopy(ocs_catalog.data)
            image_for_upgrade = ':'.join([image_url, new_image_tag])
            log.info(f"Image: {image_for_upgrade} will be used for the upgrade.")
            cs_data['spec']['image'] = image_for_upgrade
            with NamedTemporaryFile() as cs_yaml:
                dump_data_to_temp_yaml(cs_data, cs_yaml.name)
                ocs_catalog.apply(cs_yaml.name)

        # Wait for the new package manifest for the upgrade.
        operator_selector = get_selector_for_ocs_operator()
        package_manifest = PackageManifest(
            resource_name=OCS_OPERATOR_NAME,
            selector=operator_selector,
        )
        package_manifest.wait_for_resource()
        channel = config.DEPLOYMENT.get('ocs_csv_channel')
        if not channel:
            channel = package_manifest.get_default_channel()

        # update subscription
        subscription = OCP(
            resource_name=constants.OCS_SUBSCRIPTION,
            kind='subscription',
            namespace=config.ENV_DATA['cluster_namespace'],
        )
        current_ocs_source = subscription.data['spec']['source']
        log.info(f"Current OCS subscription source: {current_ocs_source}")
        ocs_source = (
            current_ocs_source
            if upgrade_in_current_source
            else constants.OPERATOR_CATALOG_SOURCE_NAME
        )
        patch_subscription_cmd = (
            f'oc patch subscription {constants.OCS_SUBSCRIPTION} '
            f'-n {namespace} --type merge -p \'{{"spec":{{"channel": '
            f'"{channel}", "source": "{ocs_source}"}}}}\''
        )
        run_cmd(patch_subscription_cmd)

        subscription_plan_approval = config.DEPLOYMENT.get(
            'subscription_plan_approval'
        )
        if subscription_plan_approval == 'Manual':
            wait_for_install_plan_and_approve(namespace)

        attempts = 145
        for attempt in range(1, attempts + 1):
            log.info(f"Attempt {attempt}/{attempts} to check if the CSV upgraded.")
            csv_name_post_upgrade = package_manifest.get_current_csv(channel)
            if csv_name_post_upgrade == csv_name_pre_upgrade:
                log.info(f"CSV is still: {csv_name_post_upgrade}")
                sleep(5)
            else:
                log.info(f"CSV is now upgraded to: {csv_name_post_upgrade}")
                break
            if attempts == attempt:
                raise TimeoutException("No new CSV found after upgrade!")

        csv_post_upgrade = CSV(
            resource_name=csv_name_post_upgrade, namespace=namespace
        )
        log.info(
            f"Waiting for CSV {csv_name_post_upgrade} to be in Succeeded state"
        )
        if version_before_upgrade == '4.2' and upgrade_version == '4.3':
            log.info("Force creating the Ceph toolbox after upgrade 4.2 -> 4.3")
            setup_ceph_toolbox(force_setup=True)
        csv_post_upgrade.wait_for_phase("Succeeded", timeout=600)
        post_upgrade_images = get_images(csv_post_upgrade.get())
        old_images, _, _ = get_upgrade_image_info(
            pre_upgrade_images, post_upgrade_images
        )
        verify_image_versions(old_images, parsed_upgrade_version)
        ocs_install_verification(
            timeout=600,
            skip_osd_distribution_check=True,
            ocs_registry_image=ocs_registry_image,
            post_upgrade_verification=True,
        )
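# --- Illustrative note (not part of the original test) ---
# The subscription patch above renders to a plain 'oc patch' call. With
# hypothetical values channel="stable-4.6", an example subscription name,
# and the default catalog source, the command would look roughly like:
#
#   oc patch subscription ocs-subscription -n openshift-storage \
#     --type merge -p '{"spec":{"channel": "stable-4.6", "source": "ocs-catalogsource"}}'
#
# All the concrete names here are assumptions for illustration; the real
# values come from constants.OCS_SUBSCRIPTION and the run config at runtime.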
def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
    """
    This is a basic fio perf test
    """
    # Deploy ripsaw
    log.info("Deploying ripsaw operator")
    ripsaw.apply_crd("resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml")
    if interface == "CephBlockPool":
        sc = constants.CEPHBLOCKPOOL_SC
    else:
        sc = constants.CEPHFILESYSTEM_SC

    # Create fio benchmark
    log.info("Create resource file for the fio workload")
    fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

    # Saving the original elastic-search IP and PORT - if defined in yaml
    if "elasticsearch" in fio_cr["spec"]:
        backup_es = fio_cr["spec"]["elasticsearch"]
    else:
        log.warning("Elastic Search information does not exist in the YAML file")
        fio_cr["spec"]["elasticsearch"] = {}
        # initialize the backup so restoring it later cannot raise NameError
        backup_es = {}

    # Use the internally defined elastic-search server in the test - if it exists
    if es:
        fio_cr["spec"]["elasticsearch"] = {
            "server": es.get_ip(),
            "port": es.get_port(),
        }

    # Setting the data set to 40% of the total storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    total_data_set = int(ceph_capacity * 0.4)
    filesize = int(
        fio_cr["spec"]["workload"]["args"]["filesize"].replace("GiB", "")
    )
    # To make sure the number of app pods will not be more than 50 in case
    # of a large data set, change the size of the file each pod will work on
    if total_data_set > 500:
        filesize = int(ceph_capacity * 0.008)
        fio_cr["spec"]["workload"]["args"]["filesize"] = f"{filesize}GiB"
    # make sure that the storage size is larger than the file size
    fio_cr["spec"]["workload"]["args"]["storagesize"] = f"{int(filesize * 1.2)}Gi"
    fio_cr["spec"]["workload"]["args"]["servers"] = int(total_data_set / filesize)
    log.info(f"Total data set to work on is : {total_data_set} GiB")

    environment = get_environment_info()
    if not environment["user"] == "":
        fio_cr["spec"]["test_user"] = environment["user"]
    fio_cr["spec"]["clustername"] = environment["clustername"]
    log.debug(f"Environment information is : {environment}")

    fio_cr["spec"]["workload"]["args"]["storageclass"] = sc
    if io_pattern == "sequential":
        fio_cr["spec"]["workload"]["args"]["jobs"] = ["write", "read"]
        fio_cr["spec"]["workload"]["args"]["iodepth"] = 1
    log.info(f"The FIO CR file is {fio_cr}")
    fio_cr_obj = OCS(**fio_cr)
    fio_cr_obj.create()

    # Wait for the fio client pod to be created
    for fio_pod in TimeoutSampler(
        300, 20, get_pod_name_by_pattern, "fio-client", constants.RIPSAW_NAMESPACE
    ):
        try:
            if fio_pod[0] is not None:
                fio_client_pod = fio_pod[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    # Getting the start time of the test
    start_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

    # Getting the UUID from inside the benchmark pod
    uuid = ripsaw.get_uuid(fio_client_pod)
    # Setting back the original elastic-search information
    fio_cr["spec"]["elasticsearch"] = backup_es

    full_results = FIOResultsAnalyse(uuid, fio_cr)

    # Initialize the results doc file.
    for key in environment:
        full_results.add_key(key, environment[key])

    # Setting the global parameters of the test
    full_results.add_key("io_pattern", io_pattern)
    full_results.add_key("dataset", f"{total_data_set}GiB")
    full_results.add_key(
        "file_size", fio_cr["spec"]["workload"]["args"]["filesize"]
    )
    full_results.add_key("servers", fio_cr["spec"]["workload"]["args"]["servers"])
    full_results.add_key("samples", fio_cr["spec"]["workload"]["args"]["samples"])
    full_results.add_key("operations", fio_cr["spec"]["workload"]["args"]["jobs"])
    full_results.add_key("block_sizes", fio_cr["spec"]["workload"]["args"]["bs"])
    full_results.add_key("io_depth", fio_cr["spec"]["workload"]["args"]["iodepth"])
    full_results.add_key("jobs", fio_cr["spec"]["workload"]["args"]["numjobs"])
    full_results.add_key(
        "runtime",
        {
            "read": fio_cr["spec"]["workload"]["args"]["read_runtime"],
            "write": fio_cr["spec"]["workload"]["args"]["write_runtime"],
        },
    )
    full_results.add_key(
        "storageclass", fio_cr["spec"]["workload"]["args"]["storageclass"]
    )
    full_results.add_key(
        "vol_size", fio_cr["spec"]["workload"]["args"]["storagesize"]
    )

    # Wait for the fio pod to initialize and complete
    log.info("Waiting for fio_client to complete")
    pod_obj = OCP(kind="pod")
    pod_obj.wait_for_resource(
        condition="Completed",
        resource_name=fio_client_pod,
        timeout=18000,
        sleep=300,
    )

    # Getting the end time of the test
    end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
    full_results.add_key("test_time", {"start": start_time, "end": end_time})

    output = run_cmd(f"oc logs {fio_client_pod}")
    log.info(f"The test log is : {output}")
    if "Fio failed to execute" not in output:
        log.info("FIO has completed successfully")
    else:
        log.warning("FIO failed to complete")

    # Clean up the fio benchmark
    log.info("Deleting the FIO benchmark")
    fio_cr_obj.delete()
    log.debug(f"Full results is : {full_results.results}")

    # If an internal ES exists, copy all data from the internal to the main ES
    if es:
        log.info("Copying all data from the internal ES to the main ES")
        es._copy(full_results.es)
        # Adding this sleep between the copy and the analysis of the results,
        # since sometimes the results of the read (just after write) are empty
        time.sleep(30)
    full_results.analyze_results()  # Analyze the results
    # Writing the analyzed test results to the Elastic-Search server
    full_results.es_write()
    full_results.codespeed_push()  # Push results to codespeed
    # Creating the full link to the results on the ES server
    log.info(f"The result can be found at : {full_results.results_link()}")
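# --- Illustrative sketch (not part of the original test) ---
# A hedged, standalone rendering of the data-set sizing logic above, so the
# arithmetic can be checked in isolation. All input values are examples.
ceph_capacity = 2000                       # GiB, example cluster capacity
total_data_set = int(ceph_capacity * 0.4)  # 800 GiB, 40% of capacity
filesize = 2                               # GiB, example value from the CR yaml
if total_data_set > 500:
    # large data set: grow the per-pod file so the pod count lands near 50
    filesize = int(ceph_capacity * 0.008)  # 16 GiB
storagesize = int(filesize * 1.2)          # PVC is 20% larger than the file
servers = int(total_data_set / filesize)   # 800 / 16 = 50 fio client pods
print(f"{servers} pods, {storagesize}Gi PVC each")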
def test_replication_with_disruptions(
    self,
    awscli_pod_session,
    mcg_obj_session,
    cld_mgr,
    bucket_factory,
    source_bucketclass,
    target_bucketclass,
    test_directory_setup,
    nodes,
):
    # Check uni-directional bucket replication from a multi (aws+azure)
    # namespace bucket to an s3-compatible namespace bucket
    target_bucket_name = bucket_factory(bucketclass=target_bucketclass)[0].name
    replication_policy = ("basic-replication-rule", target_bucket_name, None)
    source_bucket_name = bucket_factory(
        bucketclass=source_bucketclass, replication_policy=replication_policy
    )[0].name
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        source_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=5,
        pattern="first-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("Uni-directional bucket replication is working as expected")

    # Change from uni-directional to bi-directional replication policy
    logger.info("Changing the replication policy from uni- to bi-directional")
    bi_replication_policy_dict = {
        "spec": {
            "additionalConfig": {
                "replicationPolicy": json.dumps(
                    [
                        {
                            "rule_id": "basic-replication-rule-2",
                            "destination_bucket": source_bucket_name,
                        }
                    ]
                )
            }
        }
    }
    OCP(
        namespace=config.ENV_DATA["cluster_namespace"],
        kind="obc",
        resource_name=target_bucket_name,
    ).patch(params=json.dumps(bi_replication_policy_dict), format_type="merge")
    logger.info(
        "Patch ran successfully! Changed the replication policy from "
        "uni- to bi-directional"
    )

    # Write objects to the second bucket and check that they are
    # replicated to the other one
    logger.info("Checking if bi-directional replication works")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=3,
        pattern="second-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("Bi-directional bucket replication is working as expected")

    # Delete all of the s3-compatible namespace bucket's objects and then
    # recover them from the other namespace bucket on the next write
    logger.info("Checking replication when one bucket's objects are deleted")
    try:
        mcg_obj_session.s3_resource.Bucket(
            target_bucket_name
        ).objects.all().delete()
    except CommandFailed as e:
        logger.error(f"[Error] while deleting objects: {e}")
    if len(mcg_obj_session.s3_list_all_objects_in_bucket(target_bucket_name)) != 0:
        assert (
            False
        ), f"[Error] Unexpectedly, objects were not deleted from {target_bucket_name}"
    logger.info("All the objects in the RGW namespace buckets are deleted!")

    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="third-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info(
        "All the objects were retrieved back to the s3-compatible bucket "
        "on the new write!"
    )

    # Restart the RGW pods and then check that object sync still works
    logger.info("Checking if the replication works when the RGW pods restart")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="fourth-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    pod_names = get_pod_name_by_pattern(
        "rgw", namespace=config.ENV_DATA["cluster_namespace"]
    )
    pod_objs = get_rgw_pods(namespace=config.ENV_DATA["cluster_namespace"])
    delete_pods(pod_objs=pod_objs)
    wait_for_pods_to_be_running(
        pod_names=pod_names, namespace=config.ENV_DATA["cluster_namespace"]
    )
    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("Object sync works after the RGW pods restarted!")

    # Write some objects to either bucket, followed by an immediate
    # cluster restart
    logger.info("Checking replication when there is a cluster reboot")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="fifth-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    node_list = get_worker_nodes()
    node_objs = get_node_objs(node_list)
    nodes.restart_nodes(node_objs, timeout=500)
    retry(
        (
            CommandFailed,
            TimeoutError,
            AssertionError,
            ResourceWrongStatusException,
        ),
        tries=60,
        delay=15,
    )(ocp.wait_for_cluster_connectivity)(tries=400)
    wait_for_pods_to_be_running(
        namespace=config.ENV_DATA["cluster_namespace"], timeout=800
    )
    logger.info("Nodes rebooted successfully!")
    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("Object sync works even when the cluster is rebooted")
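# --- Illustrative note (not part of the original test) ---
# Design note on the retry(...) call above: retry() builds a decorator, so
# it must wrap the *function*, and the wrapped callable is then invoked:
#
#   retry(exceptions, tries=60, delay=15)(ocp.wait_for_cluster_connectivity)(tries=400)
#
# Passing ocp.wait_for_cluster_connectivity(tries=400) directly would call
# the function once, before retry() could wrap it, defeating the retries.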
def ocs_install_verification( timeout=600, skip_osd_distribution_check=False, ocs_registry_image=None, post_upgrade_verification=False, ): """ Perform steps necessary to verify a successful OCS installation Args: timeout (int): Number of seconds for timeout which will be used in the checks used in this function. skip_osd_distribution_check (bool): If true skip the check for osd distribution. ocs_registry_image (str): Specific image to check if it was installed properly. post_upgrade_verification (bool): Set to True if this function is called after upgrade. """ from ocs_ci.ocs.node import get_typed_nodes from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods from ocs_ci.ocs.cluster import validate_cluster_on_pvc from ocs_ci.ocs.resources.fips import check_fips_enabled number_of_worker_nodes = len(get_typed_nodes()) namespace = config.ENV_DATA['cluster_namespace'] log.info("Verifying OCS installation") # Verify OCS CSV is in Succeeded phase log.info("verifying ocs csv") ocs_csv = get_ocs_csv() # Verify if OCS CSV has proper version. csv_version = ocs_csv.data['spec']['version'] ocs_version = config.ENV_DATA['ocs_version'] log.info( f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}") assert ocs_version in csv_version, ( f"OCS version: {ocs_version} mismatch with CSV version {csv_version}") # Verify if OCS CSV has the same version in provided CI build. ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get( 'ocs_registry_image') if ocs_registry_image and ocs_registry_image.endswith(".ci"): ocs_registry_image = ocs_registry_image.split(":")[1] log.info( f"Check if OCS registry image: {ocs_registry_image} matches with " f"CSV: {csv_version}") ignore_csv_mismatch = config.DEPLOYMENT.get('ignore_csv_mismatch') if ignore_csv_mismatch: log.info( "The possible mismatch will be ignored as you deployed " "the different version than the default version from the CSV") else: assert ocs_registry_image in csv_version, ( f"OCS registry image version: {ocs_registry_image} mismatch " f"with CSV version {csv_version}") # Verify OCS Cluster Service (ocs-storagecluster) is Ready storage_cluster_name = config.ENV_DATA['storage_cluster_name'] log.info("Verifying status of storage cluster: %s", storage_cluster_name) storage_cluster = StorageCluster( resource_name=storage_cluster_name, namespace=namespace, ) log.info(f"Check if StorageCluster: {storage_cluster_name} is in" f"Succeeded phase") storage_cluster.wait_for_phase(phase='Ready', timeout=timeout) # Verify pods in running state and proper counts log.info("Verifying pod states and counts") pod = OCP(kind=constants.POD, namespace=namespace) if not config.DEPLOYMENT['external_mode']: osd_count = (int( storage_cluster.data['spec']['storageDeviceSets'][0]['count']) * int(storage_cluster.data['spec']['storageDeviceSets'][0] ['replica'])) rgw_count = None if config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS: # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1857802 - RGW count is 1 # post upgrade to OCS 4.5. 
Tracked with # https://github.com/red-hat-storage/ocs-ci/issues/2532 rgw_count = 2 if float(config.ENV_DATA['ocs_version']) >= 4.5 and not ( post_upgrade_verification) else 1 # With 4.4 OCS cluster deployed over Azure, RGW is the default backingstore if float(config.ENV_DATA['ocs_version']) == 4.4 and config.ENV_DATA.get( 'platform') == constants.AZURE_PLATFORM: rgw_count = 1 if float(config.ENV_DATA['ocs_version']) == 4.5 and config.ENV_DATA.get( 'platform' ) == constants.AZURE_PLATFORM and post_upgrade_verification: rgw_count = 1 # Fetch the min and max Noobaa endpoints from the run config if check_nodes_specs(min_cpu=constants.MIN_NODE_CPU, min_memory=constants.MIN_NODE_MEMORY): min_eps = config.DEPLOYMENT.get('min_noobaa_endpoints') max_eps = config.DEPLOYMENT.get('max_noobaa_endpoints') else: min_eps = 1 max_eps = 1 if float(config.ENV_DATA['ocs_version']) < 4.6 else 2 resources_dict = { constants.OCS_OPERATOR_LABEL: 1, constants.OPERATOR_LABEL: 1, constants.NOOBAA_DB_LABEL: 1, constants.NOOBAA_OPERATOR_POD_LABEL: 1, constants.NOOBAA_CORE_POD_LABEL: 1, constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps } if not config.DEPLOYMENT['external_mode']: resources_dict.update({ constants.MON_APP_LABEL: 3, constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes, constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2, constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes, constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2, constants.OSD_APP_LABEL: osd_count, constants.MGR_APP_LABEL: 1, constants.MDS_APP_LABEL: 2, constants.RGW_APP_LABEL: rgw_count }) for label, count in resources_dict.items(): if label == constants.RGW_APP_LABEL: if not config.ENV_DATA.get( 'platform') in constants.ON_PREM_PLATFORMS: continue assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=label, resource_count=count, timeout=timeout) nb_ep_pods = get_pods_having_label( label=constants.NOOBAA_ENDPOINT_POD_LABEL, namespace=defaults.ROOK_CLUSTER_NAMESPACE) assert len(nb_ep_pods) <= max_eps, ( f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) " f"is greater than the maximum defined in the NooBaa CR ({max_eps})") # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd) log.info("Verifying storage classes") storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace) storage_cluster_name = config.ENV_DATA['storage_cluster_name'] required_storage_classes = { f'{storage_cluster_name}-cephfs', f'{storage_cluster_name}-ceph-rbd' } if config.DEPLOYMENT['external_mode']: required_storage_classes.update({ f'{storage_cluster_name}-ceph-rgw', f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io' }) storage_classes = storage_class.get() storage_class_names = { item['metadata']['name'] for item in storage_classes['items'] } assert required_storage_classes.issubset(storage_class_names) # Verify OSDs are distributed if not config.DEPLOYMENT['external_mode']: if not skip_osd_distribution_check: log.info( "Verifying OSDs are distributed evenly across worker nodes") ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace) osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items'] deviceset_count = get_deviceset_count() node_names = [osd['spec']['nodeName'] for osd in osds] for node in node_names: assert not node_names.count(node) > deviceset_count, ( "OSD's are not distributed evenly across worker nodes") # Verify that CSI driver object contains provisioner names log.info("Verifying CSI driver object contains provisioner names.") csi_driver = OCP(kind="CSIDriver") csi_drivers = ({ item['metadata']['name'] for item in 
csi_driver.get()['items'] }) assert defaults.CSI_PROVISIONERS.issubset(csi_drivers) # Verify node and provisioner secret names in storage class log.info("Verifying node and provisioner secret names in storage class.") if config.DEPLOYMENT['external_mode']: sc_rbd = storage_class.get( resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD) sc_cephfs = storage_class.get(resource_name=( constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS)) else: sc_rbd = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_RBD) sc_cephfs = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS) assert sc_rbd['parameters'][ 'csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET assert sc_rbd['parameters'][ 'csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET assert sc_cephfs['parameters'][ 'csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET assert sc_cephfs['parameters'][ 'csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET log.info("Verified node and provisioner secret names in storage class.") # Verify ceph osd tree output if not config.DEPLOYMENT['external_mode']: log.info( "Verifying ceph osd tree output and checking for device set PVC names " "in the output.") if (config.DEPLOYMENT.get('local_storage') and config.ENV_DATA['platform'] != constants.BAREMETALPSI_PLATFORM): deviceset_pvcs = get_compute_node_names() else: deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()] ct_pod = get_ceph_tools_pod() osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json') schemas = { 'root': constants.OSD_TREE_ROOT, 'rack': constants.OSD_TREE_RACK, 'host': constants.OSD_TREE_HOST, 'osd': constants.OSD_TREE_OSD, 'region': constants.OSD_TREE_REGION, 'zone': constants.OSD_TREE_ZONE } schemas['host']['properties']['name'] = {'enum': deviceset_pvcs} for item in osd_tree['nodes']: validate(instance=item, schema=schemas[item['type']]) if item['type'] == 'host': deviceset_pvcs.remove(item['name']) assert not deviceset_pvcs, ( f"These device set PVCs are not given in ceph osd tree output " f"- {deviceset_pvcs}") log.info( "Verified ceph osd tree output. Device set PVC names are given in the " "output.") # TODO: Verify ceph osd tree output have osd listed as ssd # TODO: Verify ceph osd tree output have zone or rack based on AZ # Verify CSI snapshotter sidecar container is not present # if the OCS version is < 4.6 if float(config.ENV_DATA['ocs_version']) < 4.6: log.info("Verifying CSI snapshotter is not present.") provisioner_pods = get_all_pods( namespace=defaults.ROOK_CLUSTER_NAMESPACE, selector=[ constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL, constants.CSI_RBDPLUGIN_PROVISIONER_LABEL ]) for pod_obj in provisioner_pods: pod_info = pod_obj.get() for container, image in get_images(data=pod_info).items(): assert ('snapshot' not in container) and ( 'snapshot' not in image ), (f"Snapshot container is present in {pod_obj.name} pod. " f"Container {container}. Image {image}") deployments = ocs_csv.get()['spec']['install']['spec']['deployments'] rook_ceph_operator_deployment = [ deployment_val for deployment_val in deployments if deployment_val['name'] == 'rook-ceph-operator' ] assert { 'name': 'CSI_ENABLE_SNAPSHOTTER', 'value': 'false' } in (rook_ceph_operator_deployment[0]['spec']['template']['spec'] ['containers'][0]['env'] ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'." 
log.info("Verified: CSI snapshotter is not present.") # Verify pool crush rule is with "type": "zone" if utils.get_az_count() == 3: log.info("Verifying pool crush rule is with type: zone") crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd crush dump', format='') pool_names = [ constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL, constants.DATA_POOL ] crush_rules = [ rule for rule in crush_dump['rules'] if rule['rule_name'] in pool_names ] for crush_rule in crush_rules: assert [ item for item in crush_rule['steps'] if item.get('type') == 'zone' ], f"{crush_rule['rule_name']} is not with type as zone" log.info("Verified - pool crush rule is with type: zone") log.info("Validate cluster on PVC") validate_cluster_on_pvc() # Verify ceph health log.info("Verifying ceph health") health_check_tries = 20 health_check_delay = 30 if post_upgrade_verification: # In case of upgrade with FIO we have to wait longer time to see # health OK. See discussion in BZ: # https://bugzilla.redhat.com/show_bug.cgi?id=1817727 health_check_tries = 180 assert utils.ceph_health_check(namespace, health_check_tries, health_check_delay) if config.ENV_DATA.get('fips'): # In case that fips is enabled when deploying, # a verification of the installation of it will run # on all running state pods check_fips_enabled()
def test_smallfile_workload(
    self, ripsaw, es, file_size, files, threads, samples, interface
):
    """
    Run SmallFile Workload
    """
    # Loading the main template yaml file for the benchmark
    sf_data = templating.load_yaml(constants.SMALLFILE_BENCHMARK_YAML)

    # Saving the original Elasticsearch IP and PORT - if defined in the yaml
    if "elasticsearch" in sf_data["spec"]:
        sf_data["spec"]["elasticsearch"][
            "url"
        ] = f"http://{sf_data['spec']['elasticsearch']['server']}:{sf_data['spec']['elasticsearch']['port']}"
        backup_es = sf_data["spec"]["elasticsearch"]
    else:
        log.warning("Elasticsearch information does not exist in the YAML file")
        sf_data["spec"]["elasticsearch"] = {}
        # Avoid a NameError when the original ES info is restored below
        backup_es = {}

    # Use the internally defined Elasticsearch server in the test - if it exists
    if es:
        sf_data["spec"]["elasticsearch"] = {
            "url": f"http://{es.get_ip()}:{es.get_port()}",
            "server": es.get_ip(),
            "port": es.get_port(),
        }

    log.info("Apply Operator CRD")
    ripsaw.apply_crd("resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml")
    if interface == constants.CEPHBLOCKPOOL:
        storageclass = constants.DEFAULT_STORAGECLASS_RBD
    else:
        storageclass = constants.DEFAULT_STORAGECLASS_CEPHFS
    log.info(f"Using {storageclass} Storageclass")
    sf_data["spec"]["workload"]["args"]["storageclass"] = storageclass
    log.info("Running SmallFile bench")

    """
    Setting up the parameters for this test
    """
    sf_data["spec"]["workload"]["args"]["file_size"] = file_size
    sf_data["spec"]["workload"]["args"]["files"] = files
    sf_data["spec"]["workload"]["args"]["threads"] = threads
    sf_data["spec"]["workload"]["args"]["samples"] = samples
    """
    Calculating the size of the volume to be tested: it should be at least
    three times the total size of the files, and at least 100Gi.
    Since the file_size is in KB and the vol_size needs to be in GB, a unit
    conversion is needed.
    """
    vol_size = int(files * threads * file_size * 3)
    vol_size = int(vol_size / constants.GB2KB)
    if vol_size < 100:
        vol_size = 100
    sf_data["spec"]["workload"]["args"]["storagesize"] = f"{vol_size}Gi"

    environment = get_environment_info()
    if not environment["user"] == "":
        sf_data["spec"]["test_user"] = environment["user"]
    else:
        # since the full results object needs this parameter,
        # initialize it from the CR file
        environment["user"] = sf_data["spec"]["test_user"]

    sf_data["spec"]["clustername"] = environment["clustername"]
    sf_obj = OCS(**sf_data)
    sf_obj.create()
    log.info(f"The smallfile yaml file is {sf_data}")

    # wait for benchmark pods to get created - takes a while
    for bench_pod in TimeoutSampler(
        240,
        10,
        get_pod_name_by_pattern,
        "smallfile-client",
        constants.RIPSAW_NAMESPACE,
    ):
        try:
            if bench_pod[0] is not None:
                small_file_client_pod = bench_pod[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    bench_pod = OCP(kind="pod", namespace=constants.RIPSAW_NAMESPACE)
    log.info("Waiting for SmallFile benchmark to Run")
    assert bench_pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_name=small_file_client_pod,
        sleep=30,
        timeout=600,
    )
    # Getting the start time of the test
    start_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
    test_start_time = time.time()
    # Timeout chosen empirically after manual test runs
    timeout = 3600
    # Getting the UUID from inside the benchmark pod
    uuid = ripsaw.get_uuid(small_file_client_pod)
    # Setting back the original Elasticsearch information
    if backup_es:
        sf_data["spec"]["elasticsearch"] = backup_es

    full_results = SmallFileResultsAnalyse(uuid, sf_data)

    # Initialize the results doc file.
    for key in environment:
        full_results.add_key(key, environment[key])

    # Calculating the total size of the working data set - in GB
    full_results.add_key(
        "dataset",
        file_size * files * threads * full_results.results["clients"] /
        constants.GB2KB,
    )

    full_results.add_key(
        "global_options",
        {
            "files": files,
            "file_size": file_size,
            "storageclass": sf_data["spec"]["workload"]["args"]["storageclass"],
            "vol_size": sf_data["spec"]["workload"]["args"]["storagesize"],
        },
    )
    while True:
        logs = bench_pod.exec_oc_cmd(
            f"logs {small_file_client_pod}", out_yaml_format=False
        )
        if "RUN STATUS DONE" in logs:
            # Getting the end time of the test
            end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
            full_results.add_key(
                "test_time", {"start": start_time, "end": end_time}
            )
            # If an internal ES exists, copy all data from it to the main ES
            if es:
                log.info("Copy all data from the internal ES to the main ES")
                es._copy(full_results.es)
            full_results.read()
            if not full_results.dont_check:
                full_results.add_key("hosts", full_results.get_clients_list())
                full_results.init_full_results()
                full_results.aggregate_host_results()
                test_status = full_results.aggregate_samples_results()
                full_results.es_write()

                # Creating a full link to the results on the ES server
                log.info(
                    f"The result can be found at: {full_results.results_link()}"
                )
            else:
                test_status = True
            break
        if timeout < (time.time() - test_start_time):
            raise TimeoutError("Timed out waiting for benchmark to complete")
        time.sleep(30)
    assert not get_logs_with_errors() and test_status, "Test Failed"
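# Minimal sketch of the volume-sizing rule used in test_smallfile_workload
# above, assuming constants.GB2KB == 1024 ** 2 (KB per GB); the numbers are
# examples only.
GB2KB = 1024 ** 2

def smallfile_vol_size(files, threads, file_size_kb):
    """Return the PVC size in Gi: 3x the working data set, but at least 100."""
    vol_size = int(files * threads * file_size_kb * 3 / GB2KB)
    return max(vol_size, 100)

# 50000 files x 4 threads x 16 KB is roughly 3 GB of data; tripled it is
# still under the floor, so the test would request the minimum 100Gi.
assert smallfile_vol_size(50000, 4, 16) == 100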
def add_capacity(osd_size_capacity_requested):
    """
    Add storage capacity to the cluster

    Args:
        osd_size_capacity_requested (int): Requested OSD size capacity

    Returns:
        int: New storage device set count

    Note:
    "storageDeviceSets->count" represents a set of 3 OSDs.
    That is, if there are 3 OSDs in the system then count will be 1.
    If there are 6 OSDs then count is 2 and so on.
    By changing this value, we can add extra devices to the cluster.
    For example, if we want to expand the cluster by 3 more OSDs in a
    cluster that already has 3 OSDs, we can set count as 2. So, with each
    increase of count by 1, we get 3 extra OSDs added to the cluster.
    This is how we are going to 'add capacity' via automation.
    As we know, OCS uses a 3-way replica; that is, the same data is placed
    in 3 OSDs. Because of this, the total usable capacity for apps from 3
    OSDs will be the size of one OSD (all OSDs are of the same size).
    If we want to add more capacity to the cluster, we need to add 3 OSDs
    of the same size as the original OSDs.
    add_capacity needs to accept the requested capacity as an argument.
    From this we need to arrive at storageDeviceSets->count and then
    "patch" this count to get the required capacity. To do so, we use the
    following formula:
    storageDeviceSets->count = (capacity requested / osd capacity)
                               + existing storageDeviceSets count
    """
    osd_size_existing = get_osd_size()
    device_sets_required = int(osd_size_capacity_requested /
                               osd_size_existing)
    old_storage_devices_sets_count = get_deviceset_count()
    new_storage_devices_sets_count = int(device_sets_required +
                                         old_storage_devices_sets_count)
    lvpresent = localstorage.check_local_volume()
    if lvpresent:
        ocp_obj = OCP(kind='localvolume',
                      namespace=constants.LOCAL_STORAGE_NAMESPACE)
        localvolume_data = ocp_obj.get(resource_name='local-block')
        device_list = localvolume_data['spec']['storageClassDevices'][0][
            'devicePaths']
        final_device_list = localstorage.get_new_device_paths(
            device_sets_required, osd_size_capacity_requested)
        device_list.sort()
        final_device_list.sort()
        if device_list == final_device_list:
            raise ResourceNotFoundError("No extra device found")
        param = f"""[{{ "op": "replace", "path": "/spec/storageClassDevices/0/devicePaths", "value": {final_device_list}}}]"""
        log.info(f"Final device list : {final_device_list}")
        lvcr = localstorage.get_local_volume_cr()
        log.info("Patching Local Volume CR...")
        lvcr.patch(resource_name=lvcr.get()['items'][0]['metadata']['name'],
                   params=param.strip('\n'),
                   format_type='json')
        localstorage.check_pvs_created(
            int(len(final_device_list) / new_storage_devices_sets_count))
    sc = get_storage_cluster()
    # Adding the storage capacity to the cluster
    params = f"""[{{ "op": "replace", "path": "/spec/storageDeviceSets/0/count", "value": {new_storage_devices_sets_count}}}]"""
    sc.patch(resource_name=sc.get()['items'][0]['metadata']['name'],
             params=params.strip('\n'),
             format_type='json')
    return new_storage_devices_sets_count
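# Worked example of the count formula from the docstring above (a sketch,
# not cluster code; the sizes are made up). With 512Gi OSDs, one existing
# device set, and a request to add 1024Gi of raw capacity per replica:
osd_size_existing = 512                 # Gi, size of each current OSD
osd_size_capacity_requested = 1024      # Gi of new capacity requested
existing_count = 1                      # one device set == 3 OSDs (replica 3)

device_sets_required = osd_size_capacity_requested // osd_size_existing  # 2
new_count = existing_count + device_sets_required                        # 3

# Patching storageDeviceSets[0].count from 1 to 3 grows the cluster from
# 3 to 9 OSDs; usable capacity grows by one OSD size per added set because
# of the 3-way replication.
assert (device_sets_required, new_count) == (2, 3)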
def run_pillowfights(self, replicas=1, num_items=None, num_threads=None,
                     timeout=1800):
    """
    Create and run the requested number of pillowfight pods, wait for them
    to complete, then run `oc logs` on the results and save the logs in
    the self.logs directory

    Args:
        replicas (int): Number of pod replicas
        num_items (int): Number of items to be loaded to the cluster
        num_threads (int): Number of threads
        timeout (int): Seconds to wait for the pillowfight pods to finish
    """
    ocp_local = OCP(namespace=self.namespace)
    self.replicas = replicas
    for i in range(self.replicas):
        # load the basic pillowfight template yaml
        pfight = templating.load_yaml(constants.COUCHBASE_PILLOWFIGHT)
        pfight["metadata"]["name"] = "pillowfight-rbd-simple" + f"{i}"
        # change the name
        pfight["spec"]["template"]["spec"]["containers"][0]["command"][2] = (
            f"couchbase://cb-example-000{i}.cb-example."
            f"couchbase-operator-namespace.svc:8091/default?select_bucket=true"
        )
        # num of items
        pfight["spec"]["template"]["spec"]["containers"][0]["command"][
            4] = (str(num_items) if num_items else "20000")
        # num of threads
        pfight["spec"]["template"]["spec"]["containers"][0]["command"][
            13] = (str(num_threads) if num_threads else "20")
        lpillowfight = OCS(**pfight)
        lpillowfight.create()
    self.pods_info = {}

    for pillowfight_pods in TimeoutSampler(
        timeout,
        9,
        get_pod_name_by_pattern,
        "pillowfight",
        constants.COUCHBASE_OPERATOR,
    ):
        try:
            counter = 0
            for pf_pod in pillowfight_pods:
                pod_info = self.up_check.exec_oc_cmd(
                    f"get pods {pf_pod} -o json")
                pf_status = pod_info["status"]["containerStatuses"][0][
                    "state"]
                if "terminated" in pf_status:
                    pf_completion_info = pf_status["terminated"]["reason"]
                    if pf_completion_info == constants.STATUS_COMPLETED:
                        counter += 1
                        self.pods_info.update({pf_pod: pf_completion_info})
                elif "running" in pf_status:
                    pass
            if counter == self.replicas:
                break
        except IndexError:
            log.info("Pillowfight not yet completed")

    log.info(self.pods_info)
    for pod, pf_completion_info in self.pods_info.items():
        if pf_completion_info == "Completed":
            pf_endlog = f"{pod}.log"
            pf_log = join(self.logs, pf_endlog)
            data_from_log = ocp_local.exec_oc_cmd(
                f"logs -f {pod} --ignore-errors", out_yaml_format=False)
            data_from_log = data_from_log.replace("\x00", "")
            with open(pf_log, "w") as fd:
                fd.write(data_from_log)
        elif pf_completion_info == "Error":
            raise Exception("Pillowfight failed to complete")
class PillowFight(object): """ Workload operation using PillowFight This class was modelled after the RipSaw class in this directory. """ MIN_ACCEPTABLE_OPS_PER_SEC = 2000 MAX_ACCEPTABLE_RESPONSE_TIME = 2000 def __init__(self, **kwargs): """ Initializer function Args: kwargs (dict): Following kwargs are valid repo: PillowFight repo to used - a github link branch: branch to use from the repo namespace: namespace for the operator Example Usage: r1 = PillowFight() r1.run_pillowfights() # To run a private yaml my_custom_bench = my_custom_bench.yaml run_cmd('oc apply -f my_custom_bench') # To get pillowfight data from log file data = r1.extract_data(log_file) # To do basic sanity checking of data r1.sanity_check(data) """ self.args = kwargs self.namespace = self.args.get("namespace", "couchbase-operator-namespace") self.ocp = OCP() self.up_check = OCP(namespace=constants.COUCHBASE_OPERATOR) self.logs = tempfile.mkdtemp(prefix="pf_logs_") def run_pillowfights(self, replicas=1, num_items=None, num_threads=None, timeout=1800): """ loop through all the yaml files extracted from the pillowfight repo and run them. Run oc logs on the results and save the logs in self.logs directory Args: replicas (int): Number of pod replicas num_items (int): Number of items to be loaded to the cluster num_threads (int): Number of threads """ ocp_local = OCP(namespace=self.namespace) self.replicas = replicas for i in range(self.replicas): # for basic-fillowfight.yaml pfight = templating.load_yaml(constants.COUCHBASE_PILLOWFIGHT) pfight["metadata"]["name"] = "pillowfight-rbd-simple" + f"{i}" # change the name pfight["spec"]["template"]["spec"]["containers"][0]["command"][2] = ( f"couchbase://cb-example-000{i}.cb-example." f"couchbase-operator-namespace.svc:8091/default?select_bucket=true" ) # num of items pfight["spec"]["template"]["spec"]["containers"][0]["command"][ 4] = (str(num_items) if num_items else "20000") # num of threads pfight["spec"]["template"]["spec"]["containers"][0]["command"][ 13] = (str(num_threads) if num_threads else "20") lpillowfight = OCS(**pfight) lpillowfight.create() self.pods_info = {} for pillowfight_pods in TimeoutSampler( timeout, 9, get_pod_name_by_pattern, "pillowfight", constants.COUCHBASE_OPERATOR, ): try: counter = 0 for pf_pod in pillowfight_pods: pod_info = self.up_check.exec_oc_cmd( f"get pods {pf_pod} -o json") pf_status = pod_info["status"]["containerStatuses"][0][ "state"] if "terminated" in pf_status: pf_completion_info = pf_status["terminated"]["reason"] if pf_completion_info == constants.STATUS_COMPLETED: counter += 1 self.pods_info.update({pf_pod: pf_completion_info}) elif "running" in pf_status: pass if counter == self.replicas: break except IndexError: log.info("Pillowfight not yet completed") log.info(self.pods_info) for pod, pf_completion_info in self.pods_info.items(): if pf_completion_info == "Completed": pf_endlog = f"{pod}.log" pf_log = join(self.logs, pf_endlog) data_from_log = ocp_local.exec_oc_cmd( f"logs -f {pod} --ignore-errors", out_yaml_format=False) data_from_log = data_from_log.replace("\x00", "") with open(pf_log, "w") as fd: fd.write(data_from_log) elif pf_completion_info == "Error": raise Exception("Pillowfight failed to complete") def analyze_all(self): """ Analyze the data extracted into self.logs files """ for path in listdir(self.logs): full_path = join(self.logs, path) log.info(f"Analyzing {full_path}") with open(full_path, "r") as fdesc: data_from_log = fdesc.read() log_data = self.parse_pillowfight_log(data_from_log) self.sanity_check(log_data) 
def sanity_check(self, stats):
    """
    Make sure the worst cases for ops per second and response times
    are within an acceptable range.
    """
    stat1 = min(stats["opspersec"])
    if stat1 < self.MIN_ACCEPTABLE_OPS_PER_SEC:
        raise Exception(f"Worst OPS/SEC value reported is {stat1}")
    stat2 = max(stats["resptimes"].keys()) / 1000
    if stat2 > self.MAX_ACCEPTABLE_RESPONSE_TIME:
        raise Exception(
            f"Worst response time reported is {stat2} milliseconds")

def parse_pillowfight_log(self, data_from_log):
    """
    Parse the log data of a pillowfight pod. Clean up the output from
    oc logs to handle peculiarities in the couchbase log results, and
    generate a summary of the results.

    The dictionary returned has two values; 'opspersec' and 'resptimes'.
    opspersec is a list of ops per second numbers reported.
    resptimes is a dictionary indexed by the max response time of a
    range. Each entry in resptimes contains a minimum response time for
    that range, and a count of how many messages fall within that
    range.

    Args:
        data_from_log (str): log data

    Returns:
        dict: ops per sec and response time information
    """
    # The data in the couchbase logs is kind of abnormal.
    # It contains histograms with invalid unicode characters for yaml
    # output (which is why out_yaml_format=False is used).
    # It also seems to write a block of text inside another block at
    # an unpredictable location; lines in that block that do not parse
    # as valid data are logged and skipped.
    #
    # So what's left is a list of OPS/SEC values and a histogram of
    # response times. This routine organizes that data.
    ops_per_sec = []
    resp_hist = {}
    log.info("*******Couchbase raw output log*********\n"
             f"{data_from_log}")
    lines = data_from_log.split("\n")
    for dline in lines:
        try:
            if dline.startswith("OPS/SEC"):
                dfields = dline.split(" ")
                dnumb = int(dfields[-1].strip())
                ops_per_sec.append(dnumb)
            if re.match("^\\[\\d+ +- \\d+ *\\][um]s \\|#* - \\d+", dline):
                for element in ["[", "]", "|", "-", "#"]:
                    dline = dline.replace(element, " ")
                parts = dline.split()
                i1 = int(parts[0])
                i2 = int(parts[1])
                if parts[2] == "ms":
                    i1 *= 1000
                    i2 *= 1000
                resp_hist[i2] = {"minindx": i1, "number": int(parts[3])}
        except ValueError:
            log.info(f"{dline} -- contains invalid data")
    ret_data = {"opspersec": ops_per_sec, "resptimes": resp_hist}
    return ret_data

def export_pfoutput_to_googlesheet(self, sheet_name, sheet_index):
    """
    Collect pillowfight output to a google spreadsheet

    Args:
        sheet_name (str): Name of the sheet
        sheet_index (int): Index of sheet
    """
    # Collect data and export to Google doc spreadsheet
    g_sheet = GoogleSpreadSheetAPI(sheet_name=sheet_name,
                                   sheet_index=sheet_index)
    log.info("Exporting pf data to google spreadsheet")
    for path in listdir(self.logs):
        full_path = join(self.logs, path)
        with open(full_path, "r") as fdesc:
            data_from_log = fdesc.read()
        log_data = self.parse_pillowfight_log(data_from_log)
        g_sheet.insert_row(
            [
                f"{path}",
                min(log_data["opspersec"]),
                max(log_data["resptimes"].keys()) / 1000,
            ],
            2,
        )
    g_sheet.insert_row(["", "opspersec", "resptimes"], 2)
    # Capturing versions (OCP, OCS and Ceph) and the test run name
    g_sheet.insert_row(
        [
            f"ocp_version:{utils.get_cluster_version()}",
            f"ocs_build_number:{utils.get_ocs_build_number()}",
            f"ceph_version:{utils.get_ceph_version()}",
            f"test_run_name:{utils.get_testrun_name()}",
        ],
        2,
    )

def cleanup(self):
    """
    Remove pillowfight pods and temp files
    """
    rmtree(self.logs)
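# Standalone sketch (not part of the class above) of the histogram-line
# parsing that parse_pillowfight_log() performs; the sample line is made up.
import re

line = "[1  - 2]ms |#### - 37"
if re.match(r"^\[\d+ +- \d+ *\][um]s \|#* - \d+", line):
    for ch in ["[", "]", "|", "-", "#"]:
        line = line.replace(ch, " ")
    low, high, unit, count = line.split()
    low, high = int(low), int(high)
    if unit == "ms":
        low, high = low * 1000, high * 1000  # normalize ms to us
    # yields resp_hist[2000] = {"minindx": 1000, "number": 37}
    print({high: {"minindx": low, "number": int(count)}})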
def teardown_module(): ocs_obj = OCP() ocs_obj.login_as_sa()
class RipSaw(object): """ Workload operation using RipSaw """ def __init__(self, **kwargs): """ Initializer function Args: kwargs (dict): Following kwargs are valid repo: Ripsaw repo to used - a github link branch: branch to use from the repo namespace: namespace for the operator Example Usage: r1 = RipSaw() r1.apply_crd(crd='ripsaw_v1alpha1_ripsaw_crd.yaml') # use oc apply to apply custom modified bench my_custom_bench = my_custom_bench.yaml run_cmd('oc apply -f my_custom_bench') """ self.args = kwargs self.repo = self.args.get( "repo", "https://github.com/cloud-bulldozer/benchmark-operator") self.branch = self.args.get("branch", "master") self.namespace = self.args.get("namespace", RIPSAW_NAMESPACE) self.pgsql_is_setup = False self.ocp = OCP() self.ns_obj = OCP(kind="namespace") self.pod_obj = OCP(namespace=RIPSAW_NAMESPACE, kind="pod") self._create_namespace() self._clone_ripsaw() self.worker_nodes = [node.name for node in get_nodes()] helpers.label_worker_node(self.worker_nodes, label_key="kernel-cache-dropper", label_value="yes") def _create_namespace(self): """ create namespace for RipSaw """ self.ocp.new_project(self.namespace) def _clone_ripsaw(self): """ clone the ripaw repo """ self.dir = tempfile.mkdtemp(prefix="ripsaw_") try: log.info(f"cloning ripsaw in {self.dir}") git_clone_cmd = f"git clone -b {self.branch} {self.repo} " run(git_clone_cmd, shell=True, cwd=self.dir, check=True) self.crd = "resources/crds/" self.operator = "resources/operator.yaml" except (CommandFailed, CalledProcessError) as cf: log.error("Error during cloning of ripsaw repository") raise cf def apply_crd(self, crd): """ Apply the CRD Args: crd (str): Name of file to apply """ self.dir += "/benchmark-operator" run("oc apply -f deploy", shell=True, check=True, cwd=self.dir) run(f"oc apply -f {crd}", shell=True, check=True, cwd=self.dir) run(f"oc apply -f {self.operator}", shell=True, check=True, cwd=self.dir) run( "oc create -f resources/kernel-cache-drop-clusterrole.yaml", shell=True, check=True, cwd=self.dir, ) def get_uuid(self, benchmark): """ Getting the UUID of the test. when ripsaw used for running a benchmark tests, each run get its own UUID, so the results in the elastic-search server can be sorted. Args: benchmark (str): the name of the main pod in the test Return: str: the UUID of the test """ count = 0 while count <= 5: try: output = self.pod_obj.exec_oc_cmd(f"exec {benchmark} -- env") break except CommandFailed: time.sleep(3) count += 1 uuid = "" if output: for line in output.split(): if "uuid=" in line: uuid = line.split("=")[1] break log.info(f"The UUID of the test is : {uuid}") else: log.error(f"Can not get the UUID from {benchmark}") return uuid def cleanup(self): run(f"oc delete -f {self.crd}", shell=True, cwd=self.dir) run(f"oc delete -f {self.operator}", shell=True, cwd=self.dir) run("oc delete -f deploy", shell=True, cwd=self.dir) run_cmd(f"oc delete project {self.namespace}") run( "oc delete -f resources/kernel-cache-drop-clusterrole.yaml", shell=True, check=True, cwd=self.dir, ) self.ns_obj.wait_for_delete(resource_name=self.namespace, timeout=180) # Reset namespace to default switch_to_default_rook_cluster_project() helpers.remove_label_from_worker_node(self.worker_nodes, label_key="kernel-cache-dropper")
def ocs_install_verification( timeout=600, skip_osd_distribution_check=False, ocs_registry_image=None, post_upgrade_verification=False, ): """ Perform steps necessary to verify a successful OCS installation Args: timeout (int): Number of seconds for timeout which will be used in the checks used in this function. skip_osd_distribution_check (bool): If true skip the check for osd distribution. ocs_registry_image (str): Specific image to check if it was installed properly. post_upgrade_verification (bool): Set to True if this function is called after upgrade. """ from ocs_ci.ocs.node import get_typed_nodes from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods from ocs_ci.ocs.cluster import validate_cluster_on_pvc number_of_worker_nodes = len(get_typed_nodes()) namespace = config.ENV_DATA['cluster_namespace'] log.info("Verifying OCS installation") # Verify OCS CSV is in Succeeded phase log.info("verifying ocs csv") operator_selector = get_selector_for_ocs_operator() ocs_package_manifest = PackageManifest( resource_name=defaults.OCS_OPERATOR_NAME, selector=operator_selector, ) channel = config.DEPLOYMENT.get('ocs_csv_channel') ocs_csv_name = ocs_package_manifest.get_current_csv(channel=channel) ocs_csv = CSV(resource_name=ocs_csv_name, namespace=namespace) log.info(f"Check if OCS operator: {ocs_csv_name} is in Succeeded phase.") ocs_csv.wait_for_phase(phase="Succeeded", timeout=timeout) # Verify if OCS CSV has proper version. csv_version = ocs_csv.data['spec']['version'] ocs_version = config.ENV_DATA['ocs_version'] log.info( f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}") assert ocs_version in csv_version, ( f"OCS version: {ocs_version} mismatch with CSV version {csv_version}") # Verify if OCS CSV has the same version in provided CI build. 
ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
    'ocs_registry_image')
if ocs_registry_image and ocs_registry_image.endswith(".ci"):
    ocs_registry_image = ocs_registry_image.split(":")[1]
    log.info(
        f"Check if OCS registry image: {ocs_registry_image} matches with "
        f"CSV: {csv_version}")
    ignore_csv_mismatch = config.DEPLOYMENT.get('ignore_csv_mismatch')
    if ignore_csv_mismatch:
        log.info(
            "The possible mismatch will be ignored as you deployed "
            "a different version than the default version from the CSV")
    else:
        assert ocs_registry_image in csv_version, (
            f"OCS registry image version: {ocs_registry_image} mismatch "
            f"with CSV version {csv_version}")

# Verify OCS Cluster Service (ocs-storagecluster) is Ready
storage_cluster_name = config.ENV_DATA['storage_cluster_name']
log.info("Verifying status of storage cluster: %s", storage_cluster_name)
storage_cluster = StorageCluster(
    resource_name=storage_cluster_name,
    namespace=namespace,
)
log.info(f"Check if StorageCluster: {storage_cluster_name} is in "
         f"Ready phase")
storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

# Verify pods in running state and proper counts
log.info("Verifying pod states and counts")
pod = OCP(kind=constants.POD, namespace=namespace)
# ocs-operator
assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                             selector=constants.OCS_OPERATOR_LABEL,
                             timeout=timeout)
# rook-ceph-operator
assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                             selector=constants.OPERATOR_LABEL,
                             timeout=timeout)
# noobaa
assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                             selector=constants.NOOBAA_APP_LABEL,
                             resource_count=2,
                             timeout=timeout)
# mons
assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                             selector=constants.MON_APP_LABEL,
                             resource_count=3,
                             timeout=timeout)
# csi-cephfsplugin
assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                             selector=constants.CSI_CEPHFSPLUGIN_LABEL,
                             resource_count=number_of_worker_nodes,
                             timeout=timeout)
# csi-cephfsplugin-provisioner
assert pod.wait_for_resource(
    condition=constants.STATUS_RUNNING,
    selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
    resource_count=2,
    timeout=timeout)
# csi-rbdplugin
assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                             selector=constants.CSI_RBDPLUGIN_LABEL,
                             resource_count=number_of_worker_nodes,
                             timeout=timeout)
# csi-rbdplugin-provisioner
assert pod.wait_for_resource(
    condition=constants.STATUS_RUNNING,
    selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
    resource_count=2,
    timeout=timeout)
# osds
osd_count = (
    int(storage_cluster.data['spec']['storageDeviceSets'][0]['count']) *
    int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica']))
assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                             selector=constants.OSD_APP_LABEL,
                             resource_count=osd_count,
                             timeout=timeout)
# mgr
assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                             selector=constants.MGR_APP_LABEL,
                             timeout=timeout)
# mds
assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                             selector=constants.MDS_APP_LABEL,
                             resource_count=2,
                             timeout=timeout)

# RGW check only for VMware
if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.RGW_APP_LABEL,
                                 resource_count=1,
                                 timeout=timeout)

# Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
log.info("Verifying storage classes")
storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
storage_cluster_name = config.ENV_DATA['storage_cluster_name'] 
required_storage_classes = { f'{storage_cluster_name}-cephfs', f'{storage_cluster_name}-ceph-rbd' } storage_classes = storage_class.get() storage_class_names = { item['metadata']['name'] for item in storage_classes['items'] } assert required_storage_classes.issubset(storage_class_names) # Verify OSDs are distributed if not skip_osd_distribution_check: log.info("Verifying OSDs are distributed evenly across worker nodes") ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace) osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items'] deviceset_count = get_deviceset_count() node_names = [osd['spec']['nodeName'] for osd in osds] for node in node_names: assert not node_names.count(node) > deviceset_count, ( "OSD's are not distributed evenly across worker nodes") # Verify that CSI driver object contains provisioner names log.info("Verifying CSI driver object contains provisioner names.") csi_driver = OCP(kind="CSIDriver") assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == ({ item['metadata']['name'] for item in csi_driver.get()['items'] }) # Verify node and provisioner secret names in storage class log.info("Verifying node and provisioner secret names in storage class.") sc_rbd = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_RBD) sc_cephfs = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS) assert sc_rbd['parameters'][ 'csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET assert sc_rbd['parameters'][ 'csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET assert sc_cephfs['parameters'][ 'csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET assert sc_cephfs['parameters'][ 'csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET log.info("Verified node and provisioner secret names in storage class.") # Verify ceph osd tree output log.info( "Verifying ceph osd tree output and checking for device set PVC names " "in the output.") if config.DEPLOYMENT.get('local_storage'): deviceset_pvcs = get_compute_node_names() else: deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()] ct_pod = get_ceph_tools_pod() osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json') schemas = { 'root': constants.OSD_TREE_ROOT, 'rack': constants.OSD_TREE_RACK, 'host': constants.OSD_TREE_HOST, 'osd': constants.OSD_TREE_OSD, 'region': constants.OSD_TREE_REGION, 'zone': constants.OSD_TREE_ZONE } schemas['host']['properties']['name'] = {'enum': deviceset_pvcs} for item in osd_tree['nodes']: validate(instance=item, schema=schemas[item['type']]) if item['type'] == 'host': deviceset_pvcs.remove(item['name']) assert not deviceset_pvcs, ( f"These device set PVCs are not given in ceph osd tree output " f"- {deviceset_pvcs}") log.info( "Verified ceph osd tree output. 
Device set PVC names are given in the " "output.") # TODO: Verify ceph osd tree output have osd listed as ssd # TODO: Verify ceph osd tree output have zone or rack based on AZ # Verify CSI snapshotter sidecar container is not present log.info("Verifying CSI snapshotter is not present.") provisioner_pods = get_all_pods( namespace=defaults.ROOK_CLUSTER_NAMESPACE, selector=[ constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL, constants.CSI_RBDPLUGIN_PROVISIONER_LABEL ]) for pod_obj in provisioner_pods: pod_info = pod_obj.get() for container, image in get_images(data=pod_info).items(): assert ('snapshot' not in container) and ( 'snapshot' not in image), ( f"Snapshot container is present in {pod_obj.name} pod. " f"Container {container}. Image {image}") deployments = ocs_csv.get()['spec']['install']['spec']['deployments'] rook_ceph_operator_deployment = [ deployment_val for deployment_val in deployments if deployment_val['name'] == 'rook-ceph-operator' ] assert { 'name': 'CSI_ENABLE_SNAPSHOTTER', 'value': 'false' } in (rook_ceph_operator_deployment[0]['spec']['template']['spec'] ['containers'][0]['env'] ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'." log.info("Verified: CSI snapshotter is not present.") # Verify pool crush rule is with "type": "zone" if utils.get_az_count() == 3: log.info("Verifying pool crush rule is with type: zone") crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd crush dump', format='') pool_names = [ constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL, constants.DATA_POOL ] crush_rules = [ rule for rule in crush_dump['rules'] if rule['rule_name'] in pool_names ] for crush_rule in crush_rules: assert [ item for item in crush_rule['steps'] if item.get('type') == 'zone' ], f"{crush_rule['rule_name']} is not with type as zone" log.info("Verified - pool crush rule is with type: zone") log.info("Validate cluster on PVC") validate_cluster_on_pvc() # Verify ceph health log.info("Verifying ceph health") health_check_tries = 20 health_check_delay = 30 if post_upgrade_verification: # In case of upgrade with FIO we have to wait longer time to see # health OK. See discussion in BZ: # https://bugzilla.redhat.com/show_bug.cgi?id=1817727 health_check_tries = 180 assert utils.ceph_health_check(namespace, health_check_tries, health_check_delay)
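# Quick sketch of the OSD math used in ocs_install_verification above
# (illustrative numbers only). With storageDeviceSets count=2 and replica=3
# there are 6 OSD pods, and the even-distribution check tolerates at most
# `count` OSDs per worker node.
count, replica = 2, 3
osd_count = count * replica  # 6 OSD pods expected in total
node_names = ['worker-0', 'worker-1', 'worker-2',
              'worker-0', 'worker-1', 'worker-2']
deviceset_count = count
for node in set(node_names):
    assert node_names.count(node) <= deviceset_count, (
        f"Too many OSDs scheduled on {node}")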
def __init__(self, *args, **kwargs): """ Constructor for the MCG class """ self.namespace = config.ENV_DATA["cluster_namespace"] self.operator_pod = Pod(**get_pods_having_label( constants.NOOBAA_OPERATOR_POD_LABEL, self.namespace)[0]) self.core_pod = Pod(**get_pods_having_label( constants.NOOBAA_CORE_POD_LABEL, self.namespace)[0]) self.retrieve_noobaa_cli_binary() """ The certificate will be copied on each mcg_obj instantiation since the process is so light and quick, that the time required for the redundant copy is neglible in comparison to the time a hash comparison will take. """ retrieve_default_ingress_crt() get_noobaa = OCP(kind="noobaa", namespace=self.namespace).get() self.s3_endpoint = (get_noobaa.get("items")[0].get("status").get( "services").get("serviceS3").get("externalDNS")[0]) self.s3_internal_endpoint = (get_noobaa.get("items")[0].get( "status").get("services").get("serviceS3").get("internalDNS")[0]) self.mgmt_endpoint = (get_noobaa.get("items")[0].get("status").get( "services").get("serviceMgmt").get("externalDNS")[0]) + "/rpc" self.region = config.ENV_DATA["region"] creds_secret_name = (get_noobaa.get("items")[0].get("status").get( "accounts").get("admin").get("secretRef").get("name")) secret_ocp_obj = OCP(kind="secret", namespace=self.namespace) creds_secret_obj = secret_ocp_obj.get(creds_secret_name) self.access_key_id = base64.b64decode( creds_secret_obj.get("data").get("AWS_ACCESS_KEY_ID")).decode( "utf-8") self.access_key = base64.b64decode( creds_secret_obj.get("data").get("AWS_SECRET_ACCESS_KEY")).decode( "utf-8") self.noobaa_user = base64.b64decode( creds_secret_obj.get("data").get("email")).decode("utf-8") self.noobaa_password = base64.b64decode( creds_secret_obj.get("data").get("password")).decode("utf-8") self.noobaa_token = self.retrieve_nb_token() self.s3_resource = boto3.resource( "s3", verify=retrieve_verification_mode(), endpoint_url=self.s3_endpoint, aws_access_key_id=self.access_key_id, aws_secret_access_key=self.access_key, ) self.s3_client = self.s3_resource.meta.client if config.ENV_DATA["platform"].lower() == "aws" and kwargs.get( "create_aws_creds"): ( self.cred_req_obj, self.aws_access_key_id, self.aws_access_key, ) = self.request_aws_credentials() self.aws_s3_resource = boto3.resource( "s3", endpoint_url="https://s3.amazonaws.com", aws_access_key_id=self.aws_access_key_id, aws_secret_access_key=self.aws_access_key, ) if (config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS or storagecluster_independent_check()): if not config.ENV_DATA["platform"] == constants.AZURE_PLATFORM and ( version.get_semantic_ocs_version_from_config() > version.VERSION_4_5): logger.info("Checking whether RGW pod is not present") pods = pod.get_pods_having_label(label=constants.RGW_APP_LABEL, namespace=self.namespace) assert ( not pods ), "RGW pods should not exist in the current platform/cluster" elif config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS: rgw_count = get_rgw_count(config.ENV_DATA["ocs_version"], check_if_cluster_was_upgraded(), None) logger.info( f'Checking for RGW pod/s on {config.ENV_DATA.get("platform")} platform' ) rgw_pod = OCP(kind=constants.POD, namespace=self.namespace) assert rgw_pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.RGW_APP_LABEL, resource_count=rgw_count, timeout=60, )
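# Hedged usage sketch: once an MCG object is constructed as above, its boto3
# resource can talk to the NooBaa S3 endpoint directly. The `mcg` variable
# and helper name are hypothetical; buckets.all() is standard boto3 API.
def list_mcg_buckets(mcg):
    # mcg.s3_resource was built with endpoint_url=mcg.s3_endpoint and the
    # admin credentials decoded from the noobaa-admin secret
    return [bucket.name for bucket in mcg.s3_resource.buckets.all()]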
class RipSaw(object): """ Workload operation using RipSaw """ def __init__(self, **kwargs): """ Initializer function Args: kwargs (dict): Following kwargs are valid repo: Ripsaw repo to used - a github link branch: branch to use from the repo namespace: namespace for the operator Example Usage: r1 = RipSaw() r1.apply_crd(crd='ripsaw_v1alpha1_ripsaw_crd.yaml') # use oc apply to apply custom modified bench my_custom_bench = my_custom_bench.yaml run_cmd('oc apply -f my_custom_bench') """ self.args = kwargs self.repo = self.args.get('repo', 'https://github.com/cloud-bulldozer/ripsaw') self.branch = self.args.get('branch', 'master') self.namespace = self.args.get('namespace', RIPSAW_NAMESPACE) self.pgsql_is_setup = False self.ocp = OCP() self.ns_obj = OCP(kind='namespace') self.pod_obj = OCP(kind='pod') self._create_namespace() self._clone_ripsaw() def _create_namespace(self): """ create namespace for RipSaw """ self.ocp.new_project(self.namespace) def _clone_ripsaw(self): """ clone the ripaw repo """ self.dir = tempfile.mkdtemp(prefix='ripsaw_') try: log.info(f'cloning ripsaw in {self.dir}') git_clone_cmd = f'git clone -b {self.branch} {self.repo} ' run(git_clone_cmd, shell=True, cwd=self.dir, check=True) self.crd = 'resources/crds/' self.operator = 'resources/operator.yaml' except (CommandFailed, CalledProcessError) as cf: log.error('Error during cloning of ripsaw repository') raise cf def apply_crd(self, crd): """ Apply the CRD Args: crd (str): Name of file to apply """ self.dir += '/ripsaw' run('oc apply -f deploy', shell=True, check=True, cwd=self.dir) run(f'oc apply -f {crd}', shell=True, check=True, cwd=self.dir) run(f'oc apply -f {self.operator}', shell=True, check=True, cwd=self.dir) def cleanup(self): run(f'oc delete -f {self.crd}', shell=True, cwd=self.dir) run(f'oc delete -f {self.operator}', shell=True, cwd=self.dir) run('oc delete -f deploy', shell=True, cwd=self.dir) run_cmd(f'oc delete project {self.namespace}') self.ns_obj.wait_for_delete(resource_name=self.namespace) # Reset namespace to default switch_to_default_rook_cluster_project()
def ocs_install_verification(timeout=600, skip_osd_distribution_check=False): """ Perform steps necessary to verify a successful OCS installation Args: timeout (int): Number of seconds for timeout which will be used in the checks used in this function. skip_osd_distribution_check (bool): If true skip the check for osd distribution. """ from ocs_ci.ocs.node import get_typed_nodes number_of_worker_nodes = len(get_typed_nodes()) namespace = config.ENV_DATA['cluster_namespace'] log.info("Verifying OCS installation") # Verify OCS CSV is in Succeeded phase log.info("verifying ocs csv") ocs_package_manifest = PackageManifest( resource_name=defaults.OCS_OPERATOR_NAME ) ocs_csv_name = ocs_package_manifest.get_current_csv() ocs_csv = CSV( resource_name=ocs_csv_name, namespace=namespace ) log.info(f"Check if OCS operator: {ocs_csv_name} is in Succeeded phase.") ocs_csv.wait_for_phase(phase="Succeeded", timeout=timeout) # Verify OCS Cluster Service (ocs-storagecluster) is Ready storage_cluster_name = config.ENV_DATA['storage_cluster_name'] log.info("Verifying status of storage cluster: %s", storage_cluster_name) storage_cluster = StorageCluster( resource_name=storage_cluster_name, namespace=namespace, ) log.info( f"Check if StorageCluster: {storage_cluster_name} is in" f"Succeeded phase" ) storage_cluster.wait_for_phase(phase='Ready', timeout=timeout) # Verify pods in running state and proper counts log.info("Verifying pod states and counts") pod = OCP( kind=constants.POD, namespace=namespace ) # ocs-operator assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.OCS_OPERATOR_LABEL, timeout=timeout ) # rook-ceph-operator assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.OPERATOR_LABEL, timeout=timeout ) # noobaa assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.NOOBAA_APP_LABEL, resource_count=2, timeout=timeout ) # mons assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.MON_APP_LABEL, resource_count=3, timeout=timeout ) # csi-cephfsplugin assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.CSI_CEPHFSPLUGIN_LABEL, resource_count=number_of_worker_nodes, timeout=timeout ) # csi-cephfsplugin-provisioner assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL, resource_count=2, timeout=timeout ) # csi-rbdplugin assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.CSI_RBDPLUGIN_LABEL, resource_count=number_of_worker_nodes, timeout=timeout ) # csi-rbdplugin-profisioner assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL, resource_count=2, timeout=timeout ) # osds osd_count = storage_cluster.data['spec']['storageDeviceSets'][0]['count'] assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.OSD_APP_LABEL, resource_count=osd_count, timeout=timeout ) # mgr assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.MGR_APP_LABEL, timeout=timeout ) # mds assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.MDS_APP_LABEL, resource_count=2, timeout=timeout ) # rgw check only for VmWare if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM: assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.RGW_APP_LABEL, resource_count=1, timeout=timeout ) # Verify ceph health 
log.info("Verifying ceph health") assert utils.ceph_health_check(namespace=namespace) # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd) log.info("Verifying storage classes") storage_class = OCP( kind=constants.STORAGECLASS, namespace=namespace ) storage_cluster_name = config.ENV_DATA['storage_cluster_name'] required_storage_classes = { f'{storage_cluster_name}-cephfs', f'{storage_cluster_name}-ceph-rbd' } storage_classes = storage_class.get() storage_class_names = { item['metadata']['name'] for item in storage_classes['items'] } assert required_storage_classes.issubset(storage_class_names) # Verify OSD's are distributed if not skip_osd_distribution_check: log.info("Verifying OSD's are distributed evenly across worker nodes") ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace) osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items'] node_names = [osd['spec']['nodeName'] for osd in osds] for node in node_names: assert not node_names.count(node) > 1, ( "OSD's are not distributed evenly across worker nodes" )
def test_smallfile_workload(self, ripsaw, es, file_size, files, threads, samples, interface): """ Run SmallFile Workload """ # Loading the main template yaml file for the benchmark sf_data = templating.load_yaml(constants.SMALLFILE_BENCHMARK_YAML) # getting the name and email of the user that running the test. try: user = run_cmd('git config --get user.name').strip() email = run_cmd('git config --get user.email').strip() except CommandFailed: # if no git user define, use the default user from the CR file user = sf_data['spec']['test_user'] email = '' # Saving the Original elastic-search IP and PORT - if defined in yaml es_server = "" es_port = "" if 'elasticsearch' in sf_data['spec']: if 'server' in sf_data['spec']['elasticsearch']: es_server = sf_data['spec']['elasticsearch']['server'] if 'port' in sf_data['spec']['elasticsearch']: es_port = sf_data['spec']['elasticsearch']['port'] else: sf_data['spec']['elasticsearch'] = {} # Use the internal define elastic-search server in the test sf_data['spec']['elasticsearch'] = { 'server': es.get_ip(), 'port': es.get_port() } log.info("Apply Operator CRD") ripsaw.apply_crd('resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml') if interface == constants.CEPHBLOCKPOOL: storageclass = constants.DEFAULT_STORAGECLASS_RBD else: storageclass = constants.DEFAULT_STORAGECLASS_CEPHFS log.info(f"Using {storageclass} Storageclass") sf_data['spec']['workload']['args']['storageclass'] = storageclass log.info("Running SmallFile bench") """ Setting up the parameters for this test """ sf_data['spec']['workload']['args']['file_size'] = file_size sf_data['spec']['workload']['args']['files'] = files sf_data['spec']['workload']['args']['threads'] = threads sf_data['spec']['workload']['args']['samples'] = samples sf_data['spec']['clustername'] = get_clustername() sf_data['spec']['test_user'] = f'{user}<{email}>' """ Calculating the size of the volume that need to be test, it should be at least twice in the size then the size of the files, and at least 100Gi. Since the file_size is in Kb and the vol_size need to be in Gb, more calculation is needed. """ vol_size = int(files * threads * file_size * 3) vol_size = int(vol_size / constants.GB2KB) if vol_size < 100: vol_size = 100 sf_data['spec']['workload']['args']['storagesize'] = f"{vol_size}Gi" sf_obj = OCS(**sf_data) sf_obj.create() log.info(f'The smallfile yaml file is {sf_data}') # wait for benchmark pods to get created - takes a while for bench_pod in TimeoutSampler(240, 10, get_pod_name_by_pattern, 'smallfile-client', constants.RIPSAW_NAMESPACE): try: if bench_pod[0] is not None: small_file_client_pod = bench_pod[0] break except IndexError: log.info("Bench pod not ready yet") bench_pod = OCP(kind='pod', namespace=constants.RIPSAW_NAMESPACE) log.info("Waiting for SmallFile benchmark to Run") assert bench_pod.wait_for_resource(condition=constants.STATUS_RUNNING, resource_name=small_file_client_pod, sleep=30, timeout=600) start_time = time.time() # After testing manually, changing the timeout timeout = 3600 # Getting the UUID from inside the benchmark pod output = bench_pod.exec_oc_cmd(f'exec {small_file_client_pod} -- env') for line in output.split(): if 'uuid=' in line: uuid = line.split('=')[1] log.info(f'the UUID of the test is : {uuid}') # Setting back the original elastic-search information sf_data['spec']['elasticsearch'] = { 'server': es_server, 'port': es_port } full_results = SmallFileResultsAnalyse(uuid, sf_data) # Initialaize the results doc file. 
full_results.add_key('user', sf_data['spec']['test_user']) full_results.add_key('ocp_version', get_ocp_version()) full_results.add_key('ocp_build', get_build()) full_results.add_key('ocp_channel', get_ocp_channel()) # Getting the OCS version (ocs_ver_info, _) = get_ocs_version() ocs_ver_full = ocs_ver_info['status']['desired']['version'] m = re.match(r"(\d.\d).(\d)", ocs_ver_full) if m and m.group(1) is not None: ocs_ver = m.group(1) full_results.add_key('ocs_version', ocs_ver) full_results.add_key('vendor', get_provider()) full_results.add_key( 'start_time', time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime())) # Calculating the total size of the working data set - in GB full_results.add_key( 'dataset', file_size * files * threads * full_results.results['clients'] / constants.GB2KB) full_results.add_key( 'global_options', { 'files': files, 'file_size': file_size, 'storageclass': sf_data['spec']['workload']['args']['storageclass'], 'vol_size': sf_data['spec']['workload']['args']['storagesize'] }) while True: logs = bench_pod.exec_oc_cmd(f'logs {small_file_client_pod}', out_yaml_format=False) if "RUN STATUS DONE" in logs: full_results.add_key( 'end_time', time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime())) full_results.read() if not full_results.dont_check: full_results.add_key('hosts', full_results.get_clients_list()) full_results.init_full_results() full_results.aggregate_host_results() test_status = full_results.aggregate_samples_results() full_results.write() # Creating full link to the results on the ES server res_link = 'http://' res_link += f'{full_results.server}:{full_results.port}/' res_link += f'{full_results.new_index}/_search?q=' res_link += f'uuid:{full_results.uuid}' log.info(f'Full results can be found as : {res_link}') else: test_status = True break if timeout < (time.time() - start_time): raise TimeoutError( "Timed out waiting for benchmark to complete") time.sleep(30) assert (not get_logs_with_errors() and test_status), 'Test Failed'
class OCS(object):
    """
    Base OCSClass
    """
    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            kwargs (dict):
                1) For existing resource, use OCP.reload() to get the
                resource's dictionary and use it to pass as **kwargs
                2) For new resource, use yaml files templates under
                /templates/CSI like:
                obj_dict = load_yaml(
                    os.path.join(
                        TEMPLATE_DIR, "some_resource.yaml"
                    )
                )
        """
        self.data = kwargs
        self._api_version = self.data.get('api_version')
        self._kind = self.data.get('kind')
        self._namespace = None
        if 'metadata' in self.data:
            self._namespace = self.data.get('metadata').get('namespace')
            self._name = self.data.get('metadata').get('name')
        self.ocp = OCP(
            api_version=self._api_version, kind=self.kind,
            namespace=self._namespace
        )
        self.temp_yaml = tempfile.NamedTemporaryFile(
            mode='w+', prefix=self._kind, delete=False
        )
        # This _is_deleted flag is set to True if the delete method was
        # called on an object of this class and was successful.
        self._is_deleted = False

    @property
    def api_version(self):
        return self._api_version

    @property
    def kind(self):
        return self._kind

    @property
    def namespace(self):
        return self._namespace

    @property
    def name(self):
        return self._name

    @property
    def is_deleted(self):
        return self._is_deleted

    def reload(self):
        """
        Reloading the OCS instance with the new information from its actual
        data. After creating a resource from a yaml file, the actual yaml
        file is being changed and more information about the resource is
        added.
        """
        self.data = self.get()
        self.__init__(**self.data)

    def get(self, out_yaml_format=True):
        return self.ocp.get(
            resource_name=self.name, out_yaml_format=out_yaml_format
        )

    def describe(self):
        return self.ocp.describe(resource_name=self.name)

    def create(self, do_reload=True):
        log.info(f"Adding {self.kind} with name {self.name}")
        templating.dump_data_to_temp_yaml(self.data, self.temp_yaml.name)
        status = self.ocp.create(yaml_file=self.temp_yaml.name)
        if do_reload:
            self.reload()
        return status

    def delete(self, wait=True, force=False):
        """
        Delete the OCS object if it's not already deleted
        (using the internal is_deleted flag)

        Args:
            wait (bool): Wait for object to be deleted
            force (bool): Force delete object

        Returns:
            bool: True if deleted, False otherwise
        """
        if self._is_deleted:
            log.info(
                f"Attempt to remove resource: {self.name} which is "
                f"already deleted! Skipping delete of this resource!"
            )
            result = True
        else:
            result = self.ocp.delete(
                resource_name=self.name, wait=wait, force=force
            )
            self._is_deleted = True
        return result

    def apply(self, **data):
        with open(self.temp_yaml.name, 'w') as yaml_file:
            yaml.dump(data, yaml_file)
        assert self.ocp.apply(yaml_file=self.temp_yaml.name), (
            f"Failed to apply changes {data}"
        )
        self.reload()

    def add_label(self, label):
        """
        Adds a new label

        Args:
            label (str): New label to be assigned for this pod
                E.g: "label=app='rook-ceph-mds'"
        """
        status = self.ocp.add_label(resource_name=self.name, label=label)
        self.reload()
        return status

    def delete_temp_yaml_file(self):
        utils.delete_file(self.temp_yaml.name)
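# Minimal usage sketch for the OCS wrapper above; the ConfigMap dict is
# made up and the calls assume a reachable cluster configured for `oc`.
cm_dict = {
    'api_version': 'v1',
    'kind': 'ConfigMap',
    'metadata': {'name': 'demo-cm', 'namespace': 'default'},
    'data': {'key': 'value'},
}
cm = OCS(**cm_dict)
cm.create()           # dumps the dict to a temp yaml and runs `oc create`
cm.reload()           # refreshes self.data with the live resource
cm.delete(wait=True)  # idempotent thanks to the _is_deleted flag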
def setup_cb(self): """ Creating admission parts,couchbase operator pod, couchbase worker secret """ # Create admission controller log.info("Create admission controller process for Couchbase") self.up_adm_chk = OCP(namespace="default") self.up_check = OCP(namespace=constants.COUCHBASE_OPERATOR) self.adm_objects = [] for adm_yaml in self.admission_parts: adm_data = templating.load_yaml(adm_yaml) adm_obj = OCS(**adm_data) adm_obj.create() self.adm_objects.append(adm_obj) # Wait for admission pod to be created for adm_pod in TimeoutSampler( self.WAIT_FOR_TIME, 3, get_pod_name_by_pattern, "couchbase-operator-admission", "default", ): try: if self.is_up_and_running(adm_pod[0], self.up_adm_chk): self.admission_pod = adm_pod[0] break except IndexError: log.info("Admission pod is not ready yet") # Wait for admission pod to be running log.info("Waiting for admission pod to be running") admission_pod_obj = get_pod_obj(self.admission_pod, namespace="default") wait_for_resource_state( resource=admission_pod_obj, state=constants.STATUS_RUNNING, timeout=self.WAIT_FOR_TIME, ) self.ns_obj.new_project(constants.COUCHBASE_OPERATOR) couchbase_data = templating.load_yaml(constants.COUCHBASE_CRD_YAML) self.couchbase_obj = OCS(**couchbase_data) self.couchbase_obj.create() op_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_ROLE) self.operator_role = OCS(**op_data) self.operator_role.create() self.serviceaccount = OCP(namespace=constants.COUCHBASE_OPERATOR) self.serviceaccount.exec_oc_cmd( "create serviceaccount couchbase-operator") dockercfgs = self.serviceaccount.exec_oc_cmd("get secrets") startloc = dockercfgs.find("couchbase-operator-dockercfg") newdockerstr = dockercfgs[startloc:] endloc = newdockerstr.find(" ") dockerstr = newdockerstr[:endloc] self.secretsadder.exec_oc_cmd( f"secrets link serviceaccount/couchbase-operator secrets/{dockerstr}" ) self.rolebinding = OCP(namespace=constants.COUCHBASE_OPERATOR) rolebind_cmd = "".join([ "create rolebinding couchbase-operator-rolebinding ", "--role couchbase-operator ", "--serviceaccount couchbase-operator-namespace:couchbase-operator", ]) self.rolebinding.exec_oc_cmd(rolebind_cmd) dep_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_DEPLOY) self.cb_deploy = OCS(**dep_data) self.cb_deploy.create() # Wait for couchbase operator pod to be running for couchbase_pod in TimeoutSampler( self.WAIT_FOR_TIME, 3, get_pod_name_by_pattern, "couchbase-operator", constants.COUCHBASE_OPERATOR, ): try: if self.is_up_and_running(couchbase_pod[0], self.up_check): break except IndexError: log.info("Couchbase operator is not up") cb_work = templating.load_yaml(constants.COUCHBASE_WORKER_SECRET) self.cb_worker = OCS(**cb_work) self.cb_worker.create()
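# Standalone sketch of the dockercfg secret-name extraction performed in
# setup_cb() above; `dockercfgs` is a fabricated `oc get secrets` output.
dockercfgs = (
    "builder-dockercfg-abcde   kubernetes.io/dockercfg   1   5m\n"
    "couchbase-operator-dockercfg-x1y2z   kubernetes.io/dockercfg   1   5m"
)
startloc = dockercfgs.find("couchbase-operator-dockercfg")
newdockerstr = dockercfgs[startloc:]
dockerstr = newdockerstr[:newdockerstr.find(" ")]
assert dockerstr == "couchbase-operator-dockercfg-x1y2z"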
def test_pvpool_cpu_and_memory_modifications( self, awscli_pod_session, backingstore_factory, bucket_factory, test_directory_setup, mcg_obj_session, ): """ Test to modify the CPU and Memory resource limits for BS and see if its reflecting """ bucketclass_dict = { "interface": "OC", "backingstore_dict": { "pv": [( 1, MIN_PV_BACKINGSTORE_SIZE_IN_GB, "ocs-storagecluster-ceph-rbd", )] }, } bucket = bucket_factory(1, "OC", bucketclass=bucketclass_dict)[0] bucket_name = bucket.name pv_backingstore = bucket.bucketclass.backingstores[0] pv_bs_name = pv_backingstore.name pv_pod_label = f"pool={pv_bs_name}" pv_pod_info = get_pods_having_label( label=pv_pod_label, namespace=config.ENV_DATA["cluster_namespace"])[0] pv_pod_obj = Pod(**pv_pod_info) pv_pod_name = pv_pod_obj.name logger.info( f"Pod created for PV Backingstore {pv_bs_name}: {pv_pod_name}") new_cpu = "500m" new_mem = "500Mi" new_resource_patch = { "spec": { "pvPool": { "resources": { "limits": { "cpu": f"{new_cpu}", "memory": f"{new_mem}", }, "requests": { "cpu": f"{new_cpu}", "memory": f"{new_mem}", }, } } } } try: OCP( namespace=config.ENV_DATA["cluster_namespace"], kind="backingstore", resource_name=pv_bs_name, ).patch(params=json.dumps(new_resource_patch), format_type="merge") except CommandFailed as e: logger.error(f"[ERROR] Failed to patch: {e}") else: logger.info("Patched new resource limits") wait_for_pods_to_be_running( namespace=config.ENV_DATA["cluster_namespace"], pod_names=[pv_pod_name]) pv_pod_ocp_obj = OCP(namespace=config.ENV_DATA["cluster_namespace"], kind="pod").get(resource_name=pv_pod_name) resource_dict = pv_pod_ocp_obj["spec"]["containers"][0]["resources"] assert ( resource_dict["limits"]["cpu"] == new_cpu and resource_dict["limits"]["memory"] == new_mem and resource_dict["requests"]["cpu"] == new_cpu and resource_dict["requests"]["memory"] == new_mem ), "New resource modification in Backingstore is not reflected in PV Backingstore Pod!!" logger.info( "Resource modification reflected in the PV Backingstore Pod!!") # push some data to the bucket file_dir = test_directory_setup.origin_dir copy_random_individual_objects( podobj=awscli_pod_session, file_dir=file_dir, target=f"s3://{bucket_name}", amount=1, s3_obj=OBC(bucket_name), )
def request_aws_credentials(self): """ Uses a CredentialsRequest CR to create an AWS IAM that allows the program to interact with S3 Returns: OCS: The CredentialsRequest resource """ awscreds_data = templating.load_yaml(constants.MCG_AWS_CREDS_YAML) req_name = create_unique_resource_name('awscredreq', 'credentialsrequests') awscreds_data['metadata']['name'] = req_name awscreds_data['metadata']['namespace'] = self.namespace awscreds_data['spec']['secretRef']['name'] = req_name awscreds_data['spec']['secretRef']['namespace'] = self.namespace creds_request = create_resource(**awscreds_data) sleep(5) secret_ocp_obj = OCP(kind='secret', namespace=self.namespace) try: cred_req_secret_dict = secret_ocp_obj.get( resource_name=creds_request.name, retry=5) except CommandFailed: logger.error('Failed to retrieve credentials request secret') raise CredReqSecretNotFound( 'Please make sure that the cluster used is an AWS cluster, ' 'or that the `platform` var in your config is correct.') aws_access_key_id = base64.b64decode( cred_req_secret_dict.get('data').get('aws_access_key_id')).decode( 'utf-8') aws_access_key = base64.b64decode( cred_req_secret_dict.get('data').get( 'aws_secret_access_key')).decode('utf-8') def _check_aws_credentials(): try: sts = boto3.client('sts', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_access_key) sts.get_caller_identity() return True except ClientError: logger.info('Credentials are still not active. Retrying...') return False try: for api_test_result in TimeoutSampler(120, 5, _check_aws_credentials): if api_test_result: logger.info('AWS credentials created successfully.') break except TimeoutExpiredError: logger.error('Failed to create credentials') assert False return creds_request, aws_access_key_id, aws_access_key
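# Sketch of the credential-activation probe used above, runnable on its own
# with real AWS keys. get_caller_identity() is a standard STS call that
# fails with ClientError until newly minted IAM credentials propagate.
import boto3
from botocore.exceptions import ClientError

def aws_creds_are_active(key_id, secret):
    try:
        boto3.client(
            'sts',
            aws_access_key_id=key_id,
            aws_secret_access_key=secret,
        ).get_caller_identity()
        return True
    except ClientError:
        # Credentials are still not active; callers retry, as the
        # TimeoutSampler loop above does every 5 seconds
        return False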
def verify_image_versions(old_images, upgrade_version, version_before_upgrade):
    """
    Verify if all the images of OCS objects got upgraded

    Args:
        old_images (set): set with old images
        upgrade_version (packaging.version.Version): version of OCS
        version_before_upgrade (float): version of OCS before upgrade

    """
    number_of_worker_nodes = len(get_nodes())
    verify_pods_upgraded(old_images, selector=constants.OCS_OPERATOR_LABEL)
    verify_pods_upgraded(old_images, selector=constants.OPERATOR_LABEL)
    default_noobaa_pods = 3
    noobaa_pods = default_noobaa_pods
    if upgrade_version >= parse_version("4.7"):
        noobaa = OCP(kind="noobaa", namespace=config.ENV_DATA["cluster_namespace"])
        resource = noobaa.get()["items"][0]
        endpoints = resource.get("spec", {}).get("endpoints", {})
        max_endpoints = endpoints.get("maxCount", constants.MAX_NB_ENDPOINT_COUNT)
        min_endpoints = endpoints.get(
            "minCount", constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT)
        noobaa_pods = default_noobaa_pods + min_endpoints
    try:
        verify_pods_upgraded(
            old_images,
            selector=constants.NOOBAA_APP_LABEL,
            count=noobaa_pods,
        )
    except TimeoutException as ex:
        if upgrade_version >= parse_version("4.7"):
            log.info(
                "NooBaa pods didn't match. Trying once more with max NooBaa "
                f"endpoints! Exception: {ex}")
            noobaa_pods = default_noobaa_pods + max_endpoints
            verify_pods_upgraded(
                old_images,
                selector=constants.NOOBAA_APP_LABEL,
                count=noobaa_pods,
                timeout=60,
            )
        else:
            raise
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                         count=2)
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
                         count=2)
    if not config.DEPLOYMENT.get("external_mode"):
        verify_pods_upgraded(
            old_images,
            selector=constants.MON_APP_LABEL,
            count=3,
        )
        verify_pods_upgraded(old_images, selector=constants.MGR_APP_LABEL)
        osd_timeout = 600 if upgrade_version >= parse_version("4.5") else 750
        osd_count = get_osd_count()
        verify_pods_upgraded(
            old_images,
            selector=constants.OSD_APP_LABEL,
            count=osd_count,
            timeout=osd_timeout * osd_count,
        )
        verify_pods_upgraded(old_images, selector=constants.MDS_APP_LABEL, count=2)
    if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        rgw_count = get_rgw_count(upgrade_version.base_version, True,
                                  version_before_upgrade)
        verify_pods_upgraded(
            old_images,
            selector=constants.RGW_APP_LABEL,
            count=rgw_count,
        )
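# Minimal sketch of the per-selector image check (assumed; the real
# verify_pods_upgraded helper lives elsewhere in the codebase and also
# handles pod counts and timeouts): list the pods matching a selector and
# assert none of their containers still runs an image from the old set.
def _pods_upgraded_sketch(old_images, selector, namespace=None):
    namespace = namespace or config.ENV_DATA["cluster_namespace"]
    pods = OCP(kind=constants.POD, namespace=namespace).get(
        selector=selector)["items"]
    for pod_data in pods:
        for container in pod_data["spec"]["containers"]:
            assert container["image"] not in old_images, (
                f"Pod {pod_data['metadata']['name']} still uses old image "
                f"{container['image']}"
            )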
class RGW(object):
    """
    Wrapper class for interaction with a cluster's RGW service
    """

    def __init__(self, namespace=None):
        self.namespace = (
            namespace if namespace else config.ENV_DATA["cluster_namespace"]
        )

        if storagecluster_independent_check():
            sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RGW
        else:
            sc_name = constants.DEFAULT_STORAGECLASS_RGW

        self.storageclass = OCP(
            kind="storageclass", namespace=self.namespace, resource_name=sc_name
        )
        sc_data = self.storageclass.get()
        self.s3_internal_endpoint = sc_data.get("parameters").get("endpoint")
        self.region = sc_data.get("parameters").get("region")
        # TODO: Implement retrieval in cases where CephObjectStoreUser is available
        self.key_id = None
        self.secret_key = None
        self.s3_resource = None

    def get_credentials(self, secret_name=constants.NOOBAA_OBJECTSTOREUSER_SECRET):
        """
        Get the endpoint, access key and secret key from an OCS secret. The
        endpoint is taken from the exposed RGW service. Use the rgw_endpoint
        fixture in a test to get it exposed.

        Args:
            secret_name (str): Name of the secret to be used
                for getting the RGW credentials

        Returns:
            tuple: Endpoint, Access key, Secret key

        """
        if (
            secret_name == constants.NOOBAA_OBJECTSTOREUSER_SECRET
            and storagecluster_independent_check()
        ):
            secret_name = constants.EXTERNAL_MODE_NOOBAA_OBJECTSTOREUSER_SECRET

        secret_ocp_obj = OCP(kind=constants.SECRET, namespace=self.namespace)
        route_ocp_obj = OCP(
            kind=constants.ROUTE, namespace=config.ENV_DATA["cluster_namespace"]
        )
        creds_secret_obj = secret_ocp_obj.get(secret_name)
        if config.DEPLOYMENT.get("external_mode"):
            endpoint = route_ocp_obj.get(
                resource_name=constants.RGW_SERVICE_EXTERNAL_MODE
            )
        else:
            endpoint = route_ocp_obj.get(
                resource_name=constants.RGW_SERVICE_INTERNAL_MODE
            )
        endpoint = f"http://{endpoint['status']['ingress'][0]['host']}"
        access_key = base64.b64decode(
            creds_secret_obj.get("data").get("AccessKey")
        ).decode("utf-8")
        secret_key = base64.b64decode(
            creds_secret_obj.get("data").get("SecretKey")
        ).decode("utf-8")
        return (endpoint, access_key, secret_key)
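# Illustrative usage sketch (assumed, not part of the suite): obtain the RGW
# credentials and build a boto3 resource against the exposed RGW endpoint.
rgw = RGW()
endpoint, access_key, secret_key = rgw.get_credentials()
rgw.s3_resource = boto3.resource(
    's3',
    endpoint_url=endpoint,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
)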
def delete(self):
    log.info(f"Cleaning up backingstore {self.name}")

    # If the backingstore utilizes a PV, save its PV name for deletion verification
    if self.type == "pv":
        backingstore_pvc = OCP(
            kind=constants.PVC,
            selector=f"pool={self.name}",
            namespace=config.ENV_DATA["cluster_namespace"],
        ).get()["items"][0]
        pv_name = backingstore_pvc["spec"]["volumeName"]

    if self.method == "oc":
        OCP(
            kind="backingstore", namespace=config.ENV_DATA["cluster_namespace"]
        ).delete(resource_name=self.name)
    elif self.method == "cli":

        def _cli_deletion_flow():
            try:
                self.mcg_obj.exec_mcg_cmd(f"backingstore delete {self.name}")
                return True
            except CommandFailed as e:
                if "being used by one or more buckets" in str(e).lower():
                    log.warning(
                        f"Deletion of {self.name} failed because it's being used by a bucket. "
                        "Retrying..."
                    )
                return False

        sample = TimeoutSampler(
            timeout=120,
            sleep=20,
            func=_cli_deletion_flow,
        )
        if not sample.wait_for_func_status(result=True):
            log.error(f"Failed to delete {self.name}")
            raise TimeoutExpiredError

    # Verify deletion was successful
    log.info(f"Verifying whether backingstore {self.name} exists after deletion")
    bs_deleted_successfully = False

    try:
        if self.method == "oc":
            OCP(
                kind="backingstore",
                namespace=config.ENV_DATA["cluster_namespace"],
                resource_name=self.name,
            ).get()
        elif self.method == "cli":
            self.mcg_obj.exec_mcg_cmd(f"backingstore status {self.name}")
    except CommandFailed as e:
        if "Not Found" in str(e) or "NotFound" in str(e):
            bs_deleted_successfully = True
        else:
            raise

    assert (
        bs_deleted_successfully
    ), f"Backingstore {self.name} was not deleted successfully"

    def _wait_for_pv_backingstore_resource_deleted(namespace=None):
        """
        Wait for the PV backingstore resources to be deleted at the end
        of the test teardown

        Args:
            namespace (str): backing store's namespace

        """
        namespace = namespace or config.ENV_DATA["cluster_namespace"]
        sample = TimeoutSampler(
            timeout=120,
            sleep=15,
            func=_check_resources_deleted,
            namespace=namespace,
        )
        if not sample.wait_for_func_status(result=True):
            log.error(f"{self.name} was not deleted properly, leftovers were found")
            raise TimeoutExpiredError

    def _check_resources_deleted(namespace=None):
        """
        Check whether the resources of the PV pool backingstore were
        deleted properly

        Args:
            namespace (str): backing store's namespace

        Returns:
            bool: True if the PV, PVCs and pods were all deleted

        """
        try:
            OCP(kind=constants.PV, resource_name=pv_name).get()
            log.warning(f"Found PV leftovers belonging to {self.name}")
            return False
        except CommandFailed as e:
            if "not found" in str(e):
                pass
            else:
                raise
        pvcs = get_all_pvcs(namespace=namespace, selector=f"pool={self.name}")
        pods = get_pods_having_label(namespace=namespace, label=f"pool={self.name}")
        return len(pvcs["items"]) == 0 and len(pods) == 0

    if self.type == "pv":
        log.info(f"Waiting for backingstore {self.name} resources to be deleted")
        _wait_for_pv_backingstore_resource_deleted()
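# Reusable sketch of the deletion-verification pattern used above (assumed
# helper name, not part of the suite): a get that fails with "not found"
# proves the resource is gone, while any other failure should propagate.
def resource_is_deleted(kind, resource_name, namespace=None):
    """
    Returns:
        bool: True if the resource no longer exists, False otherwise
    """
    try:
        OCP(
            kind=kind,
            namespace=namespace or config.ENV_DATA["cluster_namespace"],
            resource_name=resource_name,
        ).get()
        return False
    except CommandFailed as e:
        if "not found" in str(e).lower():
            return True
        raise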
class CouchBase(PillowFight):
    """
    CouchBase workload operation
    """
    WAIT_FOR_TIME = 600
    admission_parts = [
        constants.COUCHBASE_ADMISSION_SERVICE_ACCOUNT_YAML,
        constants.COUCHBASE_ADMISSION_CLUSTER_ROLE_YAML,
        constants.COUCHBASE_ADMISSION_CLUSTER_ROLE_BINDING_YAML,
        constants.COUCHBASE_ADMISSION_SECRET_YAML,
        constants.COUCHBASE_ADMISSION_DEPLOYMENT_YAML,
        constants.COUCHBASE_ADMISSION_SERVICE_YAML,
        constants.COUCHBASE_MUTATING_WEBHOOK_YAML,
        constants.COUCHBASE_VALIDATING_WEBHOOK_YAML
    ]
    pod_obj = OCP(kind='pod')
    couchbase_pod = OCP(kind='pod')
    secretsadder = OCP(kind='pod')
    admission_pod = []
    cb_worker = OCS()
    cb_examples = OCS()

    def __init__(self, **kwargs):
        """
        Initializer function
        """
        super().__init__(**kwargs)

    def is_up_and_running(self, pod_name, ocp_value):
        """
        Test if the pod specified is up and running.

        Args:
            pod_name (str): Name of pod being checked.
            ocp_value (object): object used for running oc commands

        Returns:
            bool: True if pod is running, False otherwise
        """
        if not pod_name:
            return False
        pod_info = ocp_value.exec_oc_cmd(f"get pods {pod_name} -o json")
        if pod_info['status']['containerStatuses'][0]['ready']:
            if 'running' in pod_info['status']['containerStatuses'][0][
                    'state']:
                return True
        return False

    def setup_cb(self):
        """
        Create the admission parts, the couchbase operator pod and the
        couchbase worker secret
        """
        # Create admission controller
        log.info("Create admission controller process for Couchbase")
        switch_to_project('default')
        self.up_adm_chk = OCP(namespace="default")
        self.up_check = OCP(namespace=constants.COUCHBASE_OPERATOR)
        for adm_yaml in self.admission_parts:
            adm_data = templating.load_yaml(adm_yaml)
            adm_obj = OCS(**adm_data)
            adm_obj.create()
        # Wait for admission pod to be created
        for adm_pod in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                      get_pod_name_by_pattern,
                                      'couchbase-operator-admission',
                                      'default'):
            try:
                if self.is_up_and_running(adm_pod[0], self.up_adm_chk):
                    self.admission_pod = adm_pod[0]
                    break
            except IndexError:
                log.info("Admission pod is not ready yet")
        # Wait for admission pod to be running
        log.info("Waiting for admission pod to be running")
        self.pod_obj.wait_for_resource(
            condition='Running',
            resource_name=self.admission_pod,
            timeout=self.WAIT_FOR_TIME,
            sleep=10,
        )
        self.pod_obj.new_project(constants.COUCHBASE_OPERATOR)
        couchbase_data = templating.load_yaml(constants.COUCHBASE_CRD_YAML)
        self.couchbase_obj = OCS(**couchbase_data)
        self.couchbase_obj.create()
        op_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_ROLE)
        self.operator_role = OCS(**op_data)
        self.operator_role.create()
        self.serviceaccount = OCP(namespace=constants.COUCHBASE_OPERATOR)
        self.serviceaccount.exec_oc_cmd(
            "create serviceaccount couchbase-operator")
        dockercfgs = self.serviceaccount.exec_oc_cmd("get secrets")
        startloc = dockercfgs.find('couchbase-operator-dockercfg')
        newdockerstr = dockercfgs[startloc:]
        endloc = newdockerstr.find(' ')
        dockerstr = newdockerstr[:endloc]
        self.secretsadder.exec_oc_cmd(
            f"secrets link serviceaccount/couchbase-operator secrets/{dockerstr}"
        )
        self.rolebinding = OCP(namespace=constants.COUCHBASE_OPERATOR)
        rolebind_cmd = "".join([
            "create rolebinding couchbase-operator-rolebinding ",
            "--role couchbase-operator ",
            "--serviceaccount couchbase-operator-namespace:couchbase-operator"
        ])
        self.rolebinding.exec_oc_cmd(rolebind_cmd)
        dep_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_DEPLOY)
        self.cb_deploy = OCS(**dep_data)
        self.cb_deploy.create()
        # Wait for couchbase operator pod to be running
        for couchbase_pod in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                            get_pod_name_by_pattern,
                                            'couchbase-operator',
                                            constants.COUCHBASE_OPERATOR):
            try:
                if self.is_up_and_running(couchbase_pod[0], self.up_check):
                    break
            except IndexError:
                log.info("Couchbase operator is not up")
        cb_work = templating.load_yaml(constants.COUCHBASE_WORKER_SECRET)
        self.cb_worker = OCS(**cb_work)
        self.cb_worker.create()

    def create_couchbase_worker(self, replicas=1):
        """
        Deploy a Couchbase server and pillowfight workload using the operator

        The couchbase workers do not come up unless there is an admission
        controller running. The admission controller is started from the
        default project prior to bringing up the operator. Secrets,
        rolebindings and serviceaccounts also need to be generated.

        Once the couchbase operator is running, we need to wait for the
        worker pods to also be up. Then a pillowfight task is started.

        After the pillowfight task has finished, the log is collected and
        analyzed.

        Raises:
            Exception: If pillowfight results indicate that a minimum
                performance level is not reached (1 second response time,
                less than 1000 ops per second)
        """
        log.info('Creating pods..')
        cb_example = templating.load_yaml(constants.COUCHBASE_WORKER_EXAMPLE)
        cb_example['spec']['servers'][0]['size'] = replicas
        self.cb_examples = OCS(**cb_example)
        self.cb_examples.create()

        # Wait for all of the worker pods to be running.
        log.info('Waiting for the pods to reach Running state')
        for cb_wrk_pods in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                          get_pod_name_by_pattern,
                                          'cb-example',
                                          constants.COUCHBASE_OPERATOR):
            try:
                if len(cb_wrk_pods) == replicas:
                    counter = 0
                    for cb_pod in cb_wrk_pods:
                        if self.is_up_and_running(cb_pod, self.up_check):
                            counter += 1
                            log.info(f'Couchbase worker {cb_pod} is up')
                    if counter == replicas:
                        break
            except IndexError:
                log.info(
                    f'Expected number of couchbase pods is {replicas} '
                    f'but only found {len(cb_wrk_pods)}')

    def run_workload(self, replicas):
        """
        Run the workload with the pillowfight operator

        Args:
            replicas (int): Number of pods
        """
        log.info('Running IOs...')
        PillowFight.run_pillowfights(self, replicas=replicas)

    def analyze_run(self):
        """
        Analyze the workload run logs
        """
        log.info('Analyzing workload run logs..')
        PillowFight.analyze_all(self)

    def teardown(self):
        """
        Delete the objects created, in roughly the reverse order of how
        they were created.
        """
        self.cb_examples.delete()
        self.cb_worker.delete()
        self.cb_deploy.delete()
        self.pod_obj.exec_oc_cmd(
            command="delete rolebinding couchbase-operator-rolebinding")
        self.pod_obj.exec_oc_cmd(
            command="delete serviceaccount couchbase-operator")
        self.operator_role.delete()
        self.couchbase_obj.delete()
        switch_to_project('default')
        self.pod_obj.delete_project(constants.COUCHBASE_OPERATOR)
        for adm_yaml in self.admission_parts:
            adm_data = templating.load_yaml(adm_yaml)
            adm_obj = OCS(**adm_data)
            adm_obj.delete()
        # Before the code below was added, the teardown task would sometimes
        # fail with leftover objects because it would still see one of the
        # couchbase pods.
        for admin_pod in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                        get_pod_name_by_pattern,
                                        'couchbase', 'default'):
            if admin_pod:
                continue
            else:
                break
        PillowFight.cleanup(self)
        switch_to_default_rook_cluster_project()
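# Illustrative end-to-end usage sketch (assumed, not part of the suite):
# drive the CouchBase workload from a test, using only the methods defined
# on the class above.
cb = CouchBase()
cb.setup_cb()
cb.create_couchbase_worker(replicas=3)
cb.run_workload(replicas=3)
cb.analyze_run()
cb.teardown()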
def test_rgw_host_node_failure(
    self, nodes, node_restart_teardown, node_drain_teardown, mcg_obj, bucket_factory
):
    """
    Test case to fail the node where RGW and the NooBaa DB are hosted
    and verify that the new pods spin up on a healthy node
    """
    # Get noobaa pods
    noobaa_pod_obj = get_noobaa_pods()

    # Get the node where noobaa-db is hosted
    noobaa_pod_node = None
    for noobaa_pod in noobaa_pod_obj:
        if noobaa_pod.name in [
            constants.NB_DB_NAME_46_AND_BELOW,
            constants.NB_DB_NAME_47_AND_ABOVE,
        ]:
            noobaa_pod_node = get_pod_node(noobaa_pod)
    assert noobaa_pod_node is not None, "Could not find the NooBaa DB pod"

    # Validate that the RGW pod and noobaa-db are hosted on the same node.
    # If not, make sure both pods are hosted on the same node.
    log.info("Validate if the RGW pod and noobaa-db are hosted on the same node")
    rgw_pod_obj = get_rgw_pods()
    rgw_pod_node_list = [
        rgw_pod.get().get("spec").get("nodeName") for rgw_pod in rgw_pod_obj
    ]
    if noobaa_pod_node.name not in rgw_pod_node_list:
        log.info(
            "Cordon the other worker nodes so that the RGW "
            "pod moves to the node where the NooBaa DB pod is hosted"
        )
        worker_node_list = get_worker_nodes()
        node_names = list(set(worker_node_list) - {noobaa_pod_node.name})
        unschedule_nodes(node_names=node_names)
        ocp_obj = OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        rgw_pod_obj[0].delete()
        ocp_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_count=len(rgw_pod_obj),
            selector=constants.RGW_APP_LABEL,
            timeout=300,
            sleep=5,
        )
        log.info("Schedule those nodes again")
        schedule_nodes(node_names=node_names)

        # Check that ceph health is OK
        ceph_health_check(tries=90, delay=15)

        # Verify all storage pods are running
        wait_for_storage_pods()

        # Check again that the RGW pod moved to the node hosting the NooBaa DB pod
        rgw_pod_obj_list = get_rgw_pods()
        rgw_pod_node_list = [
            get_pod_node(rgw_pod_obj) for rgw_pod_obj in rgw_pod_obj_list
        ]
        assert any(
            rgw_pod_node.name == noobaa_pod_node.name
            for rgw_pod_node in rgw_pod_node_list
        ), (
            "RGW pod didn't move to the node where the NooBaa DB pod is "
            "hosted, even after cordoning and uncordoning nodes. "
            f"RGW pods hosted on: {[node.name for node in rgw_pod_node_list]}. "
            f"NooBaa DB pod hosted on: {noobaa_pod_node.name}"
        )

    log.info(
        "RGW and noobaa-db are hosted on the same node, starting the test execution"
    )
    rgw_pod_obj = get_rgw_pods()
    for rgw_pod in rgw_pod_obj:
        pod_node = rgw_pod.get().get("spec").get("nodeName")
        if pod_node == noobaa_pod_node.name:
            # Stop the node
            log.info(
                f"Stopping node {pod_node} where"
                f" rgw pod {rgw_pod.name} and the NooBaa DB are hosted"
            )
            node_obj = get_node_objs(node_names=[pod_node])
            nodes.stop_nodes(node_obj)

            # Validate that the old rgw pod reached Terminating state
            wait_for_resource_state(
                resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
            )

            # Validate that a new rgw pod spun up
            ocp_obj = OCP(
                kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_count=len(rgw_pod_obj),
                selector=constants.RGW_APP_LABEL,
            )

            # Start the node
            nodes.start_nodes(node_obj)

            # Check that ceph health is OK
            ceph_health_check(tries=90, delay=15)

            # Verify all storage pods are running
            wait_for_storage_pods()

    # Create OBC and perform reads and writes
    self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

    # Verify cluster health
    self.sanity_helpers.health_check()
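# Sketch of the cordon-and-reschedule pattern used in the test above (assumed
# helper name, not part of the suite): cordon every worker except the target
# node, delete the pod so the scheduler recreates it on the target, then
# uncordon the rest.
def force_pod_onto_node(pod_obj, target_node_name, selector, resource_count=1):
    other_nodes = list(set(get_worker_nodes()) - {target_node_name})
    unschedule_nodes(node_names=other_nodes)
    # Deleting the pod forces a reschedule; only the target node is schedulable
    pod_obj.delete()
    OCP(
        kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
    ).wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=selector,
        resource_count=resource_count,
        timeout=300,
        sleep=5,
    )
    schedule_nodes(node_names=other_nodes)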