def test_rgw_pod_existence(self):
    """
    Verify that RGW pod presence matches the platform type.

    On cloud platforms (or independent-mode clusters) RGW pods must not
    exist — except on Azure/IBM Cloud or OCS <= 4.5, where the check is
    skipped.  On on-prem platforms (non MCG-only deployments), the
    expected number of RGW pods must be up and running.
    """
    if (
        config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS
        or storagecluster_independent_check()
    ):
        # Azure and IBM Cloud (and anything at OCS <= 4.5) are exempt
        # from the "no RGW pods" rule.
        # NOTE(review): this comparison uses the raw platform value while
        # the outer check lower()-cases it — confirm casing of the
        # constants matches ENV_DATA.
        if (
            config.ENV_DATA["platform"]
            not in (constants.AZURE_PLATFORM, constants.IBMCLOUD_PLATFORM)
            and version.get_semantic_ocs_version_from_config() > version.VERSION_4_5
        ):
            logger.info("Checking whether RGW pod is not present")
            assert (
                not pod.get_rgw_pods()
            ), "RGW pods should not exist in the current platform/cluster"
    elif config.ENV_DATA.get(
        "platform"
    ) in constants.ON_PREM_PLATFORMS and not config.ENV_DATA.get(
        "mcg_only_deployment"
    ):
        # .get() avoids a KeyError when "mcg_only_deployment" is not set
        # in ENV_DATA (a missing key means a regular deployment).
        # Expected RGW pod count depends on OCS version and upgrade history.
        rgw_count = get_rgw_count(
            config.ENV_DATA["ocs_version"], check_if_cluster_was_upgraded(), None
        )
        logger.info(
            f'Checking for RGW pod/s on {config.ENV_DATA.get("platform")} platform'
        )
        rgw_pod = OCP(
            kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"]
        )
        assert rgw_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.RGW_APP_LABEL,
            resource_count=rgw_count,
            timeout=60,
        )
def verify_multus_network():
    """
    Verify Multus network(s) created successfully and are present on
    relevant pods.

    Checks that the public NetworkAttachmentDefinition exists, that all
    ceph and CSI pods carry the multus public network annotation, and
    that the StorageCluster spec selects the multus provider.
    """
    with open(constants.MULTUS_YAML, mode="r") as f:
        # safe_load: no arbitrary-object construction, and no
        # Loader-less yaml.load() deprecation warning.
        multus_public_data = yaml.safe_load(f)

    multus_namespace = multus_public_data["metadata"]["namespace"]
    multus_name = multus_public_data["metadata"]["name"]
    multus_public_network_name = f"{multus_namespace}/{multus_name}"

    def _assert_public_net_annotation(pods):
        # Every pod must carry the multus public network annotation.
        for _pod in pods:
            assert (
                _pod.data["metadata"]["annotations"]["k8s.v1.cni.cncf.io/networks"]
                == multus_public_network_name
            )

    log.info("Verifying multus NetworkAttachmentDefinitions")
    # Raises if the NAD resource does not exist.
    ocp.OCP(
        resource_name=multus_public_network_name,
        kind="network-attachment-definitions",
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    # TODO: also check if private NAD exists

    log.info("Verifying multus public network exists on ceph pods")
    _assert_public_net_annotation(get_osd_pods())
    # TODO: also check private network if it exists on OSD pods
    _assert_public_net_annotation(
        [*get_mon_pods(), *get_mds_pods(), *get_mgr_pods(), *get_rgw_pods()]
    )

    log.info("Verifying multus public network exists on CSI pods")
    csi_pods = []
    for interface in (constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM):
        csi_pods += get_plugin_pods(interface)
    csi_pods += get_cephfsplugin_provisioner_pods()
    csi_pods += get_rbdfsplugin_provisioner_pods()
    _assert_public_net_annotation(csi_pods)

    log.info("Verifying StorageCluster multus network data")
    sc = get_storage_cluster()
    sc_data = sc.get().get("items")[0]
    network_data = sc_data["spec"]["network"]
    assert network_data["provider"] == "multus"
    selectors = network_data["selectors"]
    assert selectors["public"] == f"{defaults.ROOK_CLUSTER_NAMESPACE}/ocs-public"
def test_rgw_host_node_failure(
    self, nodes, node_restart_teardown, mcg_obj, bucket_factory
):
    """
    Test case to fail the node hosting both an RGW pod and noobaa-db-0
    and verify a new pod spins up on a healthy node.
    """
    # Get rgw pods
    rgw_pod_obj = get_rgw_pods()

    # Get the node where noobaa-db-0 is hosted
    noobaa_pod_node = None
    for noobaa_pod in get_noobaa_pods():
        if noobaa_pod.name == "noobaa-db-0":
            noobaa_pod_node = get_pod_node(noobaa_pod)
    # Fail fast with a clear message instead of an unbound-name error
    # below when noobaa-db-0 is absent.
    assert noobaa_pod_node is not None, "Could not find the noobaa-db-0 pod"

    for rgw_pod in rgw_pod_obj:
        pod_node = rgw_pod.get().get("spec").get("nodeName")
        if pod_node == noobaa_pod_node.name:
            # Stop the node
            log.info(
                f"Stopping node {pod_node} where"
                f" rgw pod {rgw_pod.name} and noobaa-db-0 hosted"
            )
            node_obj = get_node_objs(node_names=[pod_node])
            nodes.stop_nodes(node_obj)

            # Validate old rgw pod went into Terminating state
            wait_for_resource_state(
                resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
            )

            # Validate a new rgw pod spun up
            ocp_obj = OCP(
                kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_count=len(rgw_pod_obj),
                selector=constants.RGW_APP_LABEL,
            )

            # Create OBC and read and write
            self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-1")

            # Start the node
            nodes.start_nodes(node_obj)

            # Create OBC and read and write
            self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

    # Verify cluster health
    self.sanity_helpers.health_check()
    # Verify all storage pods are running
    wait_for_storage_pods()
def get_rgw_restart_counts():
    """
    Collect the restart counter of every RGW pod in the cluster.

    Returns:
        list: restart counts of RGW pods
    """
    restart_counts = []
    for rgw_pod_obj in get_rgw_pods():
        restart_counts.append(rgw_pod_obj.restart_count)
    return restart_counts
def recover_mcg():
    """
    Recovery procedure for noobaa: re-spin its pods after mon recovery.

    On on-prem platforms the RGW pods are re-spun as well.
    """

    def _respin(pod_getter):
        # Delete the current pods, then wait until the fresh
        # replacements reach the Running state.
        for old_pod in pod_getter():
            old_pod.delete()
        for new_pod in pod_getter():
            wait_for_resource_state(
                resource=new_pod, state=constants.STATUS_RUNNING, timeout=600
            )

    logger.info("Re-spinning noobaa pods")
    _respin(get_noobaa_pods)

    if config.ENV_DATA["platform"].lower() in constants.ON_PREM_PLATFORMS:
        logger.info("Re-spinning RGW pods")
        _respin(get_rgw_pods)
def get_rgw_restart_counts():
    """
    Gather the restart counter of each RGW pod.

    Returns:
        list: restart counts of RGW pods
    """
    # Internal import in order to avoid circular import
    from ocs_ci.ocs.resources.pod import get_rgw_pods

    counts = []
    for pod_obj in get_rgw_pods():
        counts.append(pod_obj.restart_count)
    return counts
def create_resource_hsbench(self):
    """
    Create the resources needed by the hsbench mark test:
        Create service account
        Create PVC
        Create golang pod
    """
    # Check for existing rgw pods on cluster; bail out early when the
    # cluster cannot serve an S3 workload.
    self.rgw_pod = pod.get_rgw_pods()
    if not self.rgw_pod:
        raise UnexpectedBehaviour(
            "This cluster doesn't have RGW pod(s) to perform hsbench"
        )

    # Create service account and grant it the required SCC policy
    self.sa_name = helpers.create_serviceaccount(self.namespace)
    self.sa_name = self.sa_name.name
    helpers.add_scc_policy(sa_name=self.sa_name, namespace=self.namespace)

    # Create test pvc+pod
    log.info(f"Create Golang pod to generate S3 workload... {self.namespace}")
    pvc_size = "50Gi"
    node_name = "compute-0"
    self.pod_name = "hsbench-pod"
    self.pvc_obj = helpers.create_pvc(
        sc_name=constants.DEFAULT_STORAGECLASS_RBD,
        namespace=self.namespace,
        size=pvc_size,
    )
    self.pod_obj = helpers.create_pod(
        constants.CEPHBLOCKPOOL,
        namespace=self.namespace,
        pod_name=self.pod_name,
        pvc_name=self.pvc_obj.name,
        node_name=node_name,
        sa_name=self.sa_name,
        pod_dict_path=self.pod_dic_path,
        dc_deployment=True,
        deploy_pod_status=constants.STATUS_COMPLETED,
    )
def scan_cluster(self):
    """
    Refresh the cached view of the cluster state: pod lists per ceph
    daemon type, the CephFS resource, and per-type pod counts.
    """
    self._ceph_pods = pod.get_all_pods(self._namespace)

    # TODO: Workaround for BZ1748325:
    # Only keep mons that are actually Running.
    # NOTE(review): 'constant' (singular) — presumably a module alias
    # defined at file top; confirm it exists.
    for mon_pod in pod.get_mon_pods(self.mon_selector, self.namespace):
        if mon_pod.ocp.get_resource_status(mon_pod.name) == constant.STATUS_RUNNING:
            self.mons.append(mon_pod)
    # TODO: End of workaround for BZ1748325

    self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
    self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
    self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
    self.noobaas = pod.get_noobaa_pods(self.noobaa_selector, self.namespace)
    self.rgws = pod.get_rgw_pods()
    self.toolbox = pod.get_ceph_tools_pod()

    # set port attrib on mon pods
    self.mons = [self.set_port(mon_pod) for mon_pod in self.mons]

    self.cluster.reload()
    if self.cephfs:
        self.cephfs.reload()
    else:
        try:
            self.cephfs_config = self.CEPHFS.get().get('items')[0]
            self.cephfs = ocs.OCS(**self.cephfs_config)
            self.cephfs.reload()
        except IndexError as e:
            logging.warning(e)
            logging.warning("No CephFS found")

    self.mon_count = len(self.mons)
    self.mds_count = len(self.mdss)
    self.mgr_count = len(self.mgrs)
    self.osd_count = len(self.osds)
    self.noobaa_count = len(self.noobaas)
    self.rgw_count = len(self.rgws)
def test_rgw_kafka_notifications(self, bucket_factory):
    """
    Test to verify rgw kafka notifications.

    Deploys an AMQ cluster with a topic and kafkadrop UI, creates an
    RGW-backed bucket, pushes objects while running the notify script,
    then verifies the RGW logs show no push-endpoint errors, that Kafka
    received the messages, and (OCS >= 4.8) that the event timestamp is
    well-formed.
    """
    # Get sc
    sc = default_storage_class(interface_type=constants.CEPHBLOCKPOOL)

    # Deploy amq cluster
    self.amq.setup_amq_cluster(sc.name)

    # Create topic
    self.kafka_topic = self.amq.create_kafka_topic()

    # Create Kafkadrop pod.
    # BUG FIX: the original unpacked two of the returned objects into the
    # same attribute (self.kafkadrop_pod twice), losing the pod handle.
    # Presumably create_kafkadrop returns (pod, service, route) — confirm.
    (
        self.kafkadrop_pod,
        self.kafkadrop_svc,
        self.kafkadrop_route,
    ) = self.amq.create_kafkadrop()

    # Get the kafkadrop route
    kafkadrop_host = self.kafkadrop_route.get().get("spec").get("host")

    # Create bucket
    bucketname = bucket_factory(amount=1, interface="RGW-OC")[0].name

    # Get RGW credentials; only the endpoint is used below (the OBC
    # credentials authenticate the S3 calls), so the keys are discarded.
    rgw_obj = RGW()
    rgw_endpoint, _, _ = rgw_obj.get_credentials()

    # Clone notify repo
    notify_path = clone_notify()

    # Initialise to put objects
    data = "A random string data to write on created rgw bucket"
    obc_obj = OBC(bucketname)
    s3_resource = boto3.resource(
        "s3",
        verify=retrieve_verification_mode(),
        endpoint_url=rgw_endpoint,
        aws_access_key_id=obc_obj.access_key_id,
        aws_secret_access_key=obc_obj.access_key,
    )
    s3_client = s3_resource.meta.client

    # Initialize notify command to run
    notify_cmd = (
        f"python {notify_path} -e {rgw_endpoint} -a {obc_obj.access_key_id} "
        f"-s {obc_obj.access_key} -b {bucketname} -ke {constants.KAFKA_ENDPOINT} -t {self.kafka_topic.name}"
    )
    log.info(f"Running cmd {notify_cmd}")

    # Put objects to bucket
    assert s3_client.put_object(
        Bucket=bucketname, Key="key-1", Body=data
    ), "Failed: Put object: key-1"
    exec_cmd(notify_cmd)

    # Validate rgw logs notification are sent
    # No errors are seen
    pattern = "ERROR: failed to create push endpoint"
    rgw_pod_obj = get_rgw_pods()
    rgw_log = get_pod_logs(pod_name=rgw_pod_obj[0].name, container="rgw")
    assert re.search(pattern=pattern, string=rgw_log) is None, (
        f"Error: {pattern} msg found in the rgw logs."
        f"Validate {pattern} found on rgw logs and also "
        f"rgw bucket notification is working correctly"
    )
    assert s3_client.put_object(
        Bucket=bucketname, Key="key-2", Body=data
    ), "Failed: Put object: key-2"
    exec_cmd(notify_cmd)

    # Validate message are received Kafka side using curl command
    # A temporary way to check from Kafka side, need to check from UI
    curl_command = (
        f"curl -X GET {kafkadrop_host}/topic/{self.kafka_topic.name} "
        "-H 'content-type: application/vnd.kafka.json.v2+json'"
    )
    json_output = run_cmd(cmd=curl_command)
    new_string = json_output.split()
    messages = new_string[new_string.index("messages</td>") + 1]
    if messages.find("1") == -1:
        raise Exception(
            "Error: Messages are not recieved from Kafka side."
            "RGW bucket notification is not working as expected."
        )

    # Validate the timestamp events (only present on OCS >= 4.8)
    ocs_version = config.ENV_DATA["ocs_version"]
    if Version.coerce(ocs_version) >= Version.coerce("4.8"):
        cmd = (
            f"bin/kafka-console-consumer.sh --bootstrap-server {constants.KAFKA_ENDPOINT} "
            f"--topic {self.kafka_topic.name} --from-beginning --timeout-ms 20000"
        )
        pod_list = get_pod_name_by_pattern(
            pattern="my-cluster-zookeeper", namespace=constants.AMQ_NAMESPACE
        )
        zookeeper_obj = get_pod_obj(
            name=pod_list[0], namespace=constants.AMQ_NAMESPACE
        )
        event_obj = zookeeper_obj.exec_cmd_on_pod(command=cmd)
        log.info(f"Event obj: {event_obj}")
        event_time = event_obj.get("Records")[0].get("eventTime")
        format_string = "%Y-%m-%dT%H:%M:%S.%fZ"
        try:
            datetime.strptime(event_time, format_string)
        except ValueError as ef:
            log.error(
                f"Timestamp event {event_time} doesnt match the pattern {format_string}"
            )
            raise ef
        log.info(
            f"Timestamp event {event_time} matches the pattern {format_string}"
        )
def test_rgw_kafka_notifications(self, bucket_factory):
    """
    Test to verify rgw kafka notifications.

    Deploys an AMQ cluster with a topic and kafkadrop UI, creates an
    RGW-backed bucket, pushes objects while running the notify script,
    then verifies the RGW logs show no push-endpoint errors and that
    Kafka received the messages.
    """
    # Get sc
    sc = default_storage_class(interface_type=constants.CEPHBLOCKPOOL)

    # Deploy amq cluster
    self.amq.setup_amq_cluster(sc.name)

    # Create topic
    self.kafka_topic = self.amq.create_kafka_topic()

    # Create Kafkadrop pod.
    # BUG FIX: the original unpacked two of the returned objects into the
    # same attribute (self.kafkadrop_pod twice), losing the pod handle.
    # Presumably create_kafkadrop returns (pod, service, route) — confirm.
    (
        self.kafkadrop_pod,
        self.kafkadrop_svc,
        self.kafkadrop_route,
    ) = self.amq.create_kafkadrop()

    # Get the kafkadrop route
    kafkadrop_host = self.kafkadrop_route.get().get("spec").get("host")

    # Create bucket
    bucketname = bucket_factory(amount=1, interface="RGW-OC")[0].name

    # Get RGW credentials; only the endpoint is used below (the OBC
    # credentials authenticate the S3 calls), so the keys are discarded.
    rgw_obj = RGW()
    rgw_endpoint, _, _ = rgw_obj.get_credentials()

    # Clone notify repo
    notify_path = clone_notify()

    # Initialise to put objects
    data = "A random string data to write on created rgw bucket"
    obc_obj = OBC(bucketname)
    s3_resource = boto3.resource(
        "s3",
        verify=retrieve_verification_mode(),
        endpoint_url=rgw_endpoint,
        aws_access_key_id=obc_obj.access_key_id,
        aws_secret_access_key=obc_obj.access_key,
    )
    s3_client = s3_resource.meta.client

    # Initialize notify command to run
    notify_cmd = (
        f"python {notify_path} -e {rgw_endpoint} -a {obc_obj.access_key_id} "
        f"-s {obc_obj.access_key} -b {bucketname} -ke {constants.KAFKA_ENDPOINT} -t {self.kafka_topic.name}"
    )
    log.info(f"Running cmd {notify_cmd}")

    # Put objects to bucket
    assert s3_client.put_object(
        Bucket=bucketname, Key="key-1", Body=data
    ), "Failed: Put object: key-1"
    exec_cmd(notify_cmd)

    # Validate rgw logs notification are sent
    # No errors are seen
    pattern = "ERROR: failed to create push endpoint"
    rgw_pod_obj = get_rgw_pods()
    rgw_log = get_pod_logs(pod_name=rgw_pod_obj[0].name, container="rgw")
    assert re.search(pattern=pattern, string=rgw_log) is None, (
        f"Error: {pattern} msg found in the rgw logs."
        f"Validate {pattern} found on rgw logs and also "
        f"rgw bucket notification is working correctly"
    )
    assert s3_client.put_object(
        Bucket=bucketname, Key="key-2", Body=data
    ), "Failed: Put object: key-2"
    exec_cmd(notify_cmd)

    # Validate message are received Kafka side using curl command
    # A temporary way to check from Kafka side, need to check from UI
    curl_command = (
        f"curl -X GET {kafkadrop_host}/topic/{self.kafka_topic.name} "
        "-H 'content-type: application/vnd.kafka.json.v2+json'"
    )
    json_output = run_cmd(cmd=curl_command)
    new_string = json_output.split()
    messages = new_string[new_string.index("messages</td>") + 1]
    if messages.find("1") == -1:
        raise Exception(
            "Error: Messages are not recieved from Kafka side."
            "RGW bucket notification is not working as expected."
        )
def test_rgw_host_node_failure(
    self, nodes, node_restart_teardown, node_drain_teardown, mcg_obj, bucket_factory
):
    """
    Test case to fail the node where RGW and the NooBaa DB are hosted
    and verify the new pods spin on a healthy node.
    """
    # Get the node where the noobaa-db pod is hosted
    noobaa_pod_node = None
    for noobaa_pod in get_noobaa_pods():
        if noobaa_pod.name in (
            constants.NB_DB_NAME_46_AND_BELOW,
            constants.NB_DB_NAME_47_AND_ABOVE,
        ):
            noobaa_pod_node = get_pod_node(noobaa_pod)
    assert noobaa_pod_node is not None, "Could not find the NooBaa DB pod"

    # Validate if RGW pod and noobaa-db are hosted on same node
    # If not, make sure both pods are hosted on same node
    log.info("Validate if RGW pod and noobaa-db are hosted on same node")
    rgw_pod_obj = get_rgw_pods()
    rgw_pod_node_list = [
        rgw_pod.get().get("spec").get("nodeName") for rgw_pod in rgw_pod_obj
    ]
    if not list(set(rgw_pod_node_list).intersection(noobaa_pod_node.name.split())):
        log.info(
            "Unschedule other two nodes such that RGW "
            "pod moves to node where NooBaa DB pod hosted"
        )
        worker_node_list = get_worker_nodes()
        node_names = list(set(worker_node_list) - set(noobaa_pod_node.name.split()))
        unschedule_nodes(node_names=node_names)
        ocp_obj = OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        rgw_pod_obj[0].delete()
        ocp_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_count=len(rgw_pod_obj),
            selector=constants.RGW_APP_LABEL,
            timeout=300,
            sleep=5,
        )
        log.info("Schedule those nodes again")
        schedule_nodes(node_names=node_names)

        # Check the ceph health OK
        ceph_health_check(tries=90, delay=15)

        # Verify all storage pods are running
        wait_for_storage_pods()

        # Check again that an rgw pod moved to the node hosting the
        # NooBaa DB pod.
        # BUG FIX: compare node *names* (get_pod_node returns a node
        # object) and require at least one match — the original asserted
        # on a non-empty list of booleans, which always passed.
        rgw_pod_node_list = [
            get_pod_node(pod_obj).name for pod_obj in get_rgw_pods()
        ]
        assert any(
            node_name == noobaa_pod_node.name for node_name in rgw_pod_node_list
        ), (
            "RGW Pod didn't move to node where NooBaa DB pod"
            " hosted even after cordoned and uncordoned nodes"
            f"RGW pod hosted: {rgw_pod_node_list}"
            f"NooBaa DB pod hosted: {noobaa_pod_node.name}"
        )

    log.info("RGW and noobaa-db are hosted on same node start the test execution")
    rgw_pod_obj = get_rgw_pods()
    for rgw_pod in rgw_pod_obj:
        pod_node = rgw_pod.get().get("spec").get("nodeName")
        if pod_node == noobaa_pod_node.name:
            # Stop the node
            log.info(
                f"Stopping node {pod_node} where"
                f" rgw pod {rgw_pod.name} and NooBaa DB are hosted"
            )
            node_obj = get_node_objs(node_names=[pod_node])
            nodes.stop_nodes(node_obj)

            # Validate old rgw pod went terminating state
            wait_for_resource_state(
                resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
            )

            # Validate new rgw pod spun
            ocp_obj = OCP(
                kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_count=len(rgw_pod_obj),
                selector=constants.RGW_APP_LABEL,
            )

            # Start the node
            nodes.start_nodes(node_obj)

            # Check the ceph health OK
            ceph_health_check(tries=90, delay=15)

            # Verify all storage pods are running
            wait_for_storage_pods()

            # Create OBC and read and write
            self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

    # Verify cluster health
    self.sanity_helpers.health_check()
def test_replication_with_disruptions(
    self,
    awscli_pod_session,
    mcg_obj_session,
    cld_mgr,
    bucket_factory,
    source_bucketclass,
    target_bucketclass,
    test_directory_setup,
    nodes,
):
    """
    Verify MCG bucket replication keeps working across disruptions:
    uni- then bi-directional replication, object deletion and recovery,
    RGW pod restarts, and a full worker-node reboot.
    """
    # check uni bucket replication from multi (aws+azure) namespace bucket to s3-compatible namespace bucket
    target_bucket_name = bucket_factory(bucketclass=target_bucketclass)[0].name
    replication_policy = ("basic-replication-rule", target_bucket_name, None)
    source_bucket_name = bucket_factory(
        bucketclass=source_bucketclass, replication_policy=replication_policy
    )[0].name
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        source_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=5,
        pattern="first-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    compare_bucket_object_list(mcg_obj_session, source_bucket_name, target_bucket_name)
    logger.info("Uni-directional bucket replication working as expected")

    # change from uni-directional to bi-directional replication policy
    logger.info("Changing the replication policy from uni to bi-directional!")
    bi_replication_policy_dict = {
        "spec": {
            "additionalConfig": {
                "replicationPolicy": json.dumps(
                    [
                        {
                            "rule_id": "basic-replication-rule-2",
                            "destination_bucket": source_bucket_name,
                        }
                    ]
                )
            }
        }
    }
    OCP(
        namespace=config.ENV_DATA["cluster_namespace"],
        kind="obc",
        resource_name=target_bucket_name,
    ).patch(params=json.dumps(bi_replication_policy_dict), format_type="merge")
    logger.info(
        "Patch ran successfully! Changed the replication policy from uni to bi directional"
    )

    # write objects to the second bucket and see if it's replicated on the other
    logger.info("checking if bi-directional replication works!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=3,
        pattern="second-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    compare_bucket_object_list(mcg_obj_session, source_bucket_name, target_bucket_name)
    logger.info("Bi directional bucket replication working as expected")

    # delete all the s3-compatible namespace buckets objects and then recover it from other namespace bucket on
    # write
    logger.info("checking replication when one of the bucket's objects are deleted!!")
    try:
        mcg_obj_session.s3_resource.Bucket(target_bucket_name).objects.all().delete()
    except CommandFailed as e:
        logger.error(f"[Error] while deleting objects: {e}")
    # Direct assert instead of `if len(...) != 0: assert False`
    assert (
        len(mcg_obj_session.s3_list_all_objects_in_bucket(target_bucket_name)) == 0
    ), f"[Error] Unexpectedly objects were not deleted from {target_bucket_name}"
    logger.info("All the objects in RGW namespace buckets are deleted!!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="third-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    compare_bucket_object_list(mcg_obj_session, source_bucket_name, target_bucket_name)
    logger.info("All the objects retrieved back to s3-compatible bucket on new write!!")

    # restart RGW pods and then see if object sync still works
    logger.info("Checking if the replication works when there is RGW pod restarts!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="fourth-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    pod_names = get_pod_name_by_pattern(
        "rgw", namespace=config.ENV_DATA["cluster_namespace"]
    )
    pod_objs = get_rgw_pods(namespace=config.ENV_DATA["cluster_namespace"])
    delete_pods(pod_objs=pod_objs)
    wait_for_pods_to_be_running(
        pod_names=pod_names, namespace=config.ENV_DATA["cluster_namespace"]
    )
    compare_bucket_object_list(mcg_obj_session, source_bucket_name, target_bucket_name)
    logger.info("Object sync works after the RGW pod restarted!!")

    # write some object to any of the bucket, followed by immediate cluster restart
    logger.info("Checking replication when there is a cluster reboot!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="fifth-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    node_list = get_worker_nodes()
    node_objs = get_node_objs(node_list)
    nodes.restart_nodes(node_objs, timeout=500)
    # BUG FIX: the original invoked wait_for_cluster_connectivity(tries=400)
    # eagerly and passed its *result* into the retry wrapper, which was
    # then never called — so the retry logic never ran.  Decorate the
    # function first, then call the wrapped version.
    retry(
        (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
        tries=60,
        delay=15,
    )(ocp.wait_for_cluster_connectivity)(tries=400)
    wait_for_pods_to_be_running(
        namespace=config.ENV_DATA["cluster_namespace"], timeout=800
    )
    logger.info("Nodes rebooted successfully!!")
    compare_bucket_object_list(mcg_obj_session, source_bucket_name, target_bucket_name)
    logger.info("Objects sync works even when the cluster is rebooted")