def start_baremetal_machines_with_ipmi_ctx(self, ipmi_ctxs, wait=True):
    """
    Start Baremetal Machines using Ipmi ctx

    Args:
        ipmi_ctxs (list): List of BM ipmi_ctx
        wait (bool): Wait for BMs to start

    """
    for ipmi_ctx in ipmi_ctxs:
        ipmi_ctx.chassis_control_power_up()

    if wait:
        for ipmi_ctx in ipmi_ctxs:
            for status in TimeoutSampler(600, 5, self.get_power_status, ipmi_ctx):
                logger.info(
                    f"Waiting for Baremetal Machine to power on. "
                    f"Current Baremetal status: {status}"
                )
                if status == VM_POWERED_ON:
                    logger.info("Baremetal Machine reached poweredOn status")
                    break

    wait_for_cluster_connectivity(tries=400)
    wait_for_nodes_status(
        node_names=get_master_nodes(), status=constants.NODE_READY, timeout=800
    )
    wait_for_nodes_status(
        node_names=get_worker_nodes(), status=constants.NODE_READY, timeout=800
    )
def start_powernodes_machines(
    self, powernode_machines, timeout=900, wait=True, force=True
):
    """
    Start PowerNode Machines

    Args:
        powernode_machines (list): List of PowerNode machines
        timeout (int): time in seconds to wait for node to reach 'not ready' state,
            and 'ready' state.
        wait (bool): Wait for PowerNodes to start - for future use
        force (bool): True for PowerNode ungraceful power off, False for graceful
            PowerNode shutdown - for future use

    """
    ocpversion = get_ocp_version("-")
    for node in powernode_machines:
        result = exec_cmd(f"sudo virsh start test-ocp{ocpversion}-{node.name}")
        logger.info(f"Result of start: {result}")

    wait_for_cluster_connectivity(tries=900)
    wait_for_nodes_status(
        node_names=get_master_nodes(), status=constants.NODE_READY, timeout=timeout
    )
    wait_for_nodes_status(
        node_names=get_worker_nodes(), status=constants.NODE_READY, timeout=timeout
    )
def health_check(self, cluster_check=True, tries=20):
    """
    Perform Ceph and cluster health checks
    """
    wait_for_cluster_connectivity(tries=400)
    logger.info("Checking cluster and Ceph health")
    node.wait_for_nodes_status(timeout=300)

    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=tries)
    if cluster_check:
        self.ceph_cluster.cluster_health_check(timeout=60)
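# A minimal usage sketch (not part of the original code): this is how the tests
# further below drive the health check above through a sanity-helper instance,
# e.g. self.sanity_helpers.health_check(tries=40). The import path is an assumption.
from ocs_ci.helpers.sanity_helpers import Sanity

sanity_helpers = Sanity()
# Run node, Ceph and cluster health checks, retrying the Ceph check up to 40 times
sanity_helpers.health_check(cluster_check=True, tries=40)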
def restart(self, node, timeout):
    """
    Restart the kubelet service using the parent Service class. After that,
    ensure the corresponding OCP node is connectable and moves to Ready state.

    Args:
        node (object): Node object
        timeout (int): time in seconds to wait for the service to restart and
            for the node to reach Ready state.

    """
    super().restart(node, timeout)
    wait_for_cluster_connectivity(tries=900)
    wait_for_nodes_status(
        node_names=[node.name], status=constants.NODE_READY, timeout=timeout
    )
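# A minimal usage sketch (not part of the original code). It assumes a
# KubeletService-style wrapper class exposing the restart() method above; only the
# restart(node, timeout) signature is taken from the code itself, and
# get_worker_nodes/get_node_objs appear elsewhere in this section.
from ocs_ci.ocs.node import get_node_objs, get_worker_nodes

worker_node = get_node_objs(get_worker_nodes())[0]
kubelet_service = KubeletService()  # hypothetical service wrapper name
# Restart kubelet on the node and wait for it to become Ready again
kubelet_service.restart(worker_node, timeout=600)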
def start_baremetal_machines(self, baremetal_machine, wait=True):
    """
    Start Baremetal Machines

    Args:
        baremetal_machine (list): BM objects
        wait (bool): Wait for BMs to start

    """
    for node in baremetal_machine:
        if self.mgmt_details[node.name]:
            ipmi_ctx = self.get_ipmi_ctx(
                host=self.mgmt_details[node.name]["mgmt_console"],
                user=self.mgmt_details[node.name]["mgmt_username"],
                password=self.mgmt_details[node.name]["mgmt_password"],
            )
            logger.info(f"Powering On {node.name}")
            ipmi_ctx.chassis_control_power_up()
        if wait:
            if self.mgmt_details[node.name]:
                ipmi_ctx = self.get_ipmi_ctx(
                    host=self.mgmt_details[node.name]["mgmt_console"],
                    user=self.mgmt_details[node.name]["mgmt_username"],
                    password=self.mgmt_details[node.name]["mgmt_password"],
                )
                for status in TimeoutSampler(
                    600, 5, self.get_power_status, ipmi_ctx
                ):
                    logger.info(
                        f"Waiting for Baremetal Machine {node.name} to power on. "
                        f"Current Baremetal status: {status}"
                    )
                    if status == VM_POWERED_ON:
                        logger.info(
                            f"Baremetal Machine {node.name} reached poweredOn status"
                        )
                        ipmi_ctx.session.close()
                        break

    wait_for_cluster_connectivity(tries=400)
    wait_for_nodes_status(
        node_names=get_master_nodes(), status=constants.NODE_READY, timeout=800
    )
    wait_for_nodes_status(
        node_names=get_worker_nodes(), status=constants.NODE_READY, timeout=800
    )
def wait_for_nodes_status_and_prometheus_health_check(pods):
    """
    Waits for all the nodes to be in Ready state and also checks
    Prometheus health
    """
    # Validate all nodes are in READY state
    ocp.wait_for_cluster_connectivity(tries=400)
    wait_for_nodes_status(timeout=1800)

    # Check for the created pvc metrics after rebooting the master nodes
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(
            pod_obj.pvc.name
        ), f"Data for created PVC {pod_obj.pvc.name} is not collected on Prometheus"

    assert prometheus_health_check(), "Prometheus health is degraded"
def test_run_couchbase_node_reboot(self, cb_setup, nodes, pod_name_of_node):
    """
    Test couchbase workload with node reboot
    """
    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(node_type="worker", print_table=True)
    get_node_resource_utilization_from_adm_top(node_type="master", print_table=True)

    if pod_name_of_node == "couchbase":
        node_list = self.cb.get_couchbase_nodes()
    elif pod_name_of_node == "osd":
        node_list = get_osd_running_nodes()
    elif pod_name_of_node == "master":
        master_node = get_nodes(pod_name_of_node, num_of_nodes=1)

    # Restart relevant node
    if pod_name_of_node == "master":
        nodes.restart_nodes(master_node, wait=False)
        waiting_time = 40
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)
    else:
        restart_node = get_node_objs(
            [node_list[random.randint(0, len(node_list) - 1)]]
        )
        nodes.restart_nodes(restart_node)

    # Validate all nodes and services are in READY state and up
    retry(
        (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
        tries=60,
        delay=15,
    )(ocp.wait_for_cluster_connectivity)(tries=400)
    retry(
        (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
        tries=60,
        delay=15,
    )(wait_for_nodes_status)(timeout=1800)

    bg_handler = flowtest.BackgroundOps()
    bg_ops = [self.cb.result]
    retry(CommandFailed, tries=60, delay=15)(bg_handler.wait_for_bg_operations)(
        bg_ops, timeout=3600
    )
    self.sanity_helpers.health_check(tries=40)
def test_amq_after_rebooting_node(self, node_type, nodes, amq_setup):
    """
    Test case to validate that rebooting a master node shouldn't affect
    amq workloads running in the background
    """
    # Get all amq pods
    pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE)

    # Get the node list
    node = get_nodes(node_type, num_of_nodes=1)

    # Reboot one master node
    nodes.restart_nodes(node, wait=False)

    # Wait some time after rebooting master
    waiting_time = 40
    log.info(f"Waiting {waiting_time} seconds...")
    time.sleep(waiting_time)

    # Validate all nodes and services are in READY state and up
    retry(
        (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
        tries=60,
        delay=15,
    )(ocp.wait_for_cluster_connectivity)(tries=400)
    retry(
        (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
        tries=60,
        delay=15,
    )(wait_for_nodes_status)(timeout=1800)

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check()

    # Check all amq pods are up and running
    assert POD.wait_for_resource(
        condition="Running", resource_count=len(pod_obj_list), timeout=300
    )

    # Validate the results
    log.info("Validate message run completely")
    for thread in self.threads:
        thread.result(timeout=1800)
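# A short sketch of the retry pattern used in the tests above (not part of the
# original code): retry(...) returns a decorator, so the callable is wrapped first
# and only then invoked with its own arguments; wrapping a call's return value
# instead would skip the retries entirely. The import path of retry is an
# assumption; the exception tuple and timings mirror the tests above.
from ocs_ci.utility.retry import retry

retry(
    (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
    tries=60,
    delay=15,
)(ocp.wait_for_cluster_connectivity)(tries=400)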
def test_replication_with_disruptions(
    self,
    awscli_pod_session,
    mcg_obj_session,
    cld_mgr,
    bucket_factory,
    source_bucketclass,
    target_bucketclass,
    test_directory_setup,
    nodes,
):
    # check uni bucket replication from multi (aws+azure) namespace bucket
    # to s3-compatible namespace bucket
    target_bucket_name = bucket_factory(bucketclass=target_bucketclass)[0].name
    replication_policy = ("basic-replication-rule", target_bucket_name, None)
    source_bucket_name = bucket_factory(
        bucketclass=source_bucketclass, replication_policy=replication_policy
    )[0].name
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        source_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=5,
        pattern="first-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")

    compare_bucket_object_list(mcg_obj_session, source_bucket_name, target_bucket_name)
    logger.info("Uni-directional bucket replication working as expected")

    # change from uni-directional to bi-directional replication policy
    logger.info("Changing the replication policy from uni- to bi-directional!")
    bi_replication_policy_dict = {
        "spec": {
            "additionalConfig": {
                "replicationPolicy": json.dumps(
                    [
                        {
                            "rule_id": "basic-replication-rule-2",
                            "destination_bucket": source_bucket_name,
                        }
                    ]
                )
            }
        }
    }
    OCP(
        namespace=config.ENV_DATA["cluster_namespace"],
        kind="obc",
        resource_name=target_bucket_name,
    ).patch(params=json.dumps(bi_replication_policy_dict), format_type="merge")
    logger.info(
        "Patch ran successfully! Changed the replication policy from uni- to bi-directional"
    )

    # write objects to the second bucket and see if they are replicated to the other
    logger.info("Checking if bi-directional replication works!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=3,
        pattern="second-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    compare_bucket_object_list(mcg_obj_session, source_bucket_name, target_bucket_name)
    logger.info("Bi-directional bucket replication working as expected")

    # delete all the s3-compatible namespace bucket's objects and then recover
    # them from the other namespace bucket on a new write
    logger.info("Checking replication when one of the bucket's objects are deleted!!")
    try:
        mcg_obj_session.s3_resource.Bucket(target_bucket_name).objects.all().delete()
    except CommandFailed as e:
        logger.error(f"[Error] while deleting objects: {e}")
    if len(mcg_obj_session.s3_list_all_objects_in_bucket(target_bucket_name)) != 0:
        assert (
            False
        ), f"[Error] Unexpectedly objects were not deleted from {target_bucket_name}"
    logger.info("All the objects in RGW namespace buckets are deleted!!!")

    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="third-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")

    compare_bucket_object_list(mcg_obj_session, source_bucket_name, target_bucket_name)
    logger.info("All the objects retrieved back to s3-compatible bucket on new write!!")

    # restart RGW pods and then see if object sync still works
    logger.info("Checking if the replication works when there are RGW pod restarts!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="fourth-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")

    pod_names = get_pod_name_by_pattern(
        "rgw", namespace=config.ENV_DATA["cluster_namespace"]
    )
    pod_objs = get_rgw_pods(namespace=config.ENV_DATA["cluster_namespace"])
    delete_pods(pod_objs=pod_objs)
    wait_for_pods_to_be_running(
        pod_names=pod_names, namespace=config.ENV_DATA["cluster_namespace"]
    )

    compare_bucket_object_list(mcg_obj_session, source_bucket_name, target_bucket_name)
    logger.info("Object sync works after the RGW pod restarted!!")

    # write some objects to any of the buckets, followed by an immediate cluster restart
    logger.info("Checking replication when there is a cluster reboot!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="fifth-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")

    node_list = get_worker_nodes()
    node_objs = get_node_objs(node_list)
    nodes.restart_nodes(node_objs, timeout=500)
    retry(
        (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
        tries=60,
        delay=15,
    )(ocp.wait_for_cluster_connectivity)(tries=400)
    wait_for_pods_to_be_running(
        namespace=config.ENV_DATA["cluster_namespace"], timeout=800
    )
    logger.info("Nodes rebooted successfully!!")

    compare_bucket_object_list(mcg_obj_session, source_bucket_name, target_bucket_name)
    logger.info("Objects sync works even when the cluster is rebooted")