def deploy_ocp(self, log_cli_level='DEBUG'):
    """
    Deployment specific to OCP cluster on vSphere platform

    Args:
        log_cli_level (str): openshift installer's log level
            (default: "DEBUG")

    """
    super(VSPHEREUPI, self).deploy_ocp(log_cli_level)
    if config.ENV_DATA.get('scale_up'):
        logger.info("Adding extra nodes to cluster")
        self.add_nodes()

    # remove RHCOS compute nodes
    if (
        config.ENV_DATA.get('scale_up')
        and not config.ENV_DATA.get('mixed_cluster')
    ):
        rhcos_nodes = get_typed_worker_nodes()
        logger.info(
            f"RHCOS compute nodes to delete: "
            f"{[node.name for node in rhcos_nodes]}"
        )
        logger.info("Removing RHCOS compute nodes from a cluster")
        remove_nodes(rhcos_nodes)
def deploy_ocp(self, log_cli_level='DEBUG'):
    """
    Deployment specific to OCP cluster on vSphere platform

    Args:
        log_cli_level (str): openshift installer's log level
            (default: "DEBUG")

    """
    super(VSPHEREUPI, self).deploy_ocp(log_cli_level)
    if config.ENV_DATA.get('scale_up'):
        logger.info("Adding extra nodes to cluster")
        self.add_nodes()

    # remove RHCOS compute nodes
    if (
        config.ENV_DATA.get('scale_up')
        and not config.ENV_DATA.get('mixed_cluster')
    ):
        rhcos_nodes = get_typed_worker_nodes()
        logger.info(
            f"RHCOS compute nodes to delete: "
            f"{[node.name for node in rhcos_nodes]}"
        )
        logger.info("Removing RHCOS compute nodes from a cluster")
        remove_nodes(rhcos_nodes)

    # get datastore type and configure chrony for all nodes ONLY if
    # datastore type is vsan
    datastore_type = self.vsphere.get_datastore_type_by_name(
        self.datastore, self.datacenter
    )
    if datastore_type != constants.VMFS:
        configure_chrony_and_wait_for_machineconfig_status(
            node_type="all", timeout=1800
        )
def deploy_ocp(self, log_cli_level="DEBUG"): """ Deployment specific to OCP cluster on vSphere platform Args: log_cli_level (str): openshift installer's log level (default: "DEBUG") """ cluster_name_parts = config.ENV_DATA.get("cluster_name").split("-") prefix = cluster_name_parts[0] if not ( prefix.startswith(tuple(constants.PRODUCTION_JOBS_PREFIX)) or config.DEPLOYMENT.get("force_deploy_multiple_clusters") ): if self.check_cluster_existence(prefix): raise exceptions.SameNamePrefixClusterAlreadyExistsException( f"Cluster with name prefix {prefix} already exists. " f"Please destroy the existing cluster for a new cluster " f"deployment" ) super(VSPHEREUPI, self).deploy_ocp(log_cli_level) if config.ENV_DATA.get("scale_up"): logger.info("Adding extra nodes to cluster") self.add_nodes() # remove RHCOS compute nodes if config.ENV_DATA.get("scale_up") and not config.ENV_DATA.get("mixed_cluster"): rhcos_nodes = get_typed_worker_nodes() logger.info( f"RHCOS compute nodes to delete: " f"{[node.name for node in rhcos_nodes]}" ) logger.info("Removing RHCOS compute nodes from a cluster") remove_nodes(rhcos_nodes) if config.DEPLOYMENT.get("thick_sc"): sc_data = templating.load_yaml(constants.VSPHERE_THICK_STORAGECLASS_YAML) sc_data_yaml = tempfile.NamedTemporaryFile( mode="w+", prefix="storageclass", delete=False ) if config.DEPLOYMENT.get("eager_zeroed_thick_sc"): sc_data["parameters"]["diskformat"] = "eagerzeroedthick" else: sc_data["parameters"]["diskformat"] = "zeroedthick" templating.dump_data_to_temp_yaml(sc_data, sc_data_yaml.name) run_cmd(f"oc create -f {sc_data_yaml.name}") self.DEFAULT_STORAGECLASS = "thick"
def deploy_ocp(self, log_cli_level="DEBUG"): """ Deployment specific to OCP cluster on vSphere platform Args: log_cli_level (str): openshift installer's log level (default: "DEBUG") """ cluster_name_parts = config.ENV_DATA.get("cluster_name").split("-") prefix = cluster_name_parts[0] if not ( prefix.startswith(tuple(constants.PRODUCTION_JOBS_PREFIX)) or config.DEPLOYMENT.get("force_deploy_multiple_clusters") ): if self.check_cluster_existence(prefix): raise exceptions.SameNamePrefixClusterAlreadyExistsException( f"Cluster with name prefix {prefix} already exists. " f"Please destroy the existing cluster for a new cluster " f"deployment" ) super(VSPHEREUPI, self).deploy_ocp(log_cli_level) if config.ENV_DATA.get("scale_up"): logger.info("Adding extra nodes to cluster") self.add_nodes() # remove RHCOS compute nodes if config.ENV_DATA.get("scale_up") and not config.ENV_DATA.get("mixed_cluster"): rhcos_nodes = get_typed_worker_nodes() logger.info( f"RHCOS compute nodes to delete: " f"{[node.name for node in rhcos_nodes]}" ) logger.info("Removing RHCOS compute nodes from a cluster") remove_nodes(rhcos_nodes) # get datastore type and configure chrony for all nodes ONLY if # datastore type is vsan datastore_type = self.vsphere.get_datastore_type_by_name( self.datastore, self.datacenter ) if datastore_type != constants.VMFS: configure_chrony_and_wait_for_machineconfig_status( node_type="all", timeout=1800 )
def test_simultaneous_drain_of_two_ocs_nodes(
    self,
    pvc_factory,
    pod_factory,
    dc_pod_factory,
    interface,
    bucket_factory,
    rgw_bucket_factory,
):
    """
    OCS-2128/OCS-2129:
    - Create PVCs and start IO on DC based app pods
    - Add one extra node in two of the AZs and label the nodes with
      OCS storage label
    - Maintenance (mark as unschedulable and drain) 2 worker nodes
      simultaneously
    - Confirm that OCS and DC pods are in running state
    - Remove unscheduled nodes
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Check cluster and Ceph health

    """
    # Get OSD running nodes
    osd_running_worker_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_worker_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(
        osd_running_worker_nodes, label_key="dc", label_value="fedora"
    )
    log.info("Successfully labeled worker nodes with {dc:fedora}")

    # Create DC app pods
    log.info("Creating DC based app pods and starting IO in background")
    interface = (
        constants.CEPHBLOCKPOOL if interface == "rbd" else constants.CEPHFILESYSTEM
    )
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(
            interface=interface, node_selector={"dc": "fedora"}
        )
        pod.run_io_in_bg(dc_pod, fedora_dc=True)
        dc_pod_obj.append(dc_pod)

    # Get the machine name using the node name
    machine_names = [
        machine.get_machine_from_node_name(osd_running_worker_node)
        for osd_running_worker_node in osd_running_worker_nodes[:2]
    ]
    log.info(
        f"{osd_running_worker_nodes} associated machines are {machine_names}"
    )

    # Get the machineset name using machine name
    machineset_names = [
        machine.get_machineset_from_machine_name(machine_name)
        for machine_name in machine_names
    ]
    log.info(
        f"{osd_running_worker_nodes} associated machinesets are {machineset_names}"
    )

    # Add a new node and label it
    add_new_node_and_label_it(machineset_names[0])
    add_new_node_and_label_it(machineset_names[1])

    # Drain 2 nodes
    drain_nodes(osd_running_worker_nodes[:2])

    # Check the pods should be in running state
    all_pod_obj = pod.get_all_pods(wait=True)
    for pod_obj in all_pod_obj:
        if "-1-deploy" not in pod_obj.name and "ocs-deviceset" not in pod_obj.name:
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj, state=constants.STATUS_RUNNING, timeout=200
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node gets stuck
                # in Pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # a workaround and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed
                if "rook-ceph-crashcollector" in pod_obj.name:
                    ocp_obj = ocp.OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                    pod_name = pod_obj.name
                    deployment_name = "-".join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")

    # DC app pods on the drained node will get automatically created on
    # another running node in the same AZ. Waiting for all dc app pods to
    # reach running state
    pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=1200)
    log.info("All the dc pods reached running state")

    # Remove unscheduled nodes
    # In scenarios where the drain is attempted on >3 worker setup,
    # post completion of drain we are removing the unscheduled nodes so
    # that we maintain 3 worker nodes.
log.info(f"Removing scheduled nodes {osd_running_worker_nodes[:2]}") remove_node_objs = get_node_objs(osd_running_worker_nodes[:2]) remove_nodes(remove_node_objs) # Check basic cluster functionality by creating resources # (pools, storageclasses, PVCs, pods - both CephFS and RBD), # run IO and delete the resources self.sanity_helpers.create_resources(pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory) self.sanity_helpers.delete_resources() # Perform cluster and Ceph health checks self.sanity_helpers.health_check()