def add_new_node_and_label_upi(
    node_type, num_nodes, mark_for_ocs_label=True, node_conf=None
):
    """
    Add a new node for aws/vmware upi platform and label it

    Args:
        node_type (str): Type of node, RHEL or RHCOS
        num_nodes (int): number of nodes to add
        mark_for_ocs_label (bool): True if label the new node
        node_conf (dict): The node configurations. Defaults to an empty dict.

    Returns:
        bool: True if node addition has done successfully

    """
    node_conf = node_conf or {}
    initial_nodes = tests.helpers.get_worker_nodes()
    from ocs_ci.ocs.platform_nodes import PlatformNodesFactory

    plt = PlatformNodesFactory()
    node_util = plt.get_nodes_platform()
    node_util.create_and_attach_nodes_to_cluster(node_conf, node_type, num_nodes)

    # Poll the worker list until the expected number of workers shows up
    expected_num_nodes = len(initial_nodes) + num_nodes
    for sample in TimeoutSampler(
        timeout=600, sleep=6, func=tests.helpers.get_worker_nodes
    ):
        if len(sample) == expected_num_nodes:
            break

    # Take a single snapshot of the workers and use it both for the readiness
    # wait and for computing the new nodes, so the two cannot disagree.
    nodes_after_exp = tests.helpers.get_worker_nodes()
    wait_for_nodes_status(node_names=nodes_after_exp, status=constants.NODE_READY)

    new_spun_nodes = list(set(nodes_after_exp) - set(initial_nodes))
    if node_type == constants.RHEL_OS:
        set_selinux_permissions(workers=new_spun_nodes)

    if mark_for_ocs_label:
        node_obj = ocp.OCP(kind='node')
        for new_spun_node in new_spun_nodes:
            node_obj.add_label(
                resource_name=new_spun_node, label=constants.OPERATOR_NODE_LABEL
            )
            log.info(
                f"Successfully labeled {new_spun_node} with OCS storage label"
            )
    return True
def add_disk_to_node(node_obj, disk_size=None):
    """
    Create a new volume and attach it to the given node

    Args:
        node_obj (ocs_ci.ocs.resources.ocs.OCS): The node object
        disk_size (int): The size of the new disk to attach. When it is
            falsy (e.g. not specified), the size of the last PV found in the
            local block storage class is used instead.

    """
    from ocs_ci.ocs.platform_nodes import PlatformNodesFactory

    platform_nodes = PlatformNodesFactory().get_nodes_platform()

    # Fall back to the size of the last local-block PV only when no size given
    volume_size = disk_size or get_pv_size(
        get_pv_objs_in_sc(sc_name=constants.LOCAL_BLOCK_RESOURCE)[-1]
    )

    platform_nodes.create_and_attach_volume(node=node_obj, size=volume_size)
def osd_node_reboot():
    """
    Reboot a worker node that is running an OSD

    The node rebooted is the one whose name is the first entry returned by
    get_osd_running_nodes(). After the restart the function waits a fixed
    period and then checks that the ceph tools pod has recovered.

    Raises:
        AssertionError: in case the ceph-tools pod was not recovered

    """
    # Fixed grace period (seconds) given to the cluster before checking recovery
    recovery_wait_seconds = 320

    nodes = PlatformNodesFactory().get_nodes_platform()
    osd_nodes_names = get_osd_running_nodes()
    target_node_name = osd_nodes_names[0]

    osd_node_to_reboot = [
        node for node in get_nodes() if get_node_name(node) == target_node_name
    ]

    log.info(f"Rebooting OSD node: {get_node_name(osd_node_to_reboot[0])}")
    nodes.restart_nodes(osd_node_to_reboot)

    # Log the real wait time; it is 320 seconds, not exactly 5 minutes
    log.info(f"Sleeping {recovery_wait_seconds} seconds")
    time.sleep(recovery_wait_seconds)
    assert (
        wait_for_ct_pod_recovery()
    ), "Ceph tools pod failed to come up on another node"
def worker_node_shutdown(abrupt):
    """
    Shutdown the worker node that is running the ocs-operator pod

    The node is stopped, the function waits a fixed period, and then checks
    that the ceph tools pod has recovered.

    Args:
        abrupt: (bool): True if abrupt shutdown, False for permanent shutdown.
            When True, the stopped node is started again after the recovery
            check; when False it is left stopped.

    Raises:
        AssertionError: in case the ceph-tools pod was not recovered

    """
    # Fixed grace period (seconds) given to the cluster before checking recovery
    recovery_wait_seconds = 320

    nodes = PlatformNodesFactory().get_nodes_platform()
    log.info(f"Abrupt {abrupt}")
    # get ocs-operator node:
    ocs_operator_node_name = get_ocs_operator_node_name()

    # get workers node objects:
    node_to_shutdown = list()
    for node in get_nodes():
        node_name = get_node_name(node)
        log.info(f"node: {node_name}, ocs operator node: {ocs_operator_node_name}")
        if node_name == ocs_operator_node_name:
            node_to_shutdown.append(node)
            log.info(f"node to shutdown: {get_node_name(node_to_shutdown[0])}")
            nodes.stop_nodes(node_to_shutdown)
            log.info("stop instance - done!")
            break

    # Log the real wait time; it is 320 seconds, not exactly 5 minutes
    log.info(f"Sleeping {recovery_wait_seconds} seconds")
    time.sleep(recovery_wait_seconds)
    assert (
        wait_for_ct_pod_recovery()
    ), "Ceph tools pod failed to come up on another node"

    if abrupt:
        log.info("Abrupt Shutdown")
        # Only power the node back on if one was actually found and stopped
        if node_to_shutdown:
            nodes.start_nodes(nodes=node_to_shutdown)
def delete_and_create_osd_node_vsphere_upi_lso(osd_node_name, use_existing_node=False):
    """
    Unschedule, drain and delete osd node, and creating a new osd node.
    At the end of the function there should be the same number of osd nodes as
    it was in the beginning, and also ceph health should be OK.
    This function is for vSphere UPI.

    Args:
        osd_node_name (str): the name of the osd node
        use_existing_node (bool): If False, create a new node and label it.
            If True, use an existing node to replace the deleted node
            and label it.

    Returns:
        str: The new node name

    Raises:
        AssertionError: If any of the replacement steps fails: creating the
            new node, adding it to LVD and LVS, the new PV becoming
            available, creating/completing/deleting the ocs-osd-removal job,
            or deleting exactly one released PV.

    """
    from ocs_ci.ocs.platform_nodes import PlatformNodesFactory
    from ocs_ci.ocs.resources.storage_cluster import get_osd_size

    sc_name = constants.LOCAL_BLOCK_RESOURCE
    old_pv_objs = get_pv_objs_in_sc(sc_name)

    osd_node = get_node_objs(node_names=[osd_node_name])[0]
    osd_pod = get_node_pods(osd_node_name, pods_to_search=pod.get_osd_pods())[0]
    osd_id = pod.get_osd_pod_id(osd_pod)
    log.info(f"osd id to remove = {osd_id}")
    # Save the node hostname before deleting the node
    osd_node_hostname_label = get_node_hostname_label(osd_node)

    log.info("Scale down node deployments...")
    scale_down_deployments(osd_node_name)
    log.info("Scale down deployments finished successfully")

    new_node_name = delete_and_create_osd_node_vsphere_upi(
        osd_node_name, use_existing_node
    )
    assert new_node_name, "Failed to create a new node"
    log.info(f"New node created successfully. Node name: {new_node_name}")

    # If we use LSO, we need to create and attach a new disk manually
    new_node = get_node_objs(node_names=[new_node_name])[0]
    plt = PlatformNodesFactory()
    node_util = plt.get_nodes_platform()
    osd_size = get_osd_size()
    log.info(
        f"Create a new disk with size {osd_size}, and attach to node {new_node_name}"
    )
    node_util.create_and_attach_volume(node=new_node, size=osd_size)

    new_node_hostname_label = get_node_hostname_label(new_node)
    log.info(
        "Replace the old node with the new worker node in localVolumeDiscovery and localVolumeSet"
    )
    res = add_new_node_to_lvd_and_lvs(
        old_node_name=osd_node_hostname_label,
        new_node_name=new_node_hostname_label,
    )
    assert res, "Failed to add the new node to LVD and LVS"

    log.info("Verify new pv is available...")
    is_new_pv_available = verify_new_pv_available_in_sc(old_pv_objs, sc_name)
    assert is_new_pv_available, "New pv is not available"
    log.info("Finished verifying that the new pv is available")

    osd_removal_job = pod.run_osd_removal_job(osd_id)
    assert osd_removal_job, "ocs-osd-removal failed to create"
    # Keep the raw result here: wrapping it in a one-element tuple would make
    # the assertion below always pass, since a non-empty tuple is truthy.
    is_completed = pod.verify_osd_removal_job_completed_successfully(osd_id)
    assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
    log.info("ocs-osd-removal-job completed successfully")

    expected_num_of_deleted_pvs = 1
    num_of_deleted_pvs = delete_released_pvs_in_sc(sc_name)
    assert (
        num_of_deleted_pvs == expected_num_of_deleted_pvs
    ), f"num of deleted PVs is {num_of_deleted_pvs} instead of {expected_num_of_deleted_pvs}"
    log.info("Successfully deleted old pv")

    is_deleted = pod.delete_osd_removal_job(osd_id)
    assert is_deleted, "Failed to delete ocs-osd-removal-job"
    log.info("ocs-osd-removal-job deleted successfully")

    return new_node_name