Ejemplo n.º 1
0
def _first_with_pvc_label(resources, claim_name):
    """
    Return the first resource whose rook PVC label equals the claim name

    Args:
        resources (list): Objects whose ``get()`` returns the resource dict
            (containing ``metadata.labels``)
        claim_name (str): The PVC claim name expected as the value of the
            ``constants.CEPH_ROOK_IO_PVC_LABEL`` label

    Returns:
        object: The first matching resource object

    Raises:
        IndexError: If none of the resources carries the label with that value

    """
    return [
        resource for resource in resources
        if resource.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
    ][0]


def osd_device_replacement(nodes):
    """
    Replacing randomly picked osd device

    The backing volume of a random deviceset PV is detached from the platform
    side, the matching OSD is removed with the ocs-osd-removal job, and the
    function then waits until the original number of OSD PVCs is Bound and
    the original number of OSD pods is Running before running a health check.

    Args:
        nodes: The platform nodes object. It must provide
            ``get_data_volumes(pvs=...)`` and ``detach_volume(volume, node)``

    Raises:
        AssertionError: If the removal job fails to be created, completed or
            deleted, if the old PV is in an unexpected status, or if the
            cluster does not recover within the timeout
        IndexError: If no PVC, OSD pod, OSD prepare pod or OSD deployment
            matches the picked PV's claim

    """
    logger.info("Picking a PV which to be deleted from the platform side")
    osd_pvs = get_deviceset_pvs()
    osd_pv = random.choice(osd_pvs)
    osd_pv_name = osd_pv.name
    # get the claim name
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

    # Get the backing volume name
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

    # Get the corresponding PVC
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    # Remember the count so recovery can be verified at the end
    osd_pvcs_count = len(osd_pvcs)
    osd_pvc = [
        ds for ds in osd_pvcs
        if ds.get().get("metadata").get("name") == claim_name
    ][0]

    # Get the corresponding OSD pod and ID
    logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    # Remember the count so recovery can be verified at the end
    osd_pods_count = len(osd_pods)
    osd_pod = _first_with_pvc_label(osd_pods, claim_name)
    logger.info(f"OSD_POD {osd_pod.name}")
    osd_id = get_osd_pod_id(osd_pod)

    # Get the node that has the OSD pod running on
    logger.info(
        f"Getting the node that has the OSD pod {osd_pod.name} running on")
    osd_node = get_pod_node(osd_pod)
    ocp_version = get_ocp_version()
    # Below 4.6 the prepare job, PVC and deployment are deleted manually;
    # from 4.6 on the osd removal job is expected to take care of them.
    is_pre_4_6 = Version.coerce(ocp_version) < Version.coerce("4.6")
    if is_pre_4_6:
        osd_prepare_pod = _first_with_pvc_label(get_osd_prepare_pods(),
                                                claim_name)
        osd_prepare_job_name = (osd_prepare_pod.get().get("metadata").get(
            "labels").get("job-name"))
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # Get the corresponding OSD deployment
    logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
    osd_deployment = _first_with_pvc_label(get_osd_deployments(), claim_name)
    osd_deployment_name = osd_deployment.name

    # Delete the volume from the platform side
    logger.info(f"Deleting {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Scale down OSD deployment
    logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
    ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
    ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

    # Force delete OSD pod if necessary
    osd_pod_name = osd_pod.name
    logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
    try:
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
    except TimeoutError:
        osd_pod.delete(force=True)
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

    # Run ocs-osd-removal job
    osd_removal_job = run_osd_removal_job([osd_id])
    assert osd_removal_job, "ocs-osd-removal failed to create"
    is_completed = verify_osd_removal_job_completed_successfully(osd_id)
    assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
    logger.info("ocs-osd-removal-job completed successfully")

    osd_pvc_name = osd_pvc.name

    if is_pre_4_6:
        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(resource_name=osd_prepare_job_name,
                                            timeout=120)

        # Delete the OSD PVC
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(resource_name=osd_deployment_name,
                                           timeout=120)
    else:
        # If ocp version is '4.6' and above the osd removal job should
        # delete the OSD prepare job, OSD PVC, OSD deployment
        # We just need to verify the old PV is in the expected status
        logger.info(
            f"Verify that the old PV '{osd_pv_name}' is in the expected status"
        )
        if cluster.is_lso_cluster():
            expected_old_pv_statuses = [constants.STATUS_RELEASED]
        else:
            expected_old_pv_statuses = [
                constants.STATUS_RELEASED,
                constants.STATUS_FAILED,
            ]

        old_pv_status = osd_pv.ocp.get_resource_status(osd_pv_name)
        if old_pv_status not in expected_old_pv_statuses:
            # Log the problem and also attach it to the AssertionError;
            # logger.warning() returns None, so it cannot serve as the
            # assertion message itself.
            unexpected_status_msg = (
                f"The old PV '{osd_pv_name}' is not in "
                f"the expected statuses: {expected_old_pv_statuses}")
            logger.warning(unexpected_status_msg)
            raise AssertionError(unexpected_status_msg)

    # Delete PV
    logger.info(f"Verifying deletion of PV {osd_pv_name}")
    try:
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
    except TimeoutError:
        # The PV did not go away on its own: delete it explicitly
        osd_pv.delete()
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

    # If we use LSO, we need to create and attach a new disk manually
    if cluster.is_lso_cluster():
        node.add_disk_to_node(osd_node)

    if is_pre_4_6:
        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(
            f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
        rook_operator_pod.delete()

    # Delete the OSD removal job
    logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
    is_deleted = delete_osd_removal_job(osd_id)
    assert is_deleted, "Failed to delete ocs-osd-removal-job"
    logger.info("ocs-osd-removal-job deleted successfully")

    timeout = 600
    # Wait for OSD PVC to get created and reach Bound state
    logger.info(
        "Waiting for a new OSD PVC to get created and reach Bound state")
    assert osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count,
    ), (f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
        )
    # Wait for OSD pod to get created and reach Running state
    logger.info(
        "Waiting for a new OSD pod to get created and reach Running state")
    assert osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count,
    ), (f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
        f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
        )

    # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
    # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
    if not is_pre_4_6:
        silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
            osd_pod_name)
        if not silence_osd_crash:
            logger.info("Didn't find ceph osd crash warning")
    sanity_helpers = Sanity()
    sanity_helpers.health_check(tries=120)
Ejemplo n.º 2
0
def delete_and_create_osd_node_vsphere_upi_lso(osd_node_name,
                                               use_existing_node=False):
    """
    Unschedule, drain and delete osd node, and creating a new osd node.
    At the end of the function there should be the same number of osd nodes as
    it was in the beginning, and also ceph health should be OK.
    This function is for vSphere UPI.

    Args:
        osd_node_name (str): the name of the osd node
        use_existing_node (bool): If False, create a new node and label it.
            If True, use an existing node to replace the deleted node
            and label it.

    Returns:
        str: The new node name

    Raises:
        AssertionError: If the new node is not created, the new node cannot
            be added to LVD and LVS, the new PV is not available, the
            ocs-osd-removal job fails to be created, completed or deleted,
            or an unexpected number of released PVs was deleted

    """
    sc_name = constants.LOCAL_BLOCK_RESOURCE
    # Snapshot the PVs of the storage class before the replacement so the new
    # PV can be told apart from the old ones later on
    old_pv_objs = get_pv_objs_in_sc(sc_name)

    osd_node = get_node_objs(node_names=[osd_node_name])[0]
    osd_pod = get_node_pods(osd_node_name,
                            pods_to_search=pod.get_osd_pods())[0]
    osd_id = pod.get_osd_pod_id(osd_pod)
    log.info(f"osd id to remove = {osd_id}")
    # Save the node hostname before deleting the node
    osd_node_hostname_label = get_node_hostname_label(osd_node)

    log.info("Scale down node deployments...")
    scale_down_deployments(osd_node_name)
    log.info("Scale down deployments finished successfully")

    new_node_name = delete_and_create_osd_node_vsphere_upi(
        osd_node_name, use_existing_node)
    assert new_node_name, "Failed to create a new node"
    log.info(f"New node created successfully. Node name: {new_node_name}")

    # If we use LSO, we need to create and attach a new disk manually
    new_node = get_node_objs(node_names=[new_node_name])[0]
    add_disk_to_node(new_node)

    new_node_hostname_label = get_node_hostname_label(new_node)
    log.info(
        "Replace the old node with the new worker node in localVolumeDiscovery and localVolumeSet"
    )
    res = add_new_node_to_lvd_and_lvs(
        old_node_name=osd_node_hostname_label,
        new_node_name=new_node_hostname_label,
    )
    assert res, "Failed to add the new node to LVD and LVS"

    log.info("Verify new pv is available...")
    is_new_pv_available = verify_new_pv_available_in_sc(old_pv_objs, sc_name)
    assert is_new_pv_available, "New pv is not available"
    log.info("Finished verifying that the new pv is available")

    osd_removal_job = pod.run_osd_removal_job(osd_id)
    assert osd_removal_job, "ocs-osd-removal failed to create"
    # Keep the plain return value here: wrapping it in a one-element tuple
    # would make it always truthy and the assert below could never fail.
    is_completed = pod.verify_osd_removal_job_completed_successfully(osd_id)
    assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
    log.info("ocs-osd-removal-job completed successfully")

    # Either zero or one released PV is accepted as deleted at this point
    expected_num_of_deleted_pvs = [0, 1]
    num_of_deleted_pvs = delete_released_pvs_in_sc(sc_name)
    assert num_of_deleted_pvs in expected_num_of_deleted_pvs, (
        f"num of deleted PVs is {num_of_deleted_pvs} "
        f"instead of the expected values {expected_num_of_deleted_pvs}")
    log.info(f"num of deleted PVs is {num_of_deleted_pvs}")
    log.info("Successfully deleted old pv")

    is_deleted = pod.delete_osd_removal_job(osd_id)
    assert is_deleted, "Failed to delete ocs-osd-removal-job"
    log.info("ocs-osd-removal-job deleted successfully")

    return new_node_name