예제 #1
0
def test_offline_node_with_attached_volume_and_pod(
        client, core_api, volume_name, make_deployment_with_pvc,
        reset_cluster_ready_status):  # NOQA
    """
    Test offline node with attached volume and pod

    1. Create PV/PVC/Deployment manifest.
    2. Update deployment's tolerations to 20 seconds to speed up test
    3. Update deployment's node affinity rule to avoid the current node
    4. Create volume, PV/PVC and deployment.
    5. Find the pod in the deployment and write `test_data` into it
    6. Shutdown the node pod is running on
    7. Wait for deployment to delete the pod
        1. Deployment cannot delete the pod here because kubelet doesn't
        respond
    8. Force delete the terminating pod
    9. Wait for the new pod to be created and the volume attached
    10. Check `test_data` in the new pod
    """
    toleration_seconds = 20

    apps_api = get_apps_api_client()
    cloudprovider = detect_cloudprovider()

    # NOTE(review): this rebinds the `volume_name` fixture argument to a
    # freshly generated name, exactly as the original code did.
    volume_name = generate_volume_name()
    pv_name = volume_name + "-pv"
    pvc_name = volume_name + "-pvc"
    deployment_name = volume_name + "-dep"

    longhorn_test_node_name = get_self_host_id()

    deployment_manifest = make_deployment_with_pvc(deployment_name, pvc_name)
    pod_spec = deployment_manifest["spec"]["template"]["spec"]

    # Same short NoExecute toleration for both node-failure taints, so the
    # pod is evicted after `toleration_seconds`.
    pod_spec["tolerations"] = [
        {
            "key": taint_key,
            "operator": "Exists",
            "effect": "NoExecute",
            "tolerationSeconds": toleration_seconds
        }
        for taint_key in ("node.kubernetes.io/unreachable",
                          "node.kubernetes.io/not-ready")
    ]

    # Keep the workload off the node identified by get_self_host_id().
    pod_spec["affinity"] = {
        "nodeAffinity": {
            "requiredDuringSchedulingIgnoredDuringExecution": {
                "nodeSelectorTerms": [{
                    "matchExpressions": [{
                        "key": "kubernetes.io/hostname",
                        "operator": "NotIn",
                        "values": [longhorn_test_node_name]
                    }]
                }]
            }
        }
    }

    longhorn_volume = create_and_check_volume(client, volume_name, size=SIZE)

    wait_for_volume_detached(client, volume_name)

    create_pv_for_volume(client, core_api, longhorn_volume, pv_name)

    create_pvc_for_volume(client, core_api, longhorn_volume, pvc_name)

    create_and_wait_deployment(apps_api, deployment_manifest)

    deployment_label_selector =\
        "name=" + deployment_manifest["metadata"]["labels"]["name"]

    def list_deployment_pods():
        # Pods in the "default" namespace carrying the deployment's label.
        return core_api.list_namespaced_pod(
            namespace="default",
            label_selector=deployment_label_selector).items

    pods = list_deployment_pods()
    assert len(pods) == 1

    pod_name = pods[0].metadata.name
    node_name = pods[0].spec.node_name

    test_data = generate_random_data(VOLUME_RWTEST_SIZE)
    write_pod_volume_data(core_api, pod_name, test_data)

    # Power off the node that hosts the workload pod.
    node = cloudprovider.node_id(node_name)
    cloudprovider.node_shutdown(node)

    k8s_node_down = wait_for_node_down_k8s(node_name, core_api)
    assert k8s_node_down

    # A new Longhorn API client is obtained once the node is down.
    client = get_longhorn_api_client()

    longhorn_node_down = wait_for_node_down_longhorn(node_name, client)
    assert longhorn_node_down

    # Let the toleration window (plus a 5 second margin) elapse before
    # looking for the pod that is being terminated.
    time.sleep(toleration_seconds + 5)

    # Bound up front so a zero-iteration loop fails on the assert below
    # instead of raising NameError.
    terminating_pod_name = None
    for _ in range(TERMINATING_POD_RETRYS):
        terminating_pod_name = next(
            (pod.metadata.name for pod in list_deployment_pods()
             if pod.metadata.deletion_timestamp is not None),
            None)
        if terminating_pod_name is not None:
            break
        time.sleep(TERMINATING_POD_INTERVAL)

    assert terminating_pod_name is not None

    # Force delete: a grace period of 0 is requested for the stuck pod.
    core_api.delete_namespaced_pod(namespace="default",
                                   name=terminating_pod_name,
                                   grace_period_seconds=0)

    delete_and_wait_pod(core_api, terminating_pod_name)

    assert len(list_deployment_pods()) == 1

    wait_for_volume_detached(client, volume_name)
    wait_for_volume_healthy(client, volume_name)

    pods = list_deployment_pods()
    assert len(pods) == 1

    new_pod_name = pods[0].metadata.name

    wait_pod(new_pod_name)

    resp_data = read_volume_data(core_api, new_pod_name)

    assert test_data == resp_data
예제 #2
0
def test_restore_rwo_volume_to_rwx(set_random_backupstore, client, core_api,
                                   volume_name, pvc, csi_pv, pod_make,
                                   make_deployment_with_pvc):  # NOQA
    """
    Test restoring a rwo to a rwx volume.

    1. Create a volume with 'accessMode' rwo.
    2. Create a PV and a PVC with access mode 'readwriteonce' and attach to the
       volume.
    3. Create a pod and attach to the PVC.
    4. Write some data into the pod and compute md5sum.
    5. Take a backup of the volume.
    6. Restore the backup with 'accessMode' rwx.
    7. Create PV and PVC and attach to 2 pods.
    8. Verify the data.

    Steps 1-4 are delegated to `prepare_pod_with_data_in_mb`.
    """

    data_path = "/data/test"

    # Only the checksum is needed later; the pod, PV and PVC names returned
    # alongside it are not used by this test.
    _, _, _, md5sum = \
        prepare_pod_with_data_in_mb(client, core_api, csi_pv, pvc,
                                    pod_make,
                                    volume_name,
                                    data_size_in_mb=DATA_SIZE_IN_MB_1,
                                    data_path=data_path)

    # Snapshot the source volume and back the snapshot up.
    snap = create_snapshot(client, volume_name)
    volume = client.by_id_volume(volume_name)
    volume.snapshotBackup(name=snap.name)
    wait_for_backup_completion(client, volume_name, snap.name)
    _, backup = find_backup(client, volume_name, snap.name)

    restore_volume_name = 'restored-rwx-volume'
    restore_pv_name = restore_volume_name + "-pv"
    restore_pvc_name = restore_volume_name + "-pvc"

    # Restore the backup into a new volume requested with access mode rwx.
    client.create_volume(name=restore_volume_name,
                         size=str(1 * Gi),
                         numberOfReplicas=3,
                         fromBackup=backup.url,
                         accessMode='rwx')
    wait_for_volume_creation(client, restore_volume_name)
    restore_volume = wait_for_volume_detached(client, restore_volume_name)
    create_pv_for_volume(client, core_api, restore_volume, restore_pv_name)
    create_pvc_for_volume(client, core_api, restore_volume, restore_pvc_name)

    deployment = make_deployment_with_pvc('deployment-multi-pods-test',
                                          restore_pvc_name,
                                          replicas=2)
    apps_api = get_apps_api_client()
    create_and_wait_deployment(apps_api, deployment)

    deployment_label_selector = \
        "name=" + deployment["metadata"]["labels"]["name"]

    pods = core_api.list_namespaced_pod(
        namespace="default",
        label_selector=deployment_label_selector).items

    # Fail with a clear assertion rather than an IndexError if the
    # deployment did not produce exactly the two requested pods.
    assert len(pods) == 2

    md5sum_pod1 = get_pod_data_md5sum(core_api, pods[0].metadata.name,
                                      data_path)
    md5sum_pod2 = get_pod_data_md5sum(core_api, pods[1].metadata.name,
                                      data_path)

    assert md5sum == md5sum_pod1 == md5sum_pod2
예제 #3
0
def test_recurring_jobs_when_volume_detached_unexpectedly(
        settings_reset, set_random_backupstore, client, core_api, apps_api,
        pvc, make_deployment_with_pvc):  # NOQA
    """
    Test recurring jobs when volume detached unexpectedly

    Context:

    When a recurring backup job attaches a detached volume on its own and
    the volume is then detached unexpectedly while the backup is running,
    the workload pod must still be able to use the volume afterwards.

    Steps:

    1. Set `SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED` to "true".
    2. Create a PVC on storage class `longhorn` and a deployment using it.
       Write `DATA_SIZE_IN_MB_3` MB of random data to `/data/test` from the
       pod and read the volume data back for the final comparison.
    3. Scale the deployment to 0 and wait for the volume to detach.
    4. Register a recurring backup job with cron `*/2 * * * *`.
    5. Wait for the recurring backup to start with a minimum progress of
       50, then call `crash_engine_process_with_sigkill` on the volume.
    6. Wait for the volume to be healthy with no frontend.
    7. Wait for a backup in state "complete" with an empty error.
    8. Wait for the volume to be detached.
    9. Scale the deployment to 1, wait for the replica to be ready and
       verify the data read from the pod equals the data read in step 2.
    10. Remove the recurring job and wait for the backupstore lock to
        expire.
    """

    detached_job_setting = client.by_id_setting(
        SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED)
    client.update(detached_job_setting, value="true")

    claim_name = 'pvc-volume-detached-unexpectedly-test'
    pvc['metadata']['name'] = claim_name
    pvc['spec']['storageClassName'] = 'longhorn'
    core_api.create_namespaced_persistent_volume_claim(
        body=pvc, namespace='default')

    workload = make_deployment_with_pvc(
        'deployment-volume-detached-unexpectedly-test', claim_name)

    def scale_workload(replicas):
        # Update the manifest in place and push it to the API server.
        workload['spec']['replicas'] = replicas
        apps_api.patch_namespaced_deployment(
            body=workload,
            namespace='default',
            name=workload["metadata"]["name"])

    create_and_wait_deployment(apps_api, workload)
    workload_pods = common.get_deployment_pod_names(core_api, workload)
    volume_id = get_volume_name(core_api, claim_name)

    write_pod_volume_random_data(core_api, workload_pods[0], "/data/test",
                                 DATA_SIZE_IN_MB_3)

    # NOTE(review): 'default' is passed as the third positional argument of
    # read_volume_data here and in the final check; confirm it matches the
    # helper's signature.
    expected_data = read_volume_data(core_api, workload_pods[0], 'default')

    scale_workload(0)
    detached_volume = wait_for_volume_detached(client, volume_id)

    backup_job = {
        "name": RECURRING_JOB_NAME,
        "cron": "*/2 * * * *",
        "task": "backup",
        "retain": 1
    }
    detached_volume.recurringUpdate(jobs=[backup_job])
    time.sleep(60)
    wait_for_recurring_backup_to_start(client,
                                       core_api,
                                       volume_id,
                                       expected_snapshot_count=1,
                                       minimum_progress=50)

    crash_engine_process_with_sigkill(client, core_api, volume_id)
    # The volume should get reattached once the recurring backup has been
    # interrupted.
    time.sleep(10)
    wait_for_volume_healthy_no_frontend(client, volume_id)

    # The backup state is dropped once the backup completes, which can
    # happen quickly; catching both the in-progress and the complete state
    # is unreliable, so only the complete state is checked.
    def finished_without_error(backup):
        return backup.state == "complete" and backup.error == ""

    common.wait_for_backup_state(client, volume_id, finished_without_error)

    wait_for_volume_detached(client, volume_id)

    scale_workload(1)
    wait_deployment_replica_ready(apps_api, workload["metadata"]["name"], 1)
    workload_pods = common.get_deployment_pod_names(core_api, workload)

    assert read_volume_data(core_api, workload_pods[0], 'default') == \
        expected_data

    # The backupstore itself is cleaned up by a fixture. Because the engine
    # that started the backup was crashed, its backupstore lock is still
    # present, so wait for the lock to expire before backups can be deleted.
    detached_volume.recurringUpdate(jobs=[])
    backupstore.backupstore_wait_for_lock_expiration()
예제 #4
0
def test_rwx_deployment_with_multi_pods(core_api, pvc,
                                        make_deployment_with_pvc):  # NOQA
    """
    Test deployment of 2 pods with same PVC.

    1. Create a PVC with storage class 'longhorn' and access mode
       'ReadWriteMany'.
    2. Create a deployment of 2 pods using that PVC and wait for it.
    3. Check that the deployment has exactly 2 pods.
    4. Write `test1` from the first pod and read it back from the second
       pod.
    5. Write `test2` from the second pod.
    6. Read both files through the share manager pod (under
       `/export/<volume name>/`) and compare them with the written data.
    """

    pvc_name = 'pvc-deployment-multi-pods-test'
    pvc['metadata']['name'] = pvc_name
    pvc['spec']['storageClassName'] = 'longhorn'
    pvc['spec']['accessModes'] = ['ReadWriteMany']

    core_api.create_namespaced_persistent_volume_claim(body=pvc,
                                                       namespace='default')

    deployment = make_deployment_with_pvc('deployment-multi-pods-test',
                                          pvc_name,
                                          replicas=2)
    apps_api = get_apps_api_client()
    create_and_wait_deployment(apps_api, deployment)

    pv_name = get_volume_name(core_api, pvc_name)
    share_manager_name = 'share-manager-' + pv_name

    def read_from_share_manager(filename):
        # Read a file of this volume through the share manager pod.
        command = 'cat /export/' + pv_name + '/' + filename
        return exec_command_in_pod(core_api, command,
                                   share_manager_name,
                                   LONGHORN_NAMESPACE)

    deployment_label_selector = \
        "name=" + deployment["metadata"]["labels"]["name"]

    pods = core_api.list_namespaced_pod(
        namespace="default",
        label_selector=deployment_label_selector).items

    assert len(pods) == 2

    pod_name_1 = pods[0].metadata.name
    pod_name_2 = pods[1].metadata.name

    # Data written by the first pod must be visible from the second one.
    test_data_1 = generate_random_data(VOLUME_RWTEST_SIZE)
    write_pod_volume_data(core_api, pod_name_1, test_data_1, filename='test1')

    pod_data_2 = exec_command_in_pod(core_api, 'cat /data/test1',
                                     pod_name_2, 'default')

    assert test_data_1 == pod_data_2

    test_data_2 = generate_random_data(VOLUME_RWTEST_SIZE)
    write_pod_volume_data(core_api, pod_name_2, test_data_2, filename='test2')

    # Both files must also be readable through the share manager pod.
    assert test_data_1 == read_from_share_manager('test1')
    assert test_data_2 == read_from_share_manager('test2')
예제 #5
0
def test_recurring_jobs_for_detached_volume(set_random_backupstore, client,
                                            core_api, apps_api, volume_name,
                                            make_deployment_with_pvc):  # NOQA
    """
    Test recurring jobs for detached volume

    Context:
    In the current Longhorn implementation, users cannot do recurring
    backup when volumes are detached.
    This feature gives the users an option to do recurring backup even when
    volumes are detached.
    longhorn/longhorn#1509

    Steps:
    1.  Change the setting allow-recurring-job-while-volume-detached to true.
    2.  Create and attach volume, write 50MB data to the volume.
    3.  Detach the volume.
    4.  Set the recurring backup for the volume on every minute.
    5.  Wait for a backup to complete, then check 4 times, 30 seconds
        apart, that there is exactly 1 backup.
    6.  Delete the recurring backup.
    7.  Create a PV and PVC from the volume.
    8.  Create a deployment of 1 pod using the PVC.
    9.  Write 400MB data to the volume from the pod.
    10. Scale down the deployment. Wait until the volume is detached.
    11. Set the recurring backup for every 2 minutes.
    12. Wait until the recurring backup starts, scale up the deployment to 1
        pod.
    13. Wait for the pod to auto attach after the first backup completes
        (`wait_pod_auto_attach_after_first_backup_completion`).
    14. Delete the recurring backup.
    15. Wait for the pod to reach phase `Running`.
    16. Change the setting allow-recurring-job-while-volume-detached to
        false. This is done in a `finally` clause so the setting is
        restored even when an earlier step fails.
    17. Cleanup.
    """
    recurring_job_setting = \
        client.by_id_setting(SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED)
    client.update(recurring_job_setting, value="true")

    try:
        vol = common.create_and_check_volume(client, volume_name,
                                             size=str(1 * Gi))

        lht_hostId = get_self_host_id()
        vol.attach(hostId=lht_hostId)
        vol = wait_for_volume_healthy(client, vol.name)

        data = {
            'pos': 0,
            'content': common.generate_random_data(50 * Mi),
        }
        common.write_volume_data(vol, data)

        # Give some time for the data to flush to disk
        time.sleep(15)

        vol.detach(hostId="")
        vol = common.wait_for_volume_detached(client, vol.name)

        jobs = [{
            "name": RECURRING_JOB_NAME,
            "cron": "*/1 * * * *",
            "task": "backup",
            "retain": 1
        }]
        vol.recurringUpdate(jobs=jobs)
        common.wait_for_backup_completion(client, vol.name)

        # With `retain` set to 1 the backup count must stay at exactly one
        # while the every-minute job keeps firing.
        for _ in range(4):
            bv = client.by_id_backupVolume(vol.name)
            backups = bv.backupList().data
            assert len(backups) == 1
            time.sleep(30)

        vol.recurringUpdate(jobs=[])

        pv_name = volume_name + "-pv"
        common.create_pv_for_volume(client, core_api, vol, pv_name)

        pvc_name = volume_name + "-pvc"
        common.create_pvc_for_volume(client, core_api, vol, pvc_name)

        deployment_name = volume_name + "-dep"
        deployment = make_deployment_with_pvc(deployment_name, pvc_name)
        common.create_and_wait_deployment(apps_api, deployment)

        def scale_deployment(replicas):
            # Update the manifest in place and push it to the API server.
            deployment['spec']['replicas'] = replicas
            apps_api.patch_namespaced_deployment(
                body=deployment,
                namespace='default',
                name=deployment["metadata"]["name"])

        size_mb = 400
        pod_names = common.get_deployment_pod_names(core_api, deployment)
        write_pod_volume_random_data(core_api, pod_names[0], "/data/test",
                                     size_mb)

        scale_deployment(0)

        vol = common.wait_for_volume_detached(client, vol.name)

        jobs = [{
            "name": RECURRING_JOB_NAME,
            "cron": "*/2 * * * *",
            "task": "backup",
            "retain": 1
        }]
        vol.recurringUpdate(jobs=jobs)

        common.wait_for_backup_to_start(client, vol.name)

        # Bring the workload back while the recurring backup is running.
        scale_deployment(1)

        deployment_label_name = deployment["metadata"]["labels"]["name"]
        common.wait_pod_auto_attach_after_first_backup_completion(
            client, core_api, vol.name, deployment_label_name)

        vol.recurringUpdate(jobs=[])

        pod_names = common.get_deployment_pod_names(core_api, deployment)
        common.wait_for_pod_phase(core_api, pod_names[0],
                                  pod_phase="Running")
    finally:
        # Step 16: restore the cluster-wide setting so it does not leak
        # into other tests. The setting is re-read before the update rather
        # than reusing the object fetched at the start of the test.
        recurring_job_setting = client.by_id_setting(
            SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED)
        client.update(recurring_job_setting, value="false")