コード例 #1
0
def test_recurring_job_in_storageclass(set_random_backupstore, client,
                                       core_api, storage_class,
                                       statefulset):  # NOQA
    """
    Test create volume with StorageClass contains recurring jobs

    1. Create a StorageClass with recurring jobs
    2. Create a StatefulSet with PVC template and StorageClass
    3. Verify the recurring jobs run correctly.
    """
    statefulset_name = 'recurring-job-in-storageclass-test'
    update_statefulset_manifests(statefulset, storage_class, statefulset_name)
    storage_class["parameters"]["recurringJobs"] = json.dumps(create_jobs1())

    create_storage_class(storage_class)

    # wait until the beginning of an even minute
    wait_until_begin_of_an_even_minute()

    start_time = datetime.utcnow()
    create_and_wait_statefulset(statefulset)
    statefulset_creating_duration = datetime.utcnow() - start_time

    assert 150 > statefulset_creating_duration.seconds

    # We want to write data exactly at the 150th second since the start_time
    time.sleep(150 - statefulset_creating_duration.seconds)

    pod_info = get_statefulset_pod_info(core_api, statefulset)
    volume_info = [p['pv_name'] for p in pod_info]
    pod_names = [p['pod_name'] for p in pod_info]

    # write random data to volume to trigger recurring snapshot and backup job
    volume_data_path = "/data/test"
    for pod_name in pod_names:
        write_pod_volume_random_data(core_api, pod_name, volume_data_path, 2)

    time.sleep(150)  # 2.5 minutes

    for volume_name in volume_info:  # NOQA
        volume = client.by_id_volume(volume_name)
        check_jobs1_result(volume)
コード例 #2
0
def test_allow_volume_creation_with_degraded_availability_csi(
        client, core_api, apps_api, make_deployment_with_pvc):  # NOQA
    """
    Test Allow Volume Creation with Degraded Availability (CSI)

    Requirement:
    1. Set `allow-volume-creation-with-degraded-availability` to true.
    2. Set `node-level-soft-anti-affinity` to false.

    Steps:
    1. Disable scheduling for node 3.
    2. Create a Deployment Pod with a volume and 3 replicas.
        1. After the volume is attached, scheduling error should be seen.
    3. Write data to the Pod.
    4. Scale down the deployment to 0 to detach the volume.
        1. Scheduled condition should become true.
    5. Scale up the deployment back to 1 and verify the data.
        1. Scheduled condition should become false.
    6. Enable the scheduling for node 3.
        1. Volume should start rebuilding on the node 3 soon.
        2. Once the rebuilding starts, the scheduled condition should become
           true.
    7. Once rebuild finished, scale down and back the deployment to verify
       the data.
    """
    setting = client.by_id_setting(common.SETTING_DEGRADED_AVAILABILITY)
    client.update(setting, value="true")

    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")

    nodes = client.list_node()
    node3 = nodes[2]
    client.update(node3, allowScheduling=False)

    vol = common.create_and_check_volume(client, generate_volume_name(),
                                         size=str(500 * Mi))

    pv_name = vol.name + "-pv"
    common.create_pv_for_volume(client, core_api, vol, pv_name)

    pvc_name = vol.name + "-pvc"
    common.create_pvc_for_volume(client, core_api, vol, pvc_name)

    deployment_name = vol.name + "-dep"
    deployment = make_deployment_with_pvc(deployment_name, pvc_name)
    deployment["spec"]["replicas"] = 3
    apps_api.create_namespaced_deployment(body=deployment, namespace='default')
    common.wait_for_volume_status(client, vol.name,
                                  common.VOLUME_FIELD_STATE,
                                  common.VOLUME_STATE_ATTACHED)
    common.wait_scheduling_failure(client, vol.name)

    data_path = "/data/test"
    pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name)
    common.write_pod_volume_random_data(core_api, pod.metadata.name,
                                        data_path, common.DATA_SIZE_IN_MB_2)
    created_md5sum = get_pod_data_md5sum(core_api, pod.metadata.name,
                                         data_path)

    deployment['spec']['replicas'] = 0
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    vol = common.wait_for_volume_detached(client, vol.name)
    assert vol.conditions[VOLUME_CONDITION_SCHEDULED]['status'] == "True"

    deployment['spec']['replicas'] = 1
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    common.wait_for_volume_status(client, vol.name,
                                  common.VOLUME_FIELD_STATE,
                                  common.VOLUME_STATE_ATTACHED)
    common.wait_for_volume_condition_scheduled(client, vol.name, "status",
                                               common.CONDITION_STATUS_FALSE)
    pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name)
    assert created_md5sum == get_pod_data_md5sum(core_api,
                                                 pod.metadata.name,
                                                 data_path)

    client.update(node3, allowScheduling=True)
    common.wait_for_rebuild_start(client, vol.name)
    vol = client.by_id_volume(vol.name)
    assert vol.conditions[VOLUME_CONDITION_SCHEDULED]['status'] == "True"
    common.wait_for_rebuild_complete(client, vol.name)

    deployment['spec']['replicas'] = 0
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    common.wait_for_volume_detached(client, vol.name)

    deployment['spec']['replicas'] = 1
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    common.wait_for_volume_status(client, vol.name,
                                  common.VOLUME_FIELD_STATE,
                                  common.VOLUME_STATE_ATTACHED)

    pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name)
    assert created_md5sum == get_pod_data_md5sum(core_api,
                                                 pod.metadata.name,
                                                 data_path)
コード例 #3
0
def write_data_into_pod(pod_name_and_data_path):
    pod_info = pod_name_and_data_path.split(':')
    core_api = get_core_api_client()  # NOQA
    write_pod_volume_random_data(core_api, pod_info[0], pod_info[1],
                                 DATA_SIZE_IN_MB_3)
コード例 #4
0
def test_engine_live_upgrade_with_intensive_data_writing(
        client, core_api, volume_name, pod_make):  # NOQA
    """
    Test engine live upgrade with intensive data writing

    1. Deploy a compatible new engine image
    2. Create a volume(with the old default engine image) with /PV/PVC/Pod
       and wait for pod to be deployed.
    3. Write data to a tmp file in the pod and get the md5sum
    4. Upgrade the volume to the new engine image without waiting.
    5. Keep copying data from the tmp file to the volume
       during the live upgrade.
    6. Wait until the upgrade completed, verify the volume engine image changed
    7. Wait for new replica mode update then check the engine status.
    8. Verify all engine and replicas' engine image changed
    9. Verify the reference count of the new engine image changed
    10. Check the existing data.
        Then write new data to the upgraded volume and get the md5sum.
    11. Delete the pod and wait for the volume detached.
        Then check engine and replicas's engine image again.
    12. Recreate the pod.
    13. Check if the attached volume is state `healthy`
        rather than `degraded`.
    14. Check the data.
    """
    default_img = common.get_default_engine_image(client)
    default_img_name = default_img.name
    default_img = wait_for_engine_image_ref_count(client, default_img_name, 0)
    cli_v = default_img.cliAPIVersion
    cli_minv = default_img.cliAPIMinVersion
    ctl_v = default_img.controllerAPIVersion
    ctl_minv = default_img.controllerAPIMinVersion
    data_v = default_img.dataFormatVersion
    data_minv = default_img.dataFormatMinVersion
    engine_upgrade_image = common.get_upgrade_test_image(
        cli_v, cli_minv, ctl_v, ctl_minv, data_v, data_minv)

    new_img = client.create_engine_image(image=engine_upgrade_image)
    new_img_name = new_img.name
    ei_status_value = get_engine_image_status_value(client, new_img_name)
    new_img = wait_for_engine_image_state(client, new_img_name,
                                          ei_status_value)
    assert new_img.refCount == 0
    assert new_img.noRefSince != ""

    default_img = common.get_default_engine_image(client)
    default_img_name = default_img.name

    pod_name = volume_name + "-pod"
    pv_name = volume_name + "-pv"
    pvc_name = volume_name + "-pvc"

    pod = pod_make(name=pod_name)
    volume = create_and_check_volume(client,
                                     volume_name,
                                     num_of_replicas=3,
                                     size=str(1 * Gi))
    original_engine_image = volume.engineImage
    assert original_engine_image != engine_upgrade_image

    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)
    pod['spec']['volumes'] = [create_pvc_spec(pvc_name)]
    create_and_wait_pod(core_api, pod)

    volume = client.by_id_volume(volume_name)
    assert volume.engineImage == original_engine_image
    assert volume.currentImage == original_engine_image
    engine = get_volume_engine(volume)
    assert engine.engineImage == original_engine_image
    assert engine.currentImage == original_engine_image
    for replica in volume.replicas:
        assert replica.engineImage == original_engine_image
        assert replica.currentImage == original_engine_image

    data_path0 = "/tmp/test"
    data_path1 = "/data/test1"
    write_pod_volume_random_data(core_api, pod_name, data_path0,
                                 RANDOM_DATA_SIZE_LARGE)
    original_md5sum1 = get_pod_data_md5sum(core_api, pod_name, data_path0)

    volume.engineUpgrade(image=engine_upgrade_image)
    # Keep writing data to the volume during the live upgrade
    copy_pod_volume_data(core_api, pod_name, data_path0, data_path1)

    # Wait for live upgrade complete
    wait_for_volume_current_image(client, volume_name, engine_upgrade_image)
    volume = wait_for_volume_replicas_mode(client, volume_name, "RW")
    engine = get_volume_engine(volume)
    assert engine.engineImage == engine_upgrade_image
    check_volume_endpoint(volume)

    wait_for_engine_image_ref_count(client, default_img_name, 0)
    wait_for_engine_image_ref_count(client, new_img_name, 1)

    volume_file_md5sum1 = get_pod_data_md5sum(core_api, pod_name, data_path1)
    assert volume_file_md5sum1 == original_md5sum1

    data_path2 = "/data/test2"
    write_pod_volume_random_data(core_api, pod_name, data_path2,
                                 RANDOM_DATA_SIZE_SMALL)
    original_md5sum2 = get_pod_data_md5sum(core_api, pod_name, data_path2)

    delete_and_wait_pod(core_api, pod_name)
    volume = wait_for_volume_detached(client, volume_name)
    assert len(volume.replicas) == 3
    assert volume.engineImage == engine_upgrade_image
    engine = get_volume_engine(volume)
    assert engine.engineImage == engine_upgrade_image
    for replica in volume.replicas:
        assert replica.engineImage == engine_upgrade_image

    create_and_wait_pod(core_api, pod)
    common.wait_for_volume_healthy(client, volume_name)

    volume_file_md5sum1 = get_pod_data_md5sum(core_api, pod_name, data_path1)
    assert volume_file_md5sum1 == original_md5sum1
    volume_file_md5sum2 = get_pod_data_md5sum(core_api, pod_name, data_path2)
    assert volume_file_md5sum2 == original_md5sum2
コード例 #5
0
def test_recurring_jobs_when_volume_detached_unexpectedly(
        settings_reset, set_random_backupstore, client, core_api, apps_api,
        pvc, make_deployment_with_pvc):  # NOQA
    """
    Test recurring jobs when volume detached unexpectedly

    Context:

    If the volume is automatically attached by the recurring backup job,
    make sure that workload pod eventually is able to use the volume
    when volume is detached unexpectedly during the backup process.

    Steps:

    1. Create a volume, attach to a pod of a deployment,
       write 500MB to the volume.
    2. Scale down the deployment. The volume is detached.
    3. Turn on `Allow Recurring Job While Volume Is Detached` setting.
    4. Create a recurring backup job that runs every 2 mins.
    5. Wait until the recurring backup job starts and the backup progress
       is > 50%, kill the engine process of the volume.
    6. Verify volume automatically reattached and is healthy again.
    7. Wait until the backup finishes.
    8. Wait for the volume to be in detached state with
       `frontendDisabled=false`
    9. Scale up the deployment.
       Verify that we can read the file `lost+found` from the workload pod
    10. Turn off `Allow Recurring Job While Volume Is Detached` setting
       Clean up backups, volumes.
    """

    recurring_job_setting = \
        client.by_id_setting(SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED)
    client.update(recurring_job_setting, value="true")

    pvc_name = 'pvc-volume-detached-unexpectedly-test'
    pvc['metadata']['name'] = pvc_name
    pvc['spec']['storageClassName'] = 'longhorn'

    core_api.create_namespaced_persistent_volume_claim(body=pvc,
                                                       namespace='default')

    deployment = make_deployment_with_pvc(
        'deployment-volume-detached-unexpectedly-test', pvc_name)
    create_and_wait_deployment(apps_api, deployment)
    pod_names = common.get_deployment_pod_names(core_api, deployment)
    vol_name = get_volume_name(core_api, pvc_name)

    write_pod_volume_random_data(core_api, pod_names[0], "/data/test",
                                 DATA_SIZE_IN_MB_3)

    data = read_volume_data(core_api, pod_names[0], 'default')
    deployment['spec']['replicas'] = 0
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment["metadata"]["name"])
    vol = wait_for_volume_detached(client, vol_name)

    jobs = [{
        "name": RECURRING_JOB_NAME,
        "cron": "*/2 * * * *",
        "task": "backup",
        "retain": 1
    }]
    vol.recurringUpdate(jobs=jobs)
    time.sleep(60)
    wait_for_recurring_backup_to_start(client,
                                       core_api,
                                       vol_name,
                                       expected_snapshot_count=1,
                                       minimum_progress=50)

    crash_engine_process_with_sigkill(client, core_api, vol_name)
    # Check if the volume is reattached after recurring backup is interrupted
    time.sleep(10)
    wait_for_volume_healthy_no_frontend(client, vol_name)

    # Since the backup state is removed after the backup complete and it
    # could happen quickly. Checking for the both in-progress and complete
    # state could be hard to catch, thus we only check the complete state
    def backup_complete_predicate(b):
        return b.state == "complete" and b.error == ""

    common.wait_for_backup_state(client, vol_name, backup_complete_predicate)

    wait_for_volume_detached(client, vol_name)

    deployment['spec']['replicas'] = 1
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment["metadata"]["name"])
    wait_deployment_replica_ready(apps_api, deployment["metadata"]["name"], 1)
    pod_names = common.get_deployment_pod_names(core_api, deployment)

    assert read_volume_data(core_api, pod_names[0], 'default') == data

    # Use fixture to cleanup the backupstore and since we
    # crashed the engine replica initiated the backup, it's
    # backupstore lock will still be present, so we need
    # to wait till the lock is expired, before we can delete
    # the backups
    vol.recurringUpdate(jobs=[])
    backupstore.backupstore_wait_for_lock_expiration()
コード例 #6
0
def test_recurring_jobs_for_detached_volume(set_random_backupstore, client,
                                            core_api, apps_api, volume_name,
                                            make_deployment_with_pvc):  # NOQA
    """
    Test recurring jobs for detached volume

    Context:
    In the current Longhorn implementation, users cannot do recurring
    backup when volumes are detached.
    This feature gives the users an option to do recurring backup even when
    volumes are detached.
    longhorn/longhorn#1509

    Steps:
    1.  Change the setting allow-recurring-job-while-volume-detached to true.
    2.  Create and attach volume, write 50MB data to the volume.
    3.  Detach the volume.
    4.  Set the recurring backup for the volume on every minute.
    5.  In a 2-minutes retry loop, verify that there is exactly 1 new backup.
    6.  Delete the recurring backup.
    7.  Create a PV and PVC from the volume.
    8.  Create a deployment of 1 pod using the PVC.
    9.  Write 400MB data to the volume from the pod.
    10. Scale down the deployment. Wait until the volume is detached.
    11. Set the recurring backup for every 2 minutes.
    12. Wait util the recurring backup starts, scale up the deployment to 1
        pod.
    13. Verify that during the recurring backup, the volume's frontend is
        disabled, and pod cannot start.
    14. Wait for the recurring backup finishes.
        Delete the recurring backup.
    15. In a 10-minutes retry loop, verify that the pod can eventually start.
    16. Change the setting allow-recurring-job-while-volume-detached to false.
    17. Cleanup.
    """
    recurring_job_setting = \
        client.by_id_setting(SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED)
    client.update(recurring_job_setting, value="true")

    vol = common.create_and_check_volume(client, volume_name, size=str(1 * Gi))

    lht_hostId = get_self_host_id()
    vol.attach(hostId=lht_hostId)
    vol = wait_for_volume_healthy(client, vol.name)

    data = {
        'pos': 0,
        'content': common.generate_random_data(50 * Mi),
    }
    common.write_volume_data(vol, data)

    # Give sometimes for data to flush to disk
    time.sleep(15)

    vol.detach(hostId="")
    vol = common.wait_for_volume_detached(client, vol.name)

    jobs = [{
        "name": RECURRING_JOB_NAME,
        "cron": "*/1 * * * *",
        "task": "backup",
        "retain": 1
    }]
    vol.recurringUpdate(jobs=jobs)
    common.wait_for_backup_completion(client, vol.name)
    for _ in range(4):
        bv = client.by_id_backupVolume(vol.name)
        backups = bv.backupList().data
        assert len(backups) == 1
        time.sleep(30)

    vol.recurringUpdate(jobs=[])

    pv_name = volume_name + "-pv"
    common.create_pv_for_volume(client, core_api, vol, pv_name)

    pvc_name = volume_name + "-pvc"
    common.create_pvc_for_volume(client, core_api, vol, pvc_name)

    deployment_name = volume_name + "-dep"
    deployment = make_deployment_with_pvc(deployment_name, pvc_name)
    common.create_and_wait_deployment(apps_api, deployment)

    size_mb = 400
    pod_names = common.get_deployment_pod_names(core_api, deployment)
    write_pod_volume_random_data(core_api, pod_names[0], "/data/test", size_mb)

    deployment['spec']['replicas'] = 0
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment["metadata"]["name"])

    vol = common.wait_for_volume_detached(client, vol.name)

    jobs = [{
        "name": RECURRING_JOB_NAME,
        "cron": "*/2 * * * *",
        "task": "backup",
        "retain": 1
    }]
    vol.recurringUpdate(jobs=jobs)

    common.wait_for_backup_to_start(client, vol.name)

    deployment['spec']['replicas'] = 1
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment["metadata"]["name"])

    deployment_label_name = deployment["metadata"]["labels"]["name"]
    common.wait_pod_auto_attach_after_first_backup_completion(
        client, core_api, vol.name, deployment_label_name)

    vol.recurringUpdate(jobs=[])

    pod_names = common.get_deployment_pod_names(core_api, deployment)
    common.wait_for_pod_phase(core_api, pod_names[0], pod_phase="Running")
コード例 #7
0
def test_cloning_basic(client, core_api, pvc, pod, clone_pvc, clone_pod, storage_class_name='longhorn'):  # NOQA
    """
    1. Create a PVC:
        ```yaml
        apiVersion: v1
        kind: PersistentVolumeClaim
        metadata:
          name: source-pvc
        spec:
          storageClassName: longhorn
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 3Gi
        ```
    2. Specify the `source-pvc` in a pod yaml and start the pod
    3. Wait for the pod to be running, write some data to the mount
       path of the volume
    4. Clone a volume by creating the PVC:
        ```yaml
        apiVersion: v1
        kind: PersistentVolumeClaim
        metadata:
          name: cloned-pvc
        spec:
          storageClassName: longhorn
          dataSource:
            name: source-pvc
            kind: PersistentVolumeClaim
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 3Gi
        ```
    5. Wait for the `CloneStatus.State` in `cloned-pvc` to be `completed`
    6. Clone volume should get detached after cloning completion, wait for it.
    7. Specify the `cloned-pvc` in a cloned pod yaml and deploy the cloned pod
    8. In 3-min retry loop, wait for the cloned pod to be running
    9. Verify the data in `cloned-pvc` is the same as in `source-pvc`
    10. In 2-min retry loop, verify the volume of the `clone-pvc` eventually
       becomes healthy
    """
    # Step-1
    source_pvc_name = 'source-pvc' + generate_random_suffix()
    pvc['metadata']['name'] = source_pvc_name
    pvc['spec']['storageClassName'] = storage_class_name
    core_api.create_namespaced_persistent_volume_claim(
        body=pvc, namespace='default')
    wait_for_pvc_phase(core_api, source_pvc_name, "Bound")

    # Step-2
    pod_name = 'source-pod' + generate_random_suffix()
    pod['metadata']['name'] = pod_name
    pod['spec']['volumes'] = [create_pvc_spec(source_pvc_name)]
    create_and_wait_pod(core_api, pod)

    # Step-3
    write_pod_volume_random_data(core_api, pod_name,
                                 '/data/test', DATA_SIZE_IN_MB_2)
    source_data = get_pod_data_md5sum(core_api, pod_name, '/data/test')

    # Step-4
    clone_pvc_name = 'clone-pvc' + generate_random_suffix()
    clone_pvc['metadata']['name'] = clone_pvc_name
    clone_pvc['spec']['storageClassName'] = storage_class_name
    clone_pvc['spec']['dataSource'] = {
        'name': source_pvc_name,
        'kind': 'PersistentVolumeClaim'
    }
    core_api.create_namespaced_persistent_volume_claim(
        body=clone_pvc, namespace='default')
    wait_for_pvc_phase(core_api, clone_pvc_name, "Bound")

    # Step-5
    clone_volume_name = get_volume_name(core_api, clone_pvc_name)
    wait_for_volume_clone_status(client, clone_volume_name, VOLUME_FIELD_STATE,
                                 VOLUME_FIELD_CLONE_COMPLETED)

    # Step-6
    wait_for_volume_detached(client, clone_volume_name)

    # Step-7,8
    clone_pod_name = 'clone-pod' + generate_random_suffix()
    clone_pod['metadata']['name'] = clone_pod_name
    clone_pod['spec']['volumes'] = [create_pvc_spec(clone_pvc_name)]
    create_and_wait_pod(core_api, clone_pod)
    clone_data = get_pod_data_md5sum(core_api, clone_pod_name, '/data/test')

    # Step-9
    assert source_data == clone_data

    # Step-10
    wait_for_volume_healthy(client, clone_volume_name)
コード例 #8
0
def test_cloning_interrupted(client, core_api, pvc, pod, clone_pvc, clone_pod):  # NOQA
    """
    1. Create a PVC:
        ```yaml
        apiVersion: v1
        kind: PersistentVolumeClaim
        metadata:
          name: source-pvc
        spec:
          storageClassName: longhorn
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 3Gi
        ```
    2. Specify the `source-pvc` in a pod yaml and start the pod
    3. Wait for the pod to be running, write 500MB of data to the mount
       path of the volume
    4. Clone a volume by creating the PVC:
        ```yaml
        apiVersion: v1
        kind: PersistentVolumeClaim
        metadata:
          name: cloned-pvc
        spec:
          storageClassName: longhorn
          dataSource:
            name: source-pvc
            kind: PersistentVolumeClaim
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 3Gi
        ```
    5. Wait for the `CloneStatus.State` in `cloned-pvc` to be `initiated`
    6. Kill all replicas process of the `source-pvc`
    7. Wait for the `CloneStatus.State` in `cloned-pvc` to be `failed`
    8. Clean up `clone-pvc`
    9. Redeploy `cloned-pvc` and clone pod
    10. In 3-min retry loop, verify cloned pod become running
    11. `cloned-pvc` has the same data as `source-pvc`
    12. In 2-min retry loop, verify the volume of the `clone-pvc`
        eventually becomes healthy.
    """
    # Step-1
    source_pvc_name = 'source-pvc' + generate_random_suffix()
    pvc['metadata']['name'] = source_pvc_name
    pvc['spec']['storageClassName'] = 'longhorn'
    core_api.create_namespaced_persistent_volume_claim(
        body=pvc, namespace='default')
    wait_for_pvc_phase(core_api, source_pvc_name, "Bound")

    # Step-2
    pod_name = 'source-pod' + generate_random_suffix()
    pod['metadata']['name'] = pod_name
    pod['spec']['volumes'] = [create_pvc_spec(source_pvc_name)]
    create_and_wait_pod(core_api, pod)

    # Step-3
    write_pod_volume_random_data(core_api, pod_name,
                                 '/data/test', DATA_SIZE_IN_MB_3)
    source_data = get_pod_data_md5sum(core_api, pod_name, '/data/test')

    source_volume_name = get_volume_name(core_api, source_pvc_name)

    # Step-4
    clone_pvc_name = 'clone-pvc' + generate_random_suffix()
    clone_pvc['metadata']['name'] = clone_pvc_name
    clone_pvc['spec']['storageClassName'] = 'longhorn'
    clone_pvc['spec']['dataSource'] = {
        'name': source_pvc_name,
        'kind': 'PersistentVolumeClaim'
    }
    core_api.create_namespaced_persistent_volume_claim(
        body=clone_pvc, namespace='default')

    # Step-5
    clone_volume_name = get_clone_volume_name(client, source_volume_name)
    wait_for_volume_clone_status(client, clone_volume_name, VOLUME_FIELD_STATE,
                                 'initiated')

    # Step-6
    crash_replica_processes(client, core_api, source_volume_name)

    # Step-7
    wait_for_volume_faulted(client, source_volume_name)
    wait_for_volume_clone_status(client, clone_volume_name, VOLUME_FIELD_STATE,
                                 'failed')

    # Step-8
    delete_and_wait_pvc(core_api, clone_pvc_name)

    # Step-9
    clone_pvc_name = 'clone-pvc-2' + generate_random_suffix()
    clone_pvc['metadata']['name'] = clone_pvc_name
    clone_pvc['spec']['storageClassName'] = 'longhorn'
    clone_pvc['spec']['dataSource'] = {
        'name': source_pvc_name,
        'kind': 'PersistentVolumeClaim'
    }
    core_api.create_namespaced_persistent_volume_claim(
        body=clone_pvc, namespace='default')
    wait_for_pvc_phase(core_api, clone_pvc_name, "Bound")

    # Step-9
    clone_pod_name = 'clone-pod' + generate_random_suffix()
    clone_pod['metadata']['name'] = clone_pod_name
    clone_pod['spec']['volumes'] = [create_pvc_spec(clone_pvc_name)]
    create_and_wait_pod(core_api, clone_pod)

    # Step-10
    clone_volume_name = get_volume_name(core_api, clone_pvc_name)
    wait_for_volume_clone_status(client, clone_volume_name, VOLUME_FIELD_STATE,
                                 VOLUME_FIELD_CLONE_COMPLETED)

    # Step-11
    clone_data = get_pod_data_md5sum(core_api, clone_pod_name, '/data/test')
    assert source_data == clone_data

    # Step-12
    wait_for_volume_healthy(client, clone_volume_name)
コード例 #9
0
def test_data_locality_basic(client, core_api, volume_name, pod,
                             settings_reset):  # NOQA
    """
    Test data locality basic feature

    Context:

    Data Locality feature allows users to have an option to keep a local
    replica on the same node as the consuming pod.
    Longhorn is currently supporting 2 modes:
    - disabled: Longhorn does not try to keep a local replica
    - best-effort: Longhorn try to keep a local replica

    See manual tests at:
    https://github.com/longhorn/longhorn/issues/1045#issuecomment-680706283

    Steps:

    Case 1: Test that Longhorn builds a local replica on the engine node

    1. Create a volume(1) with 1 replica and dataLocality set to disabled
    2. Find node where the replica is located on.
       Let's call the node is replica-node
    3. Attach the volume to a node different than replica-node.
       Let call the node is engine-node
    4. Write 200MB data to volume(1)
    5. Use a retry loop to verify that Longhorn does not create
       a replica on the engine-node
    6. Update dataLocality to best-effort for volume(1)
    7. Use a retry loop to verify that Longhorn creates and rebuilds
       a replica on the engine-node and remove the other replica
    8. detach the volume(1) and attach it to a different node.
       Let's call the new node is new-engine-node and the old
       node is old-engine-node
    9. Wait for volume(1) to finish attaching
    10. Use a retry loop to verify that Longhorn creates and rebuilds
       a replica on the new-engine-node and remove the replica on
       old-engine-node

    Case 2: Test that Longhorn prioritizes deleting replicas on the same node

    1. Add the tag AVAIL to node-1 and node-2
    2. Set node soft anti-affinity to `true`.
    3. Create a volume(2) with 3 replicas and dataLocality set to best-effort
    4. Use a retry loop to verify that all 3 replicas are on node-1 and
        node-2, no replica is on node-3
    5. Attach volume(2) to node-3
    6. User a retry loop to verify that there is no replica on node-3 and
        we can still read/write to volume(2)
    7. Find the node which contains 2 replicas.
        Let call the node is most-replica-node
    8. Set the replica count to 2 for volume(2)
    9. Verify that Longhorn remove one replica from most-replica-node

    Case 3: Test that the volume is not corrupted if there is an unexpected
    detachment during building local replica

    1. Remove the tag AVAIL from node-1 and node-2
       Set node soft anti-affinity to `false`.
    2. Create a volume(3) with 1 replicas and dataLocality set to best-effort
    3. Attach volume(3) to node-3.
    4. Use a retry loop to verify that volume(3) has only 1 replica on node-3
    5. Write 800MB data to volume(3)
    6. Detach volume(3)
    7. Attach volume(3) to node-1
    8. Use a retry loop to:
        Wait until volume(3) finishes attaching.
        Wait until Longhorn start rebuilding a replica on node-1
        Immediately detach volume(3)
    9. Verify that the replica on node-1 is in ERR state.
    10. Attach volume(3) to node-1
    11. Wait until volume(3) finishes attaching.
    12. Use a retry loop to verify the Longhorn cleanup the ERR replica,
        rebuild a new replica on node-1, and remove the replica on node-3

    Case 4: Make sure failed to schedule local replica doesn't block the
    the creation of other replicas.

    1. Disable scheduling for node-3
    2. Create a vol with 1 replica, `dataLocality = best-effort`.
        The replica is scheduled on a node (say node-1)
    3. Attach vol to node-3. There is a fail-to-schedule
        replica with Spec.HardNodeAffinity=node-3
    4. Increase numberOfReplica to 3. Verify that the replica set contains:
        one on node-1, one on node-2,  one failed replica
        with Spec.HardNodeAffinity=node-3.
    5. Decrease numberOfReplica to 2. Verify that the replica set contains:
        one on node-1, one on node-2,  one failed replica
        with Spec.HardNodeAffinity=node-3.
    6. Decrease numberOfReplica to 1. Verify that the replica set contains:
        one on node-1 or node-2,  one failed replica
        with Spec.HardNodeAffinity=node-3.
    7. Decrease numberOfReplica to 2. Verify that the replica set contains:
        one on node-1, one on node-2, one failed replica
        with Spec.HardNodeAffinity=node-3.
    8. Turn off data locality by set `dataLocality=disabled` for the vol.
        Verify that the replica set contains: one on node-1, one on node-2

    9. clean up
    """

    # Case 1: Test that Longhorn builds a local replica on the engine node

    nodes = client.list_node()

    default_data_locality_setting = \
        client.by_id_setting(SETTING_DEFAULT_DATA_LOCALITY)
    try:
        client.update(default_data_locality_setting, value="disabled")
    except Exception as e:
        print("Exception when update Default Data Locality setting",
              default_data_locality_setting, e)

    volume1_name = volume_name + "-1"
    volume1_size = str(500 * Mi)
    volume1_data_path = "/data/test"
    pv1_name = volume1_name + "-pv"
    pvc1_name = volume1_name + "-pvc"
    pod1_name = volume1_name + "-pod"
    pod1 = pod

    pod1['metadata']['name'] = pod1_name

    volume1 = create_and_check_volume(client,
                                      volume1_name,
                                      num_of_replicas=1,
                                      size=volume1_size)

    volume1 = client.by_id_volume(volume1_name)
    create_pv_for_volume(client, core_api, volume1, pv1_name)
    create_pvc_for_volume(client, core_api, volume1, pvc1_name)

    volume1 = client.by_id_volume(volume1_name)
    volume1_replica_node = volume1.replicas[0]['hostId']

    volume1_attached_node = None
    for node in nodes:
        if node.name != volume1_replica_node:
            volume1_attached_node = node.name
            break

    assert volume1_attached_node is not None

    pod1['spec']['volumes'] = [{
        "name": "pod-data",
        "persistentVolumeClaim": {
            "claimName": pvc1_name
        }
    }]

    pod1['spec']['nodeSelector'] = \
        {"kubernetes.io/hostname": volume1_attached_node}
    create_and_wait_pod(core_api, pod1)

    write_pod_volume_random_data(core_api, pod1_name, volume1_data_path,
                                 DATA_SIZE_IN_MB_2)

    for i in range(10):
        volume1 = client.by_id_volume(volume1_name)
        assert len(volume1.replicas) == 1
        assert volume1.replicas[0]['hostId'] != volume1_attached_node
        time.sleep(1)

    volume1 = client.by_id_volume(volume1_name)
    volume1.updateDataLocality(dataLocality="best-effort")

    for _ in range(RETRY_COUNTS):
        volume1 = client.by_id_volume(volume1_name)
        assert volume1[VOLUME_FIELD_ROBUSTNESS] == VOLUME_ROBUSTNESS_HEALTHY
        if len(volume1.replicas) == 1 and \
                volume1.replicas[0]['hostId'] == volume1_attached_node:
            break
        time.sleep(RETRY_INTERVAL)
    assert len(volume1.replicas) == 1
    assert volume1.replicas[0]['hostId'] == volume1_attached_node

    delete_and_wait_pod(core_api, pod1_name)
    volume1 = wait_for_volume_detached(client, volume1_name)

    volume1_replica_node = volume1.replicas[0]['hostId']

    volume1_attached_node = None
    for node in nodes:
        if node.name != volume1_replica_node:
            volume1_attached_node = node.name
            break

    assert volume1_attached_node is not None

    pod1['spec']['nodeSelector'] = \
        {"kubernetes.io/hostname": volume1_attached_node}
    create_and_wait_pod(core_api, pod1)
    for _ in range(RETRY_COUNTS):
        volume1 = client.by_id_volume(volume1_name)
        assert volume1[VOLUME_FIELD_ROBUSTNESS] == VOLUME_ROBUSTNESS_HEALTHY
        if len(volume1.replicas) == 1 and \
                volume1.replicas[0]['hostId'] == volume1_attached_node:
            break
        time.sleep(RETRY_INTERVAL)
    assert len(volume1.replicas) == 1
    assert volume1.replicas[0]['hostId'] == volume1_attached_node
    delete_and_wait_pod(core_api, pod1_name)
    wait_for_volume_detached(client, volume1_name)

    # Case 2: Test that Longhorn prioritizes deleting replicas on the same node

    node1 = nodes[0]
    node2 = nodes[1]
    node3 = nodes[2]

    client.update(node1, allowScheduling=True, tags=["AVAIL"])
    client.update(node2, allowScheduling=True, tags=["AVAIL"])

    replica_node_soft_anti_affinity_setting = \
        client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    try:
        client.update(replica_node_soft_anti_affinity_setting, value="true")
    except Exception as e:
        print(
            "Exception when update "
            "Replica Node Level Soft Anti-Affinity setting",
            replica_node_soft_anti_affinity_setting, e)

    volume2_name = volume_name + "-2"
    volume2_size = str(500 * Mi)
    pv2_name = volume2_name + "-pv"
    pvc2_name = volume2_name + "-pvc"
    pod2_name = volume2_name + "-pod"
    pod2 = pod

    pod2['metadata']['name'] = pod2_name

    volume2 = client.create_volume(name=volume2_name,
                                   size=volume2_size,
                                   numberOfReplicas=3,
                                   nodeSelector=["AVAIL"],
                                   dataLocality="best-effort")

    volume2 = wait_for_volume_detached(client, volume2_name)
    volume2 = client.by_id_volume(volume2_name)
    create_pv_for_volume(client, core_api, volume2, pv2_name)
    create_pvc_for_volume(client, core_api, volume2, pvc2_name)

    volume2 = client.by_id_volume(volume2_name)

    pod2['spec']['volumes'] = [{
        "name": "pod-data",
        "persistentVolumeClaim": {
            "claimName": pvc2_name
        }
    }]

    pod2['spec']['nodeSelector'] = {"kubernetes.io/hostname": node3.name}
    create_and_wait_pod(core_api, pod2)

    volume2 = wait_for_volume_healthy(client, volume2_name)

    for replica in volume2.replicas:
        assert replica["hostId"] != node3.name

    volume2.updateReplicaCount(replicaCount=2)

    # 2 Healthy replicas and 1 replica failed to schedule
    # The failed to schedule replica is the local replica on node3
    volume2 = wait_for_volume_replica_count(client, volume2_name, 3)
    volume2 = client.by_id_volume(volume2_name)

    volume2_healthy_replicas = []
    for replica in volume2.replicas:
        if replica.running is True:
            volume2_healthy_replicas.append(replica)

    assert len(volume2_healthy_replicas) == 2

    volume2_rep1 = volume2_healthy_replicas[0]
    volume2_rep2 = volume2_healthy_replicas[1]
    assert volume2_rep1["hostId"] != volume2_rep2["hostId"]
    delete_and_wait_pod(core_api, pod2_name)
    wait_for_volume_detached(client, volume2_name)

    # Case 3: Test that the volume is not corrupted if there is an unexpected
    # detachment during building local replica

    client.update(node1, allowScheduling=True, tags=[])
    client.update(node2, allowScheduling=True, tags=[])

    replica_node_soft_anti_affinity_setting = \
        client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    try:
        client.update(replica_node_soft_anti_affinity_setting, value="false")
    except Exception as e:
        print(
            "Exception when update "
            "Replica Node Level Soft Anti-Affinity setting",
            replica_node_soft_anti_affinity_setting, e)

    volume3_name = volume_name + "-3"
    volume3_size = str(1 * Gi)
    volume3_data_path = "/data/test"
    pv3_name = volume3_name + "-pv"
    pvc3_name = volume3_name + "-pvc"
    pod3_name = volume3_name + "-pod"
    pod3 = pod

    pod3['metadata']['name'] = pod3_name

    volume3 = client.create_volume(name=volume3_name,
                                   size=volume3_size,
                                   numberOfReplicas=1)

    volume3 = wait_for_volume_detached(client, volume3_name)
    volume3 = client.by_id_volume(volume3_name)
    create_pv_for_volume(client, core_api, volume3, pv3_name)
    create_pvc_for_volume(client, core_api, volume3, pvc3_name)

    volume3 = client.by_id_volume(volume3_name)

    pod3['spec']['volumes'] = [{
        "name": "pod-data",
        "persistentVolumeClaim": {
            "claimName": pvc3_name
        }
    }]

    pod3['spec']['nodeSelector'] = {"kubernetes.io/hostname": node3.name}
    create_and_wait_pod(core_api, pod3)
    volume3 = wait_for_volume_healthy(client, volume3_name)

    write_pod_volume_random_data(core_api, pod3_name, volume3_data_path,
                                 DATA_SIZE_IN_MB_4)

    volume3.updateDataLocality(dataLocality="best-effort")
    volume3 = client.by_id_volume(volume3_name)

    if volume3.replicas[0]['hostId'] != node3.name:
        wait_for_rebuild_start(client, volume3_name)
        volume3 = client.by_id_volume(volume3_name)
        assert len(volume3.replicas) == 2
        wait_for_rebuild_complete(client, volume3_name)

    volume3 = wait_for_volume_replica_count(client, volume3_name, 1)
    assert volume3.replicas[0]["hostId"] == node3.name

    delete_and_wait_pod(core_api, pod3_name)

    pod3['spec']['nodeSelector'] = {"kubernetes.io/hostname": node1.name}
    create_and_wait_pod(core_api, pod3)

    wait_for_rebuild_start(client, volume3_name)
    crash_engine_process_with_sigkill(client, core_api, volume3_name)
    delete_and_wait_pod(core_api, pod3_name)
    wait_for_volume_detached(client, volume3_name)
    volume3 = client.by_id_volume(volume3_name)
    assert len(volume3.replicas) == 1
    assert volume3.replicas[0]["hostId"] == node3.name

    create_and_wait_pod(core_api, pod3)
    wait_for_rebuild_start(client, volume3_name)
    volume3 = client.by_id_volume(volume3_name)
    assert len(volume3.replicas) == 2
    wait_for_rebuild_complete(client, volume3_name)

    # Wait for deletion of extra replica
    volume3 = wait_for_volume_replica_count(client, volume3_name, 1)
    assert volume3.replicas[0]["hostId"] == node1.name
    assert volume3.replicas[0]["mode"] == "RW"
    assert volume3.replicas[0]["running"] is True

    delete_and_wait_pod(core_api, pod3_name)
    wait_for_volume_detached(client, volume3_name)

    # Case 4: Make sure failed to schedule local replica doesn't block the
    # the creation of other replicas.

    replica_node_soft_anti_affinity_setting = \
        client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    try:
        client.update(replica_node_soft_anti_affinity_setting, value="false")
    except Exception as e:
        print(
            "Exception when update "
            "Replica Node Level Soft Anti-Affinity setting",
            replica_node_soft_anti_affinity_setting, e)

    client.update(node3, allowScheduling=False)

    volume4_name = volume_name + "-4"
    volume4_size = str(1 * Gi)

    volume4 = client.create_volume(name=volume4_name,
                                   size=volume4_size,
                                   numberOfReplicas=1,
                                   dataLocality="best-effort")

    volume4 = wait_for_volume_detached(client, volume4_name)
    volume4 = client.by_id_volume(volume4_name)

    volume4_replica_name = volume4.replicas[0]["name"]

    volume4.attach(hostId=node3.name)

    wait_for_volume_healthy(client, volume4_name)

    volume4 = client.by_id_volume(volume4_name)
    assert len(volume4.replicas) == 2

    for replica in volume4.replicas:
        if replica["name"] == volume4_replica_name:
            assert replica["running"] is True
            assert replica["mode"] == "RW"
        else:
            assert replica["running"] is False
            assert replica["mode"] == ""

    assert volume4.conditions.scheduled.reason == \
        "LocalReplicaSchedulingFailure"

    volume4 = volume4.updateReplicaCount(replicaCount=3)

    volume4 = wait_for_volume_degraded(client, volume4_name)

    v4_node1_replica_count = 0
    v4_node2_replica_count = 0
    v4_failed_replica_count = 0

    for replica in volume4.replicas:
        if replica["hostId"] == node1.name:
            v4_node1_replica_count += 1
        elif replica["hostId"] == node2.name:
            v4_node2_replica_count += 1
        elif replica["hostId"] == "":
            v4_failed_replica_count += 1

    assert v4_node1_replica_count == 1
    assert v4_node2_replica_count == 1
    assert v4_failed_replica_count > 0

    volume4 = volume4.updateReplicaCount(replicaCount=2)

    volume4 = wait_for_volume_replica_count(client, volume4_name, 3)

    v4_node1_replica_count = 0
    v4_node2_replica_count = 0
    v4_failed_replica_count = 0

    for replica in volume4.replicas:
        if replica["hostId"] == node1.name:
            v4_node1_replica_count += 1
        elif replica["hostId"] == node2.name:
            v4_node2_replica_count += 1
        elif replica["hostId"] == "":
            v4_failed_replica_count += 1

    assert v4_node1_replica_count == 1
    assert v4_node2_replica_count == 1
    assert v4_failed_replica_count > 0

    volume4 = volume4.updateReplicaCount(replicaCount=1)

    volume4 = wait_for_volume_replica_count(client, volume4_name, 2)

    v4_node1_replica_count = 0
    v4_node2_replica_count = 0
    v4_failed_replica_count = 0

    for replica in volume4.replicas:
        if replica["hostId"] == node1.name:
            v4_node1_replica_count += 1
        elif replica["hostId"] == node2.name:
            v4_node2_replica_count += 1
        elif replica["hostId"] == "":
            v4_failed_replica_count += 1

    assert v4_node1_replica_count + v4_node2_replica_count == 1
    assert v4_failed_replica_count == 1

    volume4 = volume4.updateDataLocality(dataLocality="disabled")
    volume4 = volume4.updateReplicaCount(replicaCount=2)

    running_replica_count = 0
    for _ in range(RETRY_COUNTS):
        volume4 = client.by_id_volume(volume4_name)
        running_replica_count = 0
        for r in volume4.replicas:
            if r.failedAt == "" and r.running is True:
                running_replica_count += 1
        if running_replica_count == 2:
            break
        time.sleep(RETRY_INTERVAL)
    assert running_replica_count == 2

    v4_node1_replica_count = 0
    v4_node2_replica_count = 0
    v4_node3_replica_count = 0

    for replica in volume4.replicas:
        wait_for_replica_running(client, volume4_name, replica["name"])
        if replica["hostId"] == node1.name:
            v4_node1_replica_count += 1
        elif replica["hostId"] == node2.name:
            v4_node2_replica_count += 1
        elif replica["hostId"] == node3.name:
            v4_node3_replica_count += 1
    assert v4_node1_replica_count == 1
    assert v4_node2_replica_count == 1
    assert v4_node3_replica_count == 0