Example #1
def test_recurring_jobs_when_volume_detached_unexpectedly(
        settings_reset, set_random_backupstore, client, core_api, apps_api,
        pvc, make_deployment_with_pvc):  # NOQA
    """
    Test recurring jobs when volume detached unexpectedly

    Context:

    If the volume is automatically attached by the recurring backup job,
    make sure that the workload pod is eventually able to use the volume
    when the volume is detached unexpectedly during the backup process.

    Steps:

    1. Create a volume, attach to a pod of a deployment,
       write 500MB to the volume.
    2. Scale down the deployment. The volume is detached.
    3. Turn on `Allow Recurring Job While Volume Is Detached` setting.
    4. Create a recurring backup job that runs every 2 mins.
    5. Wait until the recurring backup job starts and the backup progress
       is > 50%, then kill the engine process of the volume.
    6. Verify volume automatically reattached and is healthy again.
    7. Wait until the backup finishes.
    8. Wait for the volume to be in the detached state with
       `frontendDisabled=false`
    9. Scale up the deployment.
       Verify that we can read the file `lost+found` from the workload pod
    10. Turn off the `Allow Recurring Job While Volume Is Detached` setting.
       Clean up backups and volumes.
    """

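    # Step 3: turn on `Allow Recurring Job While Volume Is Detached`.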
    recurring_job_setting = \
        client.by_id_setting(SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED)
    client.update(recurring_job_setting, value="true")

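    # Step 1: create a PVC backed by the `longhorn` storage class.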
    pvc_name = 'pvc-volume-detached-unexpectedly-test'
    pvc['metadata']['name'] = pvc_name
    pvc['spec']['storageClassName'] = 'longhorn'

    core_api.create_namespaced_persistent_volume_claim(body=pvc,
                                                       namespace='default')

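    # Step 1 (cont.): attach the volume to a deployment pod and write data.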
    deployment = make_deployment_with_pvc(
        'deployment-volume-detached-unexpectedly-test', pvc_name)
    create_and_wait_deployment(apps_api, deployment)
    pod_names = common.get_deployment_pod_names(core_api, deployment)
    vol_name = get_volume_name(core_api, pvc_name)

    write_pod_volume_random_data(core_api, pod_names[0], "/data/test",
                                 DATA_SIZE_IN_MB_3)

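    # Remember the data, then scale the deployment down (step 2) so the
    # volume detaches.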
    data = read_volume_data(core_api, pod_names[0], 'default')
    deployment['spec']['replicas'] = 0
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment["metadata"]["name"])
    vol = wait_for_volume_detached(client, vol_name)

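    # Step 4: schedule a recurring backup every 2 minutes.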
    jobs = [{
        "name": RECURRING_JOB_NAME,
        "cron": "*/2 * * * *",
        "task": "backup",
        "retain": 1
    }]
    vol.recurringUpdate(jobs=jobs)
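    # Step 5: give the cron job time to fire, then wait for the backup
    # to pass 50% progress.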
    time.sleep(60)
    wait_for_recurring_backup_to_start(client,
                                       core_api,
                                       vol_name,
                                       expected_snapshot_count=1,
                                       minimum_progress=50)

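    # Step 5 (cont.): kill the engine process while the backup is running.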
    crash_engine_process_with_sigkill(client, core_api, vol_name)
    # Check if the volume is reattached after recurring backup is interrupted
    time.sleep(10)
    wait_for_volume_healthy_no_frontend(client, vol_name)

    # The backup state is removed once the backup completes, and that can
    # happen quickly, so catching both the in-progress and complete states
    # is unreliable. We therefore only check for the complete state.
    def backup_complete_predicate(b):
        return b.state == "complete" and b.error == ""

    common.wait_for_backup_state(client, vol_name, backup_complete_predicate)

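    # Step 8: the volume should return to the detached state.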
    wait_for_volume_detached(client, vol_name)

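    # Step 9: scale the deployment back up and verify the data survived.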
    deployment['spec']['replicas'] = 1
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment["metadata"]["name"])
    wait_deployment_replica_ready(apps_api, deployment["metadata"]["name"], 1)
    pod_names = common.get_deployment_pod_names(core_api, deployment)

    assert read_volume_data(core_api, pod_names[0], 'default') == data

    # The fixture cleans up the backupstore. Since we crashed the engine
    # process that initiated the backup, its backupstore lock will still
    # be present, so we must wait until the lock expires before the
    # backups can be deleted.
    vol.recurringUpdate(jobs=[])
    backupstore.backupstore_wait_for_lock_expiration()
Example #2
def test_allow_volume_creation_with_degraded_availability_csi(
        client, core_api, apps_api, make_deployment_with_pvc):  # NOQA
    """
    Test Allow Volume Creation with Degraded Availability (CSI)

    Requirement:
    1. Set `allow-volume-creation-with-degraded-availability` to true.
    2. Set `node-level-soft-anti-affinity` to false.

    Steps:
    1. Disable scheduling for node 3.
    2. Create a Deployment Pod with a volume and 3 replicas.
        1. After the volume is attached, a scheduling error should be seen.
    3. Write data to the Pod.
    4. Scale down the deployment to 0 to detach the volume.
        1. Scheduled condition should become true.
    5. Scale up the deployment back to 1 and verify the data.
        1. Scheduled condition should become false.
    6. Enable the scheduling for node 3.
        1. The volume should start rebuilding on node 3 soon.
        2. Once the rebuilding starts, the scheduled condition should become
           true.
    7. Once rebuild finished, scale down and back the deployment to verify
       the data.
    """
    setting = client.by_id_setting(common.SETTING_DEGRADED_AVAILABILITY)
    client.update(setting, value="true")

    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")

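    # Step 1: disable scheduling on node 3.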
    nodes = client.list_node()
    node3 = nodes[2]
    client.update(node3, allowScheduling=False)

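    # Step 2: create the volume plus its PV, PVC, and a 3-replica deployment.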
    vol = common.create_and_check_volume(client, generate_volume_name(),
                                         size=str(500 * Mi))

    pv_name = vol.name + "-pv"
    common.create_pv_for_volume(client, core_api, vol, pv_name)

    pvc_name = vol.name + "-pvc"
    common.create_pvc_for_volume(client, core_api, vol, pvc_name)

    deployment_name = vol.name + "-dep"
    deployment = make_deployment_with_pvc(deployment_name, pvc_name)
    deployment["spec"]["replicas"] = 3
    apps_api.create_namespaced_deployment(body=deployment, namespace='default')
    common.wait_for_volume_status(client, vol.name,
                                  common.VOLUME_FIELD_STATE,
                                  common.VOLUME_STATE_ATTACHED)
    common.wait_scheduling_failure(client, vol.name)

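    # Step 3: write data to the pod and record its checksum.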
    data_path = "/data/test"
    pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name)
    common.write_pod_volume_random_data(core_api, pod.metadata.name,
                                        data_path, common.DATA_SIZE_IN_MB_2)
    created_md5sum = get_pod_data_md5sum(core_api, pod.metadata.name,
                                         data_path)

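    # Step 4: scale down to detach; the scheduled condition should turn true.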
    deployment['spec']['replicas'] = 0
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    vol = common.wait_for_volume_detached(client, vol.name)
    assert vol.conditions[VOLUME_CONDITION_SCHEDULED]['status'] == "True"

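    # Step 5: scale back up; the condition turns false again and the data
    # must match.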
    deployment['spec']['replicas'] = 1
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    common.wait_for_volume_status(client, vol.name,
                                  common.VOLUME_FIELD_STATE,
                                  common.VOLUME_STATE_ATTACHED)
    common.wait_for_volume_condition_scheduled(client, vol.name, "status",
                                               common.CONDITION_STATUS_FALSE)
    pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name)
    assert created_md5sum == get_pod_data_md5sum(core_api,
                                                 pod.metadata.name,
                                                 data_path)

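    # Step 6: re-enable scheduling on node 3; once rebuilding starts, the
    # scheduled condition should be true.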
    client.update(node3, allowScheduling=True)
    common.wait_for_rebuild_start(client, vol.name)
    vol = client.by_id_volume(vol.name)
    assert vol.conditions[VOLUME_CONDITION_SCHEDULED]['status'] == "True"
    common.wait_for_rebuild_complete(client, vol.name)

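    # Step 7: after the rebuild, cycle the deployment down and up, then
    # verify the data.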
    deployment['spec']['replicas'] = 0
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    common.wait_for_volume_detached(client, vol.name)

    deployment['spec']['replicas'] = 1
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    common.wait_for_volume_status(client, vol.name,
                                  common.VOLUME_FIELD_STATE,
                                  common.VOLUME_STATE_ATTACHED)

    pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name)
    assert created_md5sum == get_pod_data_md5sum(core_api,
                                                 pod.metadata.name,
                                                 data_path)
Example #3
def test_recurring_jobs_for_detached_volume(set_random_backupstore, client,
                                            core_api, apps_api, volume_name,
                                            make_deployment_with_pvc):  # NOQA
    """
    Test recurring jobs for detached volume

    Context:
    In the current Longhorn implementation, users cannot run recurring
    backups while volumes are detached.
    This feature gives users the option to run recurring backups even when
    volumes are detached.
    longhorn/longhorn#1509

    Steps:
    1.  Change the setting allow-recurring-job-while-volume-detached to true.
    2.  Create and attach volume, write 50MB data to the volume.
    3.  Detach the volume.
    4.  Set a recurring backup on the volume to run every minute.
    5.  In a 2-minute retry loop, verify that there is exactly 1 new backup.
    6.  Delete the recurring backup.
    7.  Create a PV and PVC from the volume.
    8.  Create a deployment of 1 pod using the PVC.
    9.  Write 400MB data to the volume from the pod.
    10. Scale down the deployment. Wait until the volume is detached.
    11. Set the recurring backup for every 2 minutes.
    12. Wait until the recurring backup starts, scale up the deployment to 1
        pod.
    13. Verify that during the recurring backup, the volume's frontend is
        disabled, and the pod cannot start.
    14. Wait until the recurring backup finishes.
        Delete the recurring backup.
    15. In a 10-minute retry loop, verify that the pod can eventually start.
    16. Change the setting allow-recurring-job-while-volume-detached to false.
    17. Cleanup.
    """
    recurring_job_setting = \
        client.by_id_setting(SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED)
    client.update(recurring_job_setting, value="true")

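    # Step 2: create and attach the volume, then write 50 MB of data.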
    vol = common.create_and_check_volume(client, volume_name, size=str(1 * Gi))

    lht_hostId = get_self_host_id()
    vol.attach(hostId=lht_hostId)
    vol = wait_for_volume_healthy(client, vol.name)

    data = {
        'pos': 0,
        'content': common.generate_random_data(50 * Mi),
    }
    common.write_volume_data(vol, data)

    # Give some time for the data to flush to disk.
    time.sleep(15)

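    # Step 3: detach the volume.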
    vol.detach(hostId="")
    vol = common.wait_for_volume_detached(client, vol.name)

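    # Steps 4-5: back up every minute; over ~2 minutes, exactly one backup
    # should exist (retain=1).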
    jobs = [{
        "name": RECURRING_JOB_NAME,
        "cron": "*/1 * * * *",
        "task": "backup",
        "retain": 1
    }]
    vol.recurringUpdate(jobs=jobs)
    common.wait_for_backup_completion(client, vol.name)
    for _ in range(4):
        bv = client.by_id_backupVolume(vol.name)
        backups = bv.backupList().data
        assert len(backups) == 1
        time.sleep(30)

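    # Step 6: delete the recurring job.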
    vol.recurringUpdate(jobs=[])

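    # Steps 7-8: expose the volume through a PV/PVC and a 1-pod deployment.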
    pv_name = volume_name + "-pv"
    common.create_pv_for_volume(client, core_api, vol, pv_name)

    pvc_name = volume_name + "-pvc"
    common.create_pvc_for_volume(client, core_api, vol, pvc_name)

    deployment_name = volume_name + "-dep"
    deployment = make_deployment_with_pvc(deployment_name, pvc_name)
    common.create_and_wait_deployment(apps_api, deployment)

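    # Step 9: write 400 MB of data from the pod.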
    size_mb = 400
    pod_names = common.get_deployment_pod_names(core_api, deployment)
    write_pod_volume_random_data(core_api, pod_names[0], "/data/test", size_mb)

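    # Step 10: scale down and wait for the volume to detach.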
    deployment['spec']['replicas'] = 0
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment["metadata"]["name"])

    vol = common.wait_for_volume_detached(client, vol.name)

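    # Step 11: schedule a recurring backup every 2 minutes.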
    jobs = [{
        "name": RECURRING_JOB_NAME,
        "cron": "*/2 * * * *",
        "task": "backup",
        "retain": 1
    }]
    vol.recurringUpdate(jobs=jobs)

    common.wait_for_backup_to_start(client, vol.name)

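    # Steps 12-14: scale up during the backup; the pod can only attach
    # after the backup completes.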
    deployment['spec']['replicas'] = 1
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment["metadata"]["name"])

    deployment_label_name = deployment["metadata"]["labels"]["name"]
    common.wait_pod_auto_attach_after_first_backup_completion(
        client, core_api, vol.name, deployment_label_name)

    vol.recurringUpdate(jobs=[])

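    # Step 15: the pod should eventually reach the Running phase.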
    pod_names = common.get_deployment_pod_names(core_api, deployment)
    common.wait_for_pod_phase(core_api, pod_names[0], pod_phase="Running")
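
All three tests above depend on the `make_deployment_with_pvc` fixture. As a rough guide to what they assume about it, here is a minimal, hypothetical sketch of the kind of manifest such a fixture could return. The container image and the `/data` mount path are assumptions inferred from the tests (which write to `/data/test` and read the deployment's `name` label), not the fixture's actual definition.

def make_deployment_with_pvc_sketch(deployment_name, pvc_name, replicas=1):
    # Hypothetical stand-in for the make_deployment_with_pvc fixture.
    # The tests only rely on metadata.name, the `name` label,
    # spec.replicas, and the PVC being mounted so pods can write
    # /data/test; everything else here is an assumption.
    return {
        "apiVersion": "apps/v1",
        "kind": "Deployment",
        "metadata": {
            "name": deployment_name,
            "labels": {"name": deployment_name},
        },
        "spec": {
            "replicas": replicas,
            "selector": {"matchLabels": {"name": deployment_name}},
            "template": {
                "metadata": {"labels": {"name": deployment_name}},
                "spec": {
                    "containers": [{
                        "name": deployment_name,
                        "image": "nginx:stable",  # assumed image
                        "volumeMounts": [{
                            "name": "data",
                            "mountPath": "/data",  # tests write /data/test
                        }],
                    }],
                    "volumes": [{
                        "name": "data",
                        "persistentVolumeClaim": {"claimName": pvc_name},
                    }],
                },
            },
        },
    }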