def test_allow_volume_creation_with_degraded_availability_csi(
        client, core_api, apps_api, make_deployment_with_pvc):  # NOQA
    """
    Test Allow Volume Creation with Degraded Availability (CSI)

    Requirement:
    1. `allow-volume-creation-with-degraded-availability` is set to true.
    2. `node-level-soft-anti-affinity` is set to false.

    Steps:
    1. Disable scheduling on the third node returned by the node list.
    2. Create a volume with PV/PVC and a Deployment that uses the PVC.
        1. Once the volume is attached, a scheduling failure is expected.
    3. Write random data from a Deployment pod and record its md5sum.
    4. Scale the Deployment to 0 and wait for the volume to detach.
        1. The Scheduled condition must be "True".
    5. Scale the Deployment to 1, wait for attachment, verify the md5sum.
        1. The Scheduled condition must become false.
    6. Re-enable scheduling on the third node.
        1. Wait for a rebuild to start.
        2. The Scheduled condition must then be "True".
    7. After the rebuild completes, scale down and back up once more and
       verify the md5sum again.
    """
    namespace = 'default'
    data_path = "/data/test"

    degraded_setting = client.by_id_setting(
        common.SETTING_DEGRADED_AVAILABILITY)
    client.update(degraded_setting, value="true")

    anti_affinity_setting = client.by_id_setting(
        SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(anti_affinity_setting, value="false")

    # Take one node out of scheduling so that not every replica can be
    # placed while the volume is created and attached.
    node3 = client.list_node()[2]
    client.update(node3, allowScheduling=False)

    vol = common.create_and_check_volume(client, generate_volume_name(),
                                         size=str(500 * Mi))

    pv_name = vol.name + "-pv"
    common.create_pv_for_volume(client, core_api, vol, pv_name)

    pvc_name = vol.name + "-pvc"
    common.create_pvc_for_volume(client, core_api, vol, pvc_name)

    deployment_name = vol.name + "-dep"
    deployment = make_deployment_with_pvc(deployment_name, pvc_name)
    # NOTE(review): this is the Deployment's pod replica count, not the
    # Longhorn volume replica count - confirm it matches the scenario.
    deployment["spec"]["replicas"] = 3
    apps_api.create_namespaced_deployment(body=deployment,
                                          namespace=namespace)

    def scale_deployment(replicas):
        # Patch the live Deployment to the requested pod replica count.
        deployment["spec"]["replicas"] = replicas
        apps_api.patch_namespaced_deployment(body=deployment,
                                             namespace=namespace,
                                             name=deployment_name)

    def wait_volume_attached():
        common.wait_for_volume_status(client, vol.name,
                                      common.VOLUME_FIELD_STATE,
                                      common.VOLUME_STATE_ATTACHED)

    def current_pod_md5sum():
        # Checksum of the test file as seen from any pod of the Deployment.
        running_pod = common.wait_and_get_any_deployment_pod(
            core_api, deployment_name)
        return get_pod_data_md5sum(core_api, running_pod.metadata.name,
                                   data_path)

    wait_volume_attached()
    common.wait_scheduling_failure(client, vol.name)

    pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name)
    common.write_pod_volume_random_data(core_api, pod.metadata.name,
                                        data_path,
                                        common.DATA_SIZE_IN_MB_2)
    created_md5sum = get_pod_data_md5sum(core_api, pod.metadata.name,
                                         data_path)

    # Detach by scaling to zero; the detached volume must report itself
    # as scheduled.
    scale_deployment(0)
    vol = common.wait_for_volume_detached(client, vol.name)
    assert vol.conditions[VOLUME_CONDITION_SCHEDULED]['status'] == "True"

    # Re-attach with a single pod; the scheduling condition turns false
    # again while the data stays intact.
    scale_deployment(1)
    wait_volume_attached()
    common.wait_for_volume_condition_scheduled(client, vol.name, "status",
                                               common.CONDITION_STATUS_FALSE)
    assert created_md5sum == current_pod_md5sum()

    # Bring the third node back and expect a rebuild to kick in.
    client.update(node3, allowScheduling=True)
    common.wait_for_rebuild_start(client, vol.name)
    vol = client.by_id_volume(vol.name)
    assert vol.conditions[VOLUME_CONDITION_SCHEDULED]['status'] == "True"
    common.wait_for_rebuild_complete(client, vol.name)

    # Final detach/attach cycle to confirm the data after the rebuild.
    scale_deployment(0)
    common.wait_for_volume_detached(client, vol.name)

    scale_deployment(1)
    wait_volume_attached()
    assert created_md5sum == current_pod_md5sum()
def test_offline_node_with_attached_volume_and_pod(
        client, core_api, volume_name, make_deployment_with_pvc,
        reset_cluster_ready_status):  # NOQA
    """
    Test offline node with attached volume and pod

    1. Create PV/PVC/Deployment manifest.
    2. Update deployment's tolerations to 20 seconds to speed up test
    3. Update deployment's node affinity rule to avoid the current node
    4. Create volume, PV/PVC and deployment.
    5. Find the pod in the deployment and write `test_data` into it
    6. Shutdown the node pod is running on
    7. Wait for deployment to delete the pod
        1. Deployment cannot delete the pod here because kubelet doesn't
           respond
    8. Force delete the terminating pod
    9. Wait for the new pod to be created and the volume attached
    10. Check `test_data` in the new pod
    """
    toleration_seconds = 20

    apps_api = get_apps_api_client()
    cloudprovider = detect_cloudprovider()

    # The fixture-provided name is intentionally replaced by a freshly
    # generated one, as in the original test.
    volume_name = generate_volume_name()
    pv_name = volume_name + "-pv"
    pvc_name = volume_name + "-pvc"
    deployment_name = volume_name + "-dep"

    longhorn_test_node_name = get_self_host_id()

    deployment_manifest = make_deployment_with_pvc(deployment_name, pvc_name)
    pod_spec = deployment_manifest["spec"]["template"]["spec"]

    def no_execute_toleration(taint_key):
        # Toleration for a NoExecute taint, limited to toleration_seconds.
        return {
            "key": taint_key,
            "operator": "Exists",
            "effect": "NoExecute",
            "tolerationSeconds": toleration_seconds
        }

    pod_spec["tolerations"] = [
        no_execute_toleration("node.kubernetes.io/unreachable"),
        no_execute_toleration("node.kubernetes.io/not-ready"),
    ]

    # Keep the workload off the node running this test, so shutting down
    # the pod's node does not take the test itself down.
    pod_spec["affinity"] = {
        "nodeAffinity": {
            "requiredDuringSchedulingIgnoredDuringExecution": {
                "nodeSelectorTerms": [{
                    "matchExpressions": [{
                        "key": "kubernetes.io/hostname",
                        "operator": "NotIn",
                        "values": [longhorn_test_node_name]
                    }]
                }]
            }
        }
    }

    longhorn_volume = create_and_check_volume(client, volume_name, size=SIZE)
    wait_for_volume_detached(client, volume_name)

    create_pv_for_volume(client, core_api, longhorn_volume, pv_name)
    create_pvc_for_volume(client, core_api, longhorn_volume, pvc_name)
    create_and_wait_deployment(apps_api, deployment_manifest)

    deployment_label_selector = \
        "name=" + deployment_manifest["metadata"]["labels"]["name"]

    def list_deployment_pods():
        # Current pods of the deployment, looked up by its "name" label.
        return core_api.list_namespaced_pod(
            namespace="default",
            label_selector=deployment_label_selector).items

    deployment_pods = list_deployment_pods()
    assert len(deployment_pods) == 1

    pod_name = deployment_pods[0].metadata.name
    test_data = generate_random_data(VOLUME_RWTEST_SIZE)
    write_pod_volume_data(core_api, pod_name, test_data)

    node_name = deployment_pods[0].spec.node_name
    node = cloudprovider.node_id(node_name)
    cloudprovider.node_shutdown(node)

    k8s_node_down = wait_for_node_down_k8s(node_name, core_api)
    assert k8s_node_down

    # A fresh Longhorn client is requested after the node went down.
    # NOTE(review): presumably because the previous client may have been
    # bound to the node that was shut down - confirm.
    client = get_longhorn_api_client()
    longhorn_node_down = wait_for_node_down_longhorn(node_name, client)
    assert longhorn_node_down

    # Let the toleration window elapse so the pod gets marked for deletion.
    time.sleep(toleration_seconds + 5)

    # Bound before the loop so that a retry count of zero ends in the
    # assertion below instead of a NameError.
    terminating_pod_name = None
    for _ in range(TERMINATING_POD_RETRYS):
        terminating_pod_name = next(
            (pod.metadata.name for pod in list_deployment_pods()
             if pod.metadata.deletion_timestamp is not None),
            None)
        if terminating_pod_name is not None:
            break
        time.sleep(TERMINATING_POD_INTERVAL)

    assert terminating_pod_name is not None

    # Force-delete the pod stuck in termination on the powered-off node.
    core_api.delete_namespaced_pod(namespace="default",
                                   name=terminating_pod_name,
                                   grace_period_seconds=0)
    delete_and_wait_pod(core_api, terminating_pod_name)

    assert len(list_deployment_pods()) == 1

    wait_for_volume_detached(client, volume_name)
    wait_for_volume_healthy(client, volume_name)

    deployment_pods = list_deployment_pods()
    assert len(deployment_pods) == 1

    new_pod_name = deployment_pods[0].metadata.name
    wait_pod(new_pod_name)

    resp_data = read_volume_data(core_api, new_pod_name)
    assert test_data == resp_data
def test_restore_rwo_volume_to_rwx(
        set_random_backupstore, client, core_api, volume_name, pvc, csi_pv,
        pod_make, make_deployment_with_pvc):  # NOQA
    """
    Test restoring a rwo to a rwx volume.

    1. Prepare a pod with a PV/PVC-backed volume and write data into it;
       keep the md5sum of the written data.
    2. Take a snapshot of the volume and back it up.
    3. Restore the backup into a new volume with 'accessMode' rwx.
    4. Create a PV and a PVC for the restored volume.
    5. Create a deployment of 2 pods using that PVC.
    6. Verify that both pods see data with the original md5sum.
    """
    data_path = "/data/test"

    # Only the checksum of the prepared data is needed below.
    _, _, _, md5sum = \
        prepare_pod_with_data_in_mb(client, core_api, csi_pv, pvc,
                                    pod_make, volume_name,
                                    data_size_in_mb=DATA_SIZE_IN_MB_1,
                                    data_path=data_path)

    snap = create_snapshot(client, volume_name)
    volume = client.by_id_volume(volume_name)
    volume.snapshotBackup(name=snap.name)
    wait_for_backup_completion(client, volume_name, snap.name)
    _, backup = find_backup(client, volume_name, snap.name)

    restore_volume_name = 'restored-rwx-volume'
    restore_pv_name = restore_volume_name + "-pv"
    restore_pvc_name = restore_volume_name + "-pvc"

    client.create_volume(name=restore_volume_name, size=str(1 * Gi),
                         numberOfReplicas=3, fromBackup=backup.url,
                         accessMode='rwx')
    wait_for_volume_creation(client, restore_volume_name)
    restore_volume = wait_for_volume_detached(client, restore_volume_name)
    create_pv_for_volume(client, core_api, restore_volume, restore_pv_name)
    create_pvc_for_volume(client, core_api, restore_volume, restore_pvc_name)

    deployment = make_deployment_with_pvc('deployment-multi-pods-test',
                                          restore_pvc_name,
                                          replicas=2)
    apps_api = get_apps_api_client()
    create_and_wait_deployment(apps_api, deployment)

    deployment_label_selector = \
        "name=" + deployment["metadata"]["labels"]["name"]

    deployment_pods = core_api.list_namespaced_pod(
        namespace="default",
        label_selector=deployment_label_selector).items

    # Fail with a clear assertion, rather than an IndexError, when the
    # deployment does not expose exactly the two expected pods.
    assert len(deployment_pods) == 2

    pod_name_1 = deployment_pods[0].metadata.name
    pod_name_2 = deployment_pods[1].metadata.name

    md5sum_pod1 = get_pod_data_md5sum(core_api, pod_name_1, data_path)
    md5sum_pod2 = get_pod_data_md5sum(core_api, pod_name_2, data_path)

    assert md5sum == md5sum_pod1 == md5sum_pod2
def test_rwx_deployment_with_multi_pods(
        core_api, pvc, make_deployment_with_pvc):  # NOQA
    """
    Test deployment of 2 pods with same PVC.

    1. Create a PVC with storage class 'longhorn' and access mode
       'ReadWriteMany'.
    2. Create a deployment of 2 pods using that PVC and wait for it.
    3. Write `test1` from the first pod and read it back from the second.
    4. Write `test2` from the second pod.
    5. Read both files from the share manager pod under
       `/export/<volume name>/` and compare them with the written data.
    """
    pvc_name = 'pvc-deployment-multi-pods-test'
    pvc['metadata']['name'] = pvc_name
    pvc['spec']['storageClassName'] = 'longhorn'
    pvc['spec']['accessModes'] = ['ReadWriteMany']

    core_api.create_namespaced_persistent_volume_claim(
        body=pvc, namespace='default')

    deployment = make_deployment_with_pvc(
        'deployment-multi-pods-test', pvc_name, replicas=2)

    apps_api = get_apps_api_client()
    create_and_wait_deployment(apps_api, deployment)

    pv_name = get_volume_name(core_api, pvc_name)
    share_manager_name = 'share-manager-' + pv_name

    def read_from_share_manager(filename):
        # Read a file of this volume from the share manager pod's export
        # directory.
        command = 'cat /export/' + pv_name + '/' + filename
        return exec_command_in_pod(core_api, command, share_manager_name,
                                   LONGHORN_NAMESPACE)

    deployment_label_selector = \
        "name=" + deployment["metadata"]["labels"]["name"]

    deployment_pods = core_api.list_namespaced_pod(
        namespace="default",
        label_selector=deployment_label_selector).items

    assert len(deployment_pods) == 2

    pod_name_1 = deployment_pods[0].metadata.name
    pod_name_2 = deployment_pods[1].metadata.name

    # Data written through one pod must be visible through the other.
    test_data_1 = generate_random_data(VOLUME_RWTEST_SIZE)
    write_pod_volume_data(core_api, pod_name_1, test_data_1,
                          filename='test1')

    pod_data_2 = exec_command_in_pod(core_api, 'cat /data/test1',
                                     pod_name_2, 'default')
    assert test_data_1 == pod_data_2

    test_data_2 = generate_random_data(VOLUME_RWTEST_SIZE)
    write_pod_volume_data(core_api, pod_name_2, test_data_2,
                          filename='test2')

    # Both files must also be readable from the share manager pod.
    assert test_data_1 == read_from_share_manager('test1')
    assert test_data_2 == read_from_share_manager('test2')
def test_recurring_jobs_when_volume_detached_unexpectedly(
        settings_reset, set_random_backupstore, client, core_api, apps_api,
        pvc, make_deployment_with_pvc):  # NOQA
    """
    Test recurring jobs when volume detached unexpectedly

    Context:

    If the volume is automatically attached by the recurring backup job,
    make sure that workload pod eventually is able to use the volume
    when volume is detached unexpectedly during the backup process.

    Steps:

    1.  Turn on the `Allow Recurring Job While Volume Is Detached` setting.
    2.  Create a PVC and a deployment using it, write
        `DATA_SIZE_IN_MB_3` MB of random data from the pod and read the
        data back for later comparison.
    3.  Scale down the deployment and wait for the volume to detach.
    4.  Create a recurring backup job that runs every 2 mins.
    5.  Wait until the recurring backup has started with a minimum
        progress of 50, then kill the engine process of the volume.
    6.  Verify the volume becomes healthy again without frontend.
    7.  Wait until a backup is complete without error.
    8.  Wait for the volume to be detached.
    9.  Scale up the deployment and verify that the pod reads the same
        data as before.
    10. Remove the recurring job and wait for the backupstore lock to
        expire so that cleanup can delete the backups.
    """
    namespace = 'default'

    detached_job_setting = client.by_id_setting(
        SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED)
    client.update(detached_job_setting, value="true")

    pvc_name = 'pvc-volume-detached-unexpectedly-test'
    pvc['metadata']['name'] = pvc_name
    pvc['spec']['storageClassName'] = 'longhorn'
    core_api.create_namespaced_persistent_volume_claim(
        body=pvc, namespace=namespace)

    deployment = make_deployment_with_pvc(
        'deployment-volume-detached-unexpectedly-test', pvc_name)
    create_and_wait_deployment(apps_api, deployment)

    def scale_deployment(replicas):
        # Patch the live Deployment to the requested pod replica count.
        deployment['spec']['replicas'] = replicas
        apps_api.patch_namespaced_deployment(
            body=deployment,
            namespace=namespace,
            name=deployment["metadata"]["name"])

    pod_names = common.get_deployment_pod_names(core_api, deployment)
    vol_name = get_volume_name(core_api, pvc_name)

    write_pod_volume_random_data(core_api, pod_names[0], "/data/test",
                                 DATA_SIZE_IN_MB_3)
    # NOTE(review): the data was written to /data/test, while 'default' is
    # passed as the third positional argument here - confirm whether
    # read_volume_data expects a filename or a namespace in that position.
    data = read_volume_data(core_api, pod_names[0], 'default')

    scale_deployment(0)
    vol = wait_for_volume_detached(client, vol_name)

    backup_every_two_minutes = {
        "name": RECURRING_JOB_NAME,
        "cron": "*/2 * * * *",
        "task": "backup",
        "retain": 1
    }
    vol.recurringUpdate(jobs=[backup_every_two_minutes])

    time.sleep(60)
    wait_for_recurring_backup_to_start(client, core_api, vol_name,
                                       expected_snapshot_count=1,
                                       minimum_progress=50)

    crash_engine_process_with_sigkill(client, core_api, vol_name)

    # Check if the volume is reattached after recurring backup is
    # interrupted
    time.sleep(10)
    wait_for_volume_healthy_no_frontend(client, vol_name)

    # Since the backup state is removed after the backup complete and it
    # could happen quickly. Checking for the both in-progress and complete
    # state could be hard to catch, thus we only check the complete state
    common.wait_for_backup_state(
        client, vol_name,
        lambda b: b.state == "complete" and b.error == "")

    wait_for_volume_detached(client, vol_name)

    scale_deployment(1)
    wait_deployment_replica_ready(apps_api, deployment["metadata"]["name"],
                                  1)

    pod_names = common.get_deployment_pod_names(core_api, deployment)
    assert read_volume_data(core_api, pod_names[0], 'default') == data

    # Use fixture to cleanup the backupstore and since we
    # crashed the engine replica initiated the backup, it's
    # backupstore lock will still be present, so we need
    # to wait till the lock is expired, before we can delete
    # the backups
    vol.recurringUpdate(jobs=[])
    backupstore.backupstore_wait_for_lock_expiration()
def test_recurring_jobs_for_detached_volume(
        set_random_backupstore, client, core_api, apps_api, volume_name,
        make_deployment_with_pvc):  # NOQA
    """
    Test recurring jobs for detached volume

    Context:
    In the current Longhorn implementation, users cannot do recurring
    backup when volumes are detached.
    This feature gives the users an option to do recurring backup even when
    volumes are detached.
    longhorn/longhorn#1509

    Steps:
    1.  Change the setting allow-recurring-job-while-volume-detached to
        true.
    2.  Create and attach volume, write `50 * Mi` of random data to the
        volume.
    3.  Detach the volume.
    4.  Set the recurring backup for the volume on every minute.
    5.  After a backup completes, check 4 times, 30 seconds apart, that
        there is exactly 1 backup.
    6.  Delete the recurring backup.
    7.  Create a PV and PVC from the volume.
    8.  Create a deployment of 1 pod using the PVC.
    9.  Write 400MB data to the volume from the pod.
    10. Scale down the deployment. Wait until the volume is detached.
    11. Set the recurring backup for every 2 minutes.
    12. Wait until the recurring backup starts, scale up the deployment to
        1 pod.
    13. Wait for the pod to be auto-attached after the first backup
        completes.
    14. Delete the recurring backup.
    15. Verify that the pod eventually reaches the Running phase.

    NOTE(review): nothing in this body resets
    allow-recurring-job-while-volume-detached to false - confirm that
    cleanup happens elsewhere.
    """
    namespace = 'default'

    def backup_job(cron):
        # Single recurring backup job spec keeping one backup.
        return [{
            "name": RECURRING_JOB_NAME,
            "cron": cron,
            "task": "backup",
            "retain": 1
        }]

    detached_job_setting = client.by_id_setting(
        SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED)
    client.update(detached_job_setting, value="true")

    vol = common.create_and_check_volume(client, volume_name,
                                         size=str(1 * Gi))

    host_id = get_self_host_id()
    vol.attach(hostId=host_id)
    vol = wait_for_volume_healthy(client, vol.name)

    payload = {
        'pos': 0,
        'content': common.generate_random_data(50 * Mi),
    }
    common.write_volume_data(vol, payload)

    # Give some time for data to flush to disk
    time.sleep(15)

    vol.detach(hostId="")
    vol = common.wait_for_volume_detached(client, vol.name)

    vol.recurringUpdate(jobs=backup_job("*/1 * * * *"))
    common.wait_for_backup_completion(client, vol.name)

    # With retain=1 the backup count has to stay at one while the job
    # keeps firing.
    for _ in range(4):
        backup_volume = client.by_id_backupVolume(vol.name)
        assert len(backup_volume.backupList().data) == 1
        time.sleep(30)

    vol.recurringUpdate(jobs=[])

    pv_name = volume_name + "-pv"
    common.create_pv_for_volume(client, core_api, vol, pv_name)

    pvc_name = volume_name + "-pvc"
    common.create_pvc_for_volume(client, core_api, vol, pvc_name)

    deployment_name = volume_name + "-dep"
    deployment = make_deployment_with_pvc(deployment_name, pvc_name)
    common.create_and_wait_deployment(apps_api, deployment)

    def scale_deployment(replicas):
        # Patch the live Deployment to the requested pod replica count.
        deployment['spec']['replicas'] = replicas
        apps_api.patch_namespaced_deployment(
            body=deployment,
            namespace=namespace,
            name=deployment["metadata"]["name"])

    size_mb = 400
    pod_names = common.get_deployment_pod_names(core_api, deployment)
    write_pod_volume_random_data(core_api, pod_names[0], "/data/test",
                                 size_mb)

    scale_deployment(0)
    vol = common.wait_for_volume_detached(client, vol.name)

    vol.recurringUpdate(jobs=backup_job("*/2 * * * *"))
    common.wait_for_backup_to_start(client, vol.name)

    # Bring the workload back while the recurring backup is in flight.
    scale_deployment(1)

    deployment_label_name = deployment["metadata"]["labels"]["name"]
    common.wait_pod_auto_attach_after_first_backup_completion(
        client, core_api, vol.name, deployment_label_name)

    vol.recurringUpdate(jobs=[])

    pod_names = common.get_deployment_pod_names(core_api, deployment)
    common.wait_for_pod_phase(core_api, pod_names[0], pod_phase="Running")