def test_pv_creation(client, core_api):  # NOQA
    """
    Test creating a PV for a volume through the volume API.

    1. Create a 2-replica volume and wait for it to become detached.
    2. Create a PV for the volume.
    3. Request a second PV for the same volume; an exception is expected.
    4. Wait for the volume's Kubernetes status to report the first PV as
       `Available` with no PVC, namespace or reference timestamps.
    5. Delete the PV.
    """
    volume_name = "test-pv-creation"
    client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=2)
    volume = wait_for_volume_detached(client, volume_name)

    first_pv_name = "pv-" + volume_name
    create_pv_for_volume(client, core_api, volume, first_pv_name)

    # Asking for one more PV on the same volume is expected to raise.
    second_pv_name = "pv2-" + volume_name
    with pytest.raises(Exception) as e:
        volume.pvCreate(pvName=second_pv_name)
        # NOTE(review): while this assert lives inside the `with` body it is
        # skipped as soon as pvCreate raises. Confirm the expected error
        # message before moving it after the block.
        assert "already exist" in str(e.value)

    expected_status = {
        'pvName': first_pv_name,
        'pvStatus': 'Available',
        'namespace': '',
        'pvcName': '',
        'lastPVCRefAt': '',
        'lastPodRefAt': '',
    }
    wait_volume_kubernetes_status(client, volume_name, expected_status)

    delete_and_wait_pv(core_api, first_pv_name)
def test_pv_creation(client, core_api):  # NOQA
    """
    Test creating PV using Longhorn API

    1. Create volume
    2. Create PV for the volume
    3. Try to create another PV for the same volume. It should fail.
    4. Check Kubernetes Status for the volume since PV is created.
    5. Delete the PV.
    """
    volume_name = "test-pv-creation"  # NOQA
    client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=2)
    volume = wait_for_volume_detached(client, volume_name)

    pv_name = "pv-" + volume_name
    duplicate_pv_name = "pv2-" + volume_name

    create_pv_for_volume(client, core_api, volume, pv_name)

    # The volume already owns a PV, so this second request should be refused.
    with pytest.raises(Exception) as e:
        volume.pvCreate(pvName=duplicate_pv_name)
        # NOTE(review): while this assert lives inside the `with` body it is
        # skipped as soon as pvCreate raises. Confirm the expected error
        # message before moving it after the block.
        assert "already exist" in str(e.value)

    # Only the PV fields are populated: nothing has claimed the PV yet.
    wait_volume_kubernetes_status(client, volume_name, {
        'pvName': pv_name,
        'pvStatus': 'Available',
        'namespace': '',
        'pvcName': '',
        'lastPVCRefAt': '',
        'lastPodRefAt': '',
    })

    delete_and_wait_pv(core_api, pv_name)
def test_delete_with_static_pv(client, core_api, volume_name):  # NOQA
    """
    Test that deleting a Volume which has a static Persistent Volume and
    Persistent Volume Claim also removes those resources.

    1. Create a Volume in Longhorn.
    2. Create a static Persistent Volume and Persistent Volume Claim for
       the Volume through Longhorn.
    3. Wait for the Kubernetes Status to indicate the existence of these
       resources.
    4. Attempt deletion of the Volume.
    5. Verify that the Volume and its associated resources have been
       deleted.
    """
    pv_name = 'pv-' + volume_name
    pvc_name = 'pvc-' + volume_name

    volume = create_and_check_volume(client, volume_name)
    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)

    # The PV must show up as bound to the PVC before the volume is deleted.
    bound_status = {
        'pvName': pv_name,
        'pvStatus': 'Bound',
        'namespace': 'default',
        'pvcName': pvc_name,
        'lastPVCRefAt': '',
        'lastPodRefAt': '',
    }
    wait_volume_kubernetes_status(client, volume_name, bound_status)

    client.delete(volume)

    # Deleting the volume is expected to take the PV and PVC with it.
    wait_for_volume_delete(client, volume_name)
    wait_delete_pv(core_api, pv_name)
    wait_delete_pvc(core_api, pvc_name)
def test_xfs_pv_existing_volume(client, core_api, pod_manifest):  # NOQA
    """
    Test create PV with existing XFS filesystem

    1. Create a volume
    2. Create PV/PVC for the existing volume, specify `xfs` as filesystem
    3. Attach the volume to the current node.
    4. Format it to `xfs`
    5. Detach the volume and create a POD using it

    FIXME: We should write data in step 4 and validate the data in step 5,
    make sure the disk won't be reformatted
    """
    volume_name = generate_volume_name()
    volume = create_and_check_volume(client, volume_name)

    # The PV and the PVC both reuse the volume name.
    create_pv_for_volume(client, core_api, volume, volume_name, "xfs")
    create_pvc_for_volume(client, core_api, volume, volume_name)

    # Attach to this host so the block device can be formatted here.
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)

    subprocess.check_call(['mkfs.xfs', get_volume_endpoint(volume)])

    # Release the volume again before handing it over to the pod.
    volume.detach()
    wait_for_volume_detached(client, volume_name)

    pod_manifest['spec']['volumes'] = [{
        "name": "pod-data",
        "persistentVolumeClaim": {
            "claimName": volume_name
        }
    }]
    create_and_wait_pod(core_api, pod_manifest)
def test_xfs_pv(client, core_api, pod_manifest):  # NOQA
    """
    Test create PV with new XFS filesystem

    1. Create a volume
    2. Create a PV for the existing volume, specify `xfs` as filesystem
    3. Create PVC and Pod
    4. Make sure Pod is running.
    5. Write data into the pod and read back for validation.

    Note: The volume will be formatted to XFS filesystem by Kubernetes in
    this case.
    """
    volume_name = generate_volume_name()
    volume = create_and_check_volume(client, volume_name)

    # The PV and the PVC both reuse the volume name.
    create_pv_for_volume(client, core_api, volume, volume_name, "xfs")
    create_pvc_for_volume(client, core_api, volume, volume_name)

    claim_source = {"claimName": volume_name}
    pod_manifest['spec']['volumes'] = [{
        "name": "pod-data",
        "persistentVolumeClaim": claim_source
    }]

    pod_name = pod_manifest['metadata']['name']
    create_and_wait_pod(core_api, pod_manifest)

    # Round-trip random data through the pod's mount.
    written = generate_random_data(VOLUME_RWTEST_SIZE)
    write_pod_volume_data(core_api, pod_name, written)
    read_back = read_volume_data(core_api, pod_name)

    assert read_back == written
def test_allow_volume_creation_with_degraded_availability_csi(
        client, core_api, apps_api, make_deployment_with_pvc):  # NOQA
    """
    Test Allow Volume Creation with Degraded Availability (CSI)

    Requirement:
    1. Set `allow-volume-creation-with-degraded-availability` to true.
    2. Set `node-level-soft-anti-affinity` to false.

    Steps:
    1. Disable scheduling for node 3.
    2. Create a Deployment Pod with a volume and 3 replicas.
        1. After the volume is attached, scheduling error should be seen.
    3. Write data to the Pod.
    4. Scale down the deployment to 0 to detach the volume.
        1. Scheduled condition should become true.
    5. Scale up the deployment back to 1 and verify the data.
        1. Scheduled condition should become false.
    6. Enable the scheduling for node 3.
        1. Volume should start rebuilding on the node 3 soon.
        2. Once the rebuilding starts, the scheduled condition should
           become true.
    7. Once rebuild finished, scale down and back the deployment to
       verify the data.
    """
    data_path = "/data/test"

    def scale_deployment(replicas):
        # Patch the deployment created below to the requested replica count.
        deployment['spec']['replicas'] = replicas
        apps_api.patch_namespaced_deployment(body=deployment,
                                             namespace='default',
                                             name=deployment_name)

    def wait_attached():
        common.wait_for_volume_status(client, vol.name,
                                      common.VOLUME_FIELD_STATE,
                                      common.VOLUME_STATE_ATTACHED)

    def current_pod_md5sum():
        # Checksum of the test file as seen from whichever pod is running.
        running_pod = common.wait_and_get_any_deployment_pod(
            core_api, deployment_name)
        return get_pod_data_md5sum(core_api, running_pod.metadata.name,
                                   data_path)

    # Requirements.
    degraded_setting = client.by_id_setting(
        common.SETTING_DEGRADED_AVAILABILITY)
    client.update(degraded_setting, value="true")
    anti_affinity_setting = client.by_id_setting(
        SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(anti_affinity_setting, value="false")

    # Step 1: take the third listed node out of scheduling.
    node3 = client.list_node()[2]
    client.update(node3, allowScheduling=False)

    # Step 2: volume + PV/PVC + deployment.
    vol = common.create_and_check_volume(client, generate_volume_name(),
                                         size=str(500 * Mi))

    pv_name = vol.name + "-pv"
    common.create_pv_for_volume(client, core_api, vol, pv_name)

    pvc_name = vol.name + "-pvc"
    common.create_pvc_for_volume(client, core_api, vol, pvc_name)

    deployment_name = vol.name + "-dep"
    deployment = make_deployment_with_pvc(deployment_name, pvc_name)
    deployment["spec"]["replicas"] = 3
    apps_api.create_namespaced_deployment(body=deployment,
                                          namespace='default')

    wait_attached()
    common.wait_scheduling_failure(client, vol.name)

    # Step 3: write data and remember its checksum.
    pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name)
    common.write_pod_volume_random_data(core_api, pod.metadata.name,
                                        data_path,
                                        common.DATA_SIZE_IN_MB_2)
    created_md5sum = get_pod_data_md5sum(core_api, pod.metadata.name,
                                         data_path)

    # Step 4: detach by scaling to zero.
    scale_deployment(0)
    vol = common.wait_for_volume_detached(client, vol.name)
    assert vol.conditions[VOLUME_CONDITION_SCHEDULED]['status'] == "True"

    # Step 5: re-attach and verify the data.
    scale_deployment(1)
    wait_attached()
    common.wait_for_volume_condition_scheduled(
        client, vol.name, "status", common.CONDITION_STATUS_FALSE)
    assert created_md5sum == current_pod_md5sum()

    # Step 6: bring node 3 back and let the volume rebuild.
    client.update(node3, allowScheduling=True)
    common.wait_for_rebuild_start(client, vol.name)
    vol = client.by_id_volume(vol.name)
    assert vol.conditions[VOLUME_CONDITION_SCHEDULED]['status'] == "True"
    common.wait_for_rebuild_complete(client, vol.name)

    # Step 7: bounce the deployment once more and re-verify the data.
    scale_deployment(0)
    common.wait_for_volume_detached(client, vol.name)

    scale_deployment(1)
    wait_attached()
    assert created_md5sum == current_pod_md5sum()
def test_replica_auto_balance_zone_best_effort_with_data_locality(
        client, core_api, volume_name, pod):  # NOQA
    """
    Background:
    Given set `replica-soft-anti-affinity` to `true`.
    And set `replica-zone-soft-anti-affinity` to `true`.
    And set `default-data-locality` to `best-effort`.
    And set `replicaAutoBalance` to `best-effort`.
    And set node-1 to zone-1.
        set node-2 to zone-1.
        set node-3 to zone-2.
    And create volume with 2 replicas.
    And create pv for volume.
    And create pvc for volume.

    Scenario Outline: replica auto-balance zones with best-effort should
                      not remove pod local replicas when data locality is
                      enabled (best-effort).

    Given create and wait pod on <pod-node>.
    And disable scheduling and evict node-3.
    And count replicas on each nodes.
    And 1 replica running on <pod-node>.
        1 replica running on <duplicate-node>.
        0 replica running on node-3.

    When enable scheduling for node-3.
    Then count replicas on each nodes.
    And 1 replica running on <pod-node>.
        0 replica running on <duplicate-node>.
        1 replica running on node-3.
    And count replicas in each zones.
    And 1 replica running in zone-1.
        1 replica running in zone-2.
    And loop 3 times with each wait 5 seconds and count replicas on each
        nodes. To ensure no additional scheduling is happening.
        1 replica running on <pod-node>.
        0 replica running on <duplicate-node>.
        1 replica running on node-3.

    And delete pod.

    Examples:
        | pod-node | duplicate-node |
        | node-1   | node-2         |
        | node-2   | node-1         |
        | node-1   | node-2         |
    """

    def replica_count(node_name, running):
        # Replicas of the test volume that are placed on `node_name`.
        return common.get_host_replica_count(
            client, volume_name, node_name, chk_running=running)

    # Background: settings.
    common.update_setting(client,
                          SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY, "true")
    common.update_setting(client,
                          SETTING_REPLICA_ZONE_SOFT_ANTI_AFFINITY, "true")
    common.update_setting(client,
                          SETTING_DEFAULT_DATA_LOCALITY, "best-effort")
    common.update_setting(client,
                          SETTING_REPLICA_AUTO_BALANCE, "best-effort")

    # Background: zone layout.
    n1, n2, n3 = client.list_node()
    set_k8s_node_zone_label(core_api, n1.name, ZONE1)
    set_k8s_node_zone_label(core_api, n2.name, ZONE1)
    set_k8s_node_zone_label(core_api, n3.name, ZONE2)
    wait_longhorn_node_zone_updated(client)

    # Background: volume with PV/PVC, both named after the volume.
    n_replicas = 2
    volume = create_and_check_volume(client, volume_name,
                                     num_of_replicas=n_replicas)
    common.create_pv_for_volume(client, core_api, volume, volume_name)
    common.create_pvc_for_volume(client, core_api, volume, volume_name)
    pod['spec']['volumes'] = [{
        "name": "pod-data",
        "persistentVolumeClaim": {
            "claimName": volume_name
        }
    }]

    for i in range(1, 4):
        # Alternate the pod between node-1 and node-2 (see Examples).
        pod_node_name = n2.name if i % 2 == 0 else n1.name
        pod['spec']['nodeSelector'] = {
            "kubernetes.io/hostname": pod_node_name
        }
        common.create_and_wait_pod(core_api, pod)

        client.update(n3, allowScheduling=False, evictionRequested=True)

        # The zone-1 node that is NOT hosting the pod.
        duplicate_node = [n1.name, n2.name]
        duplicate_node.remove(pod_node_name)
        duplicate_node_name = duplicate_node[0]

        # With node-3 evicted both replicas should sit in zone-1.
        for _ in range(RETRY_COUNTS):
            pod_node_r_count = replica_count(pod_node_name, True)
            duplicate_node_r_count = replica_count(duplicate_node_name,
                                                   True)
            balance_node_r_count = replica_count(n3.name, False)

            if pod_node_r_count == duplicate_node_r_count == 1 and \
                    balance_node_r_count == 0:
                break

            time.sleep(RETRY_INTERVAL)
        assert pod_node_r_count == 1
        assert duplicate_node_r_count == 1
        assert balance_node_r_count == 0

        client.update(n3, allowScheduling=True)

        # Once node-3 is schedulable the duplicate replica should move
        # there while the replica local to the pod stays put.
        for _ in range(RETRY_COUNTS):
            pod_node_r_count = replica_count(pod_node_name, True)
            duplicate_node_r_count = replica_count(duplicate_node_name,
                                                   False)
            balance_node_r_count = replica_count(n3.name, True)

            if pod_node_r_count == balance_node_r_count == 1 and \
                    duplicate_node_r_count == 0:
                break

            time.sleep(RETRY_INTERVAL)
        assert pod_node_r_count == 1
        assert duplicate_node_r_count == 0
        assert balance_node_r_count == 1

        z1_r_count = get_zone_replica_count(
            client, volume_name, ZONE1, chk_running=True)
        z2_r_count = get_zone_replica_count(
            client, volume_name, ZONE2, chk_running=True)
        assert z1_r_count == z2_r_count == 1

        # loop 3 times and each to wait 5 seconds to ensure there is no
        # re-scheduling happening.
        for _ in range(3):
            time.sleep(5)
            assert pod_node_r_count == replica_count(pod_node_name, True)
            assert duplicate_node_r_count == replica_count(
                duplicate_node_name, False)
            assert balance_node_r_count == replica_count(n3.name, True)

        common.delete_and_wait_pod(core_api, pod['metadata']['name'])
def test_offline_node_with_attached_volume_and_pod(
        client, core_api, volume_name, make_deployment_with_pvc,
        reset_cluster_ready_status):  # NOQA
    """
    Test offline node with attached volume and pod

    1. Create PV/PVC/Deployment manifest.
    2. Update deployment's tolerations to 20 seconds to speed up test
    3. Update deployment's node affinity rule to avoid the current node
    4. Create volume, PV/PVC and deployment.
    5. Find the pod in the deployment and write `test_data` into it
    6. Shutdown the node pod is running on
    7. Wait for deployment to delete the pod
        1. Deployment cannot delete the pod here because kubelet doesn't
           respond
    8. Force delete the terminating pod
    9. Wait for the new pod to be created and the volume attached
    10. Check `test_data` in the new pod
    """
    toleration_seconds = 20

    apps_api = get_apps_api_client()
    cloudprovider = detect_cloudprovider()

    # The `volume_name` fixture value is replaced by a generated name.
    volume_name = generate_volume_name()
    pv_name = volume_name + "-pv"
    pvc_name = volume_name + "-pvc"
    deployment_name = volume_name + "-dep"

    longhorn_test_node_name = get_self_host_id()

    deployment_manifest = make_deployment_with_pvc(deployment_name, pvc_name)
    pod_spec = deployment_manifest["spec"]["template"]["spec"]

    # Evict the pod soon after its node turns unreachable / not-ready.
    pod_spec["tolerations"] = [
        {
            "key": taint_key,
            "operator": "Exists",
            "effect": "NoExecute",
            "tolerationSeconds": toleration_seconds
        }
        for taint_key in ("node.kubernetes.io/unreachable",
                          "node.kubernetes.io/not-ready")
    ]

    # Keep the workload off the node this test runs on, since the node
    # hosting the pod is shut down below.
    pod_spec["affinity"] = {
        "nodeAffinity": {
            "requiredDuringSchedulingIgnoredDuringExecution": {
                "nodeSelectorTerms": [{
                    "matchExpressions": [{
                        "key": "kubernetes.io/hostname",
                        "operator": "NotIn",
                        "values": [longhorn_test_node_name]
                    }]
                }]
            }
        }
    }

    longhorn_volume = create_and_check_volume(client, volume_name, size=SIZE)
    wait_for_volume_detached(client, volume_name)

    create_pv_for_volume(client, core_api, longhorn_volume, pv_name)
    create_pvc_for_volume(client, core_api, longhorn_volume, pvc_name)

    create_and_wait_deployment(apps_api, deployment_manifest)

    deployment_label_selector = \
        "name=" + deployment_manifest["metadata"]["labels"]["name"]

    def list_deployment_pods():
        # Pods in `default` that carry the deployment's name label.
        return core_api.list_namespaced_pod(
            namespace="default",
            label_selector=deployment_label_selector).items

    deployment_pods = list_deployment_pods()
    assert len(deployment_pods) == 1

    original_pod = deployment_pods[0]
    pod_name = original_pod.metadata.name
    test_data = generate_random_data(VOLUME_RWTEST_SIZE)
    write_pod_volume_data(core_api, pod_name, test_data)

    # Power off the node that hosts the pod through the cloud provider.
    node_name = original_pod.spec.node_name
    node = cloudprovider.node_id(node_name)
    cloudprovider.node_shutdown(node)

    k8s_node_down = wait_for_node_down_k8s(node_name, core_api)
    assert k8s_node_down

    # NOTE(review): presumably the client is re-created because the earlier
    # one may have been served by the node that just went down - confirm.
    client = get_longhorn_api_client()

    longhorn_node_down = wait_for_node_down_longhorn(node_name, client)
    assert longhorn_node_down

    # Give the tolerations time to expire (plus a 5 second margin).
    time.sleep(toleration_seconds + 5)

    # Poll until one of the deployment's pods is marked for deletion.
    # Initialised up front so the assert below fails cleanly, instead of
    # raising NameError, if the retry budget is zero.
    terminating_pod_name = None
    for _ in range(TERMINATING_POD_RETRYS):
        terminating_pod_name = next(
            (p.metadata.name for p in list_deployment_pods()
             if p.metadata.deletion_timestamp is not None),
            None)
        if terminating_pod_name is not None:
            break
        time.sleep(TERMINATING_POD_INTERVAL)

    assert terminating_pod_name is not None

    # Force-delete the stuck pod (zero grace period), then wait for it to
    # be gone.
    core_api.delete_namespaced_pod(namespace="default",
                                   name=terminating_pod_name,
                                   grace_period_seconds=0)
    delete_and_wait_pod(core_api, terminating_pod_name)

    assert len(list_deployment_pods()) == 1

    wait_for_volume_detached(client, volume_name)
    wait_for_volume_healthy(client, volume_name)

    deployment_pods = list_deployment_pods()
    assert len(deployment_pods) == 1

    new_pod_name = deployment_pods[0].metadata.name
    wait_pod(new_pod_name)

    # The replacement pod must see the data written before the shutdown.
    resp_data = read_volume_data(core_api, new_pod_name)
    assert test_data == resp_data
def test_backup_kubernetes_status(client, core_api, pod):  # NOQA
    """
    Test that Backups have KubernetesStatus stored properly when there is
    an associated PersistentVolumeClaim and Pod.

    The same check is done twice: once while the PV/PVC/Pod exist, and once
    after they were deleted. In both rounds a volume is restored from the
    backup and its `lastPodRefAt` / `lastPVCRefAt` are compared with the
    values expected for that round.
    """

    def workloads():
        # Fresh copy of the single-pod workload entry expected everywhere.
        return [{
            'podName': pod_name,
            'podStatus': 'Running',
            'workloadName': '',
            'workloadType': ''
        }]

    def expected_status(pod_ref_at, pvc_ref_at, pv='', pv_status=''):
        # Build the KubernetesStatus dict expected for the given refs.
        return {
            'lastPodRefAt': pod_ref_at,
            'lastPVCRefAt': pvc_ref_at,
            'namespace': 'default',
            'pvcName': pvc_name,
            'pvName': pv,
            'pvStatus': pv_status,
            'workloadsStatus': workloads()
        }

    host_id = get_self_host_id()

    static_sc_name = "longhorn-static-test"
    setting = client.by_id_setting(SETTING_DEFAULT_LONGHORN_STATIC_SC)
    setting = client.update(setting, value=static_sc_name)
    assert setting["value"] == static_sc_name

    volume_name = "test-backup-kubernetes-status-pod"
    client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=2)
    volume = wait_for_volume_detached(client, volume_name)

    pod_name = "pod-" + volume_name
    pv_name = "pv-" + volume_name
    pvc_name = "pvc-" + volume_name
    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)

    # The PVC must have picked up the storage class configured above.
    ret = core_api.list_namespaced_persistent_volume_claim(
        namespace='default')
    pvc_found = next(
        (item for item in ret.items if item.metadata.name == pvc_name),
        False)
    assert pvc_found
    assert pvc_found.spec.storage_class_name == static_sc_name

    pod['metadata']['name'] = pod_name
    pod['spec']['volumes'] = [{
        'name': pod['spec']['containers'][0]['volumeMounts'][0]['name'],
        'persistentVolumeClaim': {
            'claimName': pvc_name,
        },
    }]
    create_and_wait_pod(core_api, pod)

    ks = expected_status('', '', pv=pv_name, pv_status='Bound')
    wait_volume_kubernetes_status(client, volume_name, ks)
    volume = wait_for_volume_healthy(client, volume_name)

    # Create Backup manually instead of calling create_backup since
    # Kubernetes is not guaranteed to mount our Volume to the test host.
    snap = volume.snapshotCreate()
    volume.snapshotBackup(name=snap["name"])
    bv, b = find_backup(client, volume_name, snap["name"])
    new_b = bv.backupGet(name=b["name"])
    status = loads(new_b["labels"].get(KUBERNETES_STATUS_LABEL))
    assert status == ks

    restore_name = generate_volume_name()
    client.create_volume(name=restore_name, size=SIZE, numberOfReplicas=2,
                         fromBackup=b["url"])
    wait_for_volume_restoration_completed(client, restore_name)
    wait_for_volume_detached(client, restore_name)

    snapshot_created = b["snapshotCreated"]
    # Restoration should not apply PersistentVolume data.
    ks = expected_status(snapshot_created, snapshot_created)
    wait_volume_kubernetes_status(client, restore_name, ks)
    restore = client.by_id_volume(restore_name)
    # We need to compare LastPodRefAt and LastPVCRefAt manually since
    # wait_volume_kubernetes_status only checks for empty or non-empty
    # state.
    assert restore["kubernetesStatus"]["lastPodRefAt"] == \
        ks["lastPodRefAt"]
    assert restore["kubernetesStatus"]["lastPVCRefAt"] == \
        ks["lastPVCRefAt"]

    bv.backupDelete(name=b["name"])
    client.delete(restore)
    wait_for_volume_delete(client, restore_name)
    delete_and_wait_pod(core_api, pod_name)
    delete_and_wait_pvc(core_api, pvc_name)
    delete_and_wait_pv(core_api, pv_name)

    # With the Pod, PVC, and PV deleted, the Volume should have both Ref
    # fields set. Check that a new Backup and Restore will use this instead
    # of manually populating the Ref fields.
    ks = expected_status('NOT NULL', 'NOT NULL')
    wait_volume_kubernetes_status(client, volume_name, ks)
    volume = wait_for_volume_detached(client, volume_name)

    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)

    snap = volume.snapshotCreate()
    volume.snapshotBackup(name=snap["name"])
    bv, b = find_backup(client, volume_name, snap["name"])
    new_b = bv.backupGet(name=b["name"])
    status = loads(new_b["labels"].get(KUBERNETES_STATUS_LABEL))
    # Check each field manually, we have no idea what the LastPodRefAt or
    # the LastPVCRefAt will be. We just know it shouldn't be
    # SnapshotCreated.
    assert status["lastPodRefAt"] != snapshot_created
    assert status["lastPVCRefAt"] != snapshot_created
    assert status["namespace"] == "default"
    assert status["pvcName"] == pvc_name
    assert status["pvName"] == ""
    assert status["pvStatus"] == ""
    assert status["workloadsStatus"] == workloads()

    restore_name = generate_volume_name()
    client.create_volume(name=restore_name, size=SIZE, numberOfReplicas=2,
                         fromBackup=b["url"])
    wait_for_volume_restoration_completed(client, restore_name)
    wait_for_volume_detached(client, restore_name)

    ks = expected_status(status["lastPodRefAt"], status["lastPVCRefAt"])
    wait_volume_kubernetes_status(client, restore_name, ks)
    restore = client.by_id_volume(restore_name)
    assert restore["kubernetesStatus"]["lastPodRefAt"] == \
        ks["lastPodRefAt"]
    assert restore["kubernetesStatus"]["lastPVCRefAt"] == \
        ks["lastPVCRefAt"]

    bv.backupDelete(name=b["name"])
    client.delete(restore)
    cleanup_volume(client, volume)
def test_backup_kubernetes_status(set_random_backupstore, client, core_api, pod):  # NOQA
    """
    Test that Backups have KubernetesStatus stored properly when there is
    an associated PersistentVolumeClaim and Pod.

    1. Setup a random backupstore
    2. Set settings Longhorn Static StorageClass to `longhorn-static-test`
    3. Create a volume and PV/PVC. Verify the StorageClass of PVC
    4. Create a Pod using the PVC.
    5. Check volume's Kubernetes status to reflect PV/PVC/Pod correctly.
    6. Create a backup for the volume.
    7. Verify the labels of created backup reflect PV/PVC/Pod status.
    8. Restore the backup to a volume. Wait for restoration to complete.
    9. Check the volume's Kubernetes Status
        1. Make sure the `lastPodRefAt` and `lastPVCRefAt` is snapshot
           created time
    10. Delete the backup and restored volume.
    11. Delete PV/PVC/Pod.
    12. Verify volume's Kubernetes Status updated to reflect history data.
    13. Attach the volume and create another backup. Verify the labels
    14. Verify the volume's Kubernetes status.
    15. Restore the previous backup to a new volume. Wait for restoration.
    16. Verify the restored volume's Kubernetes status.
        1. Make sure `lastPodRefAt` and `lastPVCRefAt` matched volume on
           step 12
    """

    def workloads():
        # Fresh copy of the single-pod workload entry expected everywhere.
        return [{
            'podName': pod_name,
            'podStatus': 'Running',
            'workloadName': '',
            'workloadType': ''
        }]

    def expected_status(pod_ref_at, pvc_ref_at, pv='', pv_status=''):
        # Build the KubernetesStatus dict expected for the given refs.
        return {
            'lastPodRefAt': pod_ref_at,
            'lastPVCRefAt': pvc_ref_at,
            'namespace': 'default',
            'pvcName': pvc_name,
            'pvName': pv,
            'pvStatus': pv_status,
            'workloadsStatus': workloads()
        }

    host_id = get_self_host_id()

    static_sc_name = "longhorn-static-test"
    setting = client.by_id_setting(SETTING_DEFAULT_LONGHORN_STATIC_SC)
    setting = client.update(setting, value=static_sc_name)
    assert setting.value == static_sc_name

    volume_name = "test-backup-kubernetes-status-pod"  # NOQA
    client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=2)
    volume = wait_for_volume_detached(client, volume_name)

    pod_name = "pod-" + volume_name
    pv_name = "pv-" + volume_name
    pvc_name = "pvc-" + volume_name
    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)

    # The PVC must have picked up the storage class configured above.
    ret = core_api.list_namespaced_persistent_volume_claim(
        namespace='default')
    pvc_found = next(
        (item for item in ret.items if item.metadata.name == pvc_name),
        False)
    assert pvc_found
    assert pvc_found.spec.storage_class_name == static_sc_name

    pod['metadata']['name'] = pod_name
    pod['spec']['volumes'] = [{
        'name': pod['spec']['containers'][0]['volumeMounts'][0]['name'],
        'persistentVolumeClaim': {
            'claimName': pvc_name,
        },
    }]
    create_and_wait_pod(core_api, pod)

    ks = expected_status('', '', pv=pv_name, pv_status='Bound')
    wait_volume_kubernetes_status(client, volume_name, ks)
    volume = wait_for_volume_healthy(client, volume_name)

    # Create Backup manually instead of calling create_backup since
    # Kubernetes is not guaranteed to mount our Volume to the test host.
    snap = create_snapshot(client, volume_name)
    volume.snapshotBackup(name=snap.name)
    wait_for_backup_completion(client, volume_name, snap.name)
    _, b = find_backup(client, volume_name, snap.name)

    # Check backup label
    status = loads(b.labels.get(KUBERNETES_STATUS_LABEL))
    assert status == ks

    # Check backup volume label; poll until the backup volume has labels.
    for _ in range(RETRY_COUNTS):
        bv = client.by_id_backupVolume(volume_name)
        if bv is not None and bv.labels is not None:
            break
        time.sleep(RETRY_INTERVAL)
    assert bv is not None and bv.labels is not None
    status = loads(bv.labels.get(KUBERNETES_STATUS_LABEL))
    assert status == ks

    restore_name = generate_volume_name()
    client.create_volume(name=restore_name, size=SIZE, numberOfReplicas=2,
                         fromBackup=b.url)
    wait_for_volume_restoration_completed(client, restore_name)
    wait_for_volume_detached(client, restore_name)

    snapshot_created = b.snapshotCreated
    # Restoration should not apply PersistentVolume data.
    ks = expected_status(snapshot_created, snapshot_created)
    wait_volume_kubernetes_status(client, restore_name, ks)
    restore = client.by_id_volume(restore_name)
    # We need to compare LastPodRefAt and LastPVCRefAt manually since
    # wait_volume_kubernetes_status only checks for empty or non-empty
    # state.
    assert restore.kubernetesStatus.lastPodRefAt == ks["lastPodRefAt"]
    assert restore.kubernetesStatus.lastPVCRefAt == ks["lastPVCRefAt"]

    delete_backup(client, bv.name, b.name)
    client.delete(restore)
    wait_for_volume_delete(client, restore_name)
    delete_and_wait_pod(core_api, pod_name)
    delete_and_wait_pvc(core_api, pvc_name)
    delete_and_wait_pv(core_api, pv_name)

    # With the Pod, PVC, and PV deleted, the Volume should have both Ref
    # fields set. Check that a new Backup and Restore will use this instead
    # of manually populating the Ref fields.
    ks = expected_status('NOT NULL', 'NOT NULL')
    wait_volume_kubernetes_status(client, volume_name, ks)
    volume = wait_for_volume_detached(client, volume_name)

    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)

    snap = create_snapshot(client, volume_name)
    volume.snapshotBackup(name=snap.name)
    volume = wait_for_backup_completion(client, volume_name, snap.name)
    bv, b = find_backup(client, volume_name, snap.name)
    new_b = bv.backupGet(name=b.name)
    status = loads(new_b.labels.get(KUBERNETES_STATUS_LABEL))
    # Check each field manually, we have no idea what the LastPodRefAt or
    # the LastPVCRefAt will be. We just know it shouldn't be
    # SnapshotCreated.
    assert status['lastPodRefAt'] != snapshot_created
    assert status['lastPVCRefAt'] != snapshot_created
    assert status['namespace'] == "default"
    assert status['pvcName'] == pvc_name
    assert status['pvName'] == ""
    assert status['pvStatus'] == ""
    assert status['workloadsStatus'] == workloads()

    restore_name = generate_volume_name()
    client.create_volume(name=restore_name, size=SIZE, numberOfReplicas=2,
                         fromBackup=b.url)
    wait_for_volume_restoration_completed(client, restore_name)
    wait_for_volume_detached(client, restore_name)

    ks = expected_status(status['lastPodRefAt'], status['lastPVCRefAt'])
    wait_volume_kubernetes_status(client, restore_name, ks)
    restore = client.by_id_volume(restore_name)
    assert restore.kubernetesStatus.lastPodRefAt == ks["lastPodRefAt"]
    assert restore.kubernetesStatus.lastPVCRefAt == ks["lastPVCRefAt"]

    # cleanup
    backupstore_cleanup(client)
    client.delete(restore)
    cleanup_volume(client, volume)
def test_recurring_job_kubernetes_status(client, core_api, volume_name):  # NOQA
    """
    Test RecurringJob properly backs up the KubernetesStatus

    1. Setup a random backupstore.
    2. Create a volume.
    3. Create a PV from the volume, and verify the PV status.
    4. Create a backup recurring job to run every 2 minutes.
    5. Verify the recurring job runs correctly.
    6. Verify the backup contains the Kubernetes Status labels
    """
    set_random_backupstore(client)

    host_id = get_self_host_id()
    client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=2)
    volume = common.wait_for_volume_detached(client, volume_name)

    pv_name = "pv-" + volume_name
    create_pv_for_volume(client, core_api, volume, pv_name)
    wait_volume_kubernetes_status(client, volume_name, {
        'pvName': pv_name,
        'pvStatus': 'Available',
        'namespace': '',
        'pvcName': '',
        'lastPVCRefAt': '',
        'lastPodRefAt': '',
    })

    # Simple Backup Job that runs every 2 minutes, retains 1.
    backup_job = {
        "name": RECURRING_JOB_NAME,
        "cron": "*/2 * * * *",
        "task": "backup",
        "retain": 1
    }
    volume.recurringUpdate(jobs=[backup_job])
    volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    # 5 minutes
    time.sleep(300)

    # 1 from Backup, 1 from Volume Head.
    live_snapshots = sum(
        1 for snapshot in volume.snapshotList()
        if snapshot.removed is False)
    assert live_snapshots == 2

    # Verify the Labels on the actual Backup.
    bv = client.by_id_backupVolume(volume_name)
    backups = bv.backupList().data
    assert len(backups) == 1

    b = bv.backupGet(name=backups[0].name)
    status = json.loads(b.labels.get(KUBERNETES_STATUS_LABEL))
    assert b.labels.get(RECURRING_JOB_LABEL) == RECURRING_JOB_NAME
    assert status == {
        'lastPodRefAt': '',
        'lastPVCRefAt': '',
        'namespace': '',
        'pvcName': '',
        'pvName': pv_name,
        'pvStatus': 'Available',
        'workloadsStatus': None
    }
    # Two Labels: KubernetesStatus and RecurringJob.
    assert len(b.labels) == 2

    cleanup_volume(client, volume)
    delete_and_wait_pv(core_api, pv_name)
def _assert_call_rejected(call, message):
    """Invoke `call()` and require it to raise with `message` in the error."""
    with pytest.raises(Exception) as e:
        call()
    # The message check has to sit after the `with` block: inside it, any
    # statement following the raising call would never be executed.
    assert message in str(e.value)


def restore_inc_test(client, core_api, volume_name, pod):  # NOQA
    """
    Exercise incremental restoration through standby volumes.

    1. Create and attach a standard volume; activating it must be rejected
       because it is already active.
    2. Write data and take backup0, then create three standby volumes from
       backup0 and wait until each has restored it.
    3. Verify the standby fields of every standby volume and its engine.
    4. Verify that snapshot create/revert/delete/backup, PV/PVC creation,
       changing the backup target and activating with a bad frontend are
       all rejected while standby volumes exist.
    5. Activate standby volume 0 and check it holds the backup0 data.
    6. Take backup1 (zeroes over the head of the data), wait for standby
       volume 1 to pick it up, activate it and check the merged data.
    7. Take backup2, wait for standby volume 2 to pick it up, activate it
       and check the data.
    8. Give volume 2 to a pod through a PV/PVC and verify the volume's
       Kubernetes Status, then delete the pod, PVC and PV.
    9. Detach everything, delete the backups and volumes, and verify no
       volume is left.

    :param client: Longhorn API client.
    :param core_api: Kubernetes core API client.
    :param volume_name: name of the standard (source) volume.
    :param pod: pod manifest dict; its name and volumes are overwritten.
    """
    std_volume = create_and_check_volume(client, volume_name, 2, SIZE)
    lht_host_id = get_self_host_id()
    std_volume.attach(hostId=lht_host_id)
    std_volume = common.wait_for_volume_healthy(client, volume_name)

    _assert_call_rejected(
        lambda: std_volume.activate(frontend="blockdev"),
        "already in active mode")

    data0 = {'len': 4 * 1024, 'pos': 0}
    data0['content'] = common.generate_random_data(data0['len'])
    bv, backup0, _, data0 = create_backup(client, volume_name, data0)

    sb_volume0_name = "sb-0-" + volume_name
    sb_volume1_name = "sb-1-" + volume_name
    sb_volume2_name = "sb-2-" + volume_name
    sb_names = [sb_volume0_name, sb_volume1_name, sb_volume2_name]

    for sb_name in sb_names:
        client.create_volume(name=sb_name, size=SIZE,
                             numberOfReplicas=2,
                             fromBackup=backup0['url'],
                             frontend="",
                             standby=True)
    for sb_name in sb_names:
        common.wait_for_volume_restoration_completed(client, sb_name)
    sb_volumes = [common.wait_for_volume_healthy(client, sb_name)
                  for sb_name in sb_names]

    # Poll until every standby volume and its engine report backup0.
    for _ in range(RETRY_COUNTS):
        sb_volumes = [client.by_id_volume(sb_name) for sb_name in sb_names]
        sb_engines = [get_volume_engine(sb_volume)
                      for sb_volume in sb_volumes]
        volumes_ready = all(sb_volume["lastBackup"] == backup0["name"]
                            for sb_volume in sb_volumes)
        engines_ready = all(
            sb_engine["lastRestoredBackup"] == backup0["name"]
            for sb_engine in sb_engines)
        if volumes_ready and engines_ready:
            break
        time.sleep(RETRY_INTERVAL)

    for sb_volume in sb_volumes:
        assert sb_volume["standby"] is True
        assert sb_volume["lastBackup"] == backup0["name"]
        assert sb_volume["frontend"] == ""
        assert sb_volume["disableFrontend"] is True
        assert sb_volume["initialRestorationRequired"] is False
        sb_engine = get_volume_engine(sb_volume)
        assert sb_engine["lastRestoredBackup"] == backup0["name"]
        assert sb_engine["requestedBackupRestore"] == backup0["name"]

    sb_volume0, sb_volume1, sb_volume2 = sb_volumes

    sb0_snaps = sb_volume0.snapshotList()
    assert len(sb0_snaps) == 2
    # Pick the real snapshot (anything but the volume head); initialise
    # first so a missing snapshot is an assertion failure, not a NameError.
    sb0_snap = None
    for s in sb0_snaps:
        if s['name'] != "volume-head":
            sb0_snap = s
    assert sb0_snap is not None

    # Operations that must be refused on a standby volume.
    _assert_call_rejected(
        lambda: sb_volume0.snapshotCreate(),
        "cannot create snapshot for standby volume")
    _assert_call_rejected(
        lambda: sb_volume0.snapshotRevert(name=sb0_snap["name"]),
        "cannot revert snapshot for standby volume")
    _assert_call_rejected(
        lambda: sb_volume0.snapshotDelete(name=sb0_snap["name"]),
        "cannot delete snapshot for standby volume")
    _assert_call_rejected(
        lambda: sb_volume0.snapshotBackup(name=sb0_snap["name"]),
        "cannot create backup for standby volume")
    _assert_call_rejected(
        lambda: sb_volume0.pvCreate(pvName=sb_volume0_name),
        "cannot create PV for standby volume")
    _assert_call_rejected(
        lambda: sb_volume0.pvcCreate(pvcName=sb_volume0_name),
        "cannot create PVC for standby volume")

    setting = client.by_id_setting(common.SETTING_BACKUP_TARGET)
    _assert_call_rejected(
        lambda: client.update(setting, value="random.backup.target"),
        "cannot modify BackupTarget "
        "since there are existing standby volumes")
    _assert_call_rejected(
        lambda: sb_volume0.activate(frontend="wrong_frontend"),
        "invalid frontend")

    activate_standby_volume(client, sb_volume0_name)
    sb_volume0 = client.by_id_volume(sb_volume0_name)
    sb_volume0.attach(hostId=lht_host_id)
    sb_volume0 = common.wait_for_volume_healthy(client, sb_volume0_name)
    check_volume_data(sb_volume0, data0, False)

    zero_string = b'\x00'.decode('utf-8')
    _, backup1, _, data1 = create_backup(
        client, volume_name,
        {'len': 2 * 1024, 'pos': 0, 'content': zero_string * 2 * 1024})
    # use this api to update field `last backup`
    client.list_backupVolume()
    check_volume_last_backup(client, sb_volume1_name, backup1['name'])
    activate_standby_volume(client, sb_volume1_name)
    sb_volume1 = client.by_id_volume(sb_volume1_name)
    sb_volume1.attach(hostId=lht_host_id)
    sb_volume1 = common.wait_for_volume_healthy(client, sb_volume1_name)
    # backup1 only overwrote the first data1['len'] bytes, so the tail of
    # data0 must still be present behind it.
    data0_modified = {
        'len': data0['len'] - data1['len'],
        'pos': data1['len'],
        'content': data0['content'][data1['len']:],
    }
    check_volume_data(sb_volume1, data0_modified, False)
    check_volume_data(sb_volume1, data1)

    data2 = {'len': 1 * 1024 * 1024, 'pos': 0}
    data2['content'] = common.generate_random_data(data2['len'])
    _, backup2, _, data2 = create_backup(client, volume_name, data2)
    client.list_backupVolume()
    check_volume_last_backup(client, sb_volume2_name, backup2['name'])
    activate_standby_volume(client, sb_volume2_name)
    sb_volume2 = client.by_id_volume(sb_volume2_name)
    sb_volume2.attach(hostId=lht_host_id)
    sb_volume2 = common.wait_for_volume_healthy(client, sb_volume2_name)
    check_volume_data(sb_volume2, data2)

    # allocated this active volume to a pod
    sb_volume2.detach()
    sb_volume2 = common.wait_for_volume_detached(client, sb_volume2_name)

    create_pv_for_volume(client, core_api, sb_volume2, sb_volume2_name)
    create_pvc_for_volume(client, core_api, sb_volume2, sb_volume2_name)

    sb_volume2_pod_name = "pod-" + sb_volume2_name
    pod['metadata']['name'] = sb_volume2_pod_name
    pod['spec']['volumes'] = [{
        'name': pod['spec']['containers'][0]['volumeMounts'][0]['name'],
        'persistentVolumeClaim': {
            'claimName': sb_volume2_name,
        },
    }]
    create_and_wait_pod(core_api, pod)

    sb_volume2 = client.by_id_volume(sb_volume2_name)
    k_status = sb_volume2["kubernetesStatus"]
    workloads = k_status['workloadsStatus']
    assert k_status['pvName'] == sb_volume2_name
    assert k_status['pvStatus'] == 'Bound'
    assert len(workloads) == 1
    # Poll until the recorded pod status is Running.
    for _ in range(RETRY_COUNTS):
        if workloads[0]['podStatus'] == 'Running':
            break
        time.sleep(RETRY_INTERVAL)
        sb_volume2 = client.by_id_volume(sb_volume2_name)
        k_status = sb_volume2["kubernetesStatus"]
        workloads = k_status['workloadsStatus']
        assert len(workloads) == 1
    assert workloads[0]['podName'] == sb_volume2_pod_name
    assert workloads[0]['podStatus'] == 'Running'
    assert not workloads[0]['workloadName']
    assert not workloads[0]['workloadType']
    assert k_status['namespace'] == 'default'
    assert k_status['pvcName'] == sb_volume2_name
    assert not k_status['lastPVCRefAt']
    assert not k_status['lastPodRefAt']

    delete_and_wait_pod(core_api, sb_volume2_pod_name)
    delete_and_wait_pvc(core_api, sb_volume2_name)
    delete_and_wait_pv(core_api, sb_volume2_name)

    # cleanup
    std_volume.detach()
    sb_volume0.detach()
    sb_volume1.detach()
    std_volume = common.wait_for_volume_detached(client, volume_name)
    sb_volume0 = common.wait_for_volume_detached(client, sb_volume0_name)
    sb_volume1 = common.wait_for_volume_detached(client, sb_volume1_name)
    sb_volume2 = common.wait_for_volume_detached(client, sb_volume2_name)

    bv.backupDelete(name=backup2["name"])
    bv.backupDelete(name=backup1["name"])
    bv.backupDelete(name=backup0["name"])

    client.delete(std_volume)
    client.delete(sb_volume0)
    client.delete(sb_volume1)
    client.delete(sb_volume2)

    wait_for_volume_delete(client, volume_name)
    wait_for_volume_delete(client, sb_volume0_name)
    wait_for_volume_delete(client, sb_volume1_name)
    wait_for_volume_delete(client, sb_volume2_name)

    volumes = client.list_volume()
    assert len(volumes) == 0
def test_engine_live_upgrade_with_intensive_data_writing(
        client, core_api, volume_name, pod_make):  # NOQA
    """
    Live-upgrade a volume's engine image while a pod keeps writing to it

    1. Wait for the default engine image to be unreferenced, then deploy a
       compatible upgrade image and wait for it to reach its expected state.
    2. Create a 3-replica volume on the default image, expose it through a
       PV/PVC and start a pod using it.
    3. Write a large random file to the pod's /tmp and record its md5sum.
    4. Trigger the engine upgrade without waiting for it.
    5. Copy the tmp file onto the volume while the upgrade is in progress.
    6. Wait for the volume's current image to change and for the replicas
       to be in `RW` mode, then check the engine image and the endpoint.
    7. Check the reference counts of the old (0) and new (1) images.
    8. Verify the copied file, then write a second file and record its
       md5sum.
    9. Delete the pod, wait for the volume to detach and verify the volume,
       engine and all 3 replicas use the new image.
    10. Recreate the pod, wait for the volume to be healthy and verify
        both files again.
    """
    base_image = common.get_default_engine_image(client)
    default_img_name = base_image.name
    base_image = wait_for_engine_image_ref_count(client,
                                                 default_img_name, 0)

    # Build an upgrade image compatible with the default image's versions.
    engine_upgrade_image = common.get_upgrade_test_image(
        base_image.cliAPIVersion, base_image.cliAPIMinVersion,
        base_image.controllerAPIVersion, base_image.controllerAPIMinVersion,
        base_image.dataFormatVersion, base_image.dataFormatMinVersion)

    upgraded = client.create_engine_image(image=engine_upgrade_image)
    new_img_name = upgraded.name
    expected_state = get_engine_image_status_value(client, new_img_name)
    upgraded = wait_for_engine_image_state(client, new_img_name,
                                           expected_state)
    assert upgraded.refCount == 0
    assert upgraded.noRefSince != ""

    base_image = common.get_default_engine_image(client)
    default_img_name = base_image.name

    pod_name = volume_name + "-pod"
    pv_name = volume_name + "-pv"
    pvc_name = volume_name + "-pvc"
    pod = pod_make(name=pod_name)

    volume = create_and_check_volume(client, volume_name,
                                     num_of_replicas=3, size=str(1 * Gi))
    image_before = volume.engineImage
    assert image_before != engine_upgrade_image

    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)
    pod['spec']['volumes'] = [create_pvc_spec(pvc_name)]
    create_and_wait_pod(core_api, pod)

    # Everything must still be on the original image before the upgrade.
    volume = client.by_id_volume(volume_name)
    assert volume.engineImage == image_before
    assert volume.currentImage == image_before
    engine = get_volume_engine(volume)
    assert engine.engineImage == image_before
    assert engine.currentImage == image_before
    for rep in volume.replicas:
        assert rep.engineImage == image_before
        assert rep.currentImage == image_before

    tmp_file = "/tmp/test"
    volume_file1 = "/data/test1"
    write_pod_volume_random_data(core_api, pod_name,
                                 tmp_file, RANDOM_DATA_SIZE_LARGE)
    original_md5sum1 = get_pod_data_md5sum(core_api, pod_name, tmp_file)

    volume.engineUpgrade(image=engine_upgrade_image)
    # Keep writing data to the volume during the live upgrade
    copy_pod_volume_data(core_api, pod_name, tmp_file, volume_file1)

    # Wait for live upgrade complete
    wait_for_volume_current_image(client, volume_name, engine_upgrade_image)
    volume = wait_for_volume_replicas_mode(client, volume_name, "RW")
    engine = get_volume_engine(volume)
    assert engine.engineImage == engine_upgrade_image
    check_volume_endpoint(volume)

    wait_for_engine_image_ref_count(client, default_img_name, 0)
    wait_for_engine_image_ref_count(client, new_img_name, 1)

    assert get_pod_data_md5sum(core_api, pod_name, volume_file1) == \
        original_md5sum1

    volume_file2 = "/data/test2"
    write_pod_volume_random_data(core_api, pod_name,
                                 volume_file2, RANDOM_DATA_SIZE_SMALL)
    original_md5sum2 = get_pod_data_md5sum(core_api, pod_name, volume_file2)

    delete_and_wait_pod(core_api, pod_name)
    volume = wait_for_volume_detached(client, volume_name)
    assert len(volume.replicas) == 3
    assert volume.engineImage == engine_upgrade_image
    engine = get_volume_engine(volume)
    assert engine.engineImage == engine_upgrade_image
    for rep in volume.replicas:
        assert rep.engineImage == engine_upgrade_image

    create_and_wait_pod(core_api, pod)
    common.wait_for_volume_healthy(client, volume_name)

    assert get_pod_data_md5sum(core_api, pod_name, volume_file1) == \
        original_md5sum1
    assert get_pod_data_md5sum(core_api, pod_name, volume_file2) == \
        original_md5sum2
def test_pvc_creation_with_default_sc_set(client, core_api, storage_class, pod):  # NOQA
    """
    Test creating PVC with default StorageClass set

    1. Annotate the given StorageClass as the cluster default and create it.
    2. Set the Longhorn static StorageClass setting to
       `longhorn-static-test`.
    3. Create a volume, then a PV and a PVC for it.
    4. Verify the PVC uses the static StorageClass.
    5. Start a pod on the PVC and wait for the matching Kubernetes Status.
    6. Delete the pod and the PVC, wait for the volume to detach.
    7. Create a second PVC on the same PV, start the pod on it and wait
       for the Kubernetes Status to show the new PVC name.
    8. Delete the pod, PVC and PV.
    9. Delete the default StorageClass, recreate the PV/PVC and verify the
       PVC still uses the static StorageClass.
    10. Delete the PVC and PV.
    """
    # set default storage class
    storage_class['metadata']['annotations'] = \
        {"storageclass.kubernetes.io/is-default-class": "true"}
    create_storage_class(storage_class)

    static_sc_name = "longhorn-static-test"
    setting = client.by_id_setting(SETTING_DEFAULT_LONGHORN_STATIC_SC)
    setting = client.update(setting, value=static_sc_name)
    assert setting["value"] == static_sc_name

    volume_name = "test-pvc-creation-with-sc"
    pod_name = "pod-" + volume_name
    client.create_volume(name=volume_name, size=SIZE,
                         numberOfReplicas=2)
    volume = wait_for_volume_detached(client, volume_name)

    pv_name = "pv-" + volume_name
    pvc_name = "pvc-" + volume_name
    pvc_name_extra = "pvc-" + volume_name + "-extra"

    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)

    ret = core_api.list_namespaced_persistent_volume_claim(
        namespace='default')
    # Default to None so a missing PVC fails the assertion below instead
    # of raising UnboundLocalError.
    pvc_found = next(
        (item for item in ret.items if item.metadata.name == pvc_name),
        None)
    assert pvc_found is not None
    assert pvc_found.spec.storage_class_name == static_sc_name

    pod['metadata']['name'] = pod_name
    pod['spec']['volumes'] = [{
        'name': pod['spec']['containers'][0]['volumeMounts'][0]['name'],
        'persistentVolumeClaim': {
            'claimName': pvc_name,
        },
    }]
    create_and_wait_pod(core_api, pod)

    ks = {
        'pvName': pv_name,
        'pvStatus': 'Bound',
        'namespace': 'default',
        'pvcName': pvc_name,
        'lastPVCRefAt': '',
        'lastPodRefAt': '',
        'workloadsStatus': [
            {
                'podName': pod_name,
                'podStatus': 'Running',
                'workloadName': '',
                'workloadType': '',
            },
        ],
    }
    wait_volume_kubernetes_status(client, volume_name, ks)

    delete_and_wait_pod(core_api, pod_name)
    delete_and_wait_pvc(core_api, pvc_name)

    # try to reuse the pv
    volume = wait_for_volume_detached(client, volume_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name_extra)
    pod['spec']['volumes'][0]['persistentVolumeClaim']['claimName'] = \
        pvc_name_extra
    create_and_wait_pod(core_api, pod)

    ks['pvcName'] = pvc_name_extra
    wait_volume_kubernetes_status(client, volume_name, ks)

    delete_and_wait_pod(core_api, pod_name)
    delete_and_wait_pvc(core_api, pvc_name_extra)
    delete_and_wait_pv(core_api, pv_name)

    # without default storage class
    delete_storage_class(storage_class['metadata']['name'])

    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)

    ret = core_api.list_namespaced_persistent_volume_claim(
        namespace='default')
    pvc2 = next(
        (item for item in ret.items if item.metadata.name == pvc_name),
        None)
    assert pvc2 is not None
    assert pvc2.spec.storage_class_name == static_sc_name

    delete_and_wait_pvc(core_api, pvc_name)
    delete_and_wait_pv(core_api, pv_name)
def test_recurring_jobs_for_detached_volume(set_random_backupstore, client, core_api, apps_api, volume_name, make_deployment_with_pvc):  # NOQA
    """
    Test recurring jobs for detached volume

    Context:
    In the current Longhorn implementation, users cannot do recurring
    backup when volumes are detached.
    This feature gives the users an option to do recurring backup even when
    volumes are detached.
    longhorn/longhorn#1509

    Steps:
    1.  Change the setting allow-recurring-job-while-volume-detached to true.
    2.  Create and attach volume, write 50MB data to the volume.
    3.  Detach the volume.
    4.  Set the recurring backup for the volume on every minute.
    5.  In a 2-minute retry loop, verify that there is exactly 1 new backup.
    6.  Delete the recurring backup.
    7.  Create a PV and PVC from the volume.
    8.  Create a deployment of 1 pod using the PVC.
    9.  Write 400MB data to the volume from the pod.
    10. Scale down the deployment. Wait until the volume is detached.
    11. Set the recurring backup for every 2 minutes.
    12. Wait until the recurring backup starts, scale up the deployment to
        1 pod.
    13. Verify that during the recurring backup, the volume's frontend is
        disabled, and pod cannot start.
    14. Wait for the recurring backup to finish.
        Delete the recurring backup.
    15. Verify that the pod can eventually start.
    16. Change the setting allow-recurring-job-while-volume-detached to
        false (done in a `finally` so it also happens when the test fails).
    """
    recurring_job_setting = \
        client.by_id_setting(SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED)
    client.update(recurring_job_setting, value="true")

    try:
        vol = common.create_and_check_volume(client, volume_name,
                                             size=str(1 * Gi))

        lht_hostId = get_self_host_id()
        vol.attach(hostId=lht_hostId)
        vol = wait_for_volume_healthy(client, vol.name)

        data = {
            'pos': 0,
            'content': common.generate_random_data(50 * Mi),
        }
        common.write_volume_data(vol, data)

        # Give some time for data to flush to disk
        time.sleep(15)

        vol.detach(hostId="")
        vol = common.wait_for_volume_detached(client, vol.name)

        jobs = [
            {
                "name": RECURRING_JOB_NAME,
                "cron": "*/1 * * * *",
                "task": "backup",
                "retain": 1
            }
        ]
        vol.recurringUpdate(jobs=jobs)
        common.wait_for_backup_completion(client, vol.name)
        # 4 checks, 30 seconds apart: the backup count must stay at 1.
        for _ in range(4):
            bv = client.by_id_backupVolume(vol.name)
            backups = bv.backupList().data
            assert len(backups) == 1
            time.sleep(30)

        vol.recurringUpdate(jobs=[])

        pv_name = volume_name + "-pv"
        common.create_pv_for_volume(client, core_api, vol, pv_name)

        pvc_name = volume_name + "-pvc"
        common.create_pvc_for_volume(client, core_api, vol, pvc_name)

        deployment_name = volume_name + "-dep"
        deployment = make_deployment_with_pvc(deployment_name, pvc_name)
        common.create_and_wait_deployment(apps_api, deployment)

        size_mb = 400
        pod_names = common.get_deployment_pod_names(core_api, deployment)
        write_pod_volume_random_data(core_api, pod_names[0], "/data/test",
                                     size_mb)

        # Scale the deployment down so the volume gets detached.
        deployment['spec']['replicas'] = 0
        apps_api.patch_namespaced_deployment(
            body=deployment,
            namespace='default',
            name=deployment["metadata"]["name"])

        vol = common.wait_for_volume_detached(client, vol.name)

        jobs = [
            {
                "name": RECURRING_JOB_NAME,
                "cron": "*/2 * * * *",
                "task": "backup",
                "retain": 1
            }
        ]
        vol.recurringUpdate(jobs=jobs)

        common.wait_for_backup_to_start(client, vol.name)

        # Scale back up while the backup is in progress.
        deployment['spec']['replicas'] = 1
        apps_api.patch_namespaced_deployment(
            body=deployment,
            namespace='default',
            name=deployment["metadata"]["name"])

        deployment_label_name = deployment["metadata"]["labels"]["name"]
        common.wait_pod_auto_attach_after_first_backup_completion(
            client, core_api, vol.name, deployment_label_name)

        vol.recurringUpdate(jobs=[])

        pod_names = common.get_deployment_pod_names(core_api, deployment)
        common.wait_for_pod_phase(core_api, pod_names[0],
                                  pod_phase="Running")
    finally:
        # Step 16: restore the setting so it does not leak into other
        # tests. Re-fetch it rather than reusing the object read before
        # the first update.
        recurring_job_setting = client.by_id_setting(
            SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED)
        client.update(recurring_job_setting, value="false")
def test_pvc_creation_with_default_sc_set(client, core_api, storage_class, pod):  # NOQA
    """
    Test creating PVC with default StorageClass set

    The target is to make sure the newly created PV/PVC won't use the
    default StorageClass, and if there is no default StorageClass, PV/PVC
    can still be created.

    1. Create a StorageClass and set it to be the default StorageClass
    2. Update static StorageClass to `longhorn-static-test`
    3. Create volume then PV/PVC.
    4. Make sure the newly created PV/PVC is using StorageClass
       `longhorn-static-test`
    5. Create pod with PVC.
    6. Verify volume's Kubernetes Status
    7. Remove PVC and Pod.
    8. Verify volume's Kubernetes Status only contains current PV and history
    9. Wait for volume to detach (since pod is deleted)
    10. Reuse the volume on a new pod. Wait for the pod to start
    11. Verify volume's Kubernetes Status reflects the new pod.
    12. Delete PV/PVC/Pod.
    13. Verify volume's Kubernetes Status only contains history
    14. Delete the default StorageClass.
    15. Create PV/PVC for the volume.
    16. Make sure the PVC's StorageClass is the static StorageClass
    """
    # set default storage class
    storage_class['metadata']['annotations'] = \
        {"storageclass.kubernetes.io/is-default-class": "true"}
    create_storage_class(storage_class)

    static_sc_name = "longhorn-static-test"
    setting = client.by_id_setting(SETTING_DEFAULT_LONGHORN_STATIC_SC)
    setting = client.update(setting, value=static_sc_name)
    assert setting.value == static_sc_name

    volume_name = "test-pvc-creation-with-sc"  # NOQA
    pod_name = "pod-" + volume_name
    client.create_volume(name=volume_name, size=SIZE,
                         numberOfReplicas=2)
    volume = wait_for_volume_detached(client, volume_name)

    pv_name = "pv-" + volume_name
    pvc_name = "pvc-" + volume_name
    pvc_name_extra = "pvc-" + volume_name + "-extra"

    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)

    ret = core_api.list_namespaced_persistent_volume_claim(
        namespace='default')
    # Default to None so a missing PVC fails the assertion below instead
    # of raising UnboundLocalError.
    pvc_found = next(
        (item for item in ret.items if item.metadata.name == pvc_name),
        None)
    assert pvc_found is not None
    assert pvc_found.spec.storage_class_name == static_sc_name

    pod['metadata']['name'] = pod_name
    pod['spec']['volumes'] = [{
        'name': pod['spec']['containers'][0]['volumeMounts'][0]['name'],
        'persistentVolumeClaim': {
            'claimName': pvc_name,
        },
    }]
    create_and_wait_pod(core_api, pod)

    ks = {
        'pvName': pv_name,
        'pvStatus': 'Bound',
        'namespace': 'default',
        'pvcName': pvc_name,
        'lastPVCRefAt': '',
        'lastPodRefAt': '',
        'workloadsStatus': [
            {
                'podName': pod_name,
                'podStatus': 'Running',
                'workloadName': '',
                'workloadType': '',
            },
        ],
    }
    wait_volume_kubernetes_status(client, volume_name, ks)

    delete_and_wait_pod(core_api, pod_name)
    delete_and_wait_pvc(core_api, pvc_name)

    # PV is released; the PVC/pod references are now history.
    ks = {
        'pvName': pv_name,
        'pvStatus': 'Released',
        'namespace': 'default',
        'pvcName': pvc_name,
        'lastPVCRefAt': 'not empty',
        'lastPodRefAt': 'not empty',
    }
    wait_volume_kubernetes_status(client, volume_name, ks)

    # try to reuse the pv
    volume = wait_for_volume_detached(client, volume_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name_extra)
    pod['spec']['volumes'][0]['persistentVolumeClaim']['claimName'] = \
        pvc_name_extra
    create_and_wait_pod(core_api, pod)

    ks = {
        'pvName': pv_name,
        'pvStatus': 'Bound',
        'namespace': 'default',
        'pvcName': pvc_name_extra,
        'lastPVCRefAt': '',
        'lastPodRefAt': '',
        'workloadsStatus': [
            {
                'podName': pod_name,
                'podStatus': 'Running',
                'workloadName': '',
                'workloadType': '',
            },
        ],
    }
    wait_volume_kubernetes_status(client, volume_name, ks)

    delete_and_wait_pod(core_api, pod_name)
    delete_and_wait_pvc(core_api, pvc_name_extra)
    delete_and_wait_pv(core_api, pv_name)

    # Nothing is bound any more; only the history fields remain.
    ks = {
        'pvName': '',
        'pvStatus': '',
        'namespace': 'default',
        'pvcName': pvc_name_extra,
        'lastPVCRefAt': 'not empty',
        'lastPodRefAt': 'not empty',
    }
    wait_volume_kubernetes_status(client, volume_name, ks)

    # without default storage class
    delete_storage_class(storage_class['metadata']['name'])

    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)

    ret = core_api.list_namespaced_persistent_volume_claim(
        namespace='default')
    pvc2 = next(
        (item for item in ret.items if item.metadata.name == pvc_name),
        None)
    assert pvc2 is not None
    assert pvc2.spec.storage_class_name == static_sc_name

    delete_and_wait_pvc(core_api, pvc_name)
    delete_and_wait_pv(core_api, pv_name)
def generate_load(request):
    """
    Run a randomized stress workload against one Longhorn volume.

    Creates a volume, PV, PVC and pod with a random name suffix, writes
    initial data, sets up recurring jobs, then performs N_RANDOM_ACTIONS
    randomly chosen operations (write/delete data, snapshot create/delete/
    revert, backup create/restore, replica delete), printing the start and
    end time of each. The volume's backups are cleaned at the end and the
    created resources are removed through atexit handlers.

    :param request: not used in the body.
    """
    suffix = get_random_suffix()

    lh_client = get_longhorn_api_client()
    k8s_client = get_core_api_client()

    check_and_set_backupstore(lh_client)

    volume_name = STRESS_VOLUME_NAME_PREFIX + suffix
    pv_name = STRESS_PV_NAME_PREFIX + suffix
    pvc_name = STRESS_PVC_NAME_PREFIX + suffix
    pod_name = STRESS_POD_NAME_PREFIX + suffix

    # atexit runs handlers in reverse registration order, so at exit the
    # pod goes first and the data file last.
    exit_hooks = (
        (remove_datafile, pod_name),
        (delete_and_wait_longhorn, lh_client, volume_name),
        (delete_and_wait_pv, k8s_client, pv_name),
        (delete_and_wait_pvc, k8s_client, pvc_name),
        (delete_and_wait_pod, k8s_client, pod_name),
    )
    for hook in exit_hooks:
        atexit.register(*hook)

    longhorn_volume = create_and_check_volume(lh_client,
                                              volume_name,
                                              size=VOLUME_SIZE)
    wait_for_volume_detached(lh_client, volume_name)

    pod_manifest = generate_pod_with_pvc_manifest(pod_name, pvc_name)

    create_pv_for_volume(lh_client, k8s_client, longhorn_volume, pv_name)
    create_pvc_for_volume(lh_client, k8s_client, longhorn_volume, pvc_name)
    create_and_wait_pod(k8s_client, pod_manifest)

    snapshots_md5sum = dict()

    write_data(k8s_client, pod_name)
    create_recurring_jobs(lh_client, volume_name)

    # Indexed by the random action number: (start message, operation).
    actions = (
        ("write data started: ",
         lambda: write_data(k8s_client, pod_name)),
        ("delete data started: ",
         lambda: delete_data(k8s_client, pod_name)),
        ("create snapshot started: ",
         lambda: snapshot_create_and_record_md5sum(
             lh_client, k8s_client, volume_name, pod_name,
             snapshots_md5sum)),
        ("delete random snapshot started: ",
         lambda: delete_random_snapshot(
             lh_client, volume_name, snapshots_md5sum)),
        ("revert random snapshot started: ",
         lambda: revert_random_snapshot(
             lh_client, k8s_client, volume_name, pod_manifest,
             snapshots_md5sum)),
        ("create backup started: ",
         lambda: backup_create_and_record_md5sum(
             lh_client, k8s_client, volume_name, pod_name,
             snapshots_md5sum)),
        ("delete replica started: ",
         lambda: delete_replica(lh_client, volume_name)),
        ("restore random backup started: ",
         lambda: restore_and_check_random_backup(
             lh_client, k8s_client, volume_name, pod_name,
             snapshots_md5sum)),
    )

    for _ in range(N_RANDOM_ACTIONS):
        start_message, operation = actions[randrange(0, 8)]
        print(start_message + time_now(), end=', ')
        operation()
        print("ended: " + time_now())

    clean_volume_backups(lh_client, volume_name)
def test_restore_rwo_volume_to_rwx(set_random_backupstore, client, core_api, volume_name, pvc, csi_pv, pod_make, make_deployment_with_pvc):  # NOQA
    """
    Restore the backup of a volume into a new volume with access mode rwx

    1. Prepare a pod backed by a volume/PV/PVC, write data to it and
       keep the md5sum of that data.
    2. Snapshot the volume and back the snapshot up.
    3. Create volume `restored-rwx-volume` from the backup with
       'accessMode' rwx and wait until it is detached.
    4. Create a PV and a PVC for the restored volume.
    5. Start a 2-replica deployment on that PVC.
    6. Verify both pods see data with the original md5sum.
    """
    data_path = "/data/test"
    _, _, _, expected_md5sum = prepare_pod_with_data_in_mb(
        client, core_api, csi_pv, pvc, pod_make, volume_name,
        data_size_in_mb=DATA_SIZE_IN_MB_1, data_path=data_path)

    snapshot = create_snapshot(client, volume_name)
    source_volume = client.by_id_volume(volume_name)
    source_volume.snapshotBackup(name=snapshot.name)
    wait_for_backup_completion(client, volume_name, snapshot.name)
    _, backup = find_backup(client, volume_name, snapshot.name)

    restore_volume_name = 'restored-rwx-volume'
    restore_pv_name = restore_volume_name + "-pv"
    restore_pvc_name = restore_volume_name + "-pvc"

    client.create_volume(name=restore_volume_name,
                         size=str(1 * Gi),
                         numberOfReplicas=3,
                         fromBackup=backup.url,
                         accessMode='rwx')
    wait_for_volume_creation(client, restore_volume_name)
    restored = wait_for_volume_detached(client, restore_volume_name)
    create_pv_for_volume(client, core_api, restored, restore_pv_name)
    create_pvc_for_volume(client, core_api, restored, restore_pvc_name)

    # Two pods sharing one PVC.
    deployment = make_deployment_with_pvc('deployment-multi-pods-test',
                                          restore_pvc_name,
                                          replicas=2)
    apps_api = get_apps_api_client()
    create_and_wait_deployment(apps_api, deployment)

    label_selector = "name=" + deployment["metadata"]["labels"]["name"]
    pods = core_api.list_namespaced_pod(namespace="default",
                                        label_selector=label_selector)
    first_pod = pods.items[0].metadata.name
    second_pod = pods.items[1].metadata.name

    md5sum_first = get_pod_data_md5sum(core_api, first_pod, data_path)
    md5sum_second = get_pod_data_md5sum(core_api, second_pod, data_path)

    assert expected_md5sum == md5sum_first == md5sum_second
def _pick_other_node_name(nodes, excluded_node_name):
    """Return the name of the first node in `nodes` whose name is not
    `excluded_node_name`, or None when every node is excluded."""
    return next(
        (node.name for node in nodes if node.name != excluded_node_name),
        None)


def _pvc_pod_volumes(pvc_name):
    """Build the pod `spec.volumes` list that mounts PVC `pvc_name`."""
    return [{
        "name": "pod-data",
        "persistentVolumeClaim": {
            "claimName": pvc_name
        }
    }]


def _set_replica_soft_anti_affinity(client, value):
    """Best-effort update of the replica node soft anti-affinity setting.

    A failed update is only printed, it does not fail the test.
    """
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    try:
        client.update(setting, value=value)
    except Exception as e:
        print(
            "Exception when update "
            "Replica Node Level Soft Anti-Affinity setting",
            setting, e)


def _wait_for_only_replica_on_node(client, volume_name, node_name):
    """Poll until the volume has exactly one replica and it is on `node_name`.

    The volume must report healthy robustness on every poll. Asserts the
    final state and returns the last fetched volume.
    """
    for _ in range(RETRY_COUNTS):
        volume = client.by_id_volume(volume_name)
        assert volume[VOLUME_FIELD_ROBUSTNESS] == VOLUME_ROBUSTNESS_HEALTHY
        if len(volume.replicas) == 1 and \
                volume.replicas[0]['hostId'] == node_name:
            break
        time.sleep(RETRY_INTERVAL)
    assert len(volume.replicas) == 1
    assert volume.replicas[0]['hostId'] == node_name
    return volume


def _count_replicas_on_host(volume, host_id):
    """Count the replicas of `volume` whose hostId equals `host_id`.

    Pass an empty string to count replicas that have no host assigned.
    """
    return sum(1 for replica in volume.replicas
               if replica["hostId"] == host_id)


def test_data_locality_basic(client, core_api, volume_name, pod, settings_reset):  # NOQA
    """
    Test data locality basic feature

    Context:

    Data Locality feature allows users to have an option to keep a local
    replica on the same node as the consuming pod.
    Longhorn is currently supporting 2 modes:
    - disabled: Longhorn does not try to keep a local replica
    - best-effort: Longhorn try to keep a local replica

    See manual tests at:
    https://github.com/longhorn/longhorn/issues/1045#issuecomment-680706283

    Steps:

    Case 1: Test that Longhorn builds a local replica on the engine node

    1. Set the default data locality setting to disabled and create a
       volume(1) with 1 replica, plus a PV and a PVC for it.
    2. Find the node where the replica is located. Call it replica-node.
    3. Start a pod using volume(1) on a node different from replica-node.
       Call it engine-node.
    4. Write DATA_SIZE_IN_MB_2 of data to volume(1).
    5. Poll 10 times, one second apart, to verify that the only replica
       stays off the engine-node.
    6. Update dataLocality to best-effort for volume(1).
    7. Use a retry loop to verify that volume(1) stays healthy and ends up
       with a single replica located on the engine-node.
    8. Delete the pod, wait for volume(1) to detach and start the pod
       again on a node different from the current replica node.
    9. Use a retry loop to verify that volume(1) stays healthy and ends up
       with a single replica located on the new engine-node.
    10. Delete the pod and wait for volume(1) to detach.

    Case 2: Test that Longhorn prioritizes deleting replicas on the same node

    1. Add the tag AVAIL to node-1 and node-2.
    2. Set node soft anti-affinity to `true`.
    3. Create a volume(2) with 3 replicas, node selector AVAIL and
       dataLocality set to best-effort, plus a PV and a PVC for it.
    4. Start a pod using volume(2) on node-3 and wait for the volume to
       become healthy.
    5. Verify that no replica is on node-3.
    6. Set the replica count to 2 for volume(2).
    7. Wait until volume(2) has 3 replicas, then verify that exactly 2 of
       them are running and that they are on different nodes.
    8. Delete the pod and wait for volume(2) to detach.

    Case 3: Test that the volume is not corrupted if the engine crashes
    during building local replica

    1. Remove the tag AVAIL from node-1 and node-2.
       Set node soft anti-affinity to `false`.
    2. Create a volume(3) with 1 replica, plus a PV and a PVC for it.
    3. Start a pod using volume(3) on node-3 and wait for the volume to
       become healthy.
    4. Write DATA_SIZE_IN_MB_4 of data to volume(3).
    5. Update dataLocality to best-effort. If the replica is not on node-3,
       wait for the rebuild to start, verify there are 2 replicas and wait
       for the rebuild to complete.
    6. Wait until volume(3) has only 1 replica and verify it is on node-3.
    7. Delete the pod and start it again on node-1.
    8. Wait until Longhorn starts rebuilding, then crash the engine
       process with SIGKILL.
    9. Delete the pod, wait for volume(3) to detach and verify that it
       has only 1 replica, located on node-3.
    10. Start the pod on node-1 again, wait for the rebuild to start,
        verify there are 2 replicas and wait for the rebuild to complete.
    11. Wait until volume(3) has only 1 replica and verify that it is on
        node-1, in RW mode and running.
    12. Delete the pod and wait for volume(3) to detach.

    Case 4: Make sure failed to schedule local replica doesn't block the
    creation of other replicas.

    1. Set node soft anti-affinity to `false` and disable scheduling for
       node-3.
    2. Create a vol with 1 replica, `dataLocality = best-effort`.
    3. Attach vol to node-3. Verify that there are 2 replicas: the original
       one running in RW mode and a non-running one with an empty mode, and
       that the scheduled condition reason is
       `LocalReplicaSchedulingFailure`.
    4. Increase numberOfReplica to 3 and wait for the vol to be degraded.
       Verify that the replica set contains: one on node-1, one on node-2,
       at least one replica without a host.
    5. Decrease numberOfReplica to 2 and wait for 3 replicas.
       Verify that the replica set contains: one on node-1, one on node-2,
       at least one replica without a host.
    6. Decrease numberOfReplica to 1 and wait for 2 replicas.
       Verify that the replica set contains: one on node-1 or node-2,
       exactly one replica without a host.
    7. Turn off data locality by setting `dataLocality=disabled` for the
       vol, then increase numberOfReplica to 2.
       Use a retry loop to wait for 2 running, non-failed replicas.
       Verify that the replica set contains: one on node-1, one on node-2
       and none on node-3.
    """
    # Case 1: Test that Longhorn builds a local replica on the engine node

    nodes = client.list_node()

    default_data_locality_setting = \
        client.by_id_setting(SETTING_DEFAULT_DATA_LOCALITY)
    try:
        client.update(default_data_locality_setting, value="disabled")
    except Exception as e:
        # Best-effort: a failed update is reported but does not abort.
        print("Exception when update Default Data Locality setting",
              default_data_locality_setting, e)

    volume1_name = volume_name + "-1"
    volume1_size = str(500 * Mi)
    volume1_data_path = "/data/test"
    pv1_name = volume1_name + "-pv"
    pvc1_name = volume1_name + "-pvc"
    pod1_name = volume1_name + "-pod"
    # NOTE: pod1, pod2 and pod3 all alias the same `pod` fixture dict; each
    # case overwrites the fields it needs before creating the pod.
    pod1 = pod

    pod1['metadata']['name'] = pod1_name

    create_and_check_volume(client, volume1_name,
                            num_of_replicas=1,
                            size=volume1_size)

    volume1 = client.by_id_volume(volume1_name)
    create_pv_for_volume(client, core_api, volume1, pv1_name)
    create_pvc_for_volume(client, core_api, volume1, pvc1_name)

    volume1 = client.by_id_volume(volume1_name)
    volume1_replica_node = volume1.replicas[0]['hostId']

    # Run the workload on a node that does not hold the replica.
    volume1_attached_node = _pick_other_node_name(nodes,
                                                  volume1_replica_node)
    assert volume1_attached_node is not None

    pod1['spec']['volumes'] = _pvc_pod_volumes(pvc1_name)
    pod1['spec']['nodeSelector'] = \
        {"kubernetes.io/hostname": volume1_attached_node}
    create_and_wait_pod(core_api, pod1)

    write_pod_volume_random_data(core_api, pod1_name,
                                 volume1_data_path, DATA_SIZE_IN_MB_2)

    # With data locality disabled the replica must stay where it is.
    for _ in range(10):
        volume1 = client.by_id_volume(volume1_name)
        assert len(volume1.replicas) == 1
        assert volume1.replicas[0]['hostId'] != volume1_attached_node
        time.sleep(1)

    volume1 = client.by_id_volume(volume1_name)
    volume1.updateDataLocality(dataLocality="best-effort")

    volume1 = _wait_for_only_replica_on_node(client, volume1_name,
                                             volume1_attached_node)

    delete_and_wait_pod(core_api, pod1_name)
    volume1 = wait_for_volume_detached(client, volume1_name)

    # Move the workload to another node; the replica should follow it.
    volume1_replica_node = volume1.replicas[0]['hostId']
    volume1_attached_node = _pick_other_node_name(nodes,
                                                  volume1_replica_node)
    assert volume1_attached_node is not None

    pod1['spec']['nodeSelector'] = \
        {"kubernetes.io/hostname": volume1_attached_node}
    create_and_wait_pod(core_api, pod1)

    volume1 = _wait_for_only_replica_on_node(client, volume1_name,
                                             volume1_attached_node)

    delete_and_wait_pod(core_api, pod1_name)
    wait_for_volume_detached(client, volume1_name)

    # Case 2: Test that Longhorn prioritizes deleting replicas on the same node

    # The remaining cases index the first three nodes of the cluster.
    node1 = nodes[0]
    node2 = nodes[1]
    node3 = nodes[2]

    client.update(node1, allowScheduling=True, tags=["AVAIL"])
    client.update(node2, allowScheduling=True, tags=["AVAIL"])

    _set_replica_soft_anti_affinity(client, "true")

    volume2_name = volume_name + "-2"
    volume2_size = str(500 * Mi)
    pv2_name = volume2_name + "-pv"
    pvc2_name = volume2_name + "-pvc"
    pod2_name = volume2_name + "-pod"
    pod2 = pod

    pod2['metadata']['name'] = pod2_name

    volume2 = client.create_volume(name=volume2_name,
                                   size=volume2_size,
                                   numberOfReplicas=3,
                                   nodeSelector=["AVAIL"],
                                   dataLocality="best-effort")

    volume2 = wait_for_volume_detached(client, volume2_name)
    volume2 = client.by_id_volume(volume2_name)
    create_pv_for_volume(client, core_api, volume2, pv2_name)
    create_pvc_for_volume(client, core_api, volume2, pvc2_name)

    volume2 = client.by_id_volume(volume2_name)

    pod2['spec']['volumes'] = _pvc_pod_volumes(pvc2_name)
    pod2['spec']['nodeSelector'] = {"kubernetes.io/hostname": node3.name}
    create_and_wait_pod(core_api, pod2)

    volume2 = wait_for_volume_healthy(client, volume2_name)

    for replica in volume2.replicas:
        assert replica["hostId"] != node3.name

    volume2.updateReplicaCount(replicaCount=2)

    # 2 Healthy replicas and 1 replica failed to schedule
    # The failed to schedule replica is the local replica on node3
    volume2 = wait_for_volume_replica_count(client, volume2_name, 3)
    volume2 = client.by_id_volume(volume2_name)

    volume2_healthy_replicas = [replica for replica in volume2.replicas
                                if replica.running is True]
    assert len(volume2_healthy_replicas) == 2

    volume2_rep1 = volume2_healthy_replicas[0]
    volume2_rep2 = volume2_healthy_replicas[1]
    assert volume2_rep1["hostId"] != volume2_rep2["hostId"]

    delete_and_wait_pod(core_api, pod2_name)
    wait_for_volume_detached(client, volume2_name)

    # Case 3: Test that the volume is not corrupted if there is an unexpected
    # detachment during building local replica

    client.update(node1, allowScheduling=True, tags=[])
    client.update(node2, allowScheduling=True, tags=[])

    _set_replica_soft_anti_affinity(client, "false")

    volume3_name = volume_name + "-3"
    volume3_size = str(1 * Gi)
    volume3_data_path = "/data/test"
    pv3_name = volume3_name + "-pv"
    pvc3_name = volume3_name + "-pvc"
    pod3_name = volume3_name + "-pod"
    pod3 = pod

    pod3['metadata']['name'] = pod3_name

    volume3 = client.create_volume(name=volume3_name,
                                   size=volume3_size,
                                   numberOfReplicas=1)

    volume3 = wait_for_volume_detached(client, volume3_name)
    volume3 = client.by_id_volume(volume3_name)
    create_pv_for_volume(client, core_api, volume3, pv3_name)
    create_pvc_for_volume(client, core_api, volume3, pvc3_name)

    volume3 = client.by_id_volume(volume3_name)

    pod3['spec']['volumes'] = _pvc_pod_volumes(pvc3_name)
    pod3['spec']['nodeSelector'] = {"kubernetes.io/hostname": node3.name}
    create_and_wait_pod(core_api, pod3)
    volume3 = wait_for_volume_healthy(client, volume3_name)

    write_pod_volume_random_data(core_api, pod3_name,
                                 volume3_data_path, DATA_SIZE_IN_MB_4)

    volume3.updateDataLocality(dataLocality="best-effort")
    volume3 = client.by_id_volume(volume3_name)

    # A rebuild is only expected when the replica is not already local.
    if volume3.replicas[0]['hostId'] != node3.name:
        wait_for_rebuild_start(client, volume3_name)
        volume3 = client.by_id_volume(volume3_name)
        assert len(volume3.replicas) == 2
        wait_for_rebuild_complete(client, volume3_name)

    volume3 = wait_for_volume_replica_count(client, volume3_name, 1)
    assert volume3.replicas[0]["hostId"] == node3.name

    # Move the workload to node1 and crash the engine while the local
    # replica is being rebuilt there.
    delete_and_wait_pod(core_api, pod3_name)

    pod3['spec']['nodeSelector'] = {"kubernetes.io/hostname": node1.name}
    create_and_wait_pod(core_api, pod3)

    wait_for_rebuild_start(client, volume3_name)
    crash_engine_process_with_sigkill(client, core_api, volume3_name)
    delete_and_wait_pod(core_api, pod3_name)
    wait_for_volume_detached(client, volume3_name)
    volume3 = client.by_id_volume(volume3_name)
    assert len(volume3.replicas) == 1
    assert volume3.replicas[0]["hostId"] == node3.name

    create_and_wait_pod(core_api, pod3)
    wait_for_rebuild_start(client, volume3_name)
    volume3 = client.by_id_volume(volume3_name)
    assert len(volume3.replicas) == 2
    wait_for_rebuild_complete(client, volume3_name)

    # Wait for deletion of extra replica
    volume3 = wait_for_volume_replica_count(client, volume3_name, 1)
    assert volume3.replicas[0]["hostId"] == node1.name
    assert volume3.replicas[0]["mode"] == "RW"
    assert volume3.replicas[0]["running"] is True

    delete_and_wait_pod(core_api, pod3_name)
    wait_for_volume_detached(client, volume3_name)

    # Case 4: Make sure failed to schedule local replica doesn't block the
    # creation of other replicas.

    _set_replica_soft_anti_affinity(client, "false")

    client.update(node3, allowScheduling=False)

    volume4_name = volume_name + "-4"
    volume4_size = str(1 * Gi)

    volume4 = client.create_volume(name=volume4_name,
                                   size=volume4_size,
                                   numberOfReplicas=1,
                                   dataLocality="best-effort")

    volume4 = wait_for_volume_detached(client, volume4_name)
    volume4 = client.by_id_volume(volume4_name)
    volume4_replica_name = volume4.replicas[0]["name"]

    volume4.attach(hostId=node3.name)

    wait_for_volume_healthy(client, volume4_name)

    volume4 = client.by_id_volume(volume4_name)
    assert len(volume4.replicas) == 2

    # The original replica keeps serving; the extra one is not running.
    for replica in volume4.replicas:
        if replica["name"] == volume4_replica_name:
            assert replica["running"] is True
            assert replica["mode"] == "RW"
        else:
            assert replica["running"] is False
            assert replica["mode"] == ""

    assert volume4.conditions.scheduled.reason == \
        "LocalReplicaSchedulingFailure"

    volume4 = volume4.updateReplicaCount(replicaCount=3)

    volume4 = wait_for_volume_degraded(client, volume4_name)

    assert _count_replicas_on_host(volume4, node1.name) == 1
    assert _count_replicas_on_host(volume4, node2.name) == 1
    assert _count_replicas_on_host(volume4, "") > 0

    volume4 = volume4.updateReplicaCount(replicaCount=2)

    volume4 = wait_for_volume_replica_count(client, volume4_name, 3)

    assert _count_replicas_on_host(volume4, node1.name) == 1
    assert _count_replicas_on_host(volume4, node2.name) == 1
    assert _count_replicas_on_host(volume4, "") > 0

    volume4 = volume4.updateReplicaCount(replicaCount=1)

    volume4 = wait_for_volume_replica_count(client, volume4_name, 2)

    v4_node1_replica_count = _count_replicas_on_host(volume4, node1.name)
    v4_node2_replica_count = _count_replicas_on_host(volume4, node2.name)
    assert v4_node1_replica_count + v4_node2_replica_count == 1
    assert _count_replicas_on_host(volume4, "") == 1

    volume4 = volume4.updateDataLocality(dataLocality="disabled")
    volume4 = volume4.updateReplicaCount(replicaCount=2)

    # Wait until exactly two replicas are running and not failed.
    running_replica_count = 0
    for _ in range(RETRY_COUNTS):
        volume4 = client.by_id_volume(volume4_name)
        running_replica_count = sum(
            1 for r in volume4.replicas
            if r.failedAt == "" and r.running is True)
        if running_replica_count == 2:
            break
        time.sleep(RETRY_INTERVAL)
    assert running_replica_count == 2

    for replica in volume4.replicas:
        wait_for_replica_running(client, volume4_name, replica["name"])

    assert _count_replicas_on_host(volume4, node1.name) == 1
    assert _count_replicas_on_host(volume4, node2.name) == 1
    assert _count_replicas_on_host(volume4, node3.name) == 0