def test_replica_zone_anti_affinity(client, core_api, volume_name, k8s_node_zone_tags): # NOQA """ Test replica scheduler with zone anti-affinity 1. Set zone anti-affinity to hard. 2. Label nodes 1 & 2 with same zone label "zone1". Label node 3 with zone label "zone2". 3. Create a volume with 3 replicas. 4. Wait for volume condition `scheduled` to be false. 5. Label node 2 with zone label "zone3". 6. Wait for volume condition `scheduled` to be success. 7. Clear the volume. 8. Set zone anti-affinity to soft. 9. Change the zone labels on node 1 & 2 & 3 to "zone1". 10. Create a volume. 11. Wait for volume condition `scheduled` to be success. 12. Clean up the replica count, the zone labels and the volume. """ wait_longhorn_node_zone_updated(client) replica_node_soft_anti_affinity_setting = \ client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY) client.update(replica_node_soft_anti_affinity_setting, value="false") replica_zone_soft_anti_affinity_setting = \ client.by_id_setting(SETTING_REPLICA_ZONE_SOFT_ANTI_AFFINITY) client.update(replica_zone_soft_anti_affinity_setting, value="false") volume = create_and_check_volume(client, volume_name) lh_nodes = client.list_node() count = 0 for node in lh_nodes: count += 1 set_k8s_node_zone_label(core_api, node.name, "lh-zone" + str(count)) wait_longhorn_node_zone_updated(client) wait_for_volume_condition_scheduled(client, volume_name, "status", CONDITION_STATUS_TRUE) replica_zone_soft_anti_affinity_setting = \ client.by_id_setting(SETTING_REPLICA_ZONE_SOFT_ANTI_AFFINITY) client.update(replica_zone_soft_anti_affinity_setting, value="true") volume = client.by_id_volume(volume_name) client.delete(volume) wait_for_volume_delete(client, volume_name) for node in lh_nodes: set_k8s_node_zone_label(core_api, node.name, "lh-zone1") wait_longhorn_node_zone_updated(client) volume = create_and_check_volume(client, volume_name) wait_for_volume_condition_scheduled(client, volume_name, "status", CONDITION_STATUS_TRUE)
def csi_backup_test(client, core_api, csi_pv, pvc, pod_make, base_image=""): # NOQA pod_name = 'csi-backup-test' create_and_wait_csi_pod(pod_name, client, core_api, csi_pv, pvc, pod_make, base_image, "") test_data = generate_random_data(VOLUME_RWTEST_SIZE) setting = client.by_id_setting(common.SETTING_BACKUP_TARGET) # test backupTarget for multiple settings backupstores = common.get_backupstore_url() i = 1 for backupstore in backupstores: if common.is_backupTarget_s3(backupstore): backupsettings = backupstore.split("$") setting = client.update(setting, value=backupsettings[0]) assert setting.value == backupsettings[0] credential = client.by_id_setting( common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET) credential = client.update(credential, value=backupsettings[1]) assert credential.value == backupsettings[1] else: setting = client.update(setting, value=backupstore) assert setting.value == backupstore credential = client.by_id_setting( common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET) credential = client.update(credential, value="") assert credential.value == "" backupstore_test(client, core_api, csi_pv, pvc, pod_make, pod_name, base_image, test_data, i) i += 1
def csi_backup_test(client, core_api, csi_pv, pvc, pod_make, base_image=""): # NOQA pod_name = 'csi-backup-test' create_and_wait_csi_pod(pod_name, client, core_api, csi_pv, pvc, pod_make, base_image, "") test_data = generate_random_data(VOLUME_RWTEST_SIZE) setting = client.by_id_setting(common.SETTING_BACKUP_TARGET) # test backupTarget for multiple settings backupstores = common.get_backupstore_url() i = 1 for backupstore in backupstores: if common.is_backupTarget_s3(backupstore): backupsettings = backupstore.split("$") setting = client.update(setting, value=backupsettings[0]) assert setting["value"] == backupsettings[0] credential = client.by_id_setting( common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET) credential = client.update(credential, value=backupsettings[1]) assert credential["value"] == backupsettings[1] else: setting = client.update(setting, value=backupstore) assert setting["value"] == backupstore credential = client.by_id_setting( common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET) credential = client.update(credential, value="") assert credential["value"] == "" backupstore_test(client, core_api, csi_pv, pvc, pod_make, pod_name, base_image, test_data, i) i += 1
def test_soft_anti_affinity_detach(client, volume_name): # NOQA """ Test that volumes with Soft Anti-Affinity can detach and reattach to a node properly. """ volume = create_and_check_volume(client, volume_name) host_id = get_self_host_id() volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume["replicas"]) == 3 data = write_volume_random_data(volume) setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY) client.update(setting, value="true") node = client.by_id_node(host_id) client.update(node, allowScheduling=False) replica_names = map(lambda replica: replica.name, volume["replicas"]) host_replica = get_host_replica(volume, host_id) volume.replicaRemove(name=host_replica["name"]) wait_new_replica_ready(client, volume_name, replica_names) volume = wait_for_volume_healthy(client, volume_name) volume.detach() volume = wait_for_volume_detached(client, volume_name) assert len(volume["replicas"]) == 3 volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume["replicas"]) == 3 check_volume_data(volume, data) cleanup_volume(client, volume)
def test_soft_anti_affinity_scheduling(client, volume_name): # NOQA """ Test that volumes with Soft Anti-Affinity work as expected. With Soft Anti-Affinity, a new replica should still be scheduled on a node with an existing replica, which will result in "Healthy" state but limited redundancy. """ volume = create_and_check_volume(client, volume_name) host_id = get_self_host_id() volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume["replicas"]) == 3 data = write_volume_random_data(volume) setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY) client.update(setting, value="true") node = client.by_id_node(host_id) client.update(node, allowScheduling=False) replica_names = map(lambda replica: replica.name, volume["replicas"]) host_replica = get_host_replica(volume, host_id) volume.replicaRemove(name=host_replica["name"]) wait_new_replica_ready(client, volume_name, replica_names) volume = wait_for_volume_healthy(client, volume_name) assert len(volume["replicas"]) == 3 check_volume_data(volume, data) cleanup_volume(client, volume)
def test_hard_anti_affinity_offline_rebuild(client, volume_name): # NOQA """ Test that volumes with Hard Anti-Affinity can build new replicas during the attaching process once a valid node is available. Once a new replica has been built as part of the attaching process, the volume should be Healthy again. """ volume = create_and_check_volume(client, volume_name) host_id = get_self_host_id() volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume["replicas"]) == 3 data = write_volume_random_data(volume) setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY) client.update(setting, value="false") node = client.by_id_node(host_id) client.update(node, allowScheduling=False) replica_names = map(lambda replica: replica.name, volume["replicas"]) host_replica = get_host_replica(volume, host_id) volume.replicaRemove(name=host_replica["name"]) volume = wait_for_volume_degraded(client, volume_name) wait_scheduling_failure(client, volume_name) volume.detach() volume = wait_for_volume_detached(client, volume_name) client.update(node, allowScheduling=True) volume.attach(hostId=host_id) wait_new_replica_ready(client, volume_name, replica_names) volume = wait_for_volume_healthy(client, volume_name) assert len(volume["replicas"]) == 3 check_volume_data(volume, data) cleanup_volume(client, volume)
def test_hard_anti_affinity_live_rebuild(client, volume_name): # NOQA """ Test that volumes with Hard Anti-Affinity can build new replicas live once a valid node is available. If no nodes without existing replicas are available, the volume should remain in "Degraded" state. However, once one is available, the replica should now be scheduled successfully, with the volume returning to "Healthy" state. """ volume = create_and_check_volume(client, volume_name) host_id = get_self_host_id() volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume["replicas"]) == 3 data = write_volume_random_data(volume) setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY) client.update(setting, value="false") node = client.by_id_node(host_id) client.update(node, allowScheduling=False) replica_names = map(lambda replica: replica.name, volume["replicas"]) host_replica = get_host_replica(volume, host_id) volume.replicaRemove(name=host_replica["name"]) wait_for_volume_degraded(client, volume_name) wait_scheduling_failure(client, volume_name) # Allow scheduling on host node again client.update(node, allowScheduling=True) wait_new_replica_ready(client, volume_name, replica_names) volume = wait_for_volume_healthy(client, volume_name) assert len(volume["replicas"]) == 3 check_volume_data(volume, data) cleanup_volume(client, volume)
def test_replica_scheduler_update_over_provisioning(client): # NOQA nodes = client.list_node() lht_hostId = get_self_host_id() expect_node_disk = {} for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): if disk["path"] == DEFAULT_DISK_PATH: expect_disk = disk expect_disk["fsid"] = fsid expect_node_disk[node["name"]] = expect_disk over_provisioning_setting = client.by_id_setting( SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE) old_provisioning_setting = over_provisioning_setting["value"] # set storage over provisioning percentage to 0 # to test all replica couldn't be scheduled over_provisioning_setting = client.update(over_provisioning_setting, value="0") vol_name = common.generate_volume_name() volume = client.create_volume(name=vol_name, size=SIZE, numberOfReplicas=len(nodes)) volume = common.wait_for_volume_condition_scheduled(client, vol_name, "status", CONDITION_STATUS_FALSE) # set storage over provisioning percentage to 100 over_provisioning_setting = client.update(over_provisioning_setting, value="100") # check volume status volume = common.wait_for_volume_condition_scheduled(client, vol_name, "status", CONDITION_STATUS_TRUE) volume = common.wait_for_volume_detached(client, vol_name) assert volume["state"] == "detached" assert volume["created"] != "" volume.attach(hostId=lht_hostId) volume = common.wait_for_volume_healthy(client, vol_name) node_hosts = [] for node in nodes: node_hosts.append(node["name"]) # check all replica should be scheduled to default disk for replica in volume["replicas"]: id = replica["hostId"] assert id != "" assert replica["running"] expect_disk = expect_node_disk[id] assert replica["diskID"] == expect_disk["fsid"] assert expect_disk["path"] in replica["dataPath"] node_hosts = filter(lambda x: x != id, node_hosts) assert len(node_hosts) == 0 # clean volume and disk cleanup_volume(client, vol_name) client.update(over_provisioning_setting, value=old_provisioning_setting)
def test_replica_scheduler_update_over_provisioning(client): # NOQA nodes = client.list_node() lht_hostId = get_self_host_id() expect_node_disk = {} for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): if disk["path"] == DEFAULT_DISK_PATH: expect_disk = disk expect_disk["fsid"] = fsid expect_node_disk[node["name"]] = expect_disk over_provisioning_setting = client.by_id_setting( SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE) old_provisioning_setting = over_provisioning_setting["value"] # set storage over provisioning percentage to 0 # to test all replica couldn't be scheduled over_provisioning_setting = client.update(over_provisioning_setting, value="0") vol_name = common.generate_volume_name() volume = client.create_volume(name=vol_name, size=SIZE, numberOfReplicas=len(nodes)) volume = common.wait_for_volume_condition_scheduled(client, vol_name, "status", CONDITION_STATUS_FALSE) # set storage over provisioning percentage to 100 over_provisioning_setting = client.update(over_provisioning_setting, value="100") # check volume status volume = common.wait_for_volume_condition_scheduled(client, vol_name, "status", CONDITION_STATUS_TRUE) volume = common.wait_for_volume_detached(client, vol_name) assert volume["state"] == "detached" assert volume["created"] != "" volume.attach(hostId=lht_hostId) volume = common.wait_for_volume_healthy(client, vol_name) node_hosts = [] for node in nodes: node_hosts.append(node["name"]) # check all replica should be scheduled to default disk for replica in volume["replicas"]: id = replica["hostId"] assert id != "" assert replica["running"] expect_disk = expect_node_disk[id] assert replica["diskID"] == expect_disk["fsid"] assert expect_disk["path"] in replica["dataPath"] node_hosts = filter(lambda x: x != id, node_hosts) assert len(node_hosts) == 0 # clean volume and disk cleanup_volume(client, vol_name) client.update(over_provisioning_setting, value=old_provisioning_setting)
def test_replica_scheduler_exceed_over_provisioning(client): # NOQA over_provisioning_setting = client.by_id_setting( SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE) old_provisioning_setting = over_provisioning_setting["value"] # set storage over provisioning percentage to 100 over_provisioning_setting = client.update(over_provisioning_setting, value="100") # test exceed over provisioning limit couldn't be scheduled nodes = client.list_node() for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): disk["storageReserved"] = \ disk["storageMaximum"] - 1*Gi update_disks = get_update_disks(disks) node = node.diskUpdate(disks=update_disks) disks = node["disks"] for fsid, disk in disks.iteritems(): wait_for_disk_status(client, node["name"], fsid, "storageReserved", disk["storageMaximum"] - 1 * Gi) vol_name = common.generate_volume_name() volume = client.create_volume(name=vol_name, size=str(2 * Gi), numberOfReplicas=len(nodes)) volume = common.wait_for_volume_condition_scheduled( client, vol_name, "status", CONDITION_STATUS_FALSE) client.delete(volume) common.wait_for_volume_delete(client, vol_name) client.update(over_provisioning_setting, value=old_provisioning_setting)
def test_replica_scheduler_just_under_over_provisioning(client): # NOQA over_provisioning_setting = client.by_id_setting( SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE) old_provisioning_setting = over_provisioning_setting["value"] # set storage over provisioning percentage to 100 over_provisioning_setting = client.update(over_provisioning_setting, value="100") lht_hostId = get_self_host_id() nodes = client.list_node() expect_node_disk = {} max_size_array = [] for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): if disk["path"] == DEFAULT_DISK_PATH: expect_disk = disk expect_disk["fsid"] = fsid expect_node_disk[node["name"]] = expect_disk max_size_array.append(disk["storageMaximum"]) disk["storageReserved"] = 0 update_disks = get_update_disks(disks) node = node.diskUpdate(disks=update_disks) disks = node["disks"] for fsid, disk in disks.iteritems(): wait_for_disk_status(client, node["name"], fsid, "storageReserved", 0) max_size = min(max_size_array) # test just under over provisioning limit could be scheduled vol_name = common.generate_volume_name() volume = client.create_volume(name=vol_name, size=str(max_size), numberOfReplicas=len(nodes)) volume = common.wait_for_volume_condition_scheduled( client, vol_name, "status", CONDITION_STATUS_TRUE) volume = common.wait_for_volume_detached(client, vol_name) assert volume["state"] == "detached" assert volume["created"] != "" volume.attach(hostId=lht_hostId) volume = common.wait_for_volume_healthy(client, vol_name) nodes = client.list_node() node_hosts = [] for node in nodes: node_hosts.append(node["name"]) # check all replica should be scheduled to default disk for replica in volume["replicas"]: id = replica["hostId"] assert id != "" assert replica["running"] expect_disk = expect_node_disk[id] assert replica["diskID"] == expect_disk["fsid"] assert expect_disk["path"] in replica["dataPath"] node_hosts = filter(lambda x: x != id, node_hosts) assert len(node_hosts) == 0 # clean volume and disk cleanup_volume(client, vol_name) client.update(over_provisioning_setting, value=old_provisioning_setting)
def test_replica_scheduler_exceed_over_provisioning(client): # NOQA over_provisioning_setting = client.by_id_setting( SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE) old_provisioning_setting = over_provisioning_setting["value"] # set storage over provisioning percentage to 100 over_provisioning_setting = client.update(over_provisioning_setting, value="100") # test exceed over provisioning limit couldn't be scheduled nodes = client.list_node() for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): disk["storageReserved"] = \ disk["storageMaximum"] - 1*Gi update_disks = get_update_disks(disks) node = node.diskUpdate(disks=update_disks) disks = node["disks"] for fsid, disk in disks.iteritems(): wait_for_disk_status(client, node["name"], fsid, "storageReserved", disk["storageMaximum"] - 1*Gi) vol_name = common.generate_volume_name() volume = client.create_volume(name=vol_name, size=str(2*Gi), numberOfReplicas=len(nodes)) volume = common.wait_for_volume_condition_scheduled(client, vol_name, "status", CONDITION_STATUS_FALSE) client.delete(volume) common.wait_for_volume_delete(client, vol_name) client.update(over_provisioning_setting, value=old_provisioning_setting)
def reset_settings(): yield client = get_longhorn_api_client() # NOQA host_id = get_self_host_id() node = client.by_id_node(host_id) client.update(node, allowScheduling=True) setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY) client.update(setting, value="true")
def test_hard_anti_affinity_detach(client, volume_name): # NOQA """ Test that volumes with Hard Anti-Affinity are still able to detach and reattach to a node properly, even in degraded state. 1. Create a volume and attach to the current node 2. Generate and write `data` to the volume. 3. Set `soft anti-affinity` to false 4. Disable current node's scheduling. 5. Remove the replica on the current node 1. Verify volume will be in degraded state. 2. Verify volume reports condition `scheduled == false` 6. Detach the volume. 7. Verify that volume only have 2 replicas 1. Unhealthy replica will be removed upon detach. 8. Attach the volume again. 1. Verify volume will be in degraded state. 2. Verify volume reports condition `scheduled == false` 3. Verify only two of three replicas of volume are healthy. 4. Verify the remaining replica doesn't have `replica.HostID`, meaning it's unscheduled 9. Check volume `data` """ volume = create_and_check_volume(client, volume_name) host_id = get_self_host_id() volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume.replicas) == 3 data = write_volume_random_data(volume) setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY) client.update(setting, value="false") node = client.by_id_node(host_id) client.update(node, allowScheduling=False) host_replica = get_host_replica(volume, host_id) volume.replicaRemove(name=host_replica.name) volume = wait_for_volume_degraded(client, volume_name) wait_scheduling_failure(client, volume_name) volume.detach() volume = wait_for_volume_detached(client, volume_name) assert len(volume.replicas) == 2 volume.attach(hostId=host_id) # Make sure we're still not getting another successful replica. volume = wait_for_volume_degraded(client, volume_name) wait_scheduling_failure(client, volume_name) assert sum([ 1 for replica in volume.replicas if replica.running and replica.mode == "RW" ]) == 2 assert sum([1 for replica in volume.replicas if not replica.hostId]) == 1 assert len(volume.replicas) == 3 check_volume_data(volume, data) cleanup_volume(client, volume)
def test_hard_anti_affinity_scheduling(client, volume_name): # NOQA """ Test that volumes with Hard Anti-Affinity work as expected. With Hard Anti-Affinity, scheduling on nodes with existing replicas should be forbidden, resulting in "Degraded" state. 1. Create a volume and attach to the current node 2. Generate and write `data` to the volume. 3. Set `soft anti-affinity` to false 4. Disable current node's scheduling. 5. Remove the replica on the current node 1. Verify volume will be in degraded state. 2. Verify volume reports condition `scheduled == false` 3. Verify only two of three replicas of volume are healthy. 4. Verify the remaining replica doesn't have `replica.HostID`, meaning it's unscheduled 6. Check volume `data` """ volume = create_and_check_volume(client, volume_name) host_id = get_self_host_id() volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume.replicas) == 3 data = write_volume_random_data(volume) setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY) client.update(setting, value="false") node = client.by_id_node(host_id) client.update(node, allowScheduling=False) host_replica = get_host_replica(volume, host_id) volume.replicaRemove(name=host_replica.name) # Instead of waiting for timeout and lengthening the tests a significant # amount we can make sure the scheduling isn't working by making sure the # volume becomes Degraded and reports a scheduling error. wait_for_volume_degraded(client, volume_name) wait_scheduling_failure(client, volume_name) # While there are three replicas that should exist to meet the Volume's # request, only two of those volumes should actually be Healthy. volume = client.by_id_volume(volume_name) assert sum([ 1 for replica in volume.replicas if replica.running and replica.mode == "RW" ]) == 2 # Confirm that the final volume is an unscheduled volume. assert sum([1 for replica in volume.replicas if not replica.hostId]) == 1 # Three replicas in total should still exist. assert len(volume.replicas) == 3 check_volume_data(volume, data) cleanup_volume(client, volume)
def test_replica_rebuild_per_volume_limit(client, core_api, storage_class, sts_name, statefulset): # NOQA """ Test the volume always only have one replica scheduled for rebuild 1. Set soft anti-affinity to `true`. 2. Create a volume with 1 replica. 3. Attach the volume and write a few hundreds MB data to it. 4. Scale the volume replica to 5. 5. Constantly checking the volume replica list to make sure there should be only 1 replica in WO state. 6. Wait for the volume to complete rebuilding. Then remove 4 of the 5 replicas. 7. Monitoring the volume replica list again. 8. Once the rebuild was completed again, verify the data checksum. """ replica_soft_anti_affinity_setting = \ client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY) client.update(replica_soft_anti_affinity_setting, value="true") data_path = '/data/test' storage_class['parameters']['numberOfReplicas'] = "1" vol_name, pod_name, md5sum = \ common.prepare_statefulset_with_data_in_mb( client, core_api, statefulset, sts_name, storage_class, data_path=data_path, data_size_in_mb=DATA_SIZE_IN_MB_2) # Scale the volume replica to 5 r_count = 5 vol = client.by_id_volume(vol_name) vol.updateReplicaCount(replicaCount=r_count) vol = common.wait_for_volume_replicas_mode(client, vol_name, 'RW', replica_count=r_count) # Delete 4 volume replicas del vol.replicas[0] for r in vol.replicas: vol.replicaRemove(name=r.name) r_count = 1 common.wait_for_volume_replicas_mode(client, vol_name, 'RW', replica_count=r_count) assert md5sum == common.get_pod_data_md5sum(core_api, pod_name, data_path)
def test_provisioner_tags(client, core_api, node_default_tags, storage_class, pvc, pod): # NOQA """ Test that a StorageClass can properly provision a volume with requested Tags. Test prerequisite: - set Replica Node Level Soft Anti-Affinity enabled 1. Use `node_default_tags` to add default tags to nodes. 2. Create a StorageClass with disk and node tag set. 3. Create PVC and Pod. 4. Verify the volume has the correct parameters and tags. """ replica_node_soft_anti_affinity_setting = \ client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY) client.update(replica_node_soft_anti_affinity_setting, value="true") # Prepare pod and volume specs. pod_name = 'provisioner-tags-test' tag_spec = { "disk": ["ssd", "nvme"], "expected": 1, "node": ["storage", "main"] } pod['metadata']['name'] = pod_name pod['spec']['volumes'] = [create_pvc_spec(pvc['metadata']['name'])] pvc['spec']['storageClassName'] = DEFAULT_STORAGECLASS_NAME storage_class['metadata']['name'] = DEFAULT_STORAGECLASS_NAME storage_class['parameters']['diskSelector'] = 'ssd,nvme' storage_class['parameters']['nodeSelector'] = 'storage,main' volume_size = DEFAULT_VOLUME_SIZE * Gi create_storage(core_api, storage_class, pvc) create_and_wait_pod(core_api, pod) pvc_volume_name = get_volume_name(core_api, pvc['metadata']['name']) # Confirm that the volume has all the correct parameters we gave it. volumes = client.list_volume() assert len(volumes) == 1 assert volumes.data[0].name == pvc_volume_name assert volumes.data[0].size == str(volume_size) assert volumes.data[0].numberOfReplicas == \ int(storage_class['parameters']['numberOfReplicas']) assert volumes.data[0].state == "attached" check_volume_replicas(volumes.data[0], tag_spec, node_default_tags)
def test_hard_anti_affinity_offline_rebuild(client, volume_name): # NOQA """ Test that volumes with Hard Anti-Affinity can build new replicas during the attaching process once a valid node is available. Once a new replica has been built as part of the attaching process, the volume should be Healthy again. 1. Create a volume and attach to the current node 2. Generate and write `data` to the volume. 3. Set `soft anti-affinity` to false 4. Disable current node's scheduling. 5. Remove the replica on the current node 1. Verify volume will be in degraded state. 2. Verify volume reports condition `scheduled == false` 6. Detach the volume. 7. Enable current node's scheduling. 8. Attach the volume again. 9. Wait for volume to become healthy with 3 replicas 10. Check volume `data` """ volume = create_and_check_volume(client, volume_name) host_id = get_self_host_id() volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume.replicas) == 3 data = write_volume_random_data(volume) setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY) client.update(setting, value="false") node = client.by_id_node(host_id) client.update(node, allowScheduling=False) replica_names = map(lambda replica: replica.name, volume.replicas) host_replica = get_host_replica(volume, host_id) volume.replicaRemove(name=host_replica.name) volume = wait_for_volume_degraded(client, volume_name) wait_scheduling_failure(client, volume_name) volume.detach() volume = wait_for_volume_detached(client, volume_name) client.update(node, allowScheduling=True) volume.attach(hostId=host_id) wait_new_replica_ready(client, volume_name, replica_names) volume = wait_for_volume_healthy(client, volume_name) assert len(volume.replicas) == 3 check_volume_data(volume, data) cleanup_volume(client, volume)
def test_hard_anti_affinity_live_rebuild(client, volume_name): # NOQA """ Test that volumes with Hard Anti-Affinity can build new replicas live once a valid node is available. If no nodes without existing replicas are available, the volume should remain in "Degraded" state. However, once one is available, the replica should now be scheduled successfully, with the volume returning to "Healthy" state. 1. Create a volume and attach to the current node 2. Generate and write `data` to the volume. 3. Set `soft anti-affinity` to false 4. Disable current node's scheduling. 5. Remove the replica on the current node 1. Verify volume will be in degraded state. 2. Verify volume reports condition `scheduled == false` 6. Enable the current node's scheduling 7. Wait for volume to start rebuilding and become healthy again 8. Check volume `data` """ volume = create_and_check_volume(client, volume_name) host_id = get_self_host_id() volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume.replicas) == 3 data = write_volume_random_data(volume) setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY) client.update(setting, value="false") node = client.by_id_node(host_id) client.update(node, allowScheduling=False) replica_names = map(lambda replica: replica.name, volume.replicas) host_replica = get_host_replica(volume, host_id) volume.replicaRemove(name=host_replica.name) wait_for_volume_degraded(client, volume_name) wait_scheduling_failure(client, volume_name) # Allow scheduling on host node again client.update(node, allowScheduling=True) wait_new_replica_ready(client, volume_name, replica_names) volume = wait_for_volume_healthy(client, volume_name) assert len(volume.replicas) == 3 check_volume_data(volume, data) cleanup_volume(client, volume)
def test_soft_anti_affinity_detach(client, volume_name): # NOQA """ Test that volumes with Soft Anti-Affinity can detach and reattach to a node properly. 1. Create a volume and attach to the current node. 2. Generate and write `data` to the volume 3. Set `soft anti-affinity` to true 4. Disable current node's scheduling. 5. Remove the replica on the current node 6. Wait for the new replica to be rebuilt 7. Detach the volume. 8. Verify there are 3 replicas 9. Attach the volume again. Verify there are still 3 replicas 10. Verify the `data`. """ volume = create_and_check_volume(client, volume_name) host_id = get_self_host_id() volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume.replicas) == 3 data = write_volume_random_data(volume) setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY) client.update(setting, value="true") node = client.by_id_node(host_id) client.update(node, allowScheduling=False) replica_names = list(map(lambda replica: replica.name, volume.replicas)) host_replica = get_host_replica(volume, host_id) volume.replicaRemove(name=host_replica.name) wait_new_replica_ready(client, volume_name, replica_names) volume = wait_for_volume_healthy(client, volume_name) volume.detach() volume = wait_for_volume_detached(client, volume_name) assert len(volume.replicas) == 3 volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume.replicas) == 3 check_volume_data(volume, data) cleanup_volume(client, volume)
def test_orphan_auto_deletion(client, volume_name, request): # NOQA """ Test orphaned dirs creation and background deletion in multiple disks 1. Create a new disks for holding orphaned replica directories 2. Create a volume and attach to the current node 3. Create orphaned replica directories by copying the active replica directory 4. Clean up volume 5. Verify orphan list contains the orphan CRs for replica directories 6. Enable the orphan-auto-deletion setting 7. Verify orphan list is empty and the orphaned directory is deleted in background 8. Clean up disk """ disk_names = ["vol-disk-" + generate_random_id(4)] # Step 1 lht_hostId = get_self_host_id() cleanup_node_disks(client, lht_hostId) disk_paths = crate_disks_on_host(client, disk_names, request) # Step 2 volume = create_volume_with_replica_on_host(client, volume_name) # Step 3 create_orphaned_directories_on_host(volume, disk_paths, 1) # Step 4 cleanup_volume_by_name(client, volume_name) # Step 5 assert wait_for_orphan_count(client, 1, 180) == 1 # Step 6: enable orphan auto deletion setting = client.by_id_setting(SETTING_ORPHAN_AUTO_DELETION) client.update(setting, value="true") # Step 7 assert wait_for_orphan_count(client, 0, 180) == 0 assert wait_for_file_count(os.path.join(disk_paths[0], "replicas"), 0, 180) == 0
def test_node_controller_sync_disk_state(client): # NOQA # update StorageMinimalAvailablePercentage to test Disk State setting = client.by_id_setting( SETTING_STORAGE_MINIMAL_AVAILABLE_PERCENTAGE) old_minimal_available_percentage = setting["value"] setting = client.update(setting, value="100") assert setting["value"] == "100" nodes = client.list_node() # wait for node controller to update disk state for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): wait_for_disk_conditions(client, node["name"], fsid, DISK_CONDITION_SCHEDULABLE, CONDITION_STATUS_FALSE) nodes = client.list_node() for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): conditions = disk["conditions"] assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \ CONDITION_STATUS_FALSE setting = client.update(setting, value=old_minimal_available_percentage) assert setting["value"] == old_minimal_available_percentage # wait for node controller to update disk state nodes = client.list_node() for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): wait_for_disk_conditions(client, node["name"], fsid, DISK_CONDITION_SCHEDULABLE, CONDITION_STATUS_TRUE) nodes = client.list_node() for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): conditions = disk["conditions"] assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \ CONDITION_STATUS_TRUE
def test_node_controller_sync_disk_state(client): # NOQA # update StorageMinimalAvailablePercentage to test Disk State setting = client.by_id_setting( SETTING_STORAGE_MINIMAL_AVAILABLE_PERCENTAGE) old_minimal_available_percentage = setting["value"] setting = client.update(setting, value="100") assert setting["value"] == "100" nodes = client.list_node() # wait for node controller to update disk state for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): wait_for_disk_conditions(client, node["name"], fsid, DISK_CONDITION_SCHEDULABLE, CONDITION_STATUS_FALSE) nodes = client.list_node() for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): conditions = disk["conditions"] assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \ CONDITION_STATUS_FALSE setting = client.update(setting, value=old_minimal_available_percentage) assert setting["value"] == old_minimal_available_percentage # wait for node controller to update disk state nodes = client.list_node() for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): wait_for_disk_conditions(client, node["name"], fsid, DISK_CONDITION_SCHEDULABLE, CONDITION_STATUS_TRUE) nodes = client.list_node() for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): conditions = disk["conditions"] assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \ CONDITION_STATUS_TRUE
def test_hard_anti_affinity_scheduling(client, volume_name): # NOQA """ Test that volumes with Hard Anti-Affinity work as expected. With Hard Anti-Affinity, scheduling on nodes with existing replicas should be forbidden, resulting in "Degraded" state. """ volume = create_and_check_volume(client, volume_name) host_id = get_self_host_id() volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume["replicas"]) == 3 data = write_volume_random_data(volume) setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY) client.update(setting, value="false") node = client.by_id_node(host_id) client.update(node, allowScheduling=False) host_replica = get_host_replica(volume, host_id) volume.replicaRemove(name=host_replica["name"]) # Instead of waiting for timeout and lengthening the tests a significant # amount we can make sure the scheduling isn't working by making sure the # volume becomes Degraded and reports a scheduling error. volume = wait_for_volume_degraded(client, volume_name) wait_scheduling_failure(client, volume_name) # While there are three replicas that should exist to meet the Volume's # request, only two of those volumes should actually be Healthy. assert sum([ 1 for replica in volume["replicas"] if replica["running"] and replica["mode"] == "RW" ]) == 2 # Confirm that the final volume is an unscheduled volume. assert sum([1 for replica in volume["replicas"] if not replica["hostId"]]) == 1 # Three replicas in total should still exist. assert len(volume["replicas"]) == 3 check_volume_data(volume, data) cleanup_volume(client, volume)
def test_hard_anti_affinity_detach(client, volume_name): # NOQA """ Test that volumes with Hard Anti-Affinity are still able to detach and reattach to a node properly, even in degraded state. """ volume = create_and_check_volume(client, volume_name) host_id = get_self_host_id() volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume["replicas"]) == 3 data = write_volume_random_data(volume) setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY) client.update(setting, value="false") node = client.by_id_node(host_id) client.update(node, allowScheduling=False) host_replica = get_host_replica(volume, host_id) volume.replicaRemove(name=host_replica["name"]) volume = wait_for_volume_degraded(client, volume_name) wait_scheduling_failure(client, volume_name) volume.detach() volume = wait_for_volume_detached(client, volume_name) assert len(volume["replicas"]) == 2 volume.attach(hostId=host_id) # Make sure we're still not getting another successful replica. volume = wait_for_volume_degraded(client, volume_name) wait_scheduling_failure(client, volume_name) assert sum([ 1 for replica in volume["replicas"] if replica["running"] and replica["mode"] == "RW" ]) == 2 assert sum([1 for replica in volume["replicas"] if not replica["hostId"]]) == 1 assert len(volume["replicas"]) == 3 check_volume_data(volume, data) cleanup_volume(client, volume)
def test_soft_anti_affinity_scheduling(client, volume_name): # NOQA """ Test that volumes with Soft Anti-Affinity work as expected. With Soft Anti-Affinity, a new replica should still be scheduled on a node with an existing replica, which will result in "Healthy" state but limited redundancy. 1. Create a volume and attach to the current node 2. Generate and write `data` to the volume. 3. Set `soft anti-affinity` to true 4. Disable current node's scheduling. 5. Remove the replica on the current node 6. Wait for the volume to complete rebuild. Volume should have 3 replicas. 7. Verify `data` """ volume = create_and_check_volume(client, volume_name) host_id = get_self_host_id() volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) assert len(volume.replicas) == 3 data = write_volume_random_data(volume) setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY) client.update(setting, value="true") node = client.by_id_node(host_id) client.update(node, allowScheduling=False) replica_names = list(map(lambda replica: replica.name, volume.replicas)) host_replica = get_host_replica(volume, host_id) volume.replicaRemove(name=host_replica.name) wait_new_replica_ready(client, volume_name, replica_names) volume = wait_for_volume_healthy(client, volume_name) assert len(volume.replicas) == 3 check_volume_data(volume, data) cleanup_volume(client, volume)
def test_backup_kubernetes_status(set_random_backupstore, client, core_api, pod): # NOQA """ Test that Backups have KubernetesStatus stored properly when there is an associated PersistentVolumeClaim and Pod. 1. Setup a random backupstore 2. Set settings Longhorn Static StorageClass to `longhorn-static-test` 3. Create a volume and PV/PVC. Verify the StorageClass of PVC 4. Create a Pod using the PVC. 5. Check volume's Kubernetes status to reflect PV/PVC/Pod correctly. 6. Create a backup for the volume. 7. Verify the labels of created backup reflect PV/PVC/Pod status. 8. Restore the backup to a volume. Wait for restoration to complete. 9. Check the volume's Kubernetes Status 1. Make sure the `lastPodRefAt` and `lastPVCRefAt` is snapshot created time 10. Delete the backup and restored volume. 11. Delete PV/PVC/Pod. 12. Verify volume's Kubernetes Status updated to reflect history data. 13. Attach the volume and create another backup. Verify the labels 14. Verify the volume's Kubernetes status. 15. Restore the previous backup to a new volume. Wait for restoration. 16. Verify the restored volume's Kubernetes status. 1. Make sure `lastPodRefAt` and `lastPVCRefAt` matched volume on step 12 """ host_id = get_self_host_id() static_sc_name = "longhorn-static-test" setting = client.by_id_setting(SETTING_DEFAULT_LONGHORN_STATIC_SC) setting = client.update(setting, value=static_sc_name) assert setting.value == static_sc_name volume_name = "test-backup-kubernetes-status-pod" # NOQA client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=2) volume = wait_for_volume_detached(client, volume_name) pod_name = "pod-" + volume_name pv_name = "pv-" + volume_name pvc_name = "pvc-" + volume_name create_pv_for_volume(client, core_api, volume, pv_name) create_pvc_for_volume(client, core_api, volume, pvc_name) ret = core_api.list_namespaced_persistent_volume_claim(namespace='default') pvc_found = False for item in ret.items: if item.metadata.name == pvc_name: pvc_found = item break assert pvc_found assert pvc_found.spec.storage_class_name == static_sc_name pod['metadata']['name'] = pod_name pod['spec']['volumes'] = [{ 'name': pod['spec']['containers'][0]['volumeMounts'][0]['name'], 'persistentVolumeClaim': { 'claimName': pvc_name, }, }] create_and_wait_pod(core_api, pod) ks = { 'lastPodRefAt': '', 'lastPVCRefAt': '', 'namespace': 'default', 'pvcName': pvc_name, 'pvName': pv_name, 'pvStatus': 'Bound', 'workloadsStatus': [{ 'podName': pod_name, 'podStatus': 'Running', 'workloadName': '', 'workloadType': '' }] } wait_volume_kubernetes_status(client, volume_name, ks) volume = wait_for_volume_healthy(client, volume_name) # Create Backup manually instead of calling create_backup since Kubernetes # is not guaranteed to mount our Volume to the test host. snap = create_snapshot(client, volume_name) volume.snapshotBackup(name=snap.name) wait_for_backup_completion(client, volume_name, snap.name) _, b = find_backup(client, volume_name, snap.name) # Check backup label status = loads(b.labels.get(KUBERNETES_STATUS_LABEL)) assert status == ks # Check backup volume label for _ in range(RETRY_COUNTS): bv = client.by_id_backupVolume(volume_name) if bv is not None and bv.labels is not None: break time.sleep(RETRY_INTERVAL) assert bv is not None and bv.labels is not None status = loads(bv.labels.get(KUBERNETES_STATUS_LABEL)) assert status == ks restore_name = generate_volume_name() client.create_volume(name=restore_name, size=SIZE, numberOfReplicas=2, fromBackup=b.url) wait_for_volume_restoration_completed(client, restore_name) wait_for_volume_detached(client, restore_name) snapshot_created = b.snapshotCreated ks = { 'lastPodRefAt': b.snapshotCreated, 'lastPVCRefAt': b.snapshotCreated, 'namespace': 'default', 'pvcName': pvc_name, # Restoration should not apply PersistentVolume data. 'pvName': '', 'pvStatus': '', 'workloadsStatus': [{ 'podName': pod_name, 'podStatus': 'Running', 'workloadName': '', 'workloadType': '' }] } wait_volume_kubernetes_status(client, restore_name, ks) restore = client.by_id_volume(restore_name) # We need to compare LastPodRefAt and LastPVCRefAt manually since # wait_volume_kubernetes_status only checks for empty or non-empty state. assert restore.kubernetesStatus.lastPodRefAt == ks["lastPodRefAt"] assert restore.kubernetesStatus.lastPVCRefAt == ks["lastPVCRefAt"] delete_backup(client, bv.name, b.name) client.delete(restore) wait_for_volume_delete(client, restore_name) delete_and_wait_pod(core_api, pod_name) delete_and_wait_pvc(core_api, pvc_name) delete_and_wait_pv(core_api, pv_name) # With the Pod, PVC, and PV deleted, the Volume should have both Ref # fields set. Check that a new Backup and Restore will use this instead of # manually populating the Ref fields. ks = { 'lastPodRefAt': 'NOT NULL', 'lastPVCRefAt': 'NOT NULL', 'namespace': 'default', 'pvcName': pvc_name, 'pvName': '', 'pvStatus': '', 'workloadsStatus': [{ 'podName': pod_name, 'podStatus': 'Running', 'workloadName': '', 'workloadType': '' }] } wait_volume_kubernetes_status(client, volume_name, ks) volume = wait_for_volume_detached(client, volume_name) volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) snap = create_snapshot(client, volume_name) volume.snapshotBackup(name=snap.name) volume = wait_for_backup_completion(client, volume_name, snap.name) bv, b = find_backup(client, volume_name, snap.name) new_b = bv.backupGet(name=b.name) status = loads(new_b.labels.get(KUBERNETES_STATUS_LABEL)) # Check each field manually, we have no idea what the LastPodRefAt or the # LastPVCRefAt will be. We just know it shouldn't be SnapshotCreated. assert status['lastPodRefAt'] != snapshot_created assert status['lastPVCRefAt'] != snapshot_created assert status['namespace'] == "default" assert status['pvcName'] == pvc_name assert status['pvName'] == "" assert status['pvStatus'] == "" assert status['workloadsStatus'] == [{ 'podName': pod_name, 'podStatus': 'Running', 'workloadName': '', 'workloadType': '' }] restore_name = generate_volume_name() client.create_volume(name=restore_name, size=SIZE, numberOfReplicas=2, fromBackup=b.url) wait_for_volume_restoration_completed(client, restore_name) wait_for_volume_detached(client, restore_name) ks = { 'lastPodRefAt': status['lastPodRefAt'], 'lastPVCRefAt': status['lastPVCRefAt'], 'namespace': 'default', 'pvcName': pvc_name, 'pvName': '', 'pvStatus': '', 'workloadsStatus': [{ 'podName': pod_name, 'podStatus': 'Running', 'workloadName': '', 'workloadType': '' }] } wait_volume_kubernetes_status(client, restore_name, ks) restore = client.by_id_volume(restore_name) assert restore.kubernetesStatus.lastPodRefAt == ks["lastPodRefAt"] assert restore.kubernetesStatus.lastPVCRefAt == ks["lastPVCRefAt"] # cleanup backupstore_cleanup(client) client.delete(restore) cleanup_volume(client, volume)
def test_pvc_creation_with_default_sc_set(client, core_api, storage_class, pod): # NOQA """ Test creating PVC with default StorageClass set The target is to make sure the newly create PV/PVC won't use default StorageClass, and if there is no default StorageClass, PV/PVC can still be created. 1. Create a StorageClass and set it to be the default StorageClass 2. Update static StorageClass to `longhorn-static-test` 3. Create volume then PV/PVC. 4. Make sure the newly created PV/PVC using StorageClass `longhorn-static-test` 5. Create pod with PVC. 6. Verify volume's Kubernetes Status 7. Remove PVC and Pod. 8. Verify volume's Kubernetes Status only contains current PV and history 9. Wait for volume to detach (since pod is deleted) 10. Reuse the volume on a new pod. Wait for the pod to start 11. Verify volume's Kubernetes Status reflect the new pod. 12. Delete PV/PVC/Pod. 13. Verify volume's Kubernetes Status only contains history 14. Delete the default StorageClass. 15. Create PV/PVC for the volume. 16. Make sure the PV's StorageClass is static StorageClass """ # set default storage class storage_class['metadata']['annotations'] = \ {"storageclass.kubernetes.io/is-default-class": "true"} create_storage_class(storage_class) static_sc_name = "longhorn-static-test" setting = client.by_id_setting(SETTING_DEFAULT_LONGHORN_STATIC_SC) setting = client.update(setting, value=static_sc_name) assert setting.value == static_sc_name volume_name = "test-pvc-creation-with-sc" # NOQA pod_name = "pod-" + volume_name client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=2) volume = wait_for_volume_detached(client, volume_name) pv_name = "pv-" + volume_name pvc_name = "pvc-" + volume_name pvc_name_extra = "pvc-" + volume_name + "-extra" create_pv_for_volume(client, core_api, volume, pv_name) create_pvc_for_volume(client, core_api, volume, pvc_name) ret = core_api.list_namespaced_persistent_volume_claim(namespace='default') for item in ret.items: if item.metadata.name == pvc_name: pvc_found = item break assert pvc_found assert pvc_found.spec.storage_class_name == static_sc_name pod['metadata']['name'] = pod_name pod['spec']['volumes'] = [{ 'name': pod['spec']['containers'][0]['volumeMounts'][0]['name'], 'persistentVolumeClaim': { 'claimName': pvc_name, }, }] create_and_wait_pod(core_api, pod) ks = { 'pvName': pv_name, 'pvStatus': 'Bound', 'namespace': 'default', 'pvcName': pvc_name, 'lastPVCRefAt': '', 'lastPodRefAt': '', 'workloadsStatus': [ { 'podName': pod_name, 'podStatus': 'Running', 'workloadName': '', 'workloadType': '', }, ], } wait_volume_kubernetes_status(client, volume_name, ks) delete_and_wait_pod(core_api, pod_name) delete_and_wait_pvc(core_api, pvc_name) ks = { 'pvName': pv_name, 'pvStatus': 'Released', 'namespace': 'default', 'pvcName': pvc_name, 'lastPVCRefAt': 'not empty', 'lastPodRefAt': 'not empty', } wait_volume_kubernetes_status(client, volume_name, ks) # try to reuse the pv volume = wait_for_volume_detached(client, volume_name) create_pvc_for_volume(client, core_api, volume, pvc_name_extra) pod['spec']['volumes'][0]['persistentVolumeClaim']['claimName'] = \ pvc_name_extra create_and_wait_pod(core_api, pod) ks = { 'pvName': pv_name, 'pvStatus': 'Bound', 'namespace': 'default', 'pvcName': pvc_name_extra, 'lastPVCRefAt': '', 'lastPodRefAt': '', 'workloadsStatus': [ { 'podName': pod_name, 'podStatus': 'Running', 'workloadName': '', 'workloadType': '', }, ], } wait_volume_kubernetes_status(client, volume_name, ks) delete_and_wait_pod(core_api, pod_name) delete_and_wait_pvc(core_api, pvc_name_extra) delete_and_wait_pv(core_api, pv_name) ks = { 'pvName': '', 'pvStatus': '', 'namespace': 'default', 'pvcName': pvc_name_extra, 'lastPVCRefAt': 'not empty', 'lastPodRefAt': 'not empty', } wait_volume_kubernetes_status(client, volume_name, ks) # without default storage class delete_storage_class(storage_class['metadata']['name']) create_pv_for_volume(client, core_api, volume, pv_name) create_pvc_for_volume(client, core_api, volume, pvc_name) ret = core_api.list_namespaced_persistent_volume_claim(namespace='default') for item in ret.items: if item.metadata.name == pvc_name: pvc2 = item break assert pvc2 assert pvc2.spec.storage_class_name == static_sc_name delete_and_wait_pvc(core_api, pvc_name) delete_and_wait_pv(core_api, pv_name)
def test_allow_volume_creation_with_degraded_availability_csi( client, core_api, apps_api, make_deployment_with_pvc): # NOQA """ Test Allow Volume Creation with Degraded Availability (CSI) Requirement: 1. Set `allow-volume-creation-with-degraded-availability` to true. 2. Set `node-level-soft-anti-affinity` to false. Steps: 1. Disable scheduling for node 3. 2. Create a Deployment Pod with a volume and 3 replicas. 1. After the volume is attached, scheduling error should be seen. 3. Write data to the Pod. 4. Scale down the deployment to 0 to detach the volume. 1. Scheduled condition should become true. 5. Scale up the deployment back to 1 and verify the data. 1. Scheduled condition should become false. 6. Enable the scheduling for node 3. 1. Volume should start rebuilding on the node 3 soon. 2. Once the rebuilding starts, the scheduled condition should become true. 7. Once rebuild finished, scale down and back the deployment to verify the data. """ setting = client.by_id_setting(common.SETTING_DEGRADED_AVAILABILITY) client.update(setting, value="true") setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY) client.update(setting, value="false") nodes = client.list_node() node3 = nodes[2] client.update(node3, allowScheduling=False) vol = common.create_and_check_volume(client, generate_volume_name(), size=str(500 * Mi)) pv_name = vol.name + "-pv" common.create_pv_for_volume(client, core_api, vol, pv_name) pvc_name = vol.name + "-pvc" common.create_pvc_for_volume(client, core_api, vol, pvc_name) deployment_name = vol.name + "-dep" deployment = make_deployment_with_pvc(deployment_name, pvc_name) deployment["spec"]["replicas"] = 3 apps_api.create_namespaced_deployment(body=deployment, namespace='default') common.wait_for_volume_status(client, vol.name, common.VOLUME_FIELD_STATE, common.VOLUME_STATE_ATTACHED) common.wait_scheduling_failure(client, vol.name) data_path = "/data/test" pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name) common.write_pod_volume_random_data(core_api, pod.metadata.name, data_path, common.DATA_SIZE_IN_MB_2) created_md5sum = get_pod_data_md5sum(core_api, pod.metadata.name, data_path) deployment['spec']['replicas'] = 0 apps_api.patch_namespaced_deployment(body=deployment, namespace='default', name=deployment_name) vol = common.wait_for_volume_detached(client, vol.name) assert vol.conditions[VOLUME_CONDITION_SCHEDULED]['status'] == "True" deployment['spec']['replicas'] = 1 apps_api.patch_namespaced_deployment(body=deployment, namespace='default', name=deployment_name) common.wait_for_volume_status(client, vol.name, common.VOLUME_FIELD_STATE, common.VOLUME_STATE_ATTACHED) common.wait_for_volume_condition_scheduled(client, vol.name, "status", common.CONDITION_STATUS_FALSE) pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name) assert created_md5sum == get_pod_data_md5sum(core_api, pod.metadata.name, data_path) client.update(node3, allowScheduling=True) common.wait_for_rebuild_start(client, vol.name) vol = client.by_id_volume(vol.name) assert vol.conditions[VOLUME_CONDITION_SCHEDULED]['status'] == "True" common.wait_for_rebuild_complete(client, vol.name) deployment['spec']['replicas'] = 0 apps_api.patch_namespaced_deployment(body=deployment, namespace='default', name=deployment_name) common.wait_for_volume_detached(client, vol.name) deployment['spec']['replicas'] = 1 apps_api.patch_namespaced_deployment(body=deployment, namespace='default', name=deployment_name) common.wait_for_volume_status(client, vol.name, common.VOLUME_FIELD_STATE, common.VOLUME_STATE_ATTACHED) pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name) assert created_md5sum == get_pod_data_md5sum(core_api, pod.metadata.name, data_path)
def test_csi_expansion_with_replica_failure(client, core_api, storage_class, pvc, pod_manifest): # NOQA """ Test expansion success but with one replica expansion failure 1. Create a new `storage_class` with `allowVolumeExpansion` set 2. Create PVC and Pod with dynamic provisioned volume from the StorageClass 3. Create an empty directory with expansion snapshot tmp meta file path for one replica so that the replica expansion will fail 4. Generate `test_data` and write to the pod 5. Delete the pod and wait for volume detachment 6. Update pvc.spec.resources to expand the volume 7. Check expansion result using Longhorn API. There will be expansion error caused by the failed replica but overall the expansion should succeed. 8. Create a new pod and check if the volume will reuse the failed replica during rebuilding. 9. Validate the volume content, then check if data writing looks fine """ replenish_wait_setting = \ client.by_id_setting(SETTING_REPLICA_REPLENISHMENT_WAIT_INTERVAL) client.update(replenish_wait_setting, value="600") create_storage_class(storage_class) pod_name = 'csi-expansion-with-replica-failure-test' pvc_name = pod_name + "-pvc" pvc['metadata']['name'] = pvc_name pvc['spec']['storageClassName'] = storage_class['metadata']['name'] create_pvc(pvc) pod_manifest['metadata']['name'] = pod_name pod_manifest['spec']['volumes'] = [{ 'name': pod_manifest['spec']['containers'][0]['volumeMounts'][0]['name'], 'persistentVolumeClaim': {'claimName': pvc_name}, }] create_and_wait_pod(core_api, pod_manifest) expand_size = str(EXPANDED_VOLUME_SIZE*Gi) pv = wait_and_get_pv_for_pvc(core_api, pvc_name) assert pv.status.phase == "Bound" volume_name = pv.spec.csi.volume_handle volume = client.by_id_volume(volume_name) failed_replica = volume.replicas[0] fail_replica_expansion(client, core_api, volume_name, expand_size, [failed_replica]) test_data = generate_random_data(VOLUME_RWTEST_SIZE) write_pod_volume_data(core_api, pod_name, test_data) delete_and_wait_pod(core_api, pod_name) wait_for_volume_detached(client, volume_name) # There will be replica expansion error info # but the expansion should succeed. pvc['spec']['resources'] = { 'requests': { 'storage': size_to_string(EXPANDED_VOLUME_SIZE*Gi) } } expand_and_wait_for_pvc(core_api, pvc) wait_for_expansion_failure(client, volume_name) wait_for_volume_expansion(client, volume_name) volume = client.by_id_volume(volume_name) assert volume.state == "detached" assert volume.size == expand_size for r in volume.replicas: if r.name == failed_replica.name: assert r.failedAt != "" else: assert r.failedAt == "" # Check if the failed replica will be reused during rebuilding, # and if the volume still works fine. create_and_wait_pod(core_api, pod_manifest) volume = wait_for_volume_healthy(client, volume_name) for r in volume.replicas: assert r.mode == "RW" resp = read_volume_data(core_api, pod_name) assert resp == test_data test_data = generate_random_data(VOLUME_RWTEST_SIZE) write_pod_volume_data(core_api, pod_name, test_data) resp = read_volume_data(core_api, pod_name) assert resp == test_data
def test_setting_toleration(): """ Test toleration setting 1. Set `taint-toleration` to "key1=value1:NoSchedule; key2:InvalidEffect". 2. Verify the request fails. 3. Create a volume and attach it. 4. Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute". 5. Verify that cannot update toleration setting when any volume is attached. 6. Generate and write `data1` into the volume. 7. Detach the volume. 8. Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute". 9. Wait for all the Longhorn system components to restart with new toleration. 10. Verify that UI, manager, and drive deployer don't restart and don't have new toleration. 11. Attach the volume again and verify the volume `data1`. 12. Generate and write `data2` to the volume. 13. Detach the volume. 14. Clean the `toleration` setting. 15. Wait for all the Longhorn system components to restart with no toleration. 16. Attach the volume and validate `data2`. 17. Generate and write `data3` to the volume. """ client = get_longhorn_api_client() # NOQA apps_api = get_apps_api_client() # NOQA core_api = get_core_api_client() # NOQA count = len(client.list_node()) setting = client.by_id_setting(SETTING_TAINT_TOLERATION) with pytest.raises(Exception) as e: client.update(setting, value="key1=value1:NoSchedule; key2:InvalidEffect") assert 'invalid effect' in str(e.value) volume_name = "test-toleration-vol" # NOQA volume = create_and_check_volume(client, volume_name) volume.attach(hostId=get_self_host_id()) volume = wait_for_volume_healthy(client, volume_name) setting_value_str = "key1=value1:NoSchedule; key2:NoExecute" setting_value_dicts = [ { "key": "key1", "value": "value1", "operator": "Equal", "effect": "NoSchedule" }, { "key": "key2", "value": None, "operator": "Exists", "effect": "NoExecute" }, ] with pytest.raises(Exception) as e: client.update(setting, value=setting_value_str) assert 'cannot modify toleration setting before all volumes are detached' \ in str(e.value) data1 = write_volume_random_data(volume) check_volume_data(volume, data1) volume.detach(hostId="") wait_for_volume_detached(client, volume_name) setting = client.update(setting, value=setting_value_str) assert setting.value == setting_value_str wait_for_toleration_update(core_api, apps_api, count, setting_value_dicts) client, node = wait_for_longhorn_node_ready() volume = client.by_id_volume(volume_name) volume.attach(hostId=node) volume = wait_for_volume_healthy(client, volume_name) check_volume_data(volume, data1) data2 = write_volume_random_data(volume) check_volume_data(volume, data2) volume.detach(hostId="") wait_for_volume_detached(client, volume_name) # cleanup setting_value_str = "" setting_value_dicts = [] setting = client.by_id_setting(SETTING_TAINT_TOLERATION) setting = client.update(setting, value=setting_value_str) assert setting.value == setting_value_str wait_for_toleration_update(core_api, apps_api, count, setting_value_dicts) client, node = wait_for_longhorn_node_ready() volume = client.by_id_volume(volume_name) volume.attach(hostId=node) volume = wait_for_volume_healthy(client, volume_name) check_volume_data(volume, data2) data3 = write_volume_random_data(volume) check_volume_data(volume, data3) cleanup_volume(client, volume)
def test_setting_priority_class(core_api, apps_api, scheduling_api, priority_class, volume_name): # NOQA """ Test that the Priority Class setting is validated and utilized correctly. 1. Verify that the name of a non-existent Priority Class cannot be used for the Setting. 2. Create a new Priority Class in Kubernetes. 3. Create and attach a Volume. 4. Verify that the Priority Class Setting cannot be updated with an attached Volume. 5. Generate and write `data1`. 6. Detach the Volume. 7. Update the Priority Class Setting to the new Priority Class. 8. Wait for all the Longhorn system components to restart with the new Priority Class. 9. Verify that UI, manager, and drive deployer don't have Priority Class 10. Attach the Volume and verify `data1`. 11. Generate and write `data2`. 12. Unset the Priority Class Setting. 13. Wait for all the Longhorn system components to restart with the new Priority Class. 14. Verify that UI, manager, and drive deployer don't have Priority Class 15. Attach the Volume and verify `data2`. 16. Generate and write `data3`. Note: system components are workloads other than UI, manager, driver deployer """ client = get_longhorn_api_client() # NOQA count = len(client.list_node()) name = priority_class['metadata']['name'] setting = client.by_id_setting(SETTING_PRIORITY_CLASS) with pytest.raises(Exception) as e: client.update(setting, value=name) assert 'failed to get priority class ' in str(e.value) scheduling_api.create_priority_class(priority_class) volume = create_and_check_volume(client, volume_name) volume.attach(hostId=get_self_host_id()) volume = wait_for_volume_healthy(client, volume_name) with pytest.raises(Exception) as e: client.update(setting, value=name) assert 'cannot modify priority class setting before all volumes are ' \ 'detached' in str(e.value) data1 = write_volume_random_data(volume) check_volume_data(volume, data1) volume.detach(hostId="") wait_for_volume_detached(client, volume_name) setting = client.update(setting, value=name) assert setting.value == name wait_for_priority_class_update(core_api, apps_api, count, priority_class) client, node = wait_for_longhorn_node_ready() volume = client.by_id_volume(volume_name) volume.attach(hostId=node) volume = wait_for_volume_healthy(client, volume_name) check_volume_data(volume, data1) data2 = write_volume_random_data(volume) check_volume_data(volume, data2) volume.detach(hostId="") wait_for_volume_detached(client, volume_name) setting = client.by_id_setting(SETTING_PRIORITY_CLASS) setting = client.update(setting, value='') assert setting.value == '' wait_for_priority_class_update(core_api, apps_api, count) client, node = wait_for_longhorn_node_ready() volume = client.by_id_volume(volume_name) volume.attach(hostId=node) volume = wait_for_volume_healthy(client, volume_name) check_volume_data(volume, data2) data3 = write_volume_random_data(volume) check_volume_data(volume, data3) cleanup_volume(client, volume)
def test_instance_manager_cpu_reservation(client, core_api): # NOQA """ Test if the CPU requests of instance manager pods are controlled by the settings and the node specs correctly. 1. Try to change the deprecated setting `Guaranteed Engine CPU`. --> The setting update should fail. 2. Pick up node 1, set `node.engineManagerCPURequest` and `node.replicaManagerCPURequest` to 150 and 250, respectively. --> The IM pods on this node will be restarted. And the CPU requests of these IM pods matches the above milli value. 3. Change the new settings `Guaranteed Engine Manager CPU` and `Guaranteed Replica Manager CPU` to 10 and 20, respectively. Then wait for all IM pods except for the pods on node 1 restarting. --> The CPU requests of the restarted IM pods equals to the new setting value multiply the kube node allocatable CPU. 4. Set the both new settings to 0. --> All IM pods except for the pod on node 1 will be restarted without CPU requests. 5. Set the fields on node 1 to 0. --> The IM pods on node 1 will be restarted without CPU requests. 6. Set the both new settings to 2 random values, and the sum of the 2 values is small than 40. Then wait for all IM pods restarting. --> The CPU requests of all IM pods equals to the new setting value multiply the kube node allocatable CPU. 7. Set the both new settings to 2 random values, and the single value or the sum of the 2 values is greater than 40. --> The setting update should fail. 8. Create a volume, verify everything works as normal Note: use fixture to restore the setting into the original state """ instance_managers = client.list_instance_manager() deprecated_setting = client.by_id_setting(SETTING_GUARANTEED_ENGINE_CPU) with pytest.raises(Exception) as e: client.update(deprecated_setting, value="0.1") host_node_name = get_self_host_id() host_node = client.by_id_node(host_node_name) other_ems, other_rms = [], [] for im in instance_managers: if im.managerType == "engine": if im.nodeID == host_node_name: em_on_host = im else: other_ems.append(im) else: if im.nodeID == host_node_name: rm_on_host = im else: other_rms.append(im) assert em_on_host and rm_on_host host_kb_node = core_api.read_node(host_node_name) if host_kb_node.status.allocatable["cpu"].endswith('m'): allocatable_millicpu = int(host_kb_node.status.allocatable["cpu"][:-1]) else: allocatable_millicpu = int( host_kb_node.status.allocatable["cpu"]) * 1000 client.update(host_node, allowScheduling=True, engineManagerCPURequest=150, replicaManagerCPURequest=250) time.sleep(5) guaranteed_engine_cpu_setting_check(client, core_api, [em_on_host], "Running", True, "150m") guaranteed_engine_cpu_setting_check(client, core_api, [rm_on_host], "Running", True, "250m") em_setting = client.by_id_setting(SETTING_GUARANTEED_ENGINE_MANAGER_CPU) client.update(em_setting, value="10") rm_setting = client.by_id_setting(SETTING_GUARANTEED_REPLICA_MANAGER_CPU) client.update(rm_setting, value="20") time.sleep(5) guaranteed_engine_cpu_setting_check( client, core_api, other_ems, "Running", True, str(int(allocatable_millicpu * 10 / 100)) + "m") guaranteed_engine_cpu_setting_check( client, core_api, other_rms, "Running", True, str(int(allocatable_millicpu * 20 / 100)) + "m") em_setting = client.by_id_setting(SETTING_GUARANTEED_ENGINE_MANAGER_CPU) client.update(em_setting, value="0") rm_setting = client.by_id_setting(SETTING_GUARANTEED_REPLICA_MANAGER_CPU) client.update(rm_setting, value="0") time.sleep(5) guaranteed_engine_cpu_setting_check(client, core_api, other_ems, "Running", True, "") guaranteed_engine_cpu_setting_check(client, core_api, other_rms, "Running", True, "") ems, rms = other_ems, other_rms ems.append(em_on_host) rms.append(rm_on_host) host_node = client.by_id_node(host_node_name) client.update(host_node, allowScheduling=True, engineManagerCPURequest=0, replicaManagerCPURequest=0) time.sleep(5) guaranteed_engine_cpu_setting_check(client, core_api, ems, "Running", True, "") guaranteed_engine_cpu_setting_check(client, core_api, rms, "Running", True, "") client.update(em_setting, value="20") rm_setting = client.by_id_setting(SETTING_GUARANTEED_REPLICA_MANAGER_CPU) client.update(rm_setting, value="15") time.sleep(5) guaranteed_engine_cpu_setting_check( client, core_api, ems, "Running", True, str(int(allocatable_millicpu * 20 / 100)) + "m") guaranteed_engine_cpu_setting_check( client, core_api, rms, "Running", True, str(int(allocatable_millicpu * 15 / 100)) + "m") with pytest.raises(Exception) as e: client.update(em_setting, value="41") assert "should be between 0 to 40" in \ str(e.value) em_setting = client.by_id_setting(SETTING_GUARANTEED_ENGINE_MANAGER_CPU) with pytest.raises(Exception) as e: client.update(em_setting, value="35") assert "The sum should not be smaller than 0% or greater than 40%" in \ str(e.value) # Create a volume to test vol_name = generate_volume_name() volume = create_and_check_volume(client, vol_name) volume.attach(hostId=get_self_host_id()) volume = wait_for_volume_healthy(client, vol_name) assert len(volume.replicas) == 3 data = write_volume_random_data(volume) check_volume_data(volume, data) cleanup_volume(client, volume)
def test_setting_toleration_extra(core_api, apps_api): # NOQA """ Steps: 1. Set Kubernetes Taint Toleration to: `ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule`. 2. Verify that all system components have the 2 tolerations `ex.com/foobar:NoExecute; ex.com/foobar:NoSchedule`. Verify that UI, manager, and drive deployer don't restart and don't have toleration. 3. Set Kubernetes Taint Toleration to: `node-role.kubernetes.io/controlplane=true:NoSchedule`. 4. Verify that all system components have the the toleration `node-role.kubernetes.io/controlplane=true:NoSchedule`, and don't have the 2 tolerations `ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule`. Verify that UI, manager, and drive deployer don't restart and don't have toleration. 5. Set Kubernetes Taint Toleration to special value: `:`. 6. Verify that all system components have the toleration with `operator: Exists` and other field of the toleration are empty. Verify that all system components don't have the toleration `node-role.kubernetes.io/controlplane=true:NoSchedule`. Verify that UI, manager, and drive deployer don't restart and don't have toleration. 7. Clear Kubernetes Taint Toleration Note: system components are workloads other than UI, manager, driver deployer """ settings = [ { "value": "ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule", "expect": [ { "key": "ex.com/foobar", "value": None, "operator": "Exists", "effect": "NoExecute" }, { "key": "ex.com/foobar", "value": None, "operator": "Exists", "effect": "NoSchedule" }, ], }, { "value": "node-role.kubernetes.io/controlplane=true:NoSchedule", "expect": [ { "key": "node-role.kubernetes.io/controlplane", "value": "true", "operator": "Equal", "effect": "NoSchedule" }, ], }, # Skip the this special toleration for now because it makes # Longhorn deploy manager pods on control/etcd nodes # and the control/etcd nodes become "down" after the test # clear this toleration. # We will enable this test once we implement logic for # deleting failed nodes. # { # "value": ":", # "expect": [ # { # "key": None, # "value": None, # "operator": "Exists", # "effect": None, # }, # ] # }, { "value": "", "expect": [], }, ] chk_removed_tolerations = [] for setting in settings: client = get_longhorn_api_client() # NOQA taint_toleration = client.by_id_setting(SETTING_TAINT_TOLERATION) updated = client.update(taint_toleration, value=setting["value"]) assert updated.value == setting["value"] node_count = len(client.list_node()) wait_for_toleration_update(core_api, apps_api, node_count, setting["expect"], chk_removed_tolerations) chk_removed_tolerations = setting["expect"]
def ha_backup_deletion_recovery_test(client, volume_name, size, base_image=""): # NOQA volume = client.create_volume(name=volume_name, size=size, numberOfReplicas=2, baseImage=base_image) volume = common.wait_for_volume_detached(client, volume_name) host_id = get_self_host_id() volume = volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) setting = client.by_id_setting(common.SETTING_BACKUP_TARGET) # test backupTarget for multiple settings backupstores = common.get_backupstore_url() for backupstore in backupstores: if common.is_backupTarget_s3(backupstore): backupsettings = backupstore.split("$") setting = client.update(setting, value=backupsettings[0]) assert setting["value"] == backupsettings[0] credential = client.by_id_setting( common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET) credential = client.update(credential, value=backupsettings[1]) assert credential["value"] == backupsettings[1] else: setting = client.update(setting, value=backupstore) assert setting["value"] == backupstore credential = client.by_id_setting( common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET) credential = client.update(credential, value="") assert credential["value"] == "" data = write_volume_random_data(volume) snap2 = volume.snapshotCreate() volume.snapshotCreate() volume.snapshotBackup(name=snap2["name"]) _, b = common.find_backup(client, volume_name, snap2["name"]) res_name = common.generate_volume_name() res_volume = client.create_volume(name=res_name, size=size, numberOfReplicas=2, fromBackup=b["url"]) res_volume = common.wait_for_volume_detached(client, res_name) res_volume = res_volume.attach(hostId=host_id) res_volume = common.wait_for_volume_healthy(client, res_name) check_volume_data(res_volume, data) snapshots = res_volume.snapshotList() # only the backup snapshot + volume-head assert len(snapshots) == 2 backup_snapshot = "" for snap in snapshots: if snap["name"] != "volume-head": backup_snapshot = snap["name"] assert backup_snapshot != "" res_volume.snapshotCreate() snapshots = res_volume.snapshotList() assert len(snapshots) == 3 res_volume.snapshotDelete(name=backup_snapshot) res_volume.snapshotPurge() snapshots = res_volume.snapshotList() assert len(snapshots) == 2 ha_rebuild_replica_test(client, res_name) res_volume = res_volume.detach() res_volume = common.wait_for_volume_detached(client, res_name) client.delete(res_volume) common.wait_for_volume_delete(client, res_name) volume = volume.detach() volume = common.wait_for_volume_detached(client, volume_name) client.delete(volume) common.wait_for_volume_delete(client, volume_name) volumes = client.list_volume() assert len(volumes) == 0
def test_replica_scheduler_update_minimal_available(client): # NOQA minimal_available_setting = client.by_id_setting( SETTING_STORAGE_MINIMAL_AVAILABLE_PERCENTAGE) old_minimal_setting = minimal_available_setting["value"] nodes = client.list_node() expect_node_disk = {} for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): if disk["path"] == DEFAULT_DISK_PATH: expect_disk = disk expect_disk["fsid"] = fsid expect_node_disk[node["name"]] = expect_disk # set storage minimal available percentage to 100 # to test all replica couldn't be scheduled minimal_available_setting = client.update(minimal_available_setting, value="100") # wait for disks state nodes = client.list_node() for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): wait_for_disk_conditions(client, node["name"], fsid, DISK_CONDITION_SCHEDULABLE, CONDITION_STATUS_FALSE) lht_hostId = get_self_host_id() vol_name = common.generate_volume_name() volume = client.create_volume(name=vol_name, size=SIZE, numberOfReplicas=len(nodes)) volume = common.wait_for_volume_condition_scheduled(client, vol_name, "status", CONDITION_STATUS_FALSE) # set storage minimal available percentage to default value(10) minimal_available_setting = client.update(minimal_available_setting, value=old_minimal_setting) # wait for disks state nodes = client.list_node() for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): wait_for_disk_conditions(client, node["name"], fsid, DISK_CONDITION_SCHEDULABLE, CONDITION_STATUS_TRUE) # check volume status volume = common.wait_for_volume_condition_scheduled(client, vol_name, "status", CONDITION_STATUS_TRUE) volume = common.wait_for_volume_detached(client, vol_name) assert volume["state"] == "detached" assert volume["created"] != "" volume.attach(hostId=lht_hostId) volume = common.wait_for_volume_healthy(client, vol_name) nodes = client.list_node() node_hosts = [] for node in nodes: node_hosts.append(node["name"]) # check all replica should be scheduled to default disk for replica in volume["replicas"]: id = replica["hostId"] assert id != "" assert replica["running"] expect_disk = expect_node_disk[id] assert replica["diskID"] == expect_disk["fsid"] assert expect_disk["path"] in replica["dataPath"] node_hosts = filter(lambda x: x != id, node_hosts) assert len(node_hosts) == 0 # clean volume and disk cleanup_volume(client, vol_name)
def test_replica_scheduler_just_under_over_provisioning(client): # NOQA over_provisioning_setting = client.by_id_setting( SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE) old_provisioning_setting = over_provisioning_setting["value"] # set storage over provisioning percentage to 100 over_provisioning_setting = client.update(over_provisioning_setting, value="100") lht_hostId = get_self_host_id() nodes = client.list_node() expect_node_disk = {} max_size_array = [] for node in nodes: disks = node["disks"] for fsid, disk in disks.iteritems(): if disk["path"] == DEFAULT_DISK_PATH: expect_disk = disk expect_disk["fsid"] = fsid expect_node_disk[node["name"]] = expect_disk max_size_array.append(disk["storageMaximum"]) disk["storageReserved"] = 0 update_disks = get_update_disks(disks) node = node.diskUpdate(disks=update_disks) disks = node["disks"] for fsid, disk in disks.iteritems(): wait_for_disk_status(client, node["name"], fsid, "storageReserved", 0) max_size = min(max_size_array) # test just under over provisioning limit could be scheduled vol_name = common.generate_volume_name() volume = client.create_volume(name=vol_name, size=str(max_size), numberOfReplicas=len(nodes)) volume = common.wait_for_volume_condition_scheduled(client, vol_name, "status", CONDITION_STATUS_TRUE) volume = common.wait_for_volume_detached(client, vol_name) assert volume["state"] == "detached" assert volume["created"] != "" volume.attach(hostId=lht_hostId) volume = common.wait_for_volume_healthy(client, vol_name) nodes = client.list_node() node_hosts = [] for node in nodes: node_hosts.append(node["name"]) # check all replica should be scheduled to default disk for replica in volume["replicas"]: id = replica["hostId"] assert id != "" assert replica["running"] expect_disk = expect_node_disk[id] assert replica["diskID"] == expect_disk["fsid"] assert expect_disk["path"] in replica["dataPath"] node_hosts = filter(lambda x: x != id, node_hosts) assert len(node_hosts) == 0 # clean volume and disk cleanup_volume(client, vol_name) client.update(over_provisioning_setting, value=old_provisioning_setting)