def ha_simple_recovery_test(client, volume_name, size, base_image=""): # NOQA volume = client.create_volume(name=volume_name, size=size, numberOfReplicas=2, baseImage=base_image) volume = common.wait_for_volume_detached(client, volume_name) assert volume["name"] == volume_name assert volume["size"] == size assert volume["numberOfReplicas"] == 2 assert volume["state"] == "detached" assert volume["created"] != "" assert volume["baseImage"] == base_image host_id = get_self_host_id() volume = volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) ha_rebuild_replica_test(client, volume_name) volume = volume.detach() volume = common.wait_for_volume_detached(client, volume_name) client.delete(volume) common.wait_for_volume_delete(client, volume_name) volumes = client.list_volume() assert len(volumes) == 0
def test_ha_prohibit_deleting_last_replica(client, volume_name):  # NOQA
    volume = client.create_volume(name=volume_name, size=SIZE,
                                  numberOfReplicas=1)
    volume = common.wait_for_volume_detached(client, volume_name)
    assert volume["name"] == volume_name
    assert volume["size"] == SIZE
    assert volume["numberOfReplicas"] == 1
    assert volume["state"] == "detached"
    assert volume["created"] != ""

    host_id = get_self_host_id()
    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    assert len(volume["replicas"]) == 1
    replica0 = volume["replicas"][0]

    with pytest.raises(Exception) as e:
        volume.replicaRemove(name=replica0["name"])
    assert "no other healthy replica available" in str(e.value)

    volume = volume.detach()
    volume = common.wait_for_volume_detached(client, volume_name)

    client.delete(volume)
    common.wait_for_volume_delete(client, volume_name)

    volumes = client.list_volume()
    assert len(volumes) == 0
def test_replica_scheduler_update_over_provisioning(client):  # NOQA
    nodes = client.list_node()
    lht_hostId = get_self_host_id()
    expect_node_disk = {}
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            if disk["path"] == DEFAULT_DISK_PATH:
                expect_disk = disk
                expect_disk["fsid"] = fsid
                expect_node_disk[node["name"]] = expect_disk

    over_provisioning_setting = client.by_id_setting(
        SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE)
    old_provisioning_setting = over_provisioning_setting["value"]

    # set the storage over-provisioning percentage to 0
    # so that no replica can be scheduled
    over_provisioning_setting = client.update(over_provisioning_setting,
                                              value="0")

    vol_name = common.generate_volume_name()
    volume = client.create_volume(name=vol_name, size=SIZE,
                                  numberOfReplicas=len(nodes))
    volume = common.wait_for_volume_condition_scheduled(client, vol_name,
                                                        "status",
                                                        CONDITION_STATUS_FALSE)

    # set the storage over-provisioning percentage back to 100
    over_provisioning_setting = client.update(over_provisioning_setting,
                                              value="100")

    # check volume status
    volume = common.wait_for_volume_condition_scheduled(client, vol_name,
                                                        "status",
                                                        CONDITION_STATUS_TRUE)
    volume = common.wait_for_volume_detached(client, vol_name)
    assert volume["state"] == "detached"
    assert volume["created"] != ""

    volume.attach(hostId=lht_hostId)
    volume = common.wait_for_volume_healthy(client, vol_name)

    node_hosts = []
    for node in nodes:
        node_hosts.append(node["name"])
    # every replica should be scheduled to the default disk
    for replica in volume["replicas"]:
        id = replica["hostId"]
        assert id != ""
        assert replica["running"]
        expect_disk = expect_node_disk[id]
        assert replica["diskID"] == expect_disk["fsid"]
        assert expect_disk["path"] in replica["dataPath"]
        # materialize the filter; a lazy filter() has no len()
        node_hosts = list(filter(lambda x: x != id, node_hosts))
    assert len(node_hosts) == 0

    # clean up the volume and restore the setting
    cleanup_volume(client, vol_name)
    client.update(over_provisioning_setting,
                  value=old_provisioning_setting)
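# A rough sketch (an assumption, not the exact Longhorn scheduler code) of
# the per-disk over-provisioning check the test above exercises: with the
# percentage set to "0" the schedulable limit collapses to zero, so every
# replica fails to schedule, and raising it back to "100" makes room again.
def _disk_can_schedule(disk, volume_size, over_provisioning_pct):
    limit = (disk["storageMaximum"] - disk["storageReserved"]) \
        * over_provisioning_pct / 100
    return disk["storageScheduled"] + volume_size <= limit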
def test_node_controller_sync_storage_available(client):  # NOQA
    lht_hostId = get_self_host_id()
    # create a disk to test storageAvailable
    node = client.by_id_node(lht_hostId)
    test_disk_path = create_host_disk(client, "vol-test", SIZE, lht_hostId)
    test_disk = {"path": test_disk_path, "allowScheduling": True}
    update_disks = get_update_disks(node["disks"])
    update_disks.append(test_disk)
    node = node.diskUpdate(disks=update_disks)
    node = common.wait_for_disk_update(client, lht_hostId,
                                       len(update_disks))
    assert len(node["disks"]) == len(update_disks)

    # write a specific amount of data into the disk
    test_file_path = os.path.join(test_disk_path, TEST_FILE)
    if os.path.exists(test_file_path):
        os.remove(test_file_path)
    cmd = ['dd', 'if=/dev/zero', 'of=' + test_file_path,
           'bs=1M', 'count=1']
    subprocess.check_call(cmd)
    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    # wait for the node controller to update the disk status
    expect_disk = {}
    free, total = common.get_host_disk_size(test_disk_path)
    for fsid, disk in disks.items():
        if disk["path"] == test_disk_path:
            node = wait_for_disk_status(client, lht_hostId, fsid,
                                        "storageAvailable", free)
            expect_disk = node["disks"][fsid]
            break

    assert expect_disk["storageAvailable"] == free

    os.remove(test_file_path)
    # cleanup test disks
    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    wait_fsid = ''
    for fsid, disk in disks.items():
        if disk["path"] == test_disk_path:
            wait_fsid = fsid
            disk["allowScheduling"] = False

    update_disks = get_update_disks(disks)
    node = node.diskUpdate(disks=update_disks)
    node = wait_for_disk_status(client, lht_hostId, wait_fsid,
                                "allowScheduling", False)

    disks = node["disks"]
    for fsid, disk in disks.items():
        if disk["path"] == test_disk_path:
            disks.pop(fsid)
            break
    update_disks = get_update_disks(disks)
    node = node.diskUpdate(disks=update_disks)
    node = wait_for_disk_update(client, lht_hostId, len(update_disks))
    assert len(node["disks"]) == len(update_disks)

    cleanup_host_disk(client, 'vol-test')
def test_hard_anti_affinity_detach(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity are still able to detach and
    reattach to a node properly, even in degraded state.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to false
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
    6. Detach the volume.
    7. Verify that the volume only has 2 replicas
        1. Unhealthy replica will be removed upon detach.
    8. Attach the volume again.
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
        3. Verify only two of three replicas of volume are healthy.
        4. Verify the remaining replica doesn't have `replica.HostID`,
           meaning it's unscheduled
    9. Check volume `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    assert len(volume.replicas) == 2

    volume.attach(hostId=host_id)
    # Make sure we're still not getting another successful replica.
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    assert sum([1 for replica in volume.replicas
                if replica.running and replica.mode == "RW"]) == 2
    assert sum([1 for replica in volume.replicas
                if not replica.hostId]) == 1
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
def recurring_job_labels_test(client, labels, volume_name, size=SIZE, backing_image=""): # NOQA host_id = get_self_host_id() client.create_volume(name=volume_name, size=size, numberOfReplicas=2, backingImage=backing_image) volume = common.wait_for_volume_detached(client, volume_name) # Simple Backup Job that runs every 1 minute, retains 1. jobs = [ { "name": RECURRING_JOB_NAME, "cron": "*/1 * * * *", "task": "backup", "retain": 1, "labels": labels } ] volume.recurringUpdate(jobs=jobs) volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) write_volume_random_data(volume) # 1 minutes 15s time.sleep(75) labels["we-added-this-label"] = "definitely" jobs[0]["labels"] = labels volume = volume.recurringUpdate(jobs=jobs) volume = wait_for_volume_healthy(client, volume_name) write_volume_random_data(volume) # 2 minutes 15s time.sleep(135) snapshots = volume.snapshotList() count = 0 for snapshot in snapshots: if snapshot.removed is False: count += 1 # 1 from Backup, 1 from Volume Head. assert count == 2 # Verify the Labels on the actual Backup. bv = client.by_id_backupVolume(volume_name) backups = bv.backupList().data assert len(backups) == 1 b = bv.backupGet(name=backups[0].name) for key, val in iter(labels.items()): assert b.labels.get(key) == val assert b.labels.get(RECURRING_JOB_LABEL) == RECURRING_JOB_NAME # One extra Label from RecurringJob. assert len(b.labels) == len(labels) + 1 if backing_image: assert b.volumeBackingImageName == \ backing_image assert b.volumeBackingImageURL != "" cleanup_volume(client, volume)
def ha_simple_recovery_test(client, volume_name, size, base_image=""): # NOQA volume = create_and_check_volume(client, volume_name, 2, size, base_image) host_id = get_self_host_id() volume = volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) ha_rebuild_replica_test(client, volume_name) cleanup_volume(client, volume)
def create_volume_with_replica_on_host(client, volume_name):  # NOQA
    lht_hostId = get_self_host_id()

    nodes = client.list_node()

    volume = create_and_check_volume(client, volume_name, len(nodes), SIZE)
    volume.attach(hostId=lht_hostId, disableFrontend=False)
    wait_for_volume_healthy(client, volume_name)

    return volume
def test_tag_basic(client):  # NOQA
    """
    Test that applying Tags to Nodes/Disks and retrieving them work as
    expected. Ensures that Tags are properly validated when updated.

    1. Generate tags and apply to the disk and nodes
    2. Make sure the tags are applied
    3. Try to apply invalid tags to the disk and node. Action will fail.
    """
    host_id = get_self_host_id()
    node = client.by_id_node(host_id)
    disks = get_update_disks(node.disks)
    assert len(node.disks) == 1
    assert len(node.disks[list(node.disks)[0]].tags) == 0, f"disks = {disks}"
    assert len(node.tags) == 0

    unsorted_disk, sorted_disk = generate_unordered_tag_names()
    unsorted_node, sorted_node = generate_unordered_tag_names()
    update_disks = get_update_disks(node.disks)
    update_disks[list(update_disks)[0]].tags = unsorted_disk
    node = update_node_disks(client, node.name, disks=update_disks)
    disks = get_update_disks(node.disks)
    assert disks[list(disks)[0]].tags == sorted_disk

    node = set_node_tags(client, node, unsorted_node)
    assert node.tags == sorted_node

    improper_tag_cases = [
        [""],   # Empty string
        [" "],  # Whitespace
        ["/"],  # Leading /
        [","],  # Illegal character
    ]
    for tags in improper_tag_cases:
        with pytest.raises(Exception) as e:
            set_node_tags(client, node, tags)
        assert "at least one error encountered while validating tags" in \
            str(e.value)
        with pytest.raises(Exception) as e:
            update_disks = get_update_disks(node.disks)
            update_disks[list(update_disks)[0]].tags = tags
            update_node_disks(client, node.name, disks=update_disks)
        assert "at least one error encountered while validating tags" in \
            str(e.value)

    update_disks = get_update_disks(node.disks)
    update_disks[list(update_disks)[0]].tags = []
    node = update_node_disks(client, node.name, disks=update_disks)
    disks = get_update_disks(node.disks)
    assert len(node.disks[list(node.disks)[0]].tags) == 0, f"disks = {disks}"

    node = set_node_tags(client, node)
    assert len(node.tags) == 0
def wait_for_longhorn_node_ready():  # NOQA
    client = get_longhorn_api_client()

    ei = get_default_engine_image(client)
    ei_name = ei["name"]
    ei_state = get_engine_image_status_value(client, ei_name)
    wait_for_engine_image_state(client, ei_name, ei_state)

    node = get_self_host_id()
    wait_for_node_up_longhorn(node, client)

    return client, node
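# Example usage (assumed context): tests that reboot or drain a node call
# this helper to refresh the API client and block until the engine image
# and the Longhorn components on the current host are ready again.
# client, node = wait_for_longhorn_node_ready()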
def test_hard_anti_affinity_scheduling(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity work as expected.

    With Hard Anti-Affinity, scheduling on nodes with existing replicas
    should be forbidden, resulting in "Degraded" state.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to false
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
        3. Verify only two of three replicas of volume are healthy.
        4. Verify the remaining replica doesn't have `replica.HostID`,
           meaning it's unscheduled
    6. Check volume `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    # Instead of waiting for timeout and lengthening the tests a significant
    # amount we can make sure the scheduling isn't working by making sure the
    # volume becomes Degraded and reports a scheduling error.
    wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    # While there are three replicas that should exist to meet the Volume's
    # request, only two of those replicas should actually be Healthy.
    volume = client.by_id_volume(volume_name)
    assert sum([1 for replica in volume.replicas
                if replica.running and replica.mode == "RW"]) == 2
    # Confirm that the remaining replica is unscheduled.
    assert sum([1 for replica in volume.replicas
                if not replica.hostId]) == 1
    # Three replicas in total should still exist.
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
def test_orphan_creation_and_deletion_in_multiple_disks(
        client, volume_name, request):  # NOQA
    """
    Test orphan creation and deletion in multiple disks

    1. Create multiple new disks for holding orphaned replica directories
    2. Create a volume and attach to the current node
    3. Create multiple orphaned replica directories by copying the active
       replica directory
    4. Clean up volume
    5. Verify orphan list contains the orphan CRs for replica directories
    6. Delete all orphaned CRs
    7. Verify orphan list is empty
    8. Verify orphaned replica directories are deleted
    """

    disk_names = ["vol-disk-" + generate_random_id(4),
                  "vol-disk-" + generate_random_id(4)]

    # Step 1
    lht_hostId = get_self_host_id()
    cleanup_node_disks(client, lht_hostId)
    disk_paths = crate_disks_on_host(client, disk_names, request)

    # Step 2
    volume = create_volume_with_replica_on_host(client, volume_name)

    # Step 3
    num_orphans = 5
    create_orphaned_directories_on_host(volume, disk_paths, num_orphans)

    # Step 4
    cleanup_volume_by_name(client, volume_name)

    # Step 5
    count = wait_for_orphan_count(client, num_orphans * len(disk_paths), 180)
    assert count == num_orphans * len(disk_paths)

    # Step 6
    delete_orphans(client)

    # Step 7
    assert wait_for_orphan_count(client, 0, 180) == 0

    # Step 8
    assert wait_for_file_count(os.path.join(disk_paths[0], "replicas"),
                               0, 180) == 0
    assert wait_for_file_count(os.path.join(disk_paths[1], "replicas"),
                               0, 180) == 0
def get_hosts_for_migration_test(clients):  # NOQA
    """
    Filters out the current node from the returned hosts list.

    We use the current node for device writing before the test
    and verification of the data after the test.
    """
    hosts = []
    current_host = common.get_self_host_id()
    for host in list(clients):
        # compare by equality, not identity: host IDs are strings, so
        # "is not" could wrongly treat an equal string as a different host
        if host != current_host:
            hosts.append(host)
    return hosts[0], hosts[1]
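# A minimal usage sketch (assumed fixture shape: `clients` maps host IDs to
# API clients): pick two non-local nodes as migration source and target.
# migration_host_1, migration_host_2 = get_hosts_for_migration_test(clients)
# volume.attach(hostId=migration_host_1)
# ... later, migrate the volume over to migration_host_2 ...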
def check_volume_data(client, volume_name, data):  # NOQA
    """
    Attaches the volume to the current node, then compares the volume's
    data against the passed data.
    """
    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=common.get_self_host_id())
    volume = common.wait_for_volume_healthy(client, volume_name)
    common.check_volume_data(volume, data)
    volume.detach(hostId="")
    volume = common.wait_for_volume_detached(client, volume_name)
def backup_test(clients, volume_name, size, base_image=""): # NOQA for host_id, client in clients.iteritems(): break volume = client.create_volume(name=volume_name, size=size, numberOfReplicas=2, baseImage=base_image) volume = common.wait_for_volume_detached(client, volume_name) assert volume["name"] == volume_name assert volume["size"] == size assert volume["numberOfReplicas"] == 2 assert volume["state"] == "detached" assert volume["baseImage"] == base_image lht_hostId = get_self_host_id() volume = volume.attach(hostId=lht_hostId) volume = common.wait_for_volume_healthy(client, volume_name) setting = client.by_id_setting(common.SETTING_BACKUP_TARGET) # test backupTarget for multiple settings backupstores = common.get_backupstore_url() for backupstore in backupstores: if common.is_backupTarget_s3(backupstore): backupsettings = backupstore.split("$") setting = client.update(setting, value=backupsettings[0]) assert setting["value"] == backupsettings[0] credential = client.by_id_setting( common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET) credential = client.update(credential, value=backupsettings[1]) assert credential["value"] == backupsettings[1] else: setting = client.update(setting, value=backupstore) assert setting["value"] == backupstore credential = client.by_id_setting( common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET) credential = client.update(credential, value="") assert credential["value"] == "" backupstore_test(client, lht_hostId, volume_name, size) volume = volume.detach() volume = common.wait_for_volume_detached(client, volume_name) client.delete(volume) volume = wait_for_volume_delete(client, volume_name) volumes = client.list_volume() assert len(volumes) == 0
def test_ha_salvage(client, volume_name):  # NOQA
    volume = client.create_volume(name=volume_name, size=SIZE,
                                  numberOfReplicas=2)
    volume = common.wait_for_volume_detached(client, volume_name)
    assert volume["name"] == volume_name
    assert volume["size"] == SIZE
    assert volume["numberOfReplicas"] == 2
    assert volume["state"] == "detached"
    assert volume["created"] != ""

    host_id = get_self_host_id()
    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    assert len(volume["replicas"]) == 2
    replica0_name = volume["replicas"][0]["name"]
    replica1_name = volume["replicas"][1]["name"]

    data = write_random_data(volume["endpoint"])

    common.k8s_delete_replica_pods_for_volume(volume_name)

    volume = common.wait_for_volume_faulted(client, volume_name)
    assert len(volume["replicas"]) == 2
    assert volume["replicas"][0]["failedAt"] != ""
    assert volume["replicas"][1]["failedAt"] != ""

    volume.salvage(names=[replica0_name, replica1_name])
    volume = common.wait_for_volume_detached(client, volume_name)
    assert len(volume["replicas"]) == 2
    assert volume["replicas"][0]["failedAt"] == ""
    assert volume["replicas"][1]["failedAt"] == ""

    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    check_data(volume["endpoint"], data)

    volume = volume.detach()
    volume = common.wait_for_volume_detached(client, volume_name)

    client.delete(volume)
    common.wait_for_volume_delete(client, volume_name)

    volumes = client.list_volume()
    assert len(volumes) == 0
def backing_image_basic_operation_test(client, volume_name, bi_name, bi_url):  # NOQA
    """
    Test Backing Image APIs.

    1. Create a backing image.
    2. Create and attach a Volume with the backing image set.
    3. Verify that all disk states in the backing image are "downloaded".
    4. Try to use the API to manually clean up one disk for the backing
       image; this should fail while the image is in use.
    5. Try to use the API to directly delete the backing image; this
       should fail as well.
    6. Delete the volume.
    7. Use the API to manually clean up one disk for the backing image.
    8. Delete the backing image.
    """
    volume = create_and_check_volume(client, volume_name, 3,
                                     str(BACKING_IMAGE_EXT4_SIZE), bi_name)
    lht_host_id = get_self_host_id()
    volume.attach(hostId=lht_host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert volume.backingImage == bi_name
    assert volume.size == str(BACKING_IMAGE_EXT4_SIZE)

    random_disk_id = ""
    backing_image = client.by_id_backing_image(bi_name)
    assert backing_image.sourceType == BACKING_IMAGE_SOURCE_TYPE_DOWNLOAD
    assert backing_image.parameters["url"] == bi_url
    assert backing_image.currentChecksum != ""
    assert not backing_image.deletionTimestamp
    assert len(backing_image.diskFileStatusMap) == 3
    for disk_id, status in iter(backing_image.diskFileStatusMap.items()):
        assert status.state == "ready"
        random_disk_id = disk_id
    assert random_disk_id != ''

    with pytest.raises(Exception):
        backing_image.backingImageCleanup(disks=[random_disk_id])
    with pytest.raises(Exception):
        client.delete(backing_image)

    client.delete(volume)
    wait_for_volume_delete(client, volume_name)

    backing_image = client.by_id_backing_image(bi_name)
    backing_image.backingImageCleanup(disks=[random_disk_id])
    backing_image = wait_for_backing_image_disk_cleanup(
        client, bi_name, random_disk_id)
    client.delete(backing_image)
def test_tag_basic(client):  # NOQA
    """
    Test that applying Tags to Nodes/Disks and retrieving them work as
    expected. Ensures that Tags are properly validated when updated.
    """
    host_id = get_self_host_id()
    node = client.by_id_node(host_id)
    disks = get_update_disks(node["disks"])
    assert len(node["disks"]) == 1
    assert disks[0]["tags"] is None
    assert node["tags"] is None

    unsorted_disk, sorted_disk = generate_unordered_tag_names()
    unsorted_node, sorted_node = generate_unordered_tag_names()
    update_disks = get_update_disks(node["disks"])
    update_disks[0]["tags"] = unsorted_disk
    node = node.diskUpdate(disks=update_disks)
    disks = get_update_disks(node["disks"])
    assert disks[0]["tags"] == sorted_disk

    node = set_node_tags(client, node, unsorted_node)
    assert node["tags"] == sorted_node

    improper_tag_cases = [
        [""],   # Empty string
        [" "],  # Whitespace
        ["/"],  # Leading /
        [","],  # Illegal character
    ]
    for tags in improper_tag_cases:
        with pytest.raises(Exception) as e:
            set_node_tags(client, node, tags)
        assert "at least one error encountered while validating tags" in \
            str(e.value)
        with pytest.raises(Exception) as e:
            update_disks = get_update_disks(node["disks"])
            update_disks[0]["tags"] = tags
            node.diskUpdate(disks=update_disks)
        assert "at least one error encountered while validating tags" in \
            str(e.value)

    update_disks = get_update_disks(node["disks"])
    update_disks[0]["tags"] = []
    node = node.diskUpdate(disks=update_disks)
    disks = get_update_disks(node["disks"])
    assert disks[0]["tags"] is None

    node = set_node_tags(client, node)
    assert node["tags"] is None
def recurring_job_labels_test(client, labels, volume_name, size=SIZE, base_image=""): # NOQA host_id = get_self_host_id() client.create_volume(name=volume_name, size=size, numberOfReplicas=2) volume = common.wait_for_volume_detached(client, volume_name) # Simple Backup Job that runs every 2 minutes, retains 1. jobs = [{ "name": RECURRING_JOB_NAME, "cron": "*/2 * * * *", "task": "backup", "retain": 1, "labels": labels }] volume.recurringUpdate(jobs=jobs) volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) # 5 minutes time.sleep(300) snapshots = volume.snapshotList() count = 0 for snapshot in snapshots: if snapshot["removed"] is False: count += 1 # 1 from Backup, 1 from Volume Head. assert count == 2 # Verify the Labels on the actual Backup. bv = client.by_id_backupVolume(volume_name) backups = bv.backupList() assert len(backups) == 1 b = bv.backupGet(name=backups[0]["name"]) for key, val in labels.iteritems(): assert b["labels"].get(key) == val assert b["labels"].get(RECURRING_JOB_LABEL) == RECURRING_JOB_NAME if base_image: assert b["labels"].get(BASE_IMAGE_LABEL) == base_image # One extra Label from the BaseImage being set. assert len(b["labels"]) == len(labels) + 2 else: # At least one extra Label from RecurringJob. assert len(b["labels"]) == len(labels) + 1 cleanup_volume(client, volume)
def test_ha_prohibit_deleting_last_replica(client, volume_name):  # NOQA
    volume = create_and_check_volume(client, volume_name, 1)

    host_id = get_self_host_id()
    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    assert len(volume["replicas"]) == 1
    replica0 = volume["replicas"][0]

    with pytest.raises(Exception) as e:
        volume.replicaRemove(name=replica0["name"])
    assert "no other healthy replica available" in str(e.value)

    cleanup_volume(client, volume)
def delete_extra_disks_on_host(client, disk_names):  # NOQA
    lht_hostId = get_self_host_id()

    node = client.by_id_node(lht_hostId)
    update_disk = get_update_disks(node.disks)
    for disk_name in disk_names:
        update_disk[disk_name].allowScheduling = False
        update_disk[disk_name].evictionRequested = True

    node = node.diskUpdate(disks=update_disk)

    for disk_name in disk_names:
        wait_for_disk_status(client, lht_hostId,
                             disk_name, "storageScheduled", 0)
def create_orphaned_directories_on_host(volume, disk_paths, num_orphans):  # NOQA
    lht_hostId = get_self_host_id()
    paths = []
    for replica in volume.replicas:
        if replica.hostId != lht_hostId:
            continue
        # fabricate orphans by copying the active replica directory
        # under a new, randomly named replica directory on each disk
        for _ in range(num_orphans):
            for disk_path in disk_paths:
                replica_dir_name = volume.name + "-" + generate_random_id(8)
                path = os.path.join(disk_path, "replicas", replica_dir_name)
                paths.append(path)
                exec_nsenter("cp -a {} {}".format(replica.dataPath, path))

    return paths
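# Example (values assumed for illustration): with two extra disks and
# num_orphans=5, the call below fabricates 5 directories per disk, so the
# orphan controller is expected to report 10 orphan CRs afterwards.
# paths = create_orphaned_directories_on_host(volume, disk_paths, 5)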
def test_hard_anti_affinity_offline_rebuild(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity can build new replicas during
    the attaching process once a valid node is available.

    Once a new replica has been built as part of the attaching process, the
    volume should be Healthy again.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to false
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
    6. Detach the volume.
    7. Enable current node's scheduling.
    8. Attach the volume again.
    9. Wait for volume to become healthy with 3 replicas
    10. Check volume `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    # materialize the name list; a lazy map() would be exhausted after
    # its first use
    replica_names = list(map(lambda replica: replica.name, volume.replicas))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    client.update(node, allowScheduling=True)

    volume.attach(hostId=host_id)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
def ha_salvage_test(client, volume_name, base_image=""): # NOQA volume = client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=2, baseImage=base_image) volume = common.wait_for_volume_detached(client, volume_name) assert volume["name"] == volume_name assert volume["size"] == SIZE assert volume["numberOfReplicas"] == 2 assert volume["state"] == "detached" assert volume["created"] != "" assert volume["baseImage"] == base_image host_id = get_self_host_id() volume = volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) assert len(volume["replicas"]) == 2 replica0_name = volume["replicas"][0]["name"] replica1_name = volume["replicas"][1]["name"] data = write_volume_random_data(volume) common.k8s_delete_replica_pods_for_volume(volume_name) volume = common.wait_for_volume_faulted(client, volume_name) assert len(volume["replicas"]) == 2 assert volume["replicas"][0]["failedAt"] != "" assert volume["replicas"][1]["failedAt"] != "" volume.salvage(names=[replica0_name, replica1_name]) volume = common.wait_for_volume_detached(client, volume_name) assert len(volume["replicas"]) == 2 assert volume["replicas"][0]["failedAt"] == "" assert volume["replicas"][1]["failedAt"] == "" volume = volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) check_volume_data(volume, data) volume = volume.detach() volume = common.wait_for_volume_detached(client, volume_name) client.delete(volume) common.wait_for_volume_delete(client, volume_name) volumes = client.list_volume() assert len(volumes) == 0
def test_attach_without_frontend(clients, volume_name):  # NOQA
    # take the first client; any one will do
    for host_id, client in clients.items():
        break

    volume = create_and_check_volume(client, volume_name)

    lht_hostId = get_self_host_id()
    volume.attach(hostId=lht_hostId, disableFrontend=False)
    common.wait_for_volume_healthy(client, volume_name)

    volume = client.by_id_volume(volume_name)
    assert volume["disableFrontend"] is False
    assert volume["frontend"] == "blockdev"

    snap1_data = write_volume_random_data(volume)
    snap1 = volume.snapshotCreate()

    write_volume_random_data(volume)
    volume.snapshotCreate()

    volume.detach()
    volume = common.wait_for_volume_detached(client, volume_name)

    volume.attach(hostId=lht_hostId, disableFrontend=True)
    common.wait_for_volume_healthy(client, volume_name)

    volume = client.by_id_volume(volume_name)
    engine = get_volume_engine(volume)
    assert volume["disableFrontend"] is True
    assert volume["frontend"] == "blockdev"
    assert engine["endpoint"] == ""

    volume.snapshotRevert(name=snap1["name"])

    volume.detach()
    volume = common.wait_for_volume_detached(client, volume_name)

    volume.attach(hostId=lht_hostId, disableFrontend=False)
    common.wait_for_volume_healthy(client, volume_name)

    volume = client.by_id_volume(volume_name)
    assert volume["disableFrontend"] is False
    assert volume["frontend"] == "blockdev"

    check_volume_data(volume, snap1_data)

    client.delete(volume)
    wait_for_volume_delete(client, volume_name)
def test_recurring_job_in_volume_creation(set_random_backupstore, client, volume_name):  # NOQA
    """
    Test create volume with recurring jobs

    1. Create volume with recurring jobs through the Longhorn API
    2. Verify the recurring jobs run correctly
    """
    host_id = get_self_host_id()

    # creating a volume with duplicate jobs must fail
    with pytest.raises(Exception) as e:
        client.create_volume(name=volume_name, size=SIZE,
                             numberOfReplicas=2,
                             recurringJobs=create_jobs1() + create_jobs1())
    assert "duplicate job" in str(e.value)

    client.create_volume(name=volume_name, size=SIZE,
                         numberOfReplicas=2, recurringJobs=create_jobs1())
    volume = common.wait_for_volume_detached(client, volume_name)

    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)

    # wait until the beginning of an even minute
    wait_until_begin_of_an_even_minute()
    # wait until the 10th second of the even minute
    # to avoid writing data while a backup is being taken
    time.sleep(10)

    write_volume_random_data(volume)
    time.sleep(150)  # 2.5 minutes
    write_volume_random_data(volume)
    time.sleep(150)  # 2.5 minutes

    check_jobs1_result(volume)

    volume = volume.detach(hostId="")
    common.wait_for_volume_detached(client, volume_name)

    client.delete(volume)
    wait_for_volume_delete(client, volume_name)

    volumes = client.list_volume()
    assert len(volumes) == 0
def backup_labels_test(clients, random_labels, volume_name, size=SIZE, base_image=""): # NOQA for _, client in clients.iteritems(): break host_id = get_self_host_id() volume = create_and_check_volume(client, volume_name, 2, size, base_image) volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) setting = client.by_id_setting(common.SETTING_BACKUP_TARGET) # test backupTarget for multiple settings backupstores = common.get_backupstore_url() for backupstore in backupstores: if common.is_backupTarget_s3(backupstore): backupsettings = backupstore.split("$") setting = client.update(setting, value=backupsettings[0]) assert setting["value"] == backupsettings[0] credential = client.by_id_setting( common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET) credential = client.update(credential, value=backupsettings[1]) assert credential["value"] == backupsettings[1] else: setting = client.update(setting, value=backupstore) assert setting["value"] == backupstore credential = client.by_id_setting( common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET) credential = client.update(credential, value="") assert credential["value"] == "" bv, b, _, _ = create_backup(client, volume_name, labels=random_labels) # If we're running the test with a BaseImage, check that this Label is # set properly. backup = bv.backupGet(name=b["name"]) if base_image: assert backup["labels"].get(common.BASE_IMAGE_LABEL) == base_image # One extra Label from the BaseImage being set. assert len(backup["labels"]) == len(random_labels) + 1 else: assert len(backup["labels"]) == len(random_labels) cleanup_volume(client, volume)
def test_tag_scheduling(client, node_default_tags):  # NOQA
    """
    Test that scheduling succeeds if there are available Nodes/Disks with
    the requested Tags.
    """
    host_id = get_self_host_id()
    tag_specs = [
        # Select all Nodes.
        {
            "disk": [],
            "expected": 3,
            "node": []
        },
        # Selector works with AND on Disk Tags.
        {
            "disk": ["ssd", "nvme"],
            "expected": 2,
            "node": []
        },
        # Selector works with AND on Node Tags.
        {
            "disk": [],
            "expected": 2,
            "node": ["main", "storage"]
        },
        # Selector works based on combined Disk AND Node selector.
        {
            "disk": ["ssd", "nvme"],
            "expected": 1,
            "node": ["storage", "main"]
        }
    ]
    for specs in tag_specs:
        volume_name = generate_volume_name()  # NOQA
        client.create_volume(name=volume_name, size=SIZE,
                             numberOfReplicas=3,
                             diskSelector=specs["disk"],
                             nodeSelector=specs["node"])
        volume = wait_for_volume_detached(client, volume_name)
        assert volume["diskSelector"] == specs["disk"]
        assert volume["nodeSelector"] == specs["node"]

        volume.attach(hostId=host_id)
        volume = wait_for_volume_healthy(client, volume_name)
        assert len(volume["replicas"]) == 3
        check_volume_replicas(volume, specs, node_default_tags)

        cleanup_volume(client, volume)
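# A hypothetical sketch of the property check_volume_replicas() verifies:
# selectors use AND semantics, so a replica may only land where the
# node/disk tag set is a superset of the volume's selector. For example,
# _placement_satisfies(["ssd", "nvme"], ["ssd", "nvme", "fast"]) is True.
def _placement_satisfies(selector, tags):
    return set(selector).issubset(set(tags))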
def test_volume_scheduling_failure(clients, volume_name):  # NOQA
    '''
    Test scheduling failure by disabling scheduling on all nodes.

    Also test that a scheduling-failed volume cannot be attached.
    '''
    client = get_random_client(clients)
    nodes = client.list_node()
    assert len(nodes) > 0

    for node in nodes:
        node = client.update(node, allowScheduling=False)
        node = common.wait_for_node_update(client, node["id"],
                                           "allowScheduling", False)

    volume = client.create_volume(name=volume_name, size=SIZE,
                                  numberOfReplicas=3)

    volume = common.wait_for_volume_condition_scheduled(client, volume_name,
                                                        "status",
                                                        CONDITION_STATUS_FALSE)
    volume = common.wait_for_volume_detached(client, volume_name)
    self_node = get_self_host_id()
    with pytest.raises(Exception) as e:
        volume.attach(hostId=self_node)
    assert "not scheduled" in str(e.value)

    for node in nodes:
        node = client.update(node, allowScheduling=True)
        node = common.wait_for_node_update(client, node["id"],
                                           "allowScheduling", True)

    volume = common.wait_for_volume_condition_scheduled(client, volume_name,
                                                        "status",
                                                        CONDITION_STATUS_TRUE)
    volume = common.wait_for_volume_detached(client, volume_name)
    volume = volume.attach(hostId=self_node)
    volume = common.wait_for_volume_healthy(client, volume_name)
    endpoint = get_volume_endpoint(volume)
    assert endpoint != ""
    volume_rw_test(endpoint)

    volume = volume.detach()
    volume = common.wait_for_volume_detached(client, volume_name)

    client.delete(volume)
    wait_for_volume_delete(client, volume_name)
def test_delete_orphan_after_orphaned_dir_is_deleted(client, volume_name, request):  # NOQA
    """
    Test the immediate deletion of orphan CRs after the orphaned replica
    directory is deleted

    1. Create a new disk for holding valid and invalid orphaned replica
       directories
    2. Create a volume and attach to the current node
    3. Create a valid orphaned replica directory by copying the active
       replica directory
    4. Clean up volume
    5. Verify orphan list contains the orphan CR for the orphaned replica
       directory
    6. Delete the on-disk orphaned replica directory
    7. Delete the orphan CR immediately
    8. Verify orphan list is empty
    """

    disk_names = ["vol-disk-" + generate_random_id(4)]

    # Step 1
    lht_hostId = get_self_host_id()
    cleanup_node_disks(client, lht_hostId)
    disk_paths = crate_disks_on_host(client, disk_names, request)

    # Step 2
    volume = create_volume_with_replica_on_host(client, volume_name)

    # Step 3
    orphaned_directories = create_orphaned_directories_on_host(volume,
                                                               disk_paths, 1)

    # Step 4
    cleanup_volume_by_name(client, volume_name)

    # Step 5
    assert wait_for_orphan_count(client, 1, 180) == 1

    # Step 6
    delete_orphaned_directory_on_host(orphaned_directories)

    # Step 7
    delete_orphans(client)

    # Step 8
    assert wait_for_orphan_count(client, 0, 180) == 0
def test_hard_anti_affinity_live_rebuild(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity can build new replicas live
    once a valid node is available.

    If no nodes without existing replicas are available, the volume should
    remain in "Degraded" state. However, once one is available, the replica
    should now be scheduled successfully, with the volume returning to
    "Healthy" state.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to false
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
    6. Enable the current node's scheduling
    7. Wait for volume to start rebuilding and become healthy again
    8. Check volume `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    # materialize the name list; a lazy map() would be exhausted after
    # its first use
    replica_names = list(map(lambda replica: replica.name, volume.replicas))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    # Allow scheduling on the host node again
    client.update(node, allowScheduling=True)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
def test_tag_scheduling_on_update(client, node_default_tags, volume_name):  # NOQA
    """
    Test that Replicas get scheduled if a Node/Disk is updated with the
    proper Tags.
    """
    tag_spec = {
        "disk": ["ssd", "m2"],
        "expected": 1,
        "node": ["main", "fallback"]
    }
    client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=3,
                         diskSelector=tag_spec["disk"],
                         nodeSelector=tag_spec["node"])
    volume = wait_for_volume_detached(client, volume_name)
    assert volume["diskSelector"] == tag_spec["disk"]
    assert volume["nodeSelector"] == tag_spec["node"]

    wait_scheduling_failure(client, volume_name)

    host_id = get_self_host_id()
    node = client.by_id_node(host_id)
    update_disks = get_update_disks(node["disks"])
    update_disks[0]["tags"] = tag_spec["disk"]
    node = node.diskUpdate(disks=update_disks)
    set_node_tags(client, node, tag_spec["node"])

    scheduled = False
    for i in range(RETRY_COUNTS):
        v = client.by_id_volume(volume_name)
        if v["conditions"]["scheduled"]["status"] == "True":
            scheduled = True
        if scheduled:
            break
        sleep(RETRY_INTERVAL)
    assert scheduled

    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    nodes = client.list_node()
    node_mapping = {node["id"]: {
        "disk": get_update_disks(node["disks"])[0]["tags"],
        "node": node["tags"]
    } for node in nodes}
    assert len(volume["replicas"]) == 3
    check_volume_replicas(volume, tag_spec, node_mapping)

    cleanup_volume(client, volume)
def test_deleting_backup_volume(clients):  # NOQA
    # take the first client; any one will do
    for host_id, client in clients.items():
        break
    lht_hostId = get_self_host_id()

    volName = generate_volume_name()
    volume = create_and_check_volume(client, volName)
    volume.attach(hostId=lht_hostId)
    volume = common.wait_for_volume_healthy(client, volName)

    bv, _, snap1, _ = create_backup(client, volName)
    _, _, snap2, _ = create_backup(client, volName)

    bv = client.by_id_backupVolume(volName)
    client.delete(bv)
    common.wait_for_backup_volume_delete(client, volName)
    cleanup_volume(client, volume)
def test_soft_anti_affinity_detach(client, volume_name):  # NOQA
    """
    Test that volumes with Soft Anti-Affinity can detach and reattach to a
    node properly.

    1. Create a volume and attach to the current node.
    2. Generate and write `data` to the volume
    3. Set `soft anti-affinity` to true
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
    6. Wait for the new replica to be rebuilt
    7. Detach the volume.
    8. Verify there are 3 replicas
    9. Attach the volume again. Verify there are still 3 replicas
    10. Verify the `data`.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="true")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name, volume.replicas))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    assert len(volume.replicas) == 3

    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    check_volume_data(volume, data)
    cleanup_volume(client, volume)
def test_update_node(client):  # NOQA
    # test node update
    nodes = client.list_node()
    assert len(nodes) > 0

    lht_hostId = get_self_host_id()
    node = client.by_id_node(lht_hostId)
    node = client.update(node, allowScheduling=False)
    node = common.wait_for_node_update(client, lht_hostId,
                                       "allowScheduling", False)
    assert not node["allowScheduling"]
    node = client.by_id_node(lht_hostId)
    assert not node["allowScheduling"]

    node = client.update(node, allowScheduling=True)
    node = common.wait_for_node_update(client, lht_hostId,
                                       "allowScheduling", True)
    assert node["allowScheduling"]
    node = client.by_id_node(lht_hostId)
    assert node["allowScheduling"]
def test_node_controller_sync_storage_scheduled(client):  # NOQA
    lht_hostId = get_self_host_id()

    nodes = client.list_node()
    for node in nodes:
        for fsid, disk in node["disks"].items():
            # wait for node controller to update disk status
            wait_for_disk_status(client, node["name"], fsid,
                                 "storageScheduled", 0)

    # create a volume and test updating StorageScheduled of each node
    vol_name = common.generate_volume_name()
    volume = create_volume(client, vol_name,
                           str(SMALL_DISK_SIZE), lht_hostId, len(nodes))
    replicas = volume["replicas"]
    for replica in replicas:
        id = replica["hostId"]
        assert id != ""
        assert replica["running"]

    # wait for node controller to update disk status
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            wait_for_disk_status(client, node["name"], fsid,
                                 "storageScheduled", SMALL_DISK_SIZE)

    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for replica in replicas:
            if replica["hostId"] == node["name"]:
                disk = disks[replica["diskID"]]
                conditions = disk["conditions"]
                assert disk["storageScheduled"] == SMALL_DISK_SIZE
                assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \
                    CONDITION_STATUS_TRUE
                break

    # clean volumes
    cleanup_volume(client, vol_name)
def test_replica_scheduler_just_under_over_provisioning(client):  # NOQA
    over_provisioning_setting = client.by_id_setting(
        SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE)
    old_provisioning_setting = over_provisioning_setting["value"]
    # set the storage over-provisioning percentage to 100
    over_provisioning_setting = client.update(over_provisioning_setting,
                                              value="100")

    lht_hostId = get_self_host_id()
    nodes = client.list_node()
    expect_node_disk = {}
    max_size_array = []
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            if disk["path"] == DEFAULT_DISK_PATH:
                expect_disk = disk
                expect_disk["fsid"] = fsid
                expect_node_disk[node["name"]] = expect_disk
                max_size_array.append(disk["storageMaximum"])
            disk["storageReserved"] = 0
        update_disks = get_update_disks(disks)
        node = node.diskUpdate(disks=update_disks)
        disks = node["disks"]
        for fsid, disk in disks.items():
            wait_for_disk_status(client, node["name"], fsid,
                                 "storageReserved", 0)

    max_size = min(max_size_array)
    # a volume just under the over-provisioning limit should be schedulable
    vol_name = common.generate_volume_name()
    volume = client.create_volume(name=vol_name, size=str(max_size),
                                  numberOfReplicas=len(nodes))

    volume = common.wait_for_volume_condition_scheduled(client, vol_name,
                                                        "status",
                                                        CONDITION_STATUS_TRUE)
    volume = common.wait_for_volume_detached(client, vol_name)
    assert volume["state"] == "detached"
    assert volume["created"] != ""

    volume.attach(hostId=lht_hostId)
    volume = common.wait_for_volume_healthy(client, vol_name)
    nodes = client.list_node()
    node_hosts = []
    for node in nodes:
        node_hosts.append(node["name"])
    # every replica should be scheduled to the default disk
    for replica in volume["replicas"]:
        id = replica["hostId"]
        assert id != ""
        assert replica["running"]
        expect_disk = expect_node_disk[id]
        assert replica["diskID"] == expect_disk["fsid"]
        assert expect_disk["path"] in replica["dataPath"]
        node_hosts = list(filter(lambda x: x != id, node_hosts))
    assert len(node_hosts) == 0

    # clean up the volume and restore the setting
    cleanup_volume(client, vol_name)
    client.update(over_provisioning_setting,
                  value=old_provisioning_setting)
def test_replica_scheduler_large_volume_fit_small_disk(client):  # NOQA
    nodes = client.list_node()
    # create a small disk on the current node
    lht_hostId = get_self_host_id()
    node = client.by_id_node(lht_hostId)
    small_disk_path = create_host_disk(client, "vol-small",
                                       SIZE, lht_hostId)
    small_disk = {"path": small_disk_path, "allowScheduling": True}
    update_disks = get_update_disks(node["disks"])
    update_disks.append(small_disk)
    node = node.diskUpdate(disks=update_disks)
    node = common.wait_for_disk_update(client, lht_hostId,
                                       len(update_disks))
    assert len(node["disks"]) == len(update_disks)

    unexpected_disk = {}
    for fsid, disk in node["disks"].items():
        if disk["path"] == small_disk_path:
            unexpected_disk["fsid"] = fsid
            unexpected_disk["path"] = disk["path"]
            break

    # the volume is too large to fit into the small disk on the current node
    vol_name = common.generate_volume_name()
    volume = create_volume(client, vol_name, str(Gi), lht_hostId, len(nodes))

    nodes = client.list_node()
    node_hosts = []
    for node in nodes:
        node_hosts.append(node["name"])
    # the replica on the current node must not be scheduled to the small disk
    for replica in volume["replicas"]:
        id = replica["hostId"]
        assert id != ""
        assert replica["running"]
        if id == lht_hostId:
            assert replica["diskID"] != unexpected_disk["fsid"]
            assert replica["dataPath"] != unexpected_disk["path"]
        node_hosts = list(filter(lambda x: x != id, node_hosts))
    assert len(node_hosts) == 0

    cleanup_volume(client, vol_name)

    # cleanup test disks
    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    disk = disks[unexpected_disk["fsid"]]
    disk["allowScheduling"] = False
    update_disks = get_update_disks(disks)
    node = node.diskUpdate(disks=update_disks)
    node = wait_for_disk_status(client, lht_hostId,
                                unexpected_disk["fsid"],
                                "allowScheduling", False)
    disks = node["disks"]
    disk = disks[unexpected_disk["fsid"]]
    assert not disk["allowScheduling"]
    disks.pop(unexpected_disk["fsid"])
    update_disks = get_update_disks(disks)
    node.diskUpdate(disks=update_disks)
    cleanup_host_disk(client, 'vol-small')
def test_replica_scheduler_too_large_volume_fit_any_disks(client):  # NOQA
    nodes = client.list_node()
    lht_hostId = get_self_host_id()
    expect_node_disk = {}
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            if disk["path"] == DEFAULT_DISK_PATH:
                expect_disk = disk
                expect_disk["fsid"] = fsid
                expect_node_disk[node["name"]] = expect_disk
            disk["storageReserved"] = disk["storageMaximum"]
        update_disks = get_update_disks(disks)
        node.diskUpdate(disks=update_disks)

    # the volume is too large to fit on any disk
    vol_name = common.generate_volume_name()
    volume = client.create_volume(name=vol_name, size=str(4*Gi),
                                  numberOfReplicas=len(nodes))
    volume = common.wait_for_volume_condition_scheduled(client, vol_name,
                                                        "status",
                                                        CONDITION_STATUS_FALSE)

    # reduce StorageReserved of each default disk
    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        update_disks = get_update_disks(disks)
        for disk in update_disks:
            disk["storageReserved"] = 0
        node = node.diskUpdate(disks=update_disks)
        disks = node["disks"]
        for fsid, disk in disks.items():
            wait_for_disk_status(client, node["name"], fsid,
                                 "storageReserved", 0)

    # check volume status
    volume = common.wait_for_volume_condition_scheduled(client, vol_name,
                                                        "status",
                                                        CONDITION_STATUS_TRUE)
    volume = common.wait_for_volume_detached(client, vol_name)
    assert volume["state"] == "detached"
    assert volume["created"] != ""

    volume.attach(hostId=lht_hostId)
    volume = common.wait_for_volume_healthy(client, vol_name)
    nodes = client.list_node()
    node_hosts = []
    for node in nodes:
        node_hosts.append(node["name"])
    # every replica should be scheduled to the default disk
    for replica in volume["replicas"]:
        id = replica["hostId"]
        assert id != ""
        assert replica["running"]
        expect_disk = expect_node_disk[id]
        assert replica["diskID"] == expect_disk["fsid"]
        assert expect_disk["path"] in replica["dataPath"]
        node_hosts = list(filter(lambda x: x != id, node_hosts))
    assert len(node_hosts) == 0

    # clean volume and disk
    cleanup_volume(client, vol_name)
def test_replica_cleanup(client):  # NOQA
    nodes = client.list_node()
    lht_hostId = get_self_host_id()

    node = client.by_id_node(lht_hostId)
    extra_disk_path = create_host_disk(client, "extra-disk",
                                       "10G", lht_hostId)
    extra_disk = {"path": extra_disk_path, "allowScheduling": True}
    update_disks = get_update_disks(node["disks"])
    update_disks.append(extra_disk)
    node = node.diskUpdate(disks=update_disks)
    node = common.wait_for_disk_update(client, lht_hostId,
                                       len(update_disks))
    assert len(node["disks"]) == len(update_disks)

    extra_disk_fsid = ""
    for fsid, disk in node["disks"].items():
        if disk["path"] == extra_disk_path:
            extra_disk_fsid = fsid
            break

    for node in nodes:
        # disable all the disks except the ones on the current node
        if node["name"] == lht_hostId:
            continue
        # grab the node's first (and only) disk
        for fsid, disk in node["disks"].items():
            break
        disk["allowScheduling"] = False
        update_disks = get_update_disks(node["disks"])
        node.diskUpdate(disks=update_disks)
        node = wait_for_disk_status(client, node["name"], fsid,
                                    "allowScheduling", False)

    vol_name = common.generate_volume_name()
    # more replicas, to make sure both the default and the extra disk
    # will get one
    volume = create_volume(client, vol_name, str(Gi), lht_hostId, 5)

    data_paths = []
    for replica in volume["replicas"]:
        data_paths.append(replica["dataPath"])

    # data paths should exist now
    for data_path in data_paths:
        assert exec_nsenter("ls {}".format(data_path))

    cleanup_volume(client, vol_name)

    # data paths should be gone due to the cleanup of replicas
    for data_path in data_paths:
        with pytest.raises(subprocess.CalledProcessError):
            exec_nsenter("ls {}".format(data_path))

    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    disk = disks[extra_disk_fsid]
    disk["allowScheduling"] = False
    update_disks = get_update_disks(disks)
    node = node.diskUpdate(disks=update_disks)
    node = wait_for_disk_status(client, lht_hostId,
                                extra_disk_fsid,
                                "allowScheduling", False)
    wait_for_disk_status(client, lht_hostId, extra_disk_fsid,
                         "storageScheduled", 0)

    disks = node["disks"]
    disk = disks[extra_disk_fsid]
    assert not disk["allowScheduling"]
    disks.pop(extra_disk_fsid)
    update_disks = get_update_disks(disks)
    node.diskUpdate(disks=update_disks)
    node = common.wait_for_disk_update(client, lht_hostId,
                                       len(update_disks))

    cleanup_host_disk(client, 'extra-disk')
def test_node_delete_umount_disks(client):  # NOQA
    # create test disks for node
    disk_volume_name = 'vol-disk-1'
    lht_hostId = get_self_host_id()
    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    disk_path1 = create_host_disk(client, disk_volume_name,
                                  str(Gi), lht_hostId)
    disk1 = {"path": disk_path1, "allowScheduling": True,
             "storageReserved": SMALL_DISK_SIZE}

    update_disk = get_update_disks(disks)
    for disk in update_disk:
        disk["allowScheduling"] = False
    # add new disk for node
    update_disk.append(disk1)
    # save disks to node
    node = node.diskUpdate(disks=update_disk)
    node = common.wait_for_disk_update(client, lht_hostId,
                                       len(update_disk))
    assert len(node["disks"]) == len(update_disk)
    node = client.by_id_node(lht_hostId)
    assert len(node["disks"]) == len(update_disk)

    disks = node["disks"]
    # wait for node controller to update disk status
    for fsid, disk in disks.items():
        if disk["path"] == disk_path1:
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "allowScheduling", True)
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "storageReserved", SMALL_DISK_SIZE)
            free, total = common.get_host_disk_size(disk_path1)
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "storageAvailable", free)
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "storageMaximum", total)

    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    for key, disk in disks.items():
        if disk["path"] == disk_path1:
            assert disk["allowScheduling"]
            assert disk["storageReserved"] == SMALL_DISK_SIZE
            assert disk["storageScheduled"] == 0
            free, total = common.get_host_disk_size(disk_path1)
            assert disk["storageMaximum"] == total
            assert disk["storageAvailable"] == free
            conditions = disk["conditions"]
            assert conditions[DISK_CONDITION_READY]["status"] == \
                CONDITION_STATUS_TRUE
            assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \
                CONDITION_STATUS_TRUE
        else:
            assert not disk["allowScheduling"]

    # create a volume
    nodes = client.list_node()
    vol_name = common.generate_volume_name()
    volume = create_volume(client, vol_name, str(SMALL_DISK_SIZE),
                           lht_hostId, len(nodes))
    replicas = volume["replicas"]
    for replica in replicas:
        id = replica["hostId"]
        assert id != ""
        assert replica["running"]
        if id == lht_hostId:
            assert replica["dataPath"].startswith(disk_path1)

    # umount the disk
    mount_path = os.path.join(DIRECTORY_PATH, disk_volume_name)
    common.umount_disk(mount_path)

    # wait for node controller to update node status
    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    for fsid, disk in disks.items():
        if disk["path"] == disk_path1:
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "allowScheduling", False)
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "storageMaximum", 0)
            wait_for_disk_conditions(client, lht_hostId, fsid,
                                     DISK_CONDITION_READY,
                                     CONDITION_STATUS_FALSE)

    # check result
    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    update_disks = []
    for fsid, disk in disks.items():
        if disk["path"] == disk_path1:
            assert not disk["allowScheduling"]
            assert disk["storageMaximum"] == 0
            assert disk["storageAvailable"] == 0
            assert disk["storageReserved"] == SMALL_DISK_SIZE
            assert disk["storageScheduled"] == SMALL_DISK_SIZE
            conditions = disk["conditions"]
            assert conditions[DISK_CONDITION_READY]["status"] == \
                CONDITION_STATUS_FALSE
            assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \
                CONDITION_STATUS_FALSE
        else:
            conditions = disk["conditions"]
            assert conditions[DISK_CONDITION_READY]["status"] == \
                CONDITION_STATUS_TRUE
            assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \
                CONDITION_STATUS_TRUE
            update_disks.append(disk)

    # deleting the umounted disk while replicas still live on it must fail
    with pytest.raises(Exception) as e:
        node.diskUpdate(disks=update_disks)
    assert "disable the disk" in str(e.value)

    # update other disks
    disks = node["disks"]
    for fsid, disk in disks.items():
        if disk["path"] != disk_path1:
            disk["allowScheduling"] = True
    test_update = get_update_disks(disks)
    node = node.diskUpdate(disks=test_update)
    disks = node["disks"]
    for fsid, disk in disks.items():
        if disk["path"] != disk_path1:
            wait_for_disk_status(client, lht_hostId,
                                 fsid, "allowScheduling", True)
    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    for fsid, disk in disks.items():
        if disk["path"] != disk_path1:
            assert disk["allowScheduling"]

    # mount the disk back
    mount_path = os.path.join(DIRECTORY_PATH, disk_volume_name)
    disk_volume = client.by_id_volume(disk_volume_name)
    dev = get_volume_endpoint(disk_volume)
    common.mount_disk(dev, mount_path)

    # wait for node controller to update node status
    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    for fsid, disk in disks.items():
        if disk["path"] == disk_path1:
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "allowScheduling", False)
            wait_for_disk_conditions(client, lht_hostId, fsid,
                                     DISK_CONDITION_READY,
                                     CONDITION_STATUS_TRUE)

    # check result
    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    for fsid, disk in disks.items():
        if disk["path"] == disk_path1:
            free, total = common.get_host_disk_size(disk_path1)
            assert not disk["allowScheduling"]
            assert disk["storageMaximum"] == total
            assert disk["storageAvailable"] == free
            assert disk["storageReserved"] == SMALL_DISK_SIZE
            assert disk["storageScheduled"] == SMALL_DISK_SIZE
            conditions = disk["conditions"]
            assert conditions[DISK_CONDITION_READY]["status"] == \
                CONDITION_STATUS_TRUE
            assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \
                CONDITION_STATUS_TRUE
        else:
            conditions = disk["conditions"]
            assert conditions[DISK_CONDITION_READY]["status"] == \
                CONDITION_STATUS_TRUE
            assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \
                CONDITION_STATUS_TRUE

    # delete volume and umount disk
    cleanup_volume(client, vol_name)
    mount_path = os.path.join(DIRECTORY_PATH, disk_volume_name)
    common.umount_disk(mount_path)

    # wait for node controller to update node status
    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    for fsid, disk in disks.items():
        if disk["path"] == disk_path1:
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "allowScheduling", False)
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "storageScheduled", 0)
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "storageMaximum", 0)

    # test deleting the umounted disk; it should succeed now
    node = client.by_id_node(lht_hostId)
    node.diskUpdate(disks=update_disks)
    node = common.wait_for_disk_update(client, lht_hostId,
                                       len(update_disks))
    assert len(node["disks"]) == len(update_disks)

    cmd = ['rm', '-r', mount_path]
    subprocess.check_call(cmd)
def test_node_disk_update(client):  # NOQA
    lht_hostId = get_self_host_id()
    node = client.by_id_node(lht_hostId)
    disks = node["disks"]

    # test add same disk by different mount path exception
    with pytest.raises(Exception) as e:
        disk = {"path": "/var/lib", "allowScheduling": True,
                "storageReserved": 2 * Gi}
        update_disk = get_update_disks(disks)
        update_disk.append(disk)
        node = node.diskUpdate(disks=update_disk)
    assert "the same file system" in str(e.value)

    # test delete disk exception
    with pytest.raises(Exception) as e:
        node.diskUpdate(disks=[])
    assert "disable the disk" in str(e.value)

    # test storageReserved invalid exception
    with pytest.raises(Exception) as e:
        for fsid, disk in disks.iteritems():
            disk["storageReserved"] = disk["storageMaximum"] + 1*Gi
        update_disk = get_update_disks(disks)
        node.diskUpdate(disks=update_disk)
    assert "storageReserved setting of disk" in str(e.value)

    # create multiple disks for node
    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    disk_path1 = create_host_disk(client, 'vol-disk-1',
                                  str(Gi), lht_hostId)
    disk1 = {"path": disk_path1, "allowScheduling": True}
    disk_path2 = create_host_disk(client, 'vol-disk-2',
                                  str(Gi), lht_hostId)
    disk2 = {"path": disk_path2, "allowScheduling": True}

    update_disk = get_update_disks(disks)
    # add new disk for node
    update_disk.append(disk1)
    update_disk.append(disk2)

    # save disks to node
    node = node.diskUpdate(disks=update_disk)
    node = common.wait_for_disk_update(client, lht_hostId,
                                       len(update_disk))
    assert len(node["disks"]) == len(update_disk)
    node = client.by_id_node(lht_hostId)
    assert len(node["disks"]) == len(update_disk)

    # update disk
    disks = node["disks"]
    update_disk = get_update_disks(disks)
    for disk in update_disk:
        # keep default disk for other tests
        if disk["path"] == disk_path1 or disk["path"] == disk_path2:
            disk["allowScheduling"] = False
            disk["storageReserved"] = SMALL_DISK_SIZE
    node = node.diskUpdate(disks=update_disk)
    disks = node["disks"]
    # wait for node controller to update disk status
    for fsid, disk in disks.iteritems():
        if disk["path"] == disk_path1 or disk["path"] == disk_path2:
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "allowScheduling", False)
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "storageReserved", SMALL_DISK_SIZE)
            free, total = common.get_host_disk_size(disk_path1)
            wait_for_disk_status(client, lht_hostId, fsid,
                                 "storageAvailable", free)

    node = client.by_id_node(lht_hostId)
    disks = node["disks"]
    for key, disk in disks.iteritems():
        if disk["path"] == disk_path1:
            assert not disk["allowScheduling"]
            assert disk["storageReserved"] == SMALL_DISK_SIZE
            assert disk["storageScheduled"] == 0
            free, total = common.get_host_disk_size(disk_path1)
            assert disk["storageMaximum"] == total
            assert disk["storageAvailable"] == free
        elif disk["path"] == disk_path2:
            assert not disk["allowScheduling"]
            assert disk["storageReserved"] == SMALL_DISK_SIZE
            assert disk["storageScheduled"] == 0
            free, total = common.get_host_disk_size(disk_path2)
            assert disk["storageMaximum"] == total
            assert disk["storageAvailable"] == free

    # delete other disks, just remain default disk
    update_disk = get_update_disks(disks)
    remain_disk = []
    for disk in update_disk:
        if disk["path"] != disk_path1 and disk["path"] != disk_path2:
            remain_disk.append(disk)
    node = node.diskUpdate(disks=remain_disk)
    node = wait_for_disk_update(client, lht_hostId,
                                len(remain_disk))
    assert len(node["disks"]) == len(remain_disk)

    # cleanup disks
    cleanup_host_disk(client, 'vol-disk-1', 'vol-disk-2')
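# cleanup_host_disk() is called above but not defined in this file. A hedged
# sketch of the likely flow, assuming each test disk is backed by a Longhorn
# volume mounted under DIRECTORY_PATH that must be unmounted, detached, and
# deleted; the name and exact steps are illustrative, not the real helper.
def cleanup_host_disk_sketch(client, *volume_names):
    for name in volume_names:
        # unmount the filesystem backed by this volume, then tear it down
        common.umount_disk(os.path.join(DIRECTORY_PATH, name))
        volume = client.by_id_volume(name)
        volume.detach()
        volume = common.wait_for_volume_detached(client, name)
        client.delete(volume)
        common.wait_for_volume_delete(client, name)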
def ha_backup_deletion_recovery_test(client, volume_name, size,
                                     base_image=""):  # NOQA
    volume = client.create_volume(name=volume_name, size=size,
                                  numberOfReplicas=2,
                                  baseImage=base_image)
    volume = common.wait_for_volume_detached(client, volume_name)

    host_id = get_self_host_id()
    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    setting = client.by_id_setting(common.SETTING_BACKUP_TARGET)
    # test backupTarget for multiple settings
    backupstores = common.get_backupstore_url()
    for backupstore in backupstores:
        if common.is_backupTarget_s3(backupstore):
            backupsettings = backupstore.split("$")
            setting = client.update(setting, value=backupsettings[0])
            assert setting["value"] == backupsettings[0]

            credential = client.by_id_setting(
                common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential,
                                       value=backupsettings[1])
            assert credential["value"] == backupsettings[1]
        else:
            setting = client.update(setting, value=backupstore)
            assert setting["value"] == backupstore
            credential = client.by_id_setting(
                common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value="")
            assert credential["value"] == ""

        data = write_volume_random_data(volume)
        snap2 = volume.snapshotCreate()
        volume.snapshotCreate()

        volume.snapshotBackup(name=snap2["name"])

        _, b = common.find_backup(client, volume_name, snap2["name"])

        res_name = common.generate_volume_name()
        res_volume = client.create_volume(name=res_name, size=size,
                                          numberOfReplicas=2,
                                          fromBackup=b["url"])
        res_volume = common.wait_for_volume_detached(client, res_name)
        res_volume = res_volume.attach(hostId=host_id)
        res_volume = common.wait_for_volume_healthy(client, res_name)
        check_volume_data(res_volume, data)

        snapshots = res_volume.snapshotList()
        # only the backup snapshot + volume-head
        assert len(snapshots) == 2
        backup_snapshot = ""
        for snap in snapshots:
            if snap["name"] != "volume-head":
                backup_snapshot = snap["name"]
        assert backup_snapshot != ""

        res_volume.snapshotCreate()
        snapshots = res_volume.snapshotList()
        assert len(snapshots) == 3

        res_volume.snapshotDelete(name=backup_snapshot)
        res_volume.snapshotPurge()
        snapshots = res_volume.snapshotList()
        assert len(snapshots) == 2

        ha_rebuild_replica_test(client, res_name)

        res_volume = res_volume.detach()
        res_volume = common.wait_for_volume_detached(client, res_name)

        client.delete(res_volume)
        common.wait_for_volume_delete(client, res_name)

    volume = volume.detach()
    volume = common.wait_for_volume_detached(client, volume_name)

    client.delete(volume)
    common.wait_for_volume_delete(client, volume_name)

    volumes = client.list_volume()
    assert len(volumes) == 0
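# ha_backup_deletion_recovery_test() splits S3 backupstore entries on "$",
# which implies a "<target-url>$<secret-name>" format for each entry. The
# example below illustrates that assumption with made-up values; the actual
# entries come from common.get_backupstore_url().
def split_s3_backupstore_example():
    backupstore = "s3://backupbucket@us-east-1/backupstore$minio-secret"
    target, secret = backupstore.split("$")
    assert target == "s3://backupbucket@us-east-1/backupstore"
    assert secret == "minio-secret"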
def test_replica_scheduler_update_minimal_available(client):  # NOQA
    minimal_available_setting = client.by_id_setting(
        SETTING_STORAGE_MINIMAL_AVAILABLE_PERCENTAGE)
    old_minimal_setting = minimal_available_setting["value"]

    nodes = client.list_node()
    expect_node_disk = {}
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.iteritems():
            if disk["path"] == DEFAULT_DISK_PATH:
                expect_disk = disk
                expect_disk["fsid"] = fsid
                expect_node_disk[node["name"]] = expect_disk

    # set storage minimal available percentage to 100
    # to test all replica couldn't be scheduled
    minimal_available_setting = client.update(minimal_available_setting,
                                              value="100")
    # wait for disks state
    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.iteritems():
            wait_for_disk_conditions(client, node["name"], fsid,
                                     DISK_CONDITION_SCHEDULABLE,
                                     CONDITION_STATUS_FALSE)

    lht_hostId = get_self_host_id()
    vol_name = common.generate_volume_name()
    volume = client.create_volume(name=vol_name, size=SIZE,
                                  numberOfReplicas=len(nodes))
    volume = common.wait_for_volume_condition_scheduled(
        client, vol_name, "status", CONDITION_STATUS_FALSE)

    # set storage minimal available percentage to default value(10)
    minimal_available_setting = client.update(minimal_available_setting,
                                              value=old_minimal_setting)
    # wait for disks state
    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.iteritems():
            wait_for_disk_conditions(client, node["name"], fsid,
                                     DISK_CONDITION_SCHEDULABLE,
                                     CONDITION_STATUS_TRUE)

    # check volume status
    volume = common.wait_for_volume_condition_scheduled(
        client, vol_name, "status", CONDITION_STATUS_TRUE)
    volume = common.wait_for_volume_detached(client, vol_name)
    assert volume["state"] == "detached"
    assert volume["created"] != ""

    volume.attach(hostId=lht_hostId)
    volume = common.wait_for_volume_healthy(client, vol_name)

    nodes = client.list_node()
    node_hosts = []
    for node in nodes:
        node_hosts.append(node["name"])
    # check all replica should be scheduled to default disk
    for replica in volume["replicas"]:
        id = replica["hostId"]
        assert id != ""
        assert replica["running"]
        expect_disk = expect_node_disk[id]
        assert replica["diskID"] == expect_disk["fsid"]
        assert expect_disk["path"] in replica["dataPath"]
        node_hosts = filter(lambda x: x != id, node_hosts)
    assert len(node_hosts) == 0

    # clean volume and disk
    cleanup_volume(client, vol_name)
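# test_replica_scheduler_update_minimal_available() drives scheduling by
# changing the minimal available percentage. A hedged sketch of the per-disk
# check the scheduler presumably applies: with the setting at 100, no disk
# can keep its available storage above the threshold, so every disk becomes
# unschedulable. This is an illustration, not the scheduler's actual code.
def disk_schedulable_sketch(disk, minimal_available_percentage):
    # threshold is the required floor of free space on the disk
    # (integer division is fine here since sizes are byte counts)
    threshold = disk["storageMaximum"] * minimal_available_percentage / 100
    return disk["storageAvailable"] > threshold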