def delete_replica(client, volume_name):
    volume = client.by_id_volume(volume_name)
    replica_count = len(volume.replicas)

    healthy_replica_count = 0
    for replica in volume.replicas:
        if replica.running is True and replica.mode == "RW":
            healthy_replica_count += 1

    # return if there is no healthy replica, or only one, left
    if healthy_replica_count == 1:
        print("skipped, only one healthy replica found", end=" ")
        return
    if healthy_replica_count == 0:
        print("skipped, no healthy replicas found", end=" ")
        return

    replica_id = randrange(0, replica_count)
    replica_name = volume["replicas"][replica_id]["name"]
    volume.replicaRemove(name=replica_name)
    wait_for_volume_degraded(client, volume_name)

    # Decide once per run whether to wait for the rebuild to finish;
    # the choice is cached in the module-level flag.
    global WAIT_REPLICA_REBUILD
    if WAIT_REPLICA_REBUILD is None:
        WAIT_REPLICA_REBUILD = bool(random.getrandbits(1))

    if WAIT_REPLICA_REBUILD is True:
        wait_for_volume_replica_count(client, volume_name, replica_count)
        replica_names = map(lambda replica: replica.name,
                            volume["replicas"])
        wait_new_replica_ready(client, volume_name, replica_names)
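
# --- Illustrative sketch, not upstream code ---
# delete_replica() above relies on imports and a module-level flag that are
# defined elsewhere in the test module. The declarations below show one
# plausible setup; the names are taken from the function body, everything
# else is an assumption.
import random
from random import randrange

# None means "not decided yet": the first delete_replica() call picks
# randomly whether to wait for rebuilds, and later calls reuse that choice.
WAIT_REPLICA_REBUILD = None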
def test_volume_update_replica_count(clients, volume_name):  # NOQA
    for host_id, client in clients.items():
        break

    replica_count = 3
    volume = create_and_check_volume(client, volume_name, replica_count)

    volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    # Increase the replica count: the volume degrades while the new
    # replicas rebuild, then becomes healthy again.
    replica_count = 5
    volume = volume.updateReplicaCount(replicaCount=replica_count)
    volume = common.wait_for_volume_degraded(client, volume_name)
    volume = common.wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == replica_count

    # Decreasing the replica count does not delete existing replicas;
    # they stay until removed explicitly.
    old_replica_count = replica_count
    replica_count = 2
    volume = volume.updateReplicaCount(replicaCount=replica_count)
    volume = common.wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == old_replica_count

    volume.replicaRemove(name=volume["replicas"][0]["name"])
    volume.replicaRemove(name=volume["replicas"][1]["name"])
    volume.replicaRemove(name=volume["replicas"][2]["name"])
    volume = common.wait_for_volume_replica_count(client, volume_name,
                                                  replica_count)
    volume = common.wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == replica_count

    client.delete(volume)
    wait_for_volume_delete(client, volume_name)
def engine_live_upgrade_rollback_test(client, volume_name, base_image=""):  # NOQA
    default_img = common.get_default_engine_image(client)
    default_img_name = default_img["name"]
    default_img = wait_for_engine_image_ref_count(client, default_img_name, 0)
    cli_v = default_img["cliAPIVersion"]
    cli_minv = default_img["cliAPIMinVersion"]
    ctl_v = default_img["controllerAPIVersion"]
    ctl_minv = default_img["controllerAPIMinVersion"]
    data_v = default_img["dataFormatVersion"]
    data_minv = default_img["dataFormatMinVersion"]

    wrong_engine_upgrade_image = common.get_compatibility_test_image(
        cli_v, cli_minv, ctl_v, ctl_minv, data_v, data_minv)
    new_img = client.create_engine_image(image=wrong_engine_upgrade_image)
    new_img_name = new_img["name"]
    new_img = wait_for_engine_image_state(client, new_img_name, "ready")
    assert new_img["refCount"] == 0
    assert new_img["noRefSince"] != ""

    default_img = common.get_default_engine_image(client)
    default_img_name = default_img["name"]

    volume = client.create_volume(name=volume_name, size=SIZE,
                                  numberOfReplicas=2,
                                  baseImage=base_image)
    volume = common.wait_for_volume_detached(client, volume_name)
    default_img = wait_for_engine_image_ref_count(client, default_img_name, 1)

    assert volume["baseImage"] == base_image

    original_engine_image = volume["engineImage"]
    assert original_engine_image != wrong_engine_upgrade_image

    host_id = get_self_host_id()
    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    data = write_volume_random_data(volume)

    volume.engineUpgrade(image=wrong_engine_upgrade_image)
    volume = client.by_id_volume(volume["name"])
    assert volume["engineImage"] == wrong_engine_upgrade_image
    assert volume["currentImage"] == original_engine_image

    with pytest.raises(Exception):
        # this will timeout
        wait_for_volume_current_image(client, volume_name,
                                      wrong_engine_upgrade_image)

    # rollback
    volume.engineUpgrade(image=original_engine_image)
    volume = wait_for_volume_current_image(client, volume_name,
                                           original_engine_image)
    assert volume["engineImage"] == original_engine_image
    assert volume["currentImage"] == original_engine_image
    engine = get_volume_engine(volume)
    assert engine["engineImage"] == original_engine_image
    assert engine["currentImage"] == original_engine_image

    volume = common.wait_for_volume_replica_count(client, volume_name,
                                                  REPLICA_COUNT)
    check_volume_data(volume, data)

    assert volume["state"] == common.VOLUME_STATE_ATTACHED
    assert volume["robustness"] == common.VOLUME_ROBUSTNESS_HEALTHY

    # try again, this time let's try detach
    volume.engineUpgrade(image=wrong_engine_upgrade_image)
    volume = client.by_id_volume(volume["name"])
    assert volume["engineImage"] == wrong_engine_upgrade_image
    assert volume["currentImage"] == original_engine_image

    with pytest.raises(Exception):
        # this will timeout
        wait_for_volume_current_image(client, volume_name,
                                      wrong_engine_upgrade_image)

    volume = volume.detach()
    volume = wait_for_volume_current_image(client, volume_name,
                                           wrong_engine_upgrade_image)
    # all the images would be updated
    assert volume["engineImage"] == wrong_engine_upgrade_image
    engine = get_volume_engine(volume)
    assert engine["engineImage"] == wrong_engine_upgrade_image
    volume = common.wait_for_volume_replica_count(client, volume_name,
                                                  REPLICA_COUNT)
    for replica in volume["replicas"]:
        assert replica["engineImage"] == wrong_engine_upgrade_image

    # upgrade to the correct image when offline
    volume.engineUpgrade(image=original_engine_image)
    volume = client.by_id_volume(volume["name"])
    assert volume["engineImage"] == original_engine_image

    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)
    assert volume["engineImage"] == original_engine_image
    assert volume["currentImage"] == original_engine_image
    engine = get_volume_engine(volume)
    assert engine["engineImage"] == original_engine_image
    assert engine["currentImage"] == original_engine_image
    for replica in volume["replicas"]:
        assert replica["engineImage"] == original_engine_image
        assert replica["currentImage"] == original_engine_image

    check_volume_data(volume, data)

    client.delete(volume)
    wait_for_volume_delete(client, volume_name)

    client.delete(new_img)
def test_zone_tags(client, core_api, volume_name, k8s_node_zone_tags):  # NOQA
    """
    Test the anti-affinity zone feature

    1. Add Kubernetes zone labels to the nodes
       1. Only two zones for now: zone1 and zone2
    2. Create a volume with two replicas
    3. Verify that zone1 and zone2 each have one replica
    4. Remove a random replica and wait for the volume to finish rebuilding
    5. Verify that zone1 and zone2 each have one replica
    6. Repeat steps 4-5 a few times
    7. Update the volume to 3 replicas and make sure they are scheduled
       on 3 different nodes
    8. Remove a random replica and wait for the volume to finish rebuilding
    9. Make sure the replicas are on different nodes
    10. Repeat steps 8-9 a few times
    """
    wait_longhorn_node_zone_updated(client)

    volume = create_and_check_volume(client, volume_name, num_of_replicas=2)

    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)

    volume = client.by_id_volume(volume_name)
    zone1_replica_count = get_zone_replica_count(client, volume_name, ZONE1)
    zone2_replica_count = get_zone_replica_count(client, volume_name, ZONE2)
    assert zone1_replica_count == zone2_replica_count

    for i in range(randrange(3, 5)):
        volume = client.by_id_volume(volume_name)
        replica_count = len(volume.replicas)
        assert replica_count == 2

        replica_id = randrange(0, replica_count)
        replica_name = volume.replicas[replica_id].name
        volume.replicaRemove(name=replica_name)

        wait_for_volume_degraded(client, volume_name)
        wait_for_volume_healthy(client, volume_name)
        wait_for_volume_replica_count(client, volume_name, replica_count)

        volume = client.by_id_volume(volume_name)
        replica_names = map(lambda replica: replica.name,
                            volume["replicas"])
        wait_new_replica_ready(client, volume_name, replica_names)

        zone1_replica_count = \
            get_zone_replica_count(client, volume_name, ZONE1)
        zone2_replica_count = \
            get_zone_replica_count(client, volume_name, ZONE2)
        assert zone1_replica_count == zone2_replica_count

    volume.updateReplicaCount(replicaCount=3)
    wait_for_volume_degraded(client, volume_name)
    wait_for_volume_replica_count(client, volume_name, 3)
    wait_for_volume_healthy(client, volume_name)

    # Every Longhorn node should hold exactly one replica.
    volume = client.by_id_volume(volume_name)
    lh_node_names = list(map(lambda node: node.name, client.list_node()))
    for replica in volume.replicas:
        lh_node_names.remove(replica.hostId)
    assert lh_node_names == []

    for i in range(randrange(3, 5)):
        volume = client.by_id_volume(volume_name)
        replica_count = len(volume.replicas)
        assert replica_count == 3

        replica_id = randrange(0, replica_count)
        replica_name = volume.replicas[replica_id].name
        volume.replicaRemove(name=replica_name)

        wait_for_volume_degraded(client, volume_name)
        wait_for_volume_healthy(client, volume_name)
        wait_for_volume_replica_count(client, volume_name, replica_count)

        volume = client.by_id_volume(volume_name)
        lh_node_names = list(map(lambda node: node.name, client.list_node()))
        for replica in volume.replicas:
            lh_node_names.remove(replica.hostId)
        assert lh_node_names == []
def engine_live_upgrade_rollback_test(client, core_api, volume_name, backing_image=""):  # NOQA
    default_img = common.get_default_engine_image(client)
    default_img_name = default_img.name
    default_img = wait_for_engine_image_ref_count(client, default_img_name, 0)
    cli_v = default_img.cliAPIVersion
    cli_minv = default_img.cliAPIMinVersion
    ctl_v = default_img.controllerAPIVersion
    ctl_minv = default_img.controllerAPIMinVersion
    data_v = default_img.dataFormatVersion
    data_minv = default_img.dataFormatMinVersion

    wrong_engine_upgrade_image = common.get_compatibility_test_image(
        cli_v, cli_minv, ctl_v, ctl_minv, data_v, data_minv)
    new_img = client.create_engine_image(image=wrong_engine_upgrade_image)
    new_img_name = new_img.name
    ei_status_value = get_engine_image_status_value(client, new_img_name)
    new_img = wait_for_engine_image_state(client, new_img_name,
                                          ei_status_value)
    assert new_img.refCount == 0
    assert new_img.noRefSince != ""

    default_img = common.get_default_engine_image(client)
    default_img_name = default_img.name

    client.create_volume(name=volume_name, size=SIZE,
                         numberOfReplicas=2, backingImage=backing_image)
    volume = common.wait_for_volume_detached(client, volume_name)
    wait_for_engine_image_ref_count(client, default_img_name, 1)

    assert volume.backingImage == backing_image

    original_engine_image = volume.engineImage
    assert original_engine_image != wrong_engine_upgrade_image

    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    data = write_volume_random_data(volume)

    volume.engineUpgrade(image=wrong_engine_upgrade_image)
    volume = client.by_id_volume(volume.name)
    assert volume.engineImage == wrong_engine_upgrade_image
    assert volume.currentImage == original_engine_image

    with pytest.raises(Exception):
        # this will timeout
        wait_for_volume_current_image(client, volume_name,
                                      wrong_engine_upgrade_image)

    # rollback
    volume.engineUpgrade(image=original_engine_image)
    wait_for_volume_current_image(client, volume_name, original_engine_image)
    volume = wait_for_volume_replicas_mode(client, volume_name, "RW")
    assert volume.engineImage == original_engine_image
    assert volume.currentImage == original_engine_image
    engine = get_volume_engine(volume)
    assert engine.engineImage == original_engine_image
    assert engine.currentImage == original_engine_image

    volume = common.wait_for_volume_replica_count(client, volume_name,
                                                  REPLICA_COUNT)
    check_volume_data(volume, data)

    assert volume.state == common.VOLUME_STATE_ATTACHED
    assert volume.robustness == common.VOLUME_ROBUSTNESS_HEALTHY

    # try again, this time let's try detach
    volume.engineUpgrade(image=wrong_engine_upgrade_image)
    volume = client.by_id_volume(volume.name)
    assert volume.engineImage == wrong_engine_upgrade_image
    assert volume.currentImage == original_engine_image

    with pytest.raises(Exception):
        # this will timeout
        wait_for_volume_current_image(client, volume_name,
                                      wrong_engine_upgrade_image)

    volume.detach(hostId="")
    volume = wait_for_volume_current_image(client, volume_name,
                                           wrong_engine_upgrade_image)
    # all the images would be updated
    assert volume.engineImage == wrong_engine_upgrade_image
    engine = get_volume_engine(volume)
    assert engine.engineImage == wrong_engine_upgrade_image
    volume = common.wait_for_volume_replica_count(client, volume_name,
                                                  REPLICA_COUNT)
    for replica in volume.replicas:
        assert replica.engineImage == wrong_engine_upgrade_image

    # upgrade to the correct image when offline
    volume.engineUpgrade(image=original_engine_image)
    volume = wait_for_volume_current_image(client, volume_name,
                                           original_engine_image)
    volume = client.by_id_volume(volume.name)
    assert volume.engineImage == original_engine_image

    volume.attach(hostId=host_id)
    common.wait_for_volume_healthy(client, volume_name)
    volume = wait_for_volume_replicas_mode(client, volume_name, "RW")
    assert volume.engineImage == original_engine_image
    assert volume.currentImage == original_engine_image
    engine = get_volume_engine(volume)
    assert engine.engineImage == original_engine_image
    assert engine.currentImage == original_engine_image
    check_volume_endpoint(volume)
    for replica in volume.replicas:
        assert replica.engineImage == original_engine_image
        assert replica.currentImage == original_engine_image

    check_volume_data(volume, data)

    client.delete(volume)
    wait_for_volume_delete(client, volume_name)

    client.delete(new_img)
    wait_for_engine_image_deletion(client, core_api, new_img.name)
def test_data_locality_basic(client, core_api, volume_name, pod, settings_reset):  # NOQA
    """
    Test data locality basic feature

    Context:

    The Data Locality feature gives users the option to keep a local
    replica on the same node as the consuming pod.
    Longhorn currently supports two modes:
    - disabled: Longhorn does not try to keep a local replica
    - best-effort: Longhorn tries to keep a local replica

    See manual tests at:
    https://github.com/longhorn/longhorn/issues/1045#issuecomment-680706283

    Steps:

    Case 1: Test that Longhorn builds a local replica on the engine node

    1. Create a volume(1) with 1 replica and dataLocality set to disabled
    2. Find the node where the replica is located.
       Let's call it replica-node
    3. Attach the volume to a node different from replica-node.
       Let's call it engine-node
    4. Write 200MB of data to volume(1)
    5. Use a retry loop to verify that Longhorn does not create a replica
       on the engine-node
    6. Update dataLocality to best-effort for volume(1)
    7. Use a retry loop to verify that Longhorn creates and rebuilds a
       replica on the engine-node and removes the other replica
    8. Detach volume(1) and attach it to a different node.
       Let's call the new node new-engine-node and the old node
       old-engine-node
    9. Wait for volume(1) to finish attaching
    10. Use a retry loop to verify that Longhorn creates and rebuilds a
        replica on the new-engine-node and removes the replica on
        old-engine-node

    Case 2: Test that Longhorn prioritizes deleting replicas on the same
    node

    1. Add the tag AVAIL to node-1 and node-2
    2. Set node soft anti-affinity to `true`.
    3. Create a volume(2) with 3 replicas and dataLocality set to
       best-effort
    4. Use a retry loop to verify that all 3 replicas are on node-1 and
       node-2, and that no replica is on node-3
    5. Attach volume(2) to node-3
    6. Use a retry loop to verify that there is no replica on node-3 and
       that we can still read/write to volume(2)
    7. Find the node which contains 2 replicas.
       Let's call it most-replica-node
    8. Set the replica count to 2 for volume(2)
    9. Verify that Longhorn removes one replica from most-replica-node

    Case 3: Test that the volume is not corrupted if there is an
    unexpected detachment while building the local replica

    1. Remove the tag AVAIL from node-1 and node-2.
       Set node soft anti-affinity to `false`.
    2. Create a volume(3) with 1 replica and dataLocality set to
       best-effort
    3. Attach volume(3) to node-3.
    4. Use a retry loop to verify that volume(3) has only 1 replica on
       node-3
    5. Write 800MB of data to volume(3)
    6. Detach volume(3)
    7. Attach volume(3) to node-1
    8. Use a retry loop to:
       Wait until volume(3) finishes attaching.
       Wait until Longhorn starts rebuilding a replica on node-1.
       Immediately detach volume(3)
    9. Verify that the replica on node-1 is in ERR state.
    10. Attach volume(3) to node-1
    11. Wait until volume(3) finishes attaching.
    12. Use a retry loop to verify that Longhorn cleans up the ERR
        replica, rebuilds a new replica on node-1, and removes the
        replica on node-3

    Case 4: Make sure a failed-to-schedule local replica doesn't block
    the creation of other replicas.

    1. Disable scheduling for node-3
    2. Create a vol with 1 replica, `dataLocality = best-effort`.
       The replica is scheduled on a node (say node-1)
    3. Attach vol to node-3. There is a fail-to-schedule replica with
       Spec.HardNodeAffinity=node-3
    4. Increase numberOfReplica to 3. Verify that the replica set
       contains: one on node-1, one on node-2, one failed replica with
       Spec.HardNodeAffinity=node-3.
    5. Decrease numberOfReplica to 2.
       Verify that the replica set contains: one on node-1, one on
       node-2, one failed replica with Spec.HardNodeAffinity=node-3.
    6. Decrease numberOfReplica to 1.
       Verify that the replica set contains: one on node-1 or node-2,
       one failed replica with Spec.HardNodeAffinity=node-3.
    7. Increase numberOfReplica back to 2.
       Verify that the replica set contains: one on node-1, one on
       node-2, one failed replica with Spec.HardNodeAffinity=node-3.
    8. Turn off data locality by setting `dataLocality=disabled` for
       the vol.
       Verify that the replica set contains: one on node-1, one on
       node-2
    9. Clean up
    """
    # Case 1: Test that Longhorn builds a local replica on the engine node
    nodes = client.list_node()

    default_data_locality_setting = \
        client.by_id_setting(SETTING_DEFAULT_DATA_LOCALITY)
    try:
        client.update(default_data_locality_setting, value="disabled")
    except Exception as e:
        print("Exception when updating the Default Data Locality setting",
              default_data_locality_setting, e)

    volume1_name = volume_name + "-1"
    volume1_size = str(500 * Mi)
    volume1_data_path = "/data/test"
    pv1_name = volume1_name + "-pv"
    pvc1_name = volume1_name + "-pvc"
    pod1_name = volume1_name + "-pod"
    pod1 = pod
    pod1['metadata']['name'] = pod1_name

    volume1 = create_and_check_volume(client,
                                      volume1_name,
                                      num_of_replicas=1,
                                      size=volume1_size)

    volume1 = client.by_id_volume(volume1_name)
    create_pv_for_volume(client, core_api, volume1, pv1_name)
    create_pvc_for_volume(client, core_api, volume1, pvc1_name)

    volume1 = client.by_id_volume(volume1_name)
    volume1_replica_node = volume1.replicas[0]['hostId']

    volume1_attached_node = None
    for node in nodes:
        if node.name != volume1_replica_node:
            volume1_attached_node = node.name
            break
    assert volume1_attached_node is not None

    pod1['spec']['volumes'] = [{
        "name": "pod-data",
        "persistentVolumeClaim": {
            "claimName": pvc1_name
        }
    }]
    pod1['spec']['nodeSelector'] = \
        {"kubernetes.io/hostname": volume1_attached_node}
    create_and_wait_pod(core_api, pod1)

    write_pod_volume_random_data(core_api, pod1_name, volume1_data_path,
                                 DATA_SIZE_IN_MB_2)

    # With dataLocality disabled, no local replica should appear on the
    # engine node.
    for i in range(10):
        volume1 = client.by_id_volume(volume1_name)
        assert len(volume1.replicas) == 1
        assert volume1.replicas[0]['hostId'] != volume1_attached_node
        time.sleep(1)

    volume1 = client.by_id_volume(volume1_name)
    volume1.updateDataLocality(dataLocality="best-effort")

    for _ in range(RETRY_COUNTS):
        volume1 = client.by_id_volume(volume1_name)
        assert volume1[VOLUME_FIELD_ROBUSTNESS] == VOLUME_ROBUSTNESS_HEALTHY
        if len(volume1.replicas) == 1 and \
                volume1.replicas[0]['hostId'] == volume1_attached_node:
            break
        time.sleep(RETRY_INTERVAL)
    assert len(volume1.replicas) == 1
    assert volume1.replicas[0]['hostId'] == volume1_attached_node

    delete_and_wait_pod(core_api, pod1_name)
    volume1 = wait_for_volume_detached(client, volume1_name)

    volume1_replica_node = volume1.replicas[0]['hostId']
    volume1_attached_node = None
    for node in nodes:
        if node.name != volume1_replica_node:
            volume1_attached_node = node.name
            break
    assert volume1_attached_node is not None

    pod1['spec']['nodeSelector'] = \
        {"kubernetes.io/hostname": volume1_attached_node}
    create_and_wait_pod(core_api, pod1)

    for _ in range(RETRY_COUNTS):
        volume1 = client.by_id_volume(volume1_name)
        assert volume1[VOLUME_FIELD_ROBUSTNESS] == VOLUME_ROBUSTNESS_HEALTHY
        if len(volume1.replicas) == 1 and \
                volume1.replicas[0]['hostId'] == volume1_attached_node:
            break
        time.sleep(RETRY_INTERVAL)
    assert len(volume1.replicas) == 1
    assert volume1.replicas[0]['hostId'] == volume1_attached_node

    delete_and_wait_pod(core_api, pod1_name)
    wait_for_volume_detached(client, volume1_name)

    # Case 2: Test that Longhorn prioritizes deleting replicas on the
    # same node
    node1 = nodes[0]
    node2 = nodes[1]
    node3 = nodes[2]

    client.update(node1, allowScheduling=True, tags=["AVAIL"])
    client.update(node2, allowScheduling=True, tags=["AVAIL"])

    replica_node_soft_anti_affinity_setting = \
        client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    try:
        client.update(replica_node_soft_anti_affinity_setting,
                      value="true")
    except Exception as e:
        print("Exception when updating the "
              "Replica Node Level Soft Anti-Affinity setting",
              replica_node_soft_anti_affinity_setting, e)

    volume2_name = volume_name + "-2"
    volume2_size = str(500 * Mi)
    pv2_name = volume2_name + "-pv"
    pvc2_name = volume2_name + "-pvc"
    pod2_name = volume2_name + "-pod"
    pod2 = pod
    pod2['metadata']['name'] = pod2_name

    volume2 = client.create_volume(name=volume2_name,
                                   size=volume2_size,
                                   numberOfReplicas=3,
                                   nodeSelector=["AVAIL"],
                                   dataLocality="best-effort")

    volume2 = wait_for_volume_detached(client, volume2_name)
    volume2 = client.by_id_volume(volume2_name)

    create_pv_for_volume(client, core_api, volume2, pv2_name)
    create_pvc_for_volume(client, core_api, volume2, pvc2_name)

    volume2 = client.by_id_volume(volume2_name)

    pod2['spec']['volumes'] = [{
        "name": "pod-data",
        "persistentVolumeClaim": {
            "claimName": pvc2_name
        }
    }]
    pod2['spec']['nodeSelector'] = {"kubernetes.io/hostname": node3.name}
    create_and_wait_pod(core_api, pod2)

    volume2 = wait_for_volume_healthy(client, volume2_name)

    for replica in volume2.replicas:
        assert replica["hostId"] != node3.name

    volume2.updateReplicaCount(replicaCount=2)

    # 2 healthy replicas and 1 replica that failed to schedule.
    # The failed-to-schedule replica is the local replica on node3.
    volume2 = wait_for_volume_replica_count(client, volume2_name, 3)
    volume2 = client.by_id_volume(volume2_name)

    volume2_healthy_replicas = []
    for replica in volume2.replicas:
        if replica.running is True:
            volume2_healthy_replicas.append(replica)
    assert len(volume2_healthy_replicas) == 2

    volume2_rep1 = volume2_healthy_replicas[0]
    volume2_rep2 = volume2_healthy_replicas[1]
    assert volume2_rep1["hostId"] != volume2_rep2["hostId"]

    delete_and_wait_pod(core_api, pod2_name)
    wait_for_volume_detached(client, volume2_name)

    # Case 3: Test that the volume is not corrupted if there is an
    # unexpected detachment while building the local replica
    client.update(node1, allowScheduling=True, tags=[])
    client.update(node2, allowScheduling=True, tags=[])

    replica_node_soft_anti_affinity_setting = \
        client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    try:
        client.update(replica_node_soft_anti_affinity_setting,
                      value="false")
    except Exception as e:
        print("Exception when updating the "
              "Replica Node Level Soft Anti-Affinity setting",
              replica_node_soft_anti_affinity_setting, e)

    volume3_name = volume_name + "-3"
    volume3_size = str(1 * Gi)
    volume3_data_path = "/data/test"
    pv3_name = volume3_name + "-pv"
    pvc3_name = volume3_name + "-pvc"
    pod3_name = volume3_name + "-pod"
    pod3 = pod
    pod3['metadata']['name'] = pod3_name

    volume3 = client.create_volume(name=volume3_name,
                                   size=volume3_size,
                                   numberOfReplicas=1)
    volume3 = wait_for_volume_detached(client, volume3_name)
    volume3 = client.by_id_volume(volume3_name)

    create_pv_for_volume(client, core_api, volume3, pv3_name)
    create_pvc_for_volume(client, core_api, volume3, pvc3_name)

    volume3 = client.by_id_volume(volume3_name)

    pod3['spec']['volumes'] = [{
        "name": "pod-data",
        "persistentVolumeClaim": {
            "claimName": pvc3_name
        }
    }]
    pod3['spec']['nodeSelector'] = {"kubernetes.io/hostname": node3.name}
    create_and_wait_pod(core_api, pod3)

    volume3 = wait_for_volume_healthy(client, volume3_name)

    write_pod_volume_random_data(core_api, pod3_name, volume3_data_path,
                                 DATA_SIZE_IN_MB_4)

    volume3.updateDataLocality(dataLocality="best-effort")

    volume3 = client.by_id_volume(volume3_name)
    if volume3.replicas[0]['hostId'] != node3.name:
        wait_for_rebuild_start(client, volume3_name)
        volume3 = client.by_id_volume(volume3_name)
        assert len(volume3.replicas) == 2
        wait_for_rebuild_complete(client, volume3_name)

    volume3 = wait_for_volume_replica_count(client, volume3_name, 1)
    assert volume3.replicas[0]["hostId"] == node3.name

    delete_and_wait_pod(core_api, pod3_name)

    pod3['spec']['nodeSelector'] = {"kubernetes.io/hostname": node1.name}
    create_and_wait_pod(core_api, pod3)

    # Kill the engine while the local replica is rebuilding, then detach.
    wait_for_rebuild_start(client, volume3_name)
    crash_engine_process_with_sigkill(client, core_api, volume3_name)
    delete_and_wait_pod(core_api, pod3_name)
    wait_for_volume_detached(client, volume3_name)

    volume3 = client.by_id_volume(volume3_name)
    assert len(volume3.replicas) == 1
    assert volume3.replicas[0]["hostId"] == node3.name

    create_and_wait_pod(core_api, pod3)

    wait_for_rebuild_start(client, volume3_name)
    volume3 = client.by_id_volume(volume3_name)
    assert len(volume3.replicas) == 2
    wait_for_rebuild_complete(client, volume3_name)

    # Wait for deletion of the extra replica
    volume3 = wait_for_volume_replica_count(client, volume3_name, 1)
    assert volume3.replicas[0]["hostId"] == node1.name
    assert volume3.replicas[0]["mode"] == "RW"
    assert volume3.replicas[0]["running"] is True

    delete_and_wait_pod(core_api, pod3_name)
    wait_for_volume_detached(client, volume3_name)

    # Case 4: Make sure a failed-to-schedule local replica doesn't block
    # the creation of other replicas.
    replica_node_soft_anti_affinity_setting = \
        client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    try:
        client.update(replica_node_soft_anti_affinity_setting,
                      value="false")
    except Exception as e:
        print("Exception when updating the "
              "Replica Node Level Soft Anti-Affinity setting",
              replica_node_soft_anti_affinity_setting, e)

    client.update(node3, allowScheduling=False)

    volume4_name = volume_name + "-4"
    volume4_size = str(1 * Gi)

    volume4 = client.create_volume(name=volume4_name,
                                   size=volume4_size,
                                   numberOfReplicas=1,
                                   dataLocality="best-effort")
    volume4 = wait_for_volume_detached(client, volume4_name)
    volume4 = client.by_id_volume(volume4_name)

    volume4_replica_name = volume4.replicas[0]["name"]

    volume4.attach(hostId=node3.name)
    wait_for_volume_healthy(client, volume4_name)

    volume4 = client.by_id_volume(volume4_name)
    assert len(volume4.replicas) == 2

    for replica in volume4.replicas:
        if replica["name"] == volume4_replica_name:
            assert replica["running"] is True
            assert replica["mode"] == "RW"
        else:
            assert replica["running"] is False
            assert replica["mode"] == ""

    assert volume4.conditions.scheduled.reason == \
        "LocalReplicaSchedulingFailure"

    volume4 = volume4.updateReplicaCount(replicaCount=3)
    volume4 = wait_for_volume_degraded(client, volume4_name)

    v4_node1_replica_count = 0
    v4_node2_replica_count = 0
    v4_failed_replica_count = 0
    for replica in volume4.replicas:
        if replica["hostId"] == node1.name:
            v4_node1_replica_count += 1
        elif replica["hostId"] == node2.name:
            v4_node2_replica_count += 1
        elif replica["hostId"] == "":
            v4_failed_replica_count += 1
    assert v4_node1_replica_count == 1
    assert v4_node2_replica_count == 1
    assert v4_failed_replica_count > 0

    volume4 = volume4.updateReplicaCount(replicaCount=2)
    volume4 = wait_for_volume_replica_count(client, volume4_name, 3)

    v4_node1_replica_count = 0
    v4_node2_replica_count = 0
    v4_failed_replica_count = 0
    for replica in volume4.replicas:
        if replica["hostId"] == node1.name:
            v4_node1_replica_count += 1
        elif replica["hostId"] == node2.name:
            v4_node2_replica_count += 1
        elif replica["hostId"] == "":
            v4_failed_replica_count += 1
    assert v4_node1_replica_count == 1
    assert v4_node2_replica_count == 1
    assert v4_failed_replica_count > 0

    volume4 = volume4.updateReplicaCount(replicaCount=1)
    volume4 = wait_for_volume_replica_count(client, volume4_name, 2)

    v4_node1_replica_count = 0
    v4_node2_replica_count = 0
    v4_failed_replica_count = 0
    for replica in volume4.replicas:
        if replica["hostId"] == node1.name:
            v4_node1_replica_count += 1
        elif replica["hostId"] == node2.name:
            v4_node2_replica_count += 1
        elif replica["hostId"] == "":
            v4_failed_replica_count += 1
    assert v4_node1_replica_count + v4_node2_replica_count == 1
    assert v4_failed_replica_count == 1

    volume4 = volume4.updateDataLocality(dataLocality="disabled")
    volume4 = volume4.updateReplicaCount(replicaCount=2)

    running_replica_count = 0
    for _ in range(RETRY_COUNTS):
        volume4 = client.by_id_volume(volume4_name)
        running_replica_count = 0
        for r in volume4.replicas:
            if r.failedAt == "" and r.running is True:
                running_replica_count += 1
        if running_replica_count == 2:
            break
        time.sleep(RETRY_INTERVAL)
    assert running_replica_count == 2

    v4_node1_replica_count = 0
    v4_node2_replica_count = 0
    v4_node3_replica_count = 0
    for replica in volume4.replicas:
        wait_for_replica_running(client, volume4_name, replica["name"])
        if replica["hostId"] == node1.name:
            v4_node1_replica_count += 1
        elif replica["hostId"] == node2.name:
            v4_node2_replica_count += 1
        elif replica["hostId"] == node3.name:
            v4_node3_replica_count += 1
    assert v4_node1_replica_count == 1
    assert v4_node2_replica_count == 1
    assert v4_node3_replica_count == 0