def test_offline_node(reset_cluster_ready_status):
    """
    Test offline node.

    1. Find the node running the `longhorn-test` pod and pick a *different*
       worker node, so the test never shuts down the node it runs on.
    2. Shut that worker down through the cloud provider.
    3. Verify both Kubernetes and Longhorn report the node as down.
    """
    node_worker_label = 'node-role.kubernetes.io/worker'
    pod_label_selector = "longhorn-test=test-job"
    k8s_api_client = get_core_api_client()
    longhorn_api_client = get_longhorn_api_client()
    cloudprovider = detect_cloudprovider()

    # Locate the node hosting the test pod; fail loudly if it is missing
    # instead of raising a NameError further down.
    longhorn_test_node_name = None
    for pod in k8s_api_client.list_namespaced_pod(
            'default', label_selector=pod_label_selector).items:
        if pod.metadata.name == "longhorn-test":
            longhorn_test_node_name = pod.spec.node_name
    assert longhorn_test_node_name is not None, \
        'expect to find the longhorn-test pod'

    # Pick the first worker node that is NOT hosting the test pod. The
    # original loop could fall through and select the test pod's own node
    # when it was the only worker; guard against that explicitly.
    node_name = None
    for node_item in k8s_api_client.list_node().items:
        labels = node_item.metadata.labels
        if labels and labels.get(node_worker_label) == 'true' and \
                node_item.metadata.name != longhorn_test_node_name:
            node_name = node_item.metadata.name
            break
    assert node_name is not None, \
        'expect a worker node not hosting the longhorn-test pod'

    node = cloudprovider.node_id(node_name)
    cloudprovider.node_shutdown(node)

    k8s_node_down = wait_for_node_down_k8s(node_name, k8s_api_client)
    assert k8s_node_down

    longhorn_node_down = wait_for_node_down_longhorn(node_name,
                                                     longhorn_api_client)
    assert longhorn_node_down
def finalizer():
    """Bring every down worker node back up after the test.

    For each Kubernetes node labeled as a worker: if it is not Ready,
    start its cloud instance and wait for it to become Ready in both
    Kubernetes and Longhorn. Nodes that are already Ready are skipped.
    """
    node_worker_label = 'node-role.kubernetes.io/worker'
    k8s_api_client = get_core_api_client()
    longhorn_api_client = get_longhorn_api_client()
    cloudprovider = detect_cloudprovider()
    for node_item in k8s_api_client.list_node().items:
        # Only consider nodes explicitly labeled as workers.
        if node_worker_label in node_item.metadata.labels and \
                node_item.metadata.labels[node_worker_label] == 'true':
            node_name = node_item.metadata.name
            if is_node_ready_k8s(node_name, k8s_api_client) is False:
                # Node is down: start it and wait for k8s readiness first.
                node = cloudprovider.node_id(node_name)
                cloudprovider.node_start(node)
                node_up_k8s = wait_for_node_up_k8s(node_name, k8s_api_client)
                assert node_up_k8s
            else:
                # Already Ready -> nothing to recover; skip the Longhorn
                # wait below as well.
                continue
            # Only reached for nodes we just restarted: wait until Longhorn
            # also marks the node up.
            node_up_longhorn = \
                wait_for_node_up_longhorn(node_name, longhorn_api_client)
            assert node_up_longhorn
def test_offline_node(reset_cluster_ready_status):
    """
    Test offline node

    1. Bring down one of the nodes in Kubernetes cluster (avoid current node)
    2. Make sure the Longhorn node state become `down`
    """
    node_worker_label = 'node-role.kubernetes.io/worker'
    pod_lable_selector = "longhorn-test=test-job"
    node_controlplane_label = 'node-role.kubernetes.io/control-plane'
    node_ip_annotation = "flannel.alpha.coreos.com/public-ip"
    k8s_api_client = get_core_api_client()
    longhorn_api_client = get_longhorn_api_client()
    cloudprovider = detect_cloudprovider()

    # Find which node hosts the longhorn-test pod so it is never selected
    # for shutdown.
    for pod in k8s_api_client.list_namespaced_pod(
            'default', label_selector=pod_lable_selector).items:
        if pod.metadata.name == "longhorn-test":
            longhorn_test_node_name = pod.spec.node_name

    k3s = is_infra_k3s()

    # Pick a victim node. On k3s, any non-control-plane node qualifies and
    # the cloud instance is resolved via the flannel public-ip annotation;
    # otherwise, nodes carrying the worker label are matched by name.
    for node_item in k8s_api_client.list_node().items:
        if k3s is True:
            if node_controlplane_label not in node_item.metadata.labels:
                node_name = node_item.metadata.name
                node_ip = node_item.metadata.annotations[node_ip_annotation]
                if node_name == longhorn_test_node_name:
                    continue
                else:
                    node = cloudprovider.instance_id_by_ip(node_ip)
                    break
        else:
            if node_worker_label in node_item.metadata.labels and \
                    node_item.metadata.labels[node_worker_label] == 'true':
                node_name = node_item.metadata.name
                if node_name == longhorn_test_node_name:
                    continue
                else:
                    node = cloudprovider.instance_id(node_name)
                    break

    print(f'==> stop node: {node_name}')
    cloudprovider.instance_stop(node)
    # Wait for the cloud provider to report the instance as stopped before
    # polling cluster state.
    wait_for_node_down_aws(cloudprovider, node)
    k8s_node_down = wait_for_node_down_k8s(node_name, k8s_api_client)
    assert k8s_node_down

    # Re-acquire the Longhorn client after the node went down before
    # checking Longhorn's view of the node state.
    longhorn_api_client = get_longhorn_api_client()
    longhorn_node_down = wait_for_node_down_longhorn(node_name,
                                                     longhorn_api_client)
    assert longhorn_node_down
def reset_default_disk_label():
    """Fixture finalizer: strip the create-default-disk label from all nodes.

    The code after ``yield`` runs at teardown. Patching a label's value to
    ``None`` removes the label via Kubernetes' strategic merge patch.
    """
    yield

    core_api = get_core_api_client()
    lh_client = get_longhorn_api_client()
    for lh_node in lh_client.list_node():
        clear_patch = {
            "metadata": {
                "labels": {CREATE_DEFAULT_DISK_LABEL: None}
            }
        }
        core_api.patch_node(lh_node["id"], clear_patch)
def reset_cluster_ready_status(request):
    """Fixture: after the test finishes, restart any stopped cluster node.

    Runs entirely after ``yield``. For every eligible node (non-control-plane
    on k3s, worker-labeled otherwise) that is not Ready, the cloud instance
    is started and the node is awaited at the AWS, Kubernetes, and Longhorn
    levels, in that order.
    """
    yield
    node_worker_label = 'node-role.kubernetes.io/worker'
    node_controlplane_label = 'node-role.kubernetes.io/control-plane'
    node_ip_annotation = "flannel.alpha.coreos.com/public-ip"
    k8s_api_client = get_core_api_client()
    longhorn_api_client = get_longhorn_api_client()
    cloudprovider = detect_cloudprovider()
    k3s = is_infra_k3s()
    print('==> test completed! reset cluster ready status ...')
    for node_item in k8s_api_client.list_node().items:
        # Resolve the node name and its cloud instance. On k3s the instance
        # is looked up by the flannel public-ip annotation; otherwise by
        # node name. Non-matching nodes are skipped.
        if k3s is True:
            if node_controlplane_label not in node_item.metadata.labels:
                node_name = node_item.metadata.name
                node_ip = node_item.metadata.annotations[node_ip_annotation]
                node = cloudprovider.instance_id_by_ip(node_ip)
            else:
                continue
        else:
            if node_worker_label in node_item.metadata.labels and \
                    node_item.metadata.labels[node_worker_label] == 'true':
                node_name = node_item.metadata.name
                node = cloudprovider.instance_id(node_name)
            else:
                continue
        if is_node_ready_k8s(node_name, k8s_api_client) is False:
            # Node is down: start the instance and wait for the cloud
            # provider, then Kubernetes, to report it up.
            cloudprovider.instance_start(node)
            print(f'==> wait for aws node {node_name} up ...')
            aws_node_up = wait_for_node_up_aws(cloudprovider, node)
            assert aws_node_up, f'expect aws node {node_name} up'
            node_up_k8s = wait_for_node_up_k8s(node_name, k8s_api_client)
            assert node_up_k8s
        else:
            # Already Ready -> skip the Longhorn wait below too.
            continue
        # Only for nodes we restarted: wait for Longhorn to see the node up.
        node_up_longhorn = \
            wait_for_node_up_longhorn(node_name, longhorn_api_client)
        assert node_up_longhorn
def test_offline_node(reset_cluster_ready_status):
    """
    Test offline node

    1. Bring down one of the nodes in the Kubernetes cluster
       (avoid the current node)
    2. Make sure the Longhorn node state becomes `down`
    """
    worker_label = 'node-role.kubernetes.io/worker'
    test_pod_selector = "longhorn-test=test-job"
    core_api = get_core_api_client()
    lh_client = get_longhorn_api_client()
    provider = detect_cloudprovider()

    # Find which node hosts the test pod so it is never taken offline.
    for pod in core_api.list_namespaced_pod(
            'default', label_selector=test_pod_selector).items:
        if pod.metadata.name == "longhorn-test":
            test_node_name = pod.spec.node_name

    # Select the first worker node that is not running the test pod.
    for candidate in core_api.list_node().items:
        labels = candidate.metadata.labels
        if worker_label not in labels or labels[worker_label] != 'true':
            continue
        target_name = candidate.metadata.name
        if target_name != test_node_name:
            break

    instance = provider.instance_id(target_name)
    provider.instance_stop(instance)

    assert wait_for_node_down_k8s(target_name, core_api)

    # Re-acquire the Longhorn client after the node went down before
    # polling Longhorn's node state.
    lh_client = get_longhorn_api_client()
    assert wait_for_node_down_longhorn(target_name, lh_client)
def test_reset_env():
    """Clean up every resource left over by the stress tests.

    Deletes, in dependency order: stress pods, PVCs, PVs (retrying a PV
    deletion after removing any VolumeAttachment that pins it), and finally
    the Longhorn volumes themselves. Matching is by name prefix.
    """
    k8s_api_client = get_core_api_client()
    k8s_storage_client = get_storage_api_client()
    longhorn_api_client = get_longhorn_api_client()

    pod_list = k8s_api_client.list_namespaced_pod("default")
    for pod in pod_list.items:
        if STRESS_POD_NAME_PREFIX in pod.metadata.name:
            delete_and_wait_pod(k8s_api_client, pod.metadata.name)

    pvc_list = \
        k8s_api_client.list_namespaced_persistent_volume_claim("default")
    for pvc in pvc_list.items:
        if STRESS_PVC_NAME_PREFIX in pvc.metadata.name:
            delete_and_wait_pvc(k8s_api_client, pvc.metadata.name)

    pv_list = k8s_api_client.list_persistent_volume()
    for pv in pv_list.items:
        pv_name = pv.metadata.name
        if STRESS_PV_NAME_PREFIX in pv_name:
            try:
                delete_and_wait_pv(k8s_api_client, pv_name)
            except AssertionError:
                # PV deletion timed out — presumably because a
                # VolumeAttachment still references it. Remove any matching
                # attachment, then retry the PV deletion once.
                volumeattachment_list = \
                    k8s_storage_client.list_volume_attachment()
                for volumeattachment in volumeattachment_list.items:
                    volume_attachment_name = \
                        volumeattachment.spec.source.persistent_volume_name
                    if volume_attachment_name == pv_name:
                        delete_and_wait_volume_attachment(
                            k8s_storage_client,
                            volume_attachment_name
                        )
                delete_and_wait_pv(k8s_api_client, pv.metadata.name)

    volume_list = \
        longhorn_api_client.list_volume()
    for volume in volume_list.data:
        if STRESS_VOLUME_NAME_PREFIX in volume.name:
            delete_and_wait_longhorn(longhorn_api_client, volume.name)
def test_setting_toleration():
    """
    Test toleration setting

    1. Set `taint-toleration` to "key1=value1:NoSchedule; key2:InvalidEffect".
    2. Verify the request fails.
    3. Create a volume and attach it.
    4. Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute".
    5. Verify that cannot update toleration setting when any volume is
       attached.
    6. Generate and write `data1` into the volume.
    7. Detach the volume.
    8. Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute".
    9. Wait for all the Longhorn system components to restart with new
       toleration.
    10. Verify that UI, manager, and drive deployer don't restart and
        don't have new toleration.
    11. Attach the volume again and verify the volume `data1`.
    12. Generate and write `data2` to the volume.
    13. Detach the volume.
    14. Clean the `toleration` setting.
    15. Wait for all the Longhorn system components to restart with no
        toleration.
    16. Attach the volume and validate `data2`.
    17. Generate and write `data3` to the volume.
    """
    client = get_longhorn_api_client()  # NOQA
    apps_api = get_apps_api_client()  # NOQA
    core_api = get_core_api_client()  # NOQA
    count = len(client.list_node())

    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)

    # An unknown toleration effect must be rejected by the API.
    with pytest.raises(Exception) as e:
        client.update(setting,
                      value="key1=value1:NoSchedule; key2:InvalidEffect")
    assert 'invalid effect' in str(e.value)

    volume_name = "test-toleration-vol"  # NOQA
    volume = create_and_check_volume(client, volume_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)

    setting_value_str = "key1=value1:NoSchedule; key2:NoExecute"
    # Expected parsed form of the setting string above; "key2" has no value,
    # hence operator "Exists".
    setting_value_dicts = [
        {
            "key": "key1",
            "value": "value1",
            "operator": "Equal",
            "effect": "NoSchedule"
        },
        {
            "key": "key2",
            "value": None,
            "operator": "Exists",
            "effect": "NoExecute"
        },
    ]
    # Updating the toleration while a volume is attached must fail.
    with pytest.raises(Exception) as e:
        client.update(setting, value=setting_value_str)
    assert 'cannot modify toleration setting before all volumes are detached' \
        in str(e.value)

    data1 = write_volume_random_data(volume)
    check_volume_data(volume, data1)
    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    # With everything detached the update must now succeed.
    setting = client.update(setting, value=setting_value_str)
    assert setting.value == setting_value_str
    wait_for_toleration_update(core_api, apps_api, count, setting_value_dicts)

    # Components restarted; re-acquire the client and a ready node.
    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data1)
    data2 = write_volume_random_data(volume)
    check_volume_data(volume, data2)
    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    # cleanup: clear the toleration and verify data survives another cycle.
    setting_value_str = ""
    setting_value_dicts = []
    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)
    setting = client.update(setting, value=setting_value_str)
    assert setting.value == setting_value_str
    wait_for_toleration_update(core_api, apps_api, count, setting_value_dicts)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data2)
    data3 = write_volume_random_data(volume)
    check_volume_data(volume, data3)

    cleanup_volume(client, volume)
def write_data_into_pod(pod_name_and_data_path):
    """Write DATA_SIZE_IN_MB_3 of random data into a pod.

    ``pod_name_and_data_path`` is a colon-separated string:
    ``"<pod name>:<data path inside the pod>"``.
    """
    parts = pod_name_and_data_path.split(':')
    core_api = get_core_api_client()  # NOQA
    write_pod_volume_random_data(core_api,
                                 parts[0],
                                 parts[1],
                                 DATA_SIZE_IN_MB_3)
def generate_load(request):
    """Drive a randomized stress workload against one Longhorn volume.

    Creates a volume plus its PV/PVC/pod (registering atexit cleanups in
    reverse dependency order), seeds it with data and recurring jobs, then
    performs N_RANDOM_ACTIONS randomly chosen operations: write/delete data,
    create/delete/revert snapshots, create/restore backups, delete a
    replica. ``snapshots_md5sum`` tracks expected checksums per snapshot so
    reverts/restores can be verified.
    """
    index = get_random_suffix()

    longhorn_api_client = get_longhorn_api_client()
    k8s_api_client = get_core_api_client()

    check_and_set_backupstore(longhorn_api_client)

    volume_name = STRESS_VOLUME_NAME_PREFIX + index
    pv_name = STRESS_PV_NAME_PREFIX + index
    pvc_name = STRESS_PVC_NAME_PREFIX + index
    pod_name = STRESS_POD_NAME_PREFIX + index

    # atexit runs handlers LIFO, so these fire pod -> pvc -> pv -> volume ->
    # datafile, matching teardown dependency order.
    atexit.register(remove_datafile, pod_name)
    atexit.register(delete_and_wait_longhorn, longhorn_api_client,
                    volume_name)
    atexit.register(delete_and_wait_pv, k8s_api_client, pv_name)
    atexit.register(delete_and_wait_pvc, k8s_api_client, pvc_name)
    atexit.register(delete_and_wait_pod, k8s_api_client, pod_name)

    longhorn_volume = create_and_check_volume(longhorn_api_client,
                                              volume_name,
                                              size=VOLUME_SIZE)
    wait_for_volume_detached(longhorn_api_client, volume_name)

    pod_manifest = generate_pod_with_pvc_manifest(pod_name, pvc_name)

    create_pv_for_volume(longhorn_api_client, k8s_api_client,
                         longhorn_volume, pv_name)
    create_pvc_for_volume(longhorn_api_client, k8s_api_client,
                          longhorn_volume, pvc_name)
    create_and_wait_pod(k8s_api_client, pod_manifest)

    # snapshot name -> md5sum of the pod's data at snapshot time.
    snapshots_md5sum = dict()

    write_data(k8s_api_client, pod_name)
    create_recurring_jobs(longhorn_api_client, volume_name)

    global N_RANDOM_ACTIONS
    # NOTE: `round` shadows the builtin; the loop variable is unused.
    for round in range(N_RANDOM_ACTIONS):
        # Dispatch one of 8 random actions per iteration.
        action = randrange(0, 8)
        if action == 0:
            print("write data started: " + time_now(), end=', ')
            write_data(k8s_api_client, pod_name)
            print("ended: " + time_now())
        elif action == 1:
            print("delete data started: " + time_now(), end=', ')
            delete_data(k8s_api_client, pod_name)
            print("ended: " + time_now())
        elif action == 2:
            print("create snapshot started: " + time_now(), end=', ')
            snapshot_create_and_record_md5sum(longhorn_api_client,
                                              k8s_api_client,
                                              volume_name, pod_name,
                                              snapshots_md5sum)
            print("ended: " + time_now())
        elif action == 3:
            print("delete random snapshot started: " + time_now(), end=', ')
            delete_random_snapshot(longhorn_api_client,
                                   volume_name,
                                   snapshots_md5sum)
            print("ended: " + time_now())
        elif action == 4:
            print("revert random snapshot started: " + time_now(), end=', ')
            revert_random_snapshot(longhorn_api_client,
                                   k8s_api_client,
                                   volume_name,
                                   pod_manifest,
                                   snapshots_md5sum)
            print("ended: " + time_now())
        elif action == 5:
            print("create backup started: " + time_now(), end=', ')
            backup_create_and_record_md5sum(longhorn_api_client,
                                            k8s_api_client,
                                            volume_name, pod_name,
                                            snapshots_md5sum)
            print("ended: " + time_now())
        elif action == 6:
            print("delete replica started: " + time_now(), end=', ')
            delete_replica(longhorn_api_client, volume_name)
            print("ended: " + time_now())
        elif action == 7:
            print("restore random backup started: " + time_now(), end=', ')
            restore_and_check_random_backup(longhorn_api_client,
                                            k8s_api_client,
                                            volume_name,
                                            pod_name,
                                            snapshots_md5sum)
            print("ended: " + time_now())

    clean_volume_backups(longhorn_api_client, volume_name)
def test_upgrade(upgrade_image_tag, settings_reset, volume_name, pod_make, statefulset, storage_class):  # NOQA
    """
    Test Longhorn upgrade

    Prerequisite:
      - Disable Auto Salvage Setting

    1. Find the upgrade image tag
    2. Create a volume, generate and write data into the volume.
    3. Create a Pod using a volume, generate and write data
    4. Create a StatefulSet with 2 replicas,
       generate and write data to their volumes
    5. Keep all volumes attached
    6. Upgrade Longhorn system.
    7. Check Pod and StatefulSet didn't restart after upgrade
    8. Check All volumes data
    9. Write data to StatefulSet pods, and Attached volume
    10. Check data written to StatefulSet pods, and attached volume.
    11. Detach the volume, and Delete Pod, and
        StatefulSet to detach their volumes
    12. Upgrade all volumes engine images.
    13. Attach the volume, and recreate Pod, and StatefulSet
    14. Check All volumes data
    """
    new_ei_name = "longhornio/longhorn-engine:" + upgrade_image_tag
    client = get_longhorn_api_client()
    core_api = get_core_api_client()
    host_id = get_self_host_id()
    pod_data_path = "/data/test"

    pod_volume_name = generate_volume_name()

    # Disable auto-salvage so upgrade behavior is observed without
    # automatic recovery interfering.
    auto_salvage_setting = client.by_id_setting(SETTING_AUTO_SALVAGE)
    setting = client.update(auto_salvage_setting, value="false")
    assert setting.name == SETTING_AUTO_SALVAGE
    assert setting.value == "false"

    # Create Volume attached to a node.
    volume1 = create_and_check_volume(client, volume_name, size=SIZE)
    volume1.attach(hostId=host_id)
    volume1 = wait_for_volume_healthy(client, volume_name)
    volume1_data = write_volume_random_data(volume1)

    # Create Volume used by Pod
    pod_name, pv_name, pvc_name, pod_md5sum = \
        prepare_pod_with_data_in_mb(client, core_api, pod_make,
                                    pod_volume_name,
                                    data_path=pod_data_path,
                                    add_liveness_prope=False)

    # Create multiple volumes used by StatefulSet
    statefulset_name = 'statefulset-upgrade-test'
    update_statefulset_manifests(statefulset,
                                 storage_class,
                                 statefulset_name)
    create_storage_class(storage_class)
    create_and_wait_statefulset(statefulset)
    statefulset_pod_info = get_statefulset_pod_info(core_api, statefulset)
    for sspod_info in statefulset_pod_info:
        sspod_info['data'] = generate_random_data(VOLUME_RWTEST_SIZE)
        write_pod_volume_data(core_api,
                              sspod_info['pod_name'],
                              sspod_info['data'])

    # upgrade Longhorn
    assert longhorn_upgrade(upgrade_image_tag)

    # The old client may point at restarted managers; get a fresh one.
    client = get_longhorn_api_client()

    # wait for 1 minute before checking pod restarts
    time.sleep(60)

    # Workload pods must not have restarted during the upgrade.
    pod = core_api.read_namespaced_pod(name=pod_name,
                                       namespace='default')
    assert pod.status.container_statuses[0].restart_count == 0

    for sspod_info in statefulset_pod_info:
        sspod = core_api.read_namespaced_pod(name=sspod_info['pod_name'],
                                             namespace='default')
        assert \
            sspod.status.container_statuses[0].restart_count == 0

    # Data written before the upgrade must be intact.
    for sspod_info in statefulset_pod_info:
        resp = read_volume_data(core_api, sspod_info['pod_name'])
        assert resp == sspod_info['data']

    res_pod_md5sum = get_pod_data_md5sum(core_api, pod_name, pod_data_path)
    assert res_pod_md5sum == pod_md5sum

    check_volume_data(volume1, volume1_data)

    # Volumes must still accept new writes after the upgrade.
    for sspod_info in statefulset_pod_info:
        sspod_info['data'] = generate_random_data(VOLUME_RWTEST_SIZE)
        write_pod_volume_data(core_api,
                              sspod_info['pod_name'],
                              sspod_info['data'])

    for sspod_info in statefulset_pod_info:
        resp = read_volume_data(core_api, sspod_info['pod_name'])
        assert resp == sspod_info['data']

    volume1 = client.by_id_volume(volume_name)
    volume1_data = write_volume_random_data(volume1)
    check_volume_data(volume1, volume1_data)

    # Detach everything: scale the StatefulSet to 0, delete the pod, and
    # detach the standalone volume, then wait for all volumes to detach.
    statefulset['spec']['replicas'] = replicas = 0
    apps_api = get_apps_api_client()
    apps_api.patch_namespaced_stateful_set(
        name=statefulset_name,
        namespace='default',
        body={
            'spec': {
                'replicas': replicas
            }
        })

    delete_and_wait_pod(core_api, pod_name)

    volume = client.by_id_volume(volume_name)
    volume.detach()

    volumes = client.list_volume()
    for v in volumes:
        wait_for_volume_detached(client, v.name)

    # Upgrade every volume's engine to the new engine image.
    engineimages = client.list_engine_image()
    for ei in engineimages:
        if ei.image == new_ei_name:
            new_ei = ei

    volumes = client.list_volume()
    for v in volumes:
        volume = client.by_id_volume(v.name)
        volume.engineUpgrade(image=new_ei.image)

    # Reattach: scale the StatefulSet back up, recreate the pod, and
    # reattach the standalone volume.
    statefulset['spec']['replicas'] = replicas = 2
    apps_api = get_apps_api_client()
    apps_api.patch_namespaced_stateful_set(
        name=statefulset_name,
        namespace='default',
        body={
            'spec': {
                'replicas': replicas
            }
        })
    wait_statefulset(statefulset)

    pod = pod_make(name=pod_name)
    pod['spec']['volumes'] = [create_pvc_spec(pvc_name)]
    create_and_wait_pod(core_api, pod)

    volume1 = client.by_id_volume(volume_name)
    volume1.attach(hostId=host_id)
    volume1 = wait_for_volume_healthy(client, volume_name)

    # All data must have survived the engine upgrade cycle.
    for sspod_info in statefulset_pod_info:
        resp = read_volume_data(core_api, sspod_info['pod_name'])
        assert resp == sspod_info['data']

    res_pod_md5sum = get_pod_data_md5sum(core_api, pod_name, pod_data_path)
    assert res_pod_md5sum == pod_md5sum

    check_volume_data(volume1, volume1_data)
def test_setting_toleration():
    """
    Test toleration setting

    1. Verify that cannot use Kubernetes tolerations for Longhorn setting
    2. Use "key1=value1:NoSchedule; key2:NoExecute" as toleration.
    3. Create a volume and attach it.
    4. Verify that cannot update toleration setting when any volume is
       attached
    5. Generate and write `data1` into the volume
    6. Detach the volume.
    7. Update setting `toleration` to toleration.
    8. Wait for all the Longhorn components to restart with new toleration
    9. Attach the volume again and verify the volume `data1`.
    10. Generate and write `data2` to the volume.
    11. Detach the volume.
    12. Clean the `toleration` setting.
    13. Wait for all the Longhorn components to restart with no toleration
    14. Attach the volume and validate `data2`.
    15. Generate and write `data3` to the volume.
    """
    client = get_longhorn_api_client()  # NOQA
    apps_api = get_apps_api_client()  # NOQA
    core_api = get_core_api_client()  # NOQA
    count = len(client.list_node())

    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)

    # Kubernetes' own default toleration keys must be rejected.
    with pytest.raises(Exception) as e:
        client.update(setting,
                      value=KUBERNETES_DEFAULT_TOLERATION + ":NoSchedule")
    assert "is considered as the key of Kubernetes default tolerations" \
        in str(e.value)
    # An unknown toleration effect must be rejected.
    with pytest.raises(Exception) as e:
        client.update(setting,
                      value="key1=value1:NoSchedule; key2:InvalidEffect")
    assert 'invalid effect' in str(e.value)

    setting_value_str = "key1=value1:NoSchedule; key2:NoExecute"
    # Expected parsed form of the setting string, keyed by toleration key;
    # "key2" carries no value, hence operator "Exists".
    setting_value_dict = \
        {"key1": {"key": "key1", "value": "value1",
                  "operator": "Equal", "effect": "NoSchedule"},
         "key2": {"key": "key2", "value": None,
                  "operator": "Exists", "effect": "NoExecute"}, }

    volume_name = "test-toleration-vol"  # NOQA
    volume = create_and_check_volume(client, volume_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)
    # Updating the toleration while a volume is attached must fail.
    with pytest.raises(Exception) as e:
        client.update(setting, value=setting_value_str)
    assert 'cannot modify toleration setting before all volumes are detached' \
        in str(e.value)

    data1 = write_volume_random_data(volume)
    check_volume_data(volume, data1)
    volume.detach()
    wait_for_volume_detached(client, volume_name)

    # With everything detached the update must now succeed.
    setting = client.update(setting, value=setting_value_str)
    assert setting.value == setting_value_str
    wait_for_toleration_update(core_api, apps_api, count, setting_value_dict)

    # Components restarted; re-acquire the client and a ready node.
    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data1)
    data2 = write_volume_random_data(volume)
    check_volume_data(volume, data2)
    volume.detach()
    wait_for_volume_detached(client, volume_name)

    # cleanup: clear the toleration and verify data survives another cycle.
    setting_value_str = ""
    setting_value_dict = {}
    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)
    setting = client.update(setting, value=setting_value_str)
    assert setting.value == setting_value_str
    wait_for_toleration_update(core_api, apps_api, count, setting_value_dict)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data2)
    data3 = write_volume_random_data(volume)
    check_volume_data(volume, data3)

    cleanup_volume(client, volume)
def finalizer():
    """Teardown helper: delete the pod described by the enclosing
    ``pod_manifest`` and wait until it is fully removed."""
    core_api = get_core_api_client()
    pod_name = pod_manifest['metadata']['name']
    delete_and_wait_pod(core_api, pod_name)