def pytest_collection_modifyitems(config, items):
    if not config.getoption(ENABLE_RECURRING_JOB_OPT):
        skip_upgrade = pytest.mark.skip(reason="need " +
                                        ENABLE_RECURRING_JOB_OPT +
                                        " option to run")
        for item in items:
            if "recurring_job" in item.keywords:
                item.add_marker(skip_upgrade)

    c = Configuration()
    c.assert_hostname = False
    Configuration.set_default(c)
    k8sconfig.load_incluster_config()
    api = k8sclient.CoreV1Api()

    try:
        api.read_namespaced_pod(
            name='csi-provisioner-0', namespace='longhorn-system')
        skip_upgrade = pytest.mark.skip(reason="environment is not using " +
                                        "flexvolume")
        for item in items:
            if "flexvolume" in item.keywords:
                item.add_marker(skip_upgrade)
    except ApiException as e:
        if e.status == 404:
            skip_upgrade = pytest.mark.skip(reason="environment is not " +
                                            "using csi")
            for item in items:
                if "csi" in item.keywords:
                    item.add_marker(skip_upgrade)

    all_nodes_support_mount_propagation = True
    for node in get_longhorn_api_client().list_node():
        node = wait_for_node_mountpropagation_condition(
            get_longhorn_api_client(), node["name"])
        if "conditions" not in node.keys():
            all_nodes_support_mount_propagation = False
        else:
            conditions = node["conditions"]
            for key, condition in conditions.items():
                if key == NODE_CONDITION_MOUNTPROPAGATION and \
                        condition["status"] != CONDITION_STATUS_TRUE:
                    all_nodes_support_mount_propagation = False
                    break
        if not all_nodes_support_mount_propagation:
            break

    if not all_nodes_support_mount_propagation:
        skip_upgrade = pytest.mark.skip(reason="environment does not " +
                                        "support base image")
        skip_node = pytest.mark.skip(reason="environment does not " +
                                     "support mount disk")
        for item in items:
            if "baseimage" in item.keywords:
                item.add_marker(skip_upgrade)
            elif "mountdisk" in item.keywords:
                item.add_marker(skip_node)
def pytest_collection_modifyitems(config, items):
    c = Configuration()
    c.assert_hostname = False
    Configuration.set_default(c)
    k8sconfig.load_incluster_config()
    core_api = k8sclient.CoreV1Api()

    check_longhorn(core_api)

    if config.getoption(SKIP_RECURRING_JOB_OPT):
        skip_upgrade = pytest.mark.skip(reason="remove " +
                                        SKIP_RECURRING_JOB_OPT +
                                        " option to run")
        for item in items:
            if "recurring_job" in item.keywords:
                item.add_marker(skip_upgrade)

    using_csi = check_csi(core_api)
    if using_csi:
        skip_upgrade = pytest.mark.skip(reason="environment is not using " +
                                        "flexvolume")
        for item in items:
            if "flexvolume" in item.keywords:
                item.add_marker(skip_upgrade)
    else:
        skip_upgrade = pytest.mark.skip(reason="environment is not " +
                                        "using csi")
        for item in items:
            if "csi" in item.keywords:
                item.add_marker(skip_upgrade)

    all_nodes_support_mount_propagation = True
    for node in get_longhorn_api_client().list_node():
        node = wait_for_node_mountpropagation_condition(
            get_longhorn_api_client(), node["name"])
        if "conditions" not in node.keys():
            all_nodes_support_mount_propagation = False
        else:
            conditions = node["conditions"]
            for key, condition in conditions.items():
                if key == NODE_CONDITION_MOUNTPROPAGATION and \
                        condition["status"] != CONDITION_STATUS_TRUE:
                    all_nodes_support_mount_propagation = False
                    break
        if not all_nodes_support_mount_propagation:
            break

    if not all_nodes_support_mount_propagation:
        skip_upgrade = pytest.mark.skip(reason="environment does not " +
                                        "support base image")
        skip_node = pytest.mark.skip(reason="environment does not " +
                                     "support mount disk")
        for item in items:
            if "baseimage" in item.keywords:
                item.add_marker(skip_upgrade)
            elif "mountdisk" in item.keywords:
                item.add_marker(skip_node)
def test_offline_node(reset_cluster_ready_status):
    """
    Test offline node

    1. Bring down one of the nodes in the Kubernetes cluster
       (avoid the current node)
    2. Make sure the Longhorn node state becomes `down`
    """
    node_worker_label = 'node-role.kubernetes.io/worker'
    pod_label_selector = "longhorn-test=test-job"
    node_controlplane_label = 'node-role.kubernetes.io/control-plane'
    node_ip_annotation = "flannel.alpha.coreos.com/public-ip"

    k8s_api_client = get_core_api_client()
    longhorn_api_client = get_longhorn_api_client()
    cloudprovider = detect_cloudprovider()

    for pod in k8s_api_client.list_namespaced_pod(
            'default', label_selector=pod_label_selector).items:
        if pod.metadata.name == "longhorn-test":
            longhorn_test_node_name = pod.spec.node_name

    k3s = is_infra_k3s()
    for node_item in k8s_api_client.list_node().items:
        if k3s is True:
            if node_controlplane_label not in node_item.metadata.labels:
                node_name = node_item.metadata.name
                node_ip = node_item.metadata.annotations[node_ip_annotation]
                if node_name == longhorn_test_node_name:
                    continue
                else:
                    node = cloudprovider.instance_id_by_ip(node_ip)
                    break
        else:
            if node_worker_label in node_item.metadata.labels and \
                    node_item.metadata.labels[node_worker_label] == 'true':
                node_name = node_item.metadata.name
                if node_name == longhorn_test_node_name:
                    continue
                else:
                    node = cloudprovider.instance_id(node_name)
                    break

    print(f'==> stop node: {node_name}')
    cloudprovider.instance_stop(node)
    wait_for_node_down_aws(cloudprovider, node)

    k8s_node_down = wait_for_node_down_k8s(node_name, k8s_api_client)
    assert k8s_node_down

    longhorn_api_client = get_longhorn_api_client()
    longhorn_node_down = wait_for_node_down_longhorn(node_name,
                                                     longhorn_api_client)
    assert longhorn_node_down
def finalizer():
    node_worker_label = 'node-role.kubernetes.io/worker'

    k8s_api_client = get_core_api_client()
    longhorn_api_client = get_longhorn_api_client()
    cloudprovider = detect_cloudprovider()

    for node_item in k8s_api_client.list_node().items:
        if node_worker_label in node_item.metadata.labels and \
                node_item.metadata.labels[node_worker_label] == 'true':
            node_name = node_item.metadata.name
            if is_node_ready_k8s(node_name, k8s_api_client) is False:
                node = cloudprovider.node_id(node_name)
                cloudprovider.node_start(node)
                node_up_k8s = wait_for_node_up_k8s(node_name,
                                                   k8s_api_client)
                assert node_up_k8s
            else:
                continue
            node_up_longhorn = \
                wait_for_node_up_longhorn(node_name, longhorn_api_client)
            assert node_up_longhorn
def reset_disk_settings():
    yield
    api = get_longhorn_api_client()

    setting = api.by_id_setting(CREATE_DEFAULT_DISK_SETTING)
    api.update(setting, value="false")

    setting = api.by_id_setting(DEFAULT_DATA_PATH_SETTING)
    api.update(setting, value=DEFAULT_DISK_PATH)
def test_offline_node(reset_cluster_ready_status):
    node_worker_label = 'node-role.kubernetes.io/worker'
    pod_label_selector = "longhorn-test=test-job"

    k8s_api_client = get_core_api_client()
    longhorn_api_client = get_longhorn_api_client()
    cloudprovider = detect_cloudprovider()

    for pod in k8s_api_client.list_namespaced_pod(
            'default', label_selector=pod_label_selector).items:
        if pod.metadata.name == "longhorn-test":
            longhorn_test_node_name = pod.spec.node_name

    for node_item in k8s_api_client.list_node().items:
        if node_worker_label in node_item.metadata.labels and \
                node_item.metadata.labels[node_worker_label] == 'true':
            node_name = node_item.metadata.name
            if node_name == longhorn_test_node_name:
                continue
            else:
                break

    node = cloudprovider.node_id(node_name)
    cloudprovider.node_shutdown(node)

    k8s_node_down = wait_for_node_down_k8s(node_name, k8s_api_client)
    assert k8s_node_down

    longhorn_node_down = wait_for_node_down_longhorn(node_name,
                                                     longhorn_api_client)
    assert longhorn_node_down
def check_workload_update(core_api, apps_api, count):  # NOQA
    da_list = apps_api.list_namespaced_daemon_set(LONGHORN_NAMESPACE).items
    for da in da_list:
        if da.status.updated_number_scheduled != count:
            return False

    dp_list = apps_api.list_namespaced_deployment(LONGHORN_NAMESPACE).items
    for dp in dp_list:
        if dp.status.updated_replicas != dp.spec.replicas:
            return False

    im_pod_list = core_api.list_namespaced_pod(
        LONGHORN_NAMESPACE,
        label_selector="longhorn.io/component=instance-manager").items
    if len(im_pod_list) != 2 * count:
        return False

    for p in im_pod_list:
        if p.status.phase != "Running":
            return False

    client = get_longhorn_api_client()  # NOQA
    images = client.list_engine_image()
    assert len(images) == 1
    ei_state = get_engine_image_status_value(client, images[0].name)
    if images[0].state != ei_state:
        return False

    return True
def wait_for_toleration_update(core_api, apps_api, count, set_tolerations):  # NOQA
    updated = False

    for i in range(RETRY_COUNTS):
        time.sleep(RETRY_INTERVAL_LONG)
        updated = True

        da_list = \
            apps_api.list_namespaced_daemon_set(LONGHORN_NAMESPACE).items
        for da in da_list:
            if da.status.updated_number_scheduled != count:
                updated = False
                break
        if not updated:
            continue

        dp_list = \
            apps_api.list_namespaced_deployment(LONGHORN_NAMESPACE).items
        for dp in dp_list:
            if dp.status.updated_replicas != dp.spec.replicas:
                updated = False
                break
        if not updated:
            continue

        im_pod_list = core_api.list_namespaced_pod(
            LONGHORN_NAMESPACE,
            label_selector="longhorn.io/component=instance-manager").items
        if len(im_pod_list) != 2 * count:
            updated = False
            continue

        for p in im_pod_list:
            if p.status.phase != "Running":
                updated = False
                break
        if not updated:
            continue

        pod_list = core_api.list_namespaced_pod(LONGHORN_NAMESPACE).items
        for p in pod_list:
            if p.status.phase != "Running" or \
                    not check_tolerations_set(p.spec.tolerations,
                                              set_tolerations):
                updated = False
                break
        if not updated:
            continue

        client = get_longhorn_api_client()  # NOQA
        images = client.list_engine_image()
        assert len(images) == 1
        if images[0].state != "ready":
            updated = False
            continue

        if updated:
            break
    assert updated
def reset_settings():
    yield
    client = get_longhorn_api_client()  # NOQA
    host_id = get_self_host_id()
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=True)

    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="true")
def test_offline_node(reset_cluster_ready_status):
    """
    Test offline node

    1. Bring down one of the nodes in the Kubernetes cluster
       (avoid the current node)
    2. Make sure the Longhorn node state becomes `down`
    """
    node_worker_label = 'node-role.kubernetes.io/worker'
    pod_label_selector = "longhorn-test=test-job"

    k8s_api_client = get_core_api_client()
    longhorn_api_client = get_longhorn_api_client()
    cloudprovider = detect_cloudprovider()

    for pod in k8s_api_client.list_namespaced_pod(
            'default', label_selector=pod_label_selector).items:
        if pod.metadata.name == "longhorn-test":
            longhorn_test_node_name = pod.spec.node_name

    for node_item in k8s_api_client.list_node().items:
        if node_worker_label in node_item.metadata.labels and \
                node_item.metadata.labels[node_worker_label] == 'true':
            node_name = node_item.metadata.name
            if node_name == longhorn_test_node_name:
                continue
            else:
                break

    node = cloudprovider.instance_id(node_name)
    cloudprovider.instance_stop(node)

    k8s_node_down = wait_for_node_down_k8s(node_name, k8s_api_client)
    assert k8s_node_down

    longhorn_api_client = get_longhorn_api_client()
    longhorn_node_down = wait_for_node_down_longhorn(node_name,
                                                     longhorn_api_client)
    assert longhorn_node_down
def wait_for_longhorn_node_ready():
    client = get_longhorn_api_client()  # NOQA

    ei = get_default_engine_image(client)
    ei_name = ei["name"]
    ei_state = get_engine_image_status_value(client, ei_name)
    wait_for_engine_image_state(client, ei_name, ei_state)

    node = get_self_host_id()
    wait_for_node_up_longhorn(node, client)

    return client, node
def reset_default_disk_label():
    yield
    k8sapi = get_core_api_client()
    lhapi = get_longhorn_api_client()
    nodes = lhapi.list_node()
    for node in nodes:
        k8sapi.patch_node(
            node["id"],
            {"metadata": {"labels": {CREATE_DEFAULT_DISK_LABEL: None}}})
def reset_cluster_ready_status(request):
    yield
    node_worker_label = 'node-role.kubernetes.io/worker'
    node_controlplane_label = 'node-role.kubernetes.io/control-plane'
    node_ip_annotation = "flannel.alpha.coreos.com/public-ip"

    k8s_api_client = get_core_api_client()
    longhorn_api_client = get_longhorn_api_client()
    cloudprovider = detect_cloudprovider()
    k3s = is_infra_k3s()

    print('==> test completed! reset cluster ready status ...')

    for node_item in k8s_api_client.list_node().items:
        if k3s is True:
            if node_controlplane_label not in node_item.metadata.labels:
                node_name = node_item.metadata.name
                node_ip = node_item.metadata.annotations[node_ip_annotation]
                node = cloudprovider.instance_id_by_ip(node_ip)
            else:
                continue
        else:
            if node_worker_label in node_item.metadata.labels and \
                    node_item.metadata.labels[node_worker_label] == 'true':
                node_name = node_item.metadata.name
                node = cloudprovider.instance_id(node_name)
            else:
                continue

        if is_node_ready_k8s(node_name, k8s_api_client) is False:
            cloudprovider.instance_start(node)

            print(f'==> wait for aws node {node_name} up ...')
            aws_node_up = wait_for_node_up_aws(cloudprovider, node)
            assert aws_node_up, f'expect aws node {node_name} up'

            node_up_k8s = wait_for_node_up_k8s(node_name, k8s_api_client)
            assert node_up_k8s
        else:
            continue

        node_up_longhorn = \
            wait_for_node_up_longhorn(node_name, longhorn_api_client)
        assert node_up_longhorn
def test_reset_env():
    k8s_api_client = get_core_api_client()
    k8s_storage_client = get_storage_api_client()
    longhorn_api_client = get_longhorn_api_client()

    pod_list = k8s_api_client.list_namespaced_pod("default")
    for pod in pod_list.items:
        if STRESS_POD_NAME_PREFIX in pod.metadata.name:
            delete_and_wait_pod(k8s_api_client, pod.metadata.name)

    pvc_list = \
        k8s_api_client.list_namespaced_persistent_volume_claim("default")
    for pvc in pvc_list.items:
        if STRESS_PVC_NAME_PREFIX in pvc.metadata.name:
            delete_and_wait_pvc(k8s_api_client, pvc.metadata.name)

    pv_list = k8s_api_client.list_persistent_volume()
    for pv in pv_list.items:
        pv_name = pv.metadata.name
        if STRESS_PV_NAME_PREFIX in pv_name:
            try:
                delete_and_wait_pv(k8s_api_client, pv_name)
            except AssertionError:
                volumeattachment_list = \
                    k8s_storage_client.list_volume_attachment()
                for volumeattachment in volumeattachment_list.items:
                    volume_attachment_name = \
                        volumeattachment.spec.source.persistent_volume_name
                    if volume_attachment_name == pv_name:
                        delete_and_wait_volume_attachment(
                            k8s_storage_client,
                            volume_attachment_name
                        )
                        delete_and_wait_pv(k8s_api_client, pv.metadata.name)

    volume_list = longhorn_api_client.list_volume()
    for volume in volume_list.data:
        if STRESS_VOLUME_NAME_PREFIX in volume.name:
            delete_and_wait_longhorn(longhorn_api_client, volume.name)
def test_setting_toleration_extra(core_api, apps_api):  # NOQA
    """
    Steps:
    1. Set Kubernetes Taint Toleration to:
       `ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule`.
    2. Verify that all system components have the 2 tolerations
       `ex.com/foobar:NoExecute; ex.com/foobar:NoSchedule`.
       Verify that UI, manager, and driver deployer don't restart and
       don't have the tolerations.
    3. Set Kubernetes Taint Toleration to:
       `node-role.kubernetes.io/controlplane=true:NoSchedule`.
    4. Verify that all system components have the toleration
       `node-role.kubernetes.io/controlplane=true:NoSchedule`,
       and don't have the 2 tolerations
       `ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule`.
       Verify that UI, manager, and driver deployer don't restart and
       don't have the toleration.
    5. Set Kubernetes Taint Toleration to the special value: `:`.
    6. Verify that all system components have the toleration with
       `operator: Exists` and all other fields of the toleration empty.
       Verify that all system components don't have the toleration
       `node-role.kubernetes.io/controlplane=true:NoSchedule`.
       Verify that UI, manager, and driver deployer don't restart and
       don't have the toleration.
    7. Clear the Kubernetes Taint Toleration.

    Note: system components are workloads other than UI, manager,
    driver deployer
    """
    settings = [
        {
            "value": "ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule",
            "expect": [
                {
                    "key": "ex.com/foobar",
                    "value": None,
                    "operator": "Exists",
                    "effect": "NoExecute"
                },
                {
                    "key": "ex.com/foobar",
                    "value": None,
                    "operator": "Exists",
                    "effect": "NoSchedule"
                },
            ],
        },
        {
            "value": "node-role.kubernetes.io/controlplane=true:NoSchedule",
            "expect": [
                {
                    "key": "node-role.kubernetes.io/controlplane",
                    "value": "true",
                    "operator": "Equal",
                    "effect": "NoSchedule"
                },
            ],
        },
        # Skip this special toleration for now because it makes Longhorn
        # deploy manager pods on control/etcd nodes, and the control/etcd
        # nodes become "down" after the test clears this toleration.
        # We will enable this test once we implement logic for deleting
        # failed nodes.
        # {
        #     "value": ":",
        #     "expect": [
        #         {
        #             "key": None,
        #             "value": None,
        #             "operator": "Exists",
        #             "effect": None,
        #         },
        #     ],
        # },
        {
            "value": "",
            "expect": [],
        },
    ]

    chk_removed_tolerations = []
    for setting in settings:
        client = get_longhorn_api_client()  # NOQA
        taint_toleration = client.by_id_setting(SETTING_TAINT_TOLERATION)
        updated = client.update(taint_toleration, value=setting["value"])
        assert updated.value == setting["value"]

        node_count = len(client.list_node())
        wait_for_toleration_update(core_api, apps_api, node_count,
                                   setting["expect"],
                                   chk_removed_tolerations)
        chk_removed_tolerations = setting["expect"]
def test_offline_node_with_attached_volume_and_pod(
        client, core_api, volume_name, make_deployment_with_pvc,
        reset_cluster_ready_status):  # NOQA
    """
    Test offline node with attached volume and pod

    1. Create PV/PVC/Deployment manifest.
    2. Update the deployment's tolerations to 20 seconds to speed up the test
    3. Update the deployment's node affinity rule to avoid the current node
    4. Create the volume, PV/PVC, and deployment.
    5. Find the pod in the deployment and write `test_data` into it
    6. Shut down the node the pod is running on
    7. Wait for the deployment to delete the pod
        1. The deployment cannot delete the pod here because the kubelet
           doesn't respond
    8. Force delete the terminating pod
    9. Wait for the new pod to be created and the volume attached
    10. Check `test_data` in the new pod
    """
    toleration_seconds = 20

    apps_api = get_apps_api_client()
    cloudprovider = detect_cloudprovider()

    volume_name = generate_volume_name()
    pv_name = volume_name + "-pv"
    pvc_name = volume_name + "-pvc"
    deployment_name = volume_name + "-dep"

    longhorn_test_node_name = get_self_host_id()

    deployment_manifest = make_deployment_with_pvc(deployment_name, pvc_name)

    unreachable_toleration = {
        "key": "node.kubernetes.io/unreachable",
        "operator": "Exists",
        "effect": "NoExecute",
        "tolerationSeconds": toleration_seconds
    }
    not_ready_toleration = {
        "key": "node.kubernetes.io/not-ready",
        "operator": "Exists",
        "effect": "NoExecute",
        "tolerationSeconds": toleration_seconds
    }
    deployment_manifest["spec"]["template"]["spec"]["tolerations"] = \
        [unreachable_toleration, not_ready_toleration]

    node_affinity_roles = {
        "nodeAffinity": {
            "requiredDuringSchedulingIgnoredDuringExecution": {
                "nodeSelectorTerms": [{
                    "matchExpressions": [{
                        "key": "kubernetes.io/hostname",
                        "operator": "NotIn",
                        "values": [longhorn_test_node_name]
                    }]
                }]
            }
        }
    }
    deployment_manifest["spec"]["template"]["spec"]["affinity"] = \
        node_affinity_roles

    longhorn_volume = create_and_check_volume(client, volume_name, size=SIZE)
    wait_for_volume_detached(client, volume_name)
    create_pv_for_volume(client, core_api, longhorn_volume, pv_name)
    create_pvc_for_volume(client, core_api, longhorn_volume, pvc_name)

    create_and_wait_deployment(apps_api, deployment_manifest)

    deployment_label_selector = \
        "name=" + deployment_manifest["metadata"]["labels"]["name"]
    deployment_pod_list = \
        core_api.list_namespaced_pod(
            namespace="default",
            label_selector=deployment_label_selector)
    assert len(deployment_pod_list.items) == 1

    pod_name = deployment_pod_list.items[0].metadata.name
    test_data = generate_random_data(VOLUME_RWTEST_SIZE)
    write_pod_volume_data(core_api, pod_name, test_data)

    node_name = deployment_pod_list.items[0].spec.node_name
    node = cloudprovider.node_id(node_name)
    cloudprovider.node_shutdown(node)

    k8s_node_down = wait_for_node_down_k8s(node_name, core_api)
    assert k8s_node_down

    client = get_longhorn_api_client()
    longhorn_node_down = wait_for_node_down_longhorn(node_name, client)
    assert longhorn_node_down

    time.sleep(toleration_seconds + 5)

    for i in range(TERMINATING_POD_RETRYS):
        deployment_pod_list = \
            core_api.list_namespaced_pod(
                namespace="default",
                label_selector=deployment_label_selector
            )

        terminating_pod_name = None
        for pod in deployment_pod_list.items:
            if pod.metadata.deletion_timestamp is not None:
                terminating_pod_name = pod.metadata.name
                break

        if terminating_pod_name is not None:
            break
        else:
            time.sleep(TERMINATING_POD_INTERVAL)

    assert terminating_pod_name is not None

    core_api.delete_namespaced_pod(namespace="default",
                                   name=terminating_pod_name,
                                   grace_period_seconds=0)
    delete_and_wait_pod(core_api, terminating_pod_name)

    deployment_pod_list = \
        core_api.list_namespaced_pod(
            namespace="default",
            label_selector=deployment_label_selector
        )
    assert len(deployment_pod_list.items) == 1

    wait_for_volume_detached(client, volume_name)
    wait_for_volume_healthy(client, volume_name)

    deployment_pod_list = \
        core_api.list_namespaced_pod(
            namespace="default",
            label_selector=deployment_label_selector
        )
    assert len(deployment_pod_list.items) == 1

    new_pod_name = deployment_pod_list.items[0].metadata.name
    wait_pod(new_pod_name)

    resp_data = read_volume_data(core_api, new_pod_name)
    assert test_data == resp_data
def minio_get_backup_volume_prefix(volume_name):
    client = get_longhorn_api_client()

    backupstore_bv_path = backup_volume_path(volume_name)
    backupstore_path = minio_get_backupstore_path(client)

    return backupstore_path + backupstore_bv_path
def test_setting_toleration():
    """
    Test toleration setting

    1. Verify that Kubernetes default tolerations cannot be used for the
       Longhorn setting
    2. Use "key1=value1:NoSchedule; key2:NoExecute" as the toleration.
    3. Create a volume and attach it.
    4. Verify that the toleration setting cannot be updated while any
       volume is attached
    5. Generate and write `data1` into the volume
    6. Detach the volume.
    7. Update the `toleration` setting to the new toleration.
    8. Wait for all the Longhorn components to restart with the new
       toleration
    9. Attach the volume again and verify the volume `data1`.
    10. Generate and write `data2` to the volume.
    11. Detach the volume.
    12. Clean the `toleration` setting.
    13. Wait for all the Longhorn components to restart with no toleration
    14. Attach the volume and validate `data2`.
    15. Generate and write `data3` to the volume.
    """
    client = get_longhorn_api_client()  # NOQA
    apps_api = get_apps_api_client()  # NOQA
    core_api = get_core_api_client()  # NOQA
    count = len(client.list_node())

    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)

    with pytest.raises(Exception) as e:
        client.update(setting,
                      value=KUBERNETES_DEFAULT_TOLERATION + ":NoSchedule")
    assert "is considered as the key of Kubernetes default tolerations" \
        in str(e.value)

    with pytest.raises(Exception) as e:
        client.update(setting,
                      value="key1=value1:NoSchedule; key2:InvalidEffect")
    assert 'invalid effect' in str(e.value)

    setting_value_str = "key1=value1:NoSchedule; key2:NoExecute"
    setting_value_dict = {
        "key1": {"key": "key1", "value": "value1",
                 "operator": "Equal", "effect": "NoSchedule"},
        "key2": {"key": "key2", "value": None,
                 "operator": "Exists", "effect": "NoExecute"},
    }

    volume_name = "test-toleration-vol"  # NOQA
    volume = create_and_check_volume(client, volume_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)

    with pytest.raises(Exception) as e:
        client.update(setting, value=setting_value_str)
    assert 'cannot modify toleration setting before all volumes are ' \
           'detached' in str(e.value)

    data1 = write_volume_random_data(volume)
    check_volume_data(volume, data1)

    volume.detach()
    wait_for_volume_detached(client, volume_name)

    setting = client.update(setting, value=setting_value_str)
    assert setting.value == setting_value_str
    wait_for_toleration_update(core_api, apps_api, count, setting_value_dict)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data1)
    data2 = write_volume_random_data(volume)
    check_volume_data(volume, data2)
    volume.detach()
    wait_for_volume_detached(client, volume_name)

    # cleanup
    setting_value_str = ""
    setting_value_dict = {}
    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)
    setting = client.update(setting, value=setting_value_str)
    assert setting.value == setting_value_str
    wait_for_toleration_update(core_api, apps_api, count, setting_value_dict)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data2)
    data3 = write_volume_random_data(volume)
    check_volume_data(volume, data3)

    cleanup_volume(client, volume)
def pytest_collection_modifyitems(config, items):
    c = Configuration()
    c.assert_hostname = False
    Configuration.set_default(c)
    k8sconfig.load_incluster_config()
    core_api = k8sclient.CoreV1Api()

    check_longhorn(core_api)

    include_base_image = config.getoption(INCLUDE_BASE_IMAGE_OPT)
    if not include_base_image:
        skip_base_image = pytest.mark.skip(reason="set " +
                                           INCLUDE_BASE_IMAGE_OPT +
                                           " option to run")
        for item in items:
            if "baseimage" in item.keywords:
                item.add_marker(skip_base_image)

    if config.getoption(SKIP_RECURRING_JOB_OPT):
        skip_upgrade = pytest.mark.skip(reason="remove " +
                                        SKIP_RECURRING_JOB_OPT +
                                        " option to run")
        for item in items:
            if "recurring_job" in item.keywords:
                item.add_marker(skip_upgrade)

    csi_expansion_enabled = check_csi_expansion(core_api)
    if not csi_expansion_enabled:
        skip_csi_expansion = pytest.mark.skip(reason="environment is not " +
                                              "using csi expansion")
        for item in items:
            if "csi_expansion" in item.keywords:
                item.add_marker(skip_csi_expansion)

    all_nodes_support_mount_propagation = True
    for node in get_longhorn_api_client().list_node():
        node = wait_for_node_mountpropagation_condition(
            get_longhorn_api_client(), node.name)
        if "conditions" not in node.keys():
            all_nodes_support_mount_propagation = False
        else:
            conditions = node.conditions
            for key, condition in conditions.items():
                if key == NODE_CONDITION_MOUNTPROPAGATION and \
                        condition.status != CONDITION_STATUS_TRUE:
                    all_nodes_support_mount_propagation = False
                    break
        if not all_nodes_support_mount_propagation:
            break

    if not all_nodes_support_mount_propagation:
        skip_upgrade = pytest.mark.skip(reason="environment does not " +
                                        "support base image")
        skip_node = pytest.mark.skip(reason="environment does not " +
                                     "support mount disk")
        for item in items:
            # Don't need to add the skip marker for Base Image twice.
            if include_base_image and "baseimage" in item.keywords:
                item.add_marker(skip_upgrade)
            elif "mountdisk" in item.keywords:
                item.add_marker(skip_node)

    if not config.getoption(INCLUDE_INFRA_OPT):
        skip_infra = pytest.mark.skip(reason="include " +
                                      INCLUDE_INFRA_OPT +
                                      " option to run")
        for item in items:
            if "infra" in item.keywords:
                item.add_marker(skip_infra)

    if not config.getoption(INCLUDE_STRESS_OPT):
        skip_stress = pytest.mark.skip(reason="include " +
                                       INCLUDE_STRESS_OPT +
                                       " option to run")
        for item in items:
            if "stress" in item.keywords:
                item.add_marker(skip_stress)

    if not config.getoption(INCLUDE_UPGRADE_OPT):
        skip_upgrade = pytest.mark.skip(reason="include " +
                                        INCLUDE_UPGRADE_OPT +
                                        " option to run")
        for item in items:
            if "upgrade" in item.keywords:
                item.add_marker(skip_upgrade)
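# Illustrative sketch, not taken from the snippets above: the flags read via
# config.getoption(), e.g. SKIP_RECURRING_JOB_OPT or INCLUDE_STRESS_OPT, are
# assumed here to be plain CLI flag strings registered through pytest's
# standard pytest_addoption hook, roughly as follows.
def pytest_addoption(parser):
    # Boolean flags default to False; passing the flag on the command line
    # makes the corresponding config.getoption() call return True.
    parser.addoption(SKIP_RECURRING_JOB_OPT, action="store_true",
                     default=False, help="skip recurring job tests")
    parser.addoption(INCLUDE_STRESS_OPT, action="store_true",
                     default=False, help="include stress tests")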
def test_setting_priority_class(core_api, apps_api, scheduling_api, priority_class, volume_name):  # NOQA
    """
    Test that the Priority Class setting is validated and utilized correctly.

    1. Verify that the name of a non-existent Priority Class cannot be used
       for the Setting.
    2. Create a new Priority Class in Kubernetes.
    3. Create and attach a Volume.
    4. Verify that the Priority Class Setting cannot be updated with an
       attached Volume.
    5. Generate and write `data1`.
    6. Detach the Volume.
    7. Update the Priority Class Setting to the new Priority Class.
    8. Wait for all the Longhorn system components to restart with the new
       Priority Class.
    9. Verify that UI, manager, and driver deployer don't have the Priority
       Class.
    10. Attach the Volume and verify `data1`.
    11. Generate and write `data2`.
    12. Unset the Priority Class Setting.
    13. Wait for all the Longhorn system components to restart with the new
        Priority Class.
    14. Verify that UI, manager, and driver deployer don't have the Priority
        Class.
    15. Attach the Volume and verify `data2`.
    16. Generate and write `data3`.

    Note: system components are workloads other than UI, manager,
    driver deployer
    """
    client = get_longhorn_api_client()  # NOQA
    count = len(client.list_node())
    name = priority_class['metadata']['name']
    setting = client.by_id_setting(SETTING_PRIORITY_CLASS)

    with pytest.raises(Exception) as e:
        client.update(setting, value=name)
    assert 'failed to get priority class ' in str(e.value)

    scheduling_api.create_priority_class(priority_class)

    volume = create_and_check_volume(client, volume_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)

    with pytest.raises(Exception) as e:
        client.update(setting, value=name)
    assert 'cannot modify priority class setting before all volumes are ' \
           'detached' in str(e.value)

    data1 = write_volume_random_data(volume)
    check_volume_data(volume, data1)

    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    setting = client.update(setting, value=name)
    assert setting.value == name
    wait_for_priority_class_update(core_api, apps_api, count, priority_class)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data1)
    data2 = write_volume_random_data(volume)
    check_volume_data(volume, data2)
    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    setting = client.by_id_setting(SETTING_PRIORITY_CLASS)
    setting = client.update(setting, value='')
    assert setting.value == ''
    wait_for_priority_class_update(core_api, apps_api, count)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data2)
    data3 = write_volume_random_data(volume)
    check_volume_data(volume, data3)

    cleanup_volume(client, volume)
def test_upgrade(upgrade_image_tag, settings_reset, volume_name, pod_make, statefulset, storage_class):  # NOQA
    """
    Test Longhorn upgrade

    Prerequisite:
      - Disable the Auto Salvage setting

    1. Find the upgrade image tag
    2. Create a volume, generate and write data into the volume.
    3. Create a Pod using a volume, generate and write data
    4. Create a StatefulSet with 2 replicas,
       generate and write data to their volumes
    5. Keep all volumes attached
    6. Upgrade the Longhorn system.
    7. Check that the Pod and the StatefulSet didn't restart after the
       upgrade
    8. Check all volumes' data
    9. Write data to the StatefulSet pods and the attached volume
    10. Check the data written to the StatefulSet pods and the attached
        volume.
    11. Detach the volume, and delete the Pod and the StatefulSet to detach
        their volumes
    12. Upgrade all volumes' engine images.
    13. Attach the volume, and recreate the Pod and the StatefulSet
    14. Check all volumes' data
    """
    new_ei_name = "longhornio/longhorn-engine:" + upgrade_image_tag

    client = get_longhorn_api_client()
    core_api = get_core_api_client()
    host_id = get_self_host_id()
    pod_data_path = "/data/test"
    pod_volume_name = generate_volume_name()

    auto_salvage_setting = client.by_id_setting(SETTING_AUTO_SALVAGE)
    setting = client.update(auto_salvage_setting, value="false")
    assert setting.name == SETTING_AUTO_SALVAGE
    assert setting.value == "false"

    # Create a volume attached to a node.
    volume1 = create_and_check_volume(client, volume_name, size=SIZE)
    volume1.attach(hostId=host_id)
    volume1 = wait_for_volume_healthy(client, volume_name)
    volume1_data = write_volume_random_data(volume1)

    # Create a volume used by a Pod
    pod_name, pv_name, pvc_name, pod_md5sum = \
        prepare_pod_with_data_in_mb(client, core_api, pod_make,
                                    pod_volume_name,
                                    data_path=pod_data_path,
                                    add_liveness_prope=False)

    # Create multiple volumes used by a StatefulSet
    statefulset_name = 'statefulset-upgrade-test'
    update_statefulset_manifests(statefulset,
                                 storage_class,
                                 statefulset_name)
    create_storage_class(storage_class)
    create_and_wait_statefulset(statefulset)
    statefulset_pod_info = get_statefulset_pod_info(core_api, statefulset)

    for sspod_info in statefulset_pod_info:
        sspod_info['data'] = generate_random_data(VOLUME_RWTEST_SIZE)
        write_pod_volume_data(core_api,
                              sspod_info['pod_name'],
                              sspod_info['data'])

    # upgrade Longhorn
    assert longhorn_upgrade(upgrade_image_tag)

    client = get_longhorn_api_client()

    # wait for 1 minute before checking pod restarts
    time.sleep(60)

    pod = core_api.read_namespaced_pod(name=pod_name, namespace='default')
    assert pod.status.container_statuses[0].restart_count == 0

    for sspod_info in statefulset_pod_info:
        sspod = core_api.read_namespaced_pod(name=sspod_info['pod_name'],
                                             namespace='default')
        assert sspod.status.container_statuses[0].restart_count == 0

    for sspod_info in statefulset_pod_info:
        resp = read_volume_data(core_api, sspod_info['pod_name'])
        assert resp == sspod_info['data']

    res_pod_md5sum = get_pod_data_md5sum(core_api, pod_name, pod_data_path)
    assert res_pod_md5sum == pod_md5sum

    check_volume_data(volume1, volume1_data)

    for sspod_info in statefulset_pod_info:
        sspod_info['data'] = generate_random_data(VOLUME_RWTEST_SIZE)
        write_pod_volume_data(core_api,
                              sspod_info['pod_name'],
                              sspod_info['data'])

    for sspod_info in statefulset_pod_info:
        resp = read_volume_data(core_api, sspod_info['pod_name'])
        assert resp == sspod_info['data']

    volume1 = client.by_id_volume(volume_name)
    volume1_data = write_volume_random_data(volume1)
    check_volume_data(volume1, volume1_data)

    statefulset['spec']['replicas'] = replicas = 0
    apps_api = get_apps_api_client()
    apps_api.patch_namespaced_stateful_set(
        name=statefulset_name,
        namespace='default',
        body={
            'spec': {
                'replicas': replicas
            }
        })

    delete_and_wait_pod(core_api, pod_name)

    volume = client.by_id_volume(volume_name)
    volume.detach()

    volumes = client.list_volume()
    for v in volumes:
        wait_for_volume_detached(client, v.name)

    engineimages = client.list_engine_image()
    for ei in engineimages:
        if ei.image == new_ei_name:
            new_ei = ei

    volumes = client.list_volume()
    for v in volumes:
        volume = client.by_id_volume(v.name)
        volume.engineUpgrade(image=new_ei.image)

    statefulset['spec']['replicas'] = replicas = 2
    apps_api = get_apps_api_client()
    apps_api.patch_namespaced_stateful_set(
        name=statefulset_name,
        namespace='default',
        body={
            'spec': {
                'replicas': replicas
            }
        })
    wait_statefulset(statefulset)

    pod = pod_make(name=pod_name)
    pod['spec']['volumes'] = [create_pvc_spec(pvc_name)]
    create_and_wait_pod(core_api, pod)

    volume1 = client.by_id_volume(volume_name)
    volume1.attach(hostId=host_id)
    volume1 = wait_for_volume_healthy(client, volume_name)

    for sspod_info in statefulset_pod_info:
        resp = read_volume_data(core_api, sspod_info['pod_name'])
        assert resp == sspod_info['data']

    res_pod_md5sum = get_pod_data_md5sum(core_api, pod_name, pod_data_path)
    assert res_pod_md5sum == pod_md5sum

    check_volume_data(volume1, volume1_data)
def test_setting_toleration():
    """
    Test toleration setting

    1. Set `taint-toleration` to "key1=value1:NoSchedule; key2:InvalidEffect".
    2. Verify the request fails.
    3. Create a volume and attach it.
    4. Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute".
    5. Verify that the toleration setting cannot be updated while any
       volume is attached.
    6. Generate and write `data1` into the volume.
    7. Detach the volume.
    8. Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute".
    9. Wait for all the Longhorn system components to restart with the new
       toleration.
    10. Verify that UI, manager, and driver deployer don't restart and
        don't have the new toleration.
    11. Attach the volume again and verify the volume `data1`.
    12. Generate and write `data2` to the volume.
    13. Detach the volume.
    14. Clean the `toleration` setting.
    15. Wait for all the Longhorn system components to restart with no
        toleration.
    16. Attach the volume and validate `data2`.
    17. Generate and write `data3` to the volume.
    """
    client = get_longhorn_api_client()  # NOQA
    apps_api = get_apps_api_client()  # NOQA
    core_api = get_core_api_client()  # NOQA
    count = len(client.list_node())

    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)

    with pytest.raises(Exception) as e:
        client.update(setting,
                      value="key1=value1:NoSchedule; key2:InvalidEffect")
    assert 'invalid effect' in str(e.value)

    volume_name = "test-toleration-vol"  # NOQA
    volume = create_and_check_volume(client, volume_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)

    setting_value_str = "key1=value1:NoSchedule; key2:NoExecute"
    setting_value_dicts = [
        {
            "key": "key1",
            "value": "value1",
            "operator": "Equal",
            "effect": "NoSchedule"
        },
        {
            "key": "key2",
            "value": None,
            "operator": "Exists",
            "effect": "NoExecute"
        },
    ]
    with pytest.raises(Exception) as e:
        client.update(setting, value=setting_value_str)
    assert 'cannot modify toleration setting before all volumes are ' \
           'detached' in str(e.value)

    data1 = write_volume_random_data(volume)
    check_volume_data(volume, data1)

    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    setting = client.update(setting, value=setting_value_str)
    assert setting.value == setting_value_str
    wait_for_toleration_update(core_api, apps_api, count,
                               setting_value_dicts)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data1)
    data2 = write_volume_random_data(volume)
    check_volume_data(volume, data2)
    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    # cleanup
    setting_value_str = ""
    setting_value_dicts = []
    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)
    setting = client.update(setting, value=setting_value_str)
    assert setting.value == setting_value_str
    wait_for_toleration_update(core_api, apps_api, count,
                               setting_value_dicts)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data2)
    data3 = write_volume_random_data(volume)
    check_volume_data(volume, data3)

    cleanup_volume(client, volume)
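# Illustrative sketch, not part of Longhorn itself: how a semicolon-separated
# `taint-toleration` setting string such as
# "key1=value1:NoSchedule; key2:NoExecute" maps onto the toleration dicts the
# tests above assert against. The helper name is hypothetical.
def _parse_toleration_setting(value):
    tolerations = []
    for entry in [e.strip() for e in value.split(";") if e.strip()]:
        key_and_value, effect = entry.split(":")
        if "=" in key_and_value:
            # "key=value" pairs map to operator "Equal"
            key, val = key_and_value.split("=")
            tolerations.append({"key": key, "value": val,
                                "operator": "Equal", "effect": effect})
        else:
            # a bare key maps to operator "Exists" with no value
            tolerations.append({"key": key_and_value, "value": None,
                                "operator": "Exists", "effect": effect})
    return tolerations


# Example: _parse_toleration_setting("key1=value1:NoSchedule; key2:NoExecute")
# yields the same two dicts used as `setting_value_dicts` above, and an empty
# setting string yields an empty list.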
def generate_load(request):
    index = get_random_suffix()

    longhorn_api_client = get_longhorn_api_client()
    k8s_api_client = get_core_api_client()

    check_and_set_backupstore(longhorn_api_client)

    volume_name = STRESS_VOLUME_NAME_PREFIX + index
    pv_name = STRESS_PV_NAME_PREFIX + index
    pvc_name = STRESS_PVC_NAME_PREFIX + index
    pod_name = STRESS_POD_NAME_PREFIX + index

    atexit.register(remove_datafile, pod_name)
    atexit.register(delete_and_wait_longhorn, longhorn_api_client,
                    volume_name)
    atexit.register(delete_and_wait_pv, k8s_api_client, pv_name)
    atexit.register(delete_and_wait_pvc, k8s_api_client, pvc_name)
    atexit.register(delete_and_wait_pod, k8s_api_client, pod_name)

    longhorn_volume = create_and_check_volume(longhorn_api_client,
                                              volume_name,
                                              size=VOLUME_SIZE)
    wait_for_volume_detached(longhorn_api_client, volume_name)

    pod_manifest = generate_pod_with_pvc_manifest(pod_name, pvc_name)

    create_pv_for_volume(longhorn_api_client, k8s_api_client,
                         longhorn_volume, pv_name)
    create_pvc_for_volume(longhorn_api_client, k8s_api_client,
                          longhorn_volume, pvc_name)
    create_and_wait_pod(k8s_api_client, pod_manifest)

    snapshots_md5sum = dict()

    write_data(k8s_api_client, pod_name)
    create_recurring_jobs(longhorn_api_client, volume_name)

    global N_RANDOM_ACTIONS
    for round in range(N_RANDOM_ACTIONS):
        action = randrange(0, 8)

        if action == 0:
            print("write data started: " + time_now(), end=', ')
            write_data(k8s_api_client, pod_name)
            print("ended: " + time_now())

        elif action == 1:
            print("delete data started: " + time_now(), end=', ')
            delete_data(k8s_api_client, pod_name)
            print("ended: " + time_now())

        elif action == 2:
            print("create snapshot started: " + time_now(), end=', ')
            snapshot_create_and_record_md5sum(longhorn_api_client,
                                              k8s_api_client,
                                              volume_name, pod_name,
                                              snapshots_md5sum)
            print("ended: " + time_now())

        elif action == 3:
            print("delete random snapshot started: " + time_now(), end=', ')
            delete_random_snapshot(longhorn_api_client, volume_name,
                                   snapshots_md5sum)
            print("ended: " + time_now())

        elif action == 4:
            print("revert random snapshot started: " + time_now(), end=', ')
            revert_random_snapshot(longhorn_api_client, k8s_api_client,
                                   volume_name, pod_manifest,
                                   snapshots_md5sum)
            print("ended: " + time_now())

        elif action == 5:
            print("create backup started: " + time_now(), end=', ')
            backup_create_and_record_md5sum(longhorn_api_client,
                                            k8s_api_client,
                                            volume_name, pod_name,
                                            snapshots_md5sum)
            print("ended: " + time_now())

        elif action == 6:
            print("delete replica started: " + time_now(), end=', ')
            delete_replica(longhorn_api_client, volume_name)
            print("ended: " + time_now())

        elif action == 7:
            print("restore random backup started: " + time_now(), end=', ')
            restore_and_check_random_backup(longhorn_api_client,
                                            k8s_api_client,
                                            volume_name, pod_name,
                                            snapshots_md5sum)
            print("ended: " + time_now())

    clean_volume_backups(longhorn_api_client, volume_name)