Example #1
def test_replica_zone_anti_affinity(client, core_api, volume_name,
                                    k8s_node_zone_tags):  # NOQA
    """
    Test replica scheduler with zone anti-affinity

    1. Set zone anti-affinity to hard.
    2. Label nodes 1 & 2 with the same zone label "zone1".
       Label node 3 with zone label "zone2".
    3. Create a volume with 3 replicas.
    4. Wait for volume condition `scheduled` to be false.
    5. Label node 2 with zone label "zone3".
    6. Wait for volume condition `scheduled` to be true.
    7. Delete the volume.
    8. Set zone anti-affinity to soft.
    9. Change the zone labels on node 1 & 2 & 3 to "zone1".
    10. Create a volume.
    11. Wait for volume condition `scheduled` to be true.
    12. Clean up the replica count, the zone labels and the volume.
    """

    wait_longhorn_node_zone_updated(client)

    replica_node_soft_anti_affinity_setting = \
        client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(replica_node_soft_anti_affinity_setting, value="false")

    replica_zone_soft_anti_affinity_setting = \
        client.by_id_setting(SETTING_REPLICA_ZONE_SOFT_ANTI_AFFINITY)
    client.update(replica_zone_soft_anti_affinity_setting, value="false")

    volume = create_and_check_volume(client, volume_name)

    lh_nodes = client.list_node()

    count = 0
    for node in lh_nodes:
        count += 1
        set_k8s_node_zone_label(core_api, node.name, "lh-zone" + str(count))

    wait_longhorn_node_zone_updated(client)

    wait_for_volume_condition_scheduled(client, volume_name, "status",
                                        CONDITION_STATUS_TRUE)

    replica_zone_soft_anti_affinity_setting = \
        client.by_id_setting(SETTING_REPLICA_ZONE_SOFT_ANTI_AFFINITY)
    client.update(replica_zone_soft_anti_affinity_setting, value="true")

    volume = client.by_id_volume(volume_name)
    client.delete(volume)
    wait_for_volume_delete(client, volume_name)

    for node in lh_nodes:
        set_k8s_node_zone_label(core_api, node.name, "lh-zone1")

    wait_longhorn_node_zone_updated(client)

    volume = create_and_check_volume(client, volume_name)
    wait_for_volume_condition_scheduled(client, volume_name, "status",
                                        CONDITION_STATUS_TRUE)
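The zone labels above are applied through the Kubernetes API rather than
through Longhorn itself. A minimal sketch of what a helper like
`set_k8s_node_zone_label` might look like, assuming the official
`kubernetes` Python client and the standard topology label key (both
assumptions, not shown in the example itself):

K8S_ZONE_LABEL = "topology.kubernetes.io/zone"  # assumed label key


def set_k8s_node_zone_label(core_api, node_name, zone_name):
    # Patch only the zone label; all other node metadata stays untouched.
    body = {"metadata": {"labels": {K8S_ZONE_LABEL: zone_name}}}
    core_api.patch_node(node_name, body)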
Example #2
def csi_backup_test(client, core_api, csi_pv, pvc, pod_make, base_image=""):  # NOQA
    pod_name = 'csi-backup-test'
    create_and_wait_csi_pod(pod_name, client, core_api, csi_pv, pvc, pod_make,
                            base_image, "")
    test_data = generate_random_data(VOLUME_RWTEST_SIZE)

    setting = client.by_id_setting(common.SETTING_BACKUP_TARGET)
    # test backupTarget for multiple settings
    backupstores = common.get_backupstore_url()
    i = 1
    for backupstore in backupstores:
        if common.is_backupTarget_s3(backupstore):
            backupsettings = backupstore.split("$")
            setting = client.update(setting, value=backupsettings[0])
            assert setting.value == backupsettings[0]

            credential = client.by_id_setting(
                    common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value=backupsettings[1])
            assert credential.value == backupsettings[1]
        else:
            setting = client.update(setting, value=backupstore)
            assert setting.value == backupstore
            credential = client.by_id_setting(
                    common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value="")
            assert credential.value == ""

        backupstore_test(client, core_api, csi_pv, pvc, pod_make, pod_name,
                         base_image, test_data, i)
        i += 1
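The `$` in an S3 backupstore entry separates the backup target URL from the
name of the Kubernetes secret holding its credentials, which is why the S3
branch splits the string while the non-S3 branch clears the credential
setting. A short illustration of the parsing, with hypothetical values:

# Hypothetical S3 entry: "<backup-target-url>$<credential-secret-name>"
backupstore = "s3://backupbucket@us-east-1/backupstore$minio-secret"
backupsettings = backupstore.split("$")
assert backupsettings[0] == "s3://backupbucket@us-east-1/backupstore"
assert backupsettings[1] == "minio-secret"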
Example #3
def csi_backup_test(client, core_api, csi_pv, pvc, pod_make, base_image=""):  # NOQA
    pod_name = 'csi-backup-test'
    create_and_wait_csi_pod(pod_name, client, core_api, csi_pv, pvc, pod_make,
                            base_image, "")
    test_data = generate_random_data(VOLUME_RWTEST_SIZE)

    setting = client.by_id_setting(common.SETTING_BACKUP_TARGET)
    # test backupTarget for multiple settings
    backupstores = common.get_backupstore_url()
    i = 1
    for backupstore in backupstores:
        if common.is_backupTarget_s3(backupstore):
            backupsettings = backupstore.split("$")
            setting = client.update(setting, value=backupsettings[0])
            assert setting["value"] == backupsettings[0]

            credential = client.by_id_setting(
                    common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value=backupsettings[1])
            assert credential["value"] == backupsettings[1]
        else:
            setting = client.update(setting, value=backupstore)
            assert setting["value"] == backupstore
            credential = client.by_id_setting(
                    common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value="")
            assert credential["value"] == ""

        backupstore_test(client, core_api, csi_pv, pvc, pod_make, pod_name,
                         base_image, test_data, i)
        i += 1
Example #4
def test_soft_anti_affinity_detach(client, volume_name):  # NOQA
    """
    Test that volumes with Soft Anti-Affinity can detach and reattach to a
    node properly.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY)
    client.update(setting, value="true")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name,
                             volume["replicas"]))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica["name"])
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    assert len(volume["replicas"]) == 3

    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
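`get_host_replica` is used throughout these tests but never defined in the
snippets. A minimal, hypothetical sketch of what it might do, assuming each
replica exposes the `hostId` field seen in the assertions elsewhere in
these examples:

def get_host_replica(volume, host_id):
    # Return the replica scheduled on the given host. The tests assume
    # exactly one replica of an attached volume lives on the test host.
    for replica in volume.replicas:
        if replica.hostId == host_id:
            return replica
    raise AssertionError("no replica on host " + host_id)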
Example #5
def test_soft_anti_affinity_scheduling(client, volume_name):  # NOQA
    """
    Test that volumes with Soft Anti-Affinity work as expected.

    With Soft Anti-Affinity, a new replica should still be scheduled on a node
    with an existing replica, which will result in "Healthy" state but limited
    redundancy.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY)
    client.update(setting, value="true")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name,
                             volume["replicas"]))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica["name"])
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example #6
def test_hard_anti_affinity_offline_rebuild(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity can build new replicas during
    the attaching process once a valid node is available.

    Once a new replica has been built as part of the attaching process, the
    volume should be Healthy again.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name,
                             volume["replicas"]))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica["name"])
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    client.update(node, allowScheduling=True)
    volume.attach(hostId=host_id)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example #7
def test_hard_anti_affinity_live_rebuild(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity can build new replicas live once
    a valid node is available.

    If no nodes without existing replicas are available, the volume should
    remain in "Degraded" state. However, once one is available, the replica
    should now be scheduled successfully, with the volume returning to
    "Healthy" state.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name,
                             volume["replicas"]))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica["name"])
    wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    # Allow scheduling on host node again
    client.update(node, allowScheduling=True)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example #8
def test_replica_scheduler_update_over_provisioning(client):  # NOQA
    nodes = client.list_node()
    lht_hostId = get_self_host_id()
    expect_node_disk = {}
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            if disk["path"] == DEFAULT_DISK_PATH:
                expect_disk = disk
                expect_disk["fsid"] = fsid
                expect_node_disk[node["name"]] = expect_disk

    over_provisioning_setting = client.by_id_setting(
        SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE)
    old_provisioning_setting = over_provisioning_setting["value"]

    # set storage over-provisioning percentage to 0
    # so that no replica can be scheduled
    over_provisioning_setting = client.update(over_provisioning_setting,
                                              value="0")
    vol_name = common.generate_volume_name()
    volume = client.create_volume(name=vol_name,
                                  size=SIZE, numberOfReplicas=len(nodes))
    volume = common.wait_for_volume_condition_scheduled(client, vol_name,
                                                        "status",
                                                        CONDITION_STATUS_FALSE)

    # set storage over provisioning percentage to 100
    over_provisioning_setting = client.update(over_provisioning_setting,
                                              value="100")

    # check volume status
    volume = common.wait_for_volume_condition_scheduled(client, vol_name,
                                                        "status",
                                                        CONDITION_STATUS_TRUE)
    volume = common.wait_for_volume_detached(client, vol_name)
    assert volume["state"] == "detached"
    assert volume["created"] != ""

    volume.attach(hostId=lht_hostId)
    volume = common.wait_for_volume_healthy(client, vol_name)

    node_hosts = []
    for node in nodes:
        node_hosts.append(node["name"])
    # check that all replicas are scheduled to the default disk
    for replica in volume["replicas"]:
        id = replica["hostId"]
        assert id != ""
        assert replica["running"]
        expect_disk = expect_node_disk[id]
        assert replica["diskID"] == expect_disk["fsid"]
        assert expect_disk["path"] in replica["dataPath"]
        node_hosts = [x for x in node_hosts if x != id]
    assert len(node_hosts) == 0

    # clean volume and disk
    cleanup_volume(client, vol_name)
    client.update(over_provisioning_setting,
                  value=old_provisioning_setting)
Example #9
def test_replica_scheduler_exceed_over_provisioning(client):  # NOQA
    over_provisioning_setting = client.by_id_setting(
        SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE)
    old_provisioning_setting = over_provisioning_setting["value"]
    # set storage over provisioning percentage to 100
    over_provisioning_setting = client.update(over_provisioning_setting,
                                              value="100")

    # test that a volume exceeding the over-provisioning limit
    # cannot be scheduled
    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            disk["storageReserved"] = \
                disk["storageMaximum"] - 1*Gi
        update_disks = get_update_disks(disks)
        node = node.diskUpdate(disks=update_disks)
        disks = node["disks"]
        for fsid, disk in disks.items():
            wait_for_disk_status(client, node["name"], fsid, "storageReserved",
                                 disk["storageMaximum"] - 1 * Gi)

    vol_name = common.generate_volume_name()
    volume = client.create_volume(name=vol_name,
                                  size=str(2 * Gi),
                                  numberOfReplicas=len(nodes))
    volume = common.wait_for_volume_condition_scheduled(
        client, vol_name, "status", CONDITION_STATUS_FALSE)
    client.delete(volume)
    common.wait_for_volume_delete(client, vol_name)
    client.update(over_provisioning_setting, value=old_provisioning_setting)
Example #10
def test_replica_scheduler_just_under_over_provisioning(client):  # NOQA
    over_provisioning_setting = client.by_id_setting(
        SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE)
    old_provisioning_setting = over_provisioning_setting["value"]
    # set storage over provisioning percentage to 100
    over_provisioning_setting = client.update(over_provisioning_setting,
                                              value="100")

    lht_hostId = get_self_host_id()
    nodes = client.list_node()
    expect_node_disk = {}
    max_size_array = []
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            if disk["path"] == DEFAULT_DISK_PATH:
                expect_disk = disk
                expect_disk["fsid"] = fsid
                expect_node_disk[node["name"]] = expect_disk
                max_size_array.append(disk["storageMaximum"])
            disk["storageReserved"] = 0
        update_disks = get_update_disks(disks)
        node = node.diskUpdate(disks=update_disks)
        disks = node["disks"]
        for fsid, disk in disks.items():
            wait_for_disk_status(client, node["name"], fsid,
                                 "storageReserved", 0)

    max_size = min(max_size_array)
    # test that a volume just under the over-provisioning limit
    # can be scheduled
    vol_name = common.generate_volume_name()
    volume = client.create_volume(name=vol_name,
                                  size=str(max_size),
                                  numberOfReplicas=len(nodes))
    volume = common.wait_for_volume_condition_scheduled(
        client, vol_name, "status", CONDITION_STATUS_TRUE)
    volume = common.wait_for_volume_detached(client, vol_name)
    assert volume["state"] == "detached"
    assert volume["created"] != ""

    volume.attach(hostId=lht_hostId)
    volume = common.wait_for_volume_healthy(client, vol_name)
    nodes = client.list_node()
    node_hosts = []
    for node in nodes:
        node_hosts.append(node["name"])
    # check that all replicas are scheduled to the default disk
    for replica in volume["replicas"]:
        id = replica["hostId"]
        assert id != ""
        assert replica["running"]
        expect_disk = expect_node_disk[id]
        assert replica["diskID"] == expect_disk["fsid"]
        assert expect_disk["path"] in replica["dataPath"]
        node_hosts = [x for x in node_hosts if x != id]
    assert len(node_hosts) == 0

    # clean volume and disk
    cleanup_volume(client, vol_name)
    client.update(over_provisioning_setting, value=old_provisioning_setting)
Example #11
def test_replica_scheduler_exceed_over_provisioning(client):  # NOQA
    over_provisioning_setting = client.by_id_setting(
        SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE)
    old_provisioning_setting = over_provisioning_setting["value"]
    # set storage over provisioning percentage to 100
    over_provisioning_setting = client.update(over_provisioning_setting,
                                              value="100")

    # test that a volume exceeding the over-provisioning limit
    # cannot be scheduled
    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            disk["storageReserved"] = \
                disk["storageMaximum"] - 1*Gi
        update_disks = get_update_disks(disks)
        node = node.diskUpdate(disks=update_disks)
        disks = node["disks"]
        for fsid, disk in disks.items():
            wait_for_disk_status(client, node["name"],
                                 fsid, "storageReserved",
                                 disk["storageMaximum"] - 1*Gi)

    vol_name = common.generate_volume_name()
    volume = client.create_volume(name=vol_name,
                                  size=str(2*Gi), numberOfReplicas=len(nodes))
    volume = common.wait_for_volume_condition_scheduled(client, vol_name,
                                                        "status",
                                                        CONDITION_STATUS_FALSE)
    client.delete(volume)
    common.wait_for_volume_delete(client, vol_name)
    client.update(over_provisioning_setting, value=old_provisioning_setting)
Example #12
def reset_settings():
    yield
    client = get_longhorn_api_client()  # NOQA
    host_id = get_self_host_id()
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=True)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="true")
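Everything after the bare `yield` runs as teardown once the test using this
helper finishes. In the actual suite it would presumably be registered with
pytest; a hedged sketch of that wiring (the decorator is an assumption, not
shown in the snippet):

import pytest


@pytest.fixture
def reset_settings():
    yield  # hand control to the test first
    # Teardown: restore scheduling and the soft anti-affinity default so
    # later tests start from a known state.
    client = get_longhorn_api_client()
    node = client.by_id_node(get_self_host_id())
    client.update(node, allowScheduling=True)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="true")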
Example #13
def test_hard_anti_affinity_detach(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity are still able to detach and
    reattach to a node properly, even in degraded state.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to false
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
    6. Detach the volume.
    7. Verify that the volume only has 2 replicas
        1. Unhealthy replica will be removed upon detach.
    8. Attach the volume again.
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
        3. Verify only two of three replicas of volume are healthy.
        4. Verify the remaining replica doesn't have `replica.HostID`, meaning
        it's unscheduled
    9. Check volume `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    assert len(volume.replicas) == 2

    volume.attach(hostId=host_id)
    # Make sure we're still not getting another successful replica.
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    assert sum([
        1 for replica in volume.replicas
        if replica.running and replica.mode == "RW"
    ]) == 2
    assert sum([1 for replica in volume.replicas if not replica.hostId]) == 1
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example #14
def test_hard_anti_affinity_scheduling(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity work as expected.

    With Hard Anti-Affinity, scheduling on nodes with existing replicas should
    be forbidden, resulting in "Degraded" state.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to false
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
        3. Verify only two of three replicas of volume are healthy.
        4. Verify the remaining replica doesn't have `replica.HostID`, meaning
        it's unscheduled
    6. Check volume `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    # Instead of waiting for timeout and lengthening the tests a significant
    # amount we can make sure the scheduling isn't working by making sure the
    # volume becomes Degraded and reports a scheduling error.
    wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    # While there are three replicas that should exist to meet the Volume's
    # request, only two of those volumes should actually be Healthy.
    volume = client.by_id_volume(volume_name)
    assert sum([
        1 for replica in volume.replicas
        if replica.running and replica.mode == "RW"
    ]) == 2
    # Confirm that the final volume is an unscheduled volume.
    assert sum([1 for replica in volume.replicas if not replica.hostId]) == 1
    # Three replicas in total should still exist.
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example #15
def test_replica_rebuild_per_volume_limit(client, core_api, storage_class,
                                          sts_name, statefulset):  # NOQA
    """
    Test that the volume always has only one replica scheduled for rebuild

    1. Set soft anti-affinity to `true`.
    2. Create a volume with 1 replica.
    3. Attach the volume and write a few hundreds MB data to it.
    4. Scale the volume replica to 5.
    5. Constantly check the volume replica list to make sure there is only
       1 replica in WO state.
    6. Wait for the volume to complete rebuilding. Then remove 4 of the 5
       replicas.
    7. Monitor the volume replica list again.
    8. Once the rebuild completes again, verify the data checksum.
    """
    replica_soft_anti_affinity_setting = \
        client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(replica_soft_anti_affinity_setting, value="true")

    data_path = '/data/test'
    storage_class['parameters']['numberOfReplicas'] = "1"
    vol_name, pod_name, md5sum = \
        common.prepare_statefulset_with_data_in_mb(
            client, core_api, statefulset, sts_name, storage_class,
            data_path=data_path, data_size_in_mb=DATA_SIZE_IN_MB_2)

    # Scale the volume replica to 5
    r_count = 5
    vol = client.by_id_volume(vol_name)
    vol.updateReplicaCount(replicaCount=r_count)

    vol = common.wait_for_volume_replicas_mode(client,
                                               vol_name,
                                               'RW',
                                               replica_count=r_count)

    # Delete 4 of the 5 replicas. Drop the first replica from the local
    # list so it survives, then remove the remaining 4 from the volume.
    del vol.replicas[0]
    for r in vol.replicas:
        vol.replicaRemove(name=r.name)

    r_count = 1
    common.wait_for_volume_replicas_mode(client,
                                         vol_name,
                                         'RW',
                                         replica_count=r_count)

    assert md5sum == common.get_pod_data_md5sum(core_api, pod_name, data_path)
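The "only 1 replica in WO state" check from step 5 is the heart of this
test. A hedged sketch of such a polling assertion, assuming replicas report
`mode == "WO"` while rebuilding and `mode == "RW"` once healthy:

import time


def assert_at_most_one_rebuilding_replica(client, vol_name,
                                          checks=60, interval=2):
    # Sample the replica list repeatedly; at no point should more than
    # one replica be rebuilding (write-only) at a time.
    for _ in range(checks):
        volume = client.by_id_volume(vol_name)
        wo_count = sum(1 for r in volume.replicas if r.mode == "WO")
        assert wo_count <= 1, "found %d rebuilding replicas" % wo_count
        time.sleep(interval)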
Example #16
def test_provisioner_tags(client, core_api, node_default_tags, storage_class,
                          pvc, pod):  # NOQA
    """
    Test that a StorageClass can properly provision a volume with requested
    Tags.

    Test prerequisite:
      - set Replica Node Level Soft Anti-Affinity enabled

    1. Use `node_default_tags` to add default tags to nodes.
    2. Create a StorageClass with disk and node tag set.
    3. Create PVC and Pod.
    4. Verify the volume has the correct parameters and tags.
    """

    replica_node_soft_anti_affinity_setting = \
        client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(replica_node_soft_anti_affinity_setting, value="true")

    # Prepare pod and volume specs.
    pod_name = 'provisioner-tags-test'
    tag_spec = {
        "disk": ["ssd", "nvme"],
        "expected": 1,
        "node": ["storage", "main"]
    }

    pod['metadata']['name'] = pod_name
    pod['spec']['volumes'] = [create_pvc_spec(pvc['metadata']['name'])]
    pvc['spec']['storageClassName'] = DEFAULT_STORAGECLASS_NAME
    storage_class['metadata']['name'] = DEFAULT_STORAGECLASS_NAME
    storage_class['parameters']['diskSelector'] = 'ssd,nvme'
    storage_class['parameters']['nodeSelector'] = 'storage,main'
    volume_size = DEFAULT_VOLUME_SIZE * Gi

    create_storage(core_api, storage_class, pvc)
    create_and_wait_pod(core_api, pod)
    pvc_volume_name = get_volume_name(core_api, pvc['metadata']['name'])

    # Confirm that the volume has all the correct parameters we gave it.
    volumes = client.list_volume()
    assert len(volumes) == 1
    assert volumes.data[0].name == pvc_volume_name
    assert volumes.data[0].size == str(volume_size)
    assert volumes.data[0].numberOfReplicas == \
        int(storage_class['parameters']['numberOfReplicas'])
    assert volumes.data[0].state == "attached"
    check_volume_replicas(volumes.data[0], tag_spec, node_default_tags)
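`check_volume_replicas` is assumed here to verify that every replica landed
on a node and disk carrying the requested tags, with `tag_spec["expected"]`
reading as the number of nodes that satisfy both selectors. A rough,
hypothetical sketch of the node-tag half of that check:

def check_replica_node_tags(volume, tag_spec, node_tags_by_name):
    # Hypothetical helper: every replica must sit on a node whose tags
    # cover the requested node tags; disk tags would be checked the same
    # way against the disk hosting the replica.
    for replica in volume.replicas:
        node_tags = set(node_tags_by_name[replica.hostId])
        assert set(tag_spec["node"]).issubset(node_tags)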
Example #17
def test_hard_anti_affinity_offline_rebuild(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity can build new replicas during
    the attaching process once a valid node is available.

    Once a new replica has been built as part of the attaching process, the
    volume should be Healthy again.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to false
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
    6. Detach the volume.
    7. Enable current node's scheduling.
    8. Attach the volume again.
    9. Wait for volume to become healthy with 3 replicas
    10. Check volume `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name, volume.replicas))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    client.update(node, allowScheduling=True)
    volume.attach(hostId=host_id)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example #18
def test_hard_anti_affinity_live_rebuild(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity can build new replicas live once
    a valid node is available.

    If no nodes without existing replicas are available, the volume should
    remain in "Degraded" state. However, once one is available, the replica
    should now be scheduled successfully, with the volume returning to
    "Healthy" state.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to false
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
    6. Enable the current node's scheduling
    7. Wait for volume to start rebuilding and become healthy again
    8. Check volume `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name, volume.replicas))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    # Allow scheduling on host node again
    client.update(node, allowScheduling=True)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example #19
def test_soft_anti_affinity_detach(client, volume_name):  # NOQA
    """
    Test that volumes with Soft Anti-Affinity can detach and reattach to a
    node properly.

    1. Create a volume and attach to the current node.
    2. Generate and write `data` to the volume
    3. Set `soft anti-affinity` to true
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
    6. Wait for the new replica to be rebuilt
    7. Detach the volume.
    8. Verify there are 3 replicas
    9. Attach the volume again. Verify there are still 3 replicas
    10. Verify the `data`.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="true")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name, volume.replicas))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    assert len(volume.replicas) == 3

    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example #20
def test_orphan_auto_deletion(client, volume_name, request):  # NOQA
    """
    Test orphaned directory creation and background deletion in multiple disks
    1. Create a new disk for holding orphaned replica directories
    2. Create a volume and attach to the current node
    3. Create orphaned replica directories by copying the active
       replica directory
    4. Clean up volume
    5. Verify orphan list contains the orphan CRs for replica
       directories
    6. Enable the orphan-auto-deletion setting
    7. Verify orphan list is empty and the orphaned directory is
       deleted in background
    8. Clean up disk
    """
    disk_names = ["vol-disk-" + generate_random_id(4)]

    # Step 1
    lht_hostId = get_self_host_id()
    cleanup_node_disks(client, lht_hostId)
    disk_paths = crate_disks_on_host(client, disk_names, request)

    # Step 2
    volume = create_volume_with_replica_on_host(client, volume_name)

    # Step 3
    create_orphaned_directories_on_host(volume, disk_paths, 1)

    # Step 4
    cleanup_volume_by_name(client, volume_name)

    # Step 5
    assert wait_for_orphan_count(client, 1, 180) == 1

    # Step 6: enable orphan auto deletion
    setting = client.by_id_setting(SETTING_ORPHAN_AUTO_DELETION)
    client.update(setting, value="true")

    # Step 7
    assert wait_for_orphan_count(client, 0, 180) == 0
    assert wait_for_file_count(os.path.join(disk_paths[0], "replicas"), 0,
                               180) == 0
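`wait_for_orphan_count` is not defined in the snippet. A minimal sketch of
the polling it implies, assuming the Longhorn client exposes a
`list_orphan()` call returning the orphan CRs (an assumption, not confirmed
by the example):

import time


def wait_for_orphan_count(client, expected_count, timeout_secs):
    # Poll once per second until the orphan CR count matches, then
    # return the last observed count for the caller to assert on.
    count = -1
    for _ in range(timeout_secs):
        count = len(client.list_orphan())  # assumed API
        if count == expected_count:
            break
        time.sleep(1)
    return count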
Example #21
def test_node_controller_sync_disk_state(client):  # NOQA
    # update StorageMinimalAvailablePercentage to test Disk State
    setting = client.by_id_setting(
        SETTING_STORAGE_MINIMAL_AVAILABLE_PERCENTAGE)
    old_minimal_available_percentage = setting["value"]
    setting = client.update(setting, value="100")
    assert setting["value"] == "100"
    nodes = client.list_node()
    # wait for node controller to update disk state
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            wait_for_disk_conditions(client, node["name"], fsid,
                                     DISK_CONDITION_SCHEDULABLE,
                                     CONDITION_STATUS_FALSE)

    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            conditions = disk["conditions"]
            assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \
                CONDITION_STATUS_FALSE

    setting = client.update(setting, value=old_minimal_available_percentage)
    assert setting["value"] == old_minimal_available_percentage
    # wait for node controller to update disk state
    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            wait_for_disk_conditions(client, node["name"], fsid,
                                     DISK_CONDITION_SCHEDULABLE,
                                     CONDITION_STATUS_TRUE)

    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            conditions = disk["conditions"]
            assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \
                CONDITION_STATUS_TRUE
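`wait_for_disk_conditions` follows the same poll-until-match pattern as the
other wait helpers in these examples. A hedged sketch, under the assumption
that it simply re-reads the node until the named disk condition reaches the
expected status:

import time


def wait_for_disk_conditions(client, node_name, fsid, condition_name,
                             expected_status, retries=180, interval=1):
    # Re-fetch the node until the disk condition converges, failing
    # loudly on timeout so the test surfaces the stuck state.
    for _ in range(retries):
        node = client.by_id_node(node_name)
        condition = node["disks"][fsid]["conditions"][condition_name]
        if condition["status"] == expected_status:
            return node
        time.sleep(interval)
    raise AssertionError("disk condition %s never became %s" %
                         (condition_name, expected_status))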
Example #22
def test_node_controller_sync_disk_state(client):  # NOQA
    # update StorageMinimalAvailablePercentage to test Disk State
    setting = client.by_id_setting(
        SETTING_STORAGE_MINIMAL_AVAILABLE_PERCENTAGE)
    old_minimal_available_percentage = setting["value"]
    setting = client.update(setting, value="100")
    assert setting["value"] == "100"
    nodes = client.list_node()
    # wait for node controller to update disk state
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            wait_for_disk_conditions(client, node["name"],
                                     fsid, DISK_CONDITION_SCHEDULABLE,
                                     CONDITION_STATUS_FALSE)

    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            conditions = disk["conditions"]
            assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \
                CONDITION_STATUS_FALSE

    setting = client.update(setting, value=old_minimal_available_percentage)
    assert setting["value"] == old_minimal_available_percentage
    # wait for node controller to update disk state
    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            wait_for_disk_conditions(client, node["name"],
                                     fsid, DISK_CONDITION_SCHEDULABLE,
                                     CONDITION_STATUS_TRUE)

    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            conditions = disk["conditions"]
            assert conditions[DISK_CONDITION_SCHEDULABLE]["status"] == \
                CONDITION_STATUS_TRUE
Example #23
def test_hard_anti_affinity_scheduling(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity work as expected.

    With Hard Anti-Affinity, scheduling on nodes with existing replicas should
    be forbidden, resulting in "Degraded" state.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica["name"])
    # Instead of waiting for timeout and lengthening the tests a significant
    # amount we can make sure the scheduling isn't working by making sure the
    # volume becomes Degraded and reports a scheduling error.
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    # While there are three replicas that should exist to meet the Volume's
    # request, only two of those volumes should actually be Healthy.
    assert sum([
        1 for replica in volume["replicas"]
        if replica["running"] and replica["mode"] == "RW"
    ]) == 2
    # Confirm that the final volume is an unscheduled volume.
    assert sum([1 for replica in volume["replicas"]
                if not replica["hostId"]]) == 1
    # Three replicas in total should still exist.
    assert len(volume["replicas"]) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example #24
def test_hard_anti_affinity_detach(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity are still able to detach and
    reattach to a node properly, even in degraded state.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica["name"])
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    assert len(volume["replicas"]) == 2

    volume.attach(hostId=host_id)
    # Make sure we're still not getting another successful replica.
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    assert sum([
        1 for replica in volume["replicas"]
        if replica["running"] and replica["mode"] == "RW"
    ]) == 2
    assert sum([1 for replica in volume["replicas"]
                if not replica["hostId"]]) == 1
    assert len(volume["replicas"]) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example #25
def test_soft_anti_affinity_scheduling(client, volume_name):  # NOQA
    """
    Test that volumes with Soft Anti-Affinity work as expected.

    With Soft Anti-Affinity, a new replica should still be scheduled on a node
    with an existing replica, which will result in "Healthy" state but limited
    redundancy.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to true
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
    6. Wait for the volume to complete rebuild. Volume should have 3 replicas.
    7. Verify `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="true")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name, volume.replicas))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example #26
def test_backup_kubernetes_status(set_random_backupstore, client, core_api,
                                  pod):  # NOQA
    """
    Test that Backups have KubernetesStatus stored properly when there is an
    associated PersistentVolumeClaim and Pod.

    1. Setup a random backupstore
    2. Set settings Longhorn Static StorageClass to `longhorn-static-test`
    3. Create a volume and PV/PVC. Verify the StorageClass of PVC
    4. Create a Pod using the PVC.
    5. Check volume's Kubernetes status to reflect PV/PVC/Pod correctly.
    6. Create a backup for the volume.
    7. Verify the labels of created backup reflect PV/PVC/Pod status.
    8. Restore the backup to a volume. Wait for restoration to complete.
    9. Check the volume's Kubernetes Status
        1. Make sure the `lastPodRefAt` and `lastPVCRefAt` match the
           snapshot created time
    10. Delete the backup and restored volume.
    11. Delete PV/PVC/Pod.
    12. Verify volume's Kubernetes Status updated to reflect history data.
    13. Attach the volume and create another backup. Verify the labels
    14. Verify the volume's Kubernetes status.
    15. Restore the previous backup to a new volume. Wait for restoration.
    16. Verify the restored volume's Kubernetes status.
        1. Make sure `lastPodRefAt` and `lastPVCRefAt` match the volume in
           step 12
    """

    host_id = get_self_host_id()
    static_sc_name = "longhorn-static-test"
    setting = client.by_id_setting(SETTING_DEFAULT_LONGHORN_STATIC_SC)
    setting = client.update(setting, value=static_sc_name)
    assert setting.value == static_sc_name

    volume_name = "test-backup-kubernetes-status-pod"  # NOQA
    client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=2)
    volume = wait_for_volume_detached(client, volume_name)

    pod_name = "pod-" + volume_name
    pv_name = "pv-" + volume_name
    pvc_name = "pvc-" + volume_name
    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)
    ret = core_api.list_namespaced_persistent_volume_claim(namespace='default')
    pvc_found = False
    for item in ret.items:
        if item.metadata.name == pvc_name:
            pvc_found = item
            break
    assert pvc_found
    assert pvc_found.spec.storage_class_name == static_sc_name

    pod['metadata']['name'] = pod_name
    pod['spec']['volumes'] = [{
        'name':
        pod['spec']['containers'][0]['volumeMounts'][0]['name'],
        'persistentVolumeClaim': {
            'claimName': pvc_name,
        },
    }]
    create_and_wait_pod(core_api, pod)

    ks = {
        'lastPodRefAt': '',
        'lastPVCRefAt': '',
        'namespace': 'default',
        'pvcName': pvc_name,
        'pvName': pv_name,
        'pvStatus': 'Bound',
        'workloadsStatus': [{
            'podName': pod_name,
            'podStatus': 'Running',
            'workloadName': '',
            'workloadType': ''
        }]
    }
    wait_volume_kubernetes_status(client, volume_name, ks)
    volume = wait_for_volume_healthy(client, volume_name)

    # Create Backup manually instead of calling create_backup since Kubernetes
    # is not guaranteed to mount our Volume to the test host.
    snap = create_snapshot(client, volume_name)
    volume.snapshotBackup(name=snap.name)
    wait_for_backup_completion(client, volume_name, snap.name)
    _, b = find_backup(client, volume_name, snap.name)
    # Check backup label
    status = loads(b.labels.get(KUBERNETES_STATUS_LABEL))
    assert status == ks
    # Check backup volume label
    for _ in range(RETRY_COUNTS):
        bv = client.by_id_backupVolume(volume_name)
        if bv is not None and bv.labels is not None:
            break
        time.sleep(RETRY_INTERVAL)
    assert bv is not None and bv.labels is not None
    status = loads(bv.labels.get(KUBERNETES_STATUS_LABEL))
    assert status == ks

    restore_name = generate_volume_name()
    client.create_volume(name=restore_name,
                         size=SIZE,
                         numberOfReplicas=2,
                         fromBackup=b.url)
    wait_for_volume_restoration_completed(client, restore_name)
    wait_for_volume_detached(client, restore_name)

    snapshot_created = b.snapshotCreated
    ks = {
        'lastPodRefAt': b.snapshotCreated,
        'lastPVCRefAt': b.snapshotCreated,
        'namespace': 'default',
        'pvcName': pvc_name,
        # Restoration should not apply PersistentVolume data.
        'pvName': '',
        'pvStatus': '',
        'workloadsStatus': [{
            'podName': pod_name,
            'podStatus': 'Running',
            'workloadName': '',
            'workloadType': ''
        }]
    }
    wait_volume_kubernetes_status(client, restore_name, ks)
    restore = client.by_id_volume(restore_name)
    # We need to compare LastPodRefAt and LastPVCRefAt manually since
    # wait_volume_kubernetes_status only checks for empty or non-empty state.
    assert restore.kubernetesStatus.lastPodRefAt == ks["lastPodRefAt"]
    assert restore.kubernetesStatus.lastPVCRefAt == ks["lastPVCRefAt"]

    delete_backup(client, bv.name, b.name)
    client.delete(restore)
    wait_for_volume_delete(client, restore_name)
    delete_and_wait_pod(core_api, pod_name)
    delete_and_wait_pvc(core_api, pvc_name)
    delete_and_wait_pv(core_api, pv_name)

    # With the Pod, PVC, and PV deleted, the Volume should have both Ref
    # fields set. Check that a new Backup and Restore will use this instead of
    # manually populating the Ref fields.
    ks = {
        'lastPodRefAt': 'NOT NULL',
        'lastPVCRefAt': 'NOT NULL',
        'namespace': 'default',
        'pvcName': pvc_name,
        'pvName': '',
        'pvStatus': '',
        'workloadsStatus': [{
            'podName': pod_name,
            'podStatus': 'Running',
            'workloadName': '',
            'workloadType': ''
        }]
    }
    wait_volume_kubernetes_status(client, volume_name, ks)
    volume = wait_for_volume_detached(client, volume_name)

    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)

    snap = create_snapshot(client, volume_name)
    volume.snapshotBackup(name=snap.name)
    volume = wait_for_backup_completion(client, volume_name, snap.name)
    bv, b = find_backup(client, volume_name, snap.name)
    new_b = bv.backupGet(name=b.name)
    status = loads(new_b.labels.get(KUBERNETES_STATUS_LABEL))
    # Check each field manually, we have no idea what the LastPodRefAt or the
    # LastPVCRefAt will be. We just know it shouldn't be SnapshotCreated.
    assert status['lastPodRefAt'] != snapshot_created
    assert status['lastPVCRefAt'] != snapshot_created
    assert status['namespace'] == "default"
    assert status['pvcName'] == pvc_name
    assert status['pvName'] == ""
    assert status['pvStatus'] == ""
    assert status['workloadsStatus'] == [{
        'podName': pod_name,
        'podStatus': 'Running',
        'workloadName': '',
        'workloadType': ''
    }]

    restore_name = generate_volume_name()
    client.create_volume(name=restore_name,
                         size=SIZE,
                         numberOfReplicas=2,
                         fromBackup=b.url)
    wait_for_volume_restoration_completed(client, restore_name)
    wait_for_volume_detached(client, restore_name)

    ks = {
        'lastPodRefAt': status['lastPodRefAt'],
        'lastPVCRefAt': status['lastPVCRefAt'],
        'namespace': 'default',
        'pvcName': pvc_name,
        'pvName': '',
        'pvStatus': '',
        'workloadsStatus': [{
            'podName': pod_name,
            'podStatus': 'Running',
            'workloadName': '',
            'workloadType': ''
        }]
    }
    wait_volume_kubernetes_status(client, restore_name, ks)
    restore = client.by_id_volume(restore_name)
    assert restore.kubernetesStatus.lastPodRefAt == ks["lastPodRefAt"]
    assert restore.kubernetesStatus.lastPVCRefAt == ks["lastPVCRefAt"]

    # cleanup
    backupstore_cleanup(client)
    client.delete(restore)
    cleanup_volume(client, volume)
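The `ks` dicts above mirror the volume's KubernetesStatus, which the backup
stores as a JSON string under a label; `loads(...)` (from `json`) decodes
it back for comparison. A tiny illustration of the round-trip the
assertions rely on, with hypothetical values:

from json import dumps, loads

ks = {"namespace": "default", "pvcName": "pvc-example", "pvStatus": "Bound"}
label_value = dumps(ks)          # what ends up on the backup label
assert loads(label_value) == ks  # what the test decodes and compares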
Example #27
def test_pvc_creation_with_default_sc_set(client, core_api, storage_class,
                                          pod):  # NOQA
    """
    Test creating PVC with default StorageClass set

    The target is to make sure the newly created PV/PVC won't use the
    default StorageClass, and if there is no default StorageClass, PV/PVC
    can still be created.

    1. Create a StorageClass and set it to be the default StorageClass
    2. Update static StorageClass to `longhorn-static-test`
    3. Create volume then PV/PVC.
    4. Make sure the newly created PV/PVC use StorageClass
       `longhorn-static-test`
    5. Create pod with PVC.
    6. Verify volume's Kubernetes Status
    7. Remove PVC and Pod.
    8. Verify volume's Kubernetes Status only contains current PV and history
    9. Wait for volume to detach (since pod is deleted)
    10. Reuse the volume on a new pod. Wait for the pod to start
    11. Verify volume's Kubernetes Status reflect the new pod.
    12. Delete PV/PVC/Pod.
    13. Verify volume's Kubernetes Status only contains history
    14. Delete the default StorageClass.
    15. Create PV/PVC for the volume.
    16. Make sure the PV's StorageClass is static StorageClass
    """
    # set default storage class
    storage_class['metadata']['annotations'] = \
        {"storageclass.kubernetes.io/is-default-class": "true"}
    create_storage_class(storage_class)

    static_sc_name = "longhorn-static-test"
    setting = client.by_id_setting(SETTING_DEFAULT_LONGHORN_STATIC_SC)
    setting = client.update(setting, value=static_sc_name)
    assert setting.value == static_sc_name

    volume_name = "test-pvc-creation-with-sc"  # NOQA
    pod_name = "pod-" + volume_name
    client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=2)
    volume = wait_for_volume_detached(client, volume_name)

    pv_name = "pv-" + volume_name
    pvc_name = "pvc-" + volume_name
    pvc_name_extra = "pvc-" + volume_name + "-extra"

    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)

    ret = core_api.list_namespaced_persistent_volume_claim(namespace='default')
    pvc_found = None
    for item in ret.items:
        if item.metadata.name == pvc_name:
            pvc_found = item
            break
    assert pvc_found
    assert pvc_found.spec.storage_class_name == static_sc_name

    pod['metadata']['name'] = pod_name
    pod['spec']['volumes'] = [{
        'name':
        pod['spec']['containers'][0]['volumeMounts'][0]['name'],
        'persistentVolumeClaim': {
            'claimName': pvc_name,
        },
    }]
    create_and_wait_pod(core_api, pod)

    ks = {
        'pvName': pv_name,
        'pvStatus': 'Bound',
        'namespace': 'default',
        'pvcName': pvc_name,
        'lastPVCRefAt': '',
        'lastPodRefAt': '',
        'workloadsStatus': [
            {
                'podName': pod_name,
                'podStatus': 'Running',
                'workloadName': '',
                'workloadType': '',
            },
        ],
    }
    wait_volume_kubernetes_status(client, volume_name, ks)

    delete_and_wait_pod(core_api, pod_name)
    delete_and_wait_pvc(core_api, pvc_name)

    ks = {
        'pvName': pv_name,
        'pvStatus': 'Released',
        'namespace': 'default',
        'pvcName': pvc_name,
        'lastPVCRefAt': 'not empty',
        'lastPodRefAt': 'not empty',
    }
    wait_volume_kubernetes_status(client, volume_name, ks)

    # try to reuse the pv
    volume = wait_for_volume_detached(client, volume_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name_extra)
    pod['spec']['volumes'][0]['persistentVolumeClaim']['claimName'] = \
        pvc_name_extra
    create_and_wait_pod(core_api, pod)

    ks = {
        'pvName': pv_name,
        'pvStatus': 'Bound',
        'namespace': 'default',
        'pvcName': pvc_name_extra,
        'lastPVCRefAt': '',
        'lastPodRefAt': '',
        'workloadsStatus': [
            {
                'podName': pod_name,
                'podStatus': 'Running',
                'workloadName': '',
                'workloadType': '',
            },
        ],
    }
    wait_volume_kubernetes_status(client, volume_name, ks)

    delete_and_wait_pod(core_api, pod_name)
    delete_and_wait_pvc(core_api, pvc_name_extra)
    delete_and_wait_pv(core_api, pv_name)

    ks = {
        'pvName': '',
        'pvStatus': '',
        'namespace': 'default',
        'pvcName': pvc_name_extra,
        'lastPVCRefAt': 'not empty',
        'lastPodRefAt': 'not empty',
    }
    wait_volume_kubernetes_status(client, volume_name, ks)

    # without default storage class
    delete_storage_class(storage_class['metadata']['name'])

    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)

    ret = core_api.list_namespaced_persistent_volume_claim(namespace='default')
    pvc2 = None
    for item in ret.items:
        if item.metadata.name == pvc_name:
            pvc2 = item
            break
    assert pvc2
    assert pvc2.spec.storage_class_name == static_sc_name

    delete_and_wait_pvc(core_api, pvc_name)
    delete_and_wait_pv(core_api, pv_name)
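
# A hedged sketch showing how one could confirm which StorageClass is the
# cluster default via the standard kubernetes client; this helper is an
# illustration, not part of the test suite.
from kubernetes import client as k8s_client

def get_default_storage_class_name():
    storage_api = k8s_client.StorageV1Api()
    for sc in storage_api.list_storage_class().items:
        annotations = sc.metadata.annotations or {}
        if annotations.get(
                "storageclass.kubernetes.io/is-default-class") == "true":
            return sc.metadata.name
    return None
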
Example #29
def test_allow_volume_creation_with_degraded_availability_csi(
        client, core_api, apps_api, make_deployment_with_pvc):  # NOQA
    """
    Test Allow Volume Creation with Degraded Availability (CSI)

    Requirement:
    1. Set `allow-volume-creation-with-degraded-availability` to true.
    2. Set `node-level-soft-anti-affinity` to false.

    Steps:
    1. Disable scheduling for node 3.
    2. Create a Deployment Pod with a volume and 3 replicas.
        1. After the volume is attached, scheduling error should be seen.
    3. Write data to the Pod.
    4. Scale down the deployment to 0 to detach the volume.
        1. Scheduled condition should become true.
    5. Scale up the deployment back to 1 and verify the data.
        1. Scheduled condition should become false.
    6. Enable scheduling for node 3.
        1. The volume should start rebuilding on node 3 soon.
        2. Once the rebuilding starts, the scheduled condition should become
           true.
    7. Once the rebuild finishes, scale the deployment down and back up to
       verify the data.
    """
    setting = client.by_id_setting(common.SETTING_DEGRADED_AVAILABILITY)
    client.update(setting, value="true")

    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")

    nodes = client.list_node()
    node3 = nodes[2]
    client.update(node3, allowScheduling=False)

    vol = common.create_and_check_volume(client, generate_volume_name(),
                                         size=str(500 * Mi))

    pv_name = vol.name + "-pv"
    common.create_pv_for_volume(client, core_api, vol, pv_name)

    pvc_name = vol.name + "-pvc"
    common.create_pvc_for_volume(client, core_api, vol, pvc_name)

    deployment_name = vol.name + "-dep"
    deployment = make_deployment_with_pvc(deployment_name, pvc_name)
    deployment["spec"]["replicas"] = 3
    apps_api.create_namespaced_deployment(body=deployment, namespace='default')
    common.wait_for_volume_status(client, vol.name,
                                  common.VOLUME_FIELD_STATE,
                                  common.VOLUME_STATE_ATTACHED)
    common.wait_scheduling_failure(client, vol.name)

    data_path = "/data/test"
    pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name)
    common.write_pod_volume_random_data(core_api, pod.metadata.name,
                                        data_path, common.DATA_SIZE_IN_MB_2)
    created_md5sum = get_pod_data_md5sum(core_api, pod.metadata.name,
                                         data_path)

    deployment['spec']['replicas'] = 0
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    vol = common.wait_for_volume_detached(client, vol.name)
    assert vol.conditions[VOLUME_CONDITION_SCHEDULED]['status'] == "True"

    deployment['spec']['replicas'] = 1
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    common.wait_for_volume_status(client, vol.name,
                                  common.VOLUME_FIELD_STATE,
                                  common.VOLUME_STATE_ATTACHED)
    common.wait_for_volume_condition_scheduled(client, vol.name, "status",
                                               common.CONDITION_STATUS_FALSE)
    pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name)
    assert created_md5sum == get_pod_data_md5sum(core_api,
                                                 pod.metadata.name,
                                                 data_path)

    client.update(node3, allowScheduling=True)
    common.wait_for_rebuild_start(client, vol.name)
    vol = client.by_id_volume(vol.name)
    assert vol.conditions[VOLUME_CONDITION_SCHEDULED]['status'] == "True"
    common.wait_for_rebuild_complete(client, vol.name)

    deployment['spec']['replicas'] = 0
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    common.wait_for_volume_detached(client, vol.name)

    deployment['spec']['replicas'] = 1
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    common.wait_for_volume_status(client, vol.name,
                                  common.VOLUME_FIELD_STATE,
                                  common.VOLUME_STATE_ATTACHED)

    pod = common.wait_and_get_any_deployment_pod(core_api, deployment_name)
    assert created_md5sum == get_pod_data_md5sum(core_api,
                                                 pod.metadata.name,
                                                 data_path)
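
# The test patches spec.replicas several times; a small helper like this
# sketch could factor that out. It assumes the same apps_api (AppsV1Api)
# fixture used above; illustration only.
def scale_deployment(apps_api, name, replicas, namespace='default'):
    apps_api.patch_namespaced_deployment(
        name=name,
        namespace=namespace,
        body={'spec': {'replicas': replicas}})
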
Example #30
def test_csi_expansion_with_replica_failure(client, core_api, storage_class, pvc, pod_manifest):  # NOQA
    """
    Test expansion success but with one replica expansion failure

    1. Create a new `storage_class` with `allowVolumeExpansion` set
    2. Create PVC and Pod with dynamic provisioned volume from the StorageClass
    3. Create an empty directory with expansion snapshot tmp meta file path
       for one replica so that the replica expansion will fail
    4. Generate `test_data` and write to the pod
    5. Delete the pod and wait for volume detachment
    6. Update pvc.spec.resources to expand the volume
    7. Check the expansion result using the Longhorn API. There will be an
       expansion error caused by the failed replica, but overall the
       expansion should succeed.
    8. Create a new pod and
       check if the volume will reuse the failed replica during rebuilding.
    9. Validate the volume content, then check if data writing looks fine
    """
    replenish_wait_setting = \
        client.by_id_setting(SETTING_REPLICA_REPLENISHMENT_WAIT_INTERVAL)
    client.update(replenish_wait_setting, value="600")

    create_storage_class(storage_class)

    pod_name = 'csi-expansion-with-replica-failure-test'
    pvc_name = pod_name + "-pvc"
    pvc['metadata']['name'] = pvc_name
    pvc['spec']['storageClassName'] = storage_class['metadata']['name']
    create_pvc(pvc)

    pod_manifest['metadata']['name'] = pod_name
    pod_manifest['spec']['volumes'] = [{
        'name':
            pod_manifest['spec']['containers'][0]['volumeMounts'][0]['name'],
        'persistentVolumeClaim': {'claimName': pvc_name},
    }]
    create_and_wait_pod(core_api, pod_manifest)

    expand_size = str(EXPANDED_VOLUME_SIZE*Gi)
    pv = wait_and_get_pv_for_pvc(core_api, pvc_name)
    assert pv.status.phase == "Bound"
    volume_name = pv.spec.csi.volume_handle
    volume = client.by_id_volume(volume_name)
    failed_replica = volume.replicas[0]
    fail_replica_expansion(client, core_api,
                           volume_name, expand_size, [failed_replica])

    test_data = generate_random_data(VOLUME_RWTEST_SIZE)
    write_pod_volume_data(core_api, pod_name, test_data)

    delete_and_wait_pod(core_api, pod_name)
    wait_for_volume_detached(client, volume_name)

    # There will be replica expansion error info
    # but the expansion should succeed.
    pvc['spec']['resources'] = {
        'requests': {
            'storage': size_to_string(EXPANDED_VOLUME_SIZE*Gi)
        }
    }
    expand_and_wait_for_pvc(core_api, pvc)
    wait_for_expansion_failure(client, volume_name)
    wait_for_volume_expansion(client, volume_name)
    volume = client.by_id_volume(volume_name)
    assert volume.state == "detached"
    assert volume.size == expand_size
    for r in volume.replicas:
        if r.name == failed_replica.name:
            assert r.failedAt != ""
        else:
            assert r.failedAt == ""

    # Check if the failed replica will be reused during rebuilding,
    # and if the volume still works fine.
    create_and_wait_pod(core_api, pod_manifest)
    volume = wait_for_volume_healthy(client, volume_name)
    for r in volume.replicas:
        assert r.mode == "RW"
    resp = read_volume_data(core_api, pod_name)
    assert resp == test_data
    test_data = generate_random_data(VOLUME_RWTEST_SIZE)
    write_pod_volume_data(core_api, pod_name, test_data)
    resp = read_volume_data(core_api, pod_name)
    assert resp == test_data
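
# A minimal sketch of what expand_and_wait_for_pvc plausibly does with the
# core_api (CoreV1Api) fixture: patch the PVC's storage request, then poll
# the reported capacity. Illustration only; the real helper may differ.
import time

def expand_pvc_sketch(core_api, pvc_name, new_size_str,
                      namespace='default', retries=120, interval=2):
    core_api.patch_namespaced_persistent_volume_claim(
        name=pvc_name,
        namespace=namespace,
        body={'spec': {'resources': {'requests': {'storage': new_size_str}}}})
    for _ in range(retries):
        pvc = core_api.read_namespaced_persistent_volume_claim(
            name=pvc_name, namespace=namespace)
        if pvc.status.capacity and \
                pvc.status.capacity.get('storage') == new_size_str:
            return pvc
        time.sleep(interval)
    raise AssertionError("PVC expansion did not complete")
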
Example #31
def test_setting_toleration():
    """
    Test toleration setting

    1.  Set `taint-toleration` to "key1=value1:NoSchedule; key2:InvalidEffect".
    2.  Verify the request fails.
    3.  Create a volume and attach it.
    4.  Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute".
    5.  Verify that the toleration setting cannot be updated while any
        volume is attached.
    6.  Generate and write `data1` into the volume.
    7.  Detach the volume.
    8.  Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute".
    9.  Wait for all the Longhorn system components to restart with new
        toleration.
    10. Verify that UI, manager, and driver deployer don't restart and
        don't have the new toleration.
    11. Attach the volume again and verify the volume `data1`.
    12. Generate and write `data2` to the volume.
    13. Detach the volume.
    14. Clean the `toleration` setting.
    15. Wait for all the Longhorn system components to restart with no
        toleration.
    16. Attach the volume and validate `data2`.
    17. Generate and write `data3` to the volume.
    """
    client = get_longhorn_api_client()  # NOQA
    apps_api = get_apps_api_client()  # NOQA
    core_api = get_core_api_client()  # NOQA
    count = len(client.list_node())

    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)

    with pytest.raises(Exception) as e:
        client.update(setting,
                      value="key1=value1:NoSchedule; key2:InvalidEffect")
    assert 'invalid effect' in str(e.value)

    volume_name = "test-toleration-vol"  # NOQA
    volume = create_and_check_volume(client, volume_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)

    setting_value_str = "key1=value1:NoSchedule; key2:NoExecute"
    setting_value_dicts = [
        {
            "key": "key1",
            "value": "value1",
            "operator": "Equal",
            "effect": "NoSchedule"
        },
        {
            "key": "key2",
            "value": None,
            "operator": "Exists",
            "effect": "NoExecute"
        },
    ]
    with pytest.raises(Exception) as e:
        client.update(setting, value=setting_value_str)
    assert 'cannot modify toleration setting before all volumes are detached' \
           in str(e.value)

    data1 = write_volume_random_data(volume)
    check_volume_data(volume, data1)

    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    setting = client.update(setting, value=setting_value_str)
    assert setting.value == setting_value_str
    wait_for_toleration_update(core_api, apps_api, count, setting_value_dicts)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data1)
    data2 = write_volume_random_data(volume)
    check_volume_data(volume, data2)
    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    # cleanup
    setting_value_str = ""
    setting_value_dicts = []
    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)
    setting = client.update(setting, value=setting_value_str)
    assert setting.value == setting_value_str
    wait_for_toleration_update(core_api, apps_api, count, setting_value_dicts)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data2)
    data3 = write_volume_random_data(volume)
    check_volume_data(volume, data3)

    cleanup_volume(client, volume)
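
# The setting string "key1=value1:NoSchedule; key2:NoExecute" maps to the
# setting_value_dicts above. A sketch of that parsing rule (operator Equal
# when a value is given, Exists otherwise); illustration only.
def parse_toleration_setting(value_str):
    tolerations = []
    for item in value_str.split(";"):
        item = item.strip()
        if not item:
            continue
        key_value, effect = item.split(":")
        if "=" in key_value:
            key, value = key_value.split("=")
            operator = "Equal"
        else:
            key, value, operator = key_value, None, "Exists"
        tolerations.append({"key": key, "value": value,
                            "operator": operator, "effect": effect})
    return tolerations
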
Example #32
def test_setting_priority_class(core_api, apps_api, scheduling_api,
                                priority_class, volume_name):  # NOQA
    """
    Test that the Priority Class setting is validated and utilized correctly.

    1. Verify that the name of a non-existent Priority Class cannot be used
    for the Setting.
    2. Create a new Priority Class in Kubernetes.
    3. Create and attach a Volume.
    4. Verify that the Priority Class Setting cannot be updated with an
    attached Volume.
    5. Generate and write `data1`.
    6. Detach the Volume.
    7. Update the Priority Class Setting to the new Priority Class.
    8. Wait for all the Longhorn system components to restart with the new
       Priority Class.
    9. Verify that UI, manager, and driver deployer don't have Priority Class
    10. Attach the Volume and verify `data1`.
    11. Generate and write `data2`.
    12. Unset the Priority Class Setting.
    13. Wait for all the Longhorn system components to restart without the
        Priority Class.
    14. Verify that UI, manager, and driver deployer don't have Priority Class
    15. Attach the Volume and verify `data2`.
    16. Generate and write `data3`.

    Note: system components are workloads other than UI, manager, driver
     deployer
    """
    client = get_longhorn_api_client()  # NOQA
    count = len(client.list_node())
    name = priority_class['metadata']['name']
    setting = client.by_id_setting(SETTING_PRIORITY_CLASS)

    with pytest.raises(Exception) as e:
        client.update(setting, value=name)
    assert 'failed to get priority class ' in str(e.value)

    scheduling_api.create_priority_class(priority_class)

    volume = create_and_check_volume(client, volume_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)

    with pytest.raises(Exception) as e:
        client.update(setting, value=name)
    assert 'cannot modify priority class setting before all volumes are ' \
           'detached' in str(e.value)

    data1 = write_volume_random_data(volume)
    check_volume_data(volume, data1)

    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    setting = client.update(setting, value=name)
    assert setting.value == name

    wait_for_priority_class_update(core_api, apps_api, count, priority_class)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data1)
    data2 = write_volume_random_data(volume)
    check_volume_data(volume, data2)
    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    setting = client.by_id_setting(SETTING_PRIORITY_CLASS)
    setting = client.update(setting, value='')
    assert setting.value == ''
    wait_for_priority_class_update(core_api, apps_api, count)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data2)
    data3 = write_volume_random_data(volume)
    check_volume_data(volume, data3)

    cleanup_volume(client, volume)
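
# A hedged sketch of building the body the scheduling_api
# (SchedulingV1Api) fixture consumes, using the kubernetes client models;
# the name and value below are hypothetical.
from kubernetes import client as k8s_client

def make_priority_class(name="longhorn-test-priority", value=1000):
    return k8s_client.V1PriorityClass(
        api_version="scheduling.k8s.io/v1",
        kind="PriorityClass",
        metadata=k8s_client.V1ObjectMeta(name=name),
        value=value)

# Usage sketch: scheduling_api.create_priority_class(make_priority_class())
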
Example #33
def test_instance_manager_cpu_reservation(client, core_api):  # NOQA
    """
    Test if the CPU requests of instance manager pods are controlled by
    the settings and the node specs correctly.

    1. Try to change the deprecated setting `Guaranteed Engine CPU`.
       --> The setting update should fail.
    2. Pick node 1, set `node.engineManagerCPURequest` and
       `node.replicaManagerCPURequest` to 150 and 250, respectively.
       --> The IM pods on this node will be restarted, and the CPU requests
       of these IM pods match the milli values above.
    3. Change the new settings `Guaranteed Engine Manager CPU` and
       `Guaranteed Replica Manager CPU` to 10 and 20, respectively.
       Then wait for all IM pods except those on node 1 to restart.
       --> The CPU requests of the restarted IM pods equal the new setting
           value multiplied by the kube node's allocatable CPU.
    4. Set both new settings to 0.
       --> All IM pods except those on node 1 will be restarted without
           CPU requests.
    5. Set the fields on node 1 to 0.
       --> The IM pods on node 1 will be restarted without CPU requests.
    6. Set both new settings to 2 random values whose sum is smaller
       than 40, then wait for all IM pods to restart.
       --> The CPU requests of all IM pods equal the new setting value
           multiplied by the kube node's allocatable CPU.
    7. Set both new settings to 2 random values such that either value or
       their sum is greater than 40.
       --> The setting update should fail.
    8. Create a volume and verify everything works as normal.

    Note: use fixture to restore the setting into the original state
    """

    instance_managers = client.list_instance_manager()
    deprecated_setting = client.by_id_setting(SETTING_GUARANTEED_ENGINE_CPU)
    with pytest.raises(Exception) as e:
        client.update(deprecated_setting, value="0.1")

    host_node_name = get_self_host_id()
    host_node = client.by_id_node(host_node_name)
    em_on_host, rm_on_host = None, None
    other_ems, other_rms = [], []
    for im in instance_managers:
        if im.managerType == "engine":
            if im.nodeID == host_node_name:
                em_on_host = im
            else:
                other_ems.append(im)
        else:
            if im.nodeID == host_node_name:
                rm_on_host = im
            else:
                other_rms.append(im)
    assert em_on_host and rm_on_host
    host_kb_node = core_api.read_node(host_node_name)
    if host_kb_node.status.allocatable["cpu"].endswith('m'):
        allocatable_millicpu = int(host_kb_node.status.allocatable["cpu"][:-1])
    else:
        allocatable_millicpu = int(
            host_kb_node.status.allocatable["cpu"]) * 1000

    client.update(host_node,
                  allowScheduling=True,
                  engineManagerCPURequest=150,
                  replicaManagerCPURequest=250)
    time.sleep(5)
    guaranteed_engine_cpu_setting_check(client, core_api, [em_on_host],
                                        "Running", True, "150m")
    guaranteed_engine_cpu_setting_check(client, core_api, [rm_on_host],
                                        "Running", True, "250m")

    em_setting = client.by_id_setting(SETTING_GUARANTEED_ENGINE_MANAGER_CPU)
    client.update(em_setting, value="10")
    rm_setting = client.by_id_setting(SETTING_GUARANTEED_REPLICA_MANAGER_CPU)
    client.update(rm_setting, value="20")
    time.sleep(5)
    guaranteed_engine_cpu_setting_check(
        client, core_api, other_ems, "Running", True,
        str(int(allocatable_millicpu * 10 / 100)) + "m")
    guaranteed_engine_cpu_setting_check(
        client, core_api, other_rms, "Running", True,
        str(int(allocatable_millicpu * 20 / 100)) + "m")

    em_setting = client.by_id_setting(SETTING_GUARANTEED_ENGINE_MANAGER_CPU)
    client.update(em_setting, value="0")
    rm_setting = client.by_id_setting(SETTING_GUARANTEED_REPLICA_MANAGER_CPU)
    client.update(rm_setting, value="0")
    time.sleep(5)
    guaranteed_engine_cpu_setting_check(client, core_api, other_ems, "Running",
                                        True, "")
    guaranteed_engine_cpu_setting_check(client, core_api, other_rms, "Running",
                                        True, "")

    ems, rms = other_ems, other_rms
    ems.append(em_on_host)
    rms.append(rm_on_host)

    host_node = client.by_id_node(host_node_name)
    client.update(host_node,
                  allowScheduling=True,
                  engineManagerCPURequest=0,
                  replicaManagerCPURequest=0)
    time.sleep(5)
    guaranteed_engine_cpu_setting_check(client, core_api, ems, "Running", True,
                                        "")
    guaranteed_engine_cpu_setting_check(client, core_api, rms, "Running", True,
                                        "")

    client.update(em_setting, value="20")
    rm_setting = client.by_id_setting(SETTING_GUARANTEED_REPLICA_MANAGER_CPU)
    client.update(rm_setting, value="15")
    time.sleep(5)
    guaranteed_engine_cpu_setting_check(
        client, core_api, ems, "Running", True,
        str(int(allocatable_millicpu * 20 / 100)) + "m")
    guaranteed_engine_cpu_setting_check(
        client, core_api, rms, "Running", True,
        str(int(allocatable_millicpu * 15 / 100)) + "m")

    with pytest.raises(Exception) as e:
        client.update(em_setting, value="41")
    assert "should be between 0 to 40" in \
           str(e.value)

    em_setting = client.by_id_setting(SETTING_GUARANTEED_ENGINE_MANAGER_CPU)
    with pytest.raises(Exception) as e:
        client.update(em_setting, value="35")
    assert "The sum should not be smaller than 0% or greater than 40%" in \
           str(e.value)

    # Create a volume to test
    vol_name = generate_volume_name()
    volume = create_and_check_volume(client, vol_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, vol_name)
    assert len(volume.replicas) == 3
    data = write_volume_random_data(volume)
    check_volume_data(volume, data)
    cleanup_volume(client, volume)
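
# The expected IM pod CPU request above is derived from the node's
# allocatable CPU and the setting percentage. A pure-Python sketch of that
# arithmetic, mirroring the parsing at the top of the test:
def expected_im_cpu_request(allocatable_cpu, percent):
    # allocatable_cpu is the kube node value, e.g. "4" or "4000m".
    if allocatable_cpu.endswith('m'):
        millicpu = int(allocatable_cpu[:-1])
    else:
        millicpu = int(allocatable_cpu) * 1000
    return str(int(millicpu * percent / 100)) + "m"

# e.g. expected_im_cpu_request("4", 20) == "800m"
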
Example #34
def test_setting_toleration_extra(core_api, apps_api):  # NOQA
    """
    Steps:
    1. Set Kubernetes Taint Toleration to:
       `ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule`.
    2. Verify that all system components have the 2 tolerations
       `ex.com/foobar:NoExecute; ex.com/foobar:NoSchedule`.
       Verify that UI, manager, and driver deployer don't restart and
       don't have the tolerations.
    3. Set Kubernetes Taint Toleration to:
       `node-role.kubernetes.io/controlplane=true:NoSchedule`.
    4. Verify that all system components have the toleration
       `node-role.kubernetes.io/controlplane=true:NoSchedule`,
       and don't have the 2 tolerations
       `ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule`.
       Verify that UI, manager, and driver deployer don't restart and
       don't have the toleration.
    5. Set Kubernetes Taint Toleration to the special value:
       `:`.
    6. Verify that all system components have the toleration with
       `operator: Exists` and the other fields of the toleration empty.
       Verify that all system components don't have the toleration
       `node-role.kubernetes.io/controlplane=true:NoSchedule`.
       Verify that UI, manager, and driver deployer don't restart and
       don't have the toleration.
    7. Clear the Kubernetes Taint Toleration setting.

    Note: system components are workloads other than UI, manager, driver
    deployer
    """
    settings = [
        {
            "value":
            "ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule",
            "expect": [
                {
                    "key": "ex.com/foobar",
                    "value": None,
                    "operator": "Exists",
                    "effect": "NoExecute"
                },
                {
                    "key": "ex.com/foobar",
                    "value": None,
                    "operator": "Exists",
                    "effect": "NoSchedule"
                },
            ],
        },
        {
            "value":
            "node-role.kubernetes.io/controlplane=true:NoSchedule",
            "expect": [
                {
                    "key": "node-role.kubernetes.io/controlplane",
                    "value": "true",
                    "operator": "Equal",
                    "effect": "NoSchedule"
                },
            ],
        },
        # Skip this special toleration for now because it makes
        # Longhorn deploy manager pods on control/etcd nodes,
        # and the control/etcd nodes become "down" after the test
        # clears this toleration.
        # We will enable this test once we implement logic for
        # deleting failed nodes.
        # {
        #     "value": ":",
        #     "expect": [
        #         {
        #             "key": None,
        #             "value": None,
        #             "operator": "Exists",
        #             "effect": None,
        #         },
        #     ]
        # },
        {
            "value": "",
            "expect": [],
        },
    ]

    chk_removed_tolerations = []
    for setting in settings:
        client = get_longhorn_api_client()  # NOQA
        taint_toleration = client.by_id_setting(SETTING_TAINT_TOLERATION)
        updated = client.update(taint_toleration, value=setting["value"])
        assert updated.value == setting["value"]

        node_count = len(client.list_node())
        wait_for_toleration_update(core_api, apps_api, node_count,
                                   setting["expect"], chk_removed_tolerations)
        chk_removed_tolerations = setting["expect"]
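
# A hedged sketch of one check wait_for_toleration_update could perform:
# read a component deployment's pod template and compare its tolerations.
# The deployment name and namespace below are assumptions for illustration.
def deployment_has_toleration(apps_api, name, expect,
                              namespace='longhorn-system'):
    dep = apps_api.read_namespaced_deployment(name=name, namespace=namespace)
    tolerations = dep.spec.template.spec.tolerations or []
    return any(t.key == expect["key"] and t.value == expect["value"] and
               t.operator == expect["operator"] and
               t.effect == expect["effect"] for t in tolerations)
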
Example #35
def ha_backup_deletion_recovery_test(client, volume_name, size, base_image=""):  # NOQA
    volume = client.create_volume(name=volume_name, size=size,
                                  numberOfReplicas=2, baseImage=base_image)
    volume = common.wait_for_volume_detached(client, volume_name)

    host_id = get_self_host_id()
    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    setting = client.by_id_setting(common.SETTING_BACKUP_TARGET)
    # test backupTarget for multiple settings
    backupstores = common.get_backupstore_url()
    for backupstore in backupstores:
        if common.is_backupTarget_s3(backupstore):
            backupsettings = backupstore.split("$")
            setting = client.update(setting, value=backupsettings[0])
            assert setting["value"] == backupsettings[0]

            credential = client.by_id_setting(
                    common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value=backupsettings[1])
            assert credential["value"] == backupsettings[1]
        else:
            setting = client.update(setting, value=backupstore)
            assert setting["value"] == backupstore
            credential = client.by_id_setting(
                    common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value="")
            assert credential["value"] == ""

        data = write_volume_random_data(volume)
        snap2 = volume.snapshotCreate()
        volume.snapshotCreate()

        volume.snapshotBackup(name=snap2["name"])

        _, b = common.find_backup(client, volume_name, snap2["name"])

        res_name = common.generate_volume_name()
        res_volume = client.create_volume(name=res_name, size=size,
                                          numberOfReplicas=2,
                                          fromBackup=b["url"])
        res_volume = common.wait_for_volume_detached(client, res_name)
        res_volume = res_volume.attach(hostId=host_id)
        res_volume = common.wait_for_volume_healthy(client, res_name)
        check_volume_data(res_volume, data)

        snapshots = res_volume.snapshotList()
        # only the backup snapshot + volume-head
        assert len(snapshots) == 2
        backup_snapshot = ""
        for snap in snapshots:
            if snap["name"] != "volume-head":
                backup_snapshot = snap["name"]
        assert backup_snapshot != ""

        res_volume.snapshotCreate()
        snapshots = res_volume.snapshotList()
        assert len(snapshots) == 3

        res_volume.snapshotDelete(name=backup_snapshot)
        res_volume.snapshotPurge()
        snapshots = res_volume.snapshotList()
        assert len(snapshots) == 2

        ha_rebuild_replica_test(client, res_name)

        res_volume = res_volume.detach()
        res_volume = common.wait_for_volume_detached(client, res_name)

        client.delete(res_volume)
        common.wait_for_volume_delete(client, res_name)

    volume = volume.detach()
    volume = common.wait_for_volume_detached(client, volume_name)

    client.delete(volume)
    common.wait_for_volume_delete(client, volume_name)

    volumes = client.list_volume()
    assert len(volumes) == 0
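
# In these examples an S3 backup target is encoded as
# "<target-url>$<credential-secret-name>" and split on "$". A tiny sketch
# of that convention; the sample value below is hypothetical.
def split_s3_backupstore(backupstore):
    target_url, credential_secret = backupstore.split("$")
    return target_url, credential_secret

# e.g. split_s3_backupstore(
#          "s3://backupbucket@us-east-1/backupstore$minio-secret")
# -> ("s3://backupbucket@us-east-1/backupstore", "minio-secret")
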
Example #36
def test_replica_scheduler_update_minimal_available(client):  # NOQA
    minimal_available_setting = client.by_id_setting(
        SETTING_STORAGE_MINIMAL_AVAILABLE_PERCENTAGE)
    old_minimal_setting = minimal_available_setting["value"]

    nodes = client.list_node()
    expect_node_disk = {}
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            if disk["path"] == DEFAULT_DISK_PATH:
                expect_disk = disk
                expect_disk["fsid"] = fsid
                expect_node_disk[node["name"]] = expect_disk

    # set storage minimal available percentage to 100
    # to test that no replica can be scheduled
    minimal_available_setting = client.update(minimal_available_setting,
                                              value="100")
    # wait for disks state
    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            wait_for_disk_conditions(client, node["name"],
                                     fsid, DISK_CONDITION_SCHEDULABLE,
                                     CONDITION_STATUS_FALSE)

    lht_hostId = get_self_host_id()
    vol_name = common.generate_volume_name()
    volume = client.create_volume(name=vol_name,
                                  size=SIZE, numberOfReplicas=len(nodes))
    volume = common.wait_for_volume_condition_scheduled(client, vol_name,
                                                        "status",
                                                        CONDITION_STATUS_FALSE)

    # set storage minimal available percentage back to the default value (10)
    minimal_available_setting = client.update(minimal_available_setting,
                                              value=old_minimal_setting)
    # wait for disks state
    nodes = client.list_node()
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            wait_for_disk_conditions(client, node["name"],
                                     fsid, DISK_CONDITION_SCHEDULABLE,
                                     CONDITION_STATUS_TRUE)
    # check volume status
    volume = common.wait_for_volume_condition_scheduled(client, vol_name,
                                                        "status",
                                                        CONDITION_STATUS_TRUE)
    volume = common.wait_for_volume_detached(client, vol_name)
    assert volume["state"] == "detached"
    assert volume["created"] != ""

    volume.attach(hostId=lht_hostId)
    volume = common.wait_for_volume_healthy(client, vol_name)
    nodes = client.list_node()
    node_hosts = []
    for node in nodes:
        node_hosts.append(node["name"])
    # check that all replicas are scheduled to the default disk
    for replica in volume["replicas"]:
        id = replica["hostId"]
        assert id != ""
        assert replica["running"]
        expect_disk = expect_node_disk[id]
        assert replica["diskID"] == expect_disk["fsid"]
        assert expect_disk["path"] in replica["dataPath"]
        node_hosts = [x for x in node_hosts if x != id]
    assert len(node_hosts) == 0

    # clean volume and disk
    cleanup_volume(client, vol_name)
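
# A hedged sketch of the schedulability rule this test exercises: with the
# minimal-available setting at 100%, available space can never exceed the
# required floor, so no disk is schedulable. The exact formula is an
# assumption about the scheduler, not taken from this file.
def disk_schedulable_sketch(storage_available, storage_maximum,
                            minimal_available_pct):
    return storage_available > storage_maximum * minimal_available_pct / 100
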
Example #37
def test_replica_scheduler_just_under_over_provisioning(client):  # NOQA
    over_provisioning_setting = client.by_id_setting(
        SETTING_STORAGE_OVER_PROVISIONING_PERCENTAGE)
    old_provisioning_setting = over_provisioning_setting["value"]
    # set storage over provisioning percentage to 100
    over_provisioning_setting = client.update(over_provisioning_setting,
                                              value="100")

    lht_hostId = get_self_host_id()
    nodes = client.list_node()
    expect_node_disk = {}
    max_size_array = []
    for node in nodes:
        disks = node["disks"]
        for fsid, disk in disks.items():
            if disk["path"] == DEFAULT_DISK_PATH:
                expect_disk = disk
                expect_disk["fsid"] = fsid
                expect_node_disk[node["name"]] = expect_disk
                max_size_array.append(disk["storageMaximum"])
            disk["storageReserved"] = 0
            update_disks = get_update_disks(disks)
            node = node.diskUpdate(disks=update_disks)
            disks = node["disks"]
            for fsid, disk in disks.items():
                wait_for_disk_status(client, node["name"],
                                     fsid, "storageReserved", 0)

    max_size = min(max_size_array)
    # test that a volume just under the over-provisioning limit
    # can be scheduled
    vol_name = common.generate_volume_name()
    volume = client.create_volume(name=vol_name,
                                  size=str(max_size),
                                  numberOfReplicas=len(nodes))
    volume = common.wait_for_volume_condition_scheduled(client, vol_name,
                                                        "status",
                                                        CONDITION_STATUS_TRUE)
    volume = common.wait_for_volume_detached(client, vol_name)
    assert volume["state"] == "detached"
    assert volume["created"] != ""

    volume.attach(hostId=lht_hostId)
    volume = common.wait_for_volume_healthy(client, vol_name)
    nodes = client.list_node()
    node_hosts = []
    for node in nodes:
        node_hosts.append(node["name"])
    # check that all replicas are scheduled to the default disk
    for replica in volume["replicas"]:
        id = replica["hostId"]
        assert id != ""
        assert replica["running"]
        expect_disk = expect_node_disk[id]
        assert replica["diskID"] == expect_disk["fsid"]
        assert expect_disk["path"] in replica["dataPath"]
        node_hosts = [x for x in node_hosts if x != id]
    assert len(node_hosts) == 0

    # clean volume and disk
    cleanup_volume(client, vol_name)
    client.update(over_provisioning_setting, value=old_provisioning_setting)
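
# A hedged sketch of the over-provisioning budget this test walks up to:
# at 100% the schedulable budget equals the usable disk size, so a volume
# sized at min(storageMaximum) across nodes is "just under" the limit.
# The formula is an assumption about the scheduler, not taken from this
# file.
def over_provisioning_budget(storage_maximum, storage_reserved,
                             storage_scheduled, over_provisioning_pct):
    limit = (storage_maximum - storage_reserved) * over_provisioning_pct / 100
    # remaining bytes that can still be scheduled on this disk
    return limit - storage_scheduled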