def test_frontend_switch(
        grpc_controller_no_frontend,  # NOQA
        grpc_replica1, grpc_replica2):  # NOQA
    """Exercise a frontend start / shutdown / start cycle on one volume.

    The volume is started with two replicas and no frontend, then the
    block-device frontend is brought up, a payload is pushed through
    ``verify_data``, the frontend is cycled, and the same payload is
    checked again with ``verify_read`` on the re-acquired device.
    """
    controller = grpc_controller_no_frontend

    for replica in (grpc_replica1, grpc_replica2):
        open_replica(replica)

    # The controller must not know about any replica yet.
    assert len(controller.replica_list()) == 0

    replica_urls = [grpc_replica1.url, grpc_replica2.url]
    volume = controller.volume_start(replicas=replica_urls)
    assert volume.name == VOLUME_NO_FRONTEND_NAME
    assert volume.replicaCount == 2
    assert volume.frontend == ""

    # First frontend session: put the payload on the device.
    controller.volume_frontend_start(FRONTEND_TGT_BLOCKDEV)
    blockdev = get_blockdev(volume=VOLUME_NO_FRONTEND_NAME)

    payload_offset = 1024
    payload = random_string(128)
    verify_data(blockdev, payload_offset, payload)

    controller.volume_frontend_shutdown()

    # Second frontend session: the payload must still be readable.
    controller.volume_frontend_start(FRONTEND_TGT_BLOCKDEV)
    blockdev = get_blockdev(volume=VOLUME_NO_FRONTEND_NAME)
    verify_read(blockdev, payload_offset, payload)

    controller.volume_frontend_shutdown()
def test_ha_remove_extra_disks(
        grpc_controller,  # NOQA
        grpc_replica1, grpc_replica2):  # NOQA
    """Re-add a replica that still carries data from an earlier volume run.

    Replica 1 serves the volume alone first and receives a payload plus a
    snapshot.  The controller is then cleaned up and restarted with only
    replica 2, which receives a different payload.  Finally replica 1 is
    reloaded and added back, and after the rebuild completes the second
    payload is verified on the device.
    """
    controller_address = grpc_controller.address

    prepare_backup_dir(BACKUP_DIR)

    # --- Phase 1: volume backed by replica 1 only ---
    open_replica(grpc_replica1)
    assert len(grpc_controller.replica_list()) == 0

    first_url = grpc_replica1.url
    volume = grpc_controller.volume_start(replicas=[first_url])
    assert volume.name == VOLUME_NAME
    assert volume.replicaCount == 1

    attached = grpc_controller.replica_list()
    assert len(attached) == 1
    assert attached[0].mode == "RW"

    blockdev = get_blockdev(VOLUME_NAME)

    payload_offset = 1024
    stale_payload = random_string(128)
    verify_data(blockdev, payload_offset, stale_payload)

    # now replica1 contains extra data in a snapshot
    cmd.snapshot_create(controller_address)

    cleanup_controller(grpc_controller)

    # --- Phase 2: volume backed by replica 2 only ---
    open_replica(grpc_replica2)
    assert len(grpc_controller.replica_list()) == 0

    second_url = grpc_replica2.url
    volume = grpc_controller.volume_start(replicas=[second_url])
    assert volume.name == VOLUME_NAME
    assert volume.replicaCount == 1

    attached = grpc_controller.replica_list()
    assert len(attached) == 1
    assert attached[0].mode == "RW"

    blockdev = get_blockdev(VOLUME_NAME)

    payload_offset = 1024
    fresh_payload = random_string(128)
    verify_data(blockdev, payload_offset, fresh_payload)

    # --- Phase 3: bring replica 1 back and rebuild it ---
    reloaded = grpc_replica1.replica_reload()
    print(reloaded)

    cmd.add_replica(controller_address, first_url)
    wait_for_rebuild_complete(controller_address)

    verify_data(blockdev, payload_offset, fresh_payload)
def check_dr_volume_block_device_size(grpc_em, grpc_c, size):
    """Bring the frontend up, check the block device size, then shut it down.

    :param grpc_em: not used by this helper; kept in the signature for
        callers that pass it.
    :param grpc_c: controller client whose frontend is started and stopped.
    :param size: expected size, forwarded to ``check_block_device_size``.
    """
    grpc_c.volume_frontend_start(FRONTEND_TGT_BLOCKDEV)
    assert grpc_c.volume_get().frontendState == "up"

    # The device handle itself is not needed; the size check goes by name.
    get_blockdev(volume=VOLUME_NO_FRONTEND_NAME)
    check_block_device_size(VOLUME_NO_FRONTEND_NAME, size)

    grpc_c.volume_frontend_shutdown()
    assert grpc_c.volume_get().frontendState == "down"
def test_ha_single_replica_failure(
        grpc_controller,  # NOQA
        grpc_replica1, grpc_replica2):  # NOQA
    """Check the volume stays readable after one of two replicas goes away.

    After a payload is verified on a two-replica volume, replica 2 is
    cleaned up.  More I/O is issued through ``verify_async``, replica 2 is
    expected to be reported in ``ERR`` mode, and the original payload is
    read back from the device.
    """
    for replica in (grpc_replica1, grpc_replica2):
        open_replica(replica)

    assert len(grpc_controller.replica_list()) == 0

    url_one = grpc_replica1.url
    url_two = grpc_replica2.url
    volume = grpc_controller.volume_start(replicas=[url_one, url_two])
    assert volume.replicaCount == 2

    members = grpc_controller.replica_list()
    assert len(members) == 2
    for member in members:
        assert member.mode == "RW"

    blockdev = get_blockdev(VOLUME_NAME)

    payload_offset = 1024
    payload = random_string(128)
    verify_data(blockdev, payload_offset, payload)

    # Take replica 2 out from under the running volume.
    cleanup_replica(grpc_replica2)

    verify_async(blockdev, 10, 128, 1)

    verify_replica_state(grpc_controller, url_two, "ERR")

    verify_read(blockdev, payload_offset, payload)
def test_ha_revision_counter_consistency(
        grpc_controller,  # NOQA
        grpc_replica1, grpc_replica2):  # NOQA
    """Check both replicas report the same non-zero revision counter.

    A two-replica volume is started, I/O is driven through
    ``verify_async``, and the ``revisionCounter`` values returned by the
    two replicas are compared.
    """
    for replica in (grpc_replica1, grpc_replica2):
        open_replica(replica)

    assert len(grpc_controller.replica_list()) == 0

    replica_urls = [grpc_replica1.url, grpc_replica2.url]
    volume = grpc_controller.volume_start(replicas=replica_urls)
    assert volume.name == VOLUME_NAME
    assert volume.replicaCount == 2

    members = grpc_controller.replica_list()
    assert len(members) == 2
    for member in members:
        assert member.mode == "RW"

    blockdev = get_blockdev(VOLUME_NAME)

    verify_async(blockdev, 10, 128, 100)

    first_state = grpc_replica1.replica_get()
    second_state = grpc_replica2.replica_get()

    # kernel can merge requests so backend may not receive 1000 writes
    assert first_state.revisionCounter > 0
    assert first_state.revisionCounter == second_state.revisionCounter
def test_snapshot_tree_rebuild(
        grpc_controller,  # NOQA
        grpc_replica1, grpc_replica2):  # NOQA
    """Check a snapshot tree still verifies after replica 2 is rebuilt.

    A snapshot tree is built with ``snapshot_tree_build``, replica 2 is
    cleaned up and removed from the controller, then re-opened and added
    back.  Once the rebuild completes and replica 2 is ``RW`` again, the
    tree is checked with ``snapshot_tree_verify``.
    """
    controller_address = grpc_controller.address

    tree_offset = 0
    tree_length = 128

    for replica in (grpc_replica1, grpc_replica2):
        open_replica(replica)

    assert len(grpc_controller.replica_list()) == 0

    url_one = grpc_replica1.url
    url_two = grpc_replica2.url
    volume = grpc_controller.volume_start(replicas=[url_one, url_two])
    assert volume.name == VOLUME_NAME
    assert volume.replicaCount == 2

    members = grpc_controller.replica_list()
    assert len(members) == 2
    for member in members:
        assert member.mode == "RW"

    blockdev = get_blockdev(VOLUME_NAME)

    snapshots, snapshot_payloads = snapshot_tree_build(
        blockdev, controller_address, ENGINE_NAME, tree_offset, tree_length)

    payload_offset = 1024
    payload = random_string(128)
    verify_data(blockdev, payload_offset, payload)

    # Cleanup replica2
    cleanup_replica(grpc_replica2)

    verify_async(blockdev, 10, 128, 1)

    verify_replica_state(grpc_controller, url_two, "ERR")

    verify_read(blockdev, payload_offset, payload)

    # Uses the entry from the listing taken while both replicas were RW.
    grpc_controller.replica_delete(members[1].address)

    # Rebuild replica2
    open_replica(grpc_replica2)
    cmd.add_replica(controller_address, url_two)
    wait_for_rebuild_complete(controller_address)

    verify_async(blockdev, 10, 128, 1)

    verify_replica_state(grpc_controller, url_two, "RW")

    snapshot_tree_verify(blockdev, controller_address, ENGINE_NAME,
                         tree_offset, tree_length,
                         snapshots, snapshot_payloads)
def snapshot_tree_verify_backup_node(
        grpc_controller, grpc_replica1, grpc_replica2,
        address, engine_name, offset, length, backup, data, name):  # NOQA
    """Restore the backup stored under ``name`` and compare device content.

    The volume is reset, ``backup[name]`` is restored through
    ``restore_with_frontend``, and ``length`` bytes read at ``offset``
    must equal ``data[name]``.
    """
    reset_volume(grpc_controller, grpc_replica1, grpc_replica2)

    volume_name = grpc_controller.volume_get().name
    blockdev = get_blockdev(volume_name)

    restore_with_frontend(address, engine_name, backup[name])

    assert read_dev(blockdev, offset, length) == data[name]
def verify_no_frontend_data(grpc_em, data_offset, data, grpc_c):
    """Temporarily start the frontend and check ``data`` at ``data_offset``.

    :param grpc_em: not used by this helper; kept in the signature for
        callers that pass it.
    :param data_offset: offset forwarded to ``verify_read``.
    :param data: expected content forwarded to ``verify_read``.
    :param grpc_c: controller client whose frontend is started and stopped.
    """
    grpc_c.volume_frontend_start(FRONTEND_TGT_BLOCKDEV)
    assert grpc_c.volume_get().frontendState == "up"

    blockdev = get_blockdev(volume=VOLUME_NO_FRONTEND_NAME)
    verify_read(blockdev, data_offset, data)

    grpc_c.volume_frontend_shutdown()
    assert grpc_c.volume_get().frontendState == "down"
def backup_with_backing_file_test(
        backup_target,  # NOQA
        grpc_backing_controller,  # NOQA
        grpc_backing_replica1,  # NOQA
        grpc_backing_replica2):  # NOQA
    """Back up and restore a volume that sits on a backing image.

    An initial snapshot is taken before anything is written; the device
    content at that point must be non-empty and equal to what
    ``read_from_backing_file`` returns.  That snapshot is backed up, the
    generic ``backup_test`` scenario is run on the same volume, and the
    initial backup is then restored and compared by content and checksum
    before being removed.
    """
    controller_address = grpc_backing_controller.address

    blockdev = get_dev(grpc_backing_replica1, grpc_backing_replica2,
                       grpc_backing_controller)

    read_offset = 0
    read_length = 256

    base_snapshot = cmd.snapshot_create(controller_address)

    content_before = read_dev(blockdev, read_offset, read_length)
    assert content_before != ""
    base_checksum = checksum_dev(blockdev)

    backing_content = read_from_backing_file(read_offset, read_length)
    assert content_before == backing_content

    base_backup = create_backup(controller_address, base_snapshot,
                                backup_target,
                                backing_image_name=BACKING_IMAGE_NAME,
                                backing_image_url=BACKING_IMAGE_URL)
    assert base_backup["VolumeName"] == VOLUME_BACKING_NAME

    backup_test(grpc_backing_replica1, grpc_backing_replica2,
                grpc_backing_controller,
                VOLUME_BACKING_NAME, ENGINE_BACKING_NAME, backup_target)

    reset_volume(grpc_backing_controller,
                 grpc_backing_replica1, grpc_backing_replica2)
    blockdev = get_blockdev(VOLUME_BACKING_NAME)

    restore_with_frontend(controller_address, ENGINE_BACKING_NAME,
                          base_backup["URL"])

    content_after = read_dev(blockdev, read_offset, read_length)
    assert content_before == content_after

    restored_checksum = checksum_dev(blockdev)
    assert restored_checksum == base_checksum

    rm_backups(controller_address, ENGINE_BACKING_NAME,
               [base_backup["URL"]])
def backup_test(grpc_r1, grpc_r2, grpc_c,
                volume_name, engine_name, backup_target):
    """Create three successive backups and restore them out of order.

    Each round writes a fresh random payload at offset 0, records the
    device checksum, snapshots, and backs the snapshot up.  The backups
    are then restored in the order third, first, second; after every
    restore the payload and checksum must match what was recorded for
    that round, and the backup is removed.

    :param grpc_r1: first replica client, passed to ``reset_volume``.
    :param grpc_r2: second replica client, passed to ``reset_volume``.
    :param grpc_c: controller client; its ``address`` is used for commands.
    :param volume_name: expected ``VolumeName`` of each backup and the
        name used to look up the block device.
    :param engine_name: forwarded to the restore and removal helpers.
    :param backup_target: destination forwarded to ``create_backup``.
    """
    address = grpc_c.address

    blockdev = get_blockdev(volume_name)

    offset = 0
    length = 128

    # One (backup info, payload, checksum) record per round.
    rounds = []
    for _ in range(3):
        payload = random_string(length)
        verify_data(blockdev, offset, payload)
        checksum = checksum_dev(blockdev)

        snapshot = cmd.snapshot_create(address)
        info = create_backup(address, snapshot, backup_target)
        assert info["VolumeName"] == volume_name
        assert info["Size"] == BLOCK_SIZE_STR

        rounds.append((info, payload, checksum))

    # Restore order is deliberately not the creation order: 3rd, 1st, 2nd.
    for index in (2, 0, 1):
        info, payload, checksum = rounds[index]

        reset_volume(grpc_c, grpc_r1, grpc_r2)
        blockdev = get_blockdev(volume_name)

        restore_with_frontend(address, engine_name, info["URL"])

        restored = read_dev(blockdev, offset, length)
        assert restored == payload
        restored_checksum = checksum_dev(blockdev)
        assert restored_checksum == checksum

        rm_backups(address, engine_name, [info["URL"]])
def test_backup_incremental_logic(grpc_replica1, grpc_replica2,
                                  grpc_controller, backup_targets):  # NOQA
    """Check the ``IsIncremental`` flag across a volume delete/recreate.

    For every backup target: a first backup must be reported as
    non-incremental and a second one as incremental.  The volume is then
    torn down and recreated; the first (empty) backup afterwards must be
    non-incremental with size ``'0'`` and the next one incremental.
    Finally the first and the last backups are restored and their
    content and checksums compared with what was recorded.
    """
    volume_name = VOLUME_NAME
    engine_name = ENGINE_NAME

    offset = 0
    length = 128
    half = length // 2

    for backup_target in backup_targets:
        blockdev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)
        address = grpc_controller.address

        # initial backup
        full_payload = random_string(length)
        verify_data(blockdev, offset, full_payload)
        full_checksum = checksum_dev(blockdev)
        first_snapshot = cmd.snapshot_create(address)
        first_backup = create_backup(address, first_snapshot, backup_target)
        assert first_backup["IsIncremental"] is False

        # delta backup on top of initial backup
        delta_payload = random_string(half)
        verify_data(blockdev, offset, delta_payload)
        second_snapshot = cmd.snapshot_create(address)
        second_backup = create_backup(address, second_snapshot,
                                      backup_target)
        assert second_backup["IsIncremental"] is True

        # delete the volume
        cmd.sync_agent_server_reset(address)
        grpc_controller = cleanup_controller(grpc_controller)
        grpc_replica1 = cleanup_replica(grpc_replica1)
        grpc_replica2 = cleanup_replica(grpc_replica2)

        # recreate the volume
        blockdev = get_dev(grpc_replica1, grpc_replica2,
                           grpc_controller, clean_backup_dir=False)

        # empty initial backup after volume recreation
        third_snapshot = cmd.snapshot_create(address)
        third_backup = create_backup(address, third_snapshot, backup_target)
        assert third_backup["VolumeName"] == volume_name
        assert third_backup["Size"] == '0'
        assert third_backup["IsIncremental"] is False

        # write half of snap1 onto head
        half_payload = full_payload[:half]
        assert len(half_payload) == half
        verify_data(blockdev, offset, half_payload)
        half_checksum = checksum_dev(blockdev)
        assert half_checksum != full_checksum
        fourth_snapshot = cmd.snapshot_create(address)
        fourth_backup = create_backup(address, fourth_snapshot,
                                      backup_target)
        assert fourth_backup["IsIncremental"] is True

        # restore initial backup
        reset_volume(grpc_controller, grpc_replica1, grpc_replica2)
        blockdev = get_blockdev(volume_name)
        restore_with_frontend(address, engine_name, first_backup["URL"])
        assert read_dev(blockdev, offset, length) == full_payload
        assert checksum_dev(blockdev) == full_checksum

        # restore final backup (half of snap1)
        reset_volume(grpc_controller, grpc_replica1, grpc_replica2)
        blockdev = get_blockdev(volume_name)
        restore_with_frontend(address, engine_name, fourth_backup["URL"])
        assert checksum_dev(blockdev) == half_checksum
        assert half_checksum != full_checksum
        restored = read_dev(blockdev, offset, length)
        assert restored[:half] == half_payload
        assert restored[half:] == '\x00' * half

        rm_backups(address, engine_name, [
            first_backup["URL"],
            second_backup["URL"],
            third_backup["URL"],
            fourth_backup["URL"],
        ])

        cmd.sync_agent_server_reset(address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
def test_backup_S3_latest_unavailable(
        grpc_replica1, grpc_replica2,  # NOQA
        grpc_controller, backup_targets):  # NOQA
    """Check backups still work after a backup's config file is deleted.

    Only targets containing ``s3://`` are exercised.  Three backups are
    created; the ``.cfg`` file located for the second one is deleted
    before the third is taken.  The first and the third backups must
    then restore with the recorded content and checksum.
    """
    volume_name = VOLUME_NAME
    engine_name = ENGINE_NAME

    offset = 0
    length = 128

    for backup_target in backup_targets:
        if "s3://" not in backup_target:
            continue

        blockdev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)
        address = grpc_controller.address

        # initial backup
        first_payload = random_string(length)
        verify_data(blockdev, offset, first_payload)
        first_checksum = checksum_dev(blockdev)
        first_snapshot = cmd.snapshot_create(address)
        first_backup = create_backup(address, first_snapshot, backup_target)

        # backup to be unavailable
        lost_payload = random_string(length)
        verify_data(blockdev, offset, lost_payload)
        lost_snapshot = cmd.snapshot_create(address)
        lost_backup = create_backup(address, lost_snapshot, backup_target)

        # the gc after the restore will clean up the missing backup
        cfg_path = findfile(BACKUP_DIR,
                            "backup_" + lost_backup["Name"] + ".cfg")
        os.remove(cfg_path)

        # final full backup after unavailable backup
        final_payload = random_string(length)
        verify_data(blockdev, offset, final_payload)
        final_checksum = checksum_dev(blockdev)
        final_snapshot = cmd.snapshot_create(address)
        final_backup = create_backup(address, final_snapshot, backup_target)
        assert final_backup["VolumeName"] == volume_name
        assert final_backup["Size"] == BLOCK_SIZE_STR

        # write some stuff on head
        head_payload = random_string(length)
        verify_data(blockdev, offset, head_payload)

        # test restore of the initial backup, then of the final backup
        expectations = (
            (first_backup, first_payload, first_checksum),
            (final_backup, final_payload, final_checksum),
        )
        for info, payload, checksum in expectations:
            reset_volume(grpc_controller, grpc_replica1, grpc_replica2)
            blockdev = get_blockdev(volume_name)
            restore_with_frontend(address, engine_name, info["URL"])
            restored = read_dev(blockdev, offset, length)
            assert restored == payload
            restored_checksum = checksum_dev(blockdev)
            assert restored_checksum == checksum

        rm_backups(address, engine_name,
                   [first_backup["URL"], final_backup["URL"]])

        cmd.sync_agent_server_reset(address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
def backup_hole_with_backing_file_test(
        backup_target,  # NOQA
        grpc_backing_controller,  # NOQA
        grpc_backing_replica1,  # NOQA
        grpc_backing_replica2):  # NOQA
    """Back up two overlapping writes and compare two regions on restore.

    Two payloads of 256 bytes are written at offsets 512 and 640, each
    followed by a snapshot and a backup.  For each snapshot the content
    of a region starting at offset 0 (4100 bytes) and of a region at
    2 MiB (1024 bytes) is recorded, together with the device checksum.
    After restoring each backup, both regions and the checksum must
    match what was recorded.
    """
    address = grpc_backing_controller.address

    blockdev = get_dev(grpc_backing_replica1, grpc_backing_replica2,
                       grpc_backing_controller)

    volume_name = grpc_backing_controller.volume_get().name
    assert volume_name == VOLUME_BACKING_NAME

    # (offset, length) of the two writes; the second overlaps the first.
    writes = ((512, 256), (640, 256))

    boundary_offset = 0
    boundary_length = 4100  # just pass 4096 into next 4k

    hole_offset = 2 * 1024 * 1024
    hole_length = 1024

    # One (backup info, boundary content, hole content, checksum) per write.
    recorded = []
    for write_offset, write_length in writes:
        payload = random_string(write_length)
        verify_data(blockdev, write_offset, payload)
        checksum = checksum_dev(blockdev)
        snapshot = cmd.snapshot_create(address)

        boundary_content = read_dev(blockdev,
                                    boundary_offset, boundary_length)
        hole_content = read_dev(blockdev, hole_offset, hole_length)
        info = create_backup(address, snapshot, backup_target)

        recorded.append((info, boundary_content, hole_content, checksum))

    for info, boundary_content, hole_content, checksum in recorded:
        reset_volume(grpc_backing_controller,
                     grpc_backing_replica1, grpc_backing_replica2)
        blockdev = get_blockdev(volume_name)
        restore_with_frontend(address, ENGINE_BACKING_NAME, info["URL"])

        restored = read_dev(blockdev, boundary_offset, boundary_length)
        assert restored == boundary_content
        restored = read_dev(blockdev, hole_offset, hole_length)
        assert restored == hole_content
        restored_checksum = checksum_dev(blockdev)
        assert restored_checksum == checksum
def test_ha_single_replica_rebuild(
        grpc_controller,  # NOQA
        grpc_replica1, grpc_replica2):  # NOQA
    """Rebuild replica 2 after it fails, then purge the leftover snapshot.

    Replica 2 is cleaned up under a running two-replica volume, removed
    from the controller, re-opened and added back.  After the rebuild the
    payload must still be readable.  A new snapshot is then created and
    the snapshot listing is checked before and after
    ``cmd.snapshot_purge``.
    """
    controller_address = grpc_controller.address

    for replica in (grpc_replica1, grpc_replica2):
        open_replica(replica)

    assert len(grpc_controller.replica_list()) == 0

    url_one = grpc_replica1.url
    url_two = grpc_replica2.url
    volume = grpc_controller.volume_start(replicas=[url_one, url_two])
    assert volume.replicaCount == 2

    members = grpc_controller.replica_list()
    assert len(members) == 2
    for member in members:
        assert member.mode == "RW"

    blockdev = get_blockdev(VOLUME_NAME)

    payload_offset = 1024
    payload = random_string(128)
    verify_data(blockdev, payload_offset, payload)

    # Cleanup replica2
    cleanup_replica(grpc_replica2)

    verify_async(blockdev, 10, 128, 1)

    verify_replica_state(grpc_controller, url_two, "ERR")

    verify_read(blockdev, payload_offset, payload)

    # Uses the entry from the listing taken while both replicas were RW.
    grpc_controller.replica_delete(members[1].address)

    # Rebuild replica2
    open_replica(grpc_replica2)
    cmd.add_replica(controller_address, url_two)
    wait_for_rebuild_complete(controller_address)

    verify_async(blockdev, 10, 128, 1)

    verify_replica_state(grpc_controller, url_two, "RW")

    verify_read(blockdev, payload_offset, payload)

    # WORKAROUND for unable to remove the parent of volume head
    user_snapshot = cmd.snapshot_create(controller_address)

    listing = cmd.snapshot_info(controller_address)
    assert len(listing) == 3

    # The parent of the new snapshot is expected to be a root entry that
    # was not user created and is not yet marked removed.
    parent_name = listing[user_snapshot]["parent"]
    parent_entry = listing[parent_name]
    assert parent_entry["parent"] == ""
    assert user_snapshot in parent_entry["children"]
    assert parent_entry["usercreated"] is False
    assert parent_entry["removed"] is False

    cmd.snapshot_purge(controller_address)
    wait_for_purge_completion(controller_address)

    listing = cmd.snapshot_info(controller_address)
    assert len(listing) == 2
    assert listing[user_snapshot] is not None
    assert listing[VOLUME_HEAD] is not None
def test_ha_double_replica_rebuild(
        grpc_controller,  # NOQA
        grpc_replica1, grpc_replica2):  # NOQA
    """Restart a volume whose replicas have diverging revision counters.

    Replica 2 is closed early, so it stops at revision counter 1 while
    replica 1 keeps receiving writes.  Both replicas are then handed to a
    restarted controller (replica 2 listed first); replica 2 is expected
    to come up in ``ERR`` mode and replica 1 in ``RW``.  Replica 2 is
    finally rebuilt and both counters must end at the same value.

    The exact counter values asserted here depend on the number of writes
    issued between the checks, so the call sequence must not be changed.
    """
    for replica in (grpc_replica1, grpc_replica2):
        open_replica(replica)

    assert len(grpc_controller.replica_list()) == 0

    url_one = grpc_replica1.url
    url_two = grpc_replica2.url
    volume = grpc_controller.volume_start(replicas=[url_one, url_two])
    assert volume.name == VOLUME_NAME
    assert volume.replicaCount == 2

    members = grpc_controller.replica_list()
    assert len(members) == 2
    assert members[0].mode == "RW"
    assert members[1].mode == "RW"

    blockdev = get_blockdev(VOLUME_NAME)

    first_offset = 1024
    first_payload = random_string(128)
    verify_data(blockdev, first_offset, first_payload)

    # Close replica2
    state_two = grpc_replica2.replica_get()
    assert state_two.revisionCounter == 1
    grpc_replica2.replica_close()

    verify_async(blockdev, 10, 128, 1)
    verify_replica_state(grpc_controller, url_two, "ERR")

    verify_read(blockdev, first_offset, first_payload)

    second_offset = 512
    second_payload = random_string(128)
    verify_data(blockdev, second_offset, second_payload)

    # Close replica1
    state_one = grpc_replica1.replica_get()
    assert state_one.revisionCounter == 12  # 1 + 10 + 1
    grpc_replica1.replica_close()

    # Restart volume
    cleanup_controller(grpc_controller)

    assert len(grpc_controller.replica_list()) == 0

    # NOTE the order is reversed here
    url_one = grpc_replica1.url
    url_two = grpc_replica2.url
    volume = grpc_controller.volume_start(replicas=[url_two, url_one])
    assert volume.replicaCount == 2

    # replica2 is out because of lower revision counter
    members = grpc_controller.replica_list()
    assert len(members) == 2
    assert members[0].mode == "ERR"
    assert members[1].mode == "RW"

    verify_read(blockdev, first_offset, first_payload)
    verify_read(blockdev, second_offset, second_payload)

    # Rebuild replica2
    state_two = grpc_replica2.replica_get()
    assert state_two.revisionCounter == 1
    grpc_replica2.replica_close()

    grpc_controller.replica_delete(members[0].address)

    cmd.add_replica(grpc_controller.address, url_two)
    wait_for_rebuild_complete(grpc_controller.address)

    verify_async(blockdev, 10, 128, 1)

    verify_replica_state(grpc_controller, url_two, "RW")

    verify_read(blockdev, first_offset, first_payload)
    verify_read(blockdev, second_offset, second_payload)

    state_one = grpc_replica1.replica_get()
    state_two = grpc_replica2.replica_get()
    assert state_one.revisionCounter == 22  # 1 + 10 + 1 + 10
    assert state_two.revisionCounter == 22  # must be in sync with r1