def test_replica_failure_detection(grpc_controller_client,  # NOQA
                                   grpc_replica_client,  # NOQA
                                   grpc_replica_client2):  # NOQA
    open_replica(grpc_replica_client)
    open_replica(grpc_replica_client2)

    r1_url = grpc_replica_client.url
    r2_url = grpc_replica_client2.url
    v = grpc_controller_client.volume_start(replicas=[
        r1_url,
        r2_url,
    ])
    assert v.replicaCount == 2

    # wait for initial read/write period to pass
    time.sleep(2)

    cleanup_replica(grpc_replica_client)

    detected = False
    for i in range(10):
        replicas = grpc_controller_client.replica_list()
        assert len(replicas) == 2
        for r in replicas:
            if r.address == r1_url and r.mode == 'ERR':
                detected = True
                break
        if detected:
            break
        time.sleep(1)
    assert detected
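
# The detection loop above is a recurring pattern in these tests. A minimal
# sketch of how it could be factored into a reusable polling helper; the name
# `wait_for_replica_mode` is hypothetical and not part of the shared test
# utilities, and it only assumes the controller client's replica_list() call
# used above.
def wait_for_replica_mode(controller_client, replica_url, mode,
                          retries=10, interval=1):
    """Poll the controller until the given replica reports `mode`."""
    for _ in range(retries):
        for r in controller_client.replica_list():
            if r.address == replica_url and r.mode == mode:
                return True
        time.sleep(interval)
    return False
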
def test_backup_volume_deletion(
        grpc_replica1, grpc_replica2,  # NOQA
        grpc_controller, backup_targets):  # NOQA
    offset = 0
    length = 128
    address = grpc_controller.address

    for backup_target in backup_targets:
        dev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)
        snap_data = random_string(length)
        verify_data(dev, offset, snap_data)
        snap = cmd.snapshot_create(address)

        backup_info = create_backup(address, snap, backup_target)
        assert backup_info["VolumeName"] == VOLUME_NAME
        assert backup_info["Size"] == BLOCK_SIZE_STR
        assert snap in backup_info["SnapshotName"]

        cmd.backup_volume_rm(address, VOLUME_NAME, backup_target)
        url = get_backup_volume_url(backup_target, VOLUME_NAME)
        with pytest.raises(subprocess.CalledProcessError):
            cmd.backup_inspect_volume(address, url)

        cmd.sync_agent_server_reset(address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
def test_ha_single_replica_failure(
        grpc_controller,  # NOQA
        grpc_replica1, grpc_replica2):  # NOQA
    open_replica(grpc_replica1)
    open_replica(grpc_replica2)

    replicas = grpc_controller.replica_list()
    assert len(replicas) == 0

    r1_url = grpc_replica1.url
    r2_url = grpc_replica2.url
    v = grpc_controller.volume_start(replicas=[r1_url, r2_url])
    assert v.replicaCount == 2

    replicas = grpc_controller.replica_list()
    assert len(replicas) == 2
    assert replicas[0].mode == "RW"
    assert replicas[1].mode == "RW"

    dev = get_blockdev(VOLUME_NAME)

    data = random_string(128)
    data_offset = 1024
    verify_data(dev, data_offset, data)

    cleanup_replica(grpc_replica2)

    verify_async(dev, 10, 128, 1)

    verify_replica_state(grpc_controller, r2_url, "ERR")

    verify_read(dev, data_offset, data)
def test_backup_volume_deletion(
        grpc_replica1, grpc_replica2,  # NOQA
        grpc_controller, backup_targets):  # NOQA
    offset = 0
    length = 128
    address = grpc_controller.address

    for backup_target in backup_targets:
        dev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)
        snap_data = random_string(length)
        verify_data(dev, offset, snap_data)
        snap = cmd.snapshot_create(address)

        backup_info = create_backup(address, snap, backup_target)
        assert backup_info["VolumeName"] == VOLUME_NAME
        assert backup_info["Size"] == BLOCK_SIZE_STR
        assert snap in backup_info["SnapshotName"]

        cmd.backup_volume_rm(address, VOLUME_NAME, backup_target)
        info = cmd.backup_volume_list(address, VOLUME_NAME, backup_target)
        assert "cannot find" in info[VOLUME_NAME]["Messages"]["error"]

        cmd.sync_agent_server_reset(address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
def test_expand_multiple_times():
    for i in range(30):
        em_client = ProcessManagerClient(INSTANCE_MANAGER_ENGINE)
        engine_process = create_engine_process(em_client)
        grpc_controller_client = ControllerClient(
            get_process_address(engine_process))

        rm_client = ProcessManagerClient(INSTANCE_MANAGER_REPLICA)
        replica_process = create_replica_process(rm_client, REPLICA_NAME)
        grpc_replica_client = ReplicaClient(
            get_process_address(replica_process))
        time.sleep(3)
        cleanup_replica(grpc_replica_client)

        open_replica(grpc_replica_client)

        r1_url = grpc_replica_client.url
        v = grpc_controller_client.volume_start(replicas=[
            r1_url,
        ])
        assert v.replicaCount == 1

        expand_volume_with_frontend(
            grpc_controller_client, EXPANDED_SIZE)
        wait_and_check_volume_expansion(
            grpc_controller_client, EXPANDED_SIZE)

        cleanup_process(em_client)
        cleanup_process(rm_client)
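
# For readers unfamiliar with the helper used above: based on the explicit
# sequence in the expansion tests further down (frontend shutdown, expand,
# wait, frontend start), expand_volume_with_frontend presumably wraps that
# flow. The sketch below is an assumption for illustration, not the shared
# helper's actual implementation.
def expand_volume_with_frontend_sketch(controller_client, size):
    # The frontend must be down while the backing replicas grow.
    controller_client.volume_frontend_shutdown()
    controller_client.volume_expand(size)
    wait_for_volume_expansion(controller_client, size)
    controller_client.volume_frontend_start(FRONTEND_TGT_BLOCKDEV)
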
def test_snapshot_tree_rebuild(
        grpc_controller,  # NOQA
        grpc_replica1, grpc_replica2):  # NOQA
    address = grpc_controller.address

    offset = 0
    length = 128

    open_replica(grpc_replica1)
    open_replica(grpc_replica2)

    replicas = grpc_controller.replica_list()
    assert len(replicas) == 0

    r1_url = grpc_replica1.url
    r2_url = grpc_replica2.url
    v = grpc_controller.volume_start(replicas=[r1_url, r2_url])
    assert v.name == VOLUME_NAME
    assert v.replicaCount == 2

    replicas = grpc_controller.replica_list()
    assert len(replicas) == 2
    assert replicas[0].mode == "RW"
    assert replicas[1].mode == "RW"

    dev = get_blockdev(VOLUME_NAME)

    snap, snap_data = snapshot_tree_build(dev, address, ENGINE_NAME,
                                          offset, length)

    data = random_string(128)
    data_offset = 1024
    verify_data(dev, data_offset, data)

    # Cleanup replica2
    cleanup_replica(grpc_replica2)
    verify_async(dev, 10, 128, 1)
    verify_replica_state(grpc_controller, r2_url, "ERR")
    verify_read(dev, data_offset, data)
    grpc_controller.replica_delete(replicas[1].address)

    # Rebuild replica2
    open_replica(grpc_replica2)
    cmd.add_replica(address, r2_url)
    wait_for_rebuild_complete(address)
    verify_async(dev, 10, 128, 1)
    verify_replica_state(grpc_controller, r2_url, "RW")

    snapshot_tree_verify(dev, address, ENGINE_NAME,
                         offset, length, snap, snap_data)
def test_snapshot_tree_backup(grpc_replica1, grpc_replica2,  # NOQA
                              grpc_controller, backup_targets):  # NOQA
    for backup_target in backup_targets:
        snapshot_tree_backup_test(backup_target, ENGINE_NAME,
                                  grpc_controller,
                                  grpc_replica1, grpc_replica2)
        cmd.sync_agent_server_reset(grpc_controller.address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
def test_backup_corrupt_deletion(
        grpc_replica1, grpc_replica2,  # NOQA
        grpc_controller, backup_targets):  # NOQA
    address = grpc_controller.address
    length = 128

    for backup_target in backup_targets:
        dev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)

        # write two backup blocks
        verify_data(dev, 0, random_string(length))
        verify_data(dev, BLOCK_SIZE, random_string(length))
        snap = cmd.snapshot_create(address)
        backup1 = create_backup(address, snap, backup_target)

        # overwrite second backup block
        verify_data(dev, BLOCK_SIZE, random_string(length))
        snap = cmd.snapshot_create(address)
        backup2 = create_backup(address, snap, backup_target)

        # check that the volume now has 3 blocks
        # backup1 and backup2 share the first block
        # and have different second blocks
        check_backup_volume_block_count(address, VOLUME_NAME,
                                        backup_target, 3)

        # corrupt backup1 config
        cfg = findfile(BACKUP_DIR, "backup_" + backup1["Name"] + ".cfg")
        corrupt_backup = open(cfg, "w")
        assert corrupt_backup
        assert corrupt_backup.write("{corrupt: definitely") > 0
        corrupt_backup.close()

        cmd.backup_rm(address, backup1["URL"])

        # check that the volume now has 2 blocks
        # backup2 still relies on the backup1 first block
        check_backup_volume_block_count(address, VOLUME_NAME,
                                        backup_target, 2)

        # remove backup 2 and check that all blocks are deleted
        cmd.backup_rm(address, backup2["URL"])
        check_backup_volume_block_count(address, VOLUME_NAME,
                                        backup_target, 0)

        # remove volume.cfg then delete the backup volume
        cfg = findfile(finddir(BACKUP_DIR, VOLUME_NAME), "volume.cfg")
        os.remove(cfg)
        cmd.backup_volume_rm(address, VOLUME_NAME, backup_target)
        info = cmd.backup_volume_list(address, VOLUME_NAME,
                                      backup_target)[VOLUME_NAME]
        assert "cannot find" in info["Messages"]["error"]

        cmd.sync_agent_server_reset(address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
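
# Side note on check_backup_volume_block_count: with a local (file/NFS)
# backupstore the block count can be approximated by walking the volume's
# block directory. The helper name, the "blocks" subdirectory, and the ".blk"
# suffix below are assumptions for illustration only; the real helper lives
# in the shared test utilities.
def count_backup_blocks_on_disk(backup_dir, volume_name):
    blocks_dir = os.path.join(finddir(backup_dir, volume_name), "blocks")
    count = 0
    for _, _, files in os.walk(blocks_dir):
        count += sum(1 for name in files if name.endswith(".blk"))
    return count
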
def test_backup(grpc_replica1, grpc_replica2,  # NOQA
                grpc_controller, backup_targets):  # NOQA
    for backup_target in backup_targets:
        dev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)
        backup_test(dev, grpc_controller.address, VOLUME_NAME,
                    ENGINE_NAME, backup_target)
        cmd.sync_agent_server_reset(grpc_controller.address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
def test_backup_hole_with_backing_file(grpc_backing_replica1,
                                       grpc_backing_replica2,  # NOQA
                                       grpc_backing_controller,
                                       backup_targets):  # NOQA
    for backup_target in backup_targets:
        backup_hole_with_backing_file_test(backup_target,
                                           grpc_backing_controller,
                                           grpc_backing_replica1,
                                           grpc_backing_replica2)
        cmd.sync_agent_server_reset(grpc_backing_controller.address)
        cleanup_controller(grpc_backing_controller)
        cleanup_replica(grpc_backing_replica1)
        cleanup_replica(grpc_backing_replica2)
def test_backup_cli(bin, engine_manager_client,  # NOQA
                    grpc_controller_client,  # NOQA
                    grpc_replica_client, grpc_replica_client2,  # NOQA
                    backup_targets):
    for backup_target in backup_targets:
        backup_core(bin, engine_manager_client,
                    grpc_controller_client,
                    grpc_replica_client, grpc_replica_client2,
                    backup_target)
        cleanup_replica(grpc_replica_client)
        cleanup_replica(grpc_replica_client2)
        cleanup_controller(grpc_controller_client)
def test_restore_to_file_without_backing_file(
        backup_targets,  # NOQA
        grpc_controller,  # NOQA
        grpc_replica1,  # NOQA
        grpc_replica2):  # NOQA
    for backup_target in backup_targets:
        restore_to_file_without_backing_file_test(backup_target,
                                                  grpc_controller,
                                                  grpc_replica1,
                                                  grpc_replica2)
        cmd.sync_agent_server_reset(grpc_controller.address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
def grpc_replica_client2(process_manager_client):
    time.sleep(3)
    r = create_replica_process(process_manager_client, REPLICA_2_NAME)

    listen = get_process_address(r)

    c = ReplicaClient(listen)
    return cleanup_replica(c)
def test_replica_crashed_update_state_error(grpc_controller,
                                            grpc_fixed_dir_replica1):  # NOQA
    """
    The test flow:
    1. Create a fixed directory replica1, since we need to remove a file
       manually.
    2. Remove the file 'volume-head-000.img' manually from the fixed
       directory replica1.
    3. Check that this fixed directory replica1 is in the 'ERR' state.
    4. Clean up the created replica.
    """
    # Create a fixed directory replica1
    open_replica(grpc_fixed_dir_replica1)

    # Before a volume is created, the engine controller should have no
    # replica.
    replicas = grpc_controller.replica_list()
    assert len(replicas) == 0

    # Create a volume on this engine controller with fixed_dir_replica1.
    r1_url = grpc_fixed_dir_replica1.url
    v = grpc_controller.volume_start(replicas=[r1_url])
    assert v.replicaCount == 1

    # The engine controller should have 1 replica in 'RW' mode.
    replicas = grpc_controller.replica_list()
    assert len(replicas) == 1
    assert replicas[0].mode == "RW"

    # Get the replica object
    r = grpc_fixed_dir_replica1.replica_get()
    assert r.chain == ['volume-head-000.img']
    assert r.state == 'open'
    assert r.sector_size == 512

    # Remove a file from this replica directory
    remove_file = os.path.join(FIXED_REPLICA_PATH1, "volume-head-000.img")
    assert os.path.exists(remove_file)
    os.remove(remove_file)

    # After removing the file, the replica should be in 'ERR' mode.
    verify_replica_mode(grpc_controller, r1_url, "ERR")

    # Clean up the created replica.
    cleanup_replica(grpc_fixed_dir_replica1)
def cleanup_no_frontend_volume(grpc_em, grpc_c, grpc_r1, grpc_r2):
    grpc_c.volume_frontend_start(FRONTEND_TGT_BLOCKDEV)
    v = grpc_c.volume_get()
    assert v.frontendState == "up"

    cmd.sync_agent_server_reset(grpc_c.address)

    grpc_c.volume_frontend_shutdown()
    v = grpc_c.volume_get()
    assert v.frontendState == "down"

    # ep = grpc_em.engine_get(ENGINE_NO_FRONTEND_NAME)
    # assert ep.spec.frontend == ""

    cleanup_controller(grpc_c)

    cleanup_replica(grpc_r1)
    cleanup_replica(grpc_r2)

    cleanup_replica_dir(FIXED_REPLICA_PATH1)
    cleanup_replica_dir(FIXED_REPLICA_PATH2)
def generate_grpc_replica_client(replica_name, args=[]):
    r = create_replica_process(process_manager_client,
                               replica_name,
                               args=args)

    listen = get_process_address(r)

    c = ReplicaClient(listen)
    grpc_replica_client.replica_client = cleanup_replica(c)
    return grpc_replica_client.replica_client
def test_engine_restart_after_sigkill(bin):  # NOQA
    """
    Test if the engine can be restarted after crashing by SIGKILL.

    1. Create then initialize 1 engine and 2 replicas.
    2. Start the engine.
    3. Create 2 snapshots.
    4. Use SIGKILL to kill the engine process.
    5. Wait for the engine to become errored.
    6. Mock volume detachment by deleting the engine process and the
       replica processes.
    7. Mock volume reattachment by recreating the processes and
       re-starting the engine.
    8. Check if the engine is up with 2 replicas.
    9. Check if the engine still works fine by creating/removing/purging
       snapshots.
    """
    em_client = ProcessManagerClient(INSTANCE_MANAGER_ENGINE)
    engine_process = create_engine_process(em_client)
    grpc_controller_client = ControllerClient(
        get_process_address(engine_process))

    rm_client = ProcessManagerClient(INSTANCE_MANAGER_REPLICA)
    replica_dir1 = tempfile.mkdtemp()
    replica_dir2 = tempfile.mkdtemp()
    replica_process1 = create_replica_process(rm_client, REPLICA_NAME,
                                              replica_dir=replica_dir1)
    grpc_replica_client1 = ReplicaClient(
        get_process_address(replica_process1))
    time.sleep(3)
    cleanup_replica(grpc_replica_client1)

    replica_process2 = create_replica_process(rm_client, REPLICA_2_NAME,
                                              replica_dir=replica_dir2)
    grpc_replica_client2 = ReplicaClient(
        get_process_address(replica_process2))
    time.sleep(3)
    cleanup_replica(grpc_replica_client2)

    open_replica(grpc_replica_client1)
    open_replica(grpc_replica_client2)

    r1_url = grpc_replica_client1.url
    r2_url = grpc_replica_client2.url
    v = grpc_controller_client.volume_start(replicas=[
        r1_url,
        r2_url,
    ])
    assert v.replicaCount == 2

    cmd = [bin, '--url', grpc_controller_client.address,
           'snapshot', 'create']
    snap0 = subprocess.check_output(cmd, encoding='utf-8').strip()
    expected = grpc_replica_client1.replica_get().chain[1]
    assert expected == 'volume-snap-{}.img'.format(snap0)

    cmd = [bin, '--url', grpc_controller_client.address,
           'snapshot', 'create',
           '--label', 'name=snap1', '--label', 'key=value']
    snap1 = subprocess.check_output(cmd, encoding='utf-8').strip()

    cmd = ["bash", "-c",
           "kill -9 $(ps aux | grep %s | grep -v grep | awk '{print $2}')"
           % VOLUME_NAME]
    subprocess.check_call(cmd)
    wait_for_process_error(em_client, ENGINE_NAME)

    # Mock detach:
    cleanup_process(em_client)
    cleanup_process(rm_client)

    # Mock reattach:
    # 1. Directly create replica processes.
    # 2. Call replica_create() to init replica servers for the replica
    #    processes.
    # 3. Create one engine process and start the engine with the replicas.
    replica_process1 = create_replica_process(rm_client, REPLICA_NAME,
                                              replica_dir=replica_dir1)
    grpc_replica_client1 = ReplicaClient(
        get_process_address(replica_process1))
    time.sleep(3)
    grpc_replica_client1.replica_create(size=SIZE_STR)

    replica_process2 = create_replica_process(rm_client, REPLICA_2_NAME,
                                              replica_dir=replica_dir2)
    grpc_replica_client2 = ReplicaClient(
        get_process_address(replica_process2))
    time.sleep(3)
    grpc_replica_client2.replica_create(size=SIZE_STR)

    engine_process = create_engine_process(em_client)
    grpc_controller_client = ControllerClient(
        get_process_address(engine_process))

    r1_url = grpc_replica_client1.url
    r2_url = grpc_replica_client2.url
    v = grpc_controller_client.volume_start(replicas=[
        r1_url,
        r2_url,
    ])
    assert v.replicaCount == 2

    # Verify the engine still works fine
    cmd = [bin, '--url', grpc_controller_client.address,
           'snapshot', 'create']
    snap2 = subprocess.check_output(cmd, encoding='utf-8').strip()

    cmd = [bin, '--url', grpc_controller_client.address,
           'snapshot', 'rm', snap1]
    subprocess.check_call(cmd)

    cmd = [bin, '--url', grpc_controller_client.address,
           'snapshot', 'purge']
    subprocess.check_call(cmd)
    wait_for_purge_completion(grpc_controller_client.address)

    cmd = [bin, '--debug',
           '--url', grpc_controller_client.address,
           'snapshot', 'ls']
    ls_output = subprocess.check_output(cmd, encoding='utf-8')

    assert ls_output == '''ID
{}
{}
'''.format(snap2, snap0)

    cleanup_process(em_client)
    cleanup_process(rm_client)
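
# The test above kills the engine by grepping ps output through a shell. An
# equivalent, assuming procps' pkill is available in the test image, would be
# (sketch only, not what the test suite ships):
def kill_engine_process_sketch(volume_name):
    # -f matches the full command line, mirroring the ps | grep filter above
    subprocess.check_call(["pkill", "-9", "-f", volume_name])
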
def test_single_replica_failure_during_engine_start(bin):  # NOQA
    """
    Test if the engine still works fine when there is an invalid
    replica/backend during the starting phase.

    1. Create then initialize 1 engine and 2 replicas.
    2. Start the engine.
    3. Create 2 snapshots.
    4. Mess up replica1 by manually modifying the snapshot meta file.
    5. Mock volume detachment by deleting the engine process and the
       replica processes.
    6. Mock volume reattachment by recreating the processes and
       re-starting the engine.
    7. Check if the engine is up and if replica1 is in mode ERR in the
       engine.
    8. Check if the engine still works fine by creating one more snapshot.
    9. Remove the ERR replica from the engine, then check that snapshot
       remove and snapshot purge work fine.
    10. Check if the snapshot list is correct.
    """
    em_client = ProcessManagerClient(INSTANCE_MANAGER_ENGINE)
    engine_process = create_engine_process(em_client)
    grpc_controller_client = ControllerClient(
        get_process_address(engine_process))

    rm_client = ProcessManagerClient(INSTANCE_MANAGER_REPLICA)
    replica_dir1 = tempfile.mkdtemp()
    replica_dir2 = tempfile.mkdtemp()
    replica_process1 = create_replica_process(rm_client, REPLICA_NAME,
                                              replica_dir=replica_dir1)
    grpc_replica_client1 = ReplicaClient(
        get_process_address(replica_process1))
    time.sleep(3)
    cleanup_replica(grpc_replica_client1)

    replica_process2 = create_replica_process(rm_client, REPLICA_2_NAME,
                                              replica_dir=replica_dir2)
    grpc_replica_client2 = ReplicaClient(
        get_process_address(replica_process2))
    time.sleep(3)
    cleanup_replica(grpc_replica_client2)

    open_replica(grpc_replica_client1)
    open_replica(grpc_replica_client2)

    r1_url = grpc_replica_client1.url
    r2_url = grpc_replica_client2.url
    v = grpc_controller_client.volume_start(replicas=[
        r1_url,
        r2_url,
    ])
    assert v.replicaCount == 2

    cmd = [bin, '--url', grpc_controller_client.address,
           'snapshot', 'create']
    snap0 = subprocess.check_output(cmd, encoding='utf-8').strip()
    expected = grpc_replica_client1.replica_get().chain[1]
    assert expected == 'volume-snap-{}.img'.format(snap0)

    cmd = [bin, '--url', grpc_controller_client.address,
           'snapshot', 'create',
           '--label', 'name=snap1', '--label', 'key=value']
    snap1 = subprocess.check_output(cmd, encoding='utf-8').strip()

    # Mess up replica1 by manually modifying the snapshot meta file
    r1_snap1_meta_path = os.path.join(
        replica_dir1, 'volume-snap-{}.img.meta'.format(snap1))
    with open(r1_snap1_meta_path, 'r') as f:
        snap1_meta_info = json.load(f)
    with open(r1_snap1_meta_path, 'w') as f:
        snap1_meta_info["Parent"] = "invalid-parent.img"
        json.dump(snap1_meta_info, f)

    # Mock detach:
    cleanup_process(em_client)
    cleanup_process(rm_client)

    # Mock reattach:
    # 1. Directly create replica processes.
    # 2. Call replica_create() to init replica servers for the replica
    #    processes.
    # 3. Create one engine process and start the engine with the replicas.
    replica_process1 = create_replica_process(rm_client, REPLICA_NAME,
                                              replica_dir=replica_dir1)
    grpc_replica_client1 = ReplicaClient(
        get_process_address(replica_process1))
    time.sleep(3)
    grpc_replica_client1.replica_create(size=SIZE_STR)

    replica_process2 = create_replica_process(rm_client, REPLICA_2_NAME,
                                              replica_dir=replica_dir2)
    grpc_replica_client2 = ReplicaClient(
        get_process_address(replica_process2))
    time.sleep(3)
    grpc_replica_client2.replica_create(size=SIZE_STR)

    engine_process = create_engine_process(em_client)
    grpc_controller_client = ControllerClient(
        get_process_address(engine_process))

    r1_url = grpc_replica_client1.url
    r2_url = grpc_replica_client2.url
    v = grpc_controller_client.volume_start(replicas=[
        r1_url,
        r2_url,
    ])
    assert v.replicaCount == 2

    # Check if replica1 is in mode `ERR`
    rs = grpc_controller_client.replica_list()
    assert len(rs) == 2
    r1_verified = False
    r2_verified = False
    for r in rs:
        if r.address == r1_url:
            assert r.mode == 'ERR'
            r1_verified = True
        if r.address == r2_url:
            assert r.mode == 'RW'
            r2_verified = True
    assert r1_verified
    assert r2_verified

    # The engine still works fine
    cmd = [bin, '--url', grpc_controller_client.address,
           'snapshot', 'create']
    snap2 = subprocess.check_output(cmd, encoding='utf-8').strip()

    # Remove the ERR replica before removing snapshots
    grpc_controller_client.replica_delete(r1_url)
    rs = grpc_controller_client.replica_list()
    assert len(rs) == 1
    assert rs[0].address == r2_url
    assert rs[0].mode == "RW"

    cmd = [bin, '--url', grpc_controller_client.address,
           'snapshot', 'rm', snap1]
    subprocess.check_call(cmd)

    cmd = [bin, '--url', grpc_controller_client.address,
           'snapshot', 'purge']
    subprocess.check_call(cmd)
    wait_for_purge_completion(grpc_controller_client.address)

    cmd = [bin, '--debug',
           '--url', grpc_controller_client.address,
           'snapshot', 'ls']
    ls_output = subprocess.check_output(cmd, encoding='utf-8')

    assert ls_output == '''ID
{}
{}
'''.format(snap2, snap0)

    cleanup_process(em_client)
    cleanup_process(rm_client)
def test_backup_type(
        grpc_replica1, grpc_replica2,  # NOQA
        grpc_controller, backup_targets):  # NOQA
    for backup_target in backup_targets:
        address = grpc_controller.address
        block_size = 2 * 1024 * 1024

        dev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)

        zero_string = b'\x00'.decode('utf-8')

        # backup0: 256 random data in 1st block
        length0 = 256
        snap0_data = random_string(length0)
        verify_data(dev, 0, snap0_data)
        verify_data(dev, block_size, snap0_data)
        snap0 = cmd.snapshot_create(address)
        backup0 = create_backup(address, snap0, backup_target)
        backup0_url = backup0["URL"]
        assert backup0['IsIncremental'] is False

        # backup1: 32 random data + 32 zero data + 192 random data
        # in 1st block
        length1 = 32
        offset1 = 32
        snap1_data = zero_string * length1
        verify_data(dev, offset1, snap1_data)
        snap1 = cmd.snapshot_create(address)
        backup1 = create_backup(address, snap1, backup_target)
        backup1_url = backup1["URL"]
        assert backup1['IsIncremental'] is True

        # backup2: 32 random data + 256 random data in 1st block,
        # 256 random data in 2nd block
        length2 = 256
        offset2 = 32
        snap2_data = random_string(length2)
        verify_data(dev, offset2, snap2_data)
        verify_data(dev, block_size, snap2_data)
        snap2 = cmd.snapshot_create(address)
        backup2 = create_backup(address, snap2, backup_target)
        backup2_url = backup2["URL"]
        assert backup2['IsIncremental'] is True

        rm_backups(address, ENGINE_NAME, [backup2_url])

        # backup3: 64 zero data + 192 random data in 1st block
        length3 = 64
        offset3 = 0
        verify_data(dev, offset3, zero_string * length3)
        verify_data(dev, length2, zero_string * offset2)
        verify_data(dev, block_size, zero_string * length2)
        snap3 = cmd.snapshot_create(address)
        backup3 = create_backup(address, snap3, backup_target)
        backup3_url = backup3["URL"]
        assert backup3['IsIncremental'] is True

        # full backup: backup the same snapshot twice
        backup3 = create_backup(address, snap3, backup_target)
        backup3_url = backup3["URL"]
        assert backup3['IsIncremental'] is False

        # backup4: 256 random data in 1st block
        length4 = 256
        offset4 = 0
        snap4_data = random_string(length4)
        verify_data(dev, offset4, snap4_data)
        snap4 = cmd.snapshot_create(address)
        backup4 = create_backup(address, snap4, backup_target)
        backup4_url = backup4["URL"]
        assert backup4['IsIncremental'] is True

        rm_backups(address, ENGINE_NAME,
                   [backup0_url, backup1_url, backup3_url, backup4_url])

        cmd.sync_agent_server_reset(address)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
        cleanup_controller(grpc_controller)
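
# Rule of thumb encoded by the IsIncremental assertions above: a backup is
# incremental only when the engine has a previous backup of the volume to
# diff against and that backup was taken from a different snapshot; the first
# backup of a volume and a re-backup of the same snapshot fall back to a full
# backup. A toy decision helper, inferred from this test and for illustration
# only:
def is_incremental_backup_sketch(last_backed_up_snapshot, new_snapshot):
    return (last_backed_up_snapshot is not None
            and last_backed_up_snapshot != new_snapshot)
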
def test_backup_lock(
        grpc_replica1, grpc_replica2,  # NOQA
        grpc_controller, backup_targets):  # NOQA
    """
    Test backup locks

    Context:

    The idea is to implement a locking mechanism that utilizes the
    backupstore, to prevent the following dangerous cases of concurrent
    operations:
    - prevent backup deletion during backup restoration
    - prevent backup deletion while a backup is in progress
    - prevent backup creation during backup deletion
    - prevent backup restoration during backup deletion
    A toy sketch of such lock files follows this test.

    Steps:

    1.  Create a volume(1) and attach to the current node
    2.  create a backup(1) of volume(1)
    3.  verify backup(1) creation completed
    4.  write some data to volume(1)
    5.  create an active lock of type Delete
    6.  create a backup(2) of volume(1)
    7.  verify backup(2) creation timed out
    8.  delete active lock of type Delete
    9.  create an active lock of type Delete
    10. restore backup(1)
    11. verify backup(1) restore timed out
    12. delete active lock of type Delete
    13. restore backup(1)
    14. verify backup(1) restore completed
    15. create an active lock of type Restore
    16. delete backup(1)
    17. verify backup(1) deletion timed out
    18. delete active lock of type Restore
    19. delete backup(1)
    20. verify backup(1) deletion completed
    21. cleanup
    """
    for backup_target in backup_targets:
        dev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)

        # create a regular backup
        address = grpc_controller.address
        offset = 0
        length = 128

        snap1_data = random_string(length)
        verify_data(dev, offset, snap1_data)
        snap1_checksum = checksum_dev(dev)
        snap1 = cmd.snapshot_create(address)

        # create a backup to create the volume
        info = create_backup(address, snap1, backup_target)
        assert info["VolumeName"] == VOLUME_NAME
        assert info["Size"] == BLOCK_SIZE_STR
        assert snap1 in info["SnapshotName"]

        # backup should error out with timeout
        # because of the delete lock
        create_delete_lock(True)
        with pytest.raises(subprocess.CalledProcessError):
            create_backup(address, snap1, backup_target)
        remove_lock_file(DELETE_LOCK)

        # restore should error out with timeout
        # because of the delete lock
        create_delete_lock(True)
        with pytest.raises(subprocess.CalledProcessError):
            restore_with_frontend(address, ENGINE_NAME, info["URL"])
        remove_lock_file(DELETE_LOCK)

        # restore should succeed now that there is no active delete lock
        restore_with_frontend(address, ENGINE_NAME, info["URL"])
        readed = read_dev(dev, offset, length)
        assert readed == snap1_data
        c = checksum_dev(dev)
        assert c == snap1_checksum

        # delete should error out with timeout
        # because of the restore lock
        create_restore_lock(True)
        with pytest.raises(subprocess.CalledProcessError):
            rm_backups(address, ENGINE_NAME, [info["URL"]])
        remove_lock_file(RESTORE_LOCK)

        # delete should succeed now that there is no active restore lock
        rm_backups(address, ENGINE_NAME, [info["URL"]])

        # cleanup volume 1
        cmd.sync_agent_server_reset(address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
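
# Toy model of the lock helpers used above (create_delete_lock,
# create_restore_lock, remove_lock_file). It assumes a file-based backupstore
# where an "active" lock is simply a small JSON file placed in a lock
# directory; the real helpers and on-disk format belong to the shared test
# utilities and the backupstore library, so every path and field name here is
# a placeholder. Relies on the module's existing os/json imports.
def create_lock_file_sketch(lock_path, lock_type, acquired=True):
    """Write a minimal lock record that concurrent operations would honor."""
    os.makedirs(os.path.dirname(lock_path), exist_ok=True)
    with open(lock_path, "w") as f:
        json.dump({"Type": lock_type, "Acquired": acquired}, f)


def remove_lock_file_sketch(lock_path):
    if os.path.exists(lock_path):
        os.remove(lock_path)
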
def test_backup_volume_list(
        grpc_replica_client, grpc_controller_client,  # NOQA
        grpc_replica1, grpc_replica2,  # NOQA
        grpc_controller, backup_targets):  # NOQA
    """
    Test backup volume list

    Context:

    We want to make sure that an error when listing a single backup volume
    does not stop us from listing all the other backup volumes. Otherwise a
    single faulty backup can block the retrieval of all known backup
    volumes.

    Steps:

    1.  Create a volume(1,2) and attach to the current node
    2.  write some data to volume(1,2)
    3.  Create a backup(1) of volume(1,2)
    4.  request a backup list
    5.  verify backup list contains no error messages for volume(1,2)
    6.  verify backup list contains backup(1) for volume(1,2)
    7.  place a file named "*****@*****.**" into the backups folder of
        volume(1)
    8.  request a backup list
    9.  verify backup list contains no error messages for volume(1,2)
    10. verify backup list contains backup(1) for volume(1,2)
    11. delete backup volumes(1 & 2)
    12. cleanup
    """
    # create a second volume
    grpc2_replica1 = grpc_replica_client(REPLICA_2_NAME + "-1")
    grpc2_replica2 = grpc_replica_client(REPLICA_2_NAME + "-2")
    grpc2_controller = grpc_controller_client(ENGINE2_NAME, VOLUME2_NAME)

    offset = 0
    length = 128
    address = grpc_controller.address
    address2 = grpc2_controller.address

    for backup_target in backup_targets:
        dev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)
        dev2 = get_dev(grpc2_replica1, grpc2_replica2, grpc2_controller)

        # create a regular backup
        snap_data = random_string(length)
        verify_data(dev, offset, snap_data)
        snap = cmd.snapshot_create(address)
        backup_info = create_backup(address, snap, backup_target)
        assert backup_info["VolumeName"] == VOLUME_NAME
        assert backup_info["Size"] == BLOCK_SIZE_STR
        assert snap in backup_info["SnapshotName"]

        # create a regular backup on volume 2
        verify_data(dev2, offset, random_string(length))
        snap = cmd.snapshot_create(address2)
        backup_info = create_backup(address2, snap, backup_target)
        assert backup_info["VolumeName"] == VOLUME2_NAME
        assert backup_info["Size"] == BLOCK_SIZE_STR
        assert snap in backup_info["SnapshotName"]

        # request a volume list
        info = cmd.backup_volume_list(address, "", backup_target,
                                      include_backup_details=True)
        assert info[VOLUME_NAME]["Name"] == VOLUME_NAME
        assert len(info[VOLUME_NAME]["Backups"]) == 1
        assert MESSAGE_TYPE_ERROR not in info[VOLUME_NAME]["Messages"]
        assert info[VOLUME2_NAME]["Name"] == VOLUME2_NAME
        assert len(info[VOLUME2_NAME]["Backups"]) == 1
        assert MESSAGE_TYPE_ERROR not in info[VOLUME2_NAME]["Messages"]

        # place a badly named backup.cfg file
        # we want the list call to return all valid files correctly
        backup_dir = os.path.join(finddir(BACKUP_DIR, VOLUME_NAME), "backups")
        cfg = open(os.path.join(backup_dir, "*****@*****.**"), "w")
        cfg.close()
        info = cmd.backup_volume_list(address, "", backup_target,
                                      include_backup_details=True)
        assert info[VOLUME_NAME]["Name"] == VOLUME_NAME
        assert len(info[VOLUME_NAME]["Backups"]) == 1
        assert MESSAGE_TYPE_ERROR not in info[VOLUME_NAME]["Messages"]
        assert info[VOLUME2_NAME]["Name"] == VOLUME2_NAME
        assert len(info[VOLUME2_NAME]["Backups"]) == 1
        assert MESSAGE_TYPE_ERROR not in info[VOLUME2_NAME]["Messages"]

        # remove the volume with the badly named backup.cfg
        cmd.backup_volume_rm(address, VOLUME_NAME, backup_target)
        info = cmd.backup_volume_list(address, VOLUME_NAME, backup_target,
                                      include_backup_details=True)
        assert "cannot find" in info[VOLUME_NAME]["Messages"]["error"]

        # remove volume 2 backups
        cmd.backup_volume_rm(address, VOLUME2_NAME, backup_target)
        info = cmd.backup_volume_list(address, VOLUME2_NAME, backup_target,
                                      include_backup_details=True)
        assert "cannot find" in info[VOLUME2_NAME]["Messages"]["error"]

        # cleanup volume 1
        cmd.sync_agent_server_reset(address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)

        # cleanup volume 2
        cmd.sync_agent_server_reset(address2)
        cleanup_controller(grpc2_controller)
        cleanup_replica(grpc2_replica1)
        cleanup_replica(grpc2_replica2)
def test_expansion_with_rebuild(
        grpc_controller,  # NOQA
        grpc_replica1, grpc_replica2):  # NOQA
    address = grpc_controller.address
    dev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)

    replicas = grpc_controller.replica_list()
    assert len(replicas) == 2
    assert replicas[0].mode == "RW"
    assert replicas[1].mode == "RW"

    # the default size is 4MB, will expand it to 8MB
    address = grpc_controller.address

    zero_char = b'\x00'.decode('utf-8')
    original_data = zero_char * SIZE

    # write the data to the original part then do expansion
    data1_len = random_length(PAGE_SIZE)
    data1 = Data(random.randrange(0, SIZE - 2 * PAGE_SIZE, PAGE_SIZE),
                 data1_len, random_string(data1_len))
    snap1 = Snapshot(dev, data1, address)

    expand_volume_with_frontend(grpc_controller, EXPANDED_SIZE)
    wait_and_check_volume_expansion(grpc_controller, EXPANDED_SIZE)

    snap1.verify_data()
    assert \
        dev.readat(0, SIZE) == \
        original_data[0:data1.offset] + data1.content + \
        original_data[data1.offset+data1.length:]
    assert dev.readat(SIZE, SIZE) == zero_char * SIZE

    # write the data to both the original part and the expanded part
    data2_len = random_length(PAGE_SIZE)
    data2 = Data(SIZE - PAGE_SIZE, data2_len, random_string(data2_len))
    snap2 = Snapshot(dev, data2, address)
    data3_len = random_length(PAGE_SIZE)
    data3 = Data(random.randrange(SIZE, EXPANDED_SIZE - PAGE_SIZE, PAGE_SIZE),
                 data3_len, random_string(data3_len))
    snap3 = Snapshot(dev, data3, address)
    snap1.verify_data()
    snap2.verify_data()
    snap3.verify_data()
    assert \
        dev.readat(SIZE, SIZE) == zero_char*(data3.offset-SIZE) + \
        data3.content + zero_char*(EXPANDED_SIZE-data3.offset-data3.length)

    # Cleanup replica2
    cleanup_replica(grpc_replica2)
    verify_replica_state(grpc_controller, grpc_replica2.address, "ERR")
    grpc_controller.replica_delete(replicas[1].address)

    # Rebuild replica2.
    open_replica(grpc_replica2)
    # The newly opened replica2 will be expanded automatically
    cmd.add_replica(address, grpc_replica2.url)
    wait_for_rebuild_complete(address)
    verify_replica_state(grpc_controller, grpc_replica2.address, "RW")

    # Cleanup replica1 then check if the rebuilt replica2 works fine
    cleanup_replica(grpc_replica1)
    verify_replica_state(grpc_controller, grpc_replica1.address, "ERR")
    grpc_controller.replica_delete(replicas[0].address)

    assert \
        dev.readat(0, SIZE) == \
        original_data[0:data1.offset] + data1.content + \
        original_data[data1.offset+data1.length:data2.offset] + \
        data2.content + \
        original_data[data2.offset+data2.length:]
    assert \
        dev.readat(SIZE, SIZE) == zero_char*(data3.offset-SIZE) + \
        data3.content + zero_char*(EXPANDED_SIZE-data3.offset-data3.length)

    data4_len = random_length(PAGE_SIZE)
    data4 = Data(data1.offset, data4_len, random_string(data4_len))
    snap4 = Snapshot(dev, data4, address)
    snap4.verify_data()
def test_expansion_rollback_with_rebuild(grpc_controller,
                                         grpc_fixed_dir_replica1,
                                         grpc_fixed_dir_replica2):  # NOQA
    """
    The test flow:
    1. Write random data into the block device.
    2. Create the 1st snapshot.
    3. Create an empty directory using the tmp meta file path of the
       expansion disk for each replica. This will fail the following
       expansion and trigger the expansion rollback.
    4. Try to expand the volume, which fails. Then the automatic rollback
       will be applied implicitly.
    5. Check the volume status and whether there are leftovers of the
       failed expansion.
    6. Check if the volume is still usable by r/w data, then create the
       2nd snapshot.
    7. Retry the expansion. It should succeed.
    8. Verify the data and try data r/w.
    9. Delete then rebuild replica2. The rebuilt replica2 will be expanded
       automatically.
    10. Delete replica1 then check if the rebuilt replica2 works fine.
    """
    address = grpc_controller.address
    r1_url = grpc_fixed_dir_replica1.address
    r2_url = grpc_fixed_dir_replica2.address
    dev = get_dev(grpc_fixed_dir_replica1, grpc_fixed_dir_replica2,
                  grpc_controller)

    replicas = grpc_controller.replica_list()
    assert len(replicas) == 2
    assert replicas[0].mode == "RW"
    assert replicas[1].mode == "RW"

    # the default size is 4MB, will expand it to 8MB
    zero_char = b'\x00'.decode('utf-8')
    original_data = zero_char * SIZE

    # write the data to the original part then do expansion
    data1_len = random_length(PAGE_SIZE)
    data1 = Data(random.randrange(0, SIZE - 2 * PAGE_SIZE, PAGE_SIZE),
                 data1_len, random_string(data1_len))
    snap1 = Snapshot(dev, data1, address)

    # use the tmp meta file path of the expansion disks to create empty
    # directories so that the expansion disk meta data update will fail.
    # Then the expansion will fail and the rollback will be triggered.
    disk_meta_tmp_1 = os.path.join(FIXED_REPLICA_PATH1,
                                   EXPANSION_DISK_TMP_META_NAME)
    disk_meta_tmp_2 = os.path.join(FIXED_REPLICA_PATH2,
                                   EXPANSION_DISK_TMP_META_NAME)
    os.mkdir(disk_meta_tmp_1)
    os.mkdir(disk_meta_tmp_2)

    # All replicas' expansion will fail,
    # then the engine will do the rollback automatically
    grpc_controller.volume_frontend_shutdown()
    grpc_controller.volume_expand(EXPANDED_SIZE)
    wait_for_volume_expansion(grpc_controller, SIZE)
    grpc_controller.volume_frontend_start(FRONTEND_TGT_BLOCKDEV)

    # Expansion should fail but the expansion rollback should succeed
    volume_info = grpc_controller.volume_get()
    assert volume_info.last_expansion_error != ""
    assert volume_info.last_expansion_failed_at != ""
    verify_replica_state(grpc_controller, r1_url, "RW")
    verify_replica_state(grpc_controller, r2_url, "RW")

    # The invalid disk and head will be cleaned up automatically
    # after the rollback
    expansion_disk_1 = os.path.join(FIXED_REPLICA_PATH1,
                                    EXPANSION_DISK_NAME)
    expansion_disk_2 = os.path.join(FIXED_REPLICA_PATH2,
                                    EXPANSION_DISK_NAME)
    assert not os.path.exists(expansion_disk_1)
    assert not os.path.exists(expansion_disk_2)
    assert not os.path.exists(disk_meta_tmp_1)
    assert not os.path.exists(disk_meta_tmp_2)

    # The meta info file should remain unchanged
    replica_meta_file_1 = os.path.join(FIXED_REPLICA_PATH1,
                                       REPLICA_META_FILE_NAME)
    replica_meta_file_2 = os.path.join(FIXED_REPLICA_PATH2,
                                       REPLICA_META_FILE_NAME)
    with open(replica_meta_file_1) as f:
        replica_meta_1 = json.load(f)
    assert replica_meta_1["Size"] == SIZE
    with open(replica_meta_file_2) as f:
        replica_meta_2 = json.load(f)
    assert replica_meta_2["Size"] == SIZE

    # try to check then write new data
    snap1.verify_data()
    data2_len = random_length(PAGE_SIZE)
    data2 = Data(SIZE - PAGE_SIZE, data2_len, random_string(data2_len))
    snap2 = Snapshot(dev, data2, address)
    # Retry the expansion
    expand_volume_with_frontend(grpc_controller, EXPANDED_SIZE)
    wait_and_check_volume_expansion(grpc_controller, EXPANDED_SIZE)

    with open(replica_meta_file_1) as f:
        replica_meta_1 = json.load(f)
    assert replica_meta_1["Size"] == EXPANDED_SIZE
    with open(replica_meta_file_2) as f:
        replica_meta_2 = json.load(f)
    assert replica_meta_2["Size"] == EXPANDED_SIZE

    assert os.path.exists(expansion_disk_1)
    assert os.path.exists(expansion_disk_2)

    snap1.verify_data()
    snap2.verify_data()
    assert dev.readat(SIZE, SIZE) == zero_char * SIZE

    data3_len = random_length(PAGE_SIZE)
    data3 = Data(random.randrange(SIZE, EXPANDED_SIZE - PAGE_SIZE, PAGE_SIZE),
                 data3_len, random_string(data3_len))
    snap3 = Snapshot(dev, data3, address)
    snap1.verify_data()
    snap2.verify_data()
    snap3.verify_data()
    assert \
        dev.readat(SIZE, SIZE) == zero_char*(data3.offset-SIZE) + \
        data3.content + zero_char*(EXPANDED_SIZE-data3.offset-data3.length)

    # Delete replica2
    cleanup_replica(grpc_fixed_dir_replica2)
    verify_replica_state(grpc_controller, r2_url, "ERR")
    grpc_controller.replica_delete(replicas[1].address)

    # Rebuild replica2.
    open_replica(grpc_fixed_dir_replica2)
    # The newly opened replica2 will be expanded automatically
    cmd.add_replica(address, grpc_fixed_dir_replica2.url)
    wait_for_rebuild_complete(address)
    verify_replica_state(grpc_controller, r2_url, "RW")

    # Cleanup replica1 then check if the rebuilt replica2 works fine
    cleanup_replica(grpc_fixed_dir_replica1)
    verify_replica_state(grpc_controller, r1_url, "ERR")
    grpc_controller.replica_delete(replicas[0].address)

    assert \
        dev.readat(0, SIZE) == \
        original_data[0:data1.offset] + data1.content + \
        original_data[data1.offset+data1.length:data2.offset] + \
        data2.content + \
        original_data[data2.offset+data2.length:]
    assert \
        dev.readat(SIZE, SIZE) == zero_char*(data3.offset-SIZE) + \
        data3.content + zero_char*(EXPANDED_SIZE-data3.offset-data3.length)

    data4_len = random_length(PAGE_SIZE)
    data4 = Data(data1.offset, data4_len, random_string(data4_len))
    snap4 = Snapshot(dev, data4, address)
    snap4.verify_data()
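
# Why the empty directory trick above works: the replica apparently stages
# the expansion disk's metadata at the tmp meta path before moving it into
# place, and opening that path for writing fails once a directory occupies
# it. A self-contained illustration using only the standard library (not part
# of the test suite):
def _demo_tmp_meta_collision(scratch_dir):
    meta_tmp = os.path.join(scratch_dir, "expansion.meta.tmp")
    os.mkdir(meta_tmp)  # same trick as the test above
    try:
        with open(meta_tmp, "w") as f:  # the metadata write would fail here
            f.write("{}")
    except (IsADirectoryError, PermissionError):
        return True  # the expansion aborts and the engine rolls back
    return False
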
def test_backup_block_no_cleanup(
        grpc_replica1, grpc_replica2,  # NOQA
        grpc_controller, backup_targets):  # NOQA
    address = grpc_controller.address
    length = 128

    for backup_target in backup_targets:
        dev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)

        # write two backup blocks
        verify_data(dev, 0, random_string(length))
        verify_data(dev, BLOCK_SIZE, random_string(length))
        snap = cmd.snapshot_create(address)

        backup1 = create_backup(address, snap, backup_target)
        assert backup1["VolumeName"] == VOLUME_NAME
        assert backup1["Size"] == str(BLOCK_SIZE * 2)
        assert snap in backup1["SnapshotName"]
        check_backup_volume_block_count(address, VOLUME_NAME,
                                        backup_target, 2)

        # overwrite second backup block
        verify_data(dev, BLOCK_SIZE, random_string(length))
        snap = cmd.snapshot_create(address)

        backup2 = create_backup(address, snap, backup_target)
        assert backup2["VolumeName"] == VOLUME_NAME
        assert backup2["Size"] == str(BLOCK_SIZE * 2)
        assert snap in backup2["SnapshotName"]

        # check that the volume now has 3 blocks
        # backup1 and backup2 share the first block
        # and have different second blocks
        check_backup_volume_block_count(address, VOLUME_NAME,
                                        backup_target, 3)

        # create an artificial in progress backup
        # that will stop the gc from removing blocks
        in_progress_backup_file = create_in_progress_backup_file(VOLUME_NAME)

        # remove backup 1
        # the volume should still have 3 blocks
        cmd.backup_rm(address, backup1["URL"])
        check_backup_volume_block_count(address, VOLUME_NAME,
                                        backup_target, 3)

        # remove the in progress backup
        os.remove(in_progress_backup_file)

        # remove the last remaining backup 2
        # this should remove all blocks
        # including the orphaned block from backup 1
        cmd.backup_rm(address, backup2["URL"])
        check_backup_volume_block_count(address, VOLUME_NAME,
                                        backup_target, 0)

        # cleanup the backup volume
        cmd.backup_volume_rm(address, VOLUME_NAME, backup_target)
        info = cmd.backup_volume_list(address, VOLUME_NAME,
                                      backup_target)[VOLUME_NAME]
        assert "cannot find" in info["Messages"]["error"]

        cmd.sync_agent_server_reset(address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
def test_backup_block_deletion(
        grpc_replica1, grpc_replica2,  # NOQA
        grpc_controller, backup_targets):  # NOQA
    address = grpc_controller.address
    length = 128

    for backup_target in backup_targets:
        dev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)

        # write two backup blocks
        verify_data(dev, 0, random_string(length))
        verify_data(dev, BLOCK_SIZE, random_string(length))
        snap = cmd.snapshot_create(address)

        backup1 = create_backup(address, snap, backup_target)
        assert backup1["VolumeName"] == VOLUME_NAME
        assert backup1["Size"] == str(BLOCK_SIZE * 2)
        assert snap in backup1["SnapshotName"]

        # test block deduplication
        backup1_duplicate = create_backup(address, snap, backup_target)
        assert backup1_duplicate["VolumeName"] == VOLUME_NAME
        assert backup1_duplicate["Size"] == str(BLOCK_SIZE * 2)
        assert snap in backup1_duplicate["SnapshotName"]
        check_backup_volume_block_count(address, VOLUME_NAME,
                                        backup_target, 2)

        # overwrite second backup block
        verify_data(dev, BLOCK_SIZE, random_string(length))
        snap = cmd.snapshot_create(address)

        backup2 = create_backup(address, snap, backup_target)
        assert backup2["VolumeName"] == VOLUME_NAME
        assert backup2["Size"] == str(BLOCK_SIZE * 2)
        assert snap in backup2["SnapshotName"]

        # check that the volume now has 3 blocks
        # backup1 and backup2 share the first block
        # and have different second blocks
        check_backup_volume_block_count(address, VOLUME_NAME,
                                        backup_target, 3)

        # remove backup 1 duplicate
        # this should not change the blocks on disk
        # since all blocks are still required
        cmd.backup_rm(address, backup1_duplicate["URL"])
        check_backup_volume_block_count(address, VOLUME_NAME,
                                        backup_target, 3)

        # remove backup 1
        # the volume should now have 2 blocks
        # blk1 from backup1 should still be present
        # since it's required by backup 2
        cmd.backup_rm(address, backup1["URL"])
        check_backup_volume_block_count(address, VOLUME_NAME,
                                        backup_target, 2)

        # remove the last remaining backup 2
        # this should remove all blocks
        cmd.backup_rm(address, backup2["URL"])
        check_backup_volume_block_count(address, VOLUME_NAME,
                                        backup_target, 0)

        # cleanup the backup volume
        cmd.backup_volume_rm(address, VOLUME_NAME, backup_target)
        info = cmd.backup_volume_list(address, VOLUME_NAME,
                                      backup_target)[VOLUME_NAME]
        assert "cannot find" in info["Messages"]["error"]

        cmd.sync_agent_server_reset(address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
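
# Intuition behind the block-count assertions above: backup blocks are shared
# between backups, so a block may only be garbage collected once no remaining
# backup references it. A toy in-memory model of that bookkeeping
# (illustration only, not the backupstore implementation):
def collect_unreferenced_blocks(backup_blocks, removed_backup):
    """backup_blocks: dict mapping backup name -> set of block identifiers.
    Returns the blocks that become unreferenced after removing one backup."""
    released = set(backup_blocks.pop(removed_backup, set()))
    still_referenced = set().union(*backup_blocks.values()) \
        if backup_blocks else set()
    return released - still_referenced
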
def test_backup_incremental_logic(grpc_replica1, grpc_replica2,
                                  grpc_controller, backup_targets):  # NOQA
    for backup_target in backup_targets:
        dev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)
        address = grpc_controller.address
        volume_name = VOLUME_NAME
        engine_name = ENGINE_NAME
        offset = 0
        length = 128

        # initial backup
        snap1_data = random_string(length)
        verify_data(dev, offset, snap1_data)
        snap1_checksum = checksum_dev(dev)
        snap1 = cmd.snapshot_create(address)
        backup1_info = create_backup(address, snap1, backup_target)
        assert backup1_info["IsIncremental"] is False

        # delta backup on top of initial backup
        snap2_data = random_string(int(length / 2))
        verify_data(dev, offset, snap2_data)
        snap2 = cmd.snapshot_create(address)
        backup2_info = create_backup(address, snap2, backup_target)
        assert backup2_info["IsIncremental"] is True

        # delete the volume
        cmd.sync_agent_server_reset(address)
        grpc_controller = cleanup_controller(grpc_controller)
        grpc_replica1 = cleanup_replica(grpc_replica1)
        grpc_replica2 = cleanup_replica(grpc_replica2)

        # recreate the volume
        dev = get_dev(grpc_replica1, grpc_replica2,
                      grpc_controller, clean_backup_dir=False)

        # empty initial backup after volume recreation
        snap3 = cmd.snapshot_create(address)
        backup3_info = create_backup(address, snap3, backup_target)
        assert backup3_info["VolumeName"] == volume_name
        assert backup3_info["Size"] == '0'
        assert backup3_info["IsIncremental"] is False

        # write half of snap1 onto head
        snap4_data = snap1_data[:int(length / 2)]
        assert len(snap4_data) == int(length / 2)
        verify_data(dev, offset, snap4_data)
        snap4_checksum = checksum_dev(dev)
        assert snap4_checksum != snap1_checksum
        snap4 = cmd.snapshot_create(address)
        backup4_info = create_backup(address, snap4, backup_target)
        assert backup4_info["IsIncremental"] is True

        # restore initial backup
        reset_volume(grpc_controller, grpc_replica1, grpc_replica2)
        dev = get_blockdev(volume_name)
        restore_with_frontend(address, engine_name, backup1_info["URL"])
        assert read_dev(dev, offset, length) == snap1_data
        assert checksum_dev(dev) == snap1_checksum

        # restore final backup (half of snap1)
        reset_volume(grpc_controller, grpc_replica1, grpc_replica2)
        dev = get_blockdev(volume_name)
        restore_with_frontend(address, engine_name, backup4_info["URL"])
        assert checksum_dev(dev) == snap4_checksum
        assert snap4_checksum != snap1_checksum
        data = read_dev(dev, offset, length)
        assert data[:int(length / 2)] == snap4_data
        assert data[int(length / 2):] == '\x00' * int(length / 2)

        rm_backups(address, engine_name, [backup1_info["URL"],
                                          backup2_info["URL"],
                                          backup3_info["URL"],
                                          backup4_info["URL"]])

        cmd.sync_agent_server_reset(address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)
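
# Sketch of the delta idea the IsIncremental assertions rely on: an
# incremental backup only needs the blocks whose content changed since the
# last backed-up snapshot, while a full backup uploads every non-empty block.
# The helper below works on in-memory byte strings purely for illustration;
# it is not the engine's change-block tracking.
def changed_block_indexes(previous, current, block_size):
    """Return the indexes of fixed-size blocks that differ between two
    equally sized images."""
    assert len(previous) == len(current)
    changed = []
    for start in range(0, len(current), block_size):
        if current[start:start + block_size] != \
                previous[start:start + block_size]:
            changed.append(start // block_size)
    return changed
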
def test_ha_single_replica_rebuild(
        grpc_controller,  # NOQA
        grpc_replica1, grpc_replica2):  # NOQA
    address = grpc_controller.address

    open_replica(grpc_replica1)
    open_replica(grpc_replica2)

    replicas = grpc_controller.replica_list()
    assert len(replicas) == 0

    r1_url = grpc_replica1.url
    r2_url = grpc_replica2.url
    v = grpc_controller.volume_start(replicas=[r1_url, r2_url])
    assert v.replicaCount == 2

    replicas = grpc_controller.replica_list()
    assert len(replicas) == 2
    assert replicas[0].mode == "RW"
    assert replicas[1].mode == "RW"

    dev = get_blockdev(VOLUME_NAME)

    data = random_string(128)
    data_offset = 1024
    verify_data(dev, data_offset, data)

    # Cleanup replica2
    cleanup_replica(grpc_replica2)

    verify_async(dev, 10, 128, 1)

    verify_replica_state(grpc_controller, r2_url, "ERR")

    verify_read(dev, data_offset, data)

    grpc_controller.replica_delete(replicas[1].address)

    # Rebuild replica2
    open_replica(grpc_replica2)
    cmd.add_replica(address, r2_url)
    wait_for_rebuild_complete(address)

    verify_async(dev, 10, 128, 1)

    verify_replica_state(grpc_controller, r2_url, "RW")

    verify_read(dev, data_offset, data)

    # WORKAROUND for being unable to remove the parent of the volume head
    newsnap = cmd.snapshot_create(address)

    info = cmd.snapshot_info(address)
    assert len(info) == 3
    sysnap = info[newsnap]["parent"]
    assert info[sysnap]["parent"] == ""
    assert newsnap in info[sysnap]["children"]
    assert info[sysnap]["usercreated"] is False
    assert info[sysnap]["removed"] is False

    cmd.snapshot_purge(address)
    wait_for_purge_completion(address)

    info = cmd.snapshot_info(address)
    assert len(info) == 2
    assert info[newsnap] is not None
    assert info[VOLUME_HEAD] is not None
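
# Context for the WORKAROUND above, summarizing only what the assertions
# already show (not an authoritative description of the snapshot API): the
# rebuild leaves a system-generated snapshot ("usercreated" is False) as the
# parent of the volume head, and that parent cannot be removed directly, so
# the test first creates `newsnap` and then relies on snapshot purge to
# coalesce the system snapshot away, leaving only `newsnap` and the volume
# head.
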
def test_single_replica_expansion_failed(grpc_controller,
                                         grpc_fixed_dir_replica1,
                                         grpc_fixed_dir_replica2):  # NOQA
    """
    The test flow:
    1. Write random data into the block device.
    2. Create the 1st snapshot.
    3. Create an empty directory using the tmp meta file path of the
       expansion disk for replica1.
    4. Try to expand the volume. replica1 will be directly marked as ERR
       state. Finally, the volume expansion should succeed.
    5. Check the volume status, and that the expanded volume works fine:
       r/w data, then create the 2nd snapshot.
    6. Rebuild replica1 and check that replica1 is expanded automatically.
    7. Delete replica2, then check if the rebuilt replica1 works fine.
    """
    address = grpc_controller.address
    r1_url = grpc_fixed_dir_replica1.address
    r2_url = grpc_fixed_dir_replica2.address
    dev = get_dev(grpc_fixed_dir_replica1, grpc_fixed_dir_replica2,
                  grpc_controller)

    replicas = grpc_controller.replica_list()
    assert len(replicas) == 2
    assert replicas[0].mode == "RW"
    assert replicas[1].mode == "RW"

    # the default size is 4MB, will expand it to 8MB
    zero_char = b'\x00'.decode('utf-8')

    # write the data to the original part then do expansion
    data1_len = random_length(PAGE_SIZE)
    data1 = Data(random.randrange(0, SIZE - 2 * PAGE_SIZE, PAGE_SIZE),
                 data1_len, random_string(data1_len))
    snap1 = Snapshot(dev, data1, address)

    disk_meta_tmp_1 = os.path.join(FIXED_REPLICA_PATH1,
                                   EXPANSION_DISK_TMP_META_NAME)
    os.mkdir(disk_meta_tmp_1)

    # replica1 will fail to expand the size,
    # then the engine will directly mark it as ERR state.
    # Finally, the volume expansion should succeed since replica2 works fine.
    grpc_controller.volume_frontend_shutdown()
    grpc_controller.volume_expand(EXPANDED_SIZE)
    wait_for_volume_expansion(grpc_controller, EXPANDED_SIZE)
    grpc_controller.volume_frontend_start(FRONTEND_TGT_BLOCKDEV)

    volume_info = grpc_controller.volume_get()
    assert volume_info.last_expansion_error != ""
    assert volume_info.last_expansion_failed_at != ""
    verify_replica_state(grpc_controller, r1_url, "ERR")
    verify_replica_state(grpc_controller, r2_url, "RW")

    expansion_disk_2 = os.path.join(FIXED_REPLICA_PATH2,
                                    EXPANSION_DISK_NAME)
    disk_meta_tmp_2 = os.path.join(FIXED_REPLICA_PATH2,
                                   EXPANSION_DISK_TMP_META_NAME)
    assert os.path.exists(expansion_disk_2)
    assert not os.path.exists(disk_meta_tmp_2)

    # The meta info file should remain unchanged
    replica_meta_file_2 = os.path.join(FIXED_REPLICA_PATH2,
                                       REPLICA_META_FILE_NAME)
    with open(replica_meta_file_2) as f:
        replica_meta_2 = json.load(f)
    assert replica_meta_2["Size"] == EXPANDED_SIZE

    # Cleanup replica1 then check if replica2 works fine
    cleanup_replica(grpc_fixed_dir_replica1)
    verify_replica_state(grpc_controller, r1_url, "ERR")
    grpc_controller.replica_delete(replicas[0].address)

    snap1.verify_data()
    data2_len = random_length(PAGE_SIZE)
    data2 = Data(SIZE - PAGE_SIZE, data2_len, random_string(data2_len))
    snap2 = Snapshot(dev, data2, address)
    snap2.verify_data()
    assert dev.readat(SIZE, SIZE) == zero_char * SIZE

    # Rebuild replica1.
    # The newly opened replica1 will be expanded automatically
    open_replica(grpc_fixed_dir_replica1)
    cmd.add_replica(address, grpc_fixed_dir_replica1.url)
    wait_for_rebuild_complete(address)
    r1 = grpc_fixed_dir_replica1.replica_get()
    assert r1.size == EXPANDED_SIZE_STR
    verify_replica_state(grpc_controller, r1_url, "RW")
    replica_meta_file_1 = os.path.join(FIXED_REPLICA_PATH1,
                                       REPLICA_META_FILE_NAME)
    with open(replica_meta_file_1) as f:
        replica_meta_1 = json.load(f)
    assert replica_meta_1["Size"] == EXPANDED_SIZE

    # Delete replica2 then check if the rebuilt replica1 works fine
    cleanup_replica(grpc_fixed_dir_replica2)
    verify_replica_state(grpc_controller, r2_url, "ERR")
    grpc_controller.replica_delete(replicas[1].address)

    data3_len = random_length(PAGE_SIZE)
    data3 = Data(random.randrange(SIZE, EXPANDED_SIZE - PAGE_SIZE, PAGE_SIZE),
                 data3_len, random_string(data3_len))
    snap3 = Snapshot(dev, data3, address)
    snap1.verify_data()
    snap2.verify_data()
    snap3.verify_data()
    assert \
        dev.readat(SIZE, SIZE) == zero_char*(data3.offset-SIZE) + \
        data3.content + zero_char*(EXPANDED_SIZE-data3.offset-data3.length)
def test_upgrade(
        grpc_engine_manager,  # NOQA
        grpc_controller,  # NOQA
        grpc_fixed_dir_replica1, grpc_fixed_dir_replica2,  # NOQA
        grpc_extra_replica1, grpc_extra_replica2):  # NOQA
    dev = get_dev(grpc_fixed_dir_replica1,
                  grpc_fixed_dir_replica2,
                  grpc_controller)

    offset = 0
    length = 128

    data = random_string(length)
    verify_data(dev, offset, data)

    # both replica sets point to the same underlying volume
    r1_url = grpc_fixed_dir_replica1.url
    r2_url = grpc_fixed_dir_replica2.url
    upgrade_r1_url = grpc_extra_replica1.url
    upgrade_r2_url = grpc_extra_replica2.url

    v = grpc_controller.volume_start(replicas=[r1_url, r2_url])
    assert v.replicaCount == 2

    upgrade_e = upgrade_engine(grpc_engine_manager,
                               LONGHORN_UPGRADE_BINARY,
                               ENGINE_NAME, VOLUME_NAME,
                               replicas=[upgrade_r1_url, upgrade_r2_url])
    assert upgrade_e.spec.binary == LONGHORN_UPGRADE_BINARY

    verify_data(dev, offset, data)

    grpc_controller.client_upgrade(get_process_address(upgrade_e))
    wait_for_process_running(grpc_engine_manager, ENGINE_NAME)

    info = grpc_controller.volume_get()
    assert info.endpoint == path.join(LONGHORN_DEV_DIR, VOLUME_NAME)

    # cannot start with same binary
    # with pytest.raises(grpc.RpcError):
    #     grpc_engine_manager.engine_upgrade(
    #         ENGINE_NAME, LONGHORN_UPGRADE_BINARY,
    #         SIZE, [r1_url, r2_url])
    # verify_data(dev, offset, data)

    # cannot start with wrong replica, would trigger rollback
    with pytest.raises(grpc.RpcError):
        upgrade_engine(grpc_engine_manager, LONGHORN_BINARY,
                       ENGINE_NAME, VOLUME_NAME, ["random"])

    verify_data(dev, offset, data)

    grpc_fixed_dir_replica1 = cleanup_replica(grpc_fixed_dir_replica1)
    grpc_fixed_dir_replica2 = cleanup_replica(grpc_fixed_dir_replica2)
    open_replica(grpc_fixed_dir_replica1)
    open_replica(grpc_fixed_dir_replica2)

    e = upgrade_engine(grpc_engine_manager, LONGHORN_BINARY,
                       ENGINE_NAME, VOLUME_NAME, [r1_url, r2_url])
    assert e.spec.binary == LONGHORN_BINARY

    verify_data(dev, offset, data)

    grpc_controller.client_upgrade(get_process_address(e))
    wait_for_process_running(grpc_engine_manager, ENGINE_NAME)

    time.sleep(3)

    info = grpc_controller.volume_get()
    assert info.endpoint == path.join(LONGHORN_DEV_DIR, VOLUME_NAME)
def test_backup_S3_latest_unavailable(
        grpc_replica1, grpc_replica2,  # NOQA
        grpc_controller, backup_targets):  # NOQA
    for backup_target in backup_targets:
        if "s3://" not in backup_target:
            continue

        dev = get_dev(grpc_replica1, grpc_replica2, grpc_controller)
        address = grpc_controller.address
        volume_name = VOLUME_NAME
        engine_name = ENGINE_NAME
        offset = 0
        length = 128

        # initial backup
        snap1_data = random_string(length)
        verify_data(dev, offset, snap1_data)
        snap1_checksum = checksum_dev(dev)
        snap1 = cmd.snapshot_create(address)
        backup1_info = create_backup(address, snap1, backup_target)

        # backup to be unavailable
        snap2_data = random_string(length)
        verify_data(dev, offset, snap2_data)
        snap2 = cmd.snapshot_create(address)
        backup2_info = create_backup(address, snap2, backup_target)
        # the gc after the restore will clean up the missing backup
        cfg = findfile(BACKUP_DIR, "backup_" + backup2_info["Name"] + ".cfg")
        os.remove(cfg)

        # final full backup after unavailable backup
        snap3_data = random_string(length)
        verify_data(dev, offset, snap3_data)
        snap3_checksum = checksum_dev(dev)
        snap3 = cmd.snapshot_create(address)
        backup3_info = create_backup(address, snap3, backup_target)
        assert backup3_info["VolumeName"] == volume_name
        assert backup3_info["Size"] == BLOCK_SIZE_STR

        # write some stuff on head
        head_data = random_string(length)
        verify_data(dev, offset, head_data)

        # test restore of the initial backup
        reset_volume(grpc_controller, grpc_replica1, grpc_replica2)
        dev = get_blockdev(volume_name)
        restore_with_frontend(address, engine_name, backup1_info["URL"])
        readed = read_dev(dev, offset, length)
        assert readed == snap1_data
        c = checksum_dev(dev)
        assert c == snap1_checksum

        # test a restore for the final backup
        reset_volume(grpc_controller, grpc_replica1, grpc_replica2)
        dev = get_blockdev(volume_name)
        restore_with_frontend(address, engine_name, backup3_info["URL"])
        readed = read_dev(dev, offset, length)
        assert readed == snap3_data
        c = checksum_dev(dev)
        assert c == snap3_checksum

        rm_backups(address, engine_name,
                   [backup1_info["URL"], backup3_info["URL"]])

        cmd.sync_agent_server_reset(address)
        cleanup_controller(grpc_controller)
        cleanup_replica(grpc_replica1)
        cleanup_replica(grpc_replica2)