def test_migration_checkpoint_removal(start_connected_emptyhead_cluster):
    """Test checks that trial restarts if checkpoint is lost w/ node fail."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "checkpoint_freq": 2,
        "max_failures": 2
    }

    # Test recovery of trial that has been checkpointed
    t1 = Trial("__fake", **kwargs)
    runner.add_trial(t1)
    runner.step()  # start
    runner.step()  # 1 result
    runner.step()  # 2 result and checkpoint
    assert t1.has_checkpoint()
    cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()
    shutil.rmtree(os.path.dirname(t1._checkpoint.value))
    runner.step()  # Recovery step
    for i in range(3):
        runner.step()

    assert t1.status == Trial.TERMINATED
def test_migration_checkpoint_removal(start_connected_emptyhead_cluster,
                                      trainable_id):
    """Test checks that trial restarts if checkpoint is lost w/ node fail."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
        "sync_to_driver_fn": trainable_id == "__fake_remote",
    }

    # The following patches only affect __fake_remote.
    find_checkpoint_dir = TrainableUtil.find_checkpoint_dir
    with patch("ray.tune.logger.get_node_syncer") as mock_get_node_syncer:
        trainable_util = "ray.tune.ray_trial_executor.TrainableUtil"
        with patch(trainable_util + ".find_checkpoint_dir") as mock_find_dir:

            def mock_get_syncer_fn(local_dir, remote_dir, sync_function):
                client = mock_storage_client()
                return MockNodeSyncer(local_dir, remote_dir, client)

            mock_get_node_syncer.side_effect = mock_get_syncer_fn

            def mock_find_dir_fn(checkpoint_path):
                """Converts back to local path first."""
                checkpoint_path = checkpoint_path[len(MOCK_REMOTE_DIR):]
                checkpoint_path = os.path.join("/", checkpoint_path)
                return find_checkpoint_dir(checkpoint_path)

            # __fake_remote trainables save to a separate "remote" directory.
            # TrainableUtil will not check this path unless we mock it.
            mock_find_dir.side_effect = mock_find_dir_fn

            # Test recovery of trial that has been checkpointed
            t1 = Trial(trainable_id, **kwargs)
            runner.add_trial(t1)
            runner.step()  # start
            runner.step()  # 1 result
            runner.step()  # 2 result and checkpoint
            assert t1.has_checkpoint()
            cluster.add_node(num_cpus=1)
            cluster.remove_node(node)
            cluster.wait_for_nodes()
            shutil.rmtree(os.path.dirname(t1.checkpoint.value))
            runner.step()  # collect result 3, kick off + fail result 4
            runner.step()  # Recovery step
            runner.step()  # Process Recovery + step 4
            for i in range(3):
                if t1.status != Trial.TERMINATED:
                    runner.step()
            assert t1.status == Trial.TERMINATED, runner.debug_string()
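# The `trainable_id` fixture used by the parametrized tests in this section is
# not shown here. A minimal sketch of how it is assumed to be defined; the
# parametrization over "__fake" and "__fake_remote" is inferred from how the
# tests branch on those names, so treat the details as assumptions:
@pytest.fixture(params=["__fake", "__fake_remote"])
def trainable_id(request):
    # Each test runs once per trainable id; "__fake_remote" exercises the
    # remote-checkpoint code paths that the mocking in these tests works
    # around.
    return request.param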
def test_trial_migration(start_connected_emptyhead_cluster, trainable_id):
    """Removing a node while cluster has space should migrate trial.

    The trial state should also be consistent with the checkpoint.
    """
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
        "sync_to_driver_fn": trainable_id == "__fake",
    }

    # Test recovery of trial that hasn't been checkpointed
    t = Trial(trainable_id, **kwargs)
    runner.add_trial(t)
    runner.step()  # start
    runner.step()  # 1 result
    assert t.last_result
    node2 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()
    runner.step()  # Recovery step

    # TODO(rliaw): This assertion is not critical but will not pass
    # because checkpoint handling is messy and should be refactored
    # rather than hotfixed.
    # assert t.last_result is None, "Trial result not restored correctly."
    for i in range(4):
        runner.step()

    assert t.status == Trial.TERMINATED

    # Test recovery of trial that has been checkpointed
    t2 = Trial(trainable_id, **kwargs)
    runner.add_trial(t2)
    runner.step()  # start
    runner.step()  # 1 result
    runner.step()  # 2 result and checkpoint
    assert t2.has_checkpoint()
    node3 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node2)
    cluster.wait_for_nodes()
    runner.step()  # 3 result + start and fail 4 result
    runner.step()  # Recovery step
    runner.step()  # Process recovery
    runner.step()  # result
    if t2.status != Trial.TERMINATED:
        runner.step()
    assert t2.status == Trial.TERMINATED, runner.debug_string()

    # Test recovery of trial that won't be checkpointed
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
        "sync_to_driver_fn": trainable_id == "__fake",
    }
    t3 = Trial(trainable_id, **kwargs)
    runner.add_trial(t3)
    runner.step()  # start
    runner.step()  # 1 result
    cluster.add_node(num_cpus=1)
    cluster.remove_node(node3)
    cluster.wait_for_nodes()
    runner.step()  # Error handling step
    if t3.status != Trial.ERROR:
        runner.step()
    assert t3.status == Trial.ERROR, runner.debug_string()

    with pytest.raises(TuneError):
        runner.step()
def test_trial_migration(start_connected_emptyhead_cluster):
    """Removing a node while cluster has space should migrate trial.

    The trial state should also be consistent with the checkpoint.
    """
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(resources=dict(CPU=1))
    assert cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "checkpoint_freq": 2,
        "max_failures": 2
    }

    # Test recovery of trial that hasn't been checkpointed
    t = Trial("__fake", **kwargs)
    runner.add_trial(t)
    runner.step()  # start
    runner.step()  # 1 result
    assert t.last_result is not None
    node2 = cluster.add_node(resources=dict(CPU=1))
    cluster.remove_node(node)
    assert cluster.wait_for_nodes()
    runner.step()  # Recovery step

    # TODO(rliaw): This assertion is not critical but will not pass
    # because checkpoint handling is messy and should be refactored
    # rather than hotfixed.
    # assert t.last_result is None, "Trial result not restored correctly."
    for i in range(3):
        runner.step()

    assert t.status == Trial.TERMINATED

    # Test recovery of trial that has been checkpointed
    t2 = Trial("__fake", **kwargs)
    runner.add_trial(t2)
    runner.step()  # start
    runner.step()  # 1 result
    runner.step()  # 2 result and checkpoint
    assert t2.has_checkpoint()
    node3 = cluster.add_node(resources=dict(CPU=1))
    cluster.remove_node(node2)
    assert cluster.wait_for_nodes()
    runner.step()  # Recovery step
    assert t2.last_result["training_iteration"] == 2
    for i in range(1):
        runner.step()

    assert t2.status == Trial.TERMINATED

    # Test recovery of trial that won't be checkpointed
    t3 = Trial("__fake", **{"stopping_criterion": {"training_iteration": 3}})
    runner.add_trial(t3)
    runner.step()  # start
    runner.step()  # 1 result
    cluster.add_node(resources=dict(CPU=1))
    cluster.remove_node(node3)
    assert cluster.wait_for_nodes()
    runner.step()  # Error handling step
    assert t3.status == Trial.ERROR

    with pytest.raises(TuneError):
        runner.step()
def test_migration_checkpoint_removal(start_connected_emptyhead_cluster,
                                      trainable_id):
    """Test checks that trial restarts if checkpoint is lost w/ node fail."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    class _SyncerCallback(SyncerCallback):
        def _create_trial_syncer(self, trial: "Trial"):
            client = mock_storage_client()
            return MockNodeSyncer(trial.logdir, trial.logdir, client)

    syncer_callback = _SyncerCallback(None)
    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
    }

    # The following patches only affect __fake_remote.
    def hide_remote_path(path_function):
        def hidden_path_func(checkpoint_path):
            """Converts back to local path first."""
            if MOCK_REMOTE_DIR in checkpoint_path:
                checkpoint_path = checkpoint_path[len(MOCK_REMOTE_DIR):]
                checkpoint_path = os.path.join("/", checkpoint_path)
            return path_function(checkpoint_path)

        return hidden_path_func

    trainable_util = "ray.tune.ray_trial_executor.TrainableUtil"
    _find_ckpt = trainable_util + ".find_checkpoint_dir"
    find_func = TrainableUtil.find_checkpoint_dir
    _pickle_ckpt = trainable_util + ".pickle_checkpoint"
    pickle_func = TrainableUtil.pickle_checkpoint

    with patch(_find_ckpt) as mock_find, patch(_pickle_ckpt) as mock_pkl_ckpt:
        # __fake_remote trainables save to a separate "remote" directory.
        # TrainableUtil will not check this path unless we mock it.
        mock_find.side_effect = hide_remote_path(find_func)
        mock_pkl_ckpt.side_effect = hide_remote_path(pickle_func)

        # Test recovery of trial that has been checkpointed
        t1 = Trial(trainable_id, **kwargs)
        runner.add_trial(t1)

        # Start trial, process result (x2), process save
        for _ in range(4):
            runner.step()
        assert t1.has_checkpoint()

        cluster.add_node(num_cpus=1)
        cluster.remove_node(node)
        cluster.wait_for_nodes()
        shutil.rmtree(os.path.dirname(t1.checkpoint.value))
        runner.step()  # Collect result 3, kick off + fail result 4
        runner.step()  # Dispatch restore
        runner.step()  # Process restore + step 4
        for _ in range(3):
            if t1.status != Trial.TERMINATED:
                runner.step()
    assert t1.status == Trial.TERMINATED, runner.debug_string()
def test_trial_migration(start_connected_emptyhead_cluster, trainable_id):
    """Removing a node while cluster has space should migrate trial.

    The trial state should also be consistent with the checkpoint.
    """
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    syncer_callback = _PerTrialSyncerCallback(
        lambda trial: trial.trainable_name == "__fake")
    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
    }

    # Test recovery of trial that hasn't been checkpointed
    t = Trial(trainable_id, **kwargs)
    runner.add_trial(t)
    runner.step()  # Start trial
    runner.step()  # Process result
    assert t.last_result
    node2 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()
    # TODO(ujvl): Node failure does not propagate until a step after it
    # actually should. This is possibly a problem with `Cluster`.
    runner.step()
    runner.step()  # Recovery step

    # TODO(rliaw): This assertion is not critical but will not pass
    # because checkpoint handling is messy and should be refactored
    # rather than hotfixed.
    # assert t.last_result is None, "Trial result not restored correctly."

    # Process result (x2), process save, process result (x2), process save
    for _ in range(6):
        runner.step()

    assert t.status == Trial.TERMINATED, runner.debug_string()

    # Test recovery of trial that has been checkpointed
    t2 = Trial(trainable_id, **kwargs)
    runner.add_trial(t2)
    # Start trial, process result (x2), process save
    for _ in range(4):
        runner.step()
    assert t2.has_checkpoint()
    node3 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node2)
    cluster.wait_for_nodes()
    runner.step()  # Process result 3 + start and fail 4 result
    runner.step()  # Dispatch restore
    runner.step()  # Process restore
    runner.step()  # Process result 5
    if t2.status != Trial.TERMINATED:
        runner.step()  # Process result 6, dispatch save
        runner.step()  # Process save
    assert t2.status == Trial.TERMINATED, runner.debug_string()

    # Test recovery of trial that won't be checkpointed
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
    }
    t3 = Trial(trainable_id, **kwargs)
    runner.add_trial(t3)
    runner.step()  # Start trial
    runner.step()  # Process result 1
    cluster.add_node(num_cpus=1)
    cluster.remove_node(node3)
    cluster.wait_for_nodes()
    runner.step()  # Error handling step
    if t3.status != Trial.ERROR:
        runner.step()
    assert t3.status == Trial.ERROR, runner.debug_string()

    with pytest.raises(TuneError):
        runner.step()
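# `_PerTrialSyncerCallback` is used above but defined elsewhere in this test
# module. A minimal sketch of what it is assumed to look like, inferred from
# how it is constructed with a per-trial predicate and from the inline
# `_SyncerCallback` in `test_migration_checkpoint_removal`; the fallback
# behaviour for unselected trials is an assumption, not the real helper:
class _PerTrialSyncerCallback(SyncerCallback):
    def __init__(self, get_sync_fn):
        # `get_sync_fn` decides per trial whether to install the mocked
        # driver syncer (e.g. only for the "__fake" trainable).
        self._get_sync_fn = get_sync_fn
        super(_PerTrialSyncerCallback, self).__init__(None)

    def _create_trial_syncer(self, trial):
        if self._get_sync_fn(trial):
            # Selected trials sync through a MockNodeSyncer backed by the
            # mock storage client instead of a real node syncer.
            client = mock_storage_client()
            return MockNodeSyncer(trial.logdir, trial.logdir, client)
        # Other trials keep the default syncer behaviour.
        return super(_PerTrialSyncerCallback, self)._create_trial_syncer(trial)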
def test_trial_migration(start_connected_emptyhead_cluster):
    """Removing a node while cluster has space should migrate trial.

    The trial state should also be consistent with the checkpoint.
    """
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "checkpoint_freq": 2,
        "max_failures": 2
    }

    # Test recovery of trial that hasn't been checkpointed
    t = Trial("__fake", **kwargs)
    runner.add_trial(t)
    runner.step()  # start
    runner.step()  # 1 result
    assert t.last_result
    node2 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()
    runner.step()  # Recovery step

    # TODO(rliaw): This assertion is not critical but will not pass
    # because checkpoint handling is messy and should be refactored
    # rather than hotfixed.
    # assert t.last_result is None, "Trial result not restored correctly."
    for i in range(3):
        runner.step()

    assert t.status == Trial.TERMINATED

    # Test recovery of trial that has been checkpointed
    t2 = Trial("__fake", **kwargs)
    runner.add_trial(t2)
    runner.step()  # start
    runner.step()  # 1 result
    runner.step()  # 2 result and checkpoint
    assert t2.has_checkpoint()
    node3 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node2)
    cluster.wait_for_nodes()
    runner.step()  # Recovery step
    assert t2.last_result["training_iteration"] == 2
    for i in range(1):
        runner.step()

    assert t2.status == Trial.TERMINATED

    # Test recovery of trial that won't be checkpointed
    t3 = Trial("__fake", **{"stopping_criterion": {"training_iteration": 3}})
    runner.add_trial(t3)
    runner.step()  # start
    runner.step()  # 1 result
    cluster.add_node(num_cpus=1)
    cluster.remove_node(node3)
    cluster.wait_for_nodes()
    runner.step()  # Error handling step
    assert t3.status == Trial.ERROR

    with pytest.raises(TuneError):
        runner.step()
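# All of the tests above share the `start_connected_emptyhead_cluster`
# fixture, which is also defined elsewhere in the module. A minimal sketch,
# assuming Ray's `ray.cluster_utils.Cluster` test utility (the exact head-node
# arguments are assumptions): it starts a connected head node with zero CPUs,
# so trials can only be placed on the worker nodes the tests add and remove.
@pytest.fixture
def start_connected_emptyhead_cluster():
    """Starts a head node with no CPUs so trials run on added workers."""
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={"num_cpus": 0})
    yield cluster
    # Tear down the driver connection and the mock cluster after each test.
    ray.shutdown()
    cluster.shutdown()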