def to_air_checkpoint(self) -> Optional[Checkpoint]:
    from ray.tune.trainable.util import TrainableUtil

    checkpoint_data = self.dir_or_data

    if not checkpoint_data:
        return None

    if isinstance(checkpoint_data, ray.ObjectRef):
        checkpoint_data = ray.get(checkpoint_data)

    if isinstance(checkpoint_data, str):
        checkpoint_dir = TrainableUtil.find_checkpoint_dir(checkpoint_data)
        checkpoint = Checkpoint.from_directory(checkpoint_dir)
    elif isinstance(checkpoint_data, bytes):
        with tempfile.TemporaryDirectory() as tmpdir:
            TrainableUtil.create_from_pickle(checkpoint_data, tmpdir)
            # Double wrap in checkpoint so we hold the data in memory and
            # can remove the temp directory
            checkpoint = Checkpoint.from_dict(
                Checkpoint.from_directory(tmpdir).to_dict()
            )
    elif isinstance(checkpoint_data, dict):
        checkpoint = Checkpoint.from_dict(checkpoint_data)
    else:
        raise RuntimeError(f"Unknown checkpoint data type: {type(checkpoint_data)}")

    return checkpoint
def test_retry(ray_start_4_cpus):
    def train_func():
        ckpt = session.get_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer, run_config=RunConfig(failure_config=FailureConfig(max_failures=3))
    )

    analysis = tuner.fit()._experiment_analysis
    checkpoint_path = analysis.trials[0].checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
def test_reuse_checkpoint(ray_start_4_cpus):
    def train_func(config):
        itr = 0
        ckpt = session.get_checkpoint()
        if ckpt is not None:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, config["max_iter"]):
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 5}},
    )
    [trial] = tuner.fit()._experiment_analysis.trials
    checkpoint_path = trial.checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 4

    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 10}},
    ).restore(trial.local_dir)
    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 5
def load_checkpoint_from_path(checkpoint_to_load: Union[str, Path]) -> Dict:
    """Utility function to load a checkpoint Dict from a path."""
    checkpoint_path = Path(checkpoint_to_load).expanduser()
    if not checkpoint_path.exists():
        raise ValueError(f"Checkpoint path {checkpoint_path} does not exist.")
    checkpoint = Checkpoint.from_directory(str(checkpoint_path))
    return checkpoint.to_dict()
def restore(self, trial: Trial) -> None:
    """Restores training state from a given model checkpoint.

    Args:
        trial: The trial to be restored.

    Raises:
        RuntimeError: This error is raised if no runner is found.
        AbortTrialExecution: This error is raised if the trial is
            ineligible for restoration, given the Tune input arguments.
    """
    checkpoint = trial.checkpoint
    if checkpoint.dir_or_data is None:
        return
    if trial.runner is None:
        raise RuntimeError(
            "Trial {}: Unable to restore - no runner found.".format(trial)
        )

    checkpoint_dir = checkpoint.dir_or_data
    node_ip = checkpoint.node_ip

    if checkpoint.storage_mode == CheckpointStorage.MEMORY:
        logger.debug("Trial %s: Attempting restore from object", trial)
        # Note that we don't store the remote since in-memory checkpoints
        # don't guarantee fault tolerance and don't need to be waited on.
        with self._change_working_directory(trial):
            trial.runner.restore_from_object.remote(checkpoint_dir)
    else:
        logger.debug("Trial %s: Attempting restore from %s", trial, checkpoint_dir)
        if (
            trial.uses_cloud_checkpointing
            or not trial.sync_on_checkpoint
            or not os.path.exists(checkpoint_dir)
        ):
            # If using cloud checkpointing, trial will get cp from cloud.
            # If not syncing to driver, assume it has access to the cp
            # on the local fs.
            with self._change_working_directory(trial):
                remote = trial.runner.restore.remote(checkpoint_dir, node_ip)
        elif trial.sync_on_checkpoint:
            # This provides FT backwards compatibility in the
            # case where no cloud checkpoints are provided.
            logger.debug("Trial %s: Reading checkpoint into memory", trial)
            checkpoint_path = TrainableUtil.find_checkpoint_dir(checkpoint_dir)
            obj = Checkpoint.from_directory(checkpoint_path).to_bytes()
            with self._change_working_directory(trial):
                remote = trial.runner.restore_from_object.remote(obj)
        else:
            raise _AbortTrialExecution(
                "Pass in `sync_on_checkpoint=True` for driver-based trial "
                "restoration. Pass in an `upload_dir` for remote "
                "storage-based restoration"
            )

        self._futures[remote] = (_ExecutorEventType.RESTORING_RESULT, trial)
        trial.restoring_from = checkpoint
def test_tune_checkpoint_legacy(ray_start_4_cpus):
    def train_func():
        for i in range(10):
            train.report(test=i)
        train.save_checkpoint(hello="world")

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    [trial] = tune.run(TestTrainable).trials
    checkpoint_path = trial.checkpoint.dir_or_data
    assert os.path.exists(checkpoint_path)
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["hello"] == "world"
def test_reuse_checkpoint_legacy(ray_start_4_cpus):
    def train_func(config):
        itr = 0
        ckpt = train.load_checkpoint()
        if ckpt is not None:
            itr = ckpt["iter"] + 1

        for i in range(itr, config["max_iter"]):
            train.save_checkpoint(iter=i)
            train.report(test=i, training_iteration=i)

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials
    checkpoint_path = trial.checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 4

    analysis = tune.run(
        TestTrainable, config={"max_iter": 10}, restore=checkpoint_path
    )
    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 5
def test_tune_checkpoint(ray_start_4_cpus):
    def train_func():
        for i in range(9):
            session.report(dict(test=i))
        session.report(
            dict(test=i + 1), checkpoint=Checkpoint.from_dict(dict(hello="world"))
        )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 5}},
    )

    [trial] = tuner.fit()._experiment_analysis.trials
    checkpoint_path = trial.checkpoint.dir_or_data
    assert os.path.exists(checkpoint_path)
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["hello"] == "world"
def test_retry_legacy(ray_start_4_cpus):
    def train_func():
        ckpt = train.load_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            train.save_checkpoint(iter=i)
            train.report(test=i, training_iteration=i)

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    analysis = tune.run(TestTrainable, max_failures=3)
    checkpoint_path = analysis.trials[0].checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
def to_air_checkpoint(self) -> Optional[Checkpoint]:
    from ray.tune.trainable.util import TrainableUtil

    checkpoint_data = self.dir_or_data

    if not checkpoint_data:
        return None

    if isinstance(checkpoint_data, ray.ObjectRef):
        checkpoint_data = ray.get(checkpoint_data)

    if isinstance(checkpoint_data, str):
        try:
            checkpoint_dir = TrainableUtil.find_checkpoint_dir(checkpoint_data)
        except FileNotFoundError:
            if log_once("checkpoint_not_available"):
                logger.error(
                    f"The requested checkpoint is not available on this node, "
                    f"most likely because you are using Ray client or disabled "
                    f"checkpoint synchronization. To avoid this, enable checkpoint "
                    f"synchronization to cloud storage by specifying a "
                    f"`SyncConfig`. The checkpoint may be available on a different "
                    f"node - please check this location on worker nodes: "
                    f"{checkpoint_data}"
                )
            return None
        checkpoint = Checkpoint.from_directory(checkpoint_dir)
    elif isinstance(checkpoint_data, bytes):
        checkpoint = Checkpoint.from_bytes(checkpoint_data)
    elif isinstance(checkpoint_data, dict):
        checkpoint = Checkpoint.from_dict(checkpoint_data)
    else:
        raise RuntimeError(f"Unknown checkpoint data type: {type(checkpoint_data)}")

    return checkpoint
def function_trainable_directory(config):
    tmpdir = tempfile.mkdtemp("checkpoint_test")
    with open(os.path.join(tmpdir, "data.json"), "w") as f:
        json.dump({"checkpoint_data": 5}, f)
    session.report({"metric": 4}, checkpoint=Checkpoint.from_directory(tmpdir))