def _init_paths(self) -> None:
    """
    Set up a per-process model directory for the Estimator.

    Each training process gets its own directory (disambiguated by a
    rank-based suffix) so horovod workers do not clobber one another.
    When a load path is present, the checkpoint is copied into that
    directory and the CheckpointState metadata file is rewritten to
    point at the new location.
    """
    # Rank-based suffix keeps horovod processes from overwriting each other.
    suffix = str(hvd.local_rank()) if self.hvd_config.use else str(0)

    if self.load_path is None:
        # Fresh run: hand the Estimator an empty scratch directory.
        self.estimator_dir = pathlib.Path(tempfile.mkdtemp(suffix=suffix))
        logging.debug(f"Estimator directory set to {self.estimator_dir}.")
        return

    # Notify RunHooks before the checkpoint is restored.
    for hook in self.train_hooks:
        if isinstance(hook, estimator.RunHook):
            hook.on_checkpoint_load(str(self.load_path))

    self.estimator_dir = pathlib.Path(tempfile.mkdtemp(suffix=suffix))
    # copytree requires a nonexistent destination, so drop the fresh dir first.
    if self.estimator_dir.exists():
        shutil.rmtree(str(self.estimator_dir))
    logging.debug(f"Copying from {self.load_path} to {self.estimator_dir}.")
    shutil.copytree(str(self.load_path), str(self.estimator_dir))

    # Calibrate the CheckpointState metadata file to the new location.
    estimator._update_checkpoint_path_in_state_file(self.estimator_dir)
    logging.debug(f"Load path set to {self.estimator_dir}.")
def _copy_latest_checkpoint(self, checkpoint_path: pathlib.Path) -> None:
    """
    Copy the Estimator's most recent checkpoint into ``checkpoint_path``.

    Afterwards the CheckpointState metadata file is rewritten so that the
    paths it records point at the new location.
    """
    checkpoint_dir = os.path.dirname(
        self.estimator_trial_controller.estimator.latest_checkpoint())
    # Bug fix: shutil.copytree raises FileExistsError when the destination
    # already exists, and callers hand us a pre-created (empty) directory —
    # remove it first, matching the sibling implementation of this method.
    checkpoint_path.rmdir()
    shutil.copytree(checkpoint_dir, str(checkpoint_path))

    # Calibrate the CheckpointState metadata file to the new location.
    estimator._update_checkpoint_path_in_state_file(checkpoint_path)
def _copy_latest_checkpoint(self, checkpoint_path: pathlib.Path) -> None:
    """
    Duplicate the latest Estimator checkpoint under ``checkpoint_path``
    and recalibrate the CheckpointState metadata to the new location.
    """
    latest = self.estimator_trial_controller.estimator.latest_checkpoint()
    source_dir = os.path.dirname(latest)
    # shutil.copytree doesn't like to copy into a directory, even an empty
    # one, so delete the (empty) destination before copying.
    checkpoint_path.rmdir()
    shutil.copytree(source_dir, str(checkpoint_path))
    estimator._update_checkpoint_path_in_state_file(checkpoint_path)
def _init_paths(self) -> None:
    """
    Set up a per-process model directory for the Estimator.

    Each training process gets its own directory (suffixed with the local
    rank) so workers do not clobber one another. When resuming from a
    checkpoint, the checkpoint contents are copied into that directory,
    the CheckpointState metadata file is rewritten to point at the new
    location, and any saved WorkloadSequencer state is loaded.
    """
    # Per-rank suffix so parallel processes don't overwrite each other.
    suffix = str(self.context.distributed.local_rank)

    if self.env.latest_checkpoint is None:
        # Fresh run: hand the Estimator an empty scratch directory.
        self.estimator_dir = pathlib.Path(tempfile.mkdtemp(suffix=suffix))
        logging.debug(f"Estimator directory set to {self.estimator_dir}.")
        return

    logging.info(f"Restoring trial from checkpoint {self.env.latest_checkpoint}")
    restore_ctx = self.context._core.checkpoint.restore_path(self.env.latest_checkpoint)
    with restore_ctx as load_path:
        # Notify RunHooks before the checkpoint is restored.
        for hook in self.train_hooks:
            if isinstance(hook, estimator.RunHook):
                hook.on_checkpoint_load(str(load_path))

        self.estimator_dir = pathlib.Path(tempfile.mkdtemp(suffix=suffix))
        # copytree requires a nonexistent destination, so drop the fresh dir.
        if self.estimator_dir.exists():
            shutil.rmtree(str(self.estimator_dir))
        logging.debug(f"Copying from {load_path} to {self.estimator_dir}.")
        shutil.copytree(str(load_path), str(self.estimator_dir))

        # Calibrate the CheckpointState metadata file to the new location.
        estimator._update_checkpoint_path_in_state_file(self.estimator_dir)
        logging.debug(f"Load path set to {self.estimator_dir}.")

        # Restore WorkloadSequencer state if it was saved with the checkpoint.
        wlsq_path = load_path / "workload_sequencer.pkl"
        if self.wlsq is not None and wlsq_path.exists():
            with wlsq_path.open("rb") as f:
                self.wlsq.load_state(pickle.load(f))