def save_checkpoint(
    self,
    logdir: str,
    checkpoint: Dict,
    save_n_best: int = 3,
    main_metric: str = "raw_reward",
    minimize_metric: bool = False
):
    """Save ``checkpoint`` and keep at most ``save_n_best`` checkpoint files.

    The checkpoint is written under ``{logdir}/checkpoints/`` with the epoch
    number as suffix. ``self.best_agents`` is kept sorted from best to worst
    according to ``self.rewards2metric`` and, once it grows beyond
    ``save_n_best`` entries, the worst entry is dropped and its file deleted.

    Args:
        logdir: root directory of the run.
        checkpoint: checkpoint dictionary; must contain ``"epoch"`` and
            ``main_metric`` keys.
        save_n_best: maximum number of checkpoint files to keep.
        main_metric: key of ``checkpoint`` holding the agent rewards.
        minimize_metric: if ``True`` a smaller metric is better,
            otherwise a larger one is.
    """
    agent_rewards = checkpoint[main_metric]
    agent_metric = self.rewards2metric(agent_rewards)

    # The first checkpoint is always the best one; afterwards compare with
    # the current leader in the direction requested by ``minimize_metric``.
    if not self.best_agents:
        is_best = True
    else:
        best_metric = self.rewards2metric(self.best_agents[0][1])
        if minimize_metric:
            is_best = agent_metric < best_metric
        else:
            is_best = agent_metric > best_metric

    suffix = f"{checkpoint['epoch']}"
    filepath = utils.save_checkpoint(
        logdir=f"{logdir}/checkpoints/",
        checkpoint=checkpoint,
        suffix=suffix,
        is_best=is_best,
        is_last=True
    )

    self.best_agents.append((filepath, agent_rewards))
    # Rank by the same scalar used for the ``is_best`` comparison so that
    # ``best_agents[0]`` really is the best agent (raw rewards are not
    # guaranteed to be scalar or to order the same way as the metric).
    self.best_agents = sorted(
        self.best_agents,
        key=lambda item: self.rewards2metric(item[1]),
        reverse=not minimize_metric
    )

    if len(self.best_agents) > save_n_best:
        last_filepath, _ = self.best_agents.pop(-1)
        try:
            os.remove(last_filepath)
        except FileNotFoundError:
            # The file is already gone (e.g. removed manually or shared
            # with another entry); nothing left to clean up.
            pass
def _save_checkpoint(self):
    """Write the algorithm checkpoint on every ``save_period``-th epoch.

    Does nothing when ``self.epoch`` is not a multiple of
    ``self.save_period``. Otherwise packs the algorithm state, stores the
    current epoch in it, saves it into ``self.logdir`` and prints the path
    returned by ``utils.save_checkpoint``.
    """
    # Guard clause: only epochs divisible by the save period are persisted.
    if self.epoch % self.save_period != 0:
        return

    packed_state = self.algorithm.pack_checkpoint()
    packed_state["epoch"] = self.epoch

    saved_path = utils.save_checkpoint(
        logdir=self.logdir,
        checkpoint=packed_state,
        suffix=str(self.epoch),
    )
    print(f"Checkpoint saved to: {saved_path}")
def on_exception(self, state: _State):
    """Try to save an emergency checkpoint when the run hits an exception.

    If ``utils.is_exception`` rejects ``state.exception`` nothing is done.
    Otherwise the state is packed and saved under
    ``{state.logdir}/checkpoints/`` with a suffix that ends in
    ``.exception_<ExceptionClassName>``, and ``state.valid_metrics`` is
    recorded in ``self.metrics`` under that suffix and written out through
    ``self.save_metric``.

    Saving is best-effort: a failure here never propagates, so it cannot
    mask the original exception, but it is logged instead of being
    silently discarded.

    Args:
        state: current run state; ``exception``, ``logdir`` and
            ``valid_metrics`` are read from it.
    """
    import logging

    exception = state.exception
    if not utils.is_exception(exception):
        return

    try:
        checkpoint = _pack_state(state)
        suffix = self.get_checkpoint_suffix(checkpoint)
        suffix = f"{suffix}.exception_{exception.__class__.__name__}"
        utils.save_checkpoint(
            logdir=Path(f"{state.logdir}/checkpoints/"),
            checkpoint=checkpoint,
            suffix=suffix,
            is_best=False,
            is_last=False
        )
        # NOTE: this mutates ``self.metrics`` in place (no copy is made).
        metrics = self.metrics
        metrics[suffix] = state.valid_metrics
        self.save_metric(state.logdir, metrics)
    except Exception:
        # Keep the handler best-effort, but leave a trace of what went
        # wrong so a missing emergency checkpoint can be diagnosed.
        logging.getLogger(__name__).warning(
            "Failed to save the exception checkpoint", exc_info=True
        )
def process_checkpoint(
    self,
    logdir: Union[str, Path],
    checkpoint: Dict,
    is_best: bool,
    main_metric: str = "loss",
    minimize_metric: bool = True
):
    """Save full and stripped checkpoints, then update metric bookkeeping.

    Two files are written under ``{logdir}/checkpoints/``: the complete
    checkpoint (suffix ``<suffix>_full``) and a copy without every entry
    whose key contains ``"criterion"``, ``"optimizer"`` or ``"scheduler"``.
    The stripped file's path and its validation metrics are appended to
    ``self.top_best_metrics`` and ``self.metrics_history``, old checkpoints
    are truncated and the processed metrics are saved.

    Args:
        logdir: root directory of the run.
        checkpoint: checkpoint dictionary; must contain ``"valid_metrics"``.
        is_best: forwarded to ``utils.save_checkpoint`` for both files.
        main_metric: key of ``valid_metrics`` used as the checkpoint metric.
        minimize_metric: forwarded to ``self.truncate_checkpoints``.
    """
    suffix = self.get_checkpoint_suffix(checkpoint)
    checkpoints_dir = Path(f"{logdir}/checkpoints/")

    # 1. Complete checkpoint, including training-only components.
    utils.save_checkpoint(
        logdir=checkpoints_dir,
        checkpoint=checkpoint,
        suffix=f"{suffix}_full",
        is_best=is_best,
        is_last=True,
        special_suffix="_full",
    )

    # 2. Stripped checkpoint: drop every key mentioning an excluded part.
    excluded_parts = ("criterion", "optimizer", "scheduler")
    checkpoint = {
        name: payload
        for name, payload in checkpoint.items()
        if not any(part in name for part in excluded_parts)
    }
    filepath = utils.save_checkpoint(
        checkpoint=checkpoint,
        logdir=checkpoints_dir,
        suffix=suffix,
        is_best=is_best,
        is_last=True,
    )

    # 3. Bookkeeping: the same record goes to both lists.
    valid_metrics = checkpoint["valid_metrics"]
    record = (filepath, valid_metrics[main_metric], valid_metrics)
    self.top_best_metrics.append(record)
    self.metrics_history.append(record)
    self.truncate_checkpoints(minimize_metric=minimize_metric)

    self.save_metric(logdir, self.process_metrics(valid_metrics))
def process_checkpoint(
    self,
    logdir: Union[str, Path],
    checkpoint: Dict,
    batch_metrics: Dict[str, float],
):
    """Save a checkpoint together with the batch metrics it was taken at.

    The checkpoint is written under ``{logdir}/checkpoints/`` and is flagged
    neither as best nor as last. Its path and ``batch_metrics`` are appended
    to ``self.last_checkpoints``, older checkpoints are truncated, the
    metrics are added to ``self.metrics_history`` and the processed metrics
    are saved. Finally the path of the new file is printed.

    Args:
        logdir: root directory of the run.
        checkpoint: checkpoint dictionary to save.
        batch_metrics: metrics associated with this checkpoint.
    """
    checkpoints_dir = Path(f"{logdir}/checkpoints/")
    suffix = self.get_checkpoint_suffix(checkpoint)
    saved_path = utils.save_checkpoint(
        logdir=checkpoints_dir,
        checkpoint=checkpoint,
        suffix=suffix,
        is_best=False,
        is_last=False,
    )

    # Register the new file first, then let truncation drop stale ones.
    self.last_checkpoints.append((saved_path, batch_metrics))
    self.truncate_checkpoints()

    self.metrics_history.append(batch_metrics)
    self.save_metric(logdir, self.process_metrics())

    print(f"\nSaved checkpoint at {saved_path}")