예제 #1
0
    def save_checkpoint(
        self,
        logdir: str,
        checkpoint: Dict,
        save_n_best: int = 3,
        main_metric: str = "raw_reward",
        minimize_metric: bool = False
    ):
        """Save ``checkpoint`` and keep at most ``save_n_best`` ranked files.

        The value stored under ``checkpoint[main_metric]`` is converted to a
        comparable score with ``self.rewards2metric``; that score decides
        both whether this checkpoint is the new best and where it lands in
        ``self.best_agents``.

        Args:
            logdir: base directory; files go to ``{logdir}/checkpoints/``.
            checkpoint: checkpoint dict; must contain ``main_metric`` and
                ``"epoch"`` keys.
            save_n_best: how many ranked checkpoints to keep on disk.
            main_metric: key of ``checkpoint`` holding the agent rewards.
            minimize_metric: if ``True`` a lower score is better,
                otherwise a higher score is better.

        Raises:
            KeyError: if ``main_metric`` or ``"epoch"`` is missing from
                ``checkpoint``.
        """
        agent_rewards = checkpoint[main_metric]
        agent_metric = self.rewards2metric(agent_rewards)

        if not self.best_agents:
            is_best = True
        else:
            # ``best_agents`` is kept sorted best-first (see below), so the
            # head entry is the current best to compare against.
            best_metric = self.rewards2metric(self.best_agents[0][1])
            # Respect the optimisation direction instead of always
            # assuming "greater is better".
            if minimize_metric:
                is_best = agent_metric < best_metric
            else:
                is_best = agent_metric > best_metric

        suffix = f"{checkpoint['epoch']}"
        filepath = utils.save_checkpoint(
            logdir=f"{logdir}/checkpoints/",
            checkpoint=checkpoint,
            suffix=suffix,
            is_best=is_best,
            is_last=True
        )

        self.best_agents.append((filepath, agent_rewards))
        # Rank by the same score used for ``is_best`` so both decisions
        # agree; raw rewards are still what gets stored in the tuples.
        self.best_agents = sorted(
            self.best_agents,
            key=lambda item: self.rewards2metric(item[1]),
            reverse=not minimize_metric
        )
        if len(self.best_agents) > save_n_best:
            # The tail entry is the worst-ranked checkpoint: drop it from
            # the ranking and delete its file.
            last_filepath, _ = self.best_agents.pop(-1)
            os.remove(last_filepath)
예제 #2
0
 def _save_checkpoint(self):
     """Save the algorithm state every ``self.save_period`` epochs.

     Does nothing unless ``self.epoch`` is a multiple of
     ``self.save_period``. Otherwise packs the algorithm checkpoint, tags
     it with the current epoch, saves it into ``self.logdir`` via
     ``utils.save_checkpoint`` and prints the value that call returned.
     """
     remainder = self.epoch % self.save_period
     if remainder != 0:
         # Not a saving epoch.
         return

     packed = self.algorithm.pack_checkpoint()
     packed["epoch"] = self.epoch
     filename = utils.save_checkpoint(
         logdir=self.logdir,
         checkpoint=packed,
         suffix=str(self.epoch)
     )
     print(f"Checkpoint saved to: {filename}")
예제 #3
0
    def on_exception(self, state: _State):
        """Try to save an emergency checkpoint when the run raises.

        If ``utils.is_exception`` accepts ``state.exception``, the state is
        packed into a checkpoint and saved under
        ``{state.logdir}/checkpoints/`` with a suffix that includes the
        exception class name. ``state.valid_metrics`` is recorded in
        ``self.metrics`` under that suffix and the metrics are written out
        with ``self.save_metric``.

        The save is best-effort: any failure is logged and suppressed, so
        this handler never raises on top of the original exception.

        Args:
            state: current run state; ``exception``, ``logdir`` and
                ``valid_metrics`` are read from it.
        """
        # Local import keeps this block self-contained regardless of the
        # module-level imports of the file.
        import logging

        exception = state.exception
        if not utils.is_exception(exception):
            return

        try:
            checkpoint = _pack_state(state)
            suffix = self.get_checkpoint_suffix(checkpoint)
            suffix = f"{suffix}.exception_{exception.__class__.__name__}"
            utils.save_checkpoint(
                logdir=Path(f"{state.logdir}/checkpoints/"),
                checkpoint=checkpoint,
                suffix=suffix,
                is_best=False,
                is_last=False
            )
            metrics = self.metrics
            metrics[suffix] = state.valid_metrics
            self.save_metric(state.logdir, metrics)
        except Exception:
            # Still best-effort, but no longer silent: report why the
            # emergency checkpoint could not be written.
            logging.getLogger(__name__).warning(
                "Failed to save checkpoint on exception", exc_info=True
            )
예제 #4
0
    def process_checkpoint(
        self,
        logdir: Union[str, Path],
        checkpoint: Dict,
        is_best: bool,
        main_metric: str = "loss",
        minimize_metric: bool = True
    ):
        """Save a full and a reduced checkpoint, then update metric records.

        Two files are saved under ``{logdir}/checkpoints/``: first the
        complete ``checkpoint`` (suffix ending in ``_full``), then a copy
        without any entry whose key contains ``"criterion"``,
        ``"optimizer"`` or ``"scheduler"``. The path returned for the
        reduced file is recorded together with the validation metrics,
        after which checkpoints are truncated and metrics are saved.

        Args:
            logdir: base directory of the run.
            checkpoint: checkpoint dict; must contain ``"valid_metrics"``.
            is_best: forwarded to ``utils.save_checkpoint``.
            main_metric: key inside ``valid_metrics`` used for ranking.
            minimize_metric: forwarded to ``self.truncate_checkpoints``.
        """
        suffix = self.get_checkpoint_suffix(checkpoint)
        checkpoints_dir = Path(f"{logdir}/checkpoints/")

        # 1) complete checkpoint
        utils.save_checkpoint(
            logdir=checkpoints_dir,
            checkpoint=checkpoint,
            suffix=f"{suffix}_full",
            is_best=is_best,
            is_last=True,
            special_suffix="_full"
        )

        # 2) reduced checkpoint: skip keys mentioning any excluded part
        excluded_parts = ("criterion", "optimizer", "scheduler")
        reduced = {}
        for key, value in checkpoint.items():
            if any(part in key for part in excluded_parts):
                continue
            reduced[key] = value

        filepath = utils.save_checkpoint(
            checkpoint=reduced,
            logdir=checkpoints_dir,
            suffix=suffix,
            is_best=is_best,
            is_last=True
        )

        # 3) bookkeeping: the same record goes to both lists
        valid_metrics = reduced["valid_metrics"]
        record = (filepath, valid_metrics[main_metric], valid_metrics)
        self.top_best_metrics.append(record)
        self.metrics_history.append(record)
        self.truncate_checkpoints(minimize_metric=minimize_metric)

        self.save_metric(logdir, self.process_metrics(valid_metrics))
예제 #5
0
    def process_checkpoint(
        self,
        logdir: Union[str, Path],
        checkpoint: Dict,
        batch_metrics: Dict[str, float],
    ):
        """Save a checkpoint and record it with its batch metrics.

        The checkpoint is saved under ``{logdir}/checkpoints/`` (flagged as
        neither best nor last). The returned path is stored with
        ``batch_metrics`` in ``self.last_checkpoints``, checkpoints are
        truncated, the metrics history is extended, and the processed
        metrics are written with ``self.save_metric``.

        Args:
            logdir: base directory of the run.
            checkpoint: checkpoint dict to save.
            batch_metrics: metrics to associate with this checkpoint.
        """
        checkpoints_dir = Path(f"{logdir}/checkpoints/")
        suffix = self.get_checkpoint_suffix(checkpoint)
        filepath = utils.save_checkpoint(
            logdir=checkpoints_dir,
            checkpoint=checkpoint,
            suffix=suffix,
            is_best=False,
            is_last=False
        )

        entry = (filepath, batch_metrics)
        self.last_checkpoints.append(entry)
        self.truncate_checkpoints()

        self.metrics_history.append(batch_metrics)

        self.save_metric(logdir, self.process_metrics())
        print(f"\nSaved checkpoint at {filepath}")