def create_checkpoint(
    self,
    model_G: nn.Module,
    model_D: nn.Module,
    optimizer_G: optim.Optimizer,
    optimizer_R: optim.Optimizer,
    optimizer_D: optim.Optimizer,
    hyperparam_dict,
):
    model_G_dict = model_G.state_dict()
    model_D_dict = model_D.state_dict()
    optimizer_G_dict = optimizer_G.state_dict()
    optimizer_R_dict = optimizer_R.state_dict()
    optimizer_D_dict = optimizer_D.state_dict()
    state_dict = {
        "model_G_dict": model_G_dict,
        "model_D_dict": model_D_dict,
        "optimizer_G_dict": optimizer_G_dict,
        "optimizer_R_dict": optimizer_R_dict,
        "optimizer_D_dict": optimizer_D_dict,
        "timestamp": strftime("%I:%M%p GMT%z on %b %d, %Y", localtime()),
    }
    checkpoint = {**state_dict, **hyperparam_dict}
    return checkpoint

def save(
    self,
    model: nn.Module,
    optimizer: optim.Optimizer,
    scheduler: optim.lr_scheduler._LRScheduler,
    epoch: int,
    metric: float,
):
    if self.best_metric < metric:
        self.best_metric = metric
        self.best_epoch = epoch
        is_best = True
    else:
        is_best = False
    os.makedirs(self.root_dir, exist_ok=True)
    torch.save(
        {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "epoch": epoch,
            "best_epoch": self.best_epoch,
            "best_metric": self.best_metric,
        },
        osp.join(self.root_dir, f"{epoch:02d}.pth"),
    )
    if is_best:
        shutil.copy(
            osp.join(self.root_dir, f"{epoch:02d}.pth"),
            osp.join(self.root_dir, "best.pth"),
        )

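# A minimal resume sketch for the per-epoch layout saved above. The function
# name and the freshly constructed model/optimizer/scheduler arguments are
# illustrative assumptions, not part of the original class.
def load_for_resume(root_dir, epoch, model, optimizer, scheduler):
    ckpt = torch.load(osp.join(root_dir, f"{epoch:02d}.pth"), map_location="cpu")
    model.load_state_dict(ckpt["model"])
    optimizer.load_state_dict(ckpt["optimizer"])
    scheduler.load_state_dict(ckpt["scheduler"])
    # Hand the stored bookkeeping back to the training loop
    return ckpt["epoch"], ckpt["best_epoch"], ckpt["best_metric"]
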
def save_checkpoint(output_directory: str, epoch: int, model: nn.Module,
                    optimizer: optim.Optimizer, best_acc1: float,
                    best_acc5: float, best_epoch: int) -> None:
    """Save a checkpoint and the best model in the history.

    Args:
        output_directory (str): The output directory.
        epoch (int): The epoch of the current checkpoint.
        model (nn.Module): The model in the current epoch.
        optimizer (optim.Optimizer): The optimizer in the current epoch.
        best_acc1 (float): The best top-1 accuracy of the model in the history.
        best_acc5 (float): The best top-5 accuracy of the model in the history.
        best_epoch (int): The epoch of the best top-1 accuracy in the history.
    """
    if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
        model_without_parallel = model.module
    else:
        model_without_parallel = model
    ckpt = dict(
        epoch=epoch,
        state_dict=model_without_parallel.state_dict(),
        optimizer=optimizer.state_dict(),
        best_acc1=best_acc1,
        best_acc5=best_acc5,
    )
    torchsave(ckpt, os.path.join(output_directory, "checkpoint.pth"))
    if epoch == best_epoch:
        torchsave(ckpt, os.path.join(output_directory, "best.pth"))

def get_ckpt_dict(model: nn.Module, optimizer: optim.Optimizer, epoch: int) -> dict:
    """Generate a checkpoint dict.

    Checkpoint dict format::

        {
            'epoch': current epoch ([1, num_epochs]),
            'model_state_dict': state_dict of the model,
            'optim_state_dict': state_dict of the optimizer
        }

    If the model is a module wrapper, `model.module` is saved instead.

    Args:
        model (nn.Module): the model to be saved
        optimizer (optim.Optimizer): the optimizer to be saved
        epoch (int): current epoch

    Returns:
        dict: the generated checkpoint dict
    """
    if isinstance(model, DDP):
        _model = model.module
    else:
        _model = model
    return {
        'epoch': epoch,
        'model_state_dict': _model.state_dict(),
        'optim_state_dict': optimizer.state_dict()
    }

def update_optimizer_params(optimizer: Optimizer, new_state) -> Optimizer:
    # Merge new settings into the first param group without discarding the
    # optimizer's accumulated per-parameter state.
    optim_state = optimizer.state_dict()
    # 'params' holds parameter ids bound to the existing optimizer; never
    # overwrite them from the incoming state.
    if "params" in new_state["param_groups"][0].keys():
        del new_state["param_groups"][0]["params"]
    optim_state["param_groups"][0].update(new_state["param_groups"][0])
    optimizer.load_state_dict(optim_state)
    return optimizer

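# Hypothetical usage of update_optimizer_params: drop the learning rate of a
# live optimizer to 1e-4 while keeping its accumulated state (e.g. Adam moment
# estimates). `model` and the Adam construction are illustrative assumptions;
# the `new_state` argument mirrors the Optimizer.state_dict() layout.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
optimizer = update_optimizer_params(optimizer, {"param_groups": [{"lr": 1e-4}]})
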
def saveCheckpoint(checkpoint_path, model: nn.Module, optimizer: optim.Optimizer,
                   scheduler: optim.lr_scheduler.MultiStepLR, epoch, iteration):
    """
    Save the training instance to a .pth file

    Parameters
    ----------
    checkpoint_path : str
        the path of the checkpoint file
    model, optimizer, scheduler : nn.Module, optim.Optimizer, optim.lr_scheduler.MultiStepLR
        the neural network to save, with its optimizer and scheduler
    epoch : int
        the current epoch
    iteration : int
        the current iteration
    """
    state = {
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
        'iteration': iteration,
        'scheduler': scheduler.state_dict()
    }
    torch.save(state, checkpoint_path)

def save_checkpoint(
    self,
    file: Union[Path, str],
    optimizer: Optimizer,
    epoch: int,
    split: int,
    loss: float,
):
    model_state = {
        "state_dict": self.state_dict(),
        "dictionary": self.dictionary,
        "is_forward_lm": self.is_forward_lm,
        "hidden_size": self.hidden_size,
        "nlayers": self.nlayers,
        "embedding_size": self.embedding_size,
        "nout": self.nout,
        "document_delimiter": self.document_delimiter,
        "dropout": self.dropout,
        "optimizer_state_dict": optimizer.state_dict(),
        "epoch": epoch,
        "split": split,
        "loss": loss,
    }
    torch.save(model_state, str(file), pickle_protocol=4)

def _save_model(self, save_path: str, model: PreTrainedModel, tokenizer: PreTrainedTokenizer,
                iteration: int, optimizer: Optimizer = None, save_as_best: bool = False,
                extra: dict = None, include_iteration: bool = True, name: str = 'model'):
    extra_state = dict(iteration=iteration)

    if optimizer:
        extra_state['optimizer'] = optimizer.state_dict()

    if extra:
        extra_state.update(extra)

    if save_as_best:
        dir_path = os.path.join(save_path, '%s_best' % name)
    else:
        dir_name = '%s_%s' % (name, iteration) if include_iteration else name
        dir_path = os.path.join(save_path, dir_name)

    util.create_directories_dir(dir_path)

    # save model
    if isinstance(model, DataParallel):
        model.module.save_pretrained(dir_path)
    else:
        model.save_pretrained(dir_path)

    # save vocabulary
    tokenizer.save_pretrained(dir_path)

    # save extra
    state_path = os.path.join(dir_path, 'extra.state')
    torch.save(extra_state, state_path)

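# A hedged counterpart for restoring what _save_model wrote: the transformer
# weights and tokenizer come back via from_pretrained, the rest from
# 'extra.state'. The AutoModel/AutoTokenizer classes and the function name
# are illustrative assumptions, not part of the original trainer.
from transformers import AutoModel, AutoTokenizer

def _load_model(dir_path):
    model = AutoModel.from_pretrained(dir_path)
    tokenizer = AutoTokenizer.from_pretrained(dir_path)
    extra_state = torch.load(os.path.join(dir_path, 'extra.state'))
    return model, tokenizer, extra_state
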
def save_checkpoint(
    epoch: int,
    model: nn.Module,
    optimizer: optim.Optimizer,
    models_path: Path,
    exp_name: str,
    epoch_metrics: Dict[str, float],
    model_ema: Optional[Any],
    amp_scaler: Optional[Any],
    scheduler: Optional[Any] = None,
) -> None:
    save_state = {
        "epoch": epoch + 1,  # increment epoch so it is not repeated on resume
        "state_dict": get_state_dict(model, unwrap_model),
        "optimizer": optimizer.state_dict(),
        "val_loss": epoch_metrics["val_loss"],
        "val_score": epoch_metrics["val_score"],
        "threshold": epoch_metrics["threshold"],
    }
    if model_ema is not None:
        save_state["state_dict_ema"] = get_state_dict(model_ema, unwrap_model)
    if amp_scaler is not None:
        save_state[amp_scaler.state_dict_key] = amp_scaler.state_dict()
    if scheduler is not None:
        save_state["lr_scheduler"] = scheduler.state_dict()
    torch.save(
        save_state,
        f"{models_path}/{exp_name}.pth",
    )

def save_training(
    loc: str,
    params: Hyperparams,
    model: PretrainTaskDaLUKE,
    res: TrainResults,
    optimizer: Optimizer,
    scheduler,
    scaler=None,
    pu=None,
) -> list[str]:
    pu = pu if pu is not None else res.parameter_update
    paths = list()
    # Save tracked statistics
    paths += res.save(loc)
    paths += params.save(loc)
    # Save model
    paths.append(os.path.join(loc, TrainResults.subfolder, MODEL_OUT.format(i=pu)))
    torch.save(model.state_dict(), paths[-1])
    # Save optimizer and scheduler states (these are dynamic over time)
    paths.append(os.path.join(loc, TrainResults.subfolder, OPTIMIZER_OUT.format(i=pu)))
    torch.save(optimizer.state_dict(), paths[-1])
    paths.append(os.path.join(loc, TrainResults.subfolder, SCHEDULER_OUT.format(i=pu)))
    torch.save(scheduler.state_dict(), paths[-1])
    # Save scaler if using fp16
    if scaler:
        paths.append(os.path.join(loc, TrainResults.subfolder, SCALER_OUT.format(i=pu)))
        torch.save(scaler.state_dict(), paths[-1])
    return paths

def optimizer_state(self, optimizer: Optimizer) -> dict:
    """
    Returns state of an optimizer.
    Allows for syncing/collating optimizer state from processes in custom plugins.
    """
    if self.training_type_plugin and hasattr(self.training_type_plugin, "optimizer_state"):
        return self.training_type_plugin.optimizer_state(optimizer)
    return optimizer.state_dict()

def save_optimizer(optimizer: Optimizer, path: str):
    """
    Save optimizer state for resuming training.

    :param optimizer: the optimizer whose state should be saved
    :param path: destination file path
    """
    torch.save(optimizer.state_dict(), path)
    print("Optimizer state saved.")

def save_snapshot(model: torch.nn.Module, optimizer: Optimizer, loss: float, epoch: int,
                  train_history: pd.DataFrame, snapshot_file: str, multi_gpu=False):
    torch.save({
        'model': model.module.state_dict() if multi_gpu else model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
        'loss': loss,
        'train_history': train_history.to_dict(),
        'args': ' '.join(sys.argv[1:])
    }, snapshot_file)

def training_backup(model: SavableModel, optimizer: optim.Optimizer, path: str, **kwargs) -> None:
    addon_dict = {
        'optim_type': optimizer.__class__.__name__,
        'optim_state_dict': optimizer.state_dict(),
        **kwargs
    }
    if 'optim_kwargs' not in addon_dict:
        addon_dict['optim_kwargs'] = {}
    model.save_model(path, addon_dict)

def save(self, path_to_checkpoints_dir: str, step: int, optimizer: Optimizer,
         scheduler: _LRScheduler) -> str:
    path_to_checkpoint = os.path.join(path_to_checkpoints_dir, f'model-{step}.pth')
    checkpoint = {
        'state_dict': self.state_dict(),
        'step': step,
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict()
    }
    torch.save(checkpoint, path_to_checkpoint)
    return path_to_checkpoint

def save(model: nn.Module, optimizer: optim.Optimizer, last_improvement: int,
         bleu4: float, is_best: bool):
    state = {
        'bleu-4': bleu4,
        'last_improvement': last_improvement,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(state, os.path.join(GlobalConfig.CHECKPOINT_PATH,
                                   f'{model.__class__.__name__}.pth'))
    if is_best:
        torch.save(state, os.path.join(GlobalConfig.CHECKPOINT_PATH,
                                       f'Best_{model.__class__.__name__}.pth'))

def optimizer_state(self, optimizer: Optimizer) -> dict:
    """
    Returns state of an optimizer.
    Allows for syncing/collating optimizer state from processes in custom plugins.

    Return:
        Optimizer state dict
    """
    if self.ddp_plugin:
        return self.ddp_plugin.optimizer_state(optimizer)
    return optimizer.state_dict()

def save_checkpoint(model: nn.Module, optimizer: optim.Optimizer, epoch: int,
                    loss: float, filepath: str):
    """Saves model and optimizer state to a filepath."""
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'loss': loss,
            'optimizer_state_dict': optimizer.state_dict(),
        },
        filepath)

def training_backup(net: Dense, optimizer: optim.Optimizer, path: str, optim_kwargs=None) -> None:
    if optim_kwargs is None:
        optim_kwargs = {}
    dic = {
        'state_dict': net.state_dict(),
        'params': (net.nhidden, net.hsize, net.amod, net.p),
        'optim_type': optimizer.__class__.__name__,
        'optim_state_dict': optimizer.state_dict(),
        'optim_kwargs': optim_kwargs
    }
    torch.save(dic, path)

def Save(self, filename, model: nn.Module, optimizer: optim.Optimizer,
         scheduler: optim.lr_scheduler._LRScheduler):
    model_dict = model.state_dict()
    optimizer_dict = optimizer.state_dict()
    scheduler_dict = scheduler.state_dict()
    checkpoint = {
        "model_dict": model_dict,
        "optimizer_dict": optimizer_dict,
        "scheduler_dict": scheduler_dict
    }
    th.save(checkpoint, os.path.join(self.directory, filename))

def save_model(net: nn.Module, optimizer: optim.Optimizer, train_loss: float, val_loss: float,
               iteration: int, batch_size: int, epoch: int, path: str):
    path = str(path)
    state = dict(
        net=net.state_dict(),
        opt=optimizer.state_dict(),
        train_loss=train_loss,
        val_loss=val_loss,
        iteration=iteration,
        batch_size=batch_size,
        epoch=epoch,
    )
    torch.save(state, path)

def save_checkpoint(P, epoch: int, model: nn.Module, optimiser: Optimizer,
                    saliency_loss: float, decoder_loss: float) -> None:
    state = {
        'epoch': epoch + 1,
        'model': model.state_dict(),
        'optimiser': optimiser.state_dict(),
        'saliency_loss': saliency_loss,
        'decoder_loss': decoder_loss,
        'alpha': P.ALPHA
    }
    filename = '{}/cp-{}.pt'.format(P.CHECKPOINT_DIR, epoch + 1)
    torch.save(state, filename)

def save_ckpt(model: Module, optimizer: Optimizer, checkpoint_path: str) -> None:
    """Save model and optimizer checkpoint to continue training."""
    torch.save(
        {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        },
        checkpoint_path,
    )
    print("Saved model and optimizer state to {}".format(checkpoint_path))

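# A matching load sketch for the layout written by save_ckpt; the function
# name and in-place restoration are assumptions, not part of the original.
def load_ckpt(model: Module, optimizer: Optimizer, checkpoint_path: str) -> None:
    """Load model and optimizer state saved by save_ckpt."""
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    print("Loaded model and optimizer state from {}".format(checkpoint_path))
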
def create_checkpoint(self, model: nn.Module, optimizer: optim.Optimizer, hyperparam_dict):
    model_dict = model.state_dict()
    optimizer_dict = optimizer.state_dict()
    state_dict = {
        "model_dict": model_dict,
        "optimizer_dict": optimizer_dict,
        "timestamp": strftime("%I:%M%p GMT%z on %b %d, %Y", localtime()),
    }
    checkpoint = {**state_dict, **hyperparam_dict}
    return checkpoint

def save_model_checkpoint(self, model: torch.nn.Module, optimizer: Optimizer, epoch: int,
                          train_loss: float, val_loss: float) -> None:
    """Saves the model when the validation loss decreases."""
    if self.verbose:
        self.trace_func(f'Validation loss decreased '
                        f'({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_loss,
        'val_loss': val_loss,
    }, self.model_path)

def _save_checkpoint(self, model: nn.Module, optimizer: optim.Optimizer, trainer_state):
    # from .. import __version__
    torch.save(
        {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'trainer_state': trainer_state,
            'check_model_class': str(model.__class__),
            'check_optimizer_class': str(optimizer.__class__),
            # 'check_trainer_version': __version__  # TODO
        },
        os.path.join(self.save_directory, self.filename))

def training_backup(model: nn.Module, optimizer: optim.Optimizer, path: str, optim_kwargs=None) -> None:
    layers = model.layers
    if optim_kwargs is None:
        optim_kwargs = {}
    if not isinstance(layers, bool):
        layers = layers[1:]
    dic = {
        'state_dict': model.state_dict(),
        'layers': layers,
        'nlf': model.nlf,
        'optim_type': optimizer.__class__.__name__,
        'optim_state_dict': optimizer.state_dict(),
        'optim_kwargs': optim_kwargs
    }
    torch.save(dic, path)

def training_backup(model: nn.Module, optimizer: optim.Optimizer, path: str, optim_kwargs=None) -> None:
    if optim_kwargs is None:
        optim_kwargs = {}
    dic = {
        'state_dict': model.state_dict(),
        'type': model.__class__.__name__,
        'nlf': model.nlf,
        'optim_type': optimizer.__class__.__name__,
        'optim_state_dict': optimizer.state_dict(),
        'optim_kwargs': optim_kwargs
    }
    torch.save(dic, path)

def save_checkpoint(
    result_path: str,
    epoch: int,
    model: nn.Module,
    optimizer: optim.Optimizer,
    best_loss: float,
) -> None:
    save_states = {
        "epoch": epoch,
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "best_loss": best_loss,
    }
    torch.save(save_states, os.path.join(result_path, "checkpoint.pth"))
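
# A minimal sketch of resuming from the "checkpoint.pth" written above,
# assuming the same result_path and a freshly built model/optimizer; the
# function name and return values are illustrative assumptions.
def resume_checkpoint(result_path, model, optimizer):
    ckpt = torch.load(os.path.join(result_path, "checkpoint.pth"), map_location="cpu")
    model.load_state_dict(ckpt["state_dict"])
    optimizer.load_state_dict(ckpt["optimizer"])
    # Let the training loop pick up where it stopped
    return ckpt["epoch"], ckpt["best_loss"]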