def save(epoch_idx: int, best_accuracy: float, gen_model: nn.Module, dis_model: nn.Module,
         gen_opt: torch.optim.Optimizer, dis_opt: torch.optim.Optimizer):
    # `scheduler` and `ds` are module-level globals providing the current LR and the save directory.
    checkpoint = {
        'epoch': epoch_idx,
        'best_acc': best_accuracy,
        'lr': scheduler.gen_lr,
        'generator': gen_model.state_dict(),
        'discriminator': dis_model.state_dict(),
        'gen_optimizer': gen_opt.state_dict(),
        'dis_optimizer': dis_opt.state_dict()
    }
    torch.save(checkpoint, ds.SAVE_DIR + 'weights/checkpoint')
def save(self, model: nn.Module, step: int, optimizer: torch.optim.Optimizer, epoch: int, **kwargs):
    if self.prefix:
        checkpoint_path = os.path.join(self.dir, f"{self.prefix}_{step}.pth")
    else:
        checkpoint_path = os.path.join(self.dir, f"{step}.pth")
    save_state = {
        'epoch': epoch + 1,
        'optimizer': optimizer.state_dict(),
    }
    if self.only_weights:
        save_state['state_dict'] = model.state_dict()
    else:
        save_state['model'] = model
    save_state = dict(save_state, **kwargs)
    torch.save(save_state, checkpoint_path)
    print(f"Saved in {checkpoint_path}")
    # self.checkpoints is expected to be a bounded container whose append() returns
    # the path evicted once capacity is exceeded, so stale checkpoints get deleted.
    popped = self.checkpoints.append(checkpoint_path)
    if popped:
        try:
            os.remove(popped)
            print("removed")
        except OSError:
            pass
def save_results(output_dir: str, model: nn.Module, optimizer: torch.optim.Optimizer) -> None:
    if distributed.is_main_process():
        logger.info("Dump the last model")
        torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))
        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
def save_checkpoint(epoch: int, update: int, samples_seen: int, model: torch.nn.Module,
                    optimizer: torch.optim.Optimizer, scheduler: Optional[ReduceLROnPlateau],
                    numpy_epoch_random_state: Tuple, train_loss: float, best_valid_loss: float,
                    best_valid_loss_index: int, best_valid_acc: float, filename: str) -> None:
    torch.save(
        {
            "epoch": epoch,
            "update": update,
            "samples_seen": samples_seen,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict() if scheduler is not None else None,
            "train_loss": train_loss,
            "best_valid_loss": best_valid_loss,
            "best_valid_loss_index": best_valid_loss_index,
            "best_valid_acc": best_valid_acc,
            "numpy_epoch_random_state": numpy_epoch_random_state,
            "numpy_last_random_state": np.random.get_state(),
            "torch_last_random_state": torch.random.get_rng_state()
        },
        filename)
def save_checkpoint(self, model: torch.nn.Module, optimizer: torch.optim.Optimizer = None, is_best: bool = False):
    """
    Save a checkpoint under the configured path.

    Parameters
    ----------
    model: ``torch.nn.Module``, required.
        The model to be saved.
    optimizer: ``torch.optim.Optimizer``, optional.
        The optimizer to be saved (if provided).
    is_best: bool, optional (default=False).
        If False, the checkpoint is only saved as ``checkpoint_<counter>.th``;
        otherwise it is additionally saved as ``best.th``.
    """
    s_dict = {'model': model.state_dict()}
    if optimizer is not None:
        s_dict['optimizer'] = optimizer.state_dict()
    if is_best:
        torch.save(s_dict, os.path.join(self.path, 'best.th'))
    torch.save(
        s_dict,
        os.path.join(self.path, 'checkpoint_{}.th'.format(self.counter)))
    self.counter += 1
    # Keep only the newest `checkpoints_to_keep` numbered checkpoints.
    if self.counter > self.checkpoints_to_keep:
        os.remove(
            os.path.join(
                self.path,
                'checkpoint_{}.th'.format(self.counter - self.checkpoints_to_keep - 1)))
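# Usage sketch (assumption): driving the save_checkpoint method above from a training loop.
# The Checkpointer class name, its constructor arguments, and the validation metric are
# hypothetical; they only illustrate the intended rotation of numbered checkpoints plus an
# optional best.th copy.
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
checkpointer = Checkpointer(path='runs/exp1', checkpoints_to_keep=3)  # hypothetical constructor
best_loss = float('inf')
for epoch in range(5):
    val_loss = 1.0 / (epoch + 1)  # placeholder validation metric
    checkpointer.save_checkpoint(model, optimizer, is_best=val_loss < best_loss)
    best_loss = min(best_loss, val_loss)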
def save_model_and_optimizer_with_info(self, model: torch.nn.Module, optimizer: torch.optim.Optimizer,
                                       info: dict) -> None:
    """Save model and optimizer state dictionaries to file given epoch info

    This is called automatically during :func:`update_for_epoch`. Does not save
    if there is no directory to save to (i.e. ``self.state_dir is None``).
    Format strings from ``self.params`` are formatted with the values from
    `info` to construct the base names of each file.

    Parameters
    ----------
    model : AcousticModel
    optimizer : torch.optim.Optimizer
    info : dict
        A dictionary with the entries "epoch", "es_resume_cd", "es_patience_cd",
        "rlr_resume_cd", "rlr_patience_cd", "lr", "train_met", "val_met", and
        any entries specified through :func:`add_entry`
    """
    if self.state_dir is None:
        return
    if not os.path.isdir(self.state_dir):
        os.makedirs(self.state_dir)
    model_basename = self.params.saved_model_fmt.format(**info)
    optimizer_basename = self.params.saved_optimizer_fmt.format(**info)
    torch.save(
        model.state_dict(),
        os.path.join(self.state_dir, model_basename),
    )
    torch.save(
        optimizer.state_dict(),
        os.path.join(self.state_dir, optimizer_basename),
    )
def train(model: torch.nn.Module, data_loader, epoch: int, num_iterations: int, batch_size: int,
          optimizer: torch.optim.Optimizer, criterion: torch.nn.Module, device: torch.device):
    model.train()
    epoch_loss = 0
    # TODO pass epoch to loader
    for batch_id, data in enumerate(data_loader(num_iterations, batch_size)):
        inputs, targets = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        print(f'Loss on batch: {loss}')
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        if (batch_id % 1000) == 0:
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'batch_loss': loss.item()
                },
                os.path.join(CHECKPOINTS_DIR, f'model_LRW_train_{batch_id}.tar'))
    return epoch_loss / num_iterations
def save_checkpoint(
    filename: str,
    epoch: int,
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    f1_score: float,
    vocabs: Dict[str, Any],
    cfg: DictConfig,
) -> None:
    model.cpu()
    path = os.path.join("checkpoints", filename)
    torch.save(
        {
            "cfg": conf2dict(cfg),
            "epoch": epoch,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "f1_score": f1_score,
            "vocabs": vocabs,
        },
        path,
    )
    log.info("Checkpoint saved to %s", path)
    device, _ = get_device()
    model.to(device)
def save(self, best: bool, epoch: int, optimizer: torch.optim.Optimizer):
    filename = 'best.tar' if best else 'last.tar'
    print("Saving model as {}...".format(filename), end=' ')
    torch.save({'epoch': epoch,
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()},
               os.path.join(self.config.checkpoint_dir, filename))
    print("Model saved.")
def save_checkpoint(model: torch.nn.Module, optim: torch.optim.Optimizer,
                    config: Dict[str, Any], path: Path) -> None:
    torch.save(
        {
            'model': model.state_dict(),
            'optim': optim.state_dict(),
            'config': config
        },
        str(path))
def save(model: torch.nn.Module, optimizer: torch.optim.Optimizer, epoch: int, path: str):
    params = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch
    }
    torch.save(params, path)
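# Resume sketch (assumption): the inverse of save() above, restoring the model and optimizer
# from the dict written by torch.save and returning the stored epoch. The function name and
# the map_location choice are illustrative, not part of the original code.
def load(model: torch.nn.Module, optimizer: torch.optim.Optimizer, path: str) -> int:
    params = torch.load(path, map_location='cpu')
    model.load_state_dict(params['model'])
    optimizer.load_state_dict(params['optimizer'])
    return params['epoch']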
def save_train_state(epoch: int, model: nn.Module, optimizer: torch.optim.Optimizer,
                     scheduler, best_score: float, file_path):
    torch.save({
        'epoch': epoch,
        'model_state': model.state_dict(),
        'optimizer_state': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
        'best_score': best_score,
    }, file_path)
def make_checkpoint(epoch: int, model: LSTM, loss_function: Union[SplitCrossEntropyLoss, CrossEntropyLoss],
                    optimizer: torch.optim.Optimizer, use_apex=False, amp=None,
                    prior: Union[str, nn.Module] = None, **kwargs):
    """
    Packages network parameters into a picklable dictionary containing the keys

    * epoch: the current epoch
    * model: the model state dict
    * loss: the loss function state dict
    * optimizer: the optimizer state dict
    * amp: the nvidia apex AMP state dict (only if `use_apex` is True)
    * prior: the prior module (only if `prior` is a module rather than a string)

    Parameters
    ----------
    epoch : int
        The current epoch of training
    model : LSTM
        The network model
    loss_function : SplitCrossEntropyLoss or CrossEntropyLoss
        The loss function
    optimizer : torch.optim.Optimizer
        The optimizer
    use_apex : bool
        Whether mixed precision mode is active. If True, the `amp` argument
        should be supplied as well. Defaults to False.
    amp :
        The nvidia apex amp object; should contain information about the state of training
    prior : str or nn.Module, optional
        A prior module; stored in the checkpoint only when it is a module instance
    kwargs :
        Not used

    Returns
    -------
    checkpoint: dict
        A picklable dict containing the checkpoint
    """
    checkpoint = {
        'epoch': epoch,
        'model': model.state_dict(),
        'loss': loss_function.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    if use_apex:
        checkpoint['amp'] = amp.state_dict()
    if prior is not None and not isinstance(prior, str):
        checkpoint['prior'] = prior
    return checkpoint
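# Usage sketch (assumption): make_checkpoint only builds the dict, so persisting it still
# requires an explicit torch.save call; the file name below is illustrative.
checkpoint = make_checkpoint(epoch, model, loss_function, optimizer, use_apex=False)
torch.save(checkpoint, f'checkpoint_epoch_{epoch}.pt')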
def checkpoint_model(
    net: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    memory: deque,
    path: str,
):
    torch.save({
        'model_state_dict': net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'memory': memory,
    }, path)
def dump_optimizer_state(optimizer: torch.optim.Optimizer):
    with torch.no_grad():
        flat_metadata, flat_tensors = [], []
        for elem in nested_flatten(optimizer.state_dict()):
            if isinstance(elem, torch.Tensor):
                flat_metadata.append(
                    dict(type='tensor', index=len(flat_tensors)))
                flat_tensors.append(elem.cpu())
            else:
                flat_metadata.append(dict(type='value', value=elem))
        return flat_metadata, flat_tensors
def save_checkpoint(model: torch.nn.Module, optimizer: torch.optim.Optimizer, path: str, epoch: int) -> None:
    """Save a training checkpoint."""
    checkpoint_path = _get_checkpoint_path(path, epoch)
    print(f"Saving checkpoint to {checkpoint_path}", flush=True)
    torch.save(
        {
            "epoch": epoch + 1,
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        },
        checkpoint_path)
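# Resume sketch (assumption): because the checkpoint above stores epoch + 1, a resumed run
# can start its loop directly at the stored value. _get_checkpoint_path is the helper used
# by the function above; num_epochs and the surrounding objects are placeholders.
checkpoint = torch.load(_get_checkpoint_path(path, epoch), map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
start_epoch = checkpoint['epoch']
for epoch in range(start_epoch, num_epochs):
    ...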
def save_checkpoint(path: str, model: nn.Module, optimizer: torch.optim.Optimizer,
                    lr_scheduler: "learning rate policy", epoch: int) -> None:
    path = path + "/model-optim-lr_sch-epoch.tar"
    torch.save(
        {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "lr_scheduler": lr_scheduler.state_dict(),
            "epoch": epoch
        },
        path)
def save_model(self, model: torch.nn.Module, optimizer: torch.optim.Optimizer, epoch: int,
               acc: float, is_best: bool):
    path = self.__get_model_path(self.__get_model_filename(epoch, is_best))
    # torch.save writes binary data, so the file must be opened in binary mode.
    with open(path, 'wb') as f:
        torch.save(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'acc': acc,
                'optimizer': optimizer.state_dict()
            },
            f)
def load_optimizer_state(optimizer: torch.optim.Optimizer, flat_metadata: Dict,
                         flat_tensors: Sequence[torch.Tensor]):
    flat_optimizer_state = []
    for elem in flat_metadata:
        if elem.get('type') == 'tensor' and isinstance(elem.get('index'), int):
            flat_optimizer_state.append(flat_tensors[elem['index']])
        elif elem.get('type') == 'value' and 'value' in elem:
            flat_optimizer_state.append(elem['value'])
    with torch.no_grad():
        return optimizer.load_state_dict(
            nested_pack(flat_optimizer_state, structure=optimizer.state_dict()))
def dump_optimizer_state(opt: torch.optim.Optimizer):
    """ Convert optimizer state into the format used by DecentralizedAverager's
    get_current_state / load_state_from_peers """
    with torch.no_grad():
        flat_metadata, flat_tensors = [], []
        for elem in nested_flatten(opt.state_dict()):
            if isinstance(elem, torch.Tensor):
                flat_metadata.append(
                    dict(type='tensor', index=len(flat_tensors)))
                flat_tensors.append(elem.cpu())
            else:
                flat_metadata.append(dict(type='value', value=elem))
        return flat_metadata, flat_tensors
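# Round-trip sketch (assumption): dump_optimizer_state and load_optimizer_state above are
# intended to be inverses, so serialising an optimizer's state and loading it back into the
# same optimizer should be a no-op; the toy model below is illustrative only.
model = torch.nn.Linear(4, 1)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
model(torch.randn(2, 4)).sum().backward()
opt.step()  # populate optimizer state (exp_avg, exp_avg_sq, step)

flat_metadata, flat_tensors = dump_optimizer_state(opt)
# ... metadata and tensors could be shipped to peers here ...
load_optimizer_state(opt, flat_metadata, flat_tensors)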
def save_checkpoint(
    command_history: CommandHistory,
    epoch: int,
    model: torch.jit.ScriptModule,
    optim: torch.optim.Optimizer,
    game_params: GameParams,
    model_params: ModelParams,
    optim_params: OptimParams,
    simulation_params: SimulationParams,
    execution_params: ExecutionParams,
    executor: ThreadPoolExecutor = None,
) -> None:
    checkpoint_dir = execution_params.checkpoint_dir
    save_uncompressed = execution_params.save_uncompressed
    checkpoint_name = f"checkpoint_{epoch}"
    checkpoint = {
        "command_history": command_history,
        "epoch": epoch,
        "model_state_dict": {
            k: v.cpu().clone() if isinstance(v, torch.Tensor) else copy.deepcopy(v)
            for k, v in model.state_dict().items()
        },
        "optim_state_dict": {
            k: v.cpu().clone() if isinstance(v, torch.Tensor) else copy.deepcopy(v)
            for k, v in optim.state_dict().items()
        },
        "game_params": game_params,
        "model_params": model_params,
        "optim_params": optim_params,
        "simulation_params": simulation_params,
        "execution_params": execution_params,
    }

    def saveit():
        nonlocal save_uncompressed
        nonlocal checkpoint
        nonlocal checkpoint_dir
        if save_uncompressed:
            torch.save(checkpoint, checkpoint_dir / f"{checkpoint_name}.pt")
        else:
            # with zipfile.ZipFile(Path(checkpoint_dir) / f"{checkpoint_name}.zip", "w", allowZip64=True) as z:
            #     with z.open(f"{checkpoint_name}.pt", "w", force_zip64=True) as f:
            #         torch.save(checkpoint, f)
            with gzip.open(checkpoint_dir / f"{checkpoint_name}.pt.gz", "wb") as f:
                torch.save(checkpoint, f)

    if executor is not None:
        return executor.submit(saveit)
    else:
        saveit()
def save_model_checkpoint(model: torch.nn.Module, optimizer: torch.optim.Optimizer, criterion,
                          epochs, replay_buffer, filename='./saved_model.pth'):
    torch.save(
        {
            'epoch': epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': criterion,
            'replay_buffer': replay_buffer
        },
        filename)
def _save_model(self, optimizer: torch.optim.Optimizer, epoch: int, loss: float,
                best_bleu: float) -> None:
    """ Save model. """
    state = {
        "epoch": epoch,
        "best_bleu_score": best_bleu,
        "loss": loss,
        "model_state": self.model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
    }
    model_path = os.path.join(self.experiment_path, "best_model.pt")
    torch.save(state, model_path)
    self.logger.info(f"New best bleu! Model saved to {model_path}")
def save_model(self, model: torch.nn.Module, optimizer: torch.optim.Optimizer, epoch: int,
               acc: float, is_best: bool):
    path = self.__get_model_path(self.__get_model_filename(epoch, is_best))
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        torch.save(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'acc': acc,
                'optimizer': optimizer.state_dict()
            },
            f)
    if is_best:
        # Also write a regular (non-best) copy of this epoch's checkpoint.
        self.save_model(model, optimizer, epoch, acc, False)
def saveModel(
        model_filename: str,
        schedulers: typing.List,
        epoch: int,
        network: torch.nn.Module,
        optimizer: torch.optim.Optimizer,
        validationLoss: typing.List[float],
        trainingLoss: typing.List[float],
        trainingDifference: typing.List[typing.Tuple[float, float]],
        validationDifference: typing.List[typing.Tuple[float, float]],
        uncertainty: typing.List[typing.Tuple[float, float]],
        testingDifference: typing.List[typing.Tuple[float, float]],
        anees: typing.List[typing.Tuple[float, float]]
) -> None:
    """
    Saves the given model to the given file. Also saves the schedulers, last epoch,
    optimizer, and previous losses.

    :param model_filename: The file to save the model to.
    :param schedulers: The schedulers to save.
    :param epoch: The last epoch that the model was trained on.
    :param network: The network that is being saved.
    :param optimizer: The optimizer that is to be saved with the network.
    :param validationLoss: The history of validation losses.
    :param trainingLoss: The history of training losses.
    :param trainingDifference: The training differences over the epochs.
    :param validationDifference: The validation differences over the epochs.
    :param uncertainty: The uncertainty history of the network.
    :param testingDifference: The testing differences of the network over the epochs.
    :param anees: The history of Average Normalized Error Estimate Squared values the network has had.
    """
    # Don't save if saving is disabled.
    if Config.getArgs().dont_save:
        return
    # Save network to file.
    if "{}" in model_filename:
        model_filename = model_filename.format(Config.version, epoch, Config.getArgs().model_number)
    Logger.log("Saving model to " + model_filename + ".", logger="min")
    saveCheckpoint(
        filepath=model_filename,
        currModel={
            "model": network.state_dict(),
            "epoch": epoch,
            "optimizer": optimizer.state_dict(),
            "schedulers": schedulers,
            "version": Config.version,
            "trainingLoss": trainingLoss,
            "validationLoss": validationLoss,
            "validationDifference": validationDifference,
            "trainingDifference": trainingDifference,
            "uncertainty": uncertainty,
            "testingDifference": testingDifference,
            "anees": anees
        }
    )
def save_checkpoint(
    epoch: int,
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    models_path: Path,
    exp_name: str,
) -> None:
    save_state = {
        "epoch": epoch + 1,  # increment epoch so it is not repeated when training resumes
        "state_dict": get_state_dict(model, unwrap_model),
        "optimizer": optimizer.state_dict(),
    }
    torch.save(
        save_state,
        f"{models_path}/{exp_name}.pth",
    )
def save(epoch: int, model, optimizer: torch.optim.Optimizer, scheduler: torch.optim.lr_scheduler, config):
    """ Pickles the model to hdd """
    now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    out_dir = config.output_dir
    save_name = os.path.join(out_dir, 'epoch_{}_{}.pth'.format(epoch, now))
    save_dict = {
        'epoch': epoch,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
    }
    torch.save(save_dict, save_name)
    print("Saved the model to hdd")
def run_model_selection(self, net: torch.nn.Module, optimizer: torch.optim.Optimizer,
                        monitor_metrics: dict, epoch: int):
    # take the mean over all selection criteria in each epoch
    non_nan_scores = np.mean(np.array(
        [[0 if (ii is None or np.isnan(ii)) else ii for ii in monitor_metrics['val'][sc]]
         for sc in self.cf.model_selection_criteria]), 0)
    epochs_scores = [ii for ii in non_nan_scores[1:]]
    # ranking of epochs according to model_selection_criterion
    epoch_ranking = np.argsort(epochs_scores, kind="stable")[::-1] + 1  # epochs start at 1
    # if set in configs, epochs < min_save_thresh are discarded from the saving process.
    epoch_ranking = epoch_ranking[epoch_ranking >= self.cf.min_save_thresh]
    # check if current epoch is among the top-k epochs.
    if epoch in epoch_ranking[:self.cf.save_n_models]:
        save_dir = os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(epoch))
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        torch.save(net.state_dict(), os.path.join(save_dir, 'params.pth'))
        with open(os.path.join(save_dir, 'monitor_metrics.pickle'), 'wb') as handle:
            pickle.dump(monitor_metrics, handle)
        # save epoch_ranking to keep info for inference.
        np.save(os.path.join(self.cf.fold_dir, 'epoch_ranking'), epoch_ranking[:self.cf.save_n_models])
        np.save(os.path.join(save_dir, 'epoch_ranking'), epoch_ranking[:self.cf.save_n_models])
        self.logger.info(
            "saving current epoch {} at rank {}".format(epoch, np.argwhere(epoch_ranking == epoch)))
        # delete params of the epoch that just fell out of the top-k epochs.
        for se in [int(ii.split('_')[0]) for ii in os.listdir(self.cf.fold_dir) if 'best_checkpoint' in ii]:
            if se in epoch_ranking[self.cf.save_n_models:]:
                subprocess.call(
                    'rm -rf {}'.format(os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(se))),
                    shell=True)
                self.logger.info('deleting epoch {} at rank {}'.format(se, np.argwhere(epoch_ranking == se)))
    state = {
        'epoch': epoch,
        'state_dict': net.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    # save checkpoint of current epoch.
    save_dir = os.path.join(self.cf.fold_dir, 'last_checkpoint')
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    torch.save(state, os.path.join(save_dir, 'params.pth'))
    np.save(os.path.join(save_dir, 'epoch_ranking'), epoch_ranking[:self.cf.save_n_models])
    with open(os.path.join(save_dir, 'monitor_metrics.pickle'), 'wb') as handle:
        pickle.dump(monitor_metrics, handle)
def save_model(
        path: str,
        model: torch.nn.Module,
        optimizer: torch.optim.Optimizer
):
    """
    Save a torch model to the given output path.

    :param path: The path.
    :param model: The model to save.
    :param optimizer: The optimizer to save.
    """
    data_path = os.path.join(path, 'checkpoint.pth.tar')
    torch.save(model, os.path.join(path, 'model.pth'))
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, data_path)
def __init__(self, model: torch.nn.Module, optimizer: torch.optim.Optimizer,
             criterion: torch.nn.Module, logger: Logger, grad_clip: float = None):
    self.model = model
    self.optimizer = optimizer
    self.criterion = criterion
    self.logger = logger
    self.grad_clip = grad_clip
    self.history = {'lrs': [], 'losses': []}
    # Snapshot the initial parameters so they can be restored after the run.
    torch.save(
        {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        },
        'init_params.pt')
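# Restore sketch (assumption): the 'init_params.pt' snapshot written in __init__ above can
# be loaded to reset the model and optimizer to their initial state after the run; the
# variable names and map_location below are illustrative.
state = torch.load('init_params.pt', map_location='cpu')
model.load_state_dict(state['model'])
optimizer.load_state_dict(state['optimizer'])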