Example #1
    def create_checkpoint(
        self,
        model_G: nn.Module,
        model_D: nn.Module,
        optimizer_G: optim.Optimizer,
        optimizer_R: optim.Optimizer,
        optimizer_D: optim.Optimizer,
        hyperparam_dict,
    ):
        model_G_dict = model_G.state_dict()
        model_D_dict = model_D.state_dict()
        optimizer_G_dict = optimizer_G.state_dict()
        optimizer_R_dict = optimizer_R.state_dict()
        optimizer_D_dict = optimizer_D.state_dict()

        state_dict = {
            "model_G_dict": model_G_dict,
            "model_D_dict": model_D_dict,
            "optimizer_G_dict": optimizer_G_dict,
            "optimizer_R_dict": optimizer_R_dict,
            "optimizer_D_dict": optimizer_D_dict,
            "timestamp": strftime("%I:%M%p GMT%z on %b %d, %Y", localtime()),
        }
        checkpoint = {**state_dict, **hyperparam_dict}

        return checkpoint
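Note that create_checkpoint only assembles the checkpoint dictionary; writing it to disk is left to the caller. A minimal usage sketch, where the saver instance, the argument values and the output filename are illustrative assumptions:

    # Hypothetical usage: build the checkpoint dict and persist it with torch.save.
    checkpoint = saver.create_checkpoint(
        model_G, model_D, optimizer_G, optimizer_R, optimizer_D,
        hyperparam_dict={"lr": 2e-4, "batch_size": 16},
    )
    torch.save(checkpoint, "gan_checkpoint.pth")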
Example #2
    def save(
        self,
        model: nn.Module,
        optimizer: optim.Optimizer,
        scheduler: optim.lr_scheduler._LRScheduler,
        epoch: int,
        metric: float,
    ):
        if self.best_metric < metric:
            self.best_metric = metric
            self.best_epoch = epoch
            is_best = True
        else:
            is_best = False

        os.makedirs(self.root_dir, exist_ok=True)
        torch.save(
            {
                "model": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
                "epoch": epoch,
                "best_epoch": self.best_epoch,
                "best_metric": self.best_metric,
            },
            osp.join(self.root_dir, f"{epoch:02d}.pth"),
        )

        if is_best:
            shutil.copy(
                osp.join(self.root_dir, f"{epoch:02d}.pth"),
                osp.join(self.root_dir, "best.pth"),
            )
Example #3
def save_checkpoint(output_directory: str, epoch: int, model: nn.Module,
                    optimizer: optim.Optimizer, best_acc1: float,
                    best_acc5: float, best_epoch: int) -> None:
    """Save a checkpoint and the best model in the history.

    Args:
        output_directory (str): The output directory.
        epoch (int): The epoch of the current checkpoint.
        model (nn.Module): The model in current epoch.
        optimizer (optim.Optimizer): The optimizer in current epoch.
        best_acc1 (float): The best top-1 accuracy of the model in the history.
        best_acc5 (float): The best top-5 accuracy of the model in the history.
        best_epoch (int): The epoch of the best top-1 accuracy in the history.
    """
    if isinstance(model,
                  (nn.DataParallel, nn.parallel.DistributedDataParallel)):
        model_without_parallel = model.module
    else:
        model_without_parallel = model
    ckpt = dict(
        epoch=epoch,
        state_dict=model_without_parallel.state_dict(),
        optimizer=optimizer.state_dict(),
        best_acc1=best_acc1,
        best_acc5=best_acc5,
    )
    torchsave(ckpt, os.path.join(output_directory, "checkpoint.pth"))
    if epoch == best_epoch:
        torchsave(ckpt, os.path.join(output_directory, "best.pth"))
Example #4
def get_ckpt_dict(model: nn.Module, optimizer: optim.Optimizer,
                  epoch: int) -> dict:
    """Generate checkpoint dict.
    checkpoint dict format:
    {
        'epoch': current epoch ([1, num_epochs]),
        'model_state_dict': state_dict of model,
        'optim_state_dict': state_dict of optimizer
    }
    If the model is a parallel wrapper (e.g. DDP), `model.module` is used instead.

    Args:
        model (nn.Module): the model to be saved
        optimizer (optim.Optimizer): the optimizer to be saved
        epoch: current epoch

    Returns:
        checkpoint dict (dict): generated checkpoint dict
    """

    if isinstance(model, DDP):
        _model = model.module
    else:
        _model = model
    return {
        'epoch': epoch,
        'model_state_dict': _model.state_dict(),
        'optim_state_dict': optimizer.state_dict()
    }
Example #5
def update_optimizer_params(optimizer: Optimizer, new_state) -> Optimizer:
    optim_state = optimizer.state_dict()
    if "params" in new_state["param_groups"][0].keys():
        del new_state["param_groups"][0]["params"]
    optim_state["param_groups"][0].update(new_state["param_groups"][0])
    optimizer.load_state_dict(optim_state)
    return optimizer
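A short usage sketch for the helper above: it is typically used when resuming training to override per-group hyperparameters (such as the learning rate) without discarding the optimizer's internal state. The optimizer instance and the new value are illustrative assumptions.

    # Hypothetical usage: lower the learning rate of an existing optimizer
    # while keeping its momentum / Adam statistics intact.
    new_state = {"param_groups": [{"lr": 1e-4}]}
    optimizer = update_optimizer_params(optimizer, new_state)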
Example #6
def saveCheckpoint(checkpoint_path, model: nn.Module, optimizer: optim.Optimizer, scheduler: optim.lr_scheduler.MultiStepLR, epoch, iteration):
    """
    Save the current training state to a .pth file

    Parameters
    ----------
    checkpoint_path : str
        the file path where the checkpoint is saved

    model, optimizer, scheduler : nn.Module, optim.Optimizer, optim.lr_scheduler.MultiStepLR
        the network, optimizer, and scheduler whose states are saved

    epoch : int
        (...)

    iteration : int
        (...)
    """
    state = {
        'state_dict': model.state_dict(),
        'optimizer' : optimizer.state_dict(),
        'epoch': epoch,
        'iteration': iteration,
        'scheduler': scheduler.state_dict()
    }

    torch.save(state, checkpoint_path)

    return
Example #7
    def save_checkpoint(
        self,
        file: Union[Path, str],
        optimizer: Optimizer,
        epoch: int,
        split: int,
        loss: float,
    ):
        model_state = {
            "state_dict": self.state_dict(),
            "dictionary": self.dictionary,
            "is_forward_lm": self.is_forward_lm,
            "hidden_size": self.hidden_size,
            "nlayers": self.nlayers,
            "embedding_size": self.embedding_size,
            "nout": self.nout,
            "document_delimiter": self.document_delimiter,
            "dropout": self.dropout,
            "optimizer_state_dict": optimizer.state_dict(),
            "epoch": epoch,
            "split": split,
            "loss": loss,
        }

        torch.save(model_state, str(file), pickle_protocol=4)
Example #8
    def _save_model(self, save_path: str, model: PreTrainedModel, tokenizer: PreTrainedTokenizer,
                    iteration: int, optimizer: Optimizer = None, save_as_best: bool = False,
                    extra: dict = None, include_iteration: bool = True, name: str = 'model'):
        extra_state = dict(iteration=iteration)

        if optimizer:
            extra_state['optimizer'] = optimizer.state_dict()

        if extra:
            extra_state.update(extra)

        if save_as_best:
            dir_path = os.path.join(save_path, '%s_best' % name)
        else:
            dir_name = '%s_%s' % (name, iteration) if include_iteration else name
            dir_path = os.path.join(save_path, dir_name)

        util.create_directories_dir(dir_path)

        # save model
        if isinstance(model, DataParallel):
            model.module.save_pretrained(dir_path)
        else:
            model.save_pretrained(dir_path)

        # save vocabulary
        tokenizer.save_pretrained(dir_path)

        # save extra
        state_path = os.path.join(dir_path, 'extra.state')
        torch.save(extra_state, state_path)
Example #9
def save_checkpoint(
    epoch: int,
    model: nn.Module,
    optimizer: optim.Optimizer,
    models_path: Path,
    exp_name: str,
    epoch_metrics: Dict[str, float],
    model_ema: Optional[Any],
    amp_scaler: Optional[Any],
    scheduler: Optional[Any] = None,
) -> None:
    save_state = {
        "epoch": epoch + 1,  # increment epoch (to not repeat then resume)
        "state_dict": get_state_dict(model, unwrap_model),
        "optimizer": optimizer.state_dict(),
        "val_loss": epoch_metrics["val_loss"],
        "val_score": epoch_metrics["val_score"],
        "threshold": epoch_metrics["threshold"],
    }
    if model_ema is not None:
        save_state["state_dict_ema"] = get_state_dict(model_ema, unwrap_model)
    if amp_scaler is not None:
        save_state[amp_scaler.state_dict_key] = amp_scaler.state_dict()
    if scheduler is not None:
        save_state["lr_scheduler"] = scheduler.state_dict()
    torch.save(
        save_state,
        f"{models_path}/{exp_name}.pth",
    )
Example #10
def save_training(
    loc:       str,
    params:    Hyperparams,
    model:     PretrainTaskDaLUKE,
    res:       TrainResults,
    optimizer: Optimizer,
    scheduler,
    scaler   = None,
    pu       = None,
) -> list[str]:
    pu = pu if pu is not None else res.parameter_update
    paths = list()
    # Save tracked statistics
    paths += res.save(loc)
    paths += params.save(loc)
    # Save model
    paths.append(os.path.join(loc, TrainResults.subfolder, MODEL_OUT.format(i=pu)))
    torch.save(model.state_dict(), paths[-1])
    # Save optimizer and scheduler states (these are dynamic over time)
    paths.append(os.path.join(loc, TrainResults.subfolder, OPTIMIZER_OUT.format(i=pu)))
    torch.save(optimizer.state_dict(), paths[-1])
    paths.append(os.path.join(loc, TrainResults.subfolder, SCHEDULER_OUT.format(i=pu)))
    torch.save(scheduler.state_dict(), paths[-1])
    # Save scaler if using fp16
    if scaler:
        paths.append(os.path.join(loc, TrainResults.subfolder, SCALER_OUT.format(i=pu)))
        torch.save(scaler.state_dict(), paths[-1])
    return paths
Example #11
    def optimizer_state(self, optimizer: Optimizer) -> dict:
        """
        Returns state of an optimizer. Allows for syncing/collating optimizer state from processes in custom
        plugins.
        """
        if self.training_type_plugin and hasattr(self.training_type_plugin, "optimizer_state"):
            return self.training_type_plugin.optimizer_state(optimizer)
        return optimizer.state_dict()
Example #12
def save_optimizer(optimizer: Optimizer, path: str):
    """
    Save optimizer state for resuming training

    :param optimizer: the optimizer whose state is saved
    :param path: destination file path for the saved state
    """
    torch.save(optimizer.state_dict(), path)
    print("Optimizer state saved.")
Example #13
def save_snapshot(model: torch.nn.Module, optimizer: Optimizer, loss: float, epoch: int, train_history: pd.DataFrame, snapshot_file: str, multi_gpu=False):
    torch.save({
        'model': model.module.state_dict() if multi_gpu else model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
        'loss': loss,
        'train_history': train_history.to_dict(),
        'args': ' '.join(sys.argv[1:])
    }, snapshot_file)
Example #14
def training_backup(model: SavableModel, optimizer: optim.Optimizer, path: str,
                    **kwargs) -> None:
    addon_dict = {
        'optim_type': optimizer.__class__.__name__,
        'optim_state_dict': optimizer.state_dict(),
        **kwargs
    }
    if 'optim_kwargs' not in addon_dict:
        addon_dict['optim_kwargs'] = {}
    model.save_model(path, addon_dict)
Example #15
    def save(self, path_to_checkpoints_dir: str, step: int, optimizer: Optimizer, scheduler: _LRScheduler) -> str:
        path_to_checkpoint = os.path.join(path_to_checkpoints_dir, f'model-{step}.pth')
        checkpoint = {
            'state_dict': self.state_dict(),
            'step': step,
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict()
        }
        torch.save(checkpoint, path_to_checkpoint)
        return path_to_checkpoint
Example #16
def save(model: nn.Module, optimizer: optim.Optimizer, last_improvement: int, bleu4: float, is_best: bool):
    state = {
        'bleu-4': bleu4,
        'last_improvement': last_improvement,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(state, os.path.join(GlobalConfig.CHECKPOINT_PATH,  f'{model.__class__.__name__}.pth'))
    if is_best:
        torch.save(state, os.path.join(GlobalConfig.CHECKPOINT_PATH, f'Best_{model.__class__.__name__}.pth'))
Example #17
    def optimizer_state(self, optimizer: Optimizer) -> dict:
        """
        Returns state of an optimizer. Allows for syncing/collating optimizer state from processes in custom
        plugins.
        Return:
            Optimizer state dict
        """
        if self.ddp_plugin:
            return self.ddp_plugin.optimizer_state(optimizer)
        return optimizer.state_dict()
Example #18
def save_checkpoint(model: nn.Module, optimizer: optim.Optimizer, epoch: int,
                    loss: float, filepath: str):
    """Saves model and optimizer state to a filepath."""
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'loss': loss,
            'optimizer_state_dict': optimizer.state_dict(),
        }, filepath)
Example #19
def training_backup(net: Dense, optimizer: optim.Optimizer, path: str,
                    optim_kwargs=None) -> None:
    if optim_kwargs is None:
        optim_kwargs = {}
    dic = {'state_dict': net.state_dict(),
           'params': (
               net.nhidden, net.hsize, net.amod, net.p),
           'optim_type': optimizer.__class__.__name__,
           'optim_state_dict': optimizer.state_dict(),
           'optim_kwargs': optim_kwargs}
    torch.save(dic, path)
Example #20
    def Save(self, filename, model: nn.Module, optimizer: optim.Optimizer, scheduler: optim.lr_scheduler._LRScheduler):
        model_dict = model.state_dict()
        optimizer_dict = optimizer.state_dict()
        scheduler_dict = scheduler.state_dict()

        checkpoint = {
            "model_dict": model_dict,
            "optimizer_dict": optimizer_dict,
            "scheduler_dict": scheduler_dict
        }

        th.save(checkpoint, os.path.join(self.directory, filename))
Example #21
def save_model(net: nn.Module, optimizer: optim.Optimizer, train_loss: float,
               val_loss: float, iteration: int, batch_size: int, epoch: int,
               path: str):
    path = str(path)
    state = dict(net=net.state_dict(),
                 opt=optimizer.state_dict(),
                 train_loss=train_loss,
                 val_loss=val_loss,
                 iteration=iteration,
                 batch_size=batch_size,
                 epoch=epoch)
    torch.save(state, path)
Example #22
def save_checkpoint(P, epoch: int, model: nn.Module, optimiser: Optimizer, saliency_loss: float, decoder_loss: float) -> None:
  state = {
    'epoch': epoch + 1,
    'model': model.state_dict(),
    'optimiser': optimiser.state_dict(),
    'saliency_loss': saliency_loss,
    'decoder_loss': decoder_loss,
    'alpha': P.ALPHA
  }

  filename = '{}/cp-{}.pt'.format(P.CHECKPOINT_DIR, epoch+1)
  torch.save(state, filename)
Example #23
def save_ckpt(model: Module, optimizer: Optimizer,
              checkpoint_path: str) -> None:
    """
    Save model and optimizer checkpoint to continue training
    """
    torch.save(
        {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        },
        checkpoint_path,
    )
    print("Saved model and optimizer state to {}".format(checkpoint_path))
Example #24
    def create_checkpoint(self, model: nn.Module, optimizer: optim.Optimizer,
                          hyperparam_dict):
        model_dict = model.state_dict()
        optimizer_dict = optimizer.state_dict()

        state_dict = {
            "model_dict": model_dict,
            "optimizer_dict": optimizer_dict,
            "timestamp": strftime("%I:%M%p GMT%z on %b %d, %Y", localtime()),
        }
        checkpoint = {**state_dict, **hyperparam_dict}

        return checkpoint
Example #25
    def create_checkpoint(self, model: nn.Module, optimizer: optim.Optimizer,
                          hyperparam_dict):
        model_dict = model.state_dict()
        optimizer_dict = optimizer.state_dict()

        state_dict = {
            'model_dict': model_dict,
            'optimizer_dict': optimizer_dict,
            'timestamp': strftime('%I:%M%p GMT%z on %b %d, %Y', localtime())
        }
        checkpoint = {**state_dict, **hyperparam_dict}

        return checkpoint
Example #26
    def save_model_checkpoint(self, model: torch.nn.Module, optimizer: Optimizer,
                              epoch: int, train_loss: float, val_loss: float) -> None:
        """Saves the model when the validation loss decreases."""
        if self.verbose:
            self.trace_func(f'Validation loss decreased '
                            f'({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
        }, self.model_path)
Example #27
    def _save_checkpoint(self, model: nn.Module, optimizer: optim.Optimizer,
                         trainer_state):
        # from .. import __version__

        torch.save(
            {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'trainer_state': trainer_state,
                'check_model_class': str(model.__class__),
                'check_optimizer_class': str(optimizer.__class__),
                # 'check_trainer_version': __version__    # TODO
            },
            os.path.join(self.save_directory, self.filename))
Example #28
def training_backup(model: nn.Module, optimizer: optim.Optimizer, path: str,
                    optim_kwargs=None) -> None:
    layers = model.layers
    if optim_kwargs is None:
        optim_kwargs = {}
    if not isinstance(layers, bool):
        layers = layers[1:]
    dic = {'state_dict': model.state_dict(),
           'layers': layers,
           'nlf': model.nlf,
           'optim_type': optimizer.__class__.__name__,
           'optim_state_dict': optimizer.state_dict(),
           'optim_kwargs': optim_kwargs}
    torch.save(dic, path)
Example #29
def training_backup(model: nn.Module,
                    optimizer: optim.Optimizer,
                    path: str,
                    optim_kwargs=None) -> None:
    if optim_kwargs is None:
        optim_kwargs = {}
    dic = {
        'state_dict': model.state_dict(),
        'type': model.__class__.__name__,
        'nlf': model.nlf,
        'optim_type': optimizer.__class__.__name__,
        'optim_state_dict': optimizer.state_dict(),
        'optim_kwargs': optim_kwargs
    }
    torch.save(dic, path)
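The optim_type, optim_state_dict and optim_kwargs entries stored by the training_backup variants above allow the optimizer class to be rebuilt at load time without hard-coding it. A minimal restore sketch, assuming the file was written with torch.save as in this example and that the model has already been reconstructed:

    # Hypothetical restore for the training_backup layout above (assumed setup).
    checkpoint = torch.load(path, map_location="cpu")
    optim_cls = getattr(optim, checkpoint['optim_type'])  # e.g. optim.Adam
    optimizer = optim_cls(model.parameters(), **checkpoint['optim_kwargs'])
    optimizer.load_state_dict(checkpoint['optim_state_dict'])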
Example #30
def save_checkpoint(
    result_path: str,
    epoch: int,
    model: nn.Module,
    optimizer: optim.Optimizer,
    best_loss: float,
) -> None:

    save_states = {
        "epoch": epoch,
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "best_loss": best_loss,
    }

    torch.save(save_states, os.path.join(result_path, "checkpoint.pth"))
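For completeness, a loading counterpart to the layout saved above; the model and optimizer instances are assumed to match the ones used at save time.

    # Hypothetical resume sketch for the checkpoint written by Example #30.
    checkpoint = torch.load(os.path.join(result_path, "checkpoint.pth"))
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    start_epoch = checkpoint["epoch"]
    best_loss = checkpoint["best_loss"]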