Example #1
def save_checkpoint(model, optimizer, epoch):
    """Saves a checkpoint."""
    # Save checkpoints only from the master process
    if not dist.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    os.makedirs(get_checkpoint_dir(), exist_ok=True)
    # Omit the DDP wrapper in the multi-gpu setting
    sd = model.module.state_dict() if cfg.NUM_GPUS > 1 else model.state_dict()
    # Record the state
    if isinstance(optimizer, list):
        checkpoint = {
            "epoch": epoch,
            "model_state": sd,
            "optimizer_w_state": optimizer[0].state_dict(),
            "optimizer_a_state": optimizer[1].state_dict(),
            "cfg": cfg.dump(),
        }
    else:
        checkpoint = {
            "epoch": epoch,
            "model_state": sd,
            "optimizer_state": optimizer.state_dict(),
            "cfg": cfg.dump(),
        }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    torch.save(checkpoint, checkpoint_file)
    return checkpoint_file
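The examples in this listing rely on a module-level config object cfg and on path helpers such as get_checkpoint_dir and get_checkpoint that are not shown. A minimal sketch of what such helpers might look like, assuming a pycls-style layout (the directory name, file naming pattern, and cfg.OUT_DIR field are assumptions):

import os

# "cfg" stands for the global config object the examples already assume;
# the directory name and file naming pattern below are illustrative assumptions.
_NAME_PREFIX = "model_epoch_"
_DIR_NAME = "checkpoints"


def get_checkpoint_dir():
    """Retrieves the location for storing checkpoints (assumed layout)."""
    return os.path.join(cfg.OUT_DIR, _DIR_NAME)


def get_checkpoint(epoch):
    """Retrieves the path to the checkpoint file for a given epoch (assumed naming)."""
    return os.path.join(get_checkpoint_dir(), "{}{:04d}.pyth".format(_NAME_PREFIX, epoch))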
Example #2
def save_checkpoint(info, model_state, optimizer_state, epoch, cfg):
    """Saves a checkpoint."""
    # Saving only from the master process is disabled in this example
    # if not dist.is_master_proc():
    #     return
    # Ensure that the checkpoint dir exists
    os.makedirs(cfg.EPISODE_DIR, exist_ok=True)

    # Record the state
    checkpoint = {
        "epoch": epoch,
        "model_state": model_state,
        "optimizer_state": optimizer_state,
        "cfg": cfg.dump(),
    }
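    # Temporarily prepend the info tag to the checkpoint name prefix (restored before returning)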
    global _NAME_PREFIX
    _NAME_PREFIX = info + '_' + _NAME_PREFIX

    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch, cfg.EPISODE_DIR)
    torch.save(checkpoint, checkpoint_file)
    # print("Model checkpoint saved at path: {}".format(checkpoint_file))

    _NAME_PREFIX = 'model_epoch_'
    return checkpoint_file
Example #3
def save_checkpoint(model, optimizer, epoch):
    """Saves a checkpoint."""
    # Save checkpoints only from the master process
    if not dist.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    os.makedirs(get_checkpoint_dir(), exist_ok=True)
    # Record the state
    checkpoint = {
        "epoch": epoch,
        "model_state": unwrap_model(model).state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    torch.save(checkpoint, checkpoint_file)
    return checkpoint_file
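Examples #3, #5, and #6 call unwrap_model so that state dict keys are saved without the DistributedDataParallel "module." prefix. A minimal sketch of such a helper (the actual implementation in the source module may differ):

import torch


def unwrap_model(model):
    """Removes the DistributedDataParallel wrapper, if present."""
    wrapped = isinstance(model, torch.nn.parallel.DistributedDataParallel)
    return model.module if wrapped else model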
Example #4
def save_checkpoint(model, optimizer, epoch):
    """Saves a checkpoint."""
    # Save checkpoints only from the master process
    if not du.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    os.makedirs(get_checkpoint_dir(), exist_ok=True)
    # Omit the DDP wrapper in the multi-gpu setting
    sd = model.module.state_dict() if cfg.NUM_GPUS > 1 else model.state_dict()
    # Record the state
    checkpoint = {
        'epoch': epoch,
        'model_state': sd,
        'optimizer_state': optimizer.state_dict(),
        'cfg': cfg.dump()
    }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    torch.save(checkpoint, checkpoint_file)
    return checkpoint_file
Example #5
def save_checkpoint(model, optimizer, epoch, best):
    """Saves a checkpoint."""
    # Save checkpoints only from the master process
    if not dist.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    pathmgr.mkdirs(get_checkpoint_dir())
    # Record the state
    checkpoint = {
        "epoch": epoch,
        "model_state": unwrap_model(model).state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    with pathmgr.open(checkpoint_file, "wb") as f:
        torch.save(checkpoint, f)
    # If this is the best model so far, also copy the checkpoint to the best checkpoint path
    if best:
        pathmgr.copy(checkpoint_file, get_checkpoint_best())
    return checkpoint_file
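Examples #5 and #6 write through pathmgr (an iopath-style PathManager that abstracts local and remote storage) and keep a separate "best" checkpoint via get_checkpoint_best. A sketch of that helper, assuming the best model is stored directly under the output directory (the file name and location are assumptions):

def get_checkpoint_best():
    """Retrieves the path to the best checkpoint file (assumed naming)."""
    return os.path.join(cfg.OUT_DIR, "model.pyth")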
Example #6
def save_checkpoint(model, model_ema, optimizer, epoch, test_err, ema_err):
    """Saves a checkpoint and also the best weights so far in a best checkpoint."""
    # Save checkpoints only from the main process
    if not dist.is_main_proc():
        return
    # Ensure that the checkpoint dir exists
    pathmgr.mkdirs(get_checkpoint_dir())
    # Record the state
    checkpoint = {
        "epoch": epoch,
        "test_err": test_err,
        "ema_err": ema_err,
        "model_state": unwrap_model(model).state_dict(),
        "ema_state": unwrap_model(model_ema).state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    with pathmgr.open(checkpoint_file, "wb") as f:
        torch.save(checkpoint, f)
    # Store the best model and model_ema weights so far
    if not pathmgr.exists(get_checkpoint_best()):
        pathmgr.copy(checkpoint_file, get_checkpoint_best())
    else:
        with pathmgr.open(get_checkpoint_best(), "rb") as f:
            best = torch.load(f, map_location="cpu")
        # Select the best model weights and the best model_ema weights
        if test_err < best["test_err"] or ema_err < best["ema_err"]:
            if test_err < best["test_err"]:
                best["model_state"] = checkpoint["model_state"]
                best["test_err"] = test_err
            if ema_err < best["ema_err"]:
                best["ema_state"] = checkpoint["ema_state"]
                best["ema_err"] = ema_err
            with pathmgr.open(get_checkpoint_best(), "wb") as f:
                torch.save(best, f)
    return checkpoint_file
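For completeness, a hypothetical counterpart that loads a checkpoint written by the functions above; it is not part of the original examples, and the key names simply mirror the dictionaries used there:

def load_checkpoint(checkpoint_file, model, optimizer=None):
    """Loads a checkpoint written by the save_checkpoint functions above."""
    with pathmgr.open(checkpoint_file, "rb") as f:
        checkpoint = torch.load(f, map_location="cpu")
    # Load weights into the unwrapped model so keys match the saved state
    unwrap_model(model).load_state_dict(checkpoint["model_state"])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint["optimizer_state"])
    return checkpoint["epoch"]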
Example #7
def dump_cfg(cfg_file, cfg):
    """Dumps the config to the specified location."""
    with open(cfg_file, "w") as f:
        cfg.dump(stream=f)
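Example #7 delegates serialization to the config object's own dump method. Assuming cfg is a yacs-style CfgNode (whose dump forwards keyword arguments such as stream to yaml.safe_dump), a small usage sketch:

import os
from yacs.config import CfgNode

cfg = CfgNode({"OUT_DIR": "/tmp/run", "NUM_GPUS": 1})
os.makedirs(cfg.OUT_DIR, exist_ok=True)
dump_cfg(os.path.join(cfg.OUT_DIR, "config.yaml"), cfg)
# The dumped YAML can later be merged back into a compatible config
cfg.merge_from_file(os.path.join(cfg.OUT_DIR, "config.yaml"))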