Example #1
def setup_env():
    """Sets up environment for training or testing."""
    if dist.is_master_proc():
        # Ensure that the output dir exists
        pathmgr.mkdirs(cfg.OUT_DIR)
        # Save the config
        config.dump_cfg()
    # Setup logging
    logging.setup_logging()
    # Log torch, cuda, and cudnn versions
    version = [
        torch.__version__, torch.version.cuda,
        torch.backends.cudnn.version()
    ]
    logger.info(
        "PyTorch Version: torch={}, cuda={}, cudnn={}".format(*version))
    env = "".join(
        [f"{key}: {value}\n" for key, value in sorted(os.environ.items())])
    logger.info(f"os.environ:\n{env}")
    # Log the config both as human-readable text and as json
    if cfg.VERBOSE:
        logger.info("Config:\n{}".format(cfg))
    logger.info(logging.dump_log_data(cfg, "cfg", None))
    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    random.seed(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
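
Every example in this listing gates filesystem writes behind dist.is_master_proc(), so only one process creates directories, dumps the config, and writes logs or checkpoints. A minimal sketch of what such a helper typically checks, assuming one process per GPU (this is an illustration, not the actual pycls implementation):

import torch
import torch.distributed as torch_dist

def is_master_proc(local=False):
    """Return True for the process that should perform I/O."""
    if not torch_dist.is_available() or not torch_dist.is_initialized():
        # Single-process runs are always the master
        return True
    if local:
        # Per-node (local) master, assuming one process per GPU
        return torch_dist.get_rank() % torch.cuda.device_count() == 0
    # Global master: rank 0 across all nodes
    return torch_dist.get_rank() == 0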
Example #2
def setup_logging():
    """Sets up the logging."""
    # Enable logging only for the master process
    if dist.is_master_proc():
        logging.basicConfig(
            level=logging.DEBUG,
            format=colored("[%(asctime)s]", "green") + "  %(message)s",
            datefmt="%m/%d %H:%M:%S",
            handlers=[
                logging.FileHandler(os.path.join(cfg.OUT_DIR, cfg.LOG_DEST)),
                logging.StreamHandler()
            ])
Example #3
def save_checkpoint(model, optimizer, epoch):
    """Saves a checkpoint."""
    # Save checkpoints only from the master process
    if not dist.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    os.makedirs(get_checkpoint_dir(), exist_ok=True)
    # Omit the DDP wrapper in the multi-gpu setting
    sd = model.module.state_dict() if cfg.NUM_GPUS > 1 else model.state_dict()
    # Record the state
    if isinstance(optimizer, list):
        checkpoint = {
            "epoch": epoch,
            "model_state": sd,
            "optimizer_w_state": optimizer[0].state_dict(),
            "optimizer_a_state": optimizer[1].state_dict(),
            "cfg": cfg.dump(),
        }
    else:
        checkpoint = {
            "epoch": epoch,
            "model_state": sd,
            "optimizer_state": optimizer.state_dict(),
            "cfg": cfg.dump(),
        }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    torch.save(checkpoint, checkpoint_file)
    return checkpoint_file
Example #4
def get_weights_file(weights_file):
    """Download weights file if stored as a URL."""
    download = dist.is_master_proc(local=True)
    weights_file = cache_url(weights_file,
                             cfg.DOWNLOAD_CACHE,
                             download=download)
    if cfg.NUM_GPUS > 1:
        torch.distributed.barrier()
    return weights_file
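
Example #4 lets only the local master of each node download the weights, while every other process blocks at torch.distributed.barrier() until the file is in the shared cache. A self-contained sketch of that download-once-then-synchronize pattern, with illustrative names (fetch_once is not the pycls cache_url API):

import os
import urllib.request
import torch.distributed as torch_dist

def fetch_once(url, cache_dir, is_local_master):
    """Download url into cache_dir from the local master process only."""
    os.makedirs(cache_dir, exist_ok=True)
    local_path = os.path.join(cache_dir, os.path.basename(url))
    if is_local_master and not os.path.exists(local_path):
        urllib.request.urlretrieve(url, local_path)
    # All processes wait here so nobody reads a half-written file
    if torch_dist.is_available() and torch_dist.is_initialized():
        torch_dist.barrier()
    return local_path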
Example #5
def setup_logging():
    """Sets up the logging."""
    # Enable logging only for the master process
    if dist.is_master_proc():
        logging.basicConfig(
            level=logging.DEBUG,
            format=colored("[%(asctime)s]", "green") + "  %(message)s",
            datefmt="%m/%d %H:%M:%S",
            handlers=[
                logging.FileHandler(os.path.join(cfg.OUT_DIR, cfg.LOG_DEST)),
                logging.StreamHandler(),
            ],
        )
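
Examples #2 and #5 rely on a colored() helper for the green timestamp, presumably termcolor's colored (an assumption on my part), and they log both to a file under cfg.OUT_DIR and to the console. A standalone version of that configuration with explicit arguments in place of cfg:

import logging
import os
import sys
from termcolor import colored  # third-party package

def setup_colored_logging(out_dir, log_file="stdout.log"):
    """Log to out_dir/log_file and to the console with a colored timestamp."""
    os.makedirs(out_dir, exist_ok=True)
    logging.basicConfig(
        level=logging.DEBUG,
        format=colored("[%(asctime)s]", "green") + "  %(message)s",
        datefmt="%m/%d %H:%M:%S",
        handlers=[
            logging.FileHandler(os.path.join(out_dir, log_file)),
            logging.StreamHandler(sys.stdout),
        ],
    )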
Example #6
def setup_env():
    """Sets up environment for training or testing."""
    if dist.is_master_proc():
        # Ensure that the output dir exists
        os.makedirs(cfg.OUT_DIR, exist_ok=True)
        # Save the config
        config.dump_cfg()
    # Setup logging
    logging.setup_logging()
    # Log the config
    logger.info("Config:\n{}".format(cfg))
    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
Example #7
def setup_logging():
    """Sets up the logging."""
    # Enable logging only for the master process
    if dist.is_master_proc():
        # Clear the root logger to prevent any existing logging config
        # (e.g. set by another module) from messing with our setup
        logging.root.handlers = []
        # Construct logging configuration
        logging_config = {"level": logging.INFO, "format": _FORMAT}
        # Log either to stdout or to a file
        if cfg.LOG_DEST == "stdout":
            logging_config["stream"] = sys.stdout
        else:
            logging_config["filename"] = os.path.join(cfg.OUT_DIR, _LOG_FILE)
        # Configure logging
        logging.basicConfig(**logging_config)
    else:
        _suppress_print()
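
Example #7 silences non-master processes with _suppress_print(). A plausible sketch of such a helper, assuming it simply swaps the built-in print for a no-op (the actual pycls helper may differ):

import builtins

def _suppress_print():
    """Make print() a no-op so non-master processes stay quiet."""
    def quiet_print(*args, **kwargs):
        pass
    builtins.print = quiet_print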
Example #8
def save_checkpoint(model, optimizer, epoch):
    """Saves a checkpoint."""
    # Save checkpoints only from the master process
    if not dist.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    os.makedirs(get_checkpoint_dir(), exist_ok=True)
    # Record the state
    checkpoint = {
        "epoch": epoch,
        "model_state": unwrap_model(model).state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    torch.save(checkpoint, checkpoint_file)
    return checkpoint_file
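
Examples #8 to #10 call unwrap_model() instead of branching on cfg.NUM_GPUS as Example #3 does. A minimal sketch of such a helper (an assumption, not necessarily the exact pycls code): strip the (Distributed)DataParallel wrapper so the checkpoint stores plain module keys.

import torch.nn as nn

def unwrap_model(model):
    """Return the wrapped module if model is a (Distributed)DataParallel instance."""
    wrappers = (nn.DataParallel, nn.parallel.DistributedDataParallel)
    return model.module if isinstance(model, wrappers) else model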
Example #9
def save_checkpoint(model, optimizer, epoch, best):
    """Saves a checkpoint."""
    # Save checkpoints only from the master process
    if not dist.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    pathmgr.mkdirs(get_checkpoint_dir())
    # Record the state
    checkpoint = {
        "epoch": epoch,
        "model_state": unwrap_model(model).state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    with pathmgr.open(checkpoint_file, "wb") as f:
        torch.save(checkpoint, f)
    # If this is the best so far, also copy it to the best checkpoint file
    if best:
        pathmgr.copy(checkpoint_file, get_checkpoint_best())
    return checkpoint_file
Example #10
def save_checkpoint(model, model_ema, optimizer, epoch, test_err, ema_err):
    """Saves a checkpoint and also the best weights so far in a best checkpoint."""
    # Save checkpoints only from the master process
    if not dist.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    pathmgr.mkdirs(get_checkpoint_dir())
    # Record the state
    checkpoint = {
        "epoch": epoch,
        "test_err": test_err,
        "ema_err": ema_err,
        "model_state": unwrap_model(model).state_dict(),
        "ema_state": unwrap_model(model_ema).state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    with pathmgr.open(checkpoint_file, "wb") as f:
        torch.save(checkpoint, f)
    # Store the best model and model_ema weights so far
    if not pathmgr.exists(get_checkpoint_best()):
        pathmgr.copy(checkpoint_file, get_checkpoint_best())
    else:
        with pathmgr.open(get_checkpoint_best(), "rb") as f:
            best = torch.load(f, map_location="cpu")
        # Select the best model weights and the best model_ema weights
        if test_err < best["test_err"] or ema_err < best["ema_err"]:
            if test_err < best["test_err"]:
                best["model_state"] = checkpoint["model_state"]
                best["test_err"] = test_err
            if ema_err < best["ema_err"]:
                best["ema_state"] = checkpoint["ema_state"]
                best["ema_err"] = ema_err
            with pathmgr.open(get_checkpoint_best(), "wb") as f:
                torch.save(best, f)
    return checkpoint_file
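
The bookkeeping in Example #10 tracks the lowest test_err and the lowest ema_err independently, so the best regular weights and the best EMA weights may come from different epochs. A self-contained illustration of that merge rule on plain dicts (no file I/O, placeholder state values):

def merge_best(best, current):
    """Replace model/ema state in best wherever current has a lower error."""
    if current["test_err"] < best["test_err"]:
        best["model_state"] = current["model_state"]
        best["test_err"] = current["test_err"]
    if current["ema_err"] < best["ema_err"]:
        best["ema_state"] = current["ema_state"]
        best["ema_err"] = current["ema_err"]
    return best

best = {"test_err": 24.0, "ema_err": 23.5, "model_state": "w3", "ema_state": "e3"}
epoch4 = {"test_err": 23.1, "ema_err": 23.9, "model_state": "w4", "ema_state": "e4"}
best = merge_best(best, epoch4)
# best now takes model_state "w4" (lower test_err) but keeps ema_state "e3"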