def setup_env(): """Sets up environment for training or testing.""" if dist.is_main_proc(): # Ensure that the output dir exists pathmgr.mkdirs(cfg.OUT_DIR) # Save the config config.dump_cfg() # Setup logging logging.setup_logging() # Log torch, cuda, and cudnn versions version = [ torch.__version__, torch.version.cuda, torch.backends.cudnn.version() ] logger.info( "PyTorch Version: torch={}, cuda={}, cudnn={}".format(*version)) env = "".join( [f"{key}: {value}\n" for key, value in sorted(os.environ.items())]) logger.info(f"os.environ:\n{env}") # Log the config as both human readable and as a json logger.info("Config:\n{}".format(cfg)) if cfg.VERBOSE else () logger.info(logging.dump_log_data(cfg, "cfg", None)) # Fix the RNG seeds (see RNG comment in core/config.py for discussion) np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) random.seed(cfg.RNG_SEED) # Configure the CUDNN backend torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
def setup_env(): """Sets up environment for training or testing.""" if dist.is_master_proc(): # Ensure that the output dir exists os.makedirs(cfg.OUT_DIR, exist_ok=True) # Save the config config.dump_cfg() # Setup logging logging.setup_logging() # Log the config logger.info("Config:\n{}".format(cfg)) # Fix the RNG seeds (see RNG comment in core/config.py for discussion) np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Configure the CUDNN backend torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
def test_model(): """Evaluates the model.""" # Setup logging logging.setup_logging() # Show the config logger.info("Config:\n{}".format(cfg)) # Fix the RNG seeds (see RNG comment in core/config.py for discussion) np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Configure the CUDNN backend torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK # Build the model (before the loaders to speed up debugging) model = builders.build_model() logger.info("Model:\n{}".format(model)) logger.info(logging.dump_json_stats(net.complexity(model))) # Compute precise time if cfg.PREC_TIME.ENABLED: logger.info("Computing precise time...") loss_fun = builders.build_loss_fun() prec_time = net.compute_precise_time(model, loss_fun) logger.info(logging.dump_json_stats(prec_time)) net.reset_bn_stats(model) # Load model weights checkpoint.load_checkpoint(cfg.TEST.WEIGHTS, model) logger.info("Loaded model weights from: {}".format(cfg.TEST.WEIGHTS)) # Create data loaders test_loader = loader.construct_test_loader() # Create meters test_meter = meters.TestMeter(len(test_loader)) # Evaluate the model test_epoch(test_loader, model, test_meter, 0)
def train_model(): """Trains the model.""" # Setup logging logging.setup_logging() # Show the config logger.info("Config:\n{}".format(cfg)) # Fix the RNG seeds (see RNG comment in core/config.py for discussion) np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) # Configure the CUDNN backend torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK # Build the model (before the loaders to speed up debugging) model = builders.build_model() logger.info("Model:\n{}".format(model)) logger.info(logging.dump_json_stats(net.complexity(model))) # Define the loss function loss_fun = builders.build_loss_fun() # Construct the optimizer optimizer = optim.construct_optimizer(model) # Load checkpoint or initial weights start_epoch = 0 if cfg.TRAIN.AUTO_RESUME and checkpoint.has_checkpoint(): last_checkpoint = checkpoint.get_last_checkpoint() checkpoint_epoch = checkpoint.load_checkpoint(last_checkpoint, model, optimizer) logger.info("Loaded checkpoint from: {}".format(last_checkpoint)) start_epoch = checkpoint_epoch + 1 elif cfg.TRAIN.WEIGHTS: checkpoint.load_checkpoint(cfg.TRAIN.WEIGHTS, model) logger.info("Loaded initial weights from: {}".format( cfg.TRAIN.WEIGHTS)) # Compute precise time if start_epoch == 0 and cfg.PREC_TIME.ENABLED: logger.info("Computing precise time...") prec_time = net.compute_precise_time(model, loss_fun) logger.info(logging.dump_json_stats(prec_time)) net.reset_bn_stats(model) # Create data loaders train_loader = loader.construct_train_loader() test_loader = loader.construct_test_loader() # Create meters train_meter = meters.TrainMeter(len(train_loader)) test_meter = meters.TestMeter(len(test_loader)) # Perform the training loop logger.info("Start epoch: {}".format(start_epoch + 1)) for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH): # Train for one epoch train_epoch(train_loader, model, loss_fun, optimizer, train_meter, cur_epoch) # Compute precise BN stats if cfg.BN.USE_PRECISE_STATS: net.compute_precise_bn_stats(model, train_loader) # Save a checkpoint if checkpoint.is_checkpoint_epoch(cur_epoch): checkpoint_file = checkpoint.save_checkpoint( model, optimizer, cur_epoch) logger.info("Wrote checkpoint to: {}".format(checkpoint_file)) # Evaluate the model if is_eval_epoch(cur_epoch): test_epoch(test_loader, model, test_meter, cur_epoch)